ceph.git: ceph/src/osd/PG.cc (import 14.2.4 nautilus point release)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDRepOp.h"
54 #include "messages/MOSDRepOpReply.h"
55 #include "messages/MOSDRepScrubMap.h"
56 #include "messages/MOSDPGRecoveryDelete.h"
57 #include "messages/MOSDPGRecoveryDeleteReply.h"
58
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
61
62 #ifdef WITH_LTTNG
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
68 #else
69 #define tracepoint(...)
70 #endif
71
72 #include <sstream>
73
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
76 #undef dout_prefix
77 #define dout_prefix _prefix(_dout, this)
78
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80 // easily skip them
81 const string infover_key("_infover");
82 const string info_key("_info");
83 const string biginfo_key("_biginfo");
84 const string epoch_key("_epoch");
85 const string fastinfo_key("_fastinfo");
86
87 template <class T>
88 static ostream& _prefix(std::ostream *_dout, T *t)
89 {
90 return t->gen_prefix(*_dout);
91 }
92
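// PGStateHistory tracks recent state-machine transitions for this PG.  enter()
// may be called in contexts where the PG lock cannot reliably be taken, so the
// transition is staged in tmppi; exit() takes the PG lock if needed, moves the
// staged instance into the history buffer, and records the exit timestamp.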
93 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
94 {
95 // Ignore trimming state machine for now
96 if (::strstr(state, "Trimming") != NULL) {
97 return;
98 } else if (pi != nullptr) {
99 pi->enter_state(entime, state);
100 } else {
101 // Store current state since we can't reliably take the PG lock here
102     if (tmppi == nullptr) {
103 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
104 }
105
106 thispg = pg;
107 tmppi->enter_state(entime, state);
108 }
109 }
110
111 void PGStateHistory::exit(const char* state) {
112 // Ignore trimming state machine for now
113 // Do nothing if PG is being destroyed!
114 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
115 return;
116 } else {
117 bool ilocked = false;
118     if (!thispg->is_locked()) {
119 thispg->lock();
120 ilocked = true;
121 }
122 if (pi == nullptr) {
123 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
124 pi = buffer.back().get();
125 pi->setepoch(thispg->get_osdmap_epoch());
126 }
127
128 pi->exit_state(ceph_clock_now());
129 if (::strcmp(state, "Reset") == 0) {
130 this->reset();
131 }
132     if (ilocked) {
133 thispg->unlock();
134 }
135 }
136 }
137
138 void PGStateHistory::dump(Formatter* f) const {
139 f->open_array_section("history");
140 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
141 f->open_object_section("states");
142 f->dump_stream("epoch") << (*pi)->this_epoch;
143 for (auto she : (*pi)->state_history) {
144 f->dump_string("state", std::get<2>(she));
145 f->dump_stream("enter") << std::get<0>(she);
146 f->dump_stream("exit") << std::get<1>(she);
147 }
148 f->close_section();
149 }
150 f->close_section();
151 }
152
153 void PG::get(const char* tag)
154 {
155 int after = ++ref;
156 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
157 << "tag " << (tag ? tag : "(none)") << " "
158 << (after - 1) << " -> " << after << dendl;
159 #ifdef PG_DEBUG_REFS
160 std::lock_guard l(_ref_id_lock);
161 _tag_counts[tag]++;
162 #endif
163 }
164
165 void PG::put(const char* tag)
166 {
167 #ifdef PG_DEBUG_REFS
168 {
169 std::lock_guard l(_ref_id_lock);
170 auto tag_counts_entry = _tag_counts.find(tag);
171 ceph_assert(tag_counts_entry != _tag_counts.end());
172 --tag_counts_entry->second;
173 if (tag_counts_entry->second == 0) {
174 _tag_counts.erase(tag_counts_entry);
175 }
176 }
177 #endif
178 auto local_cct = cct;
179 int after = --ref;
180 lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
181 << "tag " << (tag ? tag : "(none)") << " "
182 << (after + 1) << " -> " << after
183 << dendl;
184 if (after == 0)
185 delete this;
186 }
187
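// When built with PG_DEBUG_REFS, get_with_id()/put_with_id() additionally
// record a unique id and a backtrace per reference so outstanding references
// (and the per-tag counts above) can be listed with dump_live_ids().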
188 #ifdef PG_DEBUG_REFS
189 uint64_t PG::get_with_id()
190 {
191 ref++;
192 std::lock_guard l(_ref_id_lock);
193 uint64_t id = ++_ref_id;
194 BackTrace bt(0);
195 stringstream ss;
196 bt.print(ss);
197 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
198 << " got id " << id << " "
199 << (ref - 1) << " -> " << ref
200 << dendl;
201 ceph_assert(!_live_ids.count(id));
202 _live_ids.insert(make_pair(id, ss.str()));
203 return id;
204 }
205
206 void PG::put_with_id(uint64_t id)
207 {
208 int newref = --ref;
209 lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
210 << " put id " << id << " "
211 << (newref + 1) << " -> " << newref
212 << dendl;
213 {
214 std::lock_guard l(_ref_id_lock);
215 ceph_assert(_live_ids.count(id));
216 _live_ids.erase(id);
217 }
218   if (!newref)
219 delete this;
220 }
221
222 void PG::dump_live_ids()
223 {
224 std::lock_guard l(_ref_id_lock);
225 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
226 for (map<uint64_t, string>::iterator i = _live_ids.begin();
227 i != _live_ids.end();
228 ++i) {
229 dout(0) << "\t\tid: " << *i << dendl;
230 }
231 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
232 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
233 i != _tag_counts.end();
234 ++i) {
235 dout(0) << "\t\tid: " << *i << dendl;
236 }
237 }
238 #endif
239
240
241 void PGPool::update(CephContext *cct, OSDMapRef map)
242 {
243 const pg_pool_t *pi = map->get_pg_pool(id);
244 if (!pi) {
245 return; // pool has been deleted
246 }
247 info = *pi;
248 name = map->get_pool_name(id);
249
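  // 'updated' means either we skipped over one or more map epochs (so the
  // cached snap state may be stale) or this map changed the pool's snapshot
  // set (the pool's snap_epoch is this map's epoch).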
250 bool updated = false;
251 if ((map->get_epoch() != cached_epoch + 1) ||
252 (pi->get_snap_epoch() == map->get_epoch())) {
253 updated = true;
254 }
255
256 if (map->require_osd_release >= CEPH_RELEASE_MIMIC) {
257 // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps
258 // in the pg_info_t, with deltas for both in each OSDMap. we don't
259 // need to (and can't) track it here.
260 cached_removed_snaps.clear();
261 newly_removed_snaps.clear();
262 } else {
263 // legacy (<= luminous) removed_snaps tracking
264 if (updated) {
265 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
266 pi->build_removed_snaps(newly_removed_snaps);
267 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
268 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
269 newly_removed_snaps.subtract(cached_removed_snaps);
270 cached_removed_snaps.swap(removed_snaps);
271 } else {
272 lgeneric_subdout(cct, osd, 0) << __func__
273 << " cached_removed_snaps shrank from " << cached_removed_snaps
274 << " to " << newly_removed_snaps << dendl;
275 cached_removed_snaps.swap(newly_removed_snaps);
276 newly_removed_snaps.clear();
277 }
278 } else {
279 newly_removed_snaps.clear();
280 }
281 } else {
282 /* 1) map->get_epoch() == cached_epoch + 1 &&
283 * 2) pi->get_snap_epoch() != map->get_epoch()
284 *
285      * Since we did not take the branch above, 1 && 2 must be true.  From 2, we
286 * this map didn't change the set of removed snaps. From 1, we
287 * know that our cached_removed_snaps matches the previous map.
288 * Thus, from 1 && 2, cached_removed snaps matches the current
289 * set of removed snaps and all we have to do is clear
290 * newly_removed_snaps.
291 */
292 newly_removed_snaps.clear();
293 }
294 lgeneric_subdout(cct, osd, 20)
295 << "PGPool::update cached_removed_snaps "
296 << cached_removed_snaps
297 << " newly_removed_snaps "
298 << newly_removed_snaps
299 << " snapc " << snapc
300 << (updated ? " (updated)":" (no change)")
301 << dendl;
302 if (cct->_conf->osd_debug_verify_cached_snaps) {
303 interval_set<snapid_t> actual_removed_snaps;
304 pi->build_removed_snaps(actual_removed_snaps);
305 if (!(actual_removed_snaps == cached_removed_snaps)) {
306 lgeneric_derr(cct) << __func__
307 << ": mismatch between the actual removed snaps "
308 << actual_removed_snaps
309                        << " and pool.cached_removed_snaps "
310                        << cached_removed_snaps
311 << dendl;
312 }
313 ceph_assert(actual_removed_snaps == cached_removed_snaps);
314 }
315 }
316 if (info.is_pool_snaps_mode() && updated) {
317 snapc = pi->get_snap_context();
318 }
319 cached_epoch = map->get_epoch();
320 }
321
322 PG::PG(OSDService *o, OSDMapRef curmap,
323 const PGPool &_pool, spg_t p) :
324 pg_id(p),
325 coll(p),
326 osd(o),
327 cct(o->cct),
328 osdmap_ref(curmap),
329 pool(_pool),
330 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
331 snap_mapper(
332 cct,
333 &osdriver,
334 p.ps(),
335 p.get_split_bits(_pool.info.get_pg_num()),
336 _pool.id,
337 p.shard),
338 last_persisted_osdmap(curmap->get_epoch()),
339 deleting(false),
340 trace_endpoint("0.0.0.0", 0, "PG"),
341 dirty_info(false), dirty_big_info(false),
342 info(p),
343 info_struct_v(0),
344 pg_log(cct),
345 pgmeta_oid(p.make_pgmeta_oid()),
346 missing_loc(this),
347 stat_queue_item(this),
348 scrub_queued(false),
349 recovery_queued(false),
350 recovery_ops_active(0),
351 role(-1),
352 state(0),
353 send_notify(false),
354 pg_whoami(osd->whoami, p.shard),
355 need_up_thru(false),
356 last_peering_reset(0),
357 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
358 backfill_reserved(false),
359 backfill_reserving(false),
360 flushes_in_progress(0),
361 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
362 pg_stats_publish_valid(false),
363 finish_sync_event(NULL),
364 backoff_lock("PG::backoff_lock"),
365 scrub_after_recovery(false),
366 active_pushes(0),
367 recovery_state(this),
368 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
369 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
370 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
371 last_epoch(0),
372 last_require_osd_release(curmap->require_osd_release)
373 {
374 #ifdef PG_DEBUG_REFS
375 osd->add_pgid(p, this);
376 #endif
377 #ifdef WITH_BLKIN
378 std::stringstream ss;
379 ss << "PG " << info.pgid;
380 trace_endpoint.copy_name(ss.str());
381 #endif
382 }
383
384 PG::~PG()
385 {
386 pgstate_history.set_pg_in_destructor();
387 #ifdef PG_DEBUG_REFS
388 osd->remove_pgid(info.pgid, this);
389 #endif
390 }
391
392 void PG::lock(bool no_lockdep) const
393 {
394 _lock.Lock(no_lockdep);
395 // if we have unrecorded dirty state with the lock dropped, there is a bug
396 ceph_assert(!dirty_info);
397 ceph_assert(!dirty_big_info);
398
399 dout(30) << "lock" << dendl;
400 }
401
402 std::ostream& PG::gen_prefix(std::ostream& out) const
403 {
404 OSDMapRef mapref = osdmap_ref;
405 if (_lock.is_locked_by_me()) {
406 out << "osd." << osd->whoami
407 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
408 << " " << *this << " ";
409 } else {
410 out << "osd." << osd->whoami
411 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
412 << " pg[" << info.pgid << "(unlocked)] ";
413 }
414 return out;
415 }
416
417 /********* PG **********/
418
419 void PG::proc_master_log(
420 ObjectStore::Transaction& t, pg_info_t &oinfo,
421 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
422 {
423 dout(10) << "proc_master_log for osd." << from << ": "
424 << olog << " " << omissing << dendl;
425 ceph_assert(!is_peered() && is_primary());
426
427 // merge log into our own log to build master log. no need to
428 // make any adjustments to their missing map; we are taking their
429   // log to be authoritative (i.e., their entries are by definition
430 // non-divergent).
431 merge_log(t, oinfo, olog, from);
432 peer_info[from] = oinfo;
433 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
434 might_have_unfound.insert(from);
435
436 // See doc/dev/osd_internals/last_epoch_started
437 if (oinfo.last_epoch_started > info.last_epoch_started) {
438 info.last_epoch_started = oinfo.last_epoch_started;
439 dirty_info = true;
440 }
441 if (oinfo.last_interval_started > info.last_interval_started) {
442 info.last_interval_started = oinfo.last_interval_started;
443 dirty_info = true;
444 }
445 update_history(oinfo.history);
446 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
447 info.last_epoch_started >= info.history.last_epoch_started);
448
449 peer_missing[from].claim(omissing);
450 }
451
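// Unlike proc_master_log(), a replica's log is not merged into ours; it is
// only compared against our (authoritative) log so the replica's info and
// missing set can be adjusted for divergent entries before being recorded in
// peer_info/peer_missing.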
452 void PG::proc_replica_log(
453 pg_info_t &oinfo,
454 const pg_log_t &olog,
455 pg_missing_t& omissing,
456 pg_shard_t from)
457 {
458 dout(10) << "proc_replica_log for osd." << from << ": "
459 << oinfo << " " << olog << " " << omissing << dendl;
460
461 pg_log.proc_replica_log(oinfo, olog, omissing, from);
462
463 peer_info[from] = oinfo;
464 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
465 might_have_unfound.insert(from);
466
467 for (map<hobject_t, pg_missing_item>::const_iterator i =
468 omissing.get_items().begin();
469 i != omissing.get_items().end();
470 ++i) {
471 dout(20) << " after missing " << i->first << " need " << i->second.need
472 << " have " << i->second.have << dendl;
473 }
474 peer_missing[from].claim(omissing);
475 }
476
477 bool PG::proc_replica_info(
478 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
479 {
480 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
481 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
482 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
483 return false;
484 }
485
486 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
487 dout(10) << " got info " << oinfo << " from down osd." << from
488 << " discarding" << dendl;
489 return false;
490 }
491
492 dout(10) << " got osd." << from << " " << oinfo << dendl;
493 ceph_assert(is_primary());
494 peer_info[from] = oinfo;
495 might_have_unfound.insert(from);
496
497 update_history(oinfo.history);
498
499 // stray?
500 if (!is_up(from) && !is_acting(from)) {
501 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
502 stray_set.insert(from);
503 if (is_clean()) {
504 purge_strays();
505 }
506 }
507
508 // was this a new info? if so, update peers!
509 if (p == peer_info.end())
510 update_heartbeat_peers();
511
512 return true;
513 }
514
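// The helpers below keep the SnapMapper (the object<->snap index persisted via
// osdriver) in sync with object removals and snapshot updates.  Only clones
// (snap < CEPH_MAXSNAP) are tracked, and a missing mapping (-ENOENT) is
// tolerated on removal.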
515 void PG::remove_snap_mapped_object(
516 ObjectStore::Transaction &t, const hobject_t &soid)
517 {
518 t.remove(
519 coll,
520 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
521 clear_object_snap_mapping(&t, soid);
522 }
523
524 void PG::clear_object_snap_mapping(
525 ObjectStore::Transaction *t, const hobject_t &soid)
526 {
527 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
528 if (soid.snap < CEPH_MAXSNAP) {
529 int r = snap_mapper.remove_oid(
530 soid,
531 &_t);
532 if (!(r == 0 || r == -ENOENT)) {
533 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
534 ceph_abort();
535 }
536 }
537 }
538
539 void PG::update_object_snap_mapping(
540 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
541 {
542 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
543 ceph_assert(soid.snap < CEPH_MAXSNAP);
544 int r = snap_mapper.remove_oid(
545 soid,
546 &_t);
547 if (!(r == 0 || r == -ENOENT)) {
548 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
549 ceph_abort();
550 }
551 snap_mapper.add_oid(
552 soid,
553 snaps,
554 &_t);
555 }
556
557 void PG::merge_log(
558 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
559 {
560 PGLogEntryHandler rollbacker{this, &t};
561 pg_log.merge_log(
562 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
563 }
564
565 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
566 {
567 PGLogEntryHandler rollbacker{this, &t};
568 pg_log.rewind_divergent_log(
569 newhead, info, &rollbacker, dirty_info, dirty_big_info);
570 }
571
572 /*
573 * Process information from a replica to determine if it could have any
574 * objects that i need.
575 *
576 * TODO: if the missing set becomes very large, this could get expensive.
577 * Instead, we probably want to just iterate over our unfound set.
578 */
579 bool PG::search_for_missing(
580 const pg_info_t &oinfo, const pg_missing_t &omissing,
581 pg_shard_t from,
582 RecoveryCtx *ctx)
583 {
584 uint64_t num_unfound_before = missing_loc.num_unfound();
585 bool found_missing = missing_loc.add_source_info(
586 from, oinfo, omissing, ctx->handle);
587 if (found_missing && num_unfound_before != missing_loc.num_unfound())
588 publish_stats_to_osd();
589   // avoid doing this if the peer is empty. This is a bit of paranoia
590 // to avoid doing something rash if add_source_info() above
591 // incorrectly decided we found something new. (if the peer has
592 // last_update=0'0 that's impossible.)
593 if (found_missing &&
594 oinfo.last_update != eversion_t()) {
595 pg_info_t tinfo(oinfo);
596 tinfo.pgid.shard = pg_whoami.shard;
597 (*(ctx->info_map))[from.osd].push_back(
598 make_pair(
599 pg_notify_t(
600 from.shard, pg_whoami.shard,
601 get_osdmap_epoch(),
602 get_osdmap_epoch(),
603 tinfo),
604 past_intervals));
605 }
606 return found_missing;
607 }
608
609
610 // MissingLoc
611
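// Can 'hoid' be served with only the shards in 'acting'?  An object that still
// needs recovery is readable iff enough of the shards known to hold it
// (missing_loc) are in the acting set to satisfy the backend's is_readable
// predicate; deleted objects and objects with no known location are not.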
612 bool PG::MissingLoc::readable_with_acting(
613 const hobject_t &hoid,
614 const set<pg_shard_t> &acting) const {
615 if (!needs_recovery(hoid))
616 return true;
617 if (is_deleted(hoid))
618 return false;
619 auto missing_loc_entry = missing_loc.find(hoid);
620 if (missing_loc_entry == missing_loc.end())
621 return false;
622 const set<pg_shard_t> &locs = missing_loc_entry->second;
623 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
624 set<pg_shard_t> have_acting;
625 for (set<pg_shard_t>::const_iterator i = locs.begin();
626 i != locs.end();
627 ++i) {
628 if (acting.count(*i))
629 have_acting.insert(*i);
630 }
631 return (*is_readable)(have_acting);
632 }
633
634 void PG::MissingLoc::add_batch_sources_info(
635 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
636 {
637 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
638 << sources.size() << dendl;
639 unsigned loop = 0;
640 bool sources_updated = false;
641 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
642 i != needs_recovery_map.end();
643 ++i) {
644 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
645 handle->reset_tp_timeout();
646 loop = 0;
647 }
648 if (i->second.is_delete())
649 continue;
650
651 auto p = missing_loc.find(i->first);
652 if (p == missing_loc.end()) {
653 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
654 } else {
655 _dec_count(p->second);
656 }
657 missing_loc[i->first].insert(sources.begin(), sources.end());
658 _inc_count(p->second);
659
660 if (!sources_updated) {
661 missing_loc_sources.insert(sources.begin(), sources.end());
662 sources_updated = true;
663 }
664 }
665 }
666
667 bool PG::MissingLoc::add_source_info(
668 pg_shard_t fromosd,
669 const pg_info_t &oinfo,
670 const pg_missing_t &omissing,
671 ThreadPool::TPHandle* handle)
672 {
673 bool found_missing = false;
674 unsigned loop = 0;
675 bool sources_updated = false;
676 // found items?
677 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
678 p != needs_recovery_map.end();
679 ++p) {
680 const hobject_t &soid(p->first);
681 eversion_t need = p->second.need;
682 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
683 handle->reset_tp_timeout();
684 loop = 0;
685 }
686 if (p->second.is_delete()) {
687 ldout(pg->cct, 10) << __func__ << " " << soid
688 << " delete, ignoring source" << dendl;
689 continue;
690 }
691 if (oinfo.last_update < need) {
692 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
693 << " also missing on osd." << fromosd
694 << " (last_update " << oinfo.last_update
695 << " < needed " << need << ")" << dendl;
696 continue;
697 }
698 if (!oinfo.last_backfill.is_max() &&
699 !oinfo.last_backfill_bitwise) {
700 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
701 << " also missing on osd." << fromosd
702 << " (last_backfill " << oinfo.last_backfill
703 << " but with wrong sort order)"
704 << dendl;
705 continue;
706 }
707 if (p->first >= oinfo.last_backfill) {
708 // FIXME: this is _probably_ true, although it could conceivably
709 // be in the undefined region! Hmm!
710 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
711 << " also missing on osd." << fromosd
712 << " (past last_backfill " << oinfo.last_backfill
713 << ")" << dendl;
714 continue;
715 }
716 if (omissing.is_missing(soid)) {
717 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
718 << " also missing on osd." << fromosd << dendl;
719 continue;
720 }
721
722 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
723 << " is on osd." << fromosd << dendl;
724
725 {
726 auto p = missing_loc.find(soid);
727 if (p == missing_loc.end()) {
728 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
729 } else {
730 _dec_count(p->second);
731 }
732 p->second.insert(fromosd);
733 _inc_count(p->second);
734 }
735
736 if (!sources_updated) {
737 missing_loc_sources.insert(fromosd);
738 sources_updated = true;
739 }
740 found_missing = true;
741 }
742
743 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
744 << dendl;
745 return found_missing;
746 }
747
748 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
749 {
750 set<pg_shard_t> now_down;
751 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
752 p != missing_loc_sources.end();
753 ) {
754 if (osdmap->is_up(p->osd)) {
755 ++p;
756 continue;
757 }
758 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
759 now_down.insert(*p);
760 missing_loc_sources.erase(p++);
761 }
762
763 if (now_down.empty()) {
764 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
765 } else {
766 ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
767 << missing_loc_sources << dendl;
768
769 // filter missing_loc
770 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
771 while (p != missing_loc.end()) {
772 set<pg_shard_t>::iterator q = p->second.begin();
773 bool changed = false;
774 while (q != p->second.end()) {
775 if (now_down.count(*q)) {
776 if (!changed) {
777 changed = true;
778 _dec_count(p->second);
779 }
780 p->second.erase(q++);
781 } else {
782 ++q;
783 }
784 }
785 if (p->second.empty()) {
786 missing_loc.erase(p++);
787 } else {
788 if (changed) {
789 _inc_count(p->second);
790 }
791 ++p;
792 }
793 }
794 }
795 }
796
797 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
798 {
799 auto &missing = pg_log.get_missing();
800 uint64_t unfound = get_num_unfound();
801
802 dout(10) << __func__ << " "
803 << missing.num_missing() << " missing, "
804 << unfound << " unfound"
805 << dendl;
806
807 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
808 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
809 for (; m != mend; ++m) {
810 pg_shard_t peer(*m);
811
812 if (!get_osdmap()->is_up(peer.osd)) {
813 dout(20) << __func__ << " skipping down osd." << peer << dendl;
814 continue;
815 }
816
817 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
818 if (iter != peer_info.end() &&
819 (iter->second.is_empty() || iter->second.dne())) {
820 // ignore empty peers
821 continue;
822 }
823
824 // If we've requested any of this stuff, the pg_missing_t information
825 // should be on its way.
826   // TODO: coalesce requested_* into a single data structure
827 if (peer_missing.find(peer) != peer_missing.end()) {
828 dout(20) << __func__ << ": osd." << peer
829 << ": we already have pg_missing_t" << dendl;
830 continue;
831 }
832 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
833 dout(20) << __func__ << ": osd." << peer
834 << ": in peer_log_requested" << dendl;
835 continue;
836 }
837 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
838 dout(20) << __func__ << ": osd." << peer
839 << ": in peer_missing_requested" << dendl;
840 continue;
841 }
842
843 // Request missing
844 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
845 << dendl;
846 peer_missing_requested.insert(peer);
847 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
848 pg_query_t(
849 pg_query_t::FULLLOG,
850 peer.shard, pg_whoami.shard,
851 info.history, get_osdmap_epoch());
852 }
853 }
854
855 /******* PG ***********/
856 bool PG::needs_recovery() const
857 {
858 ceph_assert(is_primary());
859
860 auto &missing = pg_log.get_missing();
861
862 if (missing.num_missing()) {
863 dout(10) << __func__ << " primary has " << missing.num_missing()
864 << " missing" << dendl;
865 return true;
866 }
867
868 ceph_assert(!acting_recovery_backfill.empty());
869 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
870 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
871 for (; a != end; ++a) {
872 if (*a == get_primary()) continue;
873 pg_shard_t peer = *a;
874 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
875 if (pm == peer_missing.end()) {
876 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
877 << dendl;
878 continue;
879 }
880 if (pm->second.num_missing()) {
881 dout(10) << __func__ << " osd." << peer << " has "
882 << pm->second.num_missing() << " missing" << dendl;
883 return true;
884 }
885 }
886
887 dout(10) << __func__ << " is recovered" << dendl;
888 return false;
889 }
890
891 bool PG::needs_backfill() const
892 {
893 ceph_assert(is_primary());
894
895 // We can assume that only possible osds that need backfill
896 // are on the backfill_targets vector nodes.
897 set<pg_shard_t>::const_iterator end = backfill_targets.end();
898 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
899 for (; a != end; ++a) {
900 pg_shard_t peer = *a;
901 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
902 if (!pi->second.last_backfill.is_max()) {
903 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
904 return true;
905 }
906 }
907
908 dout(10) << __func__ << " does not need backfill" << dendl;
909 return false;
910 }
911
912
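// Sanity-check that the stored past_intervals exactly cover the required
// bounds derived from our info and the oldest map we still have.  Mismatches
// are reported to the cluster log; a missing or misaligned interval set is
// fatal.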
913 void PG::check_past_interval_bounds() const
914 {
915 auto rpib = get_required_past_interval_bounds(
916 info,
917 osd->get_superblock().oldest_map);
918 if (rpib.first >= rpib.second) {
919 if (!past_intervals.empty()) {
920 osd->clog->error() << info.pgid << " required past_interval bounds are"
921 << " empty [" << rpib << ") but past_intervals is not: "
922 << past_intervals;
923 derr << info.pgid << " required past_interval bounds are"
924 << " empty [" << rpib << ") but past_intervals is not: "
925 << past_intervals << dendl;
926 }
927 } else {
928 if (past_intervals.empty()) {
929 osd->clog->error() << info.pgid << " required past_interval bounds are"
930 << " not empty [" << rpib << ") but past_intervals "
931 << past_intervals << " is empty";
932 derr << info.pgid << " required past_interval bounds are"
933 << " not empty [" << rpib << ") but past_intervals "
934 << past_intervals << " is empty" << dendl;
935 ceph_assert(!past_intervals.empty());
936 }
937
938 auto apib = past_intervals.get_bounds();
939 if (apib.first > rpib.first) {
940 osd->clog->error() << info.pgid << " past_intervals [" << apib
941 << ") start interval does not contain the required"
942 << " bound [" << rpib << ") start";
943 derr << info.pgid << " past_intervals [" << apib
944 << ") start interval does not contain the required"
945 << " bound [" << rpib << ") start" << dendl;
946 ceph_abort_msg("past_interval start interval mismatch");
947 }
948 if (apib.second != rpib.second) {
949     osd->clog->error() << info.pgid << " past_interval bound [" << apib
950 << ") end does not match required [" << rpib
951 << ") end";
952     derr << info.pgid << " past_interval bound [" << apib
953 << ") end does not match required [" << rpib
954 << ") end" << dendl;
955 ceph_abort_msg("past_interval end mismatch");
956 }
957 }
958 }
959
960 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
961 {
962 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
963 if (need_up_thru &&
964 up_thru >= info.history.same_interval_since) {
965 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
966 need_up_thru = false;
967 return true;
968 }
969 return false;
970 }
971
972 void PG::remove_down_peer_info(const OSDMapRef osdmap)
973 {
974 // Remove any downed osds from peer_info
975 bool removed = false;
976 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
977 while (p != peer_info.end()) {
978 if (!osdmap->is_up(p->first.osd)) {
979 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
980 peer_missing.erase(p->first);
981 peer_log_requested.erase(p->first);
982 peer_missing_requested.erase(p->first);
983 peer_purged.erase(p->first); // so we can re-purge if necessary
984 peer_info.erase(p++);
985 removed = true;
986 } else
987 ++p;
988 }
989
990 // if we removed anyone, update peers (which include peer_info)
991 if (removed)
992 update_heartbeat_peers();
993 check_recovery_sources(osdmap);
994 }
995
996 /*
997 * Returns true unless there is a non-lost OSD in might_have_unfound.
998 */
999 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
1000 {
1001 ceph_assert(is_primary());
1002
1003 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1004 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1005 for (; peer != mend; ++peer) {
1006 if (peer_missing.count(*peer))
1007 continue;
1008 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1009 if (iter != peer_info.end() &&
1010 (iter->second.is_empty() || iter->second.dne()))
1011 continue;
1012 if (!osdmap->exists(peer->osd))
1013 continue;
1014 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1015 if (osd_info.lost_at <= osd_info.up_from) {
1016 // If there is even one OSD in might_have_unfound that isn't lost, we
1017 // still might retrieve our unfound.
1018 return false;
1019 }
1020 }
1021 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
1022 << " have been queried or are marked lost" << dendl;
1023 return true;
1024 }
1025
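// Compute the set of OSDs (prior.probe) that must be queried before peering
// can proceed, based on past intervals.  Flags the PG as DOWN if peering is
// blocked by down OSDs (prior.pg_down), and decides whether we must wait for
// the monitor to record a newer up_thru for us.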
1026 PastIntervals::PriorSet PG::build_prior()
1027 {
1028 if (1) {
1029 // sanity check
1030 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1031 it != peer_info.end();
1032 ++it) {
1033 ceph_assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
1034 }
1035 }
1036
1037 const OSDMap &osdmap = *get_osdmap();
1038 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1039 pool.info.is_erasure(),
1040 info.history.last_epoch_started,
1041 get_pgbackend()->get_is_recoverable_predicate(),
1042 [&](epoch_t start, int osd, epoch_t *lost_at) {
1043 const osd_info_t *pinfo = 0;
1044 if (osdmap.exists(osd)) {
1045 pinfo = &osdmap.get_info(osd);
1046 if (lost_at)
1047 *lost_at = pinfo->lost_at;
1048 }
1049
1050 if (osdmap.is_up(osd)) {
1051 return PastIntervals::UP;
1052 } else if (!pinfo) {
1053 return PastIntervals::DNE;
1054 } else if (pinfo->lost_at > start) {
1055 return PastIntervals::LOST;
1056 } else {
1057 return PastIntervals::DOWN;
1058 }
1059 },
1060 up,
1061 acting,
1062 this);
1063
1064 if (prior.pg_down) {
1065 state_set(PG_STATE_DOWN);
1066 }
1067
1068 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1069 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1070 << " < same_since " << info.history.same_interval_since
1071 << ", must notify monitor" << dendl;
1072 need_up_thru = true;
1073 } else {
1074 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1075 << " >= same_since " << info.history.same_interval_since
1076 << ", all is well" << dendl;
1077 need_up_thru = false;
1078 }
1079 set_probe_targets(prior.probe);
1080 return prior;
1081 }
1082
1083 void PG::clear_primary_state()
1084 {
1085 dout(10) << "clear_primary_state" << dendl;
1086
1087 // clear peering state
1088 stray_set.clear();
1089 peer_log_requested.clear();
1090 peer_missing_requested.clear();
1091 peer_info.clear();
1092 peer_bytes.clear();
1093 peer_missing.clear();
1094 need_up_thru = false;
1095 peer_last_complete_ondisk.clear();
1096 peer_activated.clear();
1097 min_last_complete_ondisk = eversion_t();
1098 pg_trim_to = eversion_t();
1099 might_have_unfound.clear();
1100 projected_log = PGLog::IndexedLog();
1101
1102 last_update_ondisk = eversion_t();
1103
1104 snap_trimq.clear();
1105
1106 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1107
1108 missing_loc.clear();
1109
1110 release_pg_backoffs();
1111
1112 pg_log.reset_recovery_pointers();
1113
1114 scrubber.reserved_peers.clear();
1115 scrub_after_recovery = false;
1116
1117 agent_clear();
1118 }
1119
1120 PG::Scrubber::Scrubber()
1121 : reserved(false), reserve_failed(false),
1122 epoch_start(0),
1123 active(false),
1124 shallow_errors(0), deep_errors(0), fixed(0),
1125 must_scrub(false), must_deep_scrub(false), must_repair(false),
1126 need_auto(false), time_for_deep(false),
1127 auto_repair(false),
1128 check_repair(false),
1129 deep_scrub_on_error(false),
1130 num_digest_updates_pending(0),
1131 state(INACTIVE),
1132 deep(false)
1133 {}
1134
1135 PG::Scrubber::~Scrubber() {}
1136
1137 /**
1138 * find_best_info
1139 *
1140 * Returns an iterator to the best info in infos sorted by:
1141  * 1) Prefer newer last_update (prefer the oldest last_update for ec pools)
1142 * 2) Prefer longer tail if it brings another info into contiguity
1143 * 3) Prefer current primary
1144 */
1145 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1146 const map<pg_shard_t, pg_info_t> &infos,
1147 bool restrict_to_up_acting,
1148 bool *history_les_bound) const
1149 {
1150 ceph_assert(history_les_bound);
1151 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1152 * to make changes to this process. Also, make sure to update it
1153 * when you find bugs! */
1154 eversion_t min_last_update_acceptable = eversion_t::max();
1155 epoch_t max_last_epoch_started_found = 0;
1156 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1157 i != infos.end();
1158 ++i) {
1159 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1160 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1161 *history_les_bound = true;
1162 max_last_epoch_started_found = i->second.history.last_epoch_started;
1163 }
1164 if (!i->second.is_incomplete() &&
1165 max_last_epoch_started_found < i->second.last_epoch_started) {
1166 *history_les_bound = false;
1167 max_last_epoch_started_found = i->second.last_epoch_started;
1168 }
1169 }
1170 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1171 i != infos.end();
1172 ++i) {
1173 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1174 if (min_last_update_acceptable > i->second.last_update)
1175 min_last_update_acceptable = i->second.last_update;
1176 }
1177 }
1178 if (min_last_update_acceptable == eversion_t::max())
1179 return infos.end();
1180
1181 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1182 // find osd with newest last_update (oldest for ec_pool).
1183 // if there are multiples, prefer
1184 // - a longer tail, if it brings another peer into log contiguity
1185 // - the current primary
1186 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1187 p != infos.end();
1188 ++p) {
1189 if (restrict_to_up_acting && !is_up(p->first) &&
1190 !is_acting(p->first))
1191 continue;
1192 // Only consider peers with last_update >= min_last_update_acceptable
1193 if (p->second.last_update < min_last_update_acceptable)
1194 continue;
1195 // Disqualify anyone with a too old last_epoch_started
1196 if (p->second.last_epoch_started < max_last_epoch_started_found)
1197 continue;
1198 // Disqualify anyone who is incomplete (not fully backfilled)
1199 if (p->second.is_incomplete())
1200 continue;
1201 if (best == infos.end()) {
1202 best = p;
1203 continue;
1204 }
1205 // Prefer newer last_update
1206 if (pool.info.require_rollback()) {
1207 if (p->second.last_update > best->second.last_update)
1208 continue;
1209 if (p->second.last_update < best->second.last_update) {
1210 best = p;
1211 continue;
1212 }
1213 } else {
1214 if (p->second.last_update < best->second.last_update)
1215 continue;
1216 if (p->second.last_update > best->second.last_update) {
1217 best = p;
1218 continue;
1219 }
1220 }
1221
1222 // Prefer longer tail
1223 if (p->second.log_tail > best->second.log_tail) {
1224 continue;
1225 } else if (p->second.log_tail < best->second.log_tail) {
1226 best = p;
1227 continue;
1228 }
1229
1230 if (!p->second.has_missing() && best->second.has_missing()) {
1231 dout(10) << __func__ << " prefer osd." << p->first
1232 << " because it is complete while best has missing"
1233 << dendl;
1234 best = p;
1235 continue;
1236 } else if (p->second.has_missing() && !best->second.has_missing()) {
1237 dout(10) << __func__ << " skipping osd." << p->first
1238 << " because it has missing while best is complete"
1239 << dendl;
1240 continue;
1241 } else {
1242 // both are complete or have missing
1243 // fall through
1244 }
1245
1246 // prefer current primary (usually the caller), all things being equal
1247 if (p->first == pg_whoami) {
1248 dout(10) << "calc_acting prefer osd." << p->first
1249 << " because it is current primary" << dendl;
1250 best = p;
1251 continue;
1252 }
1253 }
1254 return best;
1255 }
1256
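// For erasure-coded pools the acting set is positional: for each shard
// position i, prefer up[i] if it is complete and its log overlaps the
// authoritative log; otherwise schedule up[i] for backfill and fall back to
// acting[i], or (when not restricted to up/acting) to any other peer holding
// that shard with a sufficiently recent log.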
1257 void PG::calc_ec_acting(
1258 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1259 unsigned size,
1260 const vector<int> &acting,
1261 const vector<int> &up,
1262 const map<pg_shard_t, pg_info_t> &all_info,
1263 bool restrict_to_up_acting,
1264 vector<int> *_want,
1265 set<pg_shard_t> *backfill,
1266 set<pg_shard_t> *acting_backfill,
1267 ostream &ss)
1268 {
1269 vector<int> want(size, CRUSH_ITEM_NONE);
1270 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1271 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1272 i != all_info.end();
1273 ++i) {
1274 all_info_by_shard[i->first.shard].insert(i->first);
1275 }
1276 for (uint8_t i = 0; i < want.size(); ++i) {
1277 ss << "For position " << (unsigned)i << ": ";
1278 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1279 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1280 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1281 auth_log_shard->second.log_tail) {
1282 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1283 want[i] = up[i];
1284 continue;
1285 }
1286 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1287 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1288 << " and ";
1289 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1290 }
1291
1292 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1293 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1294 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1295 auth_log_shard->second.log_tail) {
1296 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1297 want[i] = acting[i];
1298 } else if (!restrict_to_up_acting) {
1299 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1300 j != all_info_by_shard[shard_id_t(i)].end();
1301 ++j) {
1302 ceph_assert(j->shard == i);
1303 if (!all_info.find(*j)->second.is_incomplete() &&
1304 all_info.find(*j)->second.last_update >=
1305 auth_log_shard->second.log_tail) {
1306 ss << " selecting stray: " << *j << std::endl;
1307 want[i] = j->osd;
1308 break;
1309 }
1310 }
1311 if (want[i] == CRUSH_ITEM_NONE)
1312 ss << " failed to fill position " << (int)i << std::endl;
1313 }
1314 }
1315
1316 for (uint8_t i = 0; i < want.size(); ++i) {
1317 if (want[i] != CRUSH_ITEM_NONE) {
1318 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1319 }
1320 }
1321 acting_backfill->insert(backfill->begin(), backfill->end());
1322 _want->swap(want);
1323 }
1324
1325 /**
1326 * calculate the desired acting set.
1327 *
1328 * Choose an appropriate acting set. Prefer up[0], unless it is
1329 * incomplete, or another osd has a longer tail that allows us to
1330 * bring other up nodes up to date.
1331 */
1332 void PG::calc_replicated_acting(
1333 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1334 uint64_t force_auth_primary_missing_objects,
1335 unsigned size,
1336 const vector<int> &acting,
1337 const vector<int> &up,
1338 pg_shard_t up_primary,
1339 const map<pg_shard_t, pg_info_t> &all_info,
1340 bool restrict_to_up_acting,
1341 vector<int> *want,
1342 set<pg_shard_t> *backfill,
1343 set<pg_shard_t> *acting_backfill,
1344 const OSDMapRef osdmap,
1345 ostream &ss)
1346 {
1347 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1348
1349 ss << __func__ << " newest update on osd." << auth_log_shard_id
1350 << " with " << auth_log_shard->second
1351 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1352
1353 // select primary
1354 auto primary = all_info.find(up_primary);
1355 if (up.size() &&
1356 !primary->second.is_incomplete() &&
1357 primary->second.last_update >=
1358 auth_log_shard->second.log_tail) {
1359 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1360 auto approx_missing_objects =
1361 primary->second.stats.stats.sum.num_objects_missing;
1362 auto auth_version = auth_log_shard->second.last_update.version;
1363 auto primary_version = primary->second.last_update.version;
1364 if (auth_version > primary_version) {
1365 approx_missing_objects += auth_version - primary_version;
1366 } else {
1367 approx_missing_objects += primary_version - auth_version;
1368 }
1369 if ((uint64_t)approx_missing_objects >
1370 force_auth_primary_missing_objects) {
1371 primary = auth_log_shard;
1372         ss << "up_primary: " << up_primary << " has approximate "
1373 << approx_missing_objects
1374 << "(>" << force_auth_primary_missing_objects <<") "
1375 << "missing objects, osd." << auth_log_shard_id
1376 << " selected as primary instead"
1377 << std::endl;
1378 } else {
1379         ss << "up_primary: " << up_primary << " selected as primary"
1380 << std::endl;
1381 }
1382 } else {
1383       ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
1384 }
1385 } else {
1386 ceph_assert(!auth_log_shard->second.is_incomplete());
1387 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1388 << " selected as primary instead" << std::endl;
1389 primary = auth_log_shard;
1390 }
1391
1392 ss << __func__ << " primary is osd." << primary->first
1393 << " with " << primary->second << std::endl;
1394 want->push_back(primary->first.osd);
1395 acting_backfill->insert(primary->first);
1396
1397 /* We include auth_log_shard->second.log_tail because in GetLog,
1398 * we will request logs back to the min last_update over our
1399 * acting_backfill set, which will result in our log being extended
1400 * as far backwards as necessary to pick up any peers which can
1401 * be log recovered by auth_log_shard's log */
1402 eversion_t oldest_auth_log_entry =
1403 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1404
1405 // select replicas that have log contiguity with primary.
1406 // prefer up, then acting, then any peer_info osds
1407 for (auto i : up) {
1408 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1409 if (up_cand == primary->first)
1410 continue;
1411 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1412 if (cur_info.is_incomplete() ||
1413 cur_info.last_update < oldest_auth_log_entry) {
1414 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1415 backfill->insert(up_cand);
1416 acting_backfill->insert(up_cand);
1417 } else {
1418 want->push_back(i);
1419 acting_backfill->insert(up_cand);
1420 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1421 }
1422 if (want->size() >= size) {
1423 break;
1424 }
1425 }
1426
1427 if (want->size() >= size) {
1428 return;
1429 }
1430
1431 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1432 candidate_by_last_update.reserve(acting.size());
1433 // This no longer has backfill OSDs, but they are covered above.
1434 for (auto i : acting) {
1435 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1436 // skip up osds we already considered above
1437 if (acting_cand == primary->first)
1438 continue;
1439 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1440 if (up_it != up.end())
1441 continue;
1442
1443 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1444 if (cur_info.is_incomplete() ||
1445 cur_info.last_update < oldest_auth_log_entry) {
1446 ss << " shard " << acting_cand << " (acting) REJECTED "
1447 << cur_info << std::endl;
1448 } else {
1449 candidate_by_last_update.push_back(make_pair(cur_info.last_update, i));
1450 }
1451 }
1452
1453 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1454 const std::pair<eversion_t, int> &rhs) {
1455 return lhs.first > rhs.first;
1456 };
1457 // sort by last_update, in descending order.
1458 std::sort(candidate_by_last_update.begin(),
1459 candidate_by_last_update.end(), sort_by_eversion);
1460 for (auto &p: candidate_by_last_update) {
1461 ceph_assert(want->size() < size);
1462 want->push_back(p.second);
1463 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1464 acting_backfill->insert(s);
1465 ss << " shard " << s << " (acting) accepted "
1466 << all_info.find(s)->second << std::endl;
1467 if (want->size() >= size) {
1468 return;
1469 }
1470 }
1471
1472 if (restrict_to_up_acting) {
1473 return;
1474 }
1475 candidate_by_last_update.clear();
1476 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1477 // continue to search stray to find more suitable peers
1478 for (auto &i : all_info) {
1479 // skip up osds we already considered above
1480 if (i.first == primary->first)
1481 continue;
1482 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1483 if (up_it != up.end())
1484 continue;
1485 vector<int>::const_iterator acting_it = find(
1486 acting.begin(), acting.end(), i.first.osd);
1487 if (acting_it != acting.end())
1488 continue;
1489
1490 if (i.second.is_incomplete() ||
1491 i.second.last_update < oldest_auth_log_entry) {
1492 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1493 << std::endl;
1494 } else {
1495 candidate_by_last_update.push_back(
1496 make_pair(i.second.last_update, i.first.osd));
1497 }
1498 }
1499
1500 if (candidate_by_last_update.empty()) {
1501 // save us some effort
1502 return;
1503 }
1504
1505 // sort by last_update, in descending order.
1506 std::sort(candidate_by_last_update.begin(),
1507 candidate_by_last_update.end(), sort_by_eversion);
1508
1509 for (auto &p: candidate_by_last_update) {
1510 ceph_assert(want->size() < size);
1511 want->push_back(p.second);
1512 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1513 acting_backfill->insert(s);
1514 ss << " shard " << s << " (stray) accepted "
1515 << all_info.find(s)->second << std::endl;
1516 if (want->size() >= size) {
1517 return;
1518 }
1519 }
1520 }
1521
1522 bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
1523 {
1524 unsigned num_want_acting = 0;
1525 set<pg_shard_t> have;
1526 for (int i = 0; i < (int)want.size(); ++i) {
1527 if (want[i] != CRUSH_ITEM_NONE) {
1528 ++num_want_acting;
1529 have.insert(
1530 pg_shard_t(
1531 want[i],
1532 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1533 }
1534 }
1535 // We go incomplete if below min_size for ec_pools since backfill
1536 // does not currently maintain rollbackability
1537 // Otherwise, we will go "peered", but not "active"
1538 if (num_want_acting < pool.info.min_size &&
1539 (pool.info.is_erasure() ||
1540 !cct->_conf->osd_allow_recovery_below_min_size)) {
1541 dout(10) << __func__ << " failed, below min size" << dendl;
1542 return false;
1543 }
1544
1545 /* Check whether we have enough acting shards to later perform recovery */
1546 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1547 get_pgbackend()->get_is_recoverable_predicate());
1548 if (!(*recoverable_predicate)(have)) {
1549 dout(10) << __func__ << " failed, not recoverable" << dendl;
1550 return false;
1551 }
1552
1553 return true;
1554 }
1555
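// Async recovery target selection: a candidate's recovery cost is estimated as
// its reported missing-object count plus the difference between its
// last_update and the authoritative last_update (pre-nautilus peers: log-length
// difference only).  Candidates whose cost exceeds osd_async_recovery_min_cost
// are removed from the acting set and recovered asynchronously, provided the
// remaining set stays recoverable and at or above min_size.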
1556 void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1557 const pg_info_t &auth_info,
1558 vector<int> *want,
1559 set<pg_shard_t> *async_recovery,
1560 const OSDMapRef osdmap) const
1561 {
1562 set<pair<int, pg_shard_t> > candidates_by_cost;
1563 for (uint8_t i = 0; i < want->size(); ++i) {
1564 if ((*want)[i] == CRUSH_ITEM_NONE)
1565 continue;
1566
1567 // Considering log entries to recover is accurate enough for
1568 // now. We could use minimum_to_decode_with_cost() later if
1569 // necessary.
1570 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1571 // do not include strays
1572 if (stray_set.find(shard_i) != stray_set.end())
1573 continue;
1574 // Do not include an osd that is not up, since choosing it as
1575 // an async_recovery_target will move it out of the acting set.
1576 // This results in it being identified as a stray during peering,
1577 // because it is no longer in the up or acting set.
1578 if (!is_up(shard_i))
1579 continue;
1580 auto shard_info = all_info.find(shard_i)->second;
1581 // for ec pools we rollback all entries past the authoritative
1582 // last_update *before* activation. This is relatively inexpensive
1583 // compared to recovery, since it is purely local, so treat shards
1584 // past the authoritative last_update the same as those equal to it.
1585 version_t auth_version = auth_info.last_update.version;
1586 version_t candidate_version = shard_info.last_update.version;
1587 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1588 auto approx_missing_objects =
1589 shard_info.stats.stats.sum.num_objects_missing;
1590 if (auth_version > candidate_version) {
1591 approx_missing_objects += auth_version - candidate_version;
1592 }
1593 if (static_cast<uint64_t>(approx_missing_objects) >
1594 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1595 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1596 }
1597 } else {
1598 if (auth_version > candidate_version &&
1599 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1600 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1601 }
1602 }
1603 }
1604
1605 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1606 << dendl;
1607
1608 // take out as many osds as we can for async recovery, in order of cost
1609 for (auto rit = candidates_by_cost.rbegin();
1610 rit != candidates_by_cost.rend(); ++rit) {
1611 pg_shard_t cur_shard = rit->second;
1612 vector<int> candidate_want(*want);
1613 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1614 if (recoverable_and_ge_min_size(candidate_want)) {
1615 want->swap(candidate_want);
1616 async_recovery->insert(cur_shard);
1617 }
1618 }
1619 dout(20) << __func__ << " result want=" << *want
1620 << " async_recovery=" << *async_recovery << dendl;
1621 }
1622
1623 void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1624 const pg_info_t &auth_info,
1625 vector<int> *want,
1626 set<pg_shard_t> *async_recovery,
1627 const OSDMapRef osdmap) const
1628 {
1629 set<pair<int, pg_shard_t> > candidates_by_cost;
1630 for (auto osd_num : *want) {
1631 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1632 // do not include strays
1633 if (stray_set.find(shard_i) != stray_set.end())
1634 continue;
1635 // Do not include an osd that is not up, since choosing it as
1636 // an async_recovery_target will move it out of the acting set.
1637 // This results in it being identified as a stray during peering,
1638 // because it is no longer in the up or acting set.
1639 if (!is_up(shard_i))
1640 continue;
1641 auto shard_info = all_info.find(shard_i)->second;
1642 // use the approximate magnitude of the difference in length of
1643 // logs plus historical missing objects as the cost of recovery
1644 version_t auth_version = auth_info.last_update.version;
1645 version_t candidate_version = shard_info.last_update.version;
1646 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1647 auto approx_missing_objects =
1648 shard_info.stats.stats.sum.num_objects_missing;
1649 if (auth_version > candidate_version) {
1650 approx_missing_objects += auth_version - candidate_version;
1651 } else {
1652 approx_missing_objects += candidate_version - auth_version;
1653 }
1654 if (static_cast<uint64_t>(approx_missing_objects) >
1655 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1656 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1657 }
1658 } else {
1659 size_t approx_entries;
1660 if (auth_version > candidate_version) {
1661 approx_entries = auth_version - candidate_version;
1662 } else {
1663 approx_entries = candidate_version - auth_version;
1664 }
1665 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1666 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1667 }
1668 }
1669 }
1670
1671 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1672 << dendl;
1673 // take out as many osds as we can for async recovery, in order of cost
1674 for (auto rit = candidates_by_cost.rbegin();
1675 rit != candidates_by_cost.rend(); ++rit) {
1676 if (want->size() <= pool.info.min_size) {
1677 break;
1678 }
1679 pg_shard_t cur_shard = rit->second;
1680 vector<int> candidate_want(*want);
1681 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1682 if (*it == cur_shard.osd) {
1683 candidate_want.erase(it);
1684 want->swap(candidate_want);
1685 async_recovery->insert(cur_shard);
1686 break;
1687 }
1688 }
1689 }
1690 dout(20) << __func__ << " result want=" << *want
1691 << " async_recovery=" << *async_recovery << dendl;
1692 }
1693
1694 /**
1695 * choose acting
1696 *
1697 * calculate the desired acting, and request a change with the monitor
1698 * if it differs from the current acting.
1699 *
1700 * if restrict_to_up_acting=true, we filter out anything that's not in
1701 * up/acting. in order to lift this restriction, we need to
1702 * 1) check whether it's worth switching the acting set any time we get
1703 * a new pg info (not just here, when recovery finishes)
1704 * 2) check whether anything in want_acting went down on each new map
1705 * (and, if so, calculate a new want_acting)
1706 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1707 * TODO!
1708 */
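// Flow: find the authoritative log shard with find_best_info(), build the
// desired acting/backfill sets (replicated or EC variant), verify the result
// is recoverable and large enough, optionally carve out async recovery
// targets, and queue a pg_temp request if the desired set differs from the
// current acting set.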
1709 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1710 bool restrict_to_up_acting,
1711 bool *history_les_bound)
1712 {
1713 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1714 all_info[pg_whoami] = info;
1715
1716 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
1717 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1718 p != all_info.end();
1719 ++p) {
1720 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1721 }
1722 }
1723
1724 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1725 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1726
1727 if (auth_log_shard == all_info.end()) {
1728 if (up != acting) {
1729 dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
1730 << " reverting to up" << dendl;
1731 want_acting = up;
1732 vector<int> empty;
1733 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1734 } else {
1735 dout(10) << __func__ << " failed" << dendl;
1736 ceph_assert(want_acting.empty());
1737 }
1738 return false;
1739 }
1740
1741 ceph_assert(!auth_log_shard->second.is_incomplete());
1742 auth_log_shard_id = auth_log_shard->first;
1743
1744 set<pg_shard_t> want_backfill, want_acting_backfill;
1745 vector<int> want;
1746 stringstream ss;
1747 if (!pool.info.is_erasure())
1748 calc_replicated_acting(
1749 auth_log_shard,
1750 cct->_conf.get_val<uint64_t>(
1751 "osd_force_auth_primary_missing_objects"),
1752 get_osdmap()->get_pg_size(info.pgid.pgid),
1753 acting,
1754 up,
1755 up_primary,
1756 all_info,
1757 restrict_to_up_acting,
1758 &want,
1759 &want_backfill,
1760 &want_acting_backfill,
1761 get_osdmap(),
1762 ss);
1763 else
1764 calc_ec_acting(
1765 auth_log_shard,
1766 get_osdmap()->get_pg_size(info.pgid.pgid),
1767 acting,
1768 up,
1769 all_info,
1770 restrict_to_up_acting,
1771 &want,
1772 &want_backfill,
1773 &want_acting_backfill,
1774 ss);
1775 dout(10) << ss.str() << dendl;
1776
1777 if (!recoverable_and_ge_min_size(want)) {
1778 want_acting.clear();
1779 return false;
1780 }
1781
1782 set<pg_shard_t> want_async_recovery;
1783 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
1784 if (pool.info.is_erasure()) {
1785 choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1786 } else {
1787 choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1788 }
1789 }
1790 if (want != acting) {
1791 dout(10) << __func__ << " want " << want << " != acting " << acting
1792 << ", requesting pg_temp change" << dendl;
1793 want_acting = want;
1794
1795 if (!cct->_conf->osd_debug_no_acting_change) {
1796 if (want_acting == up) {
1797 // There can't be any pending backfill if
1798 // want is the same as crush map up OSDs.
1799 ceph_assert(want_backfill.empty());
1800 vector<int> empty;
1801 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1802 } else
1803 osd->queue_want_pg_temp(info.pgid.pgid, want);
1804 }
1805 return false;
1806 }
1807 want_acting.clear();
1808 acting_recovery_backfill = want_acting_backfill;
1809 dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
1810 ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
1811 if (backfill_targets.empty()) {
1812 // Caller is GetInfo
1813 backfill_targets = want_backfill;
1814 }
1815 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
1816 ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
1817 if (async_recovery_targets.empty() || !needs_recovery()) {
1818 async_recovery_targets = want_async_recovery;
1819 }
1820 // Will not change if already set because up would have had to change
1821 // Verify that nothing in backfill is in stray_set
1822 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1823 i != want_backfill.end();
1824 ++i) {
1825 ceph_assert(stray_set.find(*i) == stray_set.end());
1826 }
1827 dout(10) << "choose_acting want=" << want << " backfill_targets="
1828 << want_backfill << " async_recovery_targets="
1829 << async_recovery_targets << dendl;
1830 return true;
1831 }
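
// Hedged usage sketch (hypothetical caller, not the actual peering state
// machine): a false return from choose_acting() means either that no
// authoritative info was found or that a pg_temp change was just
// requested, so a caller would typically wait for a newer osdmap.
//
//   pg_shard_t auth_shard;
//   bool les_bound = false;
//   if (!pg->choose_acting(auth_shard, false, &les_bound)) {
//     return;  // stay in the current state until the map changes
//   }
//   // on true: want_acting is clear and acting_recovery_backfill,
//   // backfill_targets and async_recovery_targets are committed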
1832
1833 /* Build the might_have_unfound set.
1834 *
1835 * This is used by the primary OSD during recovery.
1836 *
1837 * This set tracks the OSDs which might have unfound objects that the primary
1838 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1839 * will remove the OSD from the set.
1840 */
1841 void PG::build_might_have_unfound()
1842 {
1843 ceph_assert(might_have_unfound.empty());
1844 ceph_assert(is_primary());
1845
1846 dout(10) << __func__ << dendl;
1847
1848 check_past_interval_bounds();
1849
1850 might_have_unfound = past_intervals.get_might_have_unfound(
1851 pg_whoami,
1852 pool.info.is_erasure());
1853
1854 // include any (stray) peers
1855 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1856 p != peer_info.end();
1857 ++p)
1858 might_have_unfound.insert(p->first);
1859
1860 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1861 }
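
// Hedged sketch of how the set is consumed (hypothetical handler, not the
// actual recovery code): once a peer in might_have_unfound has reported
// its pg_missing_t, it no longer needs to be queried for unfound objects.
//
//   void on_peer_missing_received(pg_shard_t from) {
//     might_have_unfound.erase(from);  // one fewer place left to search
//   }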
1862
1863 void PG::activate(ObjectStore::Transaction& t,
1864 epoch_t activation_epoch,
1865 map<int, map<spg_t,pg_query_t> >& query_map,
1866 map<int,
1867 vector<
1868 pair<pg_notify_t,
1869 PastIntervals> > > *activator_map,
1870 RecoveryCtx *ctx)
1871 {
1872 ceph_assert(!is_peered());
1873 ceph_assert(scrubber.callbacks.empty());
1874 ceph_assert(callbacks_for_degraded_object.empty());
1875
1876 // twiddle pg state
1877 state_clear(PG_STATE_DOWN);
1878
1879 send_notify = false;
1880
1881 if (is_primary()) {
1882 // only update primary last_epoch_started if we will go active
1883 if (acting.size() >= pool.info.min_size) {
1884 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1885 info.last_epoch_started <= activation_epoch);
1886 info.last_epoch_started = activation_epoch;
1887 info.last_interval_started = info.history.same_interval_since;
1888 }
1889 } else if (is_acting(pg_whoami)) {
1890 /* update last_epoch_started on acting replica to whatever the primary sent
1891 * unless it's smaller (could happen if we are going peered rather than
1892 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1893 if (info.last_epoch_started < activation_epoch) {
1894 info.last_epoch_started = activation_epoch;
1895 info.last_interval_started = info.history.same_interval_since;
1896 }
1897 }
1898
1899 auto &missing = pg_log.get_missing();
1900
1901 if (is_primary()) {
1902 last_update_ondisk = info.last_update;
1903 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1904 }
1905 last_update_applied = info.last_update;
1906 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1907
1908 need_up_thru = false;
1909
1910 // write pg info, log
1911 dirty_info = true;
1912 dirty_big_info = true; // maybe
1913
1914 // find out when we commit
1915 t.register_on_complete(
1916 new C_PG_ActivateCommitted(
1917 this,
1918 get_osdmap_epoch(),
1919 activation_epoch));
1920
1921 if (is_primary()) {
1922 // initialize snap_trimq
1923 if (get_osdmap()->require_osd_release < CEPH_RELEASE_MIMIC) {
1924 dout(20) << "activate - purged_snaps " << info.purged_snaps
1925 << " cached_removed_snaps " << pool.cached_removed_snaps
1926 << dendl;
1927 snap_trimq = pool.cached_removed_snaps;
1928 } else {
1929 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
1930 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
1931 snap_trimq.clear();
1932 if (p != removed_snaps_queue.end()) {
1933 dout(20) << "activate - purged_snaps " << info.purged_snaps
1934 << " removed_snaps " << p->second
1935 << dendl;
1936 for (auto q : p->second) {
1937 snap_trimq.insert(q.first, q.second);
1938 }
1939 }
1940 }
1941 interval_set<snapid_t> purged;
1942 purged.intersection_of(snap_trimq, info.purged_snaps);
1943 snap_trimq.subtract(purged);
1944
1945 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
1946 // adjust purged_snaps: PG may have been inactive while snaps were pruned
1947 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
1948 // reflect only those snaps that we thought were pruned and were still in
1949 // the queue.
1950 info.purged_snaps.swap(purged);
1951 }
1952 }
1953
1954 // init complete pointer
1955 if (missing.num_missing() == 0) {
1956 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1957 << " -> " << info.last_update << dendl;
1958 info.last_complete = info.last_update;
1959 info.stats.stats.sum.num_objects_missing = 0;
1960 pg_log.reset_recovery_pointers();
1961 } else {
1962 dout(10) << "activate - not complete, " << missing << dendl;
1963 info.stats.stats.sum.num_objects_missing = missing.num_missing();
1964 pg_log.activate_not_complete(info);
1965 }
1966
1967 log_weirdness();
1968
1969 // if primary..
1970 if (is_primary()) {
1971 ceph_assert(ctx);
1972 // start up replicas
1973
1974 ceph_assert(!acting_recovery_backfill.empty());
1975 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1976 i != acting_recovery_backfill.end();
1977 ++i) {
1978 if (*i == pg_whoami) continue;
1979 pg_shard_t peer = *i;
1980 ceph_assert(peer_info.count(peer));
1981 pg_info_t& pi = peer_info[peer];
1982
1983 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1984
1985 MOSDPGLog *m = 0;
1986 ceph_assert(peer_missing.count(peer));
1987 pg_missing_t& pm = peer_missing[peer];
1988
1989 bool needs_past_intervals = pi.dne();
1990
1991 /*
1992 * cover case where peer sort order was different and
1993 * last_backfill cannot be interpreted
1994 */
1995 bool force_restart_backfill =
1996 !pi.last_backfill.is_max() &&
1997 !pi.last_backfill_bitwise;
1998
1999 if (pi.last_update == info.last_update && !force_restart_backfill) {
2000 // empty log
2001 if (!pi.last_backfill.is_max())
2002 osd->clog->info() << info.pgid << " continuing backfill to osd."
2003 << peer
2004 << " from (" << pi.log_tail << "," << pi.last_update
2005 << "] " << pi.last_backfill
2006 << " to " << info.last_update;
2007 if (!pi.is_empty() && activator_map) {
2008 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
2009 (*activator_map)[peer.osd].push_back(
2010 make_pair(
2011 pg_notify_t(
2012 peer.shard, pg_whoami.shard,
2013 get_osdmap_epoch(),
2014 get_osdmap_epoch(),
2015 info),
2016 past_intervals));
2017 } else {
2018 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
2019 m = new MOSDPGLog(
2020 i->shard, pg_whoami.shard,
2021 get_osdmap_epoch(), info,
2022 last_peering_reset);
2023 }
2024 } else if (
2025 pg_log.get_tail() > pi.last_update ||
2026 pi.last_backfill == hobject_t() ||
2027 force_restart_backfill ||
2028 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2029 /* ^ This last case covers a situation where a replica is not contiguous
2030 * with the auth_log, but is contiguous with our own log. Reshuffling
2031 * the active set to handle this would be tricky, so instead we just go
2032 * ahead and backfill it anyway. This is probably preferable in any
2033 * case since the replica in question would have to be significantly
2034 * behind.
2035 */
2036 // backfill
2037 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
2038 << " from (" << pi.log_tail << "," << pi.last_update
2039 << "] " << pi.last_backfill
2040 << " to " << info.last_update;
2041
2042 pi.last_update = info.last_update;
2043 pi.last_complete = info.last_update;
2044 pi.set_last_backfill(hobject_t());
2045 pi.last_epoch_started = info.last_epoch_started;
2046 pi.last_interval_started = info.last_interval_started;
2047 pi.history = info.history;
2048 pi.hit_set = info.hit_set;
2049 // Save num_bytes for reservation request, can't be negative
2050 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2051 pi.stats.stats.clear();
2052
2053 // initialize peer with our purged_snaps.
2054 pi.purged_snaps = info.purged_snaps;
2055
2056 m = new MOSDPGLog(
2057 i->shard, pg_whoami.shard,
2058 get_osdmap_epoch(), pi,
2059 last_peering_reset /* epoch to create pg at */);
2060
2061 // send some recent log, so that op dup detection works well.
2062 m->log.copy_up_to(cct, pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
2063 m->info.log_tail = m->log.tail;
2064 pi.log_tail = m->log.tail; // sigh...
2065
2066 pm.clear();
2067 } else {
2068 // catch up
2069 ceph_assert(pg_log.get_tail() <= pi.last_update);
2070 m = new MOSDPGLog(
2071 i->shard, pg_whoami.shard,
2072 get_osdmap_epoch(), info,
2073 last_peering_reset /* epoch to create pg at */);
2074 // send new stuff to append to the replica's log
2075 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2076 }
2077
2078 // share past_intervals if we are creating the pg on the replica
2079 // based on whether our info for that peer was dne() *before*
2080 // updating pi.history in the backfill block above.
2081 if (m && needs_past_intervals)
2082 m->past_intervals = past_intervals;
2083
2084 // update local version of peer's missing list!
2085 if (m && pi.last_backfill != hobject_t()) {
2086 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2087 p != m->log.log.end();
2088 ++p) {
2089 if (p->soid <= pi.last_backfill &&
2090 !p->is_error()) {
2091 if (perform_deletes_during_peering() && p->is_delete()) {
2092 pm.rm(p->soid, p->version);
2093 } else {
2094 pm.add_next_event(*p);
2095 }
2096 }
2097 }
2098 }
2099
2100 if (m) {
2101 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
2102 //m->log.print(cout);
2103 osd->send_message_osd_cluster(peer.osd, m, get_osdmap_epoch());
2104 }
2105
2106 // peer now has
2107 pi.last_update = info.last_update;
2108
2109 // update our missing
2110 if (pm.num_missing() == 0) {
2111 pi.last_complete = pi.last_update;
2112 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
2113 } else {
2114 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
2115 }
2116 }
2117
2118 // Set up missing_loc
2119 set<pg_shard_t> complete_shards;
2120 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2121 i != acting_recovery_backfill.end();
2122 ++i) {
2123 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
2124 if (*i == get_primary()) {
2125 missing_loc.add_active_missing(missing);
2126 if (!missing.have_missing())
2127 complete_shards.insert(*i);
2128 } else {
2129 auto peer_missing_entry = peer_missing.find(*i);
2130 ceph_assert(peer_missing_entry != peer_missing.end());
2131 missing_loc.add_active_missing(peer_missing_entry->second);
2132 if (!peer_missing_entry->second.have_missing() &&
2133 peer_info[*i].last_backfill.is_max())
2134 complete_shards.insert(*i);
2135 }
2136 }
2137
2138 // If necessary, create might_have_unfound to help us find our unfound objects.
2139 // NOTE: It's important that we build might_have_unfound before trimming the
2140 // past intervals.
2141 might_have_unfound.clear();
2142 if (needs_recovery()) {
2143 // If only one shard has missing objects, we add all the others as
2144 // recovery sources; this is considered safe since the PG logs have been
2145 // merged locally, and it covers the vast majority of use cases, e.g. one
2146 // OSD/host being down for a while for hardware repair.
2147 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2148 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
2149 } else {
2150 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2151 ctx->handle);
2152 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2153 i != acting_recovery_backfill.end();
2154 ++i) {
2155 if (*i == pg_whoami) continue;
2156 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2157 ceph_assert(peer_missing.count(*i));
2158 ceph_assert(peer_info.count(*i));
2159 missing_loc.add_source_info(
2160 *i,
2161 peer_info[*i],
2162 peer_missing[*i],
2163 ctx->handle);
2164 }
2165 }
2166 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2167 i != peer_missing.end();
2168 ++i) {
2169 if (is_acting_recovery_backfill(i->first))
2170 continue;
2171 ceph_assert(peer_info.count(i->first));
2172 search_for_missing(
2173 peer_info[i->first],
2174 i->second,
2175 i->first,
2176 ctx);
2177 }
2178
2179 build_might_have_unfound();
2180
2181 // Always call now so _update_calc_stats() will be accurate
2182 discover_all_missing(query_map);
2183 }
2184
2185 // num_objects_degraded, if calculated, should reflect this too, unless
2186 // nothing is missing and we are about to go clean.
2187 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2188 state_set(PG_STATE_UNDERSIZED);
2189 }
2190
2191 state_set(PG_STATE_ACTIVATING);
2192 release_pg_backoffs();
2193 projected_last_update = info.last_update;
2194 }
2195 if (acting.size() >= pool.info.min_size) {
2196 PGLogEntryHandler handler{this, &t};
2197 pg_log.roll_forward(&handler);
2198 }
2199 }
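
// Recap sketch (illustrative only) of the per-peer decision made above for
// each member of acting_recovery_backfill:
//
//   if (pi.last_update == info.last_update && !force_restart_backfill)
//     -> peer log already up to date (possibly still mid-backfill)
//   else if (pg_log.get_tail() > pi.last_update ||
//            pi.last_backfill == hobject_t() || force_restart_backfill ||
//            (backfill_targets.count(peer) && pi.last_backfill.is_max()))
//     -> (re)start backfill from scratch
//   else
//     -> catch the peer up by sending log entries after pi.last_update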
2200
2201 bool PG::op_has_sufficient_caps(OpRequestRef& op)
2202 {
2203 // only check MOSDOp
2204 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
2205 return true;
2206
2207 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
2208
2209 auto priv = req->get_connection()->get_priv();
2210 auto session = static_cast<Session*>(priv.get());
2211 if (!session) {
2212 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
2213 return false;
2214 }
2215 OSDCap& caps = session->caps;
2216 priv.reset();
2217
2218 const string &key = req->get_hobj().get_key().empty() ?
2219 req->get_oid().name :
2220 req->get_hobj().get_key();
2221
2222 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
2223 pool.info.application_metadata,
2224 key,
2225 op->need_read_cap(),
2226 op->need_write_cap(),
2227 op->classes(),
2228 session->get_peer_socket_addr());
2229
2230 dout(20) << "op_has_sufficient_caps "
2231 << "session=" << session
2232 << " pool=" << pool.id << " (" << pool.name
2233 << " " << req->get_hobj().nspace
2234 << ")"
2235 << " pool_app_metadata=" << pool.info.application_metadata
2236 << " need_read_cap=" << op->need_read_cap()
2237 << " need_write_cap=" << op->need_write_cap()
2238 << " classes=" << op->classes()
2239 << " -> " << (cap ? "yes" : "NO")
2240 << dendl;
2241 return cap;
2242 }
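
// Hedged example (the cap string is hypothetical): a client authenticated
// with an OSD cap such as
//
//   osd 'allow rw pool=rbd namespace=ns1'
//
// would pass the is_capable() check above for reads and writes to objects
// in pool "rbd", namespace "ns1", and would fail it for other
// pools/namespaces or for operations needing caps it does not grant
// (e.g. class execute).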
2243
2244 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
2245 {
2246 lock();
2247 if (pg_has_reset_since(epoch)) {
2248 dout(10) << "_activate_committed " << epoch
2249 << ", that was an old interval" << dendl;
2250 } else if (is_primary()) {
2251 ceph_assert(!peer_activated.count(pg_whoami));
2252 peer_activated.insert(pg_whoami);
2253 dout(10) << "_activate_committed " << epoch
2254 << " peer_activated now " << peer_activated
2255 << " last_interval_started " << info.history.last_interval_started
2256 << " last_epoch_started " << info.history.last_epoch_started
2257 << " same_interval_since " << info.history.same_interval_since << dendl;
2258 ceph_assert(!acting_recovery_backfill.empty());
2259 if (peer_activated.size() == acting_recovery_backfill.size())
2260 all_activated_and_committed();
2261 } else {
2262 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
2263 MOSDPGInfo *m = new MOSDPGInfo(epoch);
2264 pg_notify_t i = pg_notify_t(
2265 get_primary().shard, pg_whoami.shard,
2266 get_osdmap_epoch(),
2267 get_osdmap_epoch(),
2268 info);
2269
2270 i.info.history.last_epoch_started = activation_epoch;
2271 i.info.history.last_interval_started = i.info.history.same_interval_since;
2272 if (acting.size() >= pool.info.min_size) {
2273 state_set(PG_STATE_ACTIVE);
2274 } else {
2275 state_set(PG_STATE_PEERED);
2276 }
2277
2278 m->pg_list.push_back(make_pair(i, PastIntervals()));
2279 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap_epoch());
2280
2281 // waiters
2282 if (flushes_in_progress == 0) {
2283 requeue_ops(waiting_for_peered);
2284 } else if (!waiting_for_peered.empty()) {
2285 dout(10) << __func__ << " flushes in progress, moving "
2286 << waiting_for_peered.size() << " items to waiting_for_flush"
2287 << dendl;
2288 ceph_assert(waiting_for_flush.empty());
2289 waiting_for_flush.swap(waiting_for_peered);
2290 }
2291 }
2292
2293 ceph_assert(!dirty_info);
2294
2295 unlock();
2296 }
2297
2298 /*
2299 * update info.history.last_epoch_started ONLY after we and all
2300 * replicas have activated AND committed the activate transaction
2301 * (i.e. the peering results are stable on disk).
2302 */
2303 void PG::all_activated_and_committed()
2304 {
2305 dout(10) << "all_activated_and_committed" << dendl;
2306 ceph_assert(is_primary());
2307 ceph_assert(peer_activated.size() == acting_recovery_backfill.size());
2308 ceph_assert(!acting_recovery_backfill.empty());
2309 ceph_assert(blocked_by.empty());
2310
2311 // Degraded?
2312 _update_calc_stats();
2313 if (info.stats.stats.sum.num_objects_degraded) {
2314 state_set(PG_STATE_DEGRADED);
2315 } else {
2316 state_clear(PG_STATE_DEGRADED);
2317 }
2318
2319 queue_peering_event(
2320 PGPeeringEventRef(
2321 std::make_shared<PGPeeringEvent>(
2322 get_osdmap_epoch(),
2323 get_osdmap_epoch(),
2324 AllReplicasActivated())));
2325 }
2326
2327 bool PG::requeue_scrub(bool high_priority)
2328 {
2329 ceph_assert(is_locked());
2330 if (scrub_queued) {
2331 dout(10) << __func__ << ": already queued" << dendl;
2332 return false;
2333 } else {
2334 dout(10) << __func__ << ": queueing" << dendl;
2335 scrub_queued = true;
2336 osd->queue_for_scrub(this, high_priority);
2337 return true;
2338 }
2339 }
2340
2341 void PG::queue_recovery()
2342 {
2343 if (!is_primary() || !is_peered()) {
2344 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2345 ceph_assert(!recovery_queued);
2346 } else if (recovery_queued) {
2347 dout(10) << "queue_recovery -- already queued" << dendl;
2348 } else {
2349 dout(10) << "queue_recovery -- queuing" << dendl;
2350 recovery_queued = true;
2351 osd->queue_for_recovery(this);
2352 }
2353 }
2354
2355 bool PG::queue_scrub()
2356 {
2357 ceph_assert(is_locked());
2358 if (is_scrubbing()) {
2359 return false;
2360 }
2361 // An interrupted recovery repair could leave this set.
2362 state_clear(PG_STATE_REPAIR);
2363 if (scrubber.need_auto) {
2364 scrubber.must_scrub = true;
2365 scrubber.must_deep_scrub = true;
2366 scrubber.auto_repair = true;
2367 scrubber.need_auto = false;
2368 }
2369 scrubber.priority = scrubber.must_scrub ?
2370 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2371 scrubber.must_scrub = false;
2372 state_set(PG_STATE_SCRUBBING);
2373 if (scrubber.must_deep_scrub) {
2374 state_set(PG_STATE_DEEP_SCRUB);
2375 scrubber.must_deep_scrub = false;
2376 }
2377 if (scrubber.must_repair || scrubber.auto_repair) {
2378 state_set(PG_STATE_REPAIR);
2379 scrubber.must_repair = false;
2380 }
2381 requeue_scrub();
2382 return true;
2383 }
2384
2385 unsigned PG::get_scrub_priority()
2386 {
2387 // a higher value -> a higher priority
2388 int64_t pool_scrub_priority = 0;
2389 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2390 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2391 }
2392
2393 void PG::try_mark_clean()
2394 {
2395 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2396 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2397 state_set(PG_STATE_CLEAN);
2398 info.history.last_epoch_clean = get_osdmap_epoch();
2399 info.history.last_interval_clean = info.history.same_interval_since;
2400 past_intervals.clear();
2401 dirty_big_info = true;
2402 dirty_info = true;
2403 }
2404
2405 if (is_active()) {
2406 kick_snap_trim();
2407 } else if (is_peered()) {
2408 if (is_clean()) {
2409 bool target;
2410 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2411 if (target) {
2412 ldout(cct, 10) << "ready to merge (target)" << dendl;
2413 osd->set_ready_to_merge_target(this,
2414 info.last_update,
2415 info.history.last_epoch_started,
2416 info.history.last_epoch_clean);
2417 } else {
2418 ldout(cct, 10) << "ready to merge (source)" << dendl;
2419 osd->set_ready_to_merge_source(this, info.last_update);
2420 }
2421 }
2422 } else {
2423 ldout(cct, 10) << "not clean, not ready to merge" << dendl;
2424 // we should have notified OSD in Active state entry point
2425 }
2426 }
2427
2428 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2429
2430 share_pg_info();
2431 publish_stats_to_osd();
2432 requeue_ops(waiting_for_clean_to_primary_repair);
2433 }
2434
2435 bool PG::set_force_recovery(bool b)
2436 {
2437 bool did = false;
2438 if (b) {
2439 if (!(state & PG_STATE_FORCED_RECOVERY) &&
2440 (state & (PG_STATE_DEGRADED |
2441 PG_STATE_RECOVERY_WAIT |
2442 PG_STATE_RECOVERING))) {
2443 dout(20) << __func__ << " set" << dendl;
2444 state_set(PG_STATE_FORCED_RECOVERY);
2445 publish_stats_to_osd();
2446 did = true;
2447 }
2448 } else if (state & PG_STATE_FORCED_RECOVERY) {
2449 dout(20) << __func__ << " clear" << dendl;
2450 state_clear(PG_STATE_FORCED_RECOVERY);
2451 publish_stats_to_osd();
2452 did = true;
2453 }
2454 if (did) {
2455 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2456 osd->local_reserver.update_priority(info.pgid, get_recovery_priority());
2457 }
2458 return did;
2459 }
2460
2461 bool PG::set_force_backfill(bool b)
2462 {
2463 bool did = false;
2464 if (b) {
2465 if (!(state & PG_STATE_FORCED_BACKFILL) &&
2466 (state & (PG_STATE_DEGRADED |
2467 PG_STATE_BACKFILL_WAIT |
2468 PG_STATE_BACKFILLING))) {
2469 dout(10) << __func__ << " set" << dendl;
2470 state_set(PG_STATE_FORCED_BACKFILL);
2471 publish_stats_to_osd();
2472 did = true;
2473 }
2474 } else if (state & PG_STATE_FORCED_BACKFILL) {
2475 dout(10) << __func__ << " clear" << dendl;
2476 state_clear(PG_STATE_FORCED_BACKFILL);
2477 publish_stats_to_osd();
2478 did = true;
2479 }
2480 if (did) {
2481 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2482 osd->local_reserver.update_priority(info.pgid, get_backfill_priority());
2483 }
2484 return did;
2485 }
2486
2487 int PG::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
2488 {
2489 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2490 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2491
2492 ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
2493
2494 // User can't set this too high anymore, but might be a legacy value
2495 if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
2496 pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
2497 if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
2498 pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
2499 // Shift range from [min, max] to [0, max - min]
2500 pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
2501 ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
2502
2503 priority += pool_recovery_priority;
2504
2505 // Clamp to valid range
2506 if (priority > max) {
2507 return max;
2508 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2509 return OSD_RECOVERY_PRIORITY_MIN;
2510 } else {
2511 return priority;
2512 }
2513 }
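
// Worked example (the constant values are assumptions for illustration,
// e.g. OSD_POOL_PRIORITY_MIN = -10, OSD_RECOVERY_PRIORITY_MIN = 0): with
// priority = 180 and a pool recovery_priority of -3, the pool value is
// first shifted into the non-negative range, -3 + (0 - (-10)) = 7, then
// added, 180 + 7 = 187, and finally clamped to
// [OSD_RECOVERY_PRIORITY_MIN, max] if it fell outside that range.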
2514
2515 unsigned PG::get_recovery_priority()
2516 {
2517 // a higher value -> a higher priority
2518 int ret = OSD_RECOVERY_PRIORITY_BASE;
2519 int base = ret;
2520
2521 if (state & PG_STATE_FORCED_RECOVERY) {
2522 ret = OSD_RECOVERY_PRIORITY_FORCED;
2523 } else {
2524 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
2525 if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
2526 base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
2527 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2528 ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
2529 }
2530
2531 int64_t pool_recovery_priority = 0;
2532 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2533
2534 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2535 }
2536 dout(20) << __func__ << " recovery priority is " << ret << dendl;
2537 return static_cast<unsigned>(ret);
2538 }
2539
2540 unsigned PG::get_backfill_priority()
2541 {
2542 // a higher value -> a higher priority
2543 int ret = OSD_BACKFILL_PRIORITY_BASE;
2544 int base = ret;
2545
2546 if (state & PG_STATE_FORCED_BACKFILL) {
2547 ret = OSD_BACKFILL_PRIORITY_FORCED;
2548 } else {
2549 if (acting.size() < pool.info.min_size) {
2550 base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
2551 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2552 ret = base + (pool.info.min_size - acting.size());
2553
2554 } else if (is_undersized()) {
2555 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2556 ceph_assert(pool.info.size > actingset.size());
2557 base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2558 ret = base + (pool.info.size - actingset.size());
2559
2560 } else if (is_degraded()) {
2561 // degraded: baseline degraded
2562 base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2563 }
2564
2565 // Adjust with pool's recovery priority
2566 int64_t pool_recovery_priority = 0;
2567 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2568
2569 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2570 }
2571
2572 dout(20) << __func__ << " backfill priority is " << ret << dendl;
2573 return static_cast<unsigned>(ret);
2574 }
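
// Summary sketch (illustrative; the *_PRIORITY_* constants are defined in
// the OSD headers): both priority functions above produce an ordering of
// roughly
//
//   forced (highest) > inactive (below min_size) > undersized/degraded > base
//
// with the pool's recovery_priority option applied as a bounded offset via
// clamp_recovery_priority().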
2575
2576 unsigned PG::get_delete_priority()
2577 {
2578 auto state = get_osdmap()->get_state(osd->whoami);
2579 if (state & (CEPH_OSD_BACKFILLFULL |
2580 CEPH_OSD_FULL)) {
2581 return OSD_DELETE_PRIORITY_FULL;
2582 } else if (state & CEPH_OSD_NEARFULL) {
2583 return OSD_DELETE_PRIORITY_FULLISH;
2584 } else {
2585 return OSD_DELETE_PRIORITY_NORMAL;
2586 }
2587 }
2588
2589 Context *PG::finish_recovery()
2590 {
2591 dout(10) << "finish_recovery" << dendl;
2592 ceph_assert(info.last_complete == info.last_update);
2593
2594 clear_recovery_state();
2595
2596 /*
2597 * sync all this before purging strays. but don't block!
2598 */
2599 finish_sync_event = new C_PG_FinishRecovery(this);
2600 return finish_sync_event;
2601 }
2602
2603 void PG::_finish_recovery(Context *c)
2604 {
2605 lock();
2606 // When recovery is initiated by a repair, that flag is left on
2607 state_clear(PG_STATE_REPAIR);
2608 if (deleting) {
2609 unlock();
2610 return;
2611 }
2612 if (c == finish_sync_event) {
2613 dout(10) << "_finish_recovery" << dendl;
2614 finish_sync_event = 0;
2615 purge_strays();
2616
2617 publish_stats_to_osd();
2618
2619 if (scrub_after_recovery) {
2620 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2621 scrub_after_recovery = false;
2622 scrubber.must_deep_scrub = true;
2623 scrubber.check_repair = true;
2624 queue_scrub();
2625 }
2626 } else {
2627 dout(10) << "_finish_recovery -- stale" << dendl;
2628 }
2629 unlock();
2630 }
2631
2632 void PG::start_recovery_op(const hobject_t& soid)
2633 {
2634 dout(10) << "start_recovery_op " << soid
2635 #ifdef DEBUG_RECOVERY_OIDS
2636 << " (" << recovering_oids << ")"
2637 #endif
2638 << dendl;
2639 ceph_assert(recovery_ops_active >= 0);
2640 recovery_ops_active++;
2641 #ifdef DEBUG_RECOVERY_OIDS
2642 recovering_oids.insert(soid);
2643 #endif
2644 osd->start_recovery_op(this, soid);
2645 }
2646
2647 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2648 {
2649 dout(10) << "finish_recovery_op " << soid
2650 #ifdef DEBUG_RECOVERY_OIDS
2651 << " (" << recovering_oids << ")"
2652 #endif
2653 << dendl;
2654 ceph_assert(recovery_ops_active > 0);
2655 recovery_ops_active--;
2656 #ifdef DEBUG_RECOVERY_OIDS
2657 ceph_assert(recovering_oids.count(soid));
2658 recovering_oids.erase(recovering_oids.find(soid));
2659 #endif
2660 osd->finish_recovery_op(this, soid, dequeue);
2661
2662 if (!dequeue) {
2663 queue_recovery();
2664 }
2665 }
2666
2667 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2668 {
2669 child->update_snap_mapper_bits(split_bits);
2670 child->update_osdmap_ref(get_osdmap());
2671
2672 child->pool = pool;
2673
2674 // Log
2675 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2676 child->info.last_complete = info.last_complete;
2677
2678 info.last_update = pg_log.get_head();
2679 child->info.last_update = child->pg_log.get_head();
2680
2681 child->info.last_user_version = info.last_user_version;
2682
2683 info.log_tail = pg_log.get_tail();
2684 child->info.log_tail = child->pg_log.get_tail();
2685
2686 // reset last_complete, we might have modified pg_log & missing above
2687 pg_log.reset_complete_to(&info);
2688 child->pg_log.reset_complete_to(&child->info);
2689
2690 // Info
2691 child->info.history = info.history;
2692 child->info.history.epoch_created = get_osdmap_epoch();
2693 child->info.purged_snaps = info.purged_snaps;
2694
2695 if (info.last_backfill.is_max()) {
2696 child->info.set_last_backfill(hobject_t::get_max());
2697 } else {
2698 // restart backfill on parent and child to be safe. we could
2699 // probably do better in the bitwise sort case, but it's more
2700 // fragile (there may be special work to do on backfill completion
2701 // in the future).
2702 info.set_last_backfill(hobject_t());
2703 child->info.set_last_backfill(hobject_t());
2704 // restarting backfill implies that the missing set is empty,
2705 // since it is only used for objects prior to last_backfill
2706 pg_log.reset_backfill();
2707 child->pg_log.reset_backfill();
2708 }
2709
2710 child->info.stats = info.stats;
2711 child->info.stats.parent_split_bits = split_bits;
2712 info.stats.stats_invalid = true;
2713 child->info.stats.stats_invalid = true;
2714 child->info.last_epoch_started = info.last_epoch_started;
2715 child->info.last_interval_started = info.last_interval_started;
2716
2717 child->snap_trimq = snap_trimq;
2718
2719 // There can't be recovery/backfill going on now
2720 int primary, up_primary;
2721 vector<int> newup, newacting;
2722 get_osdmap()->pg_to_up_acting_osds(
2723 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2724 child->init_primary_up_acting(
2725 newup,
2726 newacting,
2727 up_primary,
2728 primary);
2729 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2730
2731 // this comparison includes primary rank via pg_shard_t
2732 if (get_primary() != child->get_primary())
2733 child->info.history.same_primary_since = get_osdmap_epoch();
2734
2735 child->info.stats.up = up;
2736 child->info.stats.up_primary = up_primary;
2737 child->info.stats.acting = acting;
2738 child->info.stats.acting_primary = primary;
2739 child->info.stats.mapping_epoch = get_osdmap_epoch();
2740
2741 // History
2742 child->past_intervals = past_intervals;
2743
2744 _split_into(child_pgid, child, split_bits);
2745
2746 // release all backoffs for simplicity
2747 release_backoffs(hobject_t(), hobject_t::get_max());
2748
2749 child->on_new_interval();
2750
2751 child->send_notify = !child->is_primary();
2752
2753 child->dirty_info = true;
2754 child->dirty_big_info = true;
2755 dirty_info = true;
2756 dirty_big_info = true;
2757 }
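
// Hedged illustration (helper name and details are assumptions): split_bits
// governs which child PG an object follows after a split.  Conceptually, an
// object whose hash is H ends up in the child whose pg seed matches the low
// split_bits bits of H, which is what the snap mapper and log split above
// rely on.
//
//   uint32_t child_seed_for(uint32_t hash, unsigned split_bits) {
//     return hash & ((1u << split_bits) - 1);  // low split_bits bits of H
//   }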
2758
2759 void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
2760 {
2761 out->resize(childpgs.size() + 1);
2762 info.stats.stats.sum.split(*out);
2763 }
2764
2765 void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t)
2766 {
2767 info.stats.stats.sum = stats;
2768 write_if_dirty(*t);
2769 }
2770
2771 void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
2772 unsigned split_bits,
2773 const pg_merge_meta_t& last_pg_merge_meta)
2774 {
2775 dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
2776 << dendl;
2777 bool incomplete = false;
2778 if (info.last_complete != info.last_update ||
2779 info.is_incomplete() ||
2780 info.dne()) {
2781 dout(10) << __func__ << " target incomplete" << dendl;
2782 incomplete = true;
2783 }
2784 if (last_pg_merge_meta.source_pgid != pg_t()) {
2785 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2786 dout(10) << __func__ << " target doesn't match expected parent "
2787 << last_pg_merge_meta.source_pgid.get_parent()
2788 << " of source_pgid " << last_pg_merge_meta.source_pgid
2789 << dendl;
2790 incomplete = true;
2791 }
2792 if (info.last_update != last_pg_merge_meta.target_version) {
2793 dout(10) << __func__ << " target version doesn't match expected "
2794 << last_pg_merge_meta.target_version << dendl;
2795 incomplete = true;
2796 }
2797 }
2798
2799 PGLogEntryHandler handler{this, rctx->transaction};
2800 pg_log.roll_forward(&handler);
2801
2802 info.last_complete = info.last_update; // to fake out trim()
2803 pg_log.reset_recovery_pointers();
2804 pg_log.trim(info.last_update, info);
2805
2806 vector<PGLog*> log_from;
2807 for (auto& i : sources) {
2808 auto& source = i.second;
2809 if (!source) {
2810 dout(10) << __func__ << " source " << i.first << " missing" << dendl;
2811 incomplete = true;
2812 continue;
2813 }
2814 if (source->info.last_complete != source->info.last_update ||
2815 source->info.is_incomplete() ||
2816 source->info.dne()) {
2817 dout(10) << __func__ << " source " << source->pg_id << " incomplete"
2818 << dendl;
2819 incomplete = true;
2820 }
2821 if (last_pg_merge_meta.source_pgid != pg_t()) {
2822 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
2823 dout(10) << __func__ << " source " << source->info.pgid.pgid
2824 << " doesn't match expected source pgid "
2825 << last_pg_merge_meta.source_pgid << dendl;
2826 incomplete = true;
2827 }
2828 if (source->info.last_update != last_pg_merge_meta.source_version) {
2829 dout(10) << __func__ << " source version doesn't match expected "
2830 << last_pg_merge_meta.source_version << dendl;
2831 incomplete = true;
2832 }
2833 }
2834
2835 // prepare log
2836 PGLogEntryHandler handler{source.get(), rctx->transaction};
2837 source->pg_log.roll_forward(&handler);
2838 source->info.last_complete = source->info.last_update; // to fake out trim()
2839 source->pg_log.reset_recovery_pointers();
2840 source->pg_log.trim(source->info.last_update, source->info);
2841 log_from.push_back(&source->pg_log);
2842
2843 // wipe out source's pgmeta
2844 rctx->transaction->remove(source->coll, source->pgmeta_oid);
2845
2846 // merge (and destroy source collection)
2847 rctx->transaction->merge_collection(source->coll, coll, split_bits);
2848
2849 // combine stats
2850 info.stats.add(source->info.stats);
2851
2852 // pull up last_update
2853 info.last_update = std::max(info.last_update, source->info.last_update);
2854
2855 // adopt source's PastIntervals if target has none. we can do this since
2856 // pgp_num has been reduced prior to the merge, so the OSD mappings for
2857 // the PGs are identical.
2858 if (past_intervals.empty() && !source->past_intervals.empty()) {
2859 dout(10) << __func__ << " taking source's past_intervals" << dendl;
2860 past_intervals = source->past_intervals;
2861 }
2862 }
2863
2864 // merge_collection does this, but maybe all of our sources were missing.
2865 rctx->transaction->collection_set_bits(coll, split_bits);
2866
2867 info.last_complete = info.last_update;
2868 info.log_tail = info.last_update;
2869 if (incomplete) {
2870 info.last_backfill = hobject_t();
2871 }
2872
2873 snap_mapper.update_bits(split_bits);
2874
2875 // merge logs
2876 pg_log.merge_from(log_from, info.last_update);
2877
2878 // make sure we have a meaningful last_epoch_started/clean (if we were a
2879 // placeholder)
2880 if (info.last_epoch_started == 0) {
2881 // start with (a) source's history, since these PGs *should* have been
2882 // remapped in concert with each other...
2883 info.history = sources.begin()->second->info.history;
2884
2885 // we use the last_epoch_{started,clean} we got from
2886 // the caller, which are the epochs that were reported when the PGs were
2887 // found to be ready for merge.
2888 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
2889 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2890 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2891 dout(10) << __func__
2892 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
2893 << last_pg_merge_meta.last_epoch_clean
2894 << " from pool last_dec_*, source pg history was "
2895 << sources.begin()->second->info.history
2896 << dendl;
2897
2898 // if the past_intervals start is later than last_epoch_clean, it
2899 // implies the source re-peered again but the target didn't, or
2900 // that the source became clean in a later epoch than the target.
2901 // avoid the discrepancy by adjusting the interval start
2902 // backwards to match so that check_past_interval_bounds() will
2903 // not complain.
2904 auto pib = past_intervals.get_bounds();
2905 if (info.history.last_epoch_clean < pib.first) {
2906 dout(10) << __func__ << " last_epoch_clean "
2907 << info.history.last_epoch_clean << " < past_interval start "
2908 << pib.first << ", adjusting start backwards" << dendl;
2909 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
2910 }
2911
2912 // Similarly, if the same_interval_since value is later than
2913 // last_epoch_clean, the next interval change will result in a
2914 // past_interval start that is later than last_epoch_clean. This
2915 // can happen if we use the pg_history values from the merge
2916 // source. Adjust the same_interval_since value backwards if that
2917 // happens. (We trust the les and lec values more because they came from
2918 // the real target, whereas the history value we stole from the source.)
2919 if (info.history.last_epoch_started < info.history.same_interval_since) {
2920 dout(10) << __func__ << " last_epoch_started "
2921 << info.history.last_epoch_started << " < same_interval_since "
2922 << info.history.same_interval_since
2923 << ", adjusting pg_history backwards" << dendl;
2924 info.history.same_interval_since = info.history.last_epoch_clean;
2925 // make sure same_{up,primary}_since are <= same_interval_since
2926 info.history.same_up_since = std::min(
2927 info.history.same_up_since, info.history.same_interval_since);
2928 info.history.same_primary_since = std::min(
2929 info.history.same_primary_since, info.history.same_interval_since);
2930 }
2931 }
2932
2933 dirty_info = true;
2934 dirty_big_info = true;
2935 }
2936
2937 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2938 {
2939 ConnectionRef con = s->con;
2940 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2941 return;
2942 BackoffRef b(s->have_backoff(info.pgid, begin));
2943 if (b) {
2944 derr << __func__ << " already have backoff for " << s << " begin " << begin
2945 << " " << *b << dendl;
2946 ceph_abort();
2947 }
2948 std::lock_guard l(backoff_lock);
2949 {
2950 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2951 backoffs[begin].insert(b);
2952 s->add_backoff(b);
2953 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2954 }
2955 con->send_message(
2956 new MOSDBackoff(
2957 info.pgid,
2958 get_osdmap_epoch(),
2959 CEPH_OSD_BACKOFF_OP_BLOCK,
2960 b->id,
2961 begin,
2962 end));
2963 }
2964
2965 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2966 {
2967 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2968 vector<BackoffRef> bv;
2969 {
2970 std::lock_guard l(backoff_lock);
2971 auto p = backoffs.lower_bound(begin);
2972 while (p != backoffs.end()) {
2973 int r = cmp(p->first, end);
2974 dout(20) << __func__ << " ? " << r << " " << p->first
2975 << " " << p->second << dendl;
2976 // note: must still examine begin=end=p->first case
2977 if (r > 0 || (r == 0 && begin < end)) {
2978 break;
2979 }
2980 dout(20) << __func__ << " checking " << p->first
2981 << " " << p->second << dendl;
2982 auto q = p->second.begin();
2983 while (q != p->second.end()) {
2984 dout(20) << __func__ << " checking " << *q << dendl;
2985 int r = cmp((*q)->begin, begin);
2986 if (r == 0 || (r > 0 && (*q)->end < end)) {
2987 bv.push_back(*q);
2988 q = p->second.erase(q);
2989 } else {
2990 ++q;
2991 }
2992 }
2993 if (p->second.empty()) {
2994 p = backoffs.erase(p);
2995 } else {
2996 ++p;
2997 }
2998 }
2999 }
3000 for (auto b : bv) {
3001 std::lock_guard l(b->lock);
3002 dout(10) << __func__ << " " << *b << dendl;
3003 if (b->session) {
3004 ceph_assert(b->pg == this);
3005 ConnectionRef con = b->session->con;
3006 if (con) { // OSD::ms_handle_reset clears s->con without a lock
3007 con->send_message(
3008 new MOSDBackoff(
3009 info.pgid,
3010 get_osdmap_epoch(),
3011 CEPH_OSD_BACKOFF_OP_UNBLOCK,
3012 b->id,
3013 b->begin,
3014 b->end));
3015 }
3016 if (b->is_new()) {
3017 b->state = Backoff::STATE_DELETING;
3018 } else {
3019 b->session->rm_backoff(b);
3020 b->session.reset();
3021 }
3022 b->pg.reset();
3023 }
3024 }
3025 }
3026
3027 void PG::clear_backoffs()
3028 {
3029 dout(10) << __func__ << " " << dendl;
3030 map<hobject_t,set<BackoffRef>> ls;
3031 {
3032 std::lock_guard l(backoff_lock);
3033 ls.swap(backoffs);
3034 }
3035 for (auto& p : ls) {
3036 for (auto& b : p.second) {
3037 std::lock_guard l(b->lock);
3038 dout(10) << __func__ << " " << *b << dendl;
3039 if (b->session) {
3040 ceph_assert(b->pg == this);
3041 if (b->is_new()) {
3042 b->state = Backoff::STATE_DELETING;
3043 } else {
3044 b->session->rm_backoff(b);
3045 b->session.reset();
3046 }
3047 b->pg.reset();
3048 }
3049 }
3050 }
3051 }
3052
3053 // called by Session::clear_backoffs()
3054 void PG::rm_backoff(BackoffRef b)
3055 {
3056 dout(10) << __func__ << " " << *b << dendl;
3057 std::lock_guard l(backoff_lock);
3058 ceph_assert(b->lock.is_locked_by_me());
3059 ceph_assert(b->pg == this);
3060 auto p = backoffs.find(b->begin);
3061 // may race with release_backoffs()
3062 if (p != backoffs.end()) {
3063 auto q = p->second.find(b);
3064 if (q != p->second.end()) {
3065 p->second.erase(q);
3066 if (p->second.empty()) {
3067 backoffs.erase(p);
3068 }
3069 }
3070 }
3071 }
3072
3073 void PG::clear_recovery_state()
3074 {
3075 dout(10) << "clear_recovery_state" << dendl;
3076
3077 pg_log.reset_recovery_pointers();
3078 finish_sync_event = 0;
3079
3080 hobject_t soid;
3081 while (recovery_ops_active > 0) {
3082 #ifdef DEBUG_RECOVERY_OIDS
3083 soid = *recovering_oids.begin();
3084 #endif
3085 finish_recovery_op(soid, true);
3086 }
3087
3088 async_recovery_targets.clear();
3089 backfill_targets.clear();
3090 backfill_info.clear();
3091 peer_backfill_info.clear();
3092 waiting_on_backfill.clear();
3093 _clear_recovery_state(); // pg impl specific hook
3094 }
3095
3096 void PG::cancel_recovery()
3097 {
3098 dout(10) << "cancel_recovery" << dendl;
3099 clear_recovery_state();
3100 }
3101
3102
3103 void PG::purge_strays()
3104 {
3105 if (is_premerge()) {
3106 dout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
3107 << dendl;
3108 return;
3109 }
3110 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
3111 return;
3112 }
3113 dout(10) << "purge_strays " << stray_set << dendl;
3114
3115 bool removed = false;
3116 for (set<pg_shard_t>::iterator p = stray_set.begin();
3117 p != stray_set.end();
3118 ++p) {
3119 ceph_assert(!is_acting_recovery_backfill(*p));
3120 if (get_osdmap()->is_up(p->osd)) {
3121 dout(10) << "sending PGRemove to osd." << *p << dendl;
3122 vector<spg_t> to_remove;
3123 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
3124 MOSDPGRemove *m = new MOSDPGRemove(
3125 get_osdmap_epoch(),
3126 to_remove);
3127 osd->send_message_osd_cluster(p->osd, m, get_osdmap_epoch());
3128 } else {
3129 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
3130 }
3131 peer_missing.erase(*p);
3132 peer_info.erase(*p);
3133 peer_purged.insert(*p);
3134 removed = true;
3135 }
3136
3137 // if we removed anyone, update peers (which include peer_info)
3138 if (removed)
3139 update_heartbeat_peers();
3140
3141 stray_set.clear();
3142
3143 // clear _requested maps; we may have to peer() again if we discover
3144 // (more) stray content
3145 peer_log_requested.clear();
3146 peer_missing_requested.clear();
3147 }
3148
3149 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
3150 {
3151 std::lock_guard l(heartbeat_peer_lock);
3152 probe_targets.clear();
3153 for (set<pg_shard_t>::iterator i = probe_set.begin();
3154 i != probe_set.end();
3155 ++i) {
3156 probe_targets.insert(i->osd);
3157 }
3158 }
3159
3160 void PG::clear_probe_targets()
3161 {
3162 std::lock_guard l(heartbeat_peer_lock);
3163 probe_targets.clear();
3164 }
3165
3166 void PG::update_heartbeat_peers()
3167 {
3168 ceph_assert(is_locked());
3169
3170 if (!is_primary())
3171 return;
3172
3173 set<int> new_peers;
3174 for (unsigned i=0; i<acting.size(); i++) {
3175 if (acting[i] != CRUSH_ITEM_NONE)
3176 new_peers.insert(acting[i]);
3177 }
3178 for (unsigned i=0; i<up.size(); i++) {
3179 if (up[i] != CRUSH_ITEM_NONE)
3180 new_peers.insert(up[i]);
3181 }
3182 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
3183 p != peer_info.end();
3184 ++p)
3185 new_peers.insert(p->first.osd);
3186
3187 bool need_update = false;
3188 heartbeat_peer_lock.Lock();
3189 if (new_peers == heartbeat_peers) {
3190 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
3191 } else {
3192 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
3193 heartbeat_peers.swap(new_peers);
3194 need_update = true;
3195 }
3196 heartbeat_peer_lock.Unlock();
3197
3198 if (need_update)
3199 osd->need_heartbeat_peer_update();
3200 }
3201
3202
3203 bool PG::check_in_progress_op(
3204 const osd_reqid_t &r,
3205 eversion_t *version,
3206 version_t *user_version,
3207 int *return_code) const
3208 {
3209 return (
3210 projected_log.get_request(r, version, user_version, return_code) ||
3211 pg_log.get_log().get_request(r, version, user_version, return_code));
3212 }
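
// Hedged usage sketch (hypothetical caller): a resent client request can be
// answered from the recorded result instead of being re-executed.
//
//   eversion_t ver;
//   version_t user_ver;
//   int ret;
//   if (pg->check_in_progress_op(reqid, &ver, &user_ver, &ret)) {
//     // reply with the previously recorded version / return code
//   }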
3213
3214 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3215 {
3216 for (auto&p : pgs)
3217 if (p.shard == shard)
3218 return true;
3219 return false;
3220 }
3221
3222 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3223 {
3224 for (auto&p : pgs) {
3225 if (p == skip)
3226 continue;
3227 if (p.shard == shard)
3228 return p;
3229 }
3230 return pg_shard_t();
3231 }
3232
3233 void PG::_update_calc_stats()
3234 {
3235 info.stats.version = info.last_update;
3236 info.stats.created = info.history.epoch_created;
3237 info.stats.last_scrub = info.history.last_scrub;
3238 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3239 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3240 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3241 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3242 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3243
3244 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3245 info.stats.ondisk_log_size = info.stats.log_size;
3246 info.stats.log_start = pg_log.get_tail();
3247 info.stats.ondisk_log_start = pg_log.get_tail();
3248 info.stats.snaptrimq_len = snap_trimq.size();
3249
3250 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3251
3252 // In the rare case that upset is too large (usually transient), use it
3253 // as the target for the calculations below.
3254 // For an undersized PG, actingset may be larger than upset with OSDs out
3255 // For undersized actingset may be larger with OSDs out
3256 unsigned nrep = std::max(actingset.size(), upset.size());
3257 // calc num_object_copies
3258 info.stats.stats.calc_copies(std::max(target, nrep));
3259 info.stats.stats.sum.num_objects_degraded = 0;
3260 info.stats.stats.sum.num_objects_unfound = 0;
3261 info.stats.stats.sum.num_objects_misplaced = 0;
3262 info.stats.avail_no_missing.clear();
3263 info.stats.object_location_counts.clear();
3264
3265 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
3266 dout(20) << __func__ << " actingset " << actingset << " upset "
3267 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3268 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
3269
3270 ceph_assert(!acting_recovery_backfill.empty());
3271
3272 bool estimate = false;
3273
3274 // NOTE: we only generate degraded, misplaced and unfound
3275 // values for the summation, not individual stat categories.
3276 int64_t num_objects = info.stats.stats.sum.num_objects;
3277
3278 // Objects missing from up nodes, sorted by # objects.
3279 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3280 // Objects missing from nodes not in up, sort by # objects
3281 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3282
3283 // Fill missing_target_objects/acting_source_objects
3284
3285 {
3286 int64_t missing;
3287
3288 // Primary first
3289 missing = pg_log.get_missing().num_missing();
3290 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3291 if (upset.count(pg_whoami)) {
3292 missing_target_objects.insert(make_pair(missing, pg_whoami));
3293 } else {
3294 acting_source_objects.insert(make_pair(missing, pg_whoami));
3295 }
3296 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3297 if (missing == 0)
3298 info.stats.avail_no_missing.push_back(pg_whoami);
3299 dout(20) << __func__ << " shard " << pg_whoami
3300 << " primary objects " << num_objects
3301 << " missing " << missing
3302 << dendl;
3303 }
3304
3305 // All other peers
3306 for (auto& peer : peer_info) {
3307 // Primary should not be in the peer_info, skip if it is.
3308 if (peer.first == pg_whoami) continue;
3309 int64_t missing = 0;
3310 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3311 // Backfill targets always track num_objects accurately
3312 // all other peers track missing accurately.
3313 if (is_backfill_targets(peer.first)) {
3314 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3315 } else {
3316 if (peer_missing.count(peer.first)) {
3317 missing = peer_missing[peer.first].num_missing();
3318 } else {
3319 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
3320 if (is_recovering()) {
3321 estimate = true;
3322 }
3323 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3324 }
3325 }
3326 if (upset.count(peer.first)) {
3327 missing_target_objects.insert(make_pair(missing, peer.first));
3328 } else if (actingset.count(peer.first)) {
3329 acting_source_objects.insert(make_pair(missing, peer.first));
3330 }
3331 peer.second.stats.stats.sum.num_objects_missing = missing;
3332 if (missing == 0)
3333 info.stats.avail_no_missing.push_back(peer.first);
3334 dout(20) << __func__ << " shard " << peer.first
3335 << " objects " << peer_num_objects
3336 << " missing " << missing
3337 << dendl;
3338 }
3339
3340 // Compute object_location_counts
3341 for (auto& ml: missing_loc.get_missing_locs()) {
3342 info.stats.object_location_counts[ml.second]++;
3343 dout(30) << __func__ << " " << ml.first << " object_location_counts["
3344 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3345 << dendl;
3346 }
3347 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3348 if (not_missing) {
3349 // During recovery we know upset == actingset and is being populated
3350 // During backfill we know that all non-missing objects are in the actingset
3351 info.stats.object_location_counts[actingset] = not_missing;
3352 }
3353 dout(30) << __func__ << " object_location_counts["
3354 << upset << "]=" << info.stats.object_location_counts[upset]
3355 << dendl;
3356 dout(20) << __func__ << " object_location_counts "
3357 << info.stats.object_location_counts << dendl;
3358
3359 // A misplaced object is not stored on the correct OSD
3360 int64_t misplaced = 0;
3361 // a degraded object has fewer replicas or EC shards than the pool specifies.
3362 int64_t degraded = 0;
3363
3364 if (is_recovering()) {
3365 for (auto& sml: missing_loc.get_missing_by_count()) {
3366 for (auto& ml: sml.second) {
3367 int missing_shards;
3368 if (sml.first == shard_id_t::NO_SHARD) {
3369 dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
3370 missing_shards = (int)upset.size() - ml.first.up;
3371 } else {
3372 // Handle shards not even in upset below
3373 if (!find_shard(upset, sml.first))
3374 continue;
3375 missing_shards = std::max(0, 1 - ml.first.up);
3376 dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
3377 }
3378 int odegraded = ml.second * missing_shards;
3379 // Copies on other OSDs, but limited to the possible degraded count
3380 int more_osds = std::min(missing_shards, ml.first.other);
3381 int omisplaced = ml.second * more_osds;
3382 ceph_assert(omisplaced <= odegraded);
3383 odegraded -= omisplaced;
3384
3385 misplaced += omisplaced;
3386 degraded += odegraded;
3387 }
3388 }
3389
3390 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
3391 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
3392
3393 // Handle undersized case
3394 if (pool.info.is_replicated()) {
3395 // Add degraded for missing targets (num_objects missing)
3396 ceph_assert(target >= upset.size());
3397 unsigned needed = target - upset.size();
3398 degraded += num_objects * needed;
3399 } else {
3400 for (unsigned i = 0 ; i < num_shards; ++i) {
3401 shard_id_t shard(i);
3402
3403 if (!find_shard(upset, shard)) {
3404 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3405
3406 if (pgs != pg_shard_t()) {
3407 int64_t missing;
3408
3409 if (pgs == pg_whoami)
3410 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3411 else
3412 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3413
3414 degraded += missing;
3415 misplaced += std::max((int64_t)0, num_objects - missing);
3416 } else {
3417 // No shard anywhere
3418 degraded += num_objects;
3419 }
3420 }
3421 }
3422 }
3423 goto out;
3424 }
3425
3426 // Handle undersized case
3427 if (pool.info.is_replicated()) {
3428 // Add to missing_target_objects
3429 ceph_assert(target >= missing_target_objects.size());
3430 unsigned needed = target - missing_target_objects.size();
3431 if (needed)
3432 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
3433 } else {
3434 for (unsigned i = 0 ; i < num_shards; ++i) {
3435 shard_id_t shard(i);
3436 bool found = false;
3437 for (const auto& t : missing_target_objects) {
3438 if (std::get<1>(t).shard == shard) {
3439 found = true;
3440 break;
3441 }
3442 }
3443 if (!found)
3444 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
3445 }
3446 }
3447
3448 for (const auto& item : missing_target_objects)
3449 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3450 for (const auto& item : acting_source_objects)
3451 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3452
3453 // Handle all objects not in missing for remapped
3454 // or backfill
3455 for (auto m = missing_target_objects.rbegin();
3456 m != missing_target_objects.rend(); ++m) {
3457
3458 int64_t extra_missing = -1;
3459
3460 if (pool.info.is_replicated()) {
3461 if (!acting_source_objects.empty()) {
3462 auto extra_copy = acting_source_objects.begin();
3463 extra_missing = std::get<0>(*extra_copy);
3464 acting_source_objects.erase(extra_copy);
3465 }
3466 } else { // Erasure coded
3467 // Use corresponding shard
3468 for (const auto& a : acting_source_objects) {
3469 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3470 extra_missing = std::get<0>(a);
3471 acting_source_objects.erase(a);
3472 break;
3473 }
3474 }
3475 }
3476
3477 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3478 // We don't know which of the objects on the target
3479 // are part of extra_missing, so assume they are all degraded.
3480 misplaced += std::get<0>(*m) - extra_missing;
3481 degraded += extra_missing;
3482 } else {
3483 // 1. extra_missing == -1: more targets than sources, so all are degraded
3484 // 2. extra_missing > std::get<0>(*m): some objects that were previously
3485 // degraded are now present on the target.
3486 degraded += std::get<0>(*m);
3487 }
3488 }
3489 // If there are still acting sources that haven't been accounted for,
3490 // then they are misplaced
3491 for (const auto& a : acting_source_objects) {
3492 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3493 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
3494 misplaced += extra_misplaced;
3495 }
3496 out:
3497 // NOTE: Tests use these messages to verify this code
3498 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
3499 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl;
3500
3501 info.stats.stats.sum.num_objects_degraded = degraded;
3502 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3503 info.stats.stats.sum.num_objects_misplaced = misplaced;
3504 }
3505 }
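
// Worked example of the accounting above (all numbers are hypothetical):
//  - undersized: a replicated pool of size 3 with num_objects = 100 and only
//    |upset| = 2 accrues 100 * (3 - 2) = 100 degraded object copies;
//  - remapped: the same 100 objects held on an acting-only source that is no
//    longer in the up set count as 100 misplaced (they exist, but on the
//    wrong OSD), not degraded.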
3506
3507 void PG::_update_blocked_by()
3508 {
3509 // set a max on the number of blocking peers we report. if we go
3510 // over, report a random subset. keep the result sorted.
3511 unsigned keep = std::min<unsigned>(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3512 unsigned skip = blocked_by.size() - keep;
3513 info.stats.blocked_by.clear();
3514 info.stats.blocked_by.resize(keep);
3515 unsigned pos = 0;
3516 for (set<int>::iterator p = blocked_by.begin();
3517 p != blocked_by.end() && keep > 0;
3518 ++p) {
3519 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3520 --skip;
3521 } else {
3522 info.stats.blocked_by[pos++] = *p;
3523 --keep;
3524 }
3525 }
3526 }
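
// A minimal stand-alone sketch of the subset selection used above (not part
// of the build; the helper name is hypothetical): with `skip` elements left
// to drop and `keep` left to take, each element is dropped with probability
// skip / (skip + keep), which yields a uniformly random subset while keeping
// the input order, so the reported blocked_by list stays sorted.
#if 0
#include <cstdlib>
#include <vector>

template <typename T>
std::vector<T> sample_sorted_subset(const std::vector<T>& in, unsigned keep)
{
  if (keep > in.size())
    keep = (unsigned)in.size();
  unsigned skip = (unsigned)in.size() - keep;
  std::vector<T> out;
  out.reserve(keep);
  for (const T& v : in) {
    if (keep == 0)
      break;
    if (skip > 0 && (rand() % (skip + keep) < skip)) {
      --skip;            // drop this element
    } else {
      out.push_back(v);  // keep this element
      --keep;
    }
  }
  return out;
}
#endif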
3527
3528 void PG::publish_stats_to_osd()
3529 {
3530 if (!is_primary())
3531 return;
3532
3533 pg_stats_publish_lock.Lock();
3534
3535 if (info.stats.stats.sum.num_scrub_errors)
3536 state_set(PG_STATE_INCONSISTENT);
3537 else {
3538 state_clear(PG_STATE_INCONSISTENT);
3539 state_clear(PG_STATE_FAILED_REPAIR);
3540 }
3541
3542 utime_t now = ceph_clock_now();
3543 if (info.stats.state != state) {
3544 info.stats.last_change = now;
3545 // Optimistic estimation: if we just found out that a PG is inactive,
3546 // assume it was active until now.
3547 if (!(state & PG_STATE_ACTIVE) &&
3548 (info.stats.state & PG_STATE_ACTIVE))
3549 info.stats.last_active = now;
3550
3551 if ((state & PG_STATE_ACTIVE) &&
3552 !(info.stats.state & PG_STATE_ACTIVE))
3553 info.stats.last_became_active = now;
3554 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3555 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3556 info.stats.last_became_peered = now;
3557 info.stats.state = state;
3558 }
3559
3560 _update_calc_stats();
3561 if (info.stats.stats.sum.num_objects_degraded) {
3562 state_set(PG_STATE_DEGRADED);
3563 } else {
3564 state_clear(PG_STATE_DEGRADED);
3565 }
3566 _update_blocked_by();
3567
3568 pg_stat_t pre_publish = info.stats;
3569 pre_publish.stats.add(unstable_stats);
3570 utime_t cutoff = now;
3571 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3572
3573 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
3574 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3575 // because we don't want to make the pg_stat_t structures too expensive.
3576 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3577 unsigned num = 0;
3578 auto i = info.purged_snaps.begin();
3579 while (num < max && i != info.purged_snaps.end()) {
3580 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3581 ++num;
3582 ++i;
3583 }
3584 dout(20) << __func__ << " reporting purged_snaps "
3585 << pre_publish.purged_snaps << dendl;
3586 }
3587
3588 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3589 info.stats.last_fresh > cutoff) {
3590 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3591 << ": no change since " << info.stats.last_fresh << dendl;
3592 } else {
3593 // update our stat summary and timestamps
3594 info.stats.reported_epoch = get_osdmap_epoch();
3595 ++info.stats.reported_seq;
3596
3597 info.stats.last_fresh = now;
3598
3599 if (info.stats.state & PG_STATE_CLEAN)
3600 info.stats.last_clean = now;
3601 if (info.stats.state & PG_STATE_ACTIVE)
3602 info.stats.last_active = now;
3603 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3604 info.stats.last_peered = now;
3605 info.stats.last_unstale = now;
3606 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3607 info.stats.last_undegraded = now;
3608 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3609 info.stats.last_fullsized = now;
3610
3611 pg_stats_publish_valid = true;
3612 pg_stats_publish = pre_publish;
3613
3614 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3615 << ":" << pg_stats_publish.reported_seq << dendl;
3616 }
3617 pg_stats_publish_lock.Unlock();
3618 }
3619
3620 void PG::clear_publish_stats()
3621 {
3622 dout(15) << "clear_stats" << dendl;
3623 pg_stats_publish_lock.Lock();
3624 pg_stats_publish_valid = false;
3625 pg_stats_publish_lock.Unlock();
3626 }
3627
3628 /**
3629 * initialize a newly instantiated pg
3630 *
3631 * Initialize PG state, as when a PG is initially created, or when it
3632 * is first instantiated on the current node.
3633 *
3634 * @param role our role/rank
3635 * @param newup up set
3636 * @param newacting acting set
3637 * @param history pg history
3638 * @param pi past_intervals
3639 * @param backfill true if info should be marked as backfill
3640 * @param t transaction to write out our new state in
3641 */
3642 void PG::init(
3643 int role,
3644 const vector<int>& newup, int new_up_primary,
3645 const vector<int>& newacting, int new_acting_primary,
3646 const pg_history_t& history,
3647 const PastIntervals& pi,
3648 bool backfill,
3649 ObjectStore::Transaction *t)
3650 {
3651 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3652 << " history " << history
3653 << " past_intervals " << pi
3654 << dendl;
3655
3656 set_role(role);
3657 init_primary_up_acting(
3658 newup,
3659 newacting,
3660 new_up_primary,
3661 new_acting_primary);
3662
3663 info.history = history;
3664 past_intervals = pi;
3665
3666 info.stats.up = up;
3667 info.stats.up_primary = new_up_primary;
3668 info.stats.acting = acting;
3669 info.stats.acting_primary = new_acting_primary;
3670 info.stats.mapping_epoch = info.history.same_interval_since;
3671
3672 if (backfill) {
3673 dout(10) << __func__ << ": Setting backfill" << dendl;
3674 info.set_last_backfill(hobject_t());
3675 info.last_complete = info.last_update;
3676 pg_log.mark_log_for_rewrite();
3677 }
3678
3679 on_new_interval();
3680
3681 dirty_info = true;
3682 dirty_big_info = true;
3683 write_if_dirty(*t);
3684 }
3685
3686 void PG::shutdown()
3687 {
3688 ch->flush();
3689 lock();
3690 on_shutdown();
3691 unlock();
3692 }
3693
3694 #pragma GCC diagnostic ignored "-Wpragmas"
3695 #pragma GCC diagnostic push
3696 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3697
3698 void PG::upgrade(ObjectStore *store)
3699 {
3700 dout(0) << __func__ << " " << info_struct_v << " -> " << latest_struct_v
3701 << dendl;
3702 ceph_assert(info_struct_v <= 10);
3703 ObjectStore::Transaction t;
3704
3705 // <do upgrade steps here>
3706
3707 // finished upgrade!
3708 ceph_assert(info_struct_v == 10);
3709
3710 // update infover_key
3711 if (info_struct_v < latest_struct_v) {
3712 map<string,bufferlist> v;
3713 __u8 ver = latest_struct_v;
3714 encode(ver, v[infover_key]);
3715 t.omap_setkeys(coll, pgmeta_oid, v);
3716 }
3717
3718 dirty_info = true;
3719 dirty_big_info = true;
3720 write_if_dirty(t);
3721
3722 ObjectStore::CollectionHandle ch = store->open_collection(coll);
3723 int r = store->queue_transaction(ch, std::move(t));
3724 if (r != 0) {
3725 derr << __func__ << ": queue_transaction returned "
3726 << cpp_strerror(r) << dendl;
3727 ceph_abort();
3728 }
3729 ceph_assert(r == 0);
3730
3731 C_SaferCond waiter;
3732 if (!ch->flush_commit(&waiter)) {
3733 waiter.wait();
3734 }
3735 }
3736
3737 #pragma GCC diagnostic pop
3738 #pragma GCC diagnostic warning "-Wpragmas"
3739
3740 int PG::_prepare_write_info(CephContext* cct,
3741 map<string,bufferlist> *km,
3742 epoch_t epoch,
3743 pg_info_t &info, pg_info_t &last_written_info,
3744 PastIntervals &past_intervals,
3745 bool dirty_big_info,
3746 bool dirty_epoch,
3747 bool try_fast_info,
3748 PerfCounters *logger)
3749 {
3750 if (dirty_epoch) {
3751 encode(epoch, (*km)[epoch_key]);
3752 }
3753
3754 if (logger)
3755 logger->inc(l_osd_pg_info);
3756
3757 // try to do info efficiently?
3758 if (!dirty_big_info && try_fast_info &&
3759 info.last_update > last_written_info.last_update) {
3760 pg_fast_info_t fast;
3761 fast.populate_from(info);
3762 bool did = fast.try_apply_to(&last_written_info);
3763 ceph_assert(did); // we verified last_update increased above
3764 if (info == last_written_info) {
3765 encode(fast, (*km)[fastinfo_key]);
3766 if (logger)
3767 logger->inc(l_osd_pg_fastinfo);
3768 return 0;
3769 }
3770 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3771 {
3772 JSONFormatter jf(true);
3773 jf.dump_object("info", info);
3774 jf.flush(*_dout);
3775 }
3776 {
3777 *_dout << "\nlast_written_info:\n";
3778 JSONFormatter jf(true);
3779 jf.dump_object("last_written_info", last_written_info);
3780 jf.flush(*_dout);
3781 }
3782 *_dout << dendl;
3783 }
3784 last_written_info = info;
3785
3786 // info. store purged_snaps separately.
3787 interval_set<snapid_t> purged_snaps;
3788 purged_snaps.swap(info.purged_snaps);
3789 encode(info, (*km)[info_key]);
3790 purged_snaps.swap(info.purged_snaps);
3791
3792 if (dirty_big_info) {
3793 // potentially big stuff
3794 bufferlist& bigbl = (*km)[biginfo_key];
3795 encode(past_intervals, bigbl);
3796 encode(info.purged_snaps, bigbl);
3797 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3798 if (logger)
3799 logger->inc(l_osd_pg_biginfo);
3800 }
3801
3802 return 0;
3803 }
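
// Summary of the pgmeta omap layout written above, as a minimal sketch (not
// part of the build; the helper is hypothetical). On the fast path only the
// small "_fastinfo" delta is rewritten; otherwise the full "_info" value is
// rewritten, plus "_biginfo" (past_intervals + purged_snaps) when
// dirty_big_info is set. "_epoch" is refreshed whenever dirty_epoch is set.
#if 0
static void example_pgmeta_keys(map<string,bufferlist>* km,
                                bool dirty_epoch, bool fast_path,
                                bool dirty_big_info)
{
  if (dirty_epoch)
    (*km)[epoch_key];      // osdmap epoch of this write
  if (fast_path) {
    (*km)[fastinfo_key];   // pg_fast_info_t: last_update, stats delta, ...
  } else {
    (*km)[info_key];       // full pg_info_t (purged_snaps stored separately)
    if (dirty_big_info)
      (*km)[biginfo_key];  // past_intervals + purged_snaps
  }
  // the caller then applies *km with t.omap_setkeys(coll, pgmeta_oid, *km)
}
#endif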
3804
3805 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3806 {
3807 coll_t coll(pgid);
3808 t.create_collection(coll, bits);
3809 }
3810
3811 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3812 {
3813 coll_t coll(pgid);
3814
3815 if (pool) {
3816 // Give a hint to the PG collection
3817 bufferlist hint;
3818 uint32_t pg_num = pool->get_pg_num();
3819 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3820 encode(pg_num, hint);
3821 encode(expected_num_objects_pg, hint);
3822 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3823 t.collection_hint(coll, hint_type, hint);
3824 }
3825
3826 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3827 t.touch(coll, pgmeta_oid);
3828 map<string,bufferlist> values;
3829 __u8 struct_v = latest_struct_v;
3830 encode(struct_v, values[infover_key]);
3831 t.omap_setkeys(coll, pgmeta_oid, values);
3832 }
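
// Worked example for the collection hint above (numbers are hypothetical):
// with pool->expected_num_objects = 1000000 and pg_num = 256, each PG
// collection is hinted to expect 1000000 / 256 = 3906 objects (integer
// division), which lets the object store pre-split the collection.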
3833
3834 void PG::prepare_write_info(map<string,bufferlist> *km)
3835 {
3836 info.stats.stats.add(unstable_stats);
3837 unstable_stats.clear();
3838
3839 bool need_update_epoch = last_epoch < get_osdmap_epoch();
3840 int ret = _prepare_write_info(cct, km, get_osdmap_epoch(),
3841 info,
3842 last_written_info,
3843 past_intervals,
3844 dirty_big_info, need_update_epoch,
3845 cct->_conf->osd_fast_info,
3846 osd->logger);
3847 ceph_assert(ret == 0);
3848 if (need_update_epoch)
3849 last_epoch = get_osdmap_epoch();
3850 last_persisted_osdmap = last_epoch;
3851
3852 dirty_info = false;
3853 dirty_big_info = false;
3854 }
3855
3856 #pragma GCC diagnostic ignored "-Wpragmas"
3857 #pragma GCC diagnostic push
3858 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3859
3860 bool PG::_has_removal_flag(ObjectStore *store,
3861 spg_t pgid)
3862 {
3863 coll_t coll(pgid);
3864 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3865
3866 // first try new way
3867 set<string> keys;
3868 keys.insert("_remove");
3869 map<string,bufferlist> values;
3870 auto ch = store->open_collection(coll);
3871 ceph_assert(ch);
3872 if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
3873 values.size() == 1)
3874 return true;
3875
3876 return false;
3877 }
3878
3879 int PG::peek_map_epoch(ObjectStore *store,
3880 spg_t pgid,
3881 epoch_t *pepoch)
3882 {
3883 coll_t coll(pgid);
3884 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3885 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3886 epoch_t cur_epoch = 0;
3887
3888 // validate collection name
3889 ceph_assert(coll.is_pg());
3890
3891 // try for v8
3892 set<string> keys;
3893 keys.insert(infover_key);
3894 keys.insert(epoch_key);
3895 map<string,bufferlist> values;
3896 auto ch = store->open_collection(coll);
3897 ceph_assert(ch);
3898 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
3899 if (r == 0) {
3900 ceph_assert(values.size() == 2);
3901
3902 // sanity check version
3903 auto bp = values[infover_key].cbegin();
3904 __u8 struct_v = 0;
3905 decode(struct_v, bp);
3906 ceph_assert(struct_v >= 8);
3907
3908 // get epoch
3909 bp = values[epoch_key].begin();
3910 decode(cur_epoch, bp);
3911 } else {
3912 // probably bug 10617; see OSD::load_pgs()
3913 return -1;
3914 }
3915
3916 *pepoch = cur_epoch;
3917 return 0;
3918 }
3919
3920 #pragma GCC diagnostic pop
3921 #pragma GCC diagnostic warning "-Wpragmas"
3922
3923 void PG::write_if_dirty(ObjectStore::Transaction& t)
3924 {
3925 map<string,bufferlist> km;
3926 if (dirty_big_info || dirty_info)
3927 prepare_write_info(&km);
3928 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3929 if (!km.empty())
3930 t.omap_setkeys(coll, pgmeta_oid, km);
3931 }
3932
3933 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3934 {
3935 // raise last_complete only if we were previously up to date
3936 if (info.last_complete == info.last_update)
3937 info.last_complete = e.version;
3938
3939 // raise last_update.
3940 ceph_assert(e.version > info.last_update);
3941 info.last_update = e.version;
3942
3943 // raise user_version, if it increased (it may not have been bumped
3944 // by all logged updates)
3945 if (e.user_version > info.last_user_version)
3946 info.last_user_version = e.user_version;
3947
3948 // log mutation
3949 pg_log.add(e, applied);
3950 dout(10) << "add_log_entry " << e << dendl;
3951 }
3952
3953
3954 void PG::append_log(
3955 const vector<pg_log_entry_t>& logv,
3956 eversion_t trim_to,
3957 eversion_t roll_forward_to,
3958 ObjectStore::Transaction &t,
3959 bool transaction_applied,
3960 bool async)
3961 {
3962 if (transaction_applied)
3963 update_snap_map(logv, t);
3964
3965 /* The primary has sent an info updating the history, but it may not
3966 * have arrived yet. We want to make sure that we cannot remember this
3967 * write without remembering that it happened in an interval which went
3968 * active in epoch history.last_epoch_started.
3969 */
3970 if (info.last_epoch_started != info.history.last_epoch_started) {
3971 info.history.last_epoch_started = info.last_epoch_started;
3972 }
3973 if (info.last_interval_started != info.history.last_interval_started) {
3974 info.history.last_interval_started = info.last_interval_started;
3975 }
3976 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3977
3978 PGLogEntryHandler handler{this, &t};
3979 if (!transaction_applied) {
3980 /* We must be a backfill or async recovery peer, so it's ok if we apply
3981 * out-of-turn since we won't be considered when
3982 * determining a min possible last_update.
3983 *
3984 * We skip_rollforward() here, which advances the crt, without
3985 * doing an actual rollforward. This avoids cleaning up entries
3986 * from the backend and we do not end up in a situation, where the
3987 * object is deleted before we can _merge_object_divergent_entries().
3988 */
3989 pg_log.skip_rollforward();
3990 }
3991
3992 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3993 p != logv.end();
3994 ++p) {
3995 add_log_entry(*p, transaction_applied);
3996
3997 /* We don't want to leave the rollforward artifacts around
3998 * here past last_backfill. It's ok for the same reason as
3999 * above */
4000 if (transaction_applied &&
4001 p->soid > info.last_backfill) {
4002 pg_log.roll_forward(&handler);
4003 }
4004 }
4005 auto last = logv.rbegin();
4006 if (is_primary() && last != logv.rend()) {
4007 projected_log.skip_can_rollback_to_to_head();
4008 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
4009 }
4010
4011 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4012 pg_log.roll_forward_to(
4013 roll_forward_to,
4014 &handler);
4015 last_rollback_info_trimmed_to_applied = roll_forward_to;
4016 }
4017
4018 dout(10) << __func__ << " approx pg log length = "
4019 << pg_log.get_log().approx_size() << dendl;
4020 dout(10) << __func__ << " transaction_applied = "
4021 << transaction_applied << dendl;
4022 if (!transaction_applied || async)
4023 dout(10) << __func__ << " " << pg_whoami
4024 << " is async_recovery or backfill target" << dendl;
4025 pg_log.trim(trim_to, info, transaction_applied, async);
4026
4027 // update the local pg, pg log
4028 dirty_info = true;
4029 write_if_dirty(t);
4030 }
4031
4032 bool PG::check_log_for_corruption(ObjectStore *store)
4033 {
4034 /// TODO: this method needs to work with the omap log
4035 return true;
4036 }
4037
4038 //! Get the name we're going to save our corrupt pg log as
4039 std::string PG::get_corrupt_pg_log_name() const
4040 {
4041 const int MAX_BUF = 512;
4042 char buf[MAX_BUF];
4043 struct tm tm_buf;
4044 time_t my_time(time(NULL));
4045 const struct tm *t = localtime_r(&my_time, &tm_buf);
4046 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
4047 if (ret == 0) {
4048 dout(0) << "strftime failed" << dendl;
4049 return "corrupt_log_unknown_time";
4050 }
4051 string out(buf);
4052 out += stringify(info.pgid);
4053 return out;
4054 }
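
// Worked example (hypothetical values): for pg 1.2a at 14:32 on 2019-10-07
// the name produced above is "corrupt_log_2019-10-07_14:32_1.2a"
// (strftime's %k pads hours below 10 with a leading space).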
4055
4056 int PG::read_info(
4057 ObjectStore *store, spg_t pgid, const coll_t &coll,
4058 pg_info_t &info, PastIntervals &past_intervals,
4059 __u8 &struct_v)
4060 {
4061 set<string> keys;
4062 keys.insert(infover_key);
4063 keys.insert(info_key);
4064 keys.insert(biginfo_key);
4065 keys.insert(fastinfo_key);
4066 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
4067 map<string,bufferlist> values;
4068 auto ch = store->open_collection(coll);
4069 ceph_assert(ch);
4070 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
4071 ceph_assert(r == 0);
4072 ceph_assert(values.size() == 3 ||
4073 values.size() == 4);
4074
4075 auto p = values[infover_key].cbegin();
4076 decode(struct_v, p);
4077 ceph_assert(struct_v >= 10);
4078
4079 p = values[info_key].begin();
4080 decode(info, p);
4081
4082 p = values[biginfo_key].begin();
4083 decode(past_intervals, p);
4084 decode(info.purged_snaps, p);
4085
4086 p = values[fastinfo_key].begin();
4087 if (!p.end()) {
4088 pg_fast_info_t fast;
4089 decode(fast, p);
4090 fast.try_apply_to(&info);
4091 }
4092 return 0;
4093 }
4094
4095 void PG::read_state(ObjectStore *store)
4096 {
4097 int r = read_info(store, pg_id, coll, info, past_intervals,
4098 info_struct_v);
4099 ceph_assert(r >= 0);
4100
4101 if (info_struct_v < compat_struct_v) {
4102 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4103 << " an older version first." << dendl;
4104 ceph_abort_msg("PG too old to upgrade");
4105 }
4106
4107 last_written_info = info;
4108
4109 ostringstream oss;
4110 pg_log.read_log_and_missing(
4111 store,
4112 ch,
4113 pgmeta_oid,
4114 info,
4115 oss,
4116 cct->_conf->osd_ignore_stale_divergent_priors,
4117 cct->_conf->osd_debug_verify_missing_on_start);
4118 if (oss.tellp())
4119 osd->clog->error() << oss.str();
4120
4121 // log any weirdness
4122 log_weirdness();
4123
4124 if (info_struct_v < latest_struct_v) {
4125 upgrade(store);
4126 }
4127
4128 // initialize current mapping
4129 {
4130 int primary, up_primary;
4131 vector<int> acting, up;
4132 get_osdmap()->pg_to_up_acting_osds(
4133 pg_id.pgid, &up, &up_primary, &acting, &primary);
4134 init_primary_up_acting(
4135 up,
4136 acting,
4137 up_primary,
4138 primary);
4139 int rr = OSDMap::calc_pg_role(osd->whoami, acting);
4140 if (pool.info.is_replicated() || rr == pg_whoami.shard)
4141 set_role(rr);
4142 else
4143 set_role(-1);
4144 }
4145
4146 PG::RecoveryCtx rctx(0, 0, 0, new ObjectStore::Transaction);
4147 handle_initialize(&rctx);
4148 // note: we don't activate here because we know the OSD will advance maps
4149 // during boot.
4150 write_if_dirty(*rctx.transaction);
4151 store->queue_transaction(ch, std::move(*rctx.transaction));
4152 delete rctx.transaction;
4153 }
4154
4155 void PG::log_weirdness()
4156 {
4157 if (pg_log.get_tail() != info.log_tail)
4158 osd->clog->error() << info.pgid
4159 << " info mismatch, log.tail " << pg_log.get_tail()
4160 << " != info.log_tail " << info.log_tail;
4161 if (pg_log.get_head() != info.last_update)
4162 osd->clog->error() << info.pgid
4163 << " info mismatch, log.head " << pg_log.get_head()
4164 << " != info.last_update " << info.last_update;
4165
4166 if (!pg_log.get_log().empty()) {
4167 // sloppy check
4168 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
4169 osd->clog->error() << info.pgid
4170 << " log bound mismatch, info (tail,head] ("
4171 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
4172 << " actual ["
4173 << pg_log.get_log().log.begin()->version << ","
4174 << pg_log.get_log().log.rbegin()->version << "]";
4175 }
4176
4177 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
4178 osd->clog->error() << info.pgid
4179 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
4180 << " > log size " << pg_log.get_log().log.size();
4181 }
4182 }
4183
4184 void PG::update_snap_map(
4185 const vector<pg_log_entry_t> &log_entries,
4186 ObjectStore::Transaction &t)
4187 {
4188 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
4189 i != log_entries.end();
4190 ++i) {
4191 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4192 if (i->soid.snap < CEPH_MAXSNAP) {
4193 if (i->is_delete()) {
4194 int r = snap_mapper.remove_oid(
4195 i->soid,
4196 &_t);
4197 if (r != 0)
4198 derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
4199 // On removal tolerate missing key corruption
4200 ceph_assert(r == 0 || r == -ENOENT);
4201 } else if (i->is_update()) {
4202 ceph_assert(i->snaps.length() > 0);
4203 vector<snapid_t> snaps;
4204 bufferlist snapbl = i->snaps;
4205 auto p = snapbl.cbegin();
4206 try {
4207 decode(snaps, p);
4208 } catch (...) {
4209 derr << __func__ << " decode snaps failure on " << *i << dendl;
4210 snaps.clear();
4211 }
4212 set<snapid_t> _snaps(snaps.begin(), snaps.end());
4213
4214 if (i->is_clone() || i->is_promote()) {
4215 snap_mapper.add_oid(
4216 i->soid,
4217 _snaps,
4218 &_t);
4219 } else if (i->is_modify()) {
4220 int r = snap_mapper.update_snaps(
4221 i->soid,
4222 _snaps,
4223 0,
4224 &_t);
4225 ceph_assert(r == 0);
4226 } else {
4227 ceph_assert(i->is_clean());
4228 }
4229 }
4230 }
4231 }
4232 }
4233
4234 /**
4235 * filter trimming|trimmed snaps out of snapcontext
4236 */
4237 void PG::filter_snapc(vector<snapid_t> &snaps)
4238 {
4239 // nothing needs to trim, we can return immediately
4240 if (snap_trimq.empty() && info.purged_snaps.empty())
4241 return;
4242
4243 bool filtering = false;
4244 vector<snapid_t> newsnaps;
4245 for (vector<snapid_t>::iterator p = snaps.begin();
4246 p != snaps.end();
4247 ++p) {
4248 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
4249 if (!filtering) {
4250 // start building a new vector with what we've seen so far
4251 dout(10) << "filter_snapc filtering " << snaps << dendl;
4252 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
4253 filtering = true;
4254 }
4255 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
4256 } else {
4257 if (filtering)
4258 newsnaps.push_back(*p); // continue building new vector
4259 }
4260 }
4261 if (filtering) {
4262 snaps.swap(newsnaps);
4263 dout(10) << "filter_snapc result " << snaps << dendl;
4264 }
4265 }
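
// Worked example (hypothetical values): with snaps = [8, 6, 4, 2] and snap 6
// in snap_trimq or info.purged_snaps, filtering starts at 6: the prefix [8]
// is copied into newsnaps, 6 is dropped, 4 and 2 are appended, and snaps is
// swapped to [8, 4, 2].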
4266
4267 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
4268 {
4269 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
4270 it != m.end();
4271 ++it)
4272 requeue_ops(it->second);
4273 m.clear();
4274 }
4275
4276 void PG::requeue_op(OpRequestRef op)
4277 {
4278 auto p = waiting_for_map.find(op->get_source());
4279 if (p != waiting_for_map.end()) {
4280 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
4281 << dendl;
4282 p->second.push_front(op);
4283 } else {
4284 dout(20) << __func__ << " " << op << dendl;
4285 osd->enqueue_front(
4286 OpQueueItem(
4287 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
4288 op->get_req()->get_cost(),
4289 op->get_req()->get_priority(),
4290 op->get_req()->get_recv_stamp(),
4291 op->get_req()->get_source().num(),
4292 get_osdmap_epoch()));
4293 }
4294 }
4295
4296 void PG::requeue_ops(list<OpRequestRef> &ls)
4297 {
4298 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
4299 i != ls.rend();
4300 ++i) {
4301 requeue_op(*i);
4302 }
4303 ls.clear();
4304 }
4305
4306 void PG::requeue_map_waiters()
4307 {
4308 epoch_t epoch = get_osdmap_epoch();
4309 auto p = waiting_for_map.begin();
4310 while (p != waiting_for_map.end()) {
4311 if (epoch < p->second.front()->min_epoch) {
4312 dout(20) << __func__ << " " << p->first << " front op "
4313 << p->second.front() << " must still wait, doing nothing"
4314 << dendl;
4315 ++p;
4316 } else {
4317 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
4318 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
4319 auto req = *q;
4320 osd->enqueue_front(OpQueueItem(
4321 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, req)),
4322 req->get_req()->get_cost(),
4323 req->get_req()->get_priority(),
4324 req->get_req()->get_recv_stamp(),
4325 req->get_req()->get_source().num(),
4326 epoch));
4327 }
4328 p = waiting_for_map.erase(p);
4329 }
4330 }
4331 }
4332
4333
4334 // ==========================================================================================
4335 // SCRUB
4336
4337 /*
4338 * when holding pg and sched_scrub_lock, then the states are:
4339 * scheduling:
4340 * scrubber.reserved = true
4341 * scrubber.reserved_peers includes whoami
4342 * osd->scrub_pending++
4343 * scheduling, replica declined:
4344 * scrubber.reserved = true
4345 * scrubber.reserved_peers includes -1
4346 * osd->scrub_pending++
4347 * pending:
4348 * scrubber.reserved = true
4349 * scrubber.reserved_peers.size() == acting.size();
4350 * pg on scrub_wq
4351 * osd->scrub_pending++
4352 * scrubbing:
4353 * scrubber.reserved = false;
4354 * scrubber.reserved_peers empty
4355 * osd->scrubber.active++
4356 */
4357
4358 // returns true if a scrub has been newly kicked off
4359 bool PG::sched_scrub()
4360 {
4361 ceph_assert(is_locked());
4362 ceph_assert(!is_scrubbing());
4363 if (!(is_primary() && is_active() && is_clean())) {
4364 return false;
4365 }
4366
4367 // All processing the first time through commits us to whatever
4368 // choices are made.
4369 if (!scrubber.reserved) {
4370 dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
4371
4372 bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
4373 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
4374 bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
4375 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
4376 bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
4377 bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair
4378 && get_pgbackend()->auto_repair_supported());
4379
4380 scrubber.time_for_deep = false;
4381 // Clear these in case user issues the scrub/repair command during
4382 // the scheduling of the scrub/repair (e.g. request reservation)
4383 scrubber.deep_scrub_on_error = false;
4384 scrubber.auto_repair = false;
4385
4386 // All periodic scrub handling goes here because must_scrub is
4387 // always set for must_deep_scrub and must_repair.
4388 if (!scrubber.must_scrub) {
4389 ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair);
4390 // Handle deep scrub determination only if allowed
4391 if (allow_deep_scrub) {
4392 // Initial entry and scheduled scrubs without nodeep_scrub set get here
4393 if (scrubber.need_auto) {
4394 dout(20) << __func__ << ": need repair after scrub errors" << dendl;
4395 scrubber.time_for_deep = true;
4396 } else {
4397 double deep_scrub_interval = 0;
4398 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
4399 if (deep_scrub_interval <= 0) {
4400 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
4401 }
4402 scrubber.time_for_deep = ceph_clock_now() >=
4403 info.history.last_deep_scrub_stamp + deep_scrub_interval;
4404
4405 bool deep_coin_flip = false;
4406 // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees
4407 // we will deep scrub because this function is called often.
4408 if (!scrubber.time_for_deep && allow_scrub)
4409 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
4410 dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
4411
4412 scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip);
4413 }
4414
4415 if (!scrubber.time_for_deep && has_deep_errors) {
4416 osd->clog->info() << "osd." << osd->whoami
4417 << " pg " << info.pgid
4418 << " Deep scrub errors, upgrading scrub to deep-scrub";
4419 scrubber.time_for_deep = true;
4420 }
4421
4422 if (try_to_auto_repair) {
4423 if (scrubber.time_for_deep) {
4424 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
4425 scrubber.auto_repair = true;
4426 } else if (allow_scrub) {
4427 dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
4428 scrubber.deep_scrub_on_error = true;
4429 }
4430 }
4431 } else { // !allow_deep_scrub
4432 dout(20) << __func__ << ": nodeep_scrub set" << dendl;
4433 if (has_deep_errors) {
4434 osd->clog->error() << "osd." << osd->whoami
4435 << " pg " << info.pgid
4436 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
4437 return false;
4438 }
4439 }
4440
4441 // NOSCRUB set, so skip regular scrubs
4442 if (!allow_scrub && !scrubber.time_for_deep) {
4443 return false;
4444 }
4445 // scrubber.must_scrub
4446 } else if (!scrubber.must_deep_scrub && has_deep_errors) {
4447 osd->clog->error() << "osd." << osd->whoami
4448 << " pg " << info.pgid
4449 << " Regular scrub request, deep-scrub details will be lost";
4450 }
4451 // Unless precluded, this was handled above
4452 scrubber.need_auto = false;
4453
4454 ceph_assert(scrubber.reserved_peers.empty());
4455 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4456 osd->inc_scrubs_pending()) {
4457 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
4458 scrubber.reserved = true;
4459 scrubber.reserved_peers.insert(pg_whoami);
4460 scrub_reserve_replicas();
4461 } else {
4462 dout(20) << __func__ << ": failed to reserve locally" << dendl;
4463 return false;
4464 }
4465 }
4466
4467 if (scrubber.reserved) {
4468 if (scrubber.reserve_failed) {
4469 dout(20) << __func__ << ": failed, a peer declined" << dendl;
4470 clear_scrub_reserved();
4471 scrub_unreserve_replicas();
4472 return false;
4473 } else if (scrubber.reserved_peers.size() == actingset.size()) {
4474 dout(20) << __func__ << ": success, reserved self and replicas" << dendl;
4475 if (scrubber.time_for_deep) {
4476 dout(10) << __func__ << ": scrub will be deep" << dendl;
4477 state_set(PG_STATE_DEEP_SCRUB);
4478 scrubber.time_for_deep = false;
4479 }
4480 queue_scrub();
4481 } else {
4482 // none declined, since scrubber.reserved is set
4483 dout(20) << __func__ << ": reserved " << scrubber.reserved_peers
4484 << ", waiting for replicas" << dendl;
4485 }
4486 }
4487 return true;
4488 }
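
// A minimal stand-alone sketch of the deep-scrub decision above (not part of
// the build; the helper name and parameters are hypothetical): a periodic
// scrub is promoted to a deep scrub when the deep interval has elapsed since
// the last deep scrub, or otherwise by a coin flip weighted by
// osd_deep_scrub_randomize_ratio.
#if 0
#include <cstdlib>

static bool example_time_for_deep(double now, double last_deep_scrub_stamp,
                                  double deep_scrub_interval,
                                  double randomize_ratio, bool allow_scrub)
{
  bool due = now >= last_deep_scrub_stamp + deep_scrub_interval;
  bool deep_coin_flip = false;
  // only flip the coin when regular scrubs are allowed; otherwise every call
  // would eventually force a deep scrub (see the comment in sched_scrub())
  if (!due && allow_scrub)
    deep_coin_flip = (rand() % 100) < randomize_ratio * 100;
  return due || deep_coin_flip;
}
#endif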
4489
4490 bool PG::is_scrub_registered()
4491 {
4492 return !scrubber.scrub_reg_stamp.is_zero();
4493 }
4494
4495 void PG::reg_next_scrub()
4496 {
4497 if (!is_primary())
4498 return;
4499
4500 utime_t reg_stamp;
4501 bool must = false;
4502 if (scrubber.must_scrub || scrubber.need_auto) {
4503 // Set the smallest time that isn't utime_t()
4504 reg_stamp = Scrubber::scrub_must_stamp();
4505 must = true;
4506 } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
4507 reg_stamp = ceph_clock_now();
4508 must = true;
4509 } else {
4510 reg_stamp = info.history.last_scrub_stamp;
4511 }
4512 // note down the sched_time, so we can locate this scrub, and remove it
4513 // later on.
4514 double scrub_min_interval = 0, scrub_max_interval = 0;
4515 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
4516 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
4517 ceph_assert(!is_scrub_registered());
4518 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
4519 reg_stamp,
4520 scrub_min_interval,
4521 scrub_max_interval,
4522 must);
4523 dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
4524 << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
4525 }
4526
4527 void PG::unreg_next_scrub()
4528 {
4529 if (is_scrub_registered()) {
4530 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
4531 scrubber.scrub_reg_stamp = utime_t();
4532 }
4533 }
4534
4535 void PG::on_info_history_change()
4536 {
4537 unreg_next_scrub();
4538 reg_next_scrub();
4539 }
4540
4541 void PG::scrub_requested(bool deep, bool repair, bool need_auto)
4542 {
4543 unreg_next_scrub();
4544 if (need_auto) {
4545 scrubber.need_auto = true;
4546 } else {
4547 scrubber.must_scrub = true;
4548 scrubber.must_deep_scrub = deep || repair;
4549 scrubber.must_repair = repair;
4550 // User might intervene, so clear this
4551 scrubber.need_auto = false;
4552 }
4553 reg_next_scrub();
4554 }
4555
4556 void PG::do_replica_scrub_map(OpRequestRef op)
4557 {
4558 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
4559 dout(7) << __func__ << " " << *m << dendl;
4560 if (m->map_epoch < info.history.same_interval_since) {
4561 dout(10) << __func__ << " discarding old from "
4562 << m->map_epoch << " < " << info.history.same_interval_since
4563 << dendl;
4564 return;
4565 }
4566 if (!scrubber.is_chunky_scrub_active()) {
4567 dout(10) << __func__ << " scrub isn't active" << dendl;
4568 return;
4569 }
4570
4571 op->mark_started();
4572
4573 auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
4574 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4575 dout(10) << "map version is "
4576 << scrubber.received_maps[m->from].valid_through
4577 << dendl;
4578
4579 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
4580 << dendl;
4581 ceph_assert(scrubber.waiting_on_whom.count(m->from));
4582 scrubber.waiting_on_whom.erase(m->from);
4583 if (m->preempted) {
4584 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
4585 scrub_preempted = true;
4586 }
4587 if (scrubber.waiting_on_whom.empty()) {
4588 requeue_scrub(ops_blocked_by_scrub());
4589 }
4590 }
4591
4592 // send scrub v3 messages (chunky scrub)
4593 void PG::_request_scrub_map(
4594 pg_shard_t replica, eversion_t version,
4595 hobject_t start, hobject_t end,
4596 bool deep,
4597 bool allow_preemption)
4598 {
4599 ceph_assert(replica != pg_whoami);
4600 dout(10) << "scrub requesting scrubmap from osd." << replica
4601 << " deep " << (int)deep << dendl;
4602 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4603 spg_t(info.pgid.pgid, replica.shard), version,
4604 get_osdmap_epoch(),
4605 get_last_peering_reset(),
4606 start, end, deep,
4607 allow_preemption,
4608 scrubber.priority,
4609 ops_blocked_by_scrub());
4610 // default priority, we want the rep scrub processed prior to any recovery
4611 // or client io messages (we are holding a lock!)
4612 osd->send_message_osd_cluster(
4613 replica.osd, repscrubop, get_osdmap_epoch());
4614 }
4615
4616 void PG::handle_scrub_reserve_request(OpRequestRef op)
4617 {
4618 dout(7) << __func__ << " " << *op->get_req() << dendl;
4619 op->mark_started();
4620 if (scrubber.reserved) {
4621 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4622 << dendl;
4623 return;
4624 }
4625 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4626 osd->inc_scrubs_pending()) {
4627 scrubber.reserved = true;
4628 } else {
4629 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4630 scrubber.reserved = false;
4631 }
4632 const MOSDScrubReserve *m =
4633 static_cast<const MOSDScrubReserve*>(op->get_req());
4634 Message *reply = new MOSDScrubReserve(
4635 spg_t(info.pgid.pgid, primary.shard),
4636 m->map_epoch,
4637 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4638 pg_whoami);
4639 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4640 }
4641
4642 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4643 {
4644 dout(7) << __func__ << " " << *op->get_req() << dendl;
4645 op->mark_started();
4646 if (!scrubber.reserved) {
4647 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4648 return;
4649 }
4650 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4651 dout(10) << " already had osd." << from << " reserved" << dendl;
4652 } else {
4653 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4654 scrubber.reserved_peers.insert(from);
4655 sched_scrub();
4656 }
4657 }
4658
4659 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4660 {
4661 dout(7) << __func__ << " " << *op->get_req() << dendl;
4662 op->mark_started();
4663 if (!scrubber.reserved) {
4664 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4665 return;
4666 }
4667 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4668 dout(10) << " already had osd." << from << " reserved" << dendl;
4669 } else {
4670 /* One decline stops this pg from being scheduled for scrubbing. */
4671 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4672 scrubber.reserve_failed = true;
4673 sched_scrub();
4674 }
4675 }
4676
4677 void PG::handle_scrub_reserve_release(OpRequestRef op)
4678 {
4679 dout(7) << __func__ << " " << *op->get_req() << dendl;
4680 op->mark_started();
4681 clear_scrub_reserved();
4682 }
4683
4684 // We can zero the value of primary num_bytes with just an atomic store.
4685 // However, setting it above zero reserves space for backfill and requires
4686 // the OSDService::stat_lock, which protects all OSD usage
4687 void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
4688 ceph_assert(osd->stat_lock.is_locked_by_me());
4689 primary_num_bytes.store(primary);
4690 local_num_bytes.store(local);
4691 return;
4692 }
4693
4694 void PG::clear_reserved_num_bytes() {
4695 primary_num_bytes.store(0);
4696 local_num_bytes.store(0);
4697 return;
4698 }
4699
4700 void PG::reject_reservation()
4701 {
4702 clear_reserved_num_bytes();
4703 osd->send_message_osd_cluster(
4704 primary.osd,
4705 new MBackfillReserve(
4706 MBackfillReserve::REJECT,
4707 spg_t(info.pgid.pgid, primary.shard),
4708 get_osdmap_epoch()),
4709 get_osdmap_epoch());
4710 }
4711
4712 void PG::schedule_backfill_retry(float delay)
4713 {
4714 std::lock_guard lock(osd->recovery_request_lock);
4715 osd->recovery_request_timer.add_event_after(
4716 delay,
4717 new QueuePeeringEvt<RequestBackfill>(
4718 this, get_osdmap_epoch(),
4719 RequestBackfill()));
4720 }
4721
4722 void PG::schedule_recovery_retry(float delay)
4723 {
4724 std::lock_guard lock(osd->recovery_request_lock);
4725 osd->recovery_request_timer.add_event_after(
4726 delay,
4727 new QueuePeeringEvt<DoRecovery>(
4728 this, get_osdmap_epoch(),
4729 DoRecovery()));
4730 }
4731
4732 void PG::clear_scrub_reserved()
4733 {
4734 scrubber.reserved_peers.clear();
4735 scrubber.reserve_failed = false;
4736
4737 if (scrubber.reserved) {
4738 scrubber.reserved = false;
4739 osd->dec_scrubs_pending();
4740 }
4741 }
4742
4743 void PG::scrub_reserve_replicas()
4744 {
4745 ceph_assert(backfill_targets.empty());
4746 for (set<pg_shard_t>::iterator i = actingset.begin();
4747 i != actingset.end();
4748 ++i) {
4749 if (*i == pg_whoami) continue;
4750 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4751 osd->send_message_osd_cluster(
4752 i->osd,
4753 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4754 get_osdmap_epoch(),
4755 MOSDScrubReserve::REQUEST, pg_whoami),
4756 get_osdmap_epoch());
4757 }
4758 }
4759
4760 void PG::scrub_unreserve_replicas()
4761 {
4762 ceph_assert(backfill_targets.empty());
4763 for (set<pg_shard_t>::iterator i = actingset.begin();
4764 i != actingset.end();
4765 ++i) {
4766 if (*i == pg_whoami) continue;
4767 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4768 osd->send_message_osd_cluster(
4769 i->osd,
4770 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4771 get_osdmap_epoch(),
4772 MOSDScrubReserve::RELEASE, pg_whoami),
4773 get_osdmap_epoch());
4774 }
4775 }
4776
4777 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
4778 {
4779 ObjectStore::Transaction t;
4780 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4781 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4782 i != rollback_obs.end();
4783 ++i) {
4784 if (i->generation < trimmed_to.version) {
4785 dout(10) << __func__ << " osd." << osd->whoami
4786 << " pg " << info.pgid
4787 << " found obsolete rollback obj "
4788 << *i << " generation < trimmed_to "
4789 << trimmed_to
4790 << "...repaired" << dendl;
4791 t.remove(coll, *i);
4792 }
4793 }
4794 if (!t.empty()) {
4795 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4796 << dendl;
4797 osd->store->queue_transaction(ch, std::move(t), NULL);
4798 }
4799 }
4800
4801 void PG::_scan_snaps(ScrubMap &smap)
4802 {
4803 hobject_t head;
4804 SnapSet snapset;
4805
4806 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4807 // that the caller is using clean_meta_map(), and that it works properly.
4808 dout(20) << __func__ << " start" << dendl;
4809
4810 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4811 i != smap.objects.rend();
4812 ++i) {
4813 const hobject_t &hoid = i->first;
4814 ScrubMap::object &o = i->second;
4815
4816 dout(20) << __func__ << " " << hoid << dendl;
4817
4818 ceph_assert(!hoid.is_snapdir());
4819 if (hoid.is_head()) {
4820 // parse the SnapSet
4821 bufferlist bl;
4822 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4823 continue;
4824 }
4825 bl.push_back(o.attrs[SS_ATTR]);
4826 auto p = bl.cbegin();
4827 try {
4828 decode(snapset, p);
4829 } catch(...) {
4830 continue;
4831 }
4832 head = hoid.get_head();
4833 continue;
4834 }
4835 if (hoid.snap < CEPH_MAXSNAP) {
4836 // check and if necessary fix snap_mapper
4837 if (hoid.get_head() != head) {
4838 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4839 << dendl;
4840 continue;
4841 }
4842 set<snapid_t> obj_snaps;
4843 auto p = snapset.clone_snaps.find(hoid.snap);
4844 if (p == snapset.clone_snaps.end()) {
4845 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4846 << dendl;
4847 continue;
4848 }
4849 obj_snaps.insert(p->second.begin(), p->second.end());
4850 set<snapid_t> cur_snaps;
4851 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4852 if (r != 0 && r != -ENOENT) {
4853 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4854 ceph_abort();
4855 }
4856 if (r == -ENOENT || cur_snaps != obj_snaps) {
4857 ObjectStore::Transaction t;
4858 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4859 if (r == 0) {
4860 r = snap_mapper.remove_oid(hoid, &_t);
4861 if (r != 0) {
4862 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4863 << dendl;
4864 ceph_abort();
4865 }
4866 osd->clog->error() << "osd." << osd->whoami
4867 << " found snap mapper error on pg "
4868 << info.pgid
4869 << " oid " << hoid << " snaps in mapper: "
4870 << cur_snaps << ", oi: "
4871 << obj_snaps
4872 << "...repaired";
4873 } else {
4874 osd->clog->error() << "osd." << osd->whoami
4875 << " found snap mapper error on pg "
4876 << info.pgid
4877 << " oid " << hoid << " snaps missing in mapper"
4878 << ", should be: "
4879 << obj_snaps
4880 << " was " << cur_snaps << " r " << r
4881 << "...repaired";
4882 }
4883 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4884
4885 // wait for repair to apply to avoid confusing other bits of the system.
4886 {
4887 Cond my_cond;
4888 Mutex my_lock("PG::_scan_snaps my_lock");
4889 int r = 0;
4890 bool done;
4891 t.register_on_applied_sync(
4892 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4893 r = osd->store->queue_transaction(ch, std::move(t));
4894 if (r != 0) {
4895 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4896 << dendl;
4897 } else {
4898 my_lock.Lock();
4899 while (!done)
4900 my_cond.Wait(my_lock);
4901 my_lock.Unlock();
4902 }
4903 }
4904 }
4905 }
4906 }
4907 }
4908
4909 void PG::_repair_oinfo_oid(ScrubMap &smap)
4910 {
4911 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4912 i != smap.objects.rend();
4913 ++i) {
4914 const hobject_t &hoid = i->first;
4915 ScrubMap::object &o = i->second;
4916
4917 bufferlist bl;
4918 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4919 continue;
4920 }
4921 bl.push_back(o.attrs[OI_ATTR]);
4922 object_info_t oi;
4923 try {
4924 oi.decode(bl);
4925 } catch(...) {
4926 continue;
4927 }
4928 if (oi.soid != hoid) {
4929 ObjectStore::Transaction t;
4930 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4931 osd->clog->error() << "osd." << osd->whoami
4932 << " found object info error on pg "
4933 << info.pgid
4934 << " oid " << hoid << " oid in object info: "
4935 << oi.soid
4936 << "...repaired";
4937 // Fix object info
4938 oi.soid = hoid;
4939 bl.clear();
4940 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4941
4942 bufferptr bp(bl.c_str(), bl.length());
4943 o.attrs[OI_ATTR] = bp;
4944
4945 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4946 int r = osd->store->queue_transaction(ch, std::move(t));
4947 if (r != 0) {
4948 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4949 << dendl;
4950 }
4951 }
4952 }
4953 }
4954 int PG::build_scrub_map_chunk(
4955 ScrubMap &map,
4956 ScrubMapBuilder &pos,
4957 hobject_t start,
4958 hobject_t end,
4959 bool deep,
4960 ThreadPool::TPHandle &handle)
4961 {
4962 dout(10) << __func__ << " [" << start << "," << end << ") "
4963 << " pos " << pos
4964 << dendl;
4965
4966 // start
4967 while (pos.empty()) {
4968 pos.deep = deep;
4969 map.valid_through = info.last_update;
4970
4971 // objects
4972 vector<ghobject_t> rollback_obs;
4973 pos.ret = get_pgbackend()->objects_list_range(
4974 start,
4975 end,
4976 &pos.ls,
4977 &rollback_obs);
4978 if (pos.ret < 0) {
4979 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4980 return pos.ret;
4981 }
4982 if (pos.ls.empty()) {
4983 break;
4984 }
4985 _scan_rollback_obs(rollback_obs);
4986 pos.pos = 0;
4987 return -EINPROGRESS;
4988 }
4989
4990 // scan objects
4991 while (!pos.done()) {
4992 int r = get_pgbackend()->be_scan_list(map, pos);
4993 if (r == -EINPROGRESS) {
4994 return r;
4995 }
4996 }
4997
4998 // finish
4999 dout(20) << __func__ << " finishing" << dendl;
5000 ceph_assert(pos.done());
5001 _repair_oinfo_oid(map);
5002 if (!is_primary()) {
5003 ScrubMap for_meta_scrub;
5004 // In case we restarted smaller chunk, clear old data
5005 scrubber.cleaned_meta_map.clear_from(scrubber.start);
5006 scrubber.cleaned_meta_map.insert(map);
5007 scrubber.clean_meta_map(for_meta_scrub);
5008 _scan_snaps(for_meta_scrub);
5009 }
5010
5011 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
5012 << dendl;
5013 return 0;
5014 }
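
// A minimal sketch of how the incremental interface above is driven (not part
// of the build; the helper is hypothetical -- in the real scrubber
// chunky_scrub() requeues itself between calls instead of looping inline):
#if 0
static int example_drive_chunk(PG *pg, ScrubMap &map, ScrubMapBuilder &pos,
                               const hobject_t &start, const hobject_t &end,
                               bool deep, ThreadPool::TPHandle &handle)
{
  int r;
  do {
    // each call makes bounded progress and returns -EINPROGRESS until the
    // whole [start, end) range has been listed and scanned
    r = pg->build_scrub_map_chunk(map, pos, start, end, deep, handle);
  } while (r == -EINPROGRESS);
  return r;  // 0 on success, negative errno on listing failure
}
#endif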
5015
5016 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
5017 if (!store)
5018 return;
5019 struct OnComplete : Context {
5020 std::unique_ptr<Scrub::Store> store;
5021 explicit OnComplete(
5022 std::unique_ptr<Scrub::Store> &&store)
5023 : store(std::move(store)) {}
5024 void finish(int) override {}
5025 };
5026 store->cleanup(t);
5027 t->register_on_complete(new OnComplete(std::move(store)));
5028 ceph_assert(!store);
5029 }
5030
5031 void PG::repair_object(
5032 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
5033 pg_shard_t bad_peer)
5034 {
5035 list<pg_shard_t> op_shards;
5036 for (auto i : *ok_peers) {
5037 op_shards.push_back(i.second);
5038 }
5039 dout(10) << "repair_object " << soid << " bad_peer osd."
5040 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
5041 ScrubMap::object &po = ok_peers->back().first;
5042 eversion_t v;
5043 bufferlist bv;
5044 bv.push_back(po.attrs[OI_ATTR]);
5045 object_info_t oi;
5046 try {
5047 auto bliter = bv.cbegin();
5048 decode(oi, bliter);
5049 } catch (...) {
5050 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
5051 ceph_abort();
5052 }
5053 if (bad_peer != primary) {
5054 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
5055 } else {
5056 // We should only be scrubbing if the PG is clean.
5057 ceph_assert(waiting_for_unreadable_object.empty());
5058
5059 pg_log.missing_add(soid, oi.version, eversion_t());
5060
5061 pg_log.set_last_requested(0);
5062 dout(10) << __func__ << ": primary = " << primary << dendl;
5063 }
5064
5065 if (is_ec_pg() || bad_peer == primary) {
5066 // we'd better collect all shards for an EC pg, and prepare good peers as the
5067 // source of the pull in the case of a replicated pg.
5068 missing_loc.add_missing(soid, oi.version, eversion_t());
5069 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
5070 for (i = ok_peers->begin();
5071 i != ok_peers->end();
5072 ++i)
5073 missing_loc.add_location(soid, i->second);
5074 }
5075 }
5076
5077 /* replica_scrub
5078 *
5079 * Wait for last_update_applied to match msg->scrub_to as above. Wait
5080 * for pushes to complete in case of recent recovery. Build a single
5081 * scrubmap of objects that are in the range [msg->start, msg->end).
5082 */
5083 void PG::replica_scrub(
5084 OpRequestRef op,
5085 ThreadPool::TPHandle &handle)
5086 {
5087 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
5088 ceph_assert(!scrubber.active_rep_scrub);
5089 dout(7) << "replica_scrub" << dendl;
5090
5091 if (msg->map_epoch < info.history.same_interval_since) {
5092 dout(10) << "replica_scrub discarding old replica_scrub from "
5093 << msg->map_epoch << " < " << info.history.same_interval_since
5094 << dendl;
5095 return;
5096 }
5097
5098 ceph_assert(msg->chunky);
5099 if (active_pushes > 0) {
5100 dout(10) << "waiting for active pushes to finish" << dendl;
5101 scrubber.active_rep_scrub = op;
5102 return;
5103 }
5104
5105 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
5106 scrubber.replica_scrub_start = msg->min_epoch;
5107 scrubber.start = msg->start;
5108 scrubber.end = msg->end;
5109 scrubber.max_end = msg->end;
5110 scrubber.deep = msg->deep;
5111 scrubber.epoch_start = info.history.same_interval_since;
5112 if (msg->priority) {
5113 scrubber.priority = msg->priority;
5114 } else {
5115 scrubber.priority = get_scrub_priority();
5116 }
5117
5118 scrub_can_preempt = msg->allow_preemption;
5119 scrub_preempted = false;
5120 scrubber.replica_scrubmap_pos.reset();
5121
5122 requeue_scrub(msg->high_priority);
5123 }
5124
5125 /* Scrub:
5126 * PG_STATE_SCRUBBING is set when the scrub is queued
5127 *
5128 * scrub will be chunky if all OSDs in PG support chunky scrub
5129 * scrub will fail if OSDs are too old.
5130 */
5131 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
5132 {
5133 if (cct->_conf->osd_scrub_sleep > 0 &&
5134 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
5135 scrubber.state == PG::Scrubber::INACTIVE) &&
5136 scrubber.needs_sleep) {
5137 ceph_assert(!scrubber.sleeping);
5138 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
5139
5140 // Do an async sleep so we don't block the op queue
5141 OSDService *osds = osd;
5142 spg_t pgid = get_pgid();
5143 int state = scrubber.state;
5144 auto scrub_requeue_callback =
5145 new FunctionContext([osds, pgid, state](int r) {
5146 PGRef pg = osds->osd->lookup_lock_pg(pgid);
5147 if (pg == nullptr) {
5148 lgeneric_dout(osds->osd->cct, 20)
5149 << "scrub_requeue_callback: Could not find "
5150 << "PG " << pgid << " can't complete scrub requeue after sleep"
5151 << dendl;
5152 return;
5153 }
5154 pg->scrubber.sleeping = false;
5155 pg->scrubber.needs_sleep = false;
5156 lgeneric_dout(pg->cct, 20)
5157 << "scrub_requeue_callback: slept for "
5158 << ceph_clock_now() - pg->scrubber.sleep_start
5159 << ", re-queuing scrub with state " << state << dendl;
5160 pg->scrub_queued = false;
5161 pg->requeue_scrub();
5162 pg->scrubber.sleep_start = utime_t();
5163 pg->unlock();
5164 });
5165 std::lock_guard l(osd->sleep_lock);
5166 osd->sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
5167 scrub_requeue_callback);
5168 scrubber.sleeping = true;
5169 scrubber.sleep_start = ceph_clock_now();
5170 return;
5171 }
5172 if (pg_has_reset_since(queued)) {
5173 return;
5174 }
5175 ceph_assert(scrub_queued);
5176 scrub_queued = false;
5177 scrubber.needs_sleep = true;
5178
5179 // for the replica
5180 if (!is_primary() &&
5181 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
5182 chunky_scrub(handle);
5183 return;
5184 }
5185
5186 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
5187 dout(10) << "scrub -- not primary or active or not clean" << dendl;
5188 state_clear(PG_STATE_SCRUBBING);
5189 state_clear(PG_STATE_REPAIR);
5190 state_clear(PG_STATE_DEEP_SCRUB);
5191 publish_stats_to_osd();
5192 return;
5193 }
5194
5195 if (!scrubber.active) {
5196 ceph_assert(backfill_targets.empty());
5197
5198 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
5199
5200 dout(10) << "starting a new chunky scrub" << dendl;
5201 }
5202
5203 chunky_scrub(handle);
5204 }
5205
5206 /*
5207 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
5208 * chunk.
5209 *
5210 * The object store is partitioned into chunks which end on hash boundaries. For
5211 * each chunk, the following logic is performed:
5212 *
5213 * (1) Block writes on the chunk
5214 * (2) Request maps from replicas
5215 * (3) Wait for pushes to be applied (after recovery)
5216 * (4) Wait for writes to flush on the chunk
5217 * (5) Wait for maps from replicas
5218 * (6) Compare / repair all scrub maps
5219 * (7) Wait for digest updates to apply
5220 *
5221 * This logic is encoded in the mostly linear state machine:
5222 *
5223 * +------------------+
5224 * _________v__________ |
5225 * | | |
5226 * | INACTIVE | |
5227 * |____________________| |
5228 * | |
5229 * | +----------+ |
5230 * _________v___v______ | |
5231 * | | | |
5232 * | NEW_CHUNK | | |
5233 * |____________________| | |
5234 * | | |
5235 * _________v__________ | |
5236 * | | | |
5237 * | WAIT_PUSHES | | |
5238 * |____________________| | |
5239 * | | |
5240 * _________v__________ | |
5241 * | | | |
5242 * | WAIT_LAST_UPDATE | | |
5243 * |____________________| | |
5244 * | | |
5245 * _________v__________ | |
5246 * | | | |
5247 * | BUILD_MAP | | |
5248 * |____________________| | |
5249 * | | |
5250 * _________v__________ | |
5251 * | | | |
5252 * | WAIT_REPLICAS | | |
5253 * |____________________| | |
5254 * | | |
5255 * _________v__________ | |
5256 * | | | |
5257 * | COMPARE_MAPS | | |
5258 * |____________________| | |
5259 * | | |
5260 * | | |
5261 * _________v__________ | |
5262 * | | | |
5263 * |WAIT_DIGEST_UPDATES | | |
5264 * |____________________| | |
5265 * | | | |
5266 * | +----------+ |
5267 * _________v__________ |
5268 * | | |
5269 * | FINISH | |
5270 * |____________________| |
5271 * | |
5272 * +------------------+
5273 *
5274 * The primary determines the last update affecting the chunk (subset_last_update) by
5275 * walking the log. If it sees a log entry for an object in the chunk, it tells the replicas
5276 * to wait until that update is applied before building a scrub map. Both the
5277 * primary and replicas will wait for any active pushes to be applied.
5278 *
5279 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
5280 *
5281 * scrubber.state encodes the current state of the scrub (refer to state diagram
5282 * for details).
5283 */
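// Preemption (not shown in the diagram above): while a chunk is being built, a
// client write targeting the chunk may preempt it (see write_blocked_by_scrub).
// Each preemption doubles scrubber.preempt_divisor, so the retried chunk is
// roughly half as large; for example, with osd_scrub_chunk_min = 5 and
// osd_scrub_chunk_max = 25 (illustrative values), one preemption yields bounds
// max(3, 5/2) = 3 and max(3, 25/2) = 12. Once scrubber.preempt_left reaches
// zero, the remaining chunks are no longer preemptible.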
5284 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
5285 {
5286 // check for map changes
5287 if (scrubber.is_chunky_scrub_active()) {
5288 if (scrubber.epoch_start != info.history.same_interval_since) {
5289 dout(10) << "scrub pg changed, aborting" << dendl;
5290 scrub_clear_state();
5291 scrub_unreserve_replicas();
5292 return;
5293 }
5294 }
5295
5296 bool done = false;
5297 int ret;
5298
5299 while (!done) {
5300 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
5301 << " [" << scrubber.start << "," << scrubber.end << ")"
5302 << " max_end " << scrubber.max_end << dendl;
5303
5304 switch (scrubber.state) {
5305 case PG::Scrubber::INACTIVE:
5306 dout(10) << "scrub start" << dendl;
5307 ceph_assert(is_primary());
5308
5309 publish_stats_to_osd();
5310 scrubber.epoch_start = info.history.same_interval_since;
5311 scrubber.active = true;
5312
5313 osd->inc_scrubs_active(scrubber.reserved);
5314 if (scrubber.reserved) {
5315 scrubber.reserved = false;
5316 scrubber.reserved_peers.clear();
5317 }
5318
5319 {
5320 ObjectStore::Transaction t;
5321 scrubber.cleanup_store(&t);
5322 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
5323 info.pgid, coll));
5324 osd->store->queue_transaction(ch, std::move(t), nullptr);
5325 }
5326
5327 // Don't include temporary objects when scrubbing
5328 scrubber.start = info.pgid.pgid.get_hobj_start();
5329 scrubber.state = PG::Scrubber::NEW_CHUNK;
5330
5331 {
5332 bool repair = state_test(PG_STATE_REPAIR);
5333 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5334 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5335 stringstream oss;
5336 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
5337 osd->clog->debug(oss);
5338 }
5339
5340 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5341 "osd_scrub_max_preemptions");
5342 scrubber.preempt_divisor = 1;
5343 break;
5344
5345 case PG::Scrubber::NEW_CHUNK:
5346 scrubber.primary_scrubmap = ScrubMap();
5347 scrubber.received_maps.clear();
5348
5349 // begin (possible) preemption window
5350 if (scrub_preempted) {
5351 scrubber.preempt_left--;
5352 scrubber.preempt_divisor *= 2;
5353 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
5354 << " left" << dendl;
5355 scrub_preempted = false;
5356 }
5357 scrub_can_preempt = scrubber.preempt_left > 0;
5358
5359 {
5360 /* get the start and end of our scrub chunk
5361 *
5362 * Our scrub chunk has an important restriction we're going to need to
5363 * respect. We can't let head be start or end.
5364 * Using a half-open interval means that if end == head,
5365 * we'd scrub/lock head and the clone right next to head in different
5366 * chunks which would allow us to miss clones created between
5367 * scrubbing that chunk and scrubbing the chunk including head.
5368 * This isn't true for any of the other clones since clones can
5369 * only be created "just to the left of" head. There is one exception
5370 * to this: promotion of clones which always happens to the left of the
5371 * left-most clone, but promote_object checks the scrubber in that
5372 * case, so it should be ok. Also, it's ok to "miss" clones at the
5373 * left end of the range if we are a tier because they may legitimately
5374 * not exist (see _scrub).
5375 */
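// Roughly: if the listing stops right at some object's head, pull the end of
// the chunk back (to the preceding clone, or to the object boundary) so that
// the head and any clone created immediately to its left are scrubbed in the
// same chunk rather than split across two chunks.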
5376 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
5377 scrubber.preempt_divisor);
5378 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
5379 scrubber.preempt_divisor);
5380 hobject_t start = scrubber.start;
5381 hobject_t candidate_end;
5382 vector<hobject_t> objects;
5383 ret = get_pgbackend()->objects_list_partial(
5384 start,
5385 min,
5386 max,
5387 &objects,
5388 &candidate_end);
5389 ceph_assert(ret >= 0);
5390
5391 if (!objects.empty()) {
5392 hobject_t back = objects.back();
5393 while (candidate_end.is_head() &&
5394 candidate_end == back.get_head()) {
5395 candidate_end = back;
5396 objects.pop_back();
5397 if (objects.empty()) {
5398 ceph_assert(0 ==
5399 "Somehow we got more than 2 objects which"
5400 "have the same head but are not clones");
5401 }
5402 back = objects.back();
5403 }
5404 if (candidate_end.is_head()) {
5405 ceph_assert(candidate_end != back.get_head());
5406 candidate_end = candidate_end.get_object_boundary();
5407 }
5408 } else {
5409 ceph_assert(candidate_end.is_max());
5410 }
5411
5412 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
5413 // we'll be requeued by whatever made us unavailable for scrub
5414 dout(10) << __func__ << ": scrub blocked somewhere in range "
5415 << "[" << scrubber.start << ", " << candidate_end << ")"
5416 << dendl;
5417 done = true;
5418 break;
5419 }
5420 scrubber.end = candidate_end;
5421 if (scrubber.end > scrubber.max_end)
5422 scrubber.max_end = scrubber.end;
5423 }
5424
5425 // walk the log to find the latest update that affects our chunk
5426 scrubber.subset_last_update = eversion_t();
5427 for (auto p = projected_log.log.rbegin();
5428 p != projected_log.log.rend();
5429 ++p) {
5430 if (p->soid >= scrubber.start &&
5431 p->soid < scrubber.end) {
5432 scrubber.subset_last_update = p->version;
5433 break;
5434 }
5435 }
5436 if (scrubber.subset_last_update == eversion_t()) {
5437 for (list<pg_log_entry_t>::const_reverse_iterator p =
5438 pg_log.get_log().log.rbegin();
5439 p != pg_log.get_log().log.rend();
5440 ++p) {
5441 if (p->soid >= scrubber.start &&
5442 p->soid < scrubber.end) {
5443 scrubber.subset_last_update = p->version;
5444 break;
5445 }
5446 }
5447 }
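// If neither the projected log nor the pg log touches this chunk,
// subset_last_update stays at eversion_t() and WAIT_LAST_UPDATE passes
// immediately.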
5448
5449 scrubber.state = PG::Scrubber::WAIT_PUSHES;
5450 break;
5451
5452 case PG::Scrubber::WAIT_PUSHES:
5453 if (active_pushes == 0) {
5454 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
5455 } else {
5456 dout(15) << "wait for pushes to apply" << dendl;
5457 done = true;
5458 }
5459 break;
5460
5461 case PG::Scrubber::WAIT_LAST_UPDATE:
5462 if (last_update_applied < scrubber.subset_last_update) {
5463 // will be requeued by op_applied
5464 dout(15) << "wait for EC read/modify/writes to queue" << dendl;
5465 done = true;
5466 break;
5467 }
5468
5469 // ask replicas to scan
5470 scrubber.waiting_on_whom.insert(pg_whoami);
5471
5472 // request maps from replicas
5473 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5474 i != acting_recovery_backfill.end();
5475 ++i) {
5476 if (*i == pg_whoami) continue;
5477 _request_scrub_map(*i, scrubber.subset_last_update,
5478 scrubber.start, scrubber.end, scrubber.deep,
5479 scrubber.preempt_left > 0);
5480 scrubber.waiting_on_whom.insert(*i);
5481 }
5482 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
5483 << dendl;
5484
5485 scrubber.state = PG::Scrubber::BUILD_MAP;
5486 scrubber.primary_scrubmap_pos.reset();
5487 break;
5488
5489 case PG::Scrubber::BUILD_MAP:
5490 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5491
5492 // build my own scrub map
5493 if (scrub_preempted) {
5494 dout(10) << __func__ << " preempted" << dendl;
5495 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5496 break;
5497 }
5498 ret = build_scrub_map_chunk(
5499 scrubber.primary_scrubmap,
5500 scrubber.primary_scrubmap_pos,
5501 scrubber.start, scrubber.end,
5502 scrubber.deep,
5503 handle);
5504 if (ret == -EINPROGRESS) {
5505 requeue_scrub();
5506 done = true;
5507 break;
5508 }
5509 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5510 break;
5511
5512 case PG::Scrubber::BUILD_MAP_DONE:
5513 if (scrubber.primary_scrubmap_pos.ret < 0) {
5514 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5515 << ", aborting" << dendl;
5516 scrub_clear_state();
5517 scrub_unreserve_replicas();
5518 return;
5519 }
5520 dout(10) << __func__ << " waiting_on_whom was "
5521 << scrubber.waiting_on_whom << dendl;
5522 ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
5523 scrubber.waiting_on_whom.erase(pg_whoami);
5524
5525 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5526 break;
5527
5528 case PG::Scrubber::WAIT_REPLICAS:
5529 if (!scrubber.waiting_on_whom.empty()) {
5530 // will be requeued by sub_op_scrub_map
5531 dout(10) << "wait for replicas to build scrub map" << dendl;
5532 done = true;
5533 break;
5534 }
5535 // end (possible) preemption window
5536 scrub_can_preempt = false;
5537 if (scrub_preempted) {
5538 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5539 scrubber.state = PG::Scrubber::NEW_CHUNK;
5540 } else {
5541 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5542 }
5543 break;
5544
5545 case PG::Scrubber::COMPARE_MAPS:
5546 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5547 ceph_assert(scrubber.waiting_on_whom.empty());
5548
5549 scrub_compare_maps();
5550 scrubber.start = scrubber.end;
5551 scrubber.run_callbacks();
5552
5553 // requeue the writes from the chunk that just finished
5554 requeue_ops(waiting_for_scrub);
5555
5556 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5557
5558 // fall-thru
5559
5560 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5561 if (scrubber.num_digest_updates_pending) {
5562 dout(10) << __func__ << " waiting on "
5563 << scrubber.num_digest_updates_pending
5564 << " digest updates" << dendl;
5565 done = true;
5566 break;
5567 }
5568
5569 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5570 "osd_scrub_max_preemptions");
5571 scrubber.preempt_divisor = 1;
5572
5573 if (!(scrubber.end.is_max())) {
5574 scrubber.state = PG::Scrubber::NEW_CHUNK;
5575 requeue_scrub();
5576 done = true;
5577 } else {
5578 scrubber.state = PG::Scrubber::FINISH;
5579 }
5580
5581 break;
5582
5583 case PG::Scrubber::FINISH:
5584 scrub_finish();
5585 scrubber.state = PG::Scrubber::INACTIVE;
5586 done = true;
5587
5588 if (!snap_trimq.empty()) {
5589 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5590 snap_trimmer_scrub_complete();
5591 }
5592
5593 break;
5594
5595 case PG::Scrubber::BUILD_MAP_REPLICA:
5596 // build my own scrub map
5597 if (scrub_preempted) {
5598 dout(10) << __func__ << " preempted" << dendl;
5599 ret = 0;
5600 } else {
5601 ret = build_scrub_map_chunk(
5602 scrubber.replica_scrubmap,
5603 scrubber.replica_scrubmap_pos,
5604 scrubber.start, scrubber.end,
5605 scrubber.deep,
5606 handle);
5607 }
5608 if (ret == -EINPROGRESS) {
5609 requeue_scrub();
5610 done = true;
5611 break;
5612 }
5613 // reply
5614 {
5615 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5616 spg_t(info.pgid.pgid, get_primary().shard),
5617 scrubber.replica_scrub_start,
5618 pg_whoami);
5619 reply->preempted = scrub_preempted;
5620 ::encode(scrubber.replica_scrubmap, reply->get_data());
5621 osd->send_message_osd_cluster(
5622 get_primary().osd, reply,
5623 scrubber.replica_scrub_start);
5624 }
5625 scrub_preempted = false;
5626 scrub_can_preempt = false;
5627 scrubber.state = PG::Scrubber::INACTIVE;
5628 scrubber.replica_scrubmap = ScrubMap();
5629 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5630 scrubber.start = hobject_t();
5631 scrubber.end = hobject_t();
5632 scrubber.max_end = hobject_t();
5633 done = true;
5634 break;
5635
5636 default:
5637 ceph_abort();
5638 }
5639 }
5640 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5641 << " [" << scrubber.start << "," << scrubber.end << ")"
5642 << " max_end " << scrubber.max_end << dendl;
5643 }
5644
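// Returns true if a write to soid must wait for the current scrub chunk.
// If preemption is still allowed, mark the scrub preempted instead and let
// the write proceed.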
5645 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5646 {
5647 if (soid < scrubber.start || soid >= scrubber.end) {
5648 return false;
5649 }
5650 if (scrub_can_preempt) {
5651 if (!scrub_preempted) {
5652 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5653 scrub_preempted = true;
5654 } else {
5655 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5656 }
5657 return false;
5658 }
5659 return true;
5660 }
5661
5662 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5663 {
5664 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5665 return (start < scrubber.max_end &&
5666 end >= scrubber.start);
5667 }
5668
5669 void PG::scrub_clear_state(bool has_error)
5670 {
5671 ceph_assert(is_locked());
5672 state_clear(PG_STATE_SCRUBBING);
5673 if (!has_error)
5674 state_clear(PG_STATE_REPAIR);
5675 state_clear(PG_STATE_DEEP_SCRUB);
5676 publish_stats_to_osd();
5677
5678 // active -> nothing.
5679 if (scrubber.active)
5680 osd->dec_scrubs_active();
5681
5682 requeue_ops(waiting_for_scrub);
5683
5684 scrubber.reset();
5685
5686 // type-specific state clear
5687 _scrub_clear_state();
5688 }
5689
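// Merge the primary's map into cleaned_meta_map, gather the replica maps,
// build the union of all scanned objects, run the omap checks, and (when more
// than one OSD is acting) compare the maps to pick authoritative copies before
// doing the type-specific snapshot-metadata scrub and flushing any
// inconsistency records to the scrub store.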
5690 void PG::scrub_compare_maps()
5691 {
5692 dout(10) << __func__ << " has maps, analyzing" << dendl;
5693
5694 // construct authoritative scrub map for type specific scrubbing
5695 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5696 map<hobject_t,
5697 pair<boost::optional<uint32_t>,
5698 boost::optional<uint32_t>>> missing_digest;
5699
5700 map<pg_shard_t, ScrubMap *> maps;
5701 maps[pg_whoami] = &scrubber.primary_scrubmap;
5702
5703 for (const auto& i : acting_recovery_backfill) {
5704 if (i == pg_whoami) continue;
5705 dout(2) << __func__ << " replica " << i << " has "
5706 << scrubber.received_maps[i].objects.size()
5707 << " items" << dendl;
5708 maps[i] = &scrubber.received_maps[i];
5709 }
5710
5711 set<hobject_t> master_set;
5712
5713 // Construct master set
5714 for (const auto& map : maps) {
5715 for (const auto& i : map.second->objects) {
5716 master_set.insert(i.first);
5717 }
5718 }
5719
5720 stringstream ss;
5721 get_pgbackend()->be_omap_checks(maps, master_set,
5722 scrubber.omap_stats, ss);
5723
5724 if (!ss.str().empty()) {
5725 osd->clog->warn(ss);
5726 }
5727
5728 if (acting.size() > 1) {
5729 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5730
5731 // Map from object with errors to good peer
5732 map<hobject_t, list<pg_shard_t>> authoritative;
5733
5734 dout(2) << __func__ << " osd." << acting[0] << " has "
5735 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5736
5737 ss.str("");
5738 ss.clear();
5739
5740 get_pgbackend()->be_compare_scrubmaps(
5741 maps,
5742 master_set,
5743 state_test(PG_STATE_REPAIR),
5744 scrubber.missing,
5745 scrubber.inconsistent,
5746 authoritative,
5747 missing_digest,
5748 scrubber.shallow_errors,
5749 scrubber.deep_errors,
5750 scrubber.store.get(),
5751 info.pgid, acting,
5752 ss);
5753 dout(2) << ss.str() << dendl;
5754
5755 if (!ss.str().empty()) {
5756 osd->clog->error(ss);
5757 }
5758
5759 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5760 i != authoritative.end();
5761 ++i) {
5762 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5763 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5764 j != i->second.end();
5765 ++j) {
5766 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5767 }
5768 scrubber.authoritative.insert(
5769 make_pair(
5770 i->first,
5771 good_peers));
5772 }
5773
5774 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5775 i != authoritative.end();
5776 ++i) {
5777 scrubber.cleaned_meta_map.objects.erase(i->first);
5778 scrubber.cleaned_meta_map.objects.insert(
5779 *(maps[i->second.back()]->objects.find(i->first))
5780 );
5781 }
5782 }
5783
5784 ScrubMap for_meta_scrub;
5785 scrubber.clean_meta_map(for_meta_scrub);
5786
5787 // ok, do the pg-type specific scrubbing
5788 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5789 // Called here on the primary; a caller that isn't the primary can use an authoritative map
5790 _scan_snaps(for_meta_scrub);
5791 if (!scrubber.store->empty()) {
5792 if (state_test(PG_STATE_REPAIR)) {
5793 dout(10) << __func__ << ": discarding scrub results" << dendl;
5794 scrubber.store->flush(nullptr);
5795 } else {
5796 dout(10) << __func__ << ": updating scrub object" << dendl;
5797 ObjectStore::Transaction t;
5798 scrubber.store->flush(&t);
5799 osd->store->queue_transaction(ch, std::move(t), nullptr);
5800 }
5801 }
5802 }
5803
5804 bool PG::scrub_process_inconsistent()
5805 {
5806 dout(10) << __func__ << ": checking authoritative" << dendl;
5807 bool repair = state_test(PG_STATE_REPAIR);
5808 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5809 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5810
5811 // scrubber.authoritative only stores objects which are missing or inconsistent.
5812 if (!scrubber.authoritative.empty()) {
5813 stringstream ss;
5814 ss << info.pgid << " " << mode << " "
5815 << scrubber.missing.size() << " missing, "
5816 << scrubber.inconsistent.size() << " inconsistent objects";
5817 dout(2) << ss.str() << dendl;
5818 osd->clog->error(ss);
5819 if (repair) {
5820 state_clear(PG_STATE_CLEAN);
5821 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5822 scrubber.authoritative.begin();
5823 i != scrubber.authoritative.end();
5824 ++i) {
5825 set<pg_shard_t>::iterator j;
5826
5827 auto missing_entry = scrubber.missing.find(i->first);
5828 if (missing_entry != scrubber.missing.end()) {
5829 for (j = missing_entry->second.begin();
5830 j != missing_entry->second.end();
5831 ++j) {
5832 repair_object(
5833 i->first,
5834 &(i->second),
5835 *j);
5836 ++scrubber.fixed;
5837 }
5838 }
5839 if (scrubber.inconsistent.count(i->first)) {
5840 for (j = scrubber.inconsistent[i->first].begin();
5841 j != scrubber.inconsistent[i->first].end();
5842 ++j) {
5843 repair_object(i->first,
5844 &(i->second),
5845 *j);
5846 ++scrubber.fixed;
5847 }
5848 }
5849 }
5850 }
5851 }
5852 return (!scrubber.authoritative.empty() && repair);
5853 }
5854
5855 bool PG::ops_blocked_by_scrub() const {
5856 return !waiting_for_scrub.empty();
5857 }
5858
5859 // the part that actually finalizes a scrub
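// It cancels auto-repair when the error count exceeds the configured limit,
// optionally schedules an automatic deep scrub after shallow-scrub errors,
// repairs inconsistent objects, logs the result, updates the scrub stamps and
// error counters, persists the info, queues recovery if anything was repaired,
// clears scrub state and shares the updated pg_info with the replicas.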
5860 void PG::scrub_finish()
5861 {
5862 dout(20) << __func__ << dendl;
5863 bool repair = state_test(PG_STATE_REPAIR);
5864 bool do_auto_scrub = false;
5865 // if the repair request comes from auto-repair and there is a large number of
5866 // errors, we would like to cancel the auto-repair
5867 if (repair && scrubber.auto_repair
5868 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5869 state_clear(PG_STATE_REPAIR);
5870 repair = false;
5871 }
5872 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5873 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5874
5875 // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
5876 if (scrubber.deep_scrub_on_error
5877 && scrubber.authoritative.size()
5878 && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
5879 ceph_assert(!deep_scrub);
5880 do_auto_scrub = true;
5881 dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
5882 }
5883 scrubber.deep_scrub_on_error = false;
5884
5885 // type-specific finish (can tally more errors)
5886 _scrub_finish();
5887
5888 bool has_error = scrub_process_inconsistent();
5889
5890 {
5891 stringstream oss;
5892 oss << info.pgid.pgid << " " << mode << " ";
5893 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5894 if (total_errors)
5895 oss << total_errors << " errors";
5896 else
5897 oss << "ok";
5898 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5899 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5900 << " remaining deep scrub error details lost)";
5901 if (repair)
5902 oss << ", " << scrubber.fixed << " fixed";
5903 if (total_errors)
5904 osd->clog->error(oss);
5905 else
5906 osd->clog->debug(oss);
5907 }
5908
5909 // finish up
5910 unreg_next_scrub();
5911 utime_t now = ceph_clock_now();
5912 info.history.last_scrub = info.last_update;
5913 info.history.last_scrub_stamp = now;
5914 if (scrubber.deep) {
5915 info.history.last_deep_scrub = info.last_update;
5916 info.history.last_deep_scrub_stamp = now;
5917 }
5918 // Since we don't know which errors were fixed, we can only clear them
5919 // when every one has been fixed.
5920 if (repair) {
5921 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5922 ceph_assert(deep_scrub);
5923 scrubber.shallow_errors = scrubber.deep_errors = 0;
5924 dout(20) << __func__ << " All may be fixed" << dendl;
5925 } else if (has_error) {
5926 // Deep scrub in order to get corrected error counts
5927 scrub_after_recovery = true;
5928 dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
5929 } else if (scrubber.shallow_errors || scrubber.deep_errors) {
5930 // We have errors but nothing can be fixed, so there is no repair
5931 // possible.
5932 state_set(PG_STATE_FAILED_REPAIR);
5933 dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
5934 << " error(s) present with no repair possible" << dendl;
5935 }
5936 }
5937 if (deep_scrub) {
5938 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5939 info.history.last_clean_scrub_stamp = now;
5940 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5941 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5942 info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
5943 info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
5944 info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
5945 dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = "
5946 << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = "
5947 << info.stats.stats.sum.num_omap_keys << dendl;
5948 } else {
5949 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5950 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5951 // because of deep-scrub errors
5952 if (scrubber.shallow_errors == 0)
5953 info.history.last_clean_scrub_stamp = now;
5954 }
5955 info.stats.stats.sum.num_scrub_errors =
5956 info.stats.stats.sum.num_shallow_scrub_errors +
5957 info.stats.stats.sum.num_deep_scrub_errors;
5958 if (scrubber.check_repair) {
5959 scrubber.check_repair = false;
5960 if (info.stats.stats.sum.num_scrub_errors) {
5961 state_set(PG_STATE_FAILED_REPAIR);
5962 dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors
5963 << " error(s) still present after re-scrub" << dendl;
5964 }
5965 }
5966 publish_stats_to_osd();
5967
5968 {
5969 ObjectStore::Transaction t;
5970 dirty_info = true;
5971 write_if_dirty(t);
5972 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
5973 ceph_assert(tr == 0);
5974 }
5975
5976
5977 if (has_error) {
5978 queue_peering_event(
5979 PGPeeringEventRef(
5980 std::make_shared<PGPeeringEvent>(
5981 get_osdmap_epoch(),
5982 get_osdmap_epoch(),
5983 DoRecovery())));
5984 }
5985
5986 scrub_clear_state(has_error);
5987 scrub_unreserve_replicas();
5988
5989 if (do_auto_scrub) {
5990 scrub_requested(false, false, true);
5991 } else {
5992 reg_next_scrub();
5993 }
5994
5995 if (is_active() && is_primary()) {
5996 share_pg_info();
5997 }
5998 }
5999
6000 void PG::share_pg_info()
6001 {
6002 dout(10) << "share_pg_info" << dendl;
6003
6004 // share new pg_info_t with replicas
6005 ceph_assert(!acting_recovery_backfill.empty());
6006 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
6007 i != acting_recovery_backfill.end();
6008 ++i) {
6009 if (*i == pg_whoami) continue;
6010 auto pg_shard = *i;
6011 auto peer = peer_info.find(pg_shard);
6012 if (peer != peer_info.end()) {
6013 peer->second.last_epoch_started = info.last_epoch_started;
6014 peer->second.last_interval_started = info.last_interval_started;
6015 peer->second.history.merge(info.history);
6016 }
6017 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap_epoch());
6018 m->pg_list.push_back(
6019 make_pair(
6020 pg_notify_t(
6021 pg_shard.shard, pg_whoami.shard,
6022 get_osdmap_epoch(),
6023 get_osdmap_epoch(),
6024 info),
6025 past_intervals));
6026 osd->send_message_osd_cluster(pg_shard.osd, m, get_osdmap_epoch());
6027 }
6028 }
6029
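// Append the given log entries to our log, update the missing set accordingly,
// optionally roll forward and trim, advance last_update (and last_complete
// when nothing is missing), and report whether the stats were invalidated.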
6030 bool PG::append_log_entries_update_missing(
6031 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6032 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
6033 boost::optional<eversion_t> roll_forward_to)
6034 {
6035 ceph_assert(!entries.empty());
6036 ceph_assert(entries.begin()->version > info.last_update);
6037
6038 PGLogEntryHandler rollbacker{this, &t};
6039 bool invalidate_stats =
6040 pg_log.append_new_log_entries(info.last_backfill,
6041 info.last_backfill_bitwise,
6042 entries,
6043 &rollbacker);
6044
6045 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
6046 pg_log.roll_forward(&rollbacker);
6047 }
6048 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
6049 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
6050 last_rollback_info_trimmed_to_applied = *roll_forward_to;
6051 }
6052
6053 info.last_update = pg_log.get_head();
6054
6055 if (pg_log.get_missing().num_missing() == 0) {
6056 // advance last_complete since nothing else is missing!
6057 info.last_complete = info.last_update;
6058 }
6059 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
6060
6061 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
6062 if (trim_to)
6063 pg_log.trim(*trim_to, info);
6064 dirty_info = true;
6065 write_if_dirty(t);
6066 return invalidate_stats;
6067 }
6068
6069
6070 void PG::merge_new_log_entries(
6071 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6072 ObjectStore::Transaction &t,
6073 boost::optional<eversion_t> trim_to,
6074 boost::optional<eversion_t> roll_forward_to)
6075 {
6076 dout(10) << __func__ << " " << entries << dendl;
6077 ceph_assert(is_primary());
6078
6079 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
6080 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
6081 i != acting_recovery_backfill.end();
6082 ++i) {
6083 pg_shard_t peer(*i);
6084 if (peer == pg_whoami) continue;
6085 ceph_assert(peer_missing.count(peer));
6086 ceph_assert(peer_info.count(peer));
6087 pg_missing_t& pmissing(peer_missing[peer]);
6088 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
6089 pg_info_t& pinfo(peer_info[peer]);
6090 bool invalidate_stats = PGLog::append_log_entries_update_missing(
6091 pinfo.last_backfill,
6092 info.last_backfill_bitwise,
6093 entries,
6094 true,
6095 NULL,
6096 pmissing,
6097 NULL,
6098 this);
6099 pinfo.last_update = info.last_update;
6100 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
6101 rebuild_missing = rebuild_missing || invalidate_stats;
6102 }
6103
6104 if (!rebuild_missing) {
6105 return;
6106 }
6107
6108 for (auto &&i: entries) {
6109 missing_loc.rebuild(
6110 i.soid,
6111 pg_whoami,
6112 acting_recovery_backfill,
6113 info,
6114 pg_log.get_missing(),
6115 peer_missing,
6116 peer_info);
6117 }
6118 }
6119
6120 void PG::update_history(const pg_history_t& new_history)
6121 {
6122 if (info.history.merge(new_history)) {
6123 dout(20) << __func__ << " advanced history from " << new_history << dendl;
6124 dirty_info = true;
6125 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
6126 dout(20) << __func__ << " clearing past_intervals" << dendl;
6127 past_intervals.clear();
6128 dirty_big_info = true;
6129 }
6130 }
6131 on_info_history_change();
6132 }
6133
6134 void PG::fulfill_info(
6135 pg_shard_t from, const pg_query_t &query,
6136 pair<pg_shard_t, pg_info_t> &notify_info)
6137 {
6138 ceph_assert(from == primary);
6139 ceph_assert(query.type == pg_query_t::INFO);
6140
6141 // info
6142 dout(10) << "sending info" << dendl;
6143 notify_info = make_pair(from, info);
6144 }
6145
6146 void PG::fulfill_log(
6147 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
6148 {
6149 dout(10) << "log request from " << from << dendl;
6150 ceph_assert(from == primary);
6151 ceph_assert(query.type != pg_query_t::INFO);
6152 ConnectionRef con = osd->get_con_osd_cluster(
6153 from.osd, get_osdmap_epoch());
6154 if (!con) return;
6155
6156 MOSDPGLog *mlog = new MOSDPGLog(
6157 from.shard, pg_whoami.shard,
6158 get_osdmap_epoch(),
6159 info, query_epoch);
6160 mlog->missing = pg_log.get_missing();
6161
6162 // primary -> other, when building master log
6163 if (query.type == pg_query_t::LOG) {
6164 dout(10) << " sending info+missing+log since " << query.since
6165 << dendl;
6166 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
6167 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
6168 << " when my log.tail is " << pg_log.get_tail()
6169 << ", sending full log instead";
6170 mlog->log = pg_log.get_log(); // primary should not have requested this!!
6171 } else
6172 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
6173 }
6174 else if (query.type == pg_query_t::FULLLOG) {
6175 dout(10) << " sending info+missing+full log" << dendl;
6176 mlog->log = pg_log.get_log();
6177 }
6178
6179 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
6180
6181 osd->share_map_peer(from.osd, con.get(), get_osdmap());
6182 osd->send_message_osd_cluster(mlog, con.get());
6183 }
6184
6185 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
6186 {
6187 if (query.query.type == pg_query_t::INFO) {
6188 pair<pg_shard_t, pg_info_t> notify_info;
6189 update_history(query.query.history);
6190 fulfill_info(query.from, query.query, notify_info);
6191 rctx->send_notify(
6192 notify_info.first,
6193 pg_notify_t(
6194 notify_info.first.shard, pg_whoami.shard,
6195 query.query_epoch,
6196 get_osdmap_epoch(),
6197 notify_info.second),
6198 past_intervals);
6199 } else {
6200 update_history(query.query.history);
6201 fulfill_log(query.from, query.query, query.query_epoch);
6202 }
6203 }
6204
6205 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
6206 {
6207 bool changed = false;
6208 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
6209 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
6210 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
6211 changed = true;
6212 }
6213 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6214 if (!pi) {
6215 return; // pool deleted
6216 }
6217 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
6218 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
6219 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
6220 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
6221 changed = true;
6222 }
6223 }
6224 if (changed) {
6225 info.history.last_epoch_marked_full = osdmap->get_epoch();
6226 dirty_info = true;
6227 }
6228 }
6229
6230 bool PG::should_restart_peering(
6231 int newupprimary,
6232 int newactingprimary,
6233 const vector<int>& newup,
6234 const vector<int>& newacting,
6235 OSDMapRef lastmap,
6236 OSDMapRef osdmap)
6237 {
6238 if (PastIntervals::is_new_interval(
6239 primary.osd,
6240 newactingprimary,
6241 acting,
6242 newacting,
6243 up_primary.osd,
6244 newupprimary,
6245 up,
6246 newup,
6247 osdmap,
6248 lastmap,
6249 info.pgid.pgid)) {
6250 dout(20) << "new interval newup " << newup
6251 << " newacting " << newacting << dendl;
6252 return true;
6253 }
6254 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
6255 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
6256 return true;
6257 }
6258 return false;
6259 }
6260
6261 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
6262 {
6263 if (last_peering_reset > reply_epoch ||
6264 last_peering_reset > query_epoch) {
6265 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
6266 << " last_peering_reset " << last_peering_reset
6267 << dendl;
6268 return true;
6269 }
6270 return false;
6271 }
6272
6273 void PG::set_last_peering_reset()
6274 {
6275 dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
6276 if (last_peering_reset != get_osdmap_epoch()) {
6277 last_peering_reset = get_osdmap_epoch();
6278 reset_interval_flush();
6279 }
6280 }
6281
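// FlushState is registered below as both an on_applied and an on_commit
// context; when the last shared reference goes away, the destructor re-takes
// the PG lock and calls on_flushed(), unless the PG has been reset since the
// flush was started.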
6282 struct FlushState {
6283 PGRef pg;
6284 epoch_t epoch;
6285 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
6286 ~FlushState() {
6287 pg->lock();
6288 if (!pg->pg_has_reset_since(epoch))
6289 pg->on_flushed();
6290 pg->unlock();
6291 }
6292 };
6293 typedef std::shared_ptr<FlushState> FlushStateRef;
6294
6295 void PG::start_flush(ObjectStore::Transaction *t)
6296 {
6297 // flush in progress ops
6298 FlushStateRef flush_trigger (std::make_shared<FlushState>(
6299 this, get_osdmap_epoch()));
6300 flushes_in_progress++;
6301 t->register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
6302 t->register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
6303 }
6304
6305 void PG::reset_interval_flush()
6306 {
6307 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
6308 recovery_state.clear_blocked_outgoing();
6309
6310 Context *c = new QueuePeeringEvt<IntervalFlush>(
6311 this, get_osdmap_epoch(), IntervalFlush());
6312 if (!ch->flush_commit(c)) {
6313 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
6314 recovery_state.begin_block_outgoing();
6315 } else {
6316 dout(10) << "Not blocking outgoing recovery messages" << dendl;
6317 delete c;
6318 }
6319 }
6320
6321 /* Called before initializing peering during advance_map */
6322 void PG::start_peering_interval(
6323 const OSDMapRef lastmap,
6324 const vector<int>& newup, int new_up_primary,
6325 const vector<int>& newacting, int new_acting_primary,
6326 ObjectStore::Transaction *t)
6327 {
6328 const OSDMapRef osdmap = get_osdmap();
6329
6330 set_last_peering_reset();
6331
6332 vector<int> oldacting, oldup;
6333 int oldrole = get_role();
6334
6335 if (is_primary()) {
6336 osd->clear_ready_to_merge(this);
6337 }
6338
6339 pg_shard_t old_acting_primary = get_primary();
6340 pg_shard_t old_up_primary = up_primary;
6341 bool was_old_primary = is_primary();
6342 bool was_old_replica = is_replica();
6343
6344 acting.swap(oldacting);
6345 up.swap(oldup);
6346 init_primary_up_acting(
6347 newup,
6348 newacting,
6349 new_up_primary,
6350 new_acting_primary);
6351
6352 if (info.stats.up != up ||
6353 info.stats.acting != acting ||
6354 info.stats.up_primary != new_up_primary ||
6355 info.stats.acting_primary != new_acting_primary) {
6356 info.stats.up = up;
6357 info.stats.up_primary = new_up_primary;
6358 info.stats.acting = acting;
6359 info.stats.acting_primary = new_acting_primary;
6360 info.stats.mapping_epoch = osdmap->get_epoch();
6361 }
6362
6363 pg_stats_publish_lock.Lock();
6364 pg_stats_publish_valid = false;
6365 pg_stats_publish_lock.Unlock();
6366
6367 // PG_STATE_REMAPPED will now be set during a backfill in cases
6368 // where it would not have been before.
6369 if (up != acting)
6370 state_set(PG_STATE_REMAPPED);
6371 else
6372 state_clear(PG_STATE_REMAPPED);
6373
6374 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
6375 if (pool.info.is_replicated() || role == pg_whoami.shard)
6376 set_role(role);
6377 else
6378 set_role(-1);
6379
6380 // did acting, up, primary|acker change?
6381 if (!lastmap) {
6382 dout(10) << " no lastmap" << dendl;
6383 dirty_info = true;
6384 dirty_big_info = true;
6385 info.history.same_interval_since = osdmap->get_epoch();
6386 } else {
6387 std::stringstream debug;
6388 ceph_assert(info.history.same_interval_since != 0);
6389 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
6390 get_is_recoverable_predicate());
6391 bool new_interval = PastIntervals::check_new_interval(
6392 old_acting_primary.osd,
6393 new_acting_primary,
6394 oldacting, newacting,
6395 old_up_primary.osd,
6396 new_up_primary,
6397 oldup, newup,
6398 info.history.same_interval_since,
6399 info.history.last_epoch_clean,
6400 osdmap,
6401 lastmap,
6402 info.pgid.pgid,
6403 recoverable.get(),
6404 &past_intervals,
6405 &debug);
6406 dout(10) << __func__ << ": check_new_interval output: "
6407 << debug.str() << dendl;
6408 if (new_interval) {
6409 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
6410 info.history.last_epoch_clean < osdmap->get_epoch()) {
6411 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
6412 // our information is incomplete and useless; if osdmaps were trimmed,
6413 // someone else was clean after everything we know about.
6414 past_intervals.clear();
6415 } else {
6416 dout(10) << " noting past " << past_intervals << dendl;
6417 }
6418 dirty_info = true;
6419 dirty_big_info = true;
6420 info.history.same_interval_since = osdmap->get_epoch();
6421 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
6422 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
6423 osdmap->get_pg_num(info.pgid.pgid.pool()),
6424 nullptr)) {
6425 info.history.last_epoch_split = osdmap->get_epoch();
6426 }
6427 }
6428 }
6429
6430 if (old_up_primary != up_primary ||
6431 oldup != up) {
6432 info.history.same_up_since = osdmap->get_epoch();
6433 }
6434 // this comparison includes primary rank via pg_shard_t
6435 if (old_acting_primary != get_primary()) {
6436 info.history.same_primary_since = osdmap->get_epoch();
6437 }
6438
6439 on_new_interval();
6440
6441 dout(1) << __func__ << " up " << oldup << " -> " << up
6442 << ", acting " << oldacting << " -> " << acting
6443 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
6444 << ", up_primary " << old_up_primary << " -> " << new_up_primary
6445 << ", role " << oldrole << " -> " << role
6446 << ", features acting " << acting_features
6447 << " upacting " << upacting_features
6448 << dendl;
6449
6450 // deactivate.
6451 state_clear(PG_STATE_ACTIVE);
6452 state_clear(PG_STATE_PEERED);
6453 state_clear(PG_STATE_PREMERGE);
6454 state_clear(PG_STATE_DOWN);
6455 state_clear(PG_STATE_RECOVERY_WAIT);
6456 state_clear(PG_STATE_RECOVERY_TOOFULL);
6457 state_clear(PG_STATE_RECOVERING);
6458
6459 peer_purged.clear();
6460 acting_recovery_backfill.clear();
6461 scrub_queued = false;
6462
6463 // reset primary/replica state?
6464 if (was_old_primary || is_primary()) {
6465 osd->remove_want_pg_temp(info.pgid.pgid);
6466 } else if (was_old_replica || is_replica()) {
6467 osd->remove_want_pg_temp(info.pgid.pgid);
6468 }
6469 clear_primary_state();
6470
6471
6472 // pg->on_*
6473 on_change(t);
6474
6475 projected_last_update = eversion_t();
6476
6477 ceph_assert(!deleting);
6478
6479 // should we tell the primary we are here?
6480 send_notify = !is_primary();
6481
6482 if (role != oldrole ||
6483 was_old_primary != is_primary()) {
6484 // did primary change?
6485 if (was_old_primary != is_primary()) {
6486 state_clear(PG_STATE_CLEAN);
6487 clear_publish_stats();
6488 }
6489
6490 on_role_change();
6491
6492 // take active waiters
6493 requeue_ops(waiting_for_peered);
6494
6495 } else {
6496 // no role change.
6497 // did primary change?
6498 if (get_primary() != old_acting_primary) {
6499 dout(10) << *this << " " << oldacting << " -> " << acting
6500 << ", acting primary "
6501 << old_acting_primary << " -> " << get_primary()
6502 << dendl;
6503 } else {
6504 // primary is the same.
6505 if (is_primary()) {
6506 // i am (still) primary. but my replica set changed.
6507 state_clear(PG_STATE_CLEAN);
6508
6509 dout(10) << oldacting << " -> " << acting
6510 << ", replicas changed" << dendl;
6511 }
6512 }
6513 }
6514 cancel_recovery();
6515
6516 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
6517 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
6518 osd->queue_want_pg_temp(info.pgid.pgid, acting);
6519 }
6520 }
6521
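// Recompute the feature sets for the new interval: acting_features is the
// intersection of the features of all acting OSDs, and upacting_features
// additionally intersects the features of the up OSDs.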
6522 void PG::on_new_interval()
6523 {
6524 const OSDMapRef osdmap = get_osdmap();
6525
6526 on_info_history_change();
6527
6528 // initialize features
6529 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6530 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6531 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
6532 if (*p == CRUSH_ITEM_NONE)
6533 continue;
6534 uint64_t f = osdmap->get_xinfo(*p).features;
6535 acting_features &= f;
6536 upacting_features &= f;
6537 }
6538 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6539 if (*p == CRUSH_ITEM_NONE)
6540 continue;
6541 upacting_features &= osdmap->get_xinfo(*p).features;
6542 }
6543
6544 _on_new_interval();
6545 }
6546
6547 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6548 {
6549 ceph_assert(!is_primary());
6550
6551 update_history(oinfo.history);
6552 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6553 info.stats.stats.sum.num_scrub_errors = 0;
6554 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6555 info.stats.stats.sum.num_deep_scrub_errors = 0;
6556 dirty_info = true;
6557 }
6558
6559 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6560 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6561 << dendl;
6562 info.purged_snaps = oinfo.purged_snaps;
6563 dirty_info = true;
6564 dirty_big_info = true;
6565 }
6566 }
6567
6568 ostream& operator<<(ostream& out, const PG& pg)
6569 {
6570 out << "pg[" << pg.info
6571 << " " << pg.up;
6572 if (pg.acting != pg.up)
6573 out << "/" << pg.acting;
6574 if (pg.is_ec_pg())
6575 out << "p" << pg.get_primary();
6576 if (!pg.async_recovery_targets.empty())
6577 out << " async=[" << pg.async_recovery_targets << "]";
6578 if (!pg.backfill_targets.empty())
6579 out << " backfill=[" << pg.backfill_targets << "]";
6580 out << " r=" << pg.get_role();
6581 out << " lpr=" << pg.get_last_peering_reset();
6582
6583 if (pg.deleting)
6584 out << " DELETING";
6585
6586 if (!pg.past_intervals.empty()) {
6587 out << " pi=[" << pg.past_intervals.get_bounds()
6588 << ")/" << pg.past_intervals.size();
6589 }
6590
6591 if (pg.is_peered()) {
6592 if (pg.last_update_ondisk != pg.info.last_update)
6593 out << " luod=" << pg.last_update_ondisk;
6594 if (pg.last_update_applied != pg.info.last_update)
6595 out << " lua=" << pg.last_update_applied;
6596 }
6597
6598 if (pg.recovery_ops_active)
6599 out << " rops=" << pg.recovery_ops_active;
6600
6601 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6602 pg.pg_log.get_head() != pg.info.last_update)
6603 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6604
6605 if (!pg.pg_log.get_log().empty()) {
6606 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6607 out << " (log bound mismatch, actual=["
6608 << pg.pg_log.get_log().log.begin()->version << ","
6609 << pg.pg_log.get_log().log.rbegin()->version << "]";
6610 out << ")";
6611 }
6612 }
6613
6614 out << " crt=" << pg.pg_log.get_can_rollback_to();
6615
6616 if (pg.last_complete_ondisk != pg.info.last_complete)
6617 out << " lcod " << pg.last_complete_ondisk;
6618
6619 if (pg.is_primary()) {
6620 out << " mlcod " << pg.min_last_complete_ondisk;
6621 }
6622
6623 out << " " << pg_state_string(pg.get_state());
6624 if (pg.should_send_notify())
6625 out << " NOTIFY";
6626
6627 if (pg.scrubber.must_repair)
6628 out << " MUST_REPAIR";
6629 if (pg.scrubber.auto_repair)
6630 out << " AUTO_REPAIR";
6631 if (pg.scrubber.check_repair)
6632 out << " CHECK_REPAIR";
6633 if (pg.scrubber.deep_scrub_on_error)
6634 out << " DEEP_SCRUB_ON_ERROR";
6635 if (pg.scrubber.must_deep_scrub)
6636 out << " MUST_DEEP_SCRUB";
6637 if (pg.scrubber.must_scrub)
6638 out << " MUST_SCRUB";
6639 if (pg.scrubber.time_for_deep)
6640 out << " TIME_FOR_DEEP";
6641 if (pg.scrubber.need_auto)
6642 out << " NEED_AUTO";
6643
6644 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6645 if (pg.pg_log.get_missing().num_missing()) {
6646 out << " m=" << pg.pg_log.get_missing().num_missing();
6647 if (pg.is_primary()) {
6648 uint64_t unfound = pg.get_num_unfound();
6649 if (unfound)
6650 out << " u=" << unfound;
6651 }
6652 }
6653 if (!pg.is_clean()) {
6654 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6655 }
6656 if (!pg.snap_trimq.empty()) {
6657 out << " trimq=";
6658 // only show a count if the set is large
6659 if (pg.snap_trimq.num_intervals() > 16) {
6660 out << pg.snap_trimq.size();
6661 } else {
6662 out << pg.snap_trimq;
6663 }
6664 }
6665 if (!pg.info.purged_snaps.empty()) {
6666 out << " ps="; // snap trim queue / purged snaps
6667 if (pg.info.purged_snaps.num_intervals() > 16) {
6668 out << pg.info.purged_snaps.size();
6669 } else {
6670 out << pg.info.purged_snaps;
6671 }
6672 }
6673
6674 out << "]";
6675
6676
6677 return out;
6678 }
6679
6680 bool PG::can_discard_op(OpRequestRef& op)
6681 {
6682 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6683 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6684 dout(20) << " discard " << *m << dendl;
6685 return true;
6686 }
6687
6688 if (m->get_map_epoch() < info.history.same_primary_since) {
6689 dout(7) << " changed after " << m->get_map_epoch()
6690 << ", dropping " << *m << dendl;
6691 return true;
6692 }
6693
6694 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6695 // >= luminous client
6696 if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
6697 // >= nautilus client
6698 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6699 dout(7) << __func__ << " sent before last_force_op_resend "
6700 << pool.info.last_force_op_resend
6701 << ", dropping" << *m << dendl;
6702 return true;
6703 }
6704 } else {
6705 // pre-nautilus client (luminous or mimic)
6706 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
6707 dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
6708 << pool.info.last_force_op_resend_prenautilus
6709 << ", dropping" << *m << dendl;
6710 return true;
6711 }
6712 }
6713 if (m->get_map_epoch() < info.history.last_epoch_split) {
6714 dout(7) << __func__ << " pg split in "
6715 << info.history.last_epoch_split << ", dropping" << dendl;
6716 return true;
6717 }
6718 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6719 // < luminous client
6720 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6721 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6722 << pool.info.last_force_op_resend_preluminous
6723 << ", dropping" << *m << dendl;
6724 return true;
6725 }
6726 }
6727
6728 return false;
6729 }
6730
6731 template<typename T, int MSGTYPE>
6732 bool PG::can_discard_replica_op(OpRequestRef& op)
6733 {
6734 const T *m = static_cast<const T *>(op->get_req());
6735 ceph_assert(m->get_type() == MSGTYPE);
6736
6737 int from = m->get_source().num();
6738
6739 // if a repop is replied after a replica goes down in a new osdmap, and
6740 // before the pg advances to this new osdmap, the repop replies before this
6741 // repop can be discarded by that replica OSD, because the primary resets the
6742 // connection to it when handling the new osdmap marking it down, and also
6743 // resets the messenger session when the replica reconnects. To avoid the
6744 // out-of-order replies, the messages from that replica should be discarded.
6745 OSDMapRef next_map = osd->get_next_osdmap();
6746 if (next_map->is_down(from))
6747 return true;
6748 /* Mostly, this overlaps with the old_peering_msg
6749 * condition. An important exception is pushes
6750 * sent by replicas not in the acting set, since
6751 * if such a replica goes down it does not cause
6752 * a new interval. */
6753 if (next_map->get_down_at(from) >= m->map_epoch)
6754 return true;
6755
6756 // same pg?
6757 // if pg changes _at all_, we reset and repeer!
6758 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6759 dout(10) << "can_discard_replica_op pg changed " << info.history
6760 << " after " << m->map_epoch
6761 << ", dropping" << dendl;
6762 return true;
6763 }
6764 return false;
6765 }
6766
6767 bool PG::can_discard_scan(OpRequestRef op)
6768 {
6769 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6770 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
6771
6772 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6773 dout(10) << " got old scan, ignoring" << dendl;
6774 return true;
6775 }
6776 return false;
6777 }
6778
6779 bool PG::can_discard_backfill(OpRequestRef op)
6780 {
6781 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6782 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6783
6784 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6785 dout(10) << " got old backfill, ignoring" << dendl;
6786 return true;
6787 }
6788
6789 return false;
6790
6791 }
6792
6793 bool PG::can_discard_request(OpRequestRef& op)
6794 {
6795 switch (op->get_req()->get_type()) {
6796 case CEPH_MSG_OSD_OP:
6797 return can_discard_op(op);
6798 case CEPH_MSG_OSD_BACKOFF:
6799 return false; // never discard
6800 case MSG_OSD_REPOP:
6801 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6802 case MSG_OSD_PG_PUSH:
6803 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6804 case MSG_OSD_PG_PULL:
6805 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6806 case MSG_OSD_PG_PUSH_REPLY:
6807 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6808 case MSG_OSD_REPOPREPLY:
6809 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6810 case MSG_OSD_PG_RECOVERY_DELETE:
6811 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6812
6813 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6814 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6815
6816 case MSG_OSD_EC_WRITE:
6817 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6818 case MSG_OSD_EC_WRITE_REPLY:
6819 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6820 case MSG_OSD_EC_READ:
6821 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6822 case MSG_OSD_EC_READ_REPLY:
6823 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6824 case MSG_OSD_REP_SCRUB:
6825 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6826 case MSG_OSD_SCRUB_RESERVE:
6827 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6828 case MSG_OSD_REP_SCRUBMAP:
6829 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6830 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6831 return can_discard_replica_op<
6832 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6833 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6834 return can_discard_replica_op<
6835 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6836
6837 case MSG_OSD_PG_SCAN:
6838 return can_discard_scan(op);
6839 case MSG_OSD_PG_BACKFILL:
6840 return can_discard_backfill(op);
6841 case MSG_OSD_PG_BACKFILL_REMOVE:
6842 return can_discard_replica_op<MOSDPGBackfillRemove,
6843 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6844 }
6845 return true;
6846 }
6847
6848 void PG::take_waiters()
6849 {
6850 dout(10) << "take_waiters" << dendl;
6851 requeue_map_waiters();
6852 }
6853
6854 void PG::do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rctx)
6855 {
6856 dout(10) << __func__ << ": " << evt->get_desc() << dendl;
6857 ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
6858 if (old_peering_evt(evt)) {
6859 dout(10) << "discard old " << evt->get_desc() << dendl;
6860 } else {
6861 recovery_state.handle_event(evt, rctx);
6862 }
6863 // write_if_dirty regardless of path above to ensure we capture any work
6864 // done by OSD::advance_pg().
6865 write_if_dirty(*rctx->transaction);
6866 }
6867
6868 void PG::queue_peering_event(PGPeeringEventRef evt)
6869 {
6870 if (old_peering_evt(evt))
6871 return;
6872 osd->osd->enqueue_peering_evt(info.pgid, evt);
6873 }
6874
6875 void PG::queue_null(epoch_t msg_epoch,
6876 epoch_t query_epoch)
6877 {
6878 dout(10) << "null" << dendl;
6879 queue_peering_event(
6880 PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
6881 NullEvt())));
6882 }
6883
6884 void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx)
6885 {
6886 /*
6887 * if we couldn't start any recovery ops and things are still
6888 * unfound, see if we can discover more missing object locations.
6889 * It may be that our initial locations were bad and we errored
6890 * out while trying to pull.
6891 */
6892 discover_all_missing(*rctx->query_map);
6893 if (rctx->query_map->empty()) {
6894 string action;
6895 if (state_test(PG_STATE_BACKFILLING)) {
6896 auto evt = PGPeeringEventRef(
6897 new PGPeeringEvent(
6898 queued,
6899 queued,
6900 PG::UnfoundBackfill()));
6901 queue_peering_event(evt);
6902 action = "in backfill";
6903 } else if (state_test(PG_STATE_RECOVERING)) {
6904 auto evt = PGPeeringEventRef(
6905 new PGPeeringEvent(
6906 queued,
6907 queued,
6908 PG::UnfoundRecovery()));
6909 queue_peering_event(evt);
6910 action = "in recovery";
6911 } else {
6912 action = "already out of recovery/backfill";
6913 }
6914 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
6915 } else {
6916 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
6917 queue_recovery();
6918 }
6919 }
6920
6921 void PG::handle_advance_map(
6922 OSDMapRef osdmap, OSDMapRef lastmap,
6923 vector<int>& newup, int up_primary,
6924 vector<int>& newacting, int acting_primary,
6925 RecoveryCtx *rctx)
6926 {
6927 ceph_assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6928 ceph_assert(lastmap == osdmap_ref);
6929 dout(10) << "handle_advance_map "
6930 << newup << "/" << newacting
6931 << " -- " << up_primary << "/" << acting_primary
6932 << dendl;
6933 update_osdmap_ref(osdmap);
6934 osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
6935
6936 pool.update(cct, osdmap);
6937
6938 AdvMap evt(
6939 osdmap, lastmap, newup, up_primary,
6940 newacting, acting_primary);
6941 recovery_state.handle_event(evt, rctx);
6942 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6943 on_pool_change();
6944 update_store_with_options();
6945 }
6946 last_require_osd_release = osdmap->require_osd_release;
6947 }
6948
6949 void PG::handle_activate_map(RecoveryCtx *rctx)
6950 {
6951 dout(10) << "handle_activate_map " << dendl;
6952 ActMap evt;
6953 recovery_state.handle_event(evt, rctx);
6954 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
6955 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6956 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6957 << last_persisted_osdmap
6958 << " while current is " << osdmap_ref->get_epoch() << dendl;
6959 dirty_info = true;
6960 } else {
6961 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6962 << last_persisted_osdmap
6963 << " while current is " << osdmap_ref->get_epoch() << dendl;
6964 }
6965 if (osdmap_ref->check_new_blacklist_entries()) {
6966 check_blacklisted_watchers();
6967 }
6968 write_if_dirty(*rctx->transaction);
6969 }
6970
6971 void PG::handle_initialize(RecoveryCtx *rctx)
6972 {
6973 dout(10) << __func__ << dendl;
6974 Initialize evt;
6975 recovery_state.handle_event(evt, rctx);
6976 }
6977
6978 void PG::handle_query_state(Formatter *f)
6979 {
6980 dout(10) << "handle_query_state" << dendl;
6981 QueryState q(f);
6982 recovery_state.handle_event(q, 0);
6983 }
6984
6985 void PG::update_store_with_options()
6986 {
6987 auto r = osd->store->set_collection_opts(ch, pool.info.opts);
6988 if (r < 0 && r != -EOPNOTSUPP) {
6989 derr << __func__ << " set_collection_opts returns error: " << r << dendl;
6990 }
6991 }
6992
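// C_DeleteMore re-queues the PG for another round of deletion work once the
// preceding transaction commits (unless the PG has been reset since 'epoch').
// complete() is overridden to do that work and delete this directly, so
// finish() should never be reached; hence the ceph_abort() below.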
6993 struct C_DeleteMore : public Context {
6994 PGRef pg;
6995 epoch_t epoch;
6996 C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
6997 void finish(int r) override {
6998 ceph_abort();
6999 }
7000 void complete(int r) override {
7001 ceph_assert(r == 0);
7002 pg->lock();
7003 if (!pg->pg_has_reset_since(epoch)) {
7004 pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
7005 }
7006 pg->unlock();
7007 delete this;
7008 }
7009 };
7010
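// _delete_some removes up to osd_target_transaction_size objects per call
// (capped by the store's ideal list size), optionally sleeping between rounds
// when osd_delete_sleep is set. Once the collection listing comes back empty
// it clears the log/info keys, removes the collection and marks the PG
// deleted, unless it raced with a PG merge, in which case the collection is
// re-created and the deletion state is reset.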
7011 void PG::_delete_some(ObjectStore::Transaction *t)
7012 {
7013 dout(10) << __func__ << dendl;
7014
7015 {
7016 float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
7017 if (osd_delete_sleep > 0 && delete_needs_sleep) {
7018 epoch_t e = get_osdmap()->get_epoch();
7019 PGRef pgref(this);
7020 auto delete_requeue_callback = new FunctionContext([this, pgref, e](int r) {
7021 dout(20) << __func__ << " wake up at "
7022 << ceph_clock_now()
7023 << ", re-queuing delete" << dendl;
7024 lock();
7025 delete_needs_sleep = false;
7026 if (!pg_has_reset_since(e)) {
7027 osd->queue_for_pg_delete(get_pgid(), e);
7028 }
7029 unlock();
7030 });
7031
7032 utime_t delete_schedule_time = ceph_clock_now();
7033 delete_schedule_time += osd_delete_sleep;
7034 Mutex::Locker l(osd->sleep_lock);
7035 osd->sleep_timer.add_event_at(delete_schedule_time,
7036 delete_requeue_callback);
7037 dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
7038 return;
7039 }
7040 }
7041
7042 delete_needs_sleep = true;
7043
7044 vector<ghobject_t> olist;
7045 int max = std::min(osd->store->get_ideal_list_max(),
7046 (int)cct->_conf->osd_target_transaction_size);
7047 ghobject_t next;
7048 osd->store->collection_list(
7049 ch,
7050 next,
7051 ghobject_t::get_max(),
7052 max,
7053 &olist,
7054 &next);
7055 dout(20) << __func__ << " " << olist << dendl;
7056
7057 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
7058 int64_t num = 0;
7059 for (auto& oid : olist) {
7060 if (oid == pgmeta_oid) {
7061 continue;
7062 }
7063 if (oid.is_pgmeta()) {
7064 osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
7065 << " during PG removal";
7066 }
7067 int r = snap_mapper.remove_oid(oid.hobj, &_t);
7068 if (r != 0 && r != -ENOENT) {
7069 ceph_abort();
7070 }
7071 t->remove(coll, oid);
7072 ++num;
7073 }
7074 if (num) {
7075 dout(20) << __func__ << " deleting " << num << " objects" << dendl;
7076 Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
7077 t->register_on_commit(fin);
7078 } else {
7079 dout(20) << __func__ << " finished" << dendl;
7080 if (cct->_conf->osd_inject_failure_on_pg_removal) {
7081 _exit(1);
7082 }
7083
7084 // final flush here to ensure completions drop refs. Of particular concern
7085 // are the SnapMapper ContainerContexts.
7086 {
7087 PGRef pgref(this);
7088 PGLog::clear_info_log(info.pgid, t);
7089 t->remove_collection(coll);
7090 t->register_on_commit(new ContainerContext<PGRef>(pgref));
7091 t->register_on_applied(new ContainerContext<PGRef>(pgref));
7092 osd->store->queue_transaction(ch, std::move(*t));
7093 }
7094 ch->flush();
7095
7096 if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
7097 dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
7098 ch = osd->store->create_new_collection(coll);
7099 _create(*t,
7100 info.pgid,
7101 info.pgid.get_split_bits(pool.info.get_pg_num()));
7102 _init(*t, info.pgid, &pool.info);
7103 last_epoch = 0; // to ensure pg epoch is also written
7104 dirty_info = true;
7105 dirty_big_info = true;
7106 } else {
7107 deleted = true;
7108
7109 // cancel reserver here, since the PG is about to get deleted and the
7110 // exit() methods don't run when that happens.
7111 osd->local_reserver.cancel_reservation(info.pgid);
7112
7113 osd->logger->dec(l_osd_pg_removing);
7114 }
7115 }
7116 }
7117
7118 // Compute pending backfill data
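// For example, if the primary reports bf_bytes of 10 GiB while this replica
// already holds local_bytes of 3 GiB, the pending adjustment is 7 GiB; once
// local usage catches up to the primary's, the result clamps to zero.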
7119 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
7120 {
7121 lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
7122 << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
7123 return std::max((int64_t)0, bf_bytes - local_bytes);
7124 }
7125
7126 int PG::pg_stat_adjust(osd_stat_t *ns)
7127 {
7128 osd_stat_t &new_stat = *ns;
7129 if (is_primary()) {
7130 return 0;
7131 }
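  // Only replicas adjust their stats: the backfill reservation (set via
  // set_reserved_num_bytes() in RepNotRecovering below) is held on the
  // backfill target, so the primary has nothing to account for here.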
7132 // Adjust the kb_used by adding pending backfill data
7133 uint64_t reserved_num_bytes = get_reserved_num_bytes();
7134
7135 // For now we don't consider projected space gains here.
7136 // I suggest we have an optional two-pass backfill that frees up
7137 // space in a first pass. This could be triggered when the OSD is nearfull
7138 // or close to backfillfull.
7139 if (reserved_num_bytes > 0) {
7140 // TODO: Handle compression by adjusting by the PG's average
7141 // compression percentage.
7142 dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
7143 << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7144 if (new_stat.statfs.available > reserved_num_bytes)
7145 new_stat.statfs.available -= reserved_num_bytes;
7146 else
7147 new_stat.statfs.available = 0;
7148 dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7149 return 1;
7150 }
7151 return 0;
7152 }
7153
7154
7155 /*------------ Recovery State Machine----------------*/
7156 #undef dout_prefix
7157 #define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \
7158 << "state<" << get_state_name() << ">: ")
7159
7160 /*------Crashed-------*/
7161 PG::RecoveryState::Crashed::Crashed(my_context ctx)
7162 : my_base(ctx),
7163 NamedState(context< RecoveryMachine >().pg, "Crashed")
7164 {
7165 context< RecoveryMachine >().log_enter(state_name);
7166 ceph_abort_msg("we got a bad state machine event");
7167 }
7168
7169
7170 /*------Initial-------*/
7171 PG::RecoveryState::Initial::Initial(my_context ctx)
7172 : my_base(ctx),
7173 NamedState(context< RecoveryMachine >().pg, "Initial")
7174 {
7175 context< RecoveryMachine >().log_enter(state_name);
7176 }
7177
7178 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
7179 {
7180 PG *pg = context< RecoveryMachine >().pg;
7181 pg->proc_replica_info(
7182 notify.from, notify.notify.info, notify.notify.epoch_sent);
7183 pg->set_last_peering_reset();
7184 return transit< Primary >();
7185 }
7186
7187 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
7188 {
7189 PG *pg = context< RecoveryMachine >().pg;
7190 ceph_assert(!pg->is_primary());
7191 post_event(i);
7192 return transit< Stray >();
7193 }
7194
7195 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
7196 {
7197 PG *pg = context< RecoveryMachine >().pg;
7198 ceph_assert(!pg->is_primary());
7199 post_event(i);
7200 return transit< Stray >();
7201 }
7202
7203 void PG::RecoveryState::Initial::exit()
7204 {
7205 context< RecoveryMachine >().log_exit(state_name, enter_time);
7206 PG *pg = context< RecoveryMachine >().pg;
7207 utime_t dur = ceph_clock_now() - enter_time;
7208 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
7209 }
7210
7211 /*------Started-------*/
7212 PG::RecoveryState::Started::Started(my_context ctx)
7213 : my_base(ctx),
7214 NamedState(context< RecoveryMachine >().pg, "Started")
7215 {
7216 context< RecoveryMachine >().log_enter(state_name);
7217 }
7218
7219 boost::statechart::result
7220 PG::RecoveryState::Started::react(const IntervalFlush&)
7221 {
7222 PG *pg = context< RecoveryMachine >().pg;
7223 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7224 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7225 return discard_event();
7226 }
7227
7228 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
7229 {
7230 PG *pg = context< RecoveryMachine >().pg;
7231 ldout(pg->cct, 10) << "Started advmap" << dendl;
7232 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7233 if (pg->should_restart_peering(
7234 advmap.up_primary,
7235 advmap.acting_primary,
7236 advmap.newup,
7237 advmap.newacting,
7238 advmap.lastmap,
7239 advmap.osdmap)) {
7240 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
7241 << dendl;
7242 post_event(advmap);
7243 return transit< Reset >();
7244 }
7245 pg->remove_down_peer_info(advmap.osdmap);
7246 return discard_event();
7247 }
7248
7249 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
7250 {
7251 q.f->open_object_section("state");
7252 q.f->dump_string("name", state_name);
7253 q.f->dump_stream("enter_time") << enter_time;
7254 q.f->close_section();
7255 return discard_event();
7256 }
7257
7258 void PG::RecoveryState::Started::exit()
7259 {
7260 context< RecoveryMachine >().log_exit(state_name, enter_time);
7261 PG *pg = context< RecoveryMachine >().pg;
7262 utime_t dur = ceph_clock_now() - enter_time;
7263 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
7264 }
7265
7266 /*--------Reset---------*/
7267 PG::RecoveryState::Reset::Reset(my_context ctx)
7268 : my_base(ctx),
7269 NamedState(context< RecoveryMachine >().pg, "Reset")
7270 {
7271 context< RecoveryMachine >().log_enter(state_name);
7272 PG *pg = context< RecoveryMachine >().pg;
7273
7274 pg->flushes_in_progress = 0;
7275 pg->set_last_peering_reset();
7276 }
7277
7278 boost::statechart::result
7279 PG::RecoveryState::Reset::react(const IntervalFlush&)
7280 {
7281 PG *pg = context< RecoveryMachine >().pg;
7282 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7283 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7284 return discard_event();
7285 }
7286
7287 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
7288 {
7289 PG *pg = context< RecoveryMachine >().pg;
7290 ldout(pg->cct, 10) << "Reset advmap" << dendl;
7291
7292 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7293
7294 if (pg->should_restart_peering(
7295 advmap.up_primary,
7296 advmap.acting_primary,
7297 advmap.newup,
7298 advmap.newacting,
7299 advmap.lastmap,
7300 advmap.osdmap)) {
7301 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
7302 << dendl;
7303 pg->start_peering_interval(
7304 advmap.lastmap,
7305 advmap.newup, advmap.up_primary,
7306 advmap.newacting, advmap.acting_primary,
7307 context< RecoveryMachine >().get_cur_transaction());
7308 }
7309 pg->remove_down_peer_info(advmap.osdmap);
7310 pg->check_past_interval_bounds();
7311 return discard_event();
7312 }
7313
7314 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
7315 {
7316 PG *pg = context< RecoveryMachine >().pg;
7317 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7318 context< RecoveryMachine >().send_notify(
7319 pg->get_primary(),
7320 pg_notify_t(
7321 pg->get_primary().shard, pg->pg_whoami.shard,
7322 pg->get_osdmap_epoch(),
7323 pg->get_osdmap_epoch(),
7324 pg->info),
7325 pg->past_intervals);
7326 }
7327
7328 pg->update_heartbeat_peers();
7329 pg->take_waiters();
7330
7331 return transit< Started >();
7332 }
7333
7334 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
7335 {
7336 q.f->open_object_section("state");
7337 q.f->dump_string("name", state_name);
7338 q.f->dump_stream("enter_time") << enter_time;
7339 q.f->close_section();
7340 return discard_event();
7341 }
7342
7343 void PG::RecoveryState::Reset::exit()
7344 {
7345 context< RecoveryMachine >().log_exit(state_name, enter_time);
7346 PG *pg = context< RecoveryMachine >().pg;
7347 utime_t dur = ceph_clock_now() - enter_time;
7348 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
7349 }
7350
7351 /*-------Start---------*/
7352 PG::RecoveryState::Start::Start(my_context ctx)
7353 : my_base(ctx),
7354 NamedState(context< RecoveryMachine >().pg, "Start")
7355 {
7356 context< RecoveryMachine >().log_enter(state_name);
7357
7358 PG *pg = context< RecoveryMachine >().pg;
7359 if (pg->is_primary()) {
7360 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
7361 post_event(MakePrimary());
7362 } else { //is_stray
7363 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
7364 post_event(MakeStray());
7365 }
7366 }
7367
7368 void PG::RecoveryState::Start::exit()
7369 {
7370 context< RecoveryMachine >().log_exit(state_name, enter_time);
7371 PG *pg = context< RecoveryMachine >().pg;
7372 utime_t dur = ceph_clock_now() - enter_time;
7373 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
7374 }
7375
7376 /*---------Primary--------*/
7377 PG::RecoveryState::Primary::Primary(my_context ctx)
7378 : my_base(ctx),
7379 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
7380 {
7381 context< RecoveryMachine >().log_enter(state_name);
7382 PG *pg = context< RecoveryMachine >().pg;
7383 ceph_assert(pg->want_acting.empty());
7384
7385 // set CREATING bit until we have peered for the first time.
7386 if (pg->info.history.last_epoch_started == 0) {
7387 pg->state_set(PG_STATE_CREATING);
7388 // use the history timestamp, which ultimately comes from the
7389 // monitor in the create case.
7390 utime_t t = pg->info.history.last_scrub_stamp;
7391 pg->info.stats.last_fresh = t;
7392 pg->info.stats.last_active = t;
7393 pg->info.stats.last_change = t;
7394 pg->info.stats.last_peered = t;
7395 pg->info.stats.last_clean = t;
7396 pg->info.stats.last_unstale = t;
7397 pg->info.stats.last_undegraded = t;
7398 pg->info.stats.last_fullsized = t;
7399 pg->info.stats.last_scrub_stamp = t;
7400 pg->info.stats.last_deep_scrub_stamp = t;
7401 pg->info.stats.last_clean_scrub_stamp = t;
7402 }
7403 }
7404
7405 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
7406 {
7407 PG *pg = context< RecoveryMachine >().pg;
7408 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7409 pg->proc_replica_info(
7410 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7411 return discard_event();
7412 }
7413
7414 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
7415 {
7416 PG *pg = context< RecoveryMachine >().pg;
7417 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
7418 pg->publish_stats_to_osd();
7419 pg->take_waiters();
7420 return discard_event();
7421 }
7422
7423 boost::statechart::result PG::RecoveryState::Primary::react(
7424 const SetForceRecovery&)
7425 {
7426 PG *pg = context< RecoveryMachine >().pg;
7427 pg->set_force_recovery(true);
7428 return discard_event();
7429 }
7430
7431 boost::statechart::result PG::RecoveryState::Primary::react(
7432 const UnsetForceRecovery&)
7433 {
7434 PG *pg = context< RecoveryMachine >().pg;
7435 pg->set_force_recovery(false);
7436 return discard_event();
7437 }
7438
7439 boost::statechart::result PG::RecoveryState::Primary::react(
7440 const RequestScrub& evt)
7441 {
7442 PG *pg = context< RecoveryMachine >().pg;
7443 if (pg->is_primary()) {
7444 pg->scrub_requested(evt.deep, evt.repair);
7445 ldout(pg->cct,10) << "marking for scrub" << dendl;
7446 }
7447 return discard_event();
7448 }
7449
7450 boost::statechart::result PG::RecoveryState::Primary::react(
7451 const SetForceBackfill&)
7452 {
7453 PG *pg = context< RecoveryMachine >().pg;
7454 pg->set_force_backfill(true);
7455 return discard_event();
7456 }
7457
7458 boost::statechart::result PG::RecoveryState::Primary::react(
7459 const UnsetForceBackfill&)
7460 {
7461 PG *pg = context< RecoveryMachine >().pg;
7462 pg->set_force_backfill(false);
7463 return discard_event();
7464 }
7465
7466 void PG::RecoveryState::Primary::exit()
7467 {
7468 context< RecoveryMachine >().log_exit(state_name, enter_time);
7469 PG *pg = context< RecoveryMachine >().pg;
7470 pg->want_acting.clear();
7471 utime_t dur = ceph_clock_now() - enter_time;
7472 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
7473 pg->clear_primary_state();
7474 pg->state_clear(PG_STATE_CREATING);
7475 }
7476
7477 /*---------Peering--------*/
7478 PG::RecoveryState::Peering::Peering(my_context ctx)
7479 : my_base(ctx),
7480 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
7481 history_les_bound(false)
7482 {
7483 context< RecoveryMachine >().log_enter(state_name);
7484
7485 PG *pg = context< RecoveryMachine >().pg;
7486 ceph_assert(!pg->is_peered());
7487 ceph_assert(!pg->is_peering());
7488 ceph_assert(pg->is_primary());
7489 pg->state_set(PG_STATE_PEERING);
7490 }
7491
7492 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
7493 {
7494 PG *pg = context< RecoveryMachine >().pg;
7495 ldout(pg->cct, 10) << "Peering advmap" << dendl;
7496 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
7497 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
7498 post_event(advmap);
7499 return transit< Reset >();
7500 }
7501
7502 pg->adjust_need_up_thru(advmap.osdmap);
7503
7504 return forward_event();
7505 }
7506
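// The QueryState handler below dumps, roughly, under "state":
//   { "name": ..., "enter_time": ...,
//     "past_intervals": [...], "probing_osds": [...],
//     "blocked": ... (only when peering is blocked on down osds),
//     "down_osds_we_would_probe": [...], "peering_blocked_by": [...],
//     "peering_blocked_by_detail": [...] (only when history_les_bound) }
// This is the structure typically surfaced via 'ceph pg <pgid> query'.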
7507 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
7508 {
7509 PG *pg = context< RecoveryMachine >().pg;
7510
7511 q.f->open_object_section("state");
7512 q.f->dump_string("name", state_name);
7513 q.f->dump_stream("enter_time") << enter_time;
7514
7515 q.f->open_array_section("past_intervals");
7516 pg->past_intervals.dump(q.f);
7517 q.f->close_section();
7518
7519 q.f->open_array_section("probing_osds");
7520 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
7521 p != prior_set.probe.end();
7522 ++p)
7523 q.f->dump_stream("osd") << *p;
7524 q.f->close_section();
7525
7526 if (prior_set.pg_down)
7527 q.f->dump_string("blocked", "peering is blocked due to down osds");
7528
7529 q.f->open_array_section("down_osds_we_would_probe");
7530 for (set<int>::iterator p = prior_set.down.begin();
7531 p != prior_set.down.end();
7532 ++p)
7533 q.f->dump_int("osd", *p);
7534 q.f->close_section();
7535
7536 q.f->open_array_section("peering_blocked_by");
7537 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
7538 p != prior_set.blocked_by.end();
7539 ++p) {
7540 q.f->open_object_section("osd");
7541 q.f->dump_int("osd", p->first);
7542 q.f->dump_int("current_lost_at", p->second);
7543 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
7544 q.f->close_section();
7545 }
7546 q.f->close_section();
7547
7548 if (history_les_bound) {
7549 q.f->open_array_section("peering_blocked_by_detail");
7550 q.f->open_object_section("item");
7551 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
7552 q.f->close_section();
7553 q.f->close_section();
7554 }
7555
7556 q.f->close_section();
7557 return forward_event();
7558 }
7559
7560 void PG::RecoveryState::Peering::exit()
7561 {
7562 PG *pg = context< RecoveryMachine >().pg;
7563 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
7564 context< RecoveryMachine >().log_exit(state_name, enter_time);
7565 pg->state_clear(PG_STATE_PEERING);
7566 pg->clear_probe_targets();
7567
7568 utime_t dur = ceph_clock_now() - enter_time;
7569 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
7570 }
7571
7572
7573 /*------Backfilling-------*/
7574 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
7575 : my_base(ctx),
7576 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
7577 {
7578 context< RecoveryMachine >().log_enter(state_name);
7579 PG *pg = context< RecoveryMachine >().pg;
7580 pg->backfill_reserved = true;
7581 pg->queue_recovery();
7582 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7583 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7584 pg->state_set(PG_STATE_BACKFILLING);
7585 pg->publish_stats_to_osd();
7586 }
7587
7588 void PG::RecoveryState::Backfilling::backfill_release_reservations()
7589 {
7590 PG *pg = context< RecoveryMachine >().pg;
7591 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7592 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
7593 it != pg->backfill_targets.end();
7594 ++it) {
7595 ceph_assert(*it != pg->pg_whoami);
7596 ConnectionRef con = pg->osd->get_con_osd_cluster(
7597 it->osd, pg->get_osdmap_epoch());
7598 if (con) {
7599 pg->osd->send_message_osd_cluster(
7600 new MBackfillReserve(
7601 MBackfillReserve::RELEASE,
7602 spg_t(pg->info.pgid.pgid, it->shard),
7603 pg->get_osdmap_epoch()),
7604 con.get());
7605 }
7606 }
7607 }
7608
7609 void PG::RecoveryState::Backfilling::cancel_backfill()
7610 {
7611 PG *pg = context< RecoveryMachine >().pg;
7612 backfill_release_reservations();
7613 if (!pg->waiting_on_backfill.empty()) {
7614 pg->waiting_on_backfill.clear();
7615 pg->finish_recovery_op(hobject_t::get_max());
7616 }
7617 }
7618
7619 boost::statechart::result
7620 PG::RecoveryState::Backfilling::react(const Backfilled &c)
7621 {
7622 backfill_release_reservations();
7623 return transit<Recovered>();
7624 }
7625
7626 boost::statechart::result
7627 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
7628 {
7629 PG *pg = context< RecoveryMachine >().pg;
7630 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
7631 pg->state_set(PG_STATE_BACKFILL_WAIT);
7632 pg->state_clear(PG_STATE_BACKFILLING);
7633 cancel_backfill();
7634 pg->schedule_backfill_retry(c.delay);
7635 return transit<NotBackfilling>();
7636 }
7637
7638 boost::statechart::result
7639 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
7640 {
7641 PG *pg = context< RecoveryMachine >().pg;
7642 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
7643 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
7644 pg->state_clear(PG_STATE_BACKFILLING);
7645 cancel_backfill();
7646 return transit<NotBackfilling>();
7647 }
7648
7649 boost::statechart::result
7650 PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &)
7651 {
7652 PG *pg = context< RecoveryMachine >().pg;
7653 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7654 pg->state_clear(PG_STATE_BACKFILLING);
7655 cancel_backfill();
7656 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7657 return transit<NotBackfilling>();
7658 }
7659
7660 boost::statechart::result
7661 PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &)
7662 {
7663 PG *pg = context< RecoveryMachine >().pg;
7664 pg->state_set(PG_STATE_BACKFILL_WAIT);
7665 cancel_backfill();
7666 if (pg->needs_backfill()) {
7667 return transit<WaitLocalBackfillReserved>();
7668 } else {
7669 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
7670 return discard_event();
7671 }
7672 }
7673
7674 void PG::RecoveryState::Backfilling::exit()
7675 {
7676 context< RecoveryMachine >().log_exit(state_name, enter_time);
7677 PG *pg = context< RecoveryMachine >().pg;
7678 pg->backfill_reserved = false;
7679 pg->backfill_reserving = false;
7680 pg->state_clear(PG_STATE_BACKFILLING);
7681 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7682 utime_t dur = ceph_clock_now() - enter_time;
7683 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
7684 }
7685
7686 /*--WaitRemoteBackfillReserved--*/
7687
7688 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
7689 : my_base(ctx),
7690 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7691 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
7692 {
7693 context< RecoveryMachine >().log_enter(state_name);
7694 PG *pg = context< RecoveryMachine >().pg;
7695 pg->state_set(PG_STATE_BACKFILL_WAIT);
7696 pg->publish_stats_to_osd();
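  // Kick off the reservation loop by posting a synthetic RemoteBackfillReserved
  // to ourselves; each grant (including this initial one) advances
  // backfill_osd_it and sends the next MBackfillReserve REQUEST until all
  // remote shards are reserved.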
7697 post_event(RemoteBackfillReserved());
7698 }
7699
7700 boost::statechart::result
7701 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
7702 {
7703 PG *pg = context< RecoveryMachine >().pg;
7704
7705 int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
7706 ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
7707 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
7708 //The primary never backfills itself
7709 ceph_assert(*backfill_osd_it != pg->pg_whoami);
7710 ConnectionRef con = pg->osd->get_con_osd_cluster(
7711 backfill_osd_it->osd, pg->get_osdmap_epoch());
7712 if (con) {
7713 pg->osd->send_message_osd_cluster(
7714 new MBackfillReserve(
7715 MBackfillReserve::REQUEST,
7716 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7717 pg->get_osdmap_epoch(),
7718 pg->get_backfill_priority(),
7719 num_bytes,
7720 pg->peer_bytes[*backfill_osd_it]),
7721 con.get());
7722 }
7723 ++backfill_osd_it;
7724 } else {
7725 pg->peer_bytes.clear();
7726 post_event(AllBackfillsReserved());
7727 }
7728 return discard_event();
7729 }
7730
7731 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7732 {
7733 context< RecoveryMachine >().log_exit(state_name, enter_time);
7734 PG *pg = context< RecoveryMachine >().pg;
7735 utime_t dur = ceph_clock_now() - enter_time;
7736 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7737 }
7738
7739 void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
7740 {
7741 PG *pg = context< RecoveryMachine >().pg;
7742 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7743
7744 // Send CANCEL to all previously acquired reservations
7745 set<pg_shard_t>::const_iterator it, begin, end;
7746 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7747 end = context< Active >().remote_shards_to_reserve_backfill.end();
7748 ceph_assert(begin != end);
7749 for (it = begin; it != backfill_osd_it; ++it) {
7750 //The primary never backfills itself
7751 ceph_assert(*it != pg->pg_whoami);
7752 ConnectionRef con = pg->osd->get_con_osd_cluster(
7753 it->osd, pg->get_osdmap_epoch());
7754 if (con) {
7755 pg->osd->send_message_osd_cluster(
7756 new MBackfillReserve(
7757 MBackfillReserve::RELEASE,
7758 spg_t(pg->info.pgid.pgid, it->shard),
7759 pg->get_osdmap_epoch()),
7760 con.get());
7761 }
7762 }
7763
7764 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7765 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7766 pg->publish_stats_to_osd();
7767
7768 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7769 }
7770
7771 boost::statechart::result
7772 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
7773 {
7774 retry();
7775 return transit<NotBackfilling>();
7776 }
7777
7778 boost::statechart::result
7779 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
7780 {
7781 retry();
7782 return transit<NotBackfilling>();
7783 }
7784
7785 /*--WaitLocalBackfillReserved--*/
7786 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7787 : my_base(ctx),
7788 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7789 {
7790 context< RecoveryMachine >().log_enter(state_name);
7791 PG *pg = context< RecoveryMachine >().pg;
7792 pg->state_set(PG_STATE_BACKFILL_WAIT);
7793 pg->osd->local_reserver.request_reservation(
7794 pg->info.pgid,
7795 new QueuePeeringEvt<LocalBackfillReserved>(
7796 pg, pg->get_osdmap_epoch(),
7797 LocalBackfillReserved()),
7798 pg->get_backfill_priority(),
7799 new QueuePeeringEvt<DeferBackfill>(
7800 pg, pg->get_osdmap_epoch(),
7801 DeferBackfill(0.0)));
7802 pg->publish_stats_to_osd();
7803 }
7804
7805 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7806 {
7807 context< RecoveryMachine >().log_exit(state_name, enter_time);
7808 PG *pg = context< RecoveryMachine >().pg;
7809 utime_t dur = ceph_clock_now() - enter_time;
7810 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7811 }
7812
7813 /*----NotBackfilling------*/
7814 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7815 : my_base(ctx),
7816 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7817 {
7818 context< RecoveryMachine >().log_enter(state_name);
7819 PG *pg = context< RecoveryMachine >().pg;
7820 pg->state_clear(PG_STATE_REPAIR);
7821 pg->publish_stats_to_osd();
7822 }
7823
7824 boost::statechart::result
7825 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7826 {
7827 return discard_event();
7828 }
7829
7830 boost::statechart::result
7831 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
7832 {
7833 return discard_event();
7834 }
7835
7836 void PG::RecoveryState::NotBackfilling::exit()
7837 {
7838 context< RecoveryMachine >().log_exit(state_name, enter_time);
7839 PG *pg = context< RecoveryMachine >().pg;
7840 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7841 utime_t dur = ceph_clock_now() - enter_time;
7842 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7843 }
7844
7845 /*----NotRecovering------*/
7846 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7847 : my_base(ctx),
7848 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7849 {
7850 context< RecoveryMachine >().log_enter(state_name);
7851 PG *pg = context< RecoveryMachine >().pg;
7852 pg->publish_stats_to_osd();
7853 }
7854
7855 void PG::RecoveryState::NotRecovering::exit()
7856 {
7857 context< RecoveryMachine >().log_exit(state_name, enter_time);
7858 PG *pg = context< RecoveryMachine >().pg;
7859 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7860 utime_t dur = ceph_clock_now() - enter_time;
7861 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7862 }
7863
7864 /*---RepNotRecovering----*/
7865 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7866 : my_base(ctx),
7867 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7868 {
7869 context< RecoveryMachine >().log_enter(state_name);
7870 }
7871
7872 boost::statechart::result
7873 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
7874 {
7875 PG *pg = context< RecoveryMachine >().pg;
7876 pg->reject_reservation();
7877 post_event(RemoteReservationRejected());
7878 return discard_event();
7879 }
7880
7881 void PG::RecoveryState::RepNotRecovering::exit()
7882 {
7883 context< RecoveryMachine >().log_exit(state_name, enter_time);
7884 PG *pg = context< RecoveryMachine >().pg;
7885 utime_t dur = ceph_clock_now() - enter_time;
7886 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7887 }
7888
7889 /*---RepWaitRecoveryReserved--*/
7890 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7891 : my_base(ctx),
7892 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7893 {
7894 context< RecoveryMachine >().log_enter(state_name);
7895 }
7896
7897 boost::statechart::result
7898 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7899 {
7900 PG *pg = context< RecoveryMachine >().pg;
7901 pg->osd->send_message_osd_cluster(
7902 pg->primary.osd,
7903 new MRecoveryReserve(
7904 MRecoveryReserve::GRANT,
7905 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7906 pg->get_osdmap_epoch()),
7907 pg->get_osdmap_epoch());
7908 return transit<RepRecovering>();
7909 }
7910
7911 boost::statechart::result
7912 PG::RecoveryState::RepWaitRecoveryReserved::react(
7913 const RemoteReservationCanceled &evt)
7914 {
7915 PG *pg = context< RecoveryMachine >().pg;
7916 pg->clear_reserved_num_bytes();
7917 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7918 return transit<RepNotRecovering>();
7919 }
7920
7921 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7922 {
7923 context< RecoveryMachine >().log_exit(state_name, enter_time);
7924 PG *pg = context< RecoveryMachine >().pg;
7925 utime_t dur = ceph_clock_now() - enter_time;
7926 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7927 }
7928
7929 /*-RepWaitBackfillReserved*/
7930 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7931 : my_base(ctx),
7932 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7933 {
7934 context< RecoveryMachine >().log_enter(state_name);
7935 }
7936
7937 boost::statechart::result
7938 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7939 {
7940 PG *pg = context< RecoveryMachine >().pg;
7941 // Use tentative_backfill_full() to make sure enough
7942 // space is available to handle target bytes from primary.
7943
7944 // TODO: If we passed num_objects from primary we could account for
7945 // an estimate of the metadata overhead.
7946
7947 // TODO: If we had compressed_allocated and compressed_original from primary
7948 // we could compute compression ratio and adjust accordingly.
7949
7950 // XXX: There is no way to get omap overhead and this would only apply
7951 // to whatever (possibly different) partition is storing the database.
7952
7953 // update_osd_stat() from heartbeat will apply this adjustment on a new
7954 // statfs using pg->primary_num_bytes.
7955 uint64_t pending_adjustment = 0;
7956 int64_t primary_num_bytes = evt.primary_num_bytes;
7957 int64_t local_num_bytes = evt.local_num_bytes;
7958 if (primary_num_bytes) {
7959 // For an erasure-coded pool, overestimate by a full stripe per object
7960 // because we don't know how each object rounded to the nearest stripe
7961 if (pg->pool.info.is_erasure()) {
7962 primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7963 primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7964 local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7965 local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7966 }
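      // Illustrative numbers (not from the source): with k=4 data chunks and a
      // 16 KiB stripe chunk, 400 MiB of logical primary bytes spread over 1000
      // objects becomes 400 MiB / 4 + 1000 * 16 KiB, i.e. roughly 115.6 MiB
      // per shard.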
7967 pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
7968 ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
7969 << " local " << (local_num_bytes >> 10) << "KiB"
7970 << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
7971 << dendl;
7972 }
7973 // This lock protects not only the OSDService stats but also the setting of the pg primary_num_bytes.
7974 // That's why we don't unlock immediately.
7975 Mutex::Locker l(pg->osd->stat_lock);
7976 osd_stat_t cur_stat = pg->osd->osd_stat;
7977 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7978 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7979 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7980 << dendl;
7981 post_event(RejectRemoteReservation());
7982 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7983 pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
7984 ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
7985 << dendl;
7986 post_event(RejectRemoteReservation());
7987 } else {
7988 Context *preempt = nullptr;
7989 // Don't reserve space if the reservation check was skipped; this is used
7990 // to test the other backfill full check AND to handle the case where a
7991 // corruption of num_bytes requires ignoring that value and trying the
7992 // backfill anyway.
7993 if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
7994 pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
7995 else
7996 pg->clear_reserved_num_bytes();
7997 // Use un-ec-adjusted bytes for stats.
7998 pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
7999 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
8000 // older peers will interpret preemption as TOOFULL
8001 preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
8002 pg, pg->get_osdmap_epoch(),
8003 RemoteBackfillPreempted());
8004 }
8005 pg->osd->remote_reserver.request_reservation(
8006 pg->info.pgid,
8007 new QueuePeeringEvt<RemoteBackfillReserved>(
8008 pg, pg->get_osdmap_epoch(),
8009 RemoteBackfillReserved()),
8010 evt.priority,
8011 preempt);
8012 }
8013 return transit<RepWaitBackfillReserved>();
8014 }
8015
8016 boost::statechart::result
8017 PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
8018 {
8019 PG *pg = context< RecoveryMachine >().pg;
8020
8021 // fall back to a local reckoning of priority if the primary doesn't pass one
8022 // (pre-mimic compat)
8023 int prio = evt.priority ? evt.priority : pg->get_recovery_priority();
8024
8025 Context *preempt = nullptr;
8026 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
8027 // older peers can't handle this
8028 preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>(
8029 pg, pg->get_osdmap_epoch(),
8030 RemoteRecoveryPreempted());
8031 }
8032
8033 pg->osd->remote_reserver.request_reservation(
8034 pg->info.pgid,
8035 new QueuePeeringEvt<RemoteRecoveryReserved>(
8036 pg, pg->get_osdmap_epoch(),
8037 RemoteRecoveryReserved()),
8038 prio,
8039 preempt);
8040 return transit<RepWaitRecoveryReserved>();
8041 }
8042
8043 void PG::RecoveryState::RepWaitBackfillReserved::exit()
8044 {
8045 context< RecoveryMachine >().log_exit(state_name, enter_time);
8046 PG *pg = context< RecoveryMachine >().pg;
8047 utime_t dur = ceph_clock_now() - enter_time;
8048 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
8049 }
8050
8051 boost::statechart::result
8052 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
8053 {
8054 PG *pg = context< RecoveryMachine >().pg;
8055
8056 pg->osd->send_message_osd_cluster(
8057 pg->primary.osd,
8058 new MBackfillReserve(
8059 MBackfillReserve::GRANT,
8060 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8061 pg->get_osdmap_epoch()),
8062 pg->get_osdmap_epoch());
8063 return transit<RepRecovering>();
8064 }
8065
8066 boost::statechart::result
8067 PG::RecoveryState::RepWaitBackfillReserved::react(
8068 const RejectRemoteReservation &evt)
8069 {
8070 PG *pg = context< RecoveryMachine >().pg;
8071 pg->reject_reservation();
8072 post_event(RemoteReservationRejected());
8073 return discard_event();
8074 }
8075
8076 boost::statechart::result
8077 PG::RecoveryState::RepWaitBackfillReserved::react(
8078 const RemoteReservationRejected &evt)
8079 {
8080 PG *pg = context< RecoveryMachine >().pg;
8081 pg->clear_reserved_num_bytes();
8082 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8083 return transit<RepNotRecovering>();
8084 }
8085
8086 boost::statechart::result
8087 PG::RecoveryState::RepWaitBackfillReserved::react(
8088 const RemoteReservationCanceled &evt)
8089 {
8090 PG *pg = context< RecoveryMachine >().pg;
8091 pg->clear_reserved_num_bytes();
8092 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8093 return transit<RepNotRecovering>();
8094 }
8095
8096 /*---RepRecovering-------*/
8097 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
8098 : my_base(ctx),
8099 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
8100 {
8101 context< RecoveryMachine >().log_enter(state_name);
8102 }
8103
8104 boost::statechart::result
8105 PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
8106 {
8107 PG *pg = context< RecoveryMachine >().pg;
8108 pg->clear_reserved_num_bytes();
8109 pg->osd->send_message_osd_cluster(
8110 pg->primary.osd,
8111 new MRecoveryReserve(
8112 MRecoveryReserve::REVOKE,
8113 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8114 pg->get_osdmap_epoch()),
8115 pg->get_osdmap_epoch());
8116 return discard_event();
8117 }
8118
8119 boost::statechart::result
8120 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
8121 {
8122 PG *pg = context< RecoveryMachine >().pg;
8123 pg->clear_reserved_num_bytes();
8124 pg->osd->send_message_osd_cluster(
8125 pg->primary.osd,
8126 new MBackfillReserve(
8127 MBackfillReserve::TOOFULL,
8128 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8129 pg->get_osdmap_epoch()),
8130 pg->get_osdmap_epoch());
8131 return discard_event();
8132 }
8133
8134 boost::statechart::result
8135 PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
8136 {
8137 PG *pg = context< RecoveryMachine >().pg;
8138 pg->clear_reserved_num_bytes();
8139 pg->osd->send_message_osd_cluster(
8140 pg->primary.osd,
8141 new MBackfillReserve(
8142 MBackfillReserve::REVOKE,
8143 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8144 pg->get_osdmap_epoch()),
8145 pg->get_osdmap_epoch());
8146 return discard_event();
8147 }
8148
8149 void PG::RecoveryState::RepRecovering::exit()
8150 {
8151 context< RecoveryMachine >().log_exit(state_name, enter_time);
8152 PG *pg = context< RecoveryMachine >().pg;
8153 pg->clear_reserved_num_bytes();
8154 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8155 utime_t dur = ceph_clock_now() - enter_time;
8156 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
8157 }
8158
8159 /*------Activating--------*/
8160 PG::RecoveryState::Activating::Activating(my_context ctx)
8161 : my_base(ctx),
8162 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
8163 {
8164 context< RecoveryMachine >().log_enter(state_name);
8165 }
8166
8167 void PG::RecoveryState::Activating::exit()
8168 {
8169 context< RecoveryMachine >().log_exit(state_name, enter_time);
8170 PG *pg = context< RecoveryMachine >().pg;
8171 utime_t dur = ceph_clock_now() - enter_time;
8172 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
8173 }
8174
8175 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
8176 : my_base(ctx),
8177 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
8178 {
8179 context< RecoveryMachine >().log_enter(state_name);
8180 PG *pg = context< RecoveryMachine >().pg;
8181
8182 // Make sure all nodes that are part of the recovery aren't full
8183 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
8184 pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) {
8185 post_event(RecoveryTooFull());
8186 return;
8187 }
8188
8189 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8190 pg->state_set(PG_STATE_RECOVERY_WAIT);
8191 pg->osd->local_reserver.request_reservation(
8192 pg->info.pgid,
8193 new QueuePeeringEvt<LocalRecoveryReserved>(
8194 pg, pg->get_osdmap_epoch(),
8195 LocalRecoveryReserved()),
8196 pg->get_recovery_priority(),
8197 new QueuePeeringEvt<DeferRecovery>(
8198 pg, pg->get_osdmap_epoch(),
8199 DeferRecovery(0.0)));
8200 pg->publish_stats_to_osd();
8201 }
8202
8203 boost::statechart::result
8204 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
8205 {
8206 PG *pg = context< RecoveryMachine >().pg;
8207 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
8208 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
8209 return transit<NotRecovering>();
8210 }
8211
8212 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
8213 {
8214 context< RecoveryMachine >().log_exit(state_name, enter_time);
8215 PG *pg = context< RecoveryMachine >().pg;
8216 utime_t dur = ceph_clock_now() - enter_time;
8217 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
8218 }
8219
8220 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
8221 : my_base(ctx),
8222 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
8223 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
8224 {
8225 context< RecoveryMachine >().log_enter(state_name);
8226 post_event(RemoteRecoveryReserved());
8227 }
8228
8229 boost::statechart::result
8230 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
8231 PG *pg = context< RecoveryMachine >().pg;
8232
8233 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
8234 ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami);
8235 ConnectionRef con = pg->osd->get_con_osd_cluster(
8236 remote_recovery_reservation_it->osd, pg->get_osdmap_epoch());
8237 if (con) {
8238 pg->osd->send_message_osd_cluster(
8239 new MRecoveryReserve(
8240 MRecoveryReserve::REQUEST,
8241 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
8242 pg->get_osdmap_epoch(),
8243 pg->get_recovery_priority()),
8244 con.get());
8245 }
8246 ++remote_recovery_reservation_it;
8247 } else {
8248 post_event(AllRemotesReserved());
8249 }
8250 return discard_event();
8251 }
8252
8253 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
8254 {
8255 context< RecoveryMachine >().log_exit(state_name, enter_time);
8256 PG *pg = context< RecoveryMachine >().pg;
8257 utime_t dur = ceph_clock_now() - enter_time;
8258 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
8259 }
8260
8261 PG::RecoveryState::Recovering::Recovering(my_context ctx)
8262 : my_base(ctx),
8263 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
8264 {
8265 context< RecoveryMachine >().log_enter(state_name);
8266
8267 PG *pg = context< RecoveryMachine >().pg;
8268 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8269 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8270 pg->state_set(PG_STATE_RECOVERING);
8271 ceph_assert(!pg->state_test(PG_STATE_ACTIVATING));
8272 pg->publish_stats_to_osd();
8273 pg->queue_recovery();
8274 }
8275
8276 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
8277 {
8278 PG *pg = context< RecoveryMachine >().pg;
8279 ceph_assert(cancel || !pg->pg_log.get_missing().have_missing());
8280
8281 // release remote reservations
8282 for (set<pg_shard_t>::const_iterator i =
8283 context< Active >().remote_shards_to_reserve_recovery.begin();
8284 i != context< Active >().remote_shards_to_reserve_recovery.end();
8285 ++i) {
8286 if (*i == pg->pg_whoami) // skip myself
8287 continue;
8288 ConnectionRef con = pg->osd->get_con_osd_cluster(
8289 i->osd, pg->get_osdmap_epoch());
8290 if (con) {
8291 pg->osd->send_message_osd_cluster(
8292 new MRecoveryReserve(
8293 MRecoveryReserve::RELEASE,
8294 spg_t(pg->info.pgid.pgid, i->shard),
8295 pg->get_osdmap_epoch()),
8296 con.get());
8297 }
8298 }
8299 }
8300
8301 boost::statechart::result
8302 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
8303 {
8304 PG *pg = context< RecoveryMachine >().pg;
8305 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8306 release_reservations();
8307 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8308 return transit<Recovered>();
8309 }
8310
8311 boost::statechart::result
8312 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
8313 {
8314 PG *pg = context< RecoveryMachine >().pg;
8315 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8316 release_reservations();
8317 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8318 // XXX: Is this needed?
8319 pg->publish_stats_to_osd();
8320 return transit<WaitLocalBackfillReserved>();
8321 }
8322
8323 boost::statechart::result
8324 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
8325 {
8326 PG *pg = context< RecoveryMachine >().pg;
8327 if (!pg->state_test(PG_STATE_RECOVERING)) {
8328 // we may have finished recovery and have an AllReplicasRecovered
8329 // event queued to move us to the next state.
8330 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
8331 return discard_event();
8332 }
8333 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
8334 pg->state_set(PG_STATE_RECOVERY_WAIT);
8335 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8336 release_reservations(true);
8337 pg->schedule_recovery_retry(evt.delay);
8338 return transit<NotRecovering>();
8339 }
8340
8341 boost::statechart::result
8342 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
8343 {
8344 PG *pg = context< RecoveryMachine >().pg;
8345 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
8346 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
8347 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8348 release_reservations(true);
8349 return transit<NotRecovering>();
8350 }
8351
8352 void PG::RecoveryState::Recovering::exit()
8353 {
8354 context< RecoveryMachine >().log_exit(state_name, enter_time);
8355 PG *pg = context< RecoveryMachine >().pg;
8356 utime_t dur = ceph_clock_now() - enter_time;
8357 pg->state_clear(PG_STATE_RECOVERING);
8358 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
8359 }
8360
8361 PG::RecoveryState::Recovered::Recovered(my_context ctx)
8362 : my_base(ctx),
8363 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
8364 {
8365 pg_shard_t auth_log_shard;
8366
8367 context< RecoveryMachine >().log_enter(state_name);
8368
8369 PG *pg = context< RecoveryMachine >().pg;
8370
8371 ceph_assert(!pg->needs_recovery());
8372
8373 // if we finished backfill, all acting are active; recheck if
8374 // DEGRADED | UNDERSIZED is appropriate.
8375 ceph_assert(!pg->acting_recovery_backfill.empty());
8376 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
8377 pg->acting_recovery_backfill.size()) {
8378 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
8379 pg->publish_stats_to_osd();
8380 }
8381
8382 // adjust acting set? (e.g. because backfill completed...)
8383 bool history_les_bound = false;
8384 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
8385 true, &history_les_bound)) {
8386 ceph_assert(pg->want_acting.size());
8387 } else if (!pg->async_recovery_targets.empty()) {
8388 pg->choose_acting(auth_log_shard, true, &history_les_bound);
8389 }
8390
8391 if (context< Active >().all_replicas_activated &&
8392 pg->async_recovery_targets.empty())
8393 post_event(GoClean());
8394 }
8395
8396 void PG::RecoveryState::Recovered::exit()
8397 {
8398 context< RecoveryMachine >().log_exit(state_name, enter_time);
8399 PG *pg = context< RecoveryMachine >().pg;
8400 utime_t dur = ceph_clock_now() - enter_time;
8401 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
8402 }
8403
8404 PG::RecoveryState::Clean::Clean(my_context ctx)
8405 : my_base(ctx),
8406 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
8407 {
8408 context< RecoveryMachine >().log_enter(state_name);
8409
8410 PG *pg = context< RecoveryMachine >().pg;
8411
8412 if (pg->info.last_complete != pg->info.last_update) {
8413 ceph_abort();
8414 }
8415 Context *c = pg->finish_recovery();
8416 context< RecoveryMachine >().get_cur_transaction()->register_on_commit(c);
8417
8418 pg->try_mark_clean();
8419 }
8420
8421 void PG::RecoveryState::Clean::exit()
8422 {
8423 context< RecoveryMachine >().log_exit(state_name, enter_time);
8424 PG *pg = context< RecoveryMachine >().pg;
8425 pg->state_clear(PG_STATE_CLEAN);
8426 utime_t dur = ceph_clock_now() - enter_time;
8427 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
8428 }
8429
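// Helper used by the Active constructor below: collapse a set of pg_shard_t
// down to at most one shard per OSD, skipping 'skip' (ourselves), so that only
// one reservation request is sent per remote OSD.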
8430 template <typename T>
8431 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
8432 {
8433 set<int> osds_found;
8434 set<pg_shard_t> out;
8435 for (typename T::const_iterator i = in.begin();
8436 i != in.end();
8437 ++i) {
8438 if (*i != skip && !osds_found.count(i->osd)) {
8439 osds_found.insert(i->osd);
8440 out.insert(*i);
8441 }
8442 }
8443 return out;
8444 }
8445
8446 /*---------Active---------*/
8447 PG::RecoveryState::Active::Active(my_context ctx)
8448 : my_base(ctx),
8449 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
8450 remote_shards_to_reserve_recovery(
8451 unique_osd_shard_set(
8452 context< RecoveryMachine >().pg->pg_whoami,
8453 context< RecoveryMachine >().pg->acting_recovery_backfill)),
8454 remote_shards_to_reserve_backfill(
8455 unique_osd_shard_set(
8456 context< RecoveryMachine >().pg->pg_whoami,
8457 context< RecoveryMachine >().pg->backfill_targets)),
8458 all_replicas_activated(false)
8459 {
8460 context< RecoveryMachine >().log_enter(state_name);
8461
8462 PG *pg = context< RecoveryMachine >().pg;
8463
8464 ceph_assert(!pg->backfill_reserving);
8465 ceph_assert(!pg->backfill_reserved);
8466 ceph_assert(pg->is_primary());
8467 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
8468 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8469 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8470 pg->get_osdmap_epoch(),
8471 *context< RecoveryMachine >().get_query_map(),
8472 context< RecoveryMachine >().get_info_map(),
8473 context< RecoveryMachine >().get_recovery_ctx());
8474
8475 // everyone has to commit/ack before we are truly active
8476 pg->blocked_by.clear();
8477 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
8478 p != pg->acting_recovery_backfill.end();
8479 ++p) {
8480 if (p->shard != pg->pg_whoami.shard) {
8481 pg->blocked_by.insert(p->shard);
8482 }
8483 }
8484 pg->publish_stats_to_osd();
8485 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8486 }
8487
8488 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
8489 {
8490 PG *pg = context< RecoveryMachine >().pg;
8491 if (pg->should_restart_peering(
8492 advmap.up_primary,
8493 advmap.acting_primary,
8494 advmap.newup,
8495 advmap.newacting,
8496 advmap.lastmap,
8497 advmap.osdmap)) {
8498 ldout(pg->cct, 10) << "Active advmap interval change, fast return" << dendl;
8499 return forward_event();
8500 }
8501 ldout(pg->cct, 10) << "Active advmap" << dendl;
8502 bool need_publish = false;
8503
8504 if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) {
8505 const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps();
8506 auto i = new_removed_snaps.find(pg->info.pgid.pool());
8507 if (i != new_removed_snaps.end()) {
8508 bool bad = false;
8509 for (auto j : i->second) {
8510 if (pg->snap_trimq.intersects(j.first, j.second)) {
8511 decltype(pg->snap_trimq) added, overlap;
8512 added.insert(j.first, j.second);
8513 overlap.intersection_of(pg->snap_trimq, added);
8514 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8515 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8516 << overlap << ", but this is the first mimic+ osdmap,"
8517 << " so it's expected" << dendl;
8518 } else {
8519 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8520 << overlap << dendl;
8521 bad = true;
8522 }
8523 pg->snap_trimq.union_of(added);
8524 } else {
8525 pg->snap_trimq.insert(j.first, j.second);
8526 }
8527 }
8528 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8529 // at upgrade, we report *all* previously removed snaps as removed in
8530 // the first mimic epoch. remove the ones we previously divined were
8531 // removed (and subsequently purged) from the trimq.
8532 lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps"
8533 << " from new removed_snaps" << dendl;
8534 pg->snap_trimq.subtract(pg->info.purged_snaps);
8535 }
8536 ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second
8537 << ", snap_trimq now " << pg->snap_trimq << dendl;
8538 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8539 pg->dirty_info = true;
8540 pg->dirty_big_info = true;
8541 }
8542
8543 const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps();
8544 auto j = new_purged_snaps.find(pg->info.pgid.pool());
8545 if (j != new_purged_snaps.end()) {
8546 bool bad = false;
8547 for (auto k : j->second) {
8548 if (!pg->info.purged_snaps.contains(k.first, k.second)) {
8549 decltype(pg->info.purged_snaps) rm, overlap;
8550 rm.insert(k.first, k.second);
8551 overlap.intersection_of(pg->info.purged_snaps, rm);
8552 lderr(pg->cct) << __func__ << " purged_snaps does not contain "
8553 << rm << ", only " << overlap << dendl;
8554 pg->info.purged_snaps.subtract(overlap);
8555 // This can currently happen in the normal (if unlikely) course of
8556 // events. Because adding snaps to purged_snaps does not increase
8557 // the pg version or add a pg log entry, we don't reliably propagate
8558 // purged_snaps additions to other OSDs.
8559 // One example:
8560 // - purge S
8561 // - primary and replicas update purged_snaps
8562 // - no object updates
8563 // - pg mapping changes, new primary on different node
8564 // - new primary pg version == eversion_t(), so info is not
8565 // propagated.
8566 //bad = true;
8567 } else {
8568 pg->info.purged_snaps.erase(k.first, k.second);
8569 }
8570 }
8571 ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second
8572 << ", now " << pg->info.purged_snaps << dendl;
8573 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8574 pg->dirty_info = true;
8575 pg->dirty_big_info = true;
8576 }
8577 if (pg->dirty_big_info) {
8578 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
8579 // purged snaps and (b) perhaps share more snaps that we have purged
8580 // but didn't fit in pg_stat_t.
8581 need_publish = true;
8582 pg->share_pg_info();
8583 }
8584 } else if (!pg->pool.newly_removed_snaps.empty()) {
8585 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
8586 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
8587 pg->dirty_info = true;
8588 pg->dirty_big_info = true;
8589 }
8590
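// sanity check: if an osd we want in the acting set has since gone down,
// it must still be in the current up or acting set -- an up/acting change
// itself would have been caught by should_restart_peering() above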
8591 for (size_t i = 0; i < pg->want_acting.size(); i++) {
8592 int osd = pg->want_acting[i];
8593 if (!advmap.osdmap->is_up(osd)) {
8594 pg_shard_t osd_with_shard(osd, shard_id_t(i));
8595 ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
8596 }
8597 }
8598
8599 /* Check for changes in pool size (if the acting set changed as a result,
8600 * this does not matter) */
8601 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
8602 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
8603 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
8604 pg->state_clear(PG_STATE_UNDERSIZED);
8605 } else {
8606 pg->state_set(PG_STATE_UNDERSIZED);
8607 }
8608 // degraded changes will be detected by call from publish_stats_to_osd()
8609 need_publish = true;
8610 }
8611
8612 // if we haven't reported our PG stats in a long time, do so now.
8613 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
8614 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
8615 << " epochs" << dendl;
8616 need_publish = true;
8617 }
8618
8619 if (need_publish)
8620 pg->publish_stats_to_osd();
8621
8622 return forward_event();
8623 }
8624
8625 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
8626 {
8627 PG *pg = context< RecoveryMachine >().pg;
8628 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
8629 ceph_assert(pg->is_primary());
8630
8631 if (pg->have_unfound()) {
8632 // object may have become unfound
8633 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8634 }
8635
8636 if (pg->cct->_conf->osd_check_for_log_corruption)
8637 pg->check_log_for_corruption(pg->osd->store);
8638
8639 uint64_t unfound = pg->missing_loc.num_unfound();
8640 if (unfound > 0 &&
8641 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
8642 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
8643 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
8644 << " objects unfound and apparently lost, would automatically "
8645 << "mark these objects lost but this feature is not yet implemented "
8646 << "(osd_auto_mark_unfound_lost)";
8647 } else
8648 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
8649 << unfound << " objects unfound and apparently lost";
8650 }
8651
8652 if (pg->is_active()) {
8653 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
8654 pg->kick_snap_trim();
8655 }
8656
8657 if (pg->is_peered() &&
8658 !pg->is_clean() &&
8659 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
8660 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
8661 pg->queue_recovery();
8662 }
8663 return forward_event();
8664 }
8665
8666 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
8667 {
8668 PG *pg = context< RecoveryMachine >().pg;
8669 ceph_assert(pg->is_primary());
8670 if (pg->peer_info.count(notevt.from)) {
8671 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8672 << ", already have info from that osd, ignoring"
8673 << dendl;
8674 } else if (pg->peer_purged.count(notevt.from)) {
8675 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8676 << ", already purged that peer, ignoring"
8677 << dendl;
8678 } else {
8679 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8680 << ", calling proc_replica_info and discover_all_missing"
8681 << dendl;
8682 pg->proc_replica_info(
8683 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
8684 if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) {
8685 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8686 }
8687 }
8688 return discard_event();
8689 }
8690
8691 boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim)
8692 {
8693 PG *pg = context< RecoveryMachine >().pg;
8694 ceph_assert(pg->is_primary());
8695
8696 // peer is informing us of their last_complete_ondisk
8697 ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
8698 pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to;
8699
8700 // trim log when the pg is recovered
8701 pg->calc_min_last_complete_ondisk();
8702 return discard_event();
8703 }
8704
8705 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
8706 {
8707 PG *pg = context< RecoveryMachine >().pg;
8708 ceph_assert(pg->is_primary());
8709
8710 ceph_assert(!pg->acting_recovery_backfill.empty());
8711 // don't update history (yet) if we are active and primary; the replica
8712 // may be telling us they have activated (and committed) but we can't
8713 // share that until _everyone_ does the same.
8714 if (pg->is_acting_recovery_backfill(infoevt.from) &&
8715 pg->peer_activated.count(infoevt.from) == 0) {
8716 ldout(pg->cct, 10) << " peer osd." << infoevt.from
8717 << " activated and committed" << dendl;
8718 pg->peer_activated.insert(infoevt.from);
8719 pg->blocked_by.erase(infoevt.from.shard);
8720 pg->publish_stats_to_osd();
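// once every shard in acting_recovery_backfill has reported activation,
// the PG as a whole is activated and committed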
8721 if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) {
8722 pg->all_activated_and_committed();
8723 }
8724 }
8725 return discard_event();
8726 }
8727
8728 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
8729 {
8730 PG *pg = context< RecoveryMachine >().pg;
8731 ldout(pg->cct, 10) << "searching osd." << logevt.from
8732 << " log for unfound items" << dendl;
8733 pg->proc_replica_log(
8734 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8735 bool got_missing = pg->search_for_missing(
8736 pg->peer_info[logevt.from],
8737 pg->peer_missing[logevt.from],
8738 logevt.from,
8739 context< RecoveryMachine >().get_recovery_ctx());
8740 // If there are missing AND we are "fully" active then start recovery now
8741 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
8742 post_event(DoRecovery());
8743 }
8744 return discard_event();
8745 }
8746
8747 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
8748 {
8749 PG *pg = context< RecoveryMachine >().pg;
8750
8751 q.f->open_object_section("state");
8752 q.f->dump_string("name", state_name);
8753 q.f->dump_stream("enter_time") << enter_time;
8754
8755 {
8756 q.f->open_array_section("might_have_unfound");
8757 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
8758 p != pg->might_have_unfound.end();
8759 ++p) {
8760 q.f->open_object_section("osd");
8761 q.f->dump_stream("osd") << *p;
8762 if (pg->peer_missing.count(*p)) {
8763 q.f->dump_string("status", "already probed");
8764 } else if (pg->peer_missing_requested.count(*p)) {
8765 q.f->dump_string("status", "querying");
8766 } else if (!pg->get_osdmap()->is_up(p->osd)) {
8767 q.f->dump_string("status", "osd is down");
8768 } else {
8769 q.f->dump_string("status", "not queried");
8770 }
8771 q.f->close_section();
8772 }
8773 q.f->close_section();
8774 }
8775 {
8776 q.f->open_object_section("recovery_progress");
8777 pg->dump_recovery_info(q.f);
8778 q.f->close_section();
8779 }
8780
8781 {
8782 q.f->open_object_section("scrub");
8783 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
8784 q.f->dump_bool("scrubber.active", pg->scrubber.active);
8785 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
8786 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
8787 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
8788 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
8789 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
8790 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
8791 {
8792 q.f->open_array_section("scrubber.waiting_on_whom");
8793 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
8794 p != pg->scrubber.waiting_on_whom.end();
8795 ++p) {
8796 q.f->dump_stream("shard") << *p;
8797 }
8798 q.f->close_section();
8799 }
8800 q.f->close_section();
8801 }
8802
8803 q.f->close_section();
8804 return forward_event();
8805 }
8806
8807 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
8808 {
8809 PG *pg = context< RecoveryMachine >().pg;
8810 pg_t pgid = pg->info.pgid.pgid;
8811
8812 all_replicas_activated = true;
8813
8814 pg->state_clear(PG_STATE_ACTIVATING);
8815 pg->state_clear(PG_STATE_CREATING);
8816 pg->state_clear(PG_STATE_PREMERGE);
8817
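// a pending merge keeps the PG in PEERED+PREMERGE; if the acting set is not
// at the target pg size we also flag this merge source/target as not ready.
// short of min_size we likewise stay PEERED; otherwise the PG goes ACTIVE.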
8818 bool merge_target;
8819 if (pg->pool.info.is_pending_merge(pgid, &merge_target)) {
8820 pg->state_set(PG_STATE_PEERED);
8821 pg->state_set(PG_STATE_PREMERGE);
8822
8823 if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) {
8824 if (merge_target) {
8825 pg_t src = pgid;
8826 src.set_ps(pg->pool.info.get_pg_num_pending());
8827 ceph_assert(src.get_parent() == pgid);
8828 pg->osd->set_not_ready_to_merge_target(pgid, src);
8829 } else {
8830 pg->osd->set_not_ready_to_merge_source(pgid);
8831 }
8832 }
8833 } else if (pg->acting.size() < pg->pool.info.min_size) {
8834 pg->state_set(PG_STATE_PEERED);
8835 } else {
8836 pg->state_set(PG_STATE_ACTIVE);
8837 }
8838
8839 if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
8840 pg->osd->send_pg_created(pgid);
8841 }
8842
8843 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
8844 pg->info.history.last_interval_started = pg->info.last_interval_started;
8845 pg->dirty_info = true;
8846
8847 pg->share_pg_info();
8848 pg->publish_stats_to_osd();
8849
8850 pg->check_local();
8851
8852 // waiters
8853 if (pg->flushes_in_progress == 0) {
8854 pg->requeue_ops(pg->waiting_for_peered);
8855 } else if (!pg->waiting_for_peered.empty()) {
8856 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
8857 << pg->waiting_for_peered.size()
8858 << " items to waiting_for_flush"
8859 << dendl;
8860 ceph_assert(pg->waiting_for_flush.empty());
8861 pg->waiting_for_flush.swap(pg->waiting_for_peered);
8862 }
8863
8864 pg->on_activate();
8865
8866 return discard_event();
8867 }
8868
8869 void PG::RecoveryState::Active::exit()
8870 {
8871 context< RecoveryMachine >().log_exit(state_name, enter_time);
8872 PG *pg = context< RecoveryMachine >().pg;
8873 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8874
8875 pg->blocked_by.clear();
8876 pg->backfill_reserved = false;
8877 pg->backfill_reserving = false;
8878 pg->state_clear(PG_STATE_ACTIVATING);
8879 pg->state_clear(PG_STATE_DEGRADED);
8880 pg->state_clear(PG_STATE_UNDERSIZED);
8881 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
8882 pg->state_clear(PG_STATE_BACKFILL_WAIT);
8883 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8884 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8885 utime_t dur = ceph_clock_now() - enter_time;
8886 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
8887 pg->agent_stop();
8888 }
8889
8890 /*------ReplicaActive-----*/
8891 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
8892 : my_base(ctx),
8893 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
8894 {
8895 context< RecoveryMachine >().log_enter(state_name);
8896
8897 PG *pg = context< RecoveryMachine >().pg;
8898 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8899 }
8900
8901
8902 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8903 const Activate& actevt) {
8904 PG *pg = context< RecoveryMachine >().pg;
8905 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
8906 map<int, map<spg_t, pg_query_t> > query_map;
8907 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8908 actevt.activation_epoch,
8909 query_map, NULL, NULL);
8910 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8911 return discard_event();
8912 }
8913
8914 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
8915 {
8916 PG *pg = context< RecoveryMachine >().pg;
8917 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
8918 infoevt.info);
8919 return discard_event();
8920 }
8921
8922 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
8923 {
8924 PG *pg = context< RecoveryMachine >().pg;
8925 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
8926 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8927 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
8928 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8929
8930 return discard_event();
8931 }
8932
8933 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim)
8934 {
8935 PG *pg = context< RecoveryMachine >().pg;
8936 // primary is instructing us to trim
8937 pg->pg_log.trim(trim.trim_to, pg->info);
8938 pg->dirty_info = true;
8939 return discard_event();
8940 }
8941
8942 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
8943 {
8944 PG *pg = context< RecoveryMachine >().pg;
8945 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8946 context< RecoveryMachine >().send_notify(
8947 pg->get_primary(),
8948 pg_notify_t(
8949 pg->get_primary().shard, pg->pg_whoami.shard,
8950 pg->get_osdmap_epoch(),
8951 pg->get_osdmap_epoch(),
8952 pg->info),
8953 pg->past_intervals);
8954 }
8955 pg->take_waiters();
8956 return discard_event();
8957 }
8958
8959 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8960 const MQuery& query)
8961 {
8962 PG *pg = context< RecoveryMachine >().pg;
8963 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8964 return discard_event();
8965 }
8966
8967 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
8968 {
8969 q.f->open_object_section("state");
8970 q.f->dump_string("name", state_name);
8971 q.f->dump_stream("enter_time") << enter_time;
8972 q.f->close_section();
8973 return forward_event();
8974 }
8975
8976 void PG::RecoveryState::ReplicaActive::exit()
8977 {
8978 context< RecoveryMachine >().log_exit(state_name, enter_time);
8979 PG *pg = context< RecoveryMachine >().pg;
8980 pg->clear_reserved_num_bytes();
8981 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8982 utime_t dur = ceph_clock_now() - enter_time;
8983 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
8984 }
8985
8986 /*-------Stray---*/
8987 PG::RecoveryState::Stray::Stray(my_context ctx)
8988 : my_base(ctx),
8989 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
8990 {
8991 context< RecoveryMachine >().log_enter(state_name);
8992
8993 PG *pg = context< RecoveryMachine >().pg;
8994 ceph_assert(!pg->is_peered());
8995 ceph_assert(!pg->is_peering());
8996 ceph_assert(!pg->is_primary());
8997
8998 if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
8999 ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
9000 post_event(DeleteStart());
9001 } else {
9002 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9003 }
9004 }
9005
9006 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
9007 {
9008 PG *pg = context< RecoveryMachine >().pg;
9009 MOSDPGLog *msg = logevt.msg.get();
9010 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
9011
9012 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9013 if (msg->info.last_backfill == hobject_t()) {
9014 // restart backfill
9015 pg->info = msg->info;
9016 pg->on_info_history_change();
9017 pg->dirty_info = true;
9018 pg->dirty_big_info = true; // maybe.
9019
9020 PGLogEntryHandler rollbacker{pg, t};
9021 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
9022
9023 pg->pg_log.reset_backfill();
9024 } else {
9025 pg->merge_log(*t, msg->info, msg->log, logevt.from);
9026 }
9027
9028 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9029
9030 post_event(Activate(logevt.msg->info.last_epoch_started));
9031 return transit<ReplicaActive>();
9032 }
9033
9034 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
9035 {
9036 PG *pg = context< RecoveryMachine >().pg;
9037 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
9038
9039 if (pg->info.last_update > infoevt.info.last_update) {
9040 // rewind divergent log entries
9041 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9042 pg->rewind_divergent_log(*t, infoevt.info.last_update);
9043 pg->info.stats = infoevt.info.stats;
9044 pg->info.hit_set = infoevt.info.hit_set;
9045 }
9046
9047 ceph_assert(infoevt.info.last_update == pg->info.last_update);
9048 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9049
9050 post_event(Activate(infoevt.info.last_epoch_started));
9051 return transit<ReplicaActive>();
9052 }
9053
9054 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
9055 {
9056 PG *pg = context< RecoveryMachine >().pg;
9057 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
9058 return discard_event();
9059 }
9060
9061 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
9062 {
9063 PG *pg = context< RecoveryMachine >().pg;
9064 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
9065 context< RecoveryMachine >().send_notify(
9066 pg->get_primary(),
9067 pg_notify_t(
9068 pg->get_primary().shard, pg->pg_whoami.shard,
9069 pg->get_osdmap_epoch(),
9070 pg->get_osdmap_epoch(),
9071 pg->info),
9072 pg->past_intervals);
9073 }
9074 pg->take_waiters();
9075 return discard_event();
9076 }
9077
9078 void PG::RecoveryState::Stray::exit()
9079 {
9080 context< RecoveryMachine >().log_exit(state_name, enter_time);
9081 PG *pg = context< RecoveryMachine >().pg;
9082 utime_t dur = ceph_clock_now() - enter_time;
9083 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
9084 }
9085
9086
9087 /*--------ToDelete----------*/
9088 PG::RecoveryState::ToDelete::ToDelete(my_context ctx)
9089 : my_base(ctx),
9090 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete")
9091 {
9092 context< RecoveryMachine >().log_enter(state_name);
9093 PG *pg = context< RecoveryMachine >().pg;
9094 pg->osd->logger->inc(l_osd_pg_removing);
9095 }
9096
9097 void PG::RecoveryState::ToDelete::exit()
9098 {
9099 context< RecoveryMachine >().log_exit(state_name, enter_time);
9100 PG *pg = context< RecoveryMachine >().pg;
9101 // note: on a successful removal, this path doesn't execute. see
9102 // _delete_some().
9103 pg->osd->logger->dec(l_osd_pg_removing);
9104 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9105 }
9106
9107 /*----WaitDeleteReserved----*/
9108 PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
9109 : my_base(ctx),
9110 NamedState(context< RecoveryMachine >().pg,
9111 "Started/ToDelete/WaitDeleteReseved")
9112 {
9113 context< RecoveryMachine >().log_enter(state_name);
9114 PG *pg = context< RecoveryMachine >().pg;
9115 context<ToDelete>().priority = pg->get_delete_priority();
9116 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
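// request a local reservation for the delete work: DeleteReserved is queued
// when the slot is granted, DeleteInterrupted if the reservation is later
// preempted by higher-priority work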
9117 pg->osd->local_reserver.request_reservation(
9118 pg->info.pgid,
9119 new QueuePeeringEvt<DeleteReserved>(
9120 pg, pg->get_osdmap_epoch(),
9121 DeleteReserved()),
9122 context<ToDelete>().priority,
9123 new QueuePeeringEvt<DeleteInterrupted>(
9124 pg, pg->get_osdmap_epoch(),
9125 DeleteInterrupted()));
9126 }
9127
9128 boost::statechart::result PG::RecoveryState::ToDelete::react(
9129 const ActMap& evt)
9130 {
9131 PG *pg = context< RecoveryMachine >().pg;
9132 if (pg->get_delete_priority() != priority) {
9133 ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
9134 << dendl;
9135 return transit<ToDelete>();
9136 }
9137 return discard_event();
9138 }
9139
9140 void PG::RecoveryState::WaitDeleteReserved::exit()
9141 {
9142 context< RecoveryMachine >().log_exit(state_name, enter_time);
9143 }
9144
9145 /*----Deleting-----*/
9146 PG::RecoveryState::Deleting::Deleting(my_context ctx)
9147 : my_base(ctx),
9148 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting")
9149 {
9150 context< RecoveryMachine >().log_enter(state_name);
9151 PG *pg = context< RecoveryMachine >().pg;
9152 pg->deleting = true;
9153 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9154 pg->on_removal(t);
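// when this transaction commits, C_DeleteMore requeues the PG so that
// objects are removed in batches via DeleteSome events (see _delete_some())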
9155 t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch()));
9156 }
9157
9158 boost::statechart::result PG::RecoveryState::Deleting::react(
9159 const DeleteSome& evt)
9160 {
9161 PG *pg = context< RecoveryMachine >().pg;
9162 pg->_delete_some(context<RecoveryMachine>().get_cur_transaction());
9163 return discard_event();
9164 }
9165
9166 void PG::RecoveryState::Deleting::exit()
9167 {
9168 context< RecoveryMachine >().log_exit(state_name, enter_time);
9169 PG *pg = context< RecoveryMachine >().pg;
9170 pg->deleting = false;
9171 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9172 }
9173
9174 /*--------GetInfo---------*/
9175 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
9176 : my_base(ctx),
9177 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
9178 {
9179 context< RecoveryMachine >().log_enter(state_name);
9180
9181 PG *pg = context< RecoveryMachine >().pg;
9182 pg->check_past_interval_bounds();
9183 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9184
9185 ceph_assert(pg->blocked_by.empty());
9186
9187 prior_set = pg->build_prior();
9188
9189 pg->reset_min_peer_features();
9190 get_infos();
9191 if (prior_set.pg_down) {
9192 post_event(IsDown());
9193 } else if (peer_info_requested.empty()) {
9194 post_event(GotInfo());
9195 }
9196 }
9197
9198 void PG::RecoveryState::GetInfo::get_infos()
9199 {
9200 PG *pg = context< RecoveryMachine >().pg;
9201 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9202
9203 pg->blocked_by.clear();
9204 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
9205 it != prior_set.probe.end();
9206 ++it) {
9207 pg_shard_t peer = *it;
9208 if (peer == pg->pg_whoami) {
9209 continue;
9210 }
9211 if (pg->peer_info.count(peer)) {
9212 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
9213 continue;
9214 }
9215 if (peer_info_requested.count(peer)) {
9216 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
9217 pg->blocked_by.insert(peer.osd);
9218 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
9219 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
9220 } else {
9221 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
9222 context< RecoveryMachine >().send_query(
9223 peer, pg_query_t(pg_query_t::INFO,
9224 it->shard, pg->pg_whoami.shard,
9225 pg->info.history,
9226 pg->get_osdmap_epoch()));
9227 peer_info_requested.insert(peer);
9228 pg->blocked_by.insert(peer.osd);
9229 }
9230 }
9231
9232 pg->publish_stats_to_osd();
9233 }
9234
9235 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
9236 {
9237 PG *pg = context< RecoveryMachine >().pg;
9238
9239 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
9240 if (p != peer_info_requested.end()) {
9241 peer_info_requested.erase(p);
9242 pg->blocked_by.erase(infoevt.from.osd);
9243 }
9244
9245 epoch_t old_start = pg->info.history.last_epoch_started;
9246 if (pg->proc_replica_info(
9247 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
9248 // we got something new ...
9249 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9250 if (old_start < pg->info.history.last_epoch_started) {
9251 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
9252 prior_set = pg->build_prior();
9253
9254 // filter out any osds that got dropped from the probe set from
9255 // peer_info_requested. this is less expensive than restarting
9256 // peering (which would re-probe everyone).
9257 set<pg_shard_t>::iterator p = peer_info_requested.begin();
9258 while (p != peer_info_requested.end()) {
9259 if (prior_set.probe.count(*p) == 0) {
9260 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
9261 peer_info_requested.erase(p++);
9262 } else {
9263 ++p;
9264 }
9265 }
9266 get_infos();
9267 }
9268 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
9269 << hex << infoevt.features << dec << dendl;
9270 pg->apply_peer_features(infoevt.features);
9271
9272 // are we done getting everything?
9273 if (peer_info_requested.empty() && !prior_set.pg_down) {
9274 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
9275 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
9276 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
9277 post_event(GotInfo());
9278 }
9279 }
9280 return discard_event();
9281 }
9282
9283 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
9284 {
9285 PG *pg = context< RecoveryMachine >().pg;
9286 q.f->open_object_section("state");
9287 q.f->dump_string("name", state_name);
9288 q.f->dump_stream("enter_time") << enter_time;
9289
9290 q.f->open_array_section("requested_info_from");
9291 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
9292 p != peer_info_requested.end();
9293 ++p) {
9294 q.f->open_object_section("osd");
9295 q.f->dump_stream("osd") << *p;
9296 if (pg->peer_info.count(*p)) {
9297 q.f->open_object_section("got_info");
9298 pg->peer_info[*p].dump(q.f);
9299 q.f->close_section();
9300 }
9301 q.f->close_section();
9302 }
9303 q.f->close_section();
9304
9305 q.f->close_section();
9306 return forward_event();
9307 }
9308
9309 void PG::RecoveryState::GetInfo::exit()
9310 {
9311 context< RecoveryMachine >().log_exit(state_name, enter_time);
9312 PG *pg = context< RecoveryMachine >().pg;
9313 utime_t dur = ceph_clock_now() - enter_time;
9314 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
9315 pg->blocked_by.clear();
9316 }
9317
9318 /*------GetLog------------*/
9319 PG::RecoveryState::GetLog::GetLog(my_context ctx)
9320 : my_base(ctx),
9321 NamedState(
9322 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
9323 msg(0)
9324 {
9325 context< RecoveryMachine >().log_enter(state_name);
9326
9327 PG *pg = context< RecoveryMachine >().pg;
9328
9329 // adjust acting?
9330 if (!pg->choose_acting(auth_log_shard, false,
9331 &context< Peering >().history_les_bound)) {
9332 if (!pg->want_acting.empty()) {
9333 post_event(NeedActingChange());
9334 } else {
9335 post_event(IsIncomplete());
9336 }
9337 return;
9338 }
9339
9340 // am i the best?
9341 if (auth_log_shard == pg->pg_whoami) {
9342 post_event(GotLog());
9343 return;
9344 }
9345
9346 const pg_info_t& best = pg->peer_info[auth_log_shard];
9347
9348 // am i broken?
9349 if (pg->info.last_update < best.log_tail) {
9350 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
9351 post_event(IsIncomplete());
9352 return;
9353 }
9354
9355 // how much log to request?
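// start from our own last_update and pull request_log_from back to the
// oldest peer last_update that falls below our log tail but is still
// covered by the auth shard's log, so the fetched log reaches back far
// enough to bring those peers up to date later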
9356 eversion_t request_log_from = pg->info.last_update;
9357 ceph_assert(!pg->acting_recovery_backfill.empty());
9358 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
9359 p != pg->acting_recovery_backfill.end();
9360 ++p) {
9361 if (*p == pg->pg_whoami) continue;
9362 pg_info_t& ri = pg->peer_info[*p];
9363 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
9364 ri.last_update < request_log_from)
9365 request_log_from = ri.last_update;
9366 }
9367
9368 // how much?
9369 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
9370 context<RecoveryMachine>().send_query(
9371 auth_log_shard,
9372 pg_query_t(
9373 pg_query_t::LOG,
9374 auth_log_shard.shard, pg->pg_whoami.shard,
9375 request_log_from, pg->info.history,
9376 pg->get_osdmap_epoch()));
9377
9378 ceph_assert(pg->blocked_by.empty());
9379 pg->blocked_by.insert(auth_log_shard.osd);
9380 pg->publish_stats_to_osd();
9381 }
9382
9383 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
9384 {
9385 PG *pg = context< RecoveryMachine >().pg;
9386 // make sure our log source didn't go down. we need to check
9387 // explicitly because it may not be part of the prior set, which
9388 // means the Peering state check won't catch it going down.
9389 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
9390 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
9391 << auth_log_shard.osd << " went down" << dendl;
9392 post_event(advmap);
9393 return transit< Reset >();
9394 }
9395
9396 // let the Peering state do its checks.
9397 return forward_event();
9398 }
9399
9400 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
9401 {
9402 PG *pg = context< RecoveryMachine >().pg;
9403 ceph_assert(!msg);
9404 if (logevt.from != auth_log_shard) {
9405 ldout(pg->cct, 10) << "GetLog: discarding log from "
9406 << "non-auth_log_shard osd." << logevt.from << dendl;
9407 return discard_event();
9408 }
9409 ldout(pg->cct, 10) << "GetLog: received master log from osd."
9410 << logevt.from << dendl;
9411 msg = logevt.msg;
9412 post_event(GotLog());
9413 return discard_event();
9414 }
9415
9416 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
9417 {
9418 PG *pg = context< RecoveryMachine >().pg;
9419 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
9420 if (msg) {
9421 ldout(pg->cct, 10) << "processing master log" << dendl;
9422 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
9423 msg->info, msg->log, msg->missing,
9424 auth_log_shard);
9425 }
9426 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9427 return transit< GetMissing >();
9428 }
9429
9430 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
9431 {
9432 q.f->open_object_section("state");
9433 q.f->dump_string("name", state_name);
9434 q.f->dump_stream("enter_time") << enter_time;
9435 q.f->dump_stream("auth_log_shard") << auth_log_shard;
9436 q.f->close_section();
9437 return forward_event();
9438 }
9439
9440 void PG::RecoveryState::GetLog::exit()
9441 {
9442 context< RecoveryMachine >().log_exit(state_name, enter_time);
9443 PG *pg = context< RecoveryMachine >().pg;
9444 utime_t dur = ceph_clock_now() - enter_time;
9445 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
9446 pg->blocked_by.clear();
9447 }
9448
9449 /*------WaitActingChange--------*/
9450 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
9451 : my_base(ctx),
9452 NamedState(context< RecoveryMachine >().pg, "Started/Primary/WaitActingChange")
9453 {
9454 context< RecoveryMachine >().log_enter(state_name);
9455 }
9456
9457 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
9458 {
9459 PG *pg = context< RecoveryMachine >().pg;
9460 OSDMapRef osdmap = advmap.osdmap;
9461
9462 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets went down" << dendl;
9463 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
9464 if (!osdmap->is_up(*p)) {
9465 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
9466 post_event(advmap);
9467 return transit< Reset >();
9468 }
9469 }
9470 return forward_event();
9471 }
9472
9473 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
9474 {
9475 PG *pg = context< RecoveryMachine >().pg;
9476 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
9477 return discard_event();
9478 }
9479
9480 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
9481 {
9482 PG *pg = context< RecoveryMachine >().pg;
9483 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
9484 return discard_event();
9485 }
9486
9487 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
9488 {
9489 PG *pg = context< RecoveryMachine >().pg;
9490 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
9491 return discard_event();
9492 }
9493
9494 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
9495 {
9496 q.f->open_object_section("state");
9497 q.f->dump_string("name", state_name);
9498 q.f->dump_stream("enter_time") << enter_time;
9499 q.f->dump_string("comment", "waiting for pg acting set to change");
9500 q.f->close_section();
9501 return forward_event();
9502 }
9503
9504 void PG::RecoveryState::WaitActingChange::exit()
9505 {
9506 context< RecoveryMachine >().log_exit(state_name, enter_time);
9507 PG *pg = context< RecoveryMachine >().pg;
9508 utime_t dur = ceph_clock_now() - enter_time;
9509 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
9510 }
9511
9512 /*------Down--------*/
9513 PG::RecoveryState::Down::Down(my_context ctx)
9514 : my_base(ctx),
9515 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
9516 {
9517 context< RecoveryMachine >().log_enter(state_name);
9518 PG *pg = context< RecoveryMachine >().pg;
9519
9520 pg->state_clear(PG_STATE_PEERING);
9521 pg->state_set(PG_STATE_DOWN);
9522
9523 auto &prior_set = context< Peering >().prior_set;
9524 ceph_assert(pg->blocked_by.empty());
9525 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9526 pg->publish_stats_to_osd();
9527 }
9528
9529 void PG::RecoveryState::Down::exit()
9530 {
9531 context< RecoveryMachine >().log_exit(state_name, enter_time);
9532 PG *pg = context< RecoveryMachine >().pg;
9533
9534 pg->state_clear(PG_STATE_DOWN);
9535 utime_t dur = ceph_clock_now() - enter_time;
9536 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
9537
9538 pg->blocked_by.clear();
9539 }
9540
9541 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
9542 {
9543 q.f->open_object_section("state");
9544 q.f->dump_string("name", state_name);
9545 q.f->dump_stream("enter_time") << enter_time;
9546 q.f->dump_string("comment",
9547 "not enough up instances of this PG to go active");
9548 q.f->close_section();
9549 return forward_event();
9550 }
9551
9552 boost::statechart::result PG::RecoveryState::Down::react(const MNotifyRec& infoevt)
9553 {
9554 PG *pg = context< RecoveryMachine >().pg;
9555
9556 ceph_assert(pg->is_primary());
9557 epoch_t old_start = pg->info.history.last_epoch_started;
9558 if (!pg->peer_info.count(infoevt.from) &&
9559 pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
9560 pg->update_history(infoevt.notify.info.history);
9561 }
9562 // if we got something new to make pg escape down state
9563 if (pg->info.history.last_epoch_started > old_start) {
9564 ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
9565 pg->state_clear(PG_STATE_DOWN);
9566 pg->state_set(PG_STATE_PEERING);
9567 return transit< GetInfo >();
9568 }
9569
9570 return discard_event();
9571 }
9572
9573
9574 /*------Incomplete--------*/
9575 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
9576 : my_base(ctx),
9577 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
9578 {
9579 context< RecoveryMachine >().log_enter(state_name);
9580 PG *pg = context< RecoveryMachine >().pg;
9581
9582 pg->state_clear(PG_STATE_PEERING);
9583 pg->state_set(PG_STATE_INCOMPLETE);
9584
9585 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9586 ceph_assert(pg->blocked_by.empty());
9587 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9588 pg->publish_stats_to_osd();
9589 }
9590
9591 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
9592 PG *pg = context< RecoveryMachine >().pg;
9593 int64_t poolnum = pg->info.pgid.pool();
9594
9595 // Reset if min_size turned smaller than the previous value; the pg might now be able to go active
9596 if (!advmap.osdmap->have_pg_pool(poolnum) ||
9597 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
9598 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
9599 post_event(advmap);
9600 return transit< Reset >();
9601 }
9602
9603 return forward_event();
9604 }
9605
9606 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
9607 PG *pg = context< RecoveryMachine >().pg;
9608 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
9609 if (pg->proc_replica_info(
9610 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
9611 // We got something new, try again!
9612 return transit< GetLog >();
9613 } else {
9614 return discard_event();
9615 }
9616 }
9617
9618 boost::statechart::result PG::RecoveryState::Incomplete::react(
9619 const QueryState& q)
9620 {
9621 q.f->open_object_section("state");
9622 q.f->dump_string("name", state_name);
9623 q.f->dump_stream("enter_time") << enter_time;
9624 q.f->dump_string("comment", "not enough complete instances of this PG");
9625 q.f->close_section();
9626 return forward_event();
9627 }
9628
9629 void PG::RecoveryState::Incomplete::exit()
9630 {
9631 context< RecoveryMachine >().log_exit(state_name, enter_time);
9632 PG *pg = context< RecoveryMachine >().pg;
9633
9634 pg->state_clear(PG_STATE_INCOMPLETE);
9635 utime_t dur = ceph_clock_now() - enter_time;
9636 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
9637
9638 pg->blocked_by.clear();
9639 }
9640
9641 /*------GetMissing--------*/
9642 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
9643 : my_base(ctx),
9644 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
9645 {
9646 context< RecoveryMachine >().log_enter(state_name);
9647
9648 PG *pg = context< RecoveryMachine >().pg;
9649 ceph_assert(!pg->acting_recovery_backfill.empty());
9650 eversion_t since;
9651 for (set<pg_shard_t>::iterator i = pg->acting_recovery_backfill.begin();
9652 i != pg->acting_recovery_backfill.end();
9653 ++i) {
9654 if (*i == pg->get_primary()) continue;
9655 const pg_info_t& pi = pg->peer_info[*i];
9656 // reset this to make sure the pg_missing_t is initialized and
9657 // has the correct semantics even if we don't need to get a
9658 // missing set from a shard. This way later additions due to
9659 // lost+unfound delete work properly.
9660 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
9661
9662 if (pi.is_empty())
9663 continue; // no pg data, nothing divergent
9664
9665 if (pi.last_update < pg->pg_log.get_tail()) {
9666 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
9667 pg->peer_missing[*i].clear();
9668 continue;
9669 }
9670 if (pi.last_backfill == hobject_t()) {
9671 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
9672 pg->peer_missing[*i].clear();
9673 continue;
9674 }
9675
9676 if (pi.last_update == pi.last_complete && // peer has no missing
9677 pi.last_update == pg->info.last_update) { // peer is up to date
9678 // replica has no missing and identical log as us. no need to
9679 // pull anything.
9680 // FIXME: we can do better here. if last_update==last_complete we
9681 // can infer the rest!
9682 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
9683 pg->peer_missing[*i].clear();
9684 continue;
9685 }
9686
9687 // We pull the log from the peer's last_epoch_started to ensure we
9688 // get enough log to detect divergent updates.
9689 since.epoch = pi.last_epoch_started;
9690 ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
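// an incremental LOG query suffices if the peer's log reaches back to
// 'since'; otherwise we must ask for the peer's FULLLOG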
9691 if (pi.log_tail <= since) {
9692 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
9693 context< RecoveryMachine >().send_query(
9694 *i,
9695 pg_query_t(
9696 pg_query_t::LOG,
9697 i->shard, pg->pg_whoami.shard,
9698 since, pg->info.history,
9699 pg->get_osdmap_epoch()));
9700 } else {
9701 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
9702 << " (want since " << since << " < log.tail "
9703 << pi.log_tail << ")" << dendl;
9704 context< RecoveryMachine >().send_query(
9705 *i, pg_query_t(
9706 pg_query_t::FULLLOG,
9707 i->shard, pg->pg_whoami.shard,
9708 pg->info.history, pg->get_osdmap_epoch()));
9709 }
9710 peer_missing_requested.insert(*i);
9711 pg->blocked_by.insert(i->osd);
9712 }
9713
9714 if (peer_missing_requested.empty()) {
9715 if (pg->need_up_thru) {
9716 ldout(pg->cct, 10) << " still need up_thru update before going active"
9717 << dendl;
9718 post_event(NeedUpThru());
9719 return;
9720 }
9721
9722 // all good!
9723 post_event(Activate(pg->get_osdmap_epoch()));
9724 } else {
9725 pg->publish_stats_to_osd();
9726 }
9727 }
9728
9729 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
9730 {
9731 PG *pg = context< RecoveryMachine >().pg;
9732
9733 peer_missing_requested.erase(logevt.from);
9734 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
9735
9736 if (peer_missing_requested.empty()) {
9737 if (pg->need_up_thru) {
9738 ldout(pg->cct, 10) << " still need up_thru update before going active"
9739 << dendl;
9740 post_event(NeedUpThru());
9741 } else {
9742 ldout(pg->cct, 10) << "Got last missing, don't need any more; "
9743 << "posting Activate" << dendl;
9744 post_event(Activate(pg->get_osdmap_epoch()));
9745 }
9746 }
9747 return discard_event();
9748 }
9749
9750 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
9751 {
9752 PG *pg = context< RecoveryMachine >().pg;
9753 q.f->open_object_section("state");
9754 q.f->dump_string("name", state_name);
9755 q.f->dump_stream("enter_time") << enter_time;
9756
9757 q.f->open_array_section("peer_missing_requested");
9758 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
9759 p != peer_missing_requested.end();
9760 ++p) {
9761 q.f->open_object_section("osd");
9762 q.f->dump_stream("osd") << *p;
9763 if (pg->peer_missing.count(*p)) {
9764 q.f->open_object_section("got_missing");
9765 pg->peer_missing[*p].dump(q.f);
9766 q.f->close_section();
9767 }
9768 q.f->close_section();
9769 }
9770 q.f->close_section();
9771
9772 q.f->close_section();
9773 return forward_event();
9774 }
9775
9776 void PG::RecoveryState::GetMissing::exit()
9777 {
9778 context< RecoveryMachine >().log_exit(state_name, enter_time);
9779 PG *pg = context< RecoveryMachine >().pg;
9780 utime_t dur = ceph_clock_now() - enter_time;
9781 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
9782 pg->blocked_by.clear();
9783 }
9784
9785 /*------WaitUpThru--------*/
9786 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
9787 : my_base(ctx),
9788 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
9789 {
9790 context< RecoveryMachine >().log_enter(state_name);
9791 }
9792
9793 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
9794 {
9795 PG *pg = context< RecoveryMachine >().pg;
9796 if (!pg->need_up_thru) {
9797 post_event(Activate(pg->get_osdmap_epoch()));
9798 }
9799 return forward_event();
9800 }
9801
9802 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
9803 {
9804 PG *pg = context< RecoveryMachine >().pg;
9805 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
9806 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
9807 pg->peer_info[logevt.from] = logevt.msg->info;
9808 return discard_event();
9809 }
9810
9811 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
9812 {
9813 q.f->open_object_section("state");
9814 q.f->dump_string("name", state_name);
9815 q.f->dump_stream("enter_time") << enter_time;
9816 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
9817 q.f->close_section();
9818 return forward_event();
9819 }
9820
9821 void PG::RecoveryState::WaitUpThru::exit()
9822 {
9823 context< RecoveryMachine >().log_exit(state_name, enter_time);
9824 PG *pg = context< RecoveryMachine >().pg;
9825 utime_t dur = ceph_clock_now() - enter_time;
9826 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
9827 }
9828
9829 /*----RecoveryState::RecoveryMachine Methods-----*/
9830 #undef dout_prefix
9831 #define dout_prefix pg->gen_prefix(*_dout)
9832
9833 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
9834 {
9835 PG *pg = context< RecoveryMachine >().pg;
9836 ldout(pg->cct, 5) << "enter " << state_name << dendl;
9837 pg->osd->pg_recovery_stats.log_enter(state_name);
9838 }
9839
9840 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
9841 {
9842 utime_t dur = ceph_clock_now() - enter_time;
9843 PG *pg = context< RecoveryMachine >().pg;
9844 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
9845 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
9846 event_count, event_time);
9847 event_count = 0;
9848 event_time = utime_t();
9849 }
9850
9851
9852 /*---------------------------------------------------*/
9853 #undef dout_prefix
9854 #define dout_prefix ((debug_pg ? debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
9855
9856 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
9857 ceph_assert(!rctx);
9858 ceph_assert(!orig_ctx);
9859 orig_ctx = new_ctx;
9860 if (new_ctx) {
9861 if (messages_pending_flush) {
9862 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
9863 } else {
9864 rctx = *new_ctx;
9865 }
9866 rctx->start_time = ceph_clock_now();
9867 }
9868 }
9869
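// begin_block_outgoing() redirects outgoing recovery messages (notifies,
// queries, infos) into messages_pending_flush; end_block_outgoing() hands
// the buffered messages back to the caller's RecoveryCtx for delivery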
9870 void PG::RecoveryState::begin_block_outgoing() {
9871 ceph_assert(!messages_pending_flush);
9872 ceph_assert(orig_ctx);
9873 ceph_assert(rctx);
9874 messages_pending_flush = BufferedRecoveryMessages();
9875 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
9876 }
9877
9878 void PG::RecoveryState::clear_blocked_outgoing() {
9879 ceph_assert(orig_ctx);
9880 ceph_assert(rctx);
9881 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9882 }
9883
9884 void PG::RecoveryState::end_block_outgoing() {
9885 ceph_assert(messages_pending_flush);
9886 ceph_assert(orig_ctx);
9887 ceph_assert(rctx);
9888
9889 rctx = RecoveryCtx(*orig_ctx);
9890 rctx->accept_buffered_messages(*messages_pending_flush);
9891 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9892 }
9893
9894 void PG::RecoveryState::end_handle() {
9895 if (rctx) {
9896 utime_t dur = ceph_clock_now() - rctx->start_time;
9897 machine.event_time += dur;
9898 }
9899
9900 machine.event_count++;
9901 rctx = boost::optional<RecoveryCtx>();
9902 orig_ctx = NULL;
9903 }
9904
9905 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
9906 {
9907 out << "BackfillInfo(" << bi.begin << "-" << bi.end
9908 << " " << bi.objects.size() << " objects";
9909 if (!bi.objects.empty())
9910 out << " " << bi.objects;
9911 out << ")";
9912 return out;
9913 }
9914
9915 void PG::dump_pgstate_history(Formatter *f)
9916 {
9917 lock();
9918 pgstate_history.dump(f);
9919 unlock();
9920 }
9921
9922 void PG::dump_missing(Formatter *f)
9923 {
9924 for (auto& i : pg_log.get_missing().get_items()) {
9925 f->open_object_section("object");
9926 f->dump_object("oid", i.first);
9927 f->dump_object("missing_info", i.second);
9928 if (missing_loc.needs_recovery(i.first)) {
9929 f->dump_bool("unfound", missing_loc.is_unfound(i.first));
9930 f->open_array_section("locations");
9931 for (auto l : missing_loc.get_locations(i.first)) {
9932 f->dump_object("shard", l);
9933 }
9934 f->close_section();
9935 }
9936 f->close_section();
9937 }
9938 }
9939
9940 void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
9941 {
9942 pg_stats_publish_lock.Lock();
9943 if (pg_stats_publish_valid) {
9944 f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
9945 }
9946 pg_stats_publish_lock.Unlock();
9947 }
9948
9949 void PG::with_heartbeat_peers(std::function<void(int)> f)
9950 {
9951 heartbeat_peer_lock.Lock();
9952 for (auto p : heartbeat_peers) {
9953 f(p);
9954 }
9955 for (auto p : probe_targets) {
9956 f(p);
9957 }
9958 heartbeat_peer_lock.Unlock();
9959 }