ceph/src/osd/PeeringState.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3
   4 #include "PGPeeringEvent.h"
   5 #include "common/ceph_releases.h"
   6 #include "common/dout.h"
   7 #include "PeeringState.h"
   8
   9 #include "messages/MOSDPGRemove.h"
  10 #include "messages/MBackfillReserve.h"
  11 #include "messages/MRecoveryReserve.h"
  12 #include "messages/MOSDScrubReserve.h"
  13 #include "messages/MOSDPGInfo2.h"
  14 #include "messages/MOSDPGTrim.h"
  15 #include "messages/MOSDPGLog.h"
  16 #include "messages/MOSDPGNotify2.h"
  17 #include "messages/MOSDPGQuery2.h"
  18 #include "messages/MOSDPGLease.h"
  19 #include "messages/MOSDPGLeaseAck.h"
  20
  21 #define dout_context cct
  22 #define dout_subsys ceph_subsys_osd
  23
  24 using std::dec;
  25 using std::hex;
  26 using std::make_pair;
  27 using std::map;
  28 using std::ostream;
  29 using std::pair;
  30 using std::set;
  31 using std::string;
  32 using std::stringstream;
  33 using std::vector;
  34
  35 using ceph::Formatter;
  36 using ceph::make_message;
  37
  38 BufferedRecoveryMessages::BufferedRecoveryMessages(PeeringCtx &ctx)
  39   // steal messages from ctx
  40   : message_map{std::move(ctx.message_map)}
  41 {}
  42
  43 void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
  44 {
  45   spg_t pgid(n.info.pgid.pgid, n.to);
  46   send_osd_message(to, TOPNSPC::make_message<MOSDPGNotify2>(pgid, n));
  47 }
  48
  49 void BufferedRecoveryMessages::send_query(
  50   int to,
  51   spg_t to_spgid,
  52   const pg_query_t &q)
  53 {
  54   send_osd_message(to, TOPNSPC::make_message<MOSDPGQuery2>(to_spgid, q));
  55 }
  56
  57 void BufferedRecoveryMessages::send_info(
  58   int to,
  59   spg_t to_spgid,
  60   epoch_t min_epoch,
  61   epoch_t cur_epoch,
  62   const pg_info_t &info,
  63   std::optional<pg_lease_t> lease,
  64   std::optional<pg_lease_ack_t> lease_ack)
  65 {
  66   send_osd_message(
  67     to,
  68     TOPNSPC::make_message<MOSDPGInfo2>(
  69       to_spgid,
  70       info,
  71       cur_epoch,
  72       min_epoch,
  73       lease,
  74       lease_ack)
  75   );
  76 }
  77
  78 void PGPool::update(OSDMapRef map)
  79 {
  80   const pg_pool_t *pi = map->get_pg_pool(id);
  81   if (!pi) {
  82     return; // pool has been deleted
  83   }
  84   info = *pi;
  85   name = map->get_pool_name(id);
  86
  87   bool updated = false;
  88   if ((map->get_epoch() != cached_epoch + 1) ||
  89       (pi->get_snap_epoch() == map->get_epoch())) {
  90     updated = true;
  91   }
  92
  93   if (info.is_pool_snaps_mode() && updated) {
  94     snapc = pi->get_snap_context();
  95   }
  96   cached_epoch = map->get_epoch();
  97 }
  98
  99 /*-------------Peering State Helpers----------------*/
 100 #undef dout_prefix
 101 #define dout_prefix (dpp->gen_prefix(*_dout))
 102 #undef psdout
 103 #define psdout(x) ldout(cct, x)
 104
 105 PeeringState::PeeringState(
 106   CephContext *cct,
 107   pg_shard_t pg_whoami,
 108   spg_t spgid,
 109   const PGPool &_pool,
 110   OSDMapRef curmap,
 111   DoutPrefixProvider *dpp,
 112   PeeringListener *pl)
 113   : state_history(*pl),
 114     cct(cct),
 115     spgid(spgid),
 116     dpp(dpp),
 117     pl(pl),
 118     orig_ctx(0),
 119     osdmap_ref(curmap),
 120     pool(_pool),
 121     pg_whoami(pg_whoami),
 122     info(spgid),
 123     pg_log(cct),
 124     last_require_osd_release(curmap->require_osd_release),
 125     missing_loc(spgid, this, dpp, cct),
 126     machine(this, cct, spgid, dpp, pl, &state_history)
 127 {
 128   machine.initiate();
 129 }
 130
 131 void PeeringState::start_handle(PeeringCtx *new_ctx) {
 132   ceph_assert(!rctx);
 133   ceph_assert(!orig_ctx);
 134   orig_ctx = new_ctx;
 135   if (new_ctx) {
 136     if (messages_pending_flush) {
 137       rctx.emplace(*messages_pending_flush, *new_ctx);
 138     } else {
 139       rctx.emplace(*new_ctx);
 140     }
 141     rctx->start_time = ceph_clock_now();
 142   }
 143 }
 144
 145 void PeeringState::begin_block_outgoing() {
 146   ceph_assert(!messages_pending_flush);
 147   ceph_assert(orig_ctx);
 148   ceph_assert(rctx);
 149   messages_pending_flush.emplace();
 150   rctx.emplace(*messages_pending_flush, *orig_ctx);
 151 }
 152
 153 void PeeringState::clear_blocked_outgoing() {
 154   ceph_assert(orig_ctx);
 155   ceph_assert(rctx);
 156   messages_pending_flush = std::optional<BufferedRecoveryMessages>();
 157 }
 158
 159 void PeeringState::end_block_outgoing() {
 160   ceph_assert(messages_pending_flush);
 161   ceph_assert(orig_ctx);
 162   ceph_assert(rctx);
 163
 164   orig_ctx->accept_buffered_messages(*messages_pending_flush);
 165   rctx.emplace(*orig_ctx);
 166   messages_pending_flush = std::optional<BufferedRecoveryMessages>();
 167 }
 168
 169 void PeeringState::end_handle() {
 170   if (rctx) {
 171     utime_t dur = ceph_clock_now() - rctx->start_time;
 172     machine.event_time += dur;
 173   }
 174
 175   machine.event_count++;
 176   rctx = std::nullopt;
 177   orig_ctx = NULL;
 178 }
 179
 180 void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
 181 {
 182   /*
 183    * check that any peers we are planning to (or currently) pulling
 184    * objects from are dealt with.
 185    */
 186   missing_loc.check_recovery_sources(osdmap);
 187   pl->check_recovery_sources(osdmap);
 188
 189   for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
 190     if (!osdmap->is_up(i->osd)) {
 191       psdout(10) << "peer_log_requested removing " << *i << dendl;
 192       peer_log_requested.erase(i++);
 193     } else {
 194       ++i;
 195     }
 196   }
 197
 198   for (auto i = peer_missing_requested.begin();
 199        i != peer_missing_requested.end();) {
 200     if (!osdmap->is_up(i->osd)) {
 201       psdout(10) << "peer_missing_requested removing " << *i << dendl;
 202       peer_missing_requested.erase(i++);
 203     } else {
 204       ++i;
 205     }
 206   }
 207 }
 208
 209 void PeeringState::update_history(const pg_history_t& new_history)
 210 {
 211   auto mnow = pl->get_mnow();
 212   info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
 213   if (info.history.merge(new_history)) {
 214     psdout(20) << __func__ << " advanced history from " << new_history << dendl;
 215     dirty_info = true;
 216     if (info.history.last_epoch_clean >= info.history.same_interval_since) {
 217       psdout(20) << __func__ << " clearing past_intervals" << dendl;
 218       past_intervals.clear();
 219       dirty_big_info = true;
 220     }
 221     prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
 222     if (prior_readable_until_ub != ceph::signedspan::zero()) {
 223       dout(20) << __func__
 224                << " prior_readable_until_ub " << prior_readable_until_ub
 225                << " (mnow " << mnow << " + "
 226                << info.history.prior_readable_until_ub << ")" << dendl;
 227     }
 228   }
 229   pl->on_info_history_change();
 230 }
 231
 232 hobject_t PeeringState::earliest_backfill() const
 233 {
 234   hobject_t e = hobject_t::get_max();
 235   for (const pg_shard_t& bt : get_backfill_targets()) {
 236     const pg_info_t &pi = get_peer_info(bt);
 237     e = std::min(pi.last_backfill, e);
 238   }
 239   return e;
 240 }
 241
 242 void PeeringState::purge_strays()
 243 {
 244   if (is_premerge()) {
 245     psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
 246                << dendl;
 247     return;
 248   }
 249   if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
 250     return;
 251   }
 252   psdout(10) << "purge_strays " << stray_set << dendl;
 253
 254   bool removed = false;
 255   for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
 256     ceph_assert(!is_acting_recovery_backfill(*p));
 257     if (get_osdmap()->is_up(p->osd)) {
 258       psdout(10) << "sending PGRemove to osd." << *p << dendl;
 259       vector<spg_t> to_remove;
 260       to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
 261       auto m = TOPNSPC::make_message<MOSDPGRemove>(
 262         get_osdmap_epoch(),
 263         to_remove);
 264       pl->send_cluster_message(p->osd, std::move(m), get_osdmap_epoch());
 265     } else {
 266       psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
 267     }
 268     peer_missing.erase(*p);
 269     peer_info.erase(*p);
 270     missing_loc.remove_stray_recovery_sources(*p);
 271     peer_purged.insert(*p);
 272     removed = true;
 273   }
 274
 275   // if we removed anyone, update peers (which include peer_info)
 276   if (removed)
 277     update_heartbeat_peers();
 278
 279   stray_set.clear();
 280
 281   // clear _requested maps; we may have to peer() again if we discover
 282   // (more) stray content
 283   peer_log_requested.clear();
 284   peer_missing_requested.clear();
 285 }
 286
 287 void PeeringState::query_unfound(Formatter *f, string state)
 288 {
 289   psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
 290   {
 291     f->dump_string("state", state);
 292     f->dump_bool("available_might_have_unfound", true);
 293     f->open_array_section("might_have_unfound");
 294     for (auto p = might_have_unfound.begin();
 295          p != might_have_unfound.end();
 296          ++p) {
 297       if (peer_missing.count(*p)) {
 298         ; // Ignore already probed OSDs
 299       } else {
 300         f->open_object_section("osd");
 301         f->dump_stream("osd") << *p;
 302         if (peer_missing_requested.count(*p)) {
 303           f->dump_string("status", "querying");
 304         } else if (!get_osdmap()->is_up(p->osd)) {
 305           f->dump_string("status", "osd is down");
 306         } else {
 307           f->dump_string("status", "not queried");
 308         }
 309         f->close_section();
 310       }
 311     }
 312     f->close_section();
 313   }
 314   psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
 315   return;
 316 }
 317
 318 bool PeeringState::proc_replica_info(
 319   pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
 320 {
 321   auto p = peer_info.find(from);
 322   if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
 323     psdout(10) << " got dup osd." << from << " info "
 324                << oinfo << ", identical to ours" << dendl;
 325     return false;
 326   }
 327
 328   if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
 329     psdout(10) << " got info " << oinfo << " from down osd." << from
 330              << " discarding" << dendl;
 331     return false;
 332   }
 333
 334   psdout(10) << " got osd." << from << " " << oinfo << dendl;
 335   ceph_assert(is_primary());
 336   peer_info[from] = oinfo;
 337   might_have_unfound.insert(from);
 338
 339   update_history(oinfo.history);
 340
 341   // stray?
 342   if (!is_up(from) && !is_acting(from)) {
 343     psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
 344     stray_set.insert(from);
 345     if (is_clean()) {
 346       purge_strays();
 347     }
 348   }
 349
 350   // was this a new info?  if so, update peers!
 351   if (p == peer_info.end())
 352     update_heartbeat_peers();
 353
 354   return true;
 355 }
 356
 357
 358 void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
 359 {
 360   // Remove any downed osds from peer_info
 361   bool removed = false;
 362   auto p = peer_info.begin();
 363   while (p != peer_info.end()) {
 364     if (!osdmap->is_up(p->first.osd)) {
 365       psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
 366       peer_missing.erase(p->first);
 367       peer_log_requested.erase(p->first);
 368       peer_missing_requested.erase(p->first);
 369       peer_info.erase(p++);
 370       removed = true;
 371     } else
 372       ++p;
 373   }
 374
 375   // Remove any downed osds from peer_purged so we can re-purge if necessary
 376   auto it = peer_purged.begin();
 377   while (it != peer_purged.end()) {
 378     if (!osdmap->is_up(it->osd)) {
 379       psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
 380       peer_purged.erase(it++);
 381     } else {
 382       ++it;
 383     }
 384   }
 385
 386   // if we removed anyone, update peers (which include peer_info)
 387   if (removed)
 388     update_heartbeat_peers();
 389
 390   check_recovery_sources(osdmap);
 391 }
 392
 393 void PeeringState::update_heartbeat_peers()
 394 {
 395   if (!is_primary())
 396     return;
 397
 398   set<int> new_peers;
 399   for (unsigned i=0; i<acting.size(); i++) {
 400     if (acting[i] != CRUSH_ITEM_NONE)
 401       new_peers.insert(acting[i]);
 402   }
 403   for (unsigned i=0; i<up.size(); i++) {
 404     if (up[i] != CRUSH_ITEM_NONE)
 405       new_peers.insert(up[i]);
 406   }
 407   for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
 408     new_peers.insert(p->first.osd);
 409   }
 410   pl->update_heartbeat_peers(std::move(new_peers));
 411 }
 412
 413 void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
 414 {
 415   pl->prepare_write(
 416     info,
 417     last_written_info,
 418     past_intervals,
 419     pg_log,
 420     dirty_info,
 421     dirty_big_info,
 422     last_persisted_osdmap < get_osdmap_epoch(),
 423     t);
 424   if (dirty_info || dirty_big_info) {
 425     last_persisted_osdmap = get_osdmap_epoch();
 426     last_written_info = info;
 427     dirty_info = false;
 428     dirty_big_info = false;
 429   }
 430 }
 431
 432 void PeeringState::advance_map(
 433   OSDMapRef osdmap, OSDMapRef lastmap,
 434   vector<int>& newup, int up_primary,
 435   vector<int>& newacting, int acting_primary,
 436   PeeringCtx &rctx)
 437 {
 438   ceph_assert(lastmap == osdmap_ref);
 439   psdout(10) << "handle_advance_map "
 440             << newup << "/" << newacting
 441             << " -- " << up_primary << "/" << acting_primary
 442             << dendl;
 443
 444   update_osdmap_ref(osdmap);
 445   pool.update(osdmap);
 446
 447   AdvMap evt(
 448     osdmap, lastmap, newup, up_primary,
 449     newacting, acting_primary);
 450   handle_event(evt, &rctx);
 451   if (pool.info.last_change == osdmap_ref->get_epoch()) {
 452     pl->on_pool_change();
 453   }
 454   readable_interval = pool.get_readable_interval(cct->_conf);
 455   last_require_osd_release = osdmap->require_osd_release;
 456 }
 457
 458 void PeeringState::activate_map(PeeringCtx &rctx)
 459 {
 460   psdout(10) << __func__ << dendl;
 461   ActMap evt;
 462   handle_event(evt, &rctx);
 463   if (osdmap_ref->get_epoch() - last_persisted_osdmap >
 464     cct->_conf->osd_pg_epoch_persisted_max_stale) {
 465     psdout(20) << __func__ << ": Dirtying info: last_persisted is "
 466               << last_persisted_osdmap
 467               << " while current is " << osdmap_ref->get_epoch() << dendl;
 468     dirty_info = true;
 469   } else {
 470     psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
 471               << last_persisted_osdmap
 472               << " while current is " << osdmap_ref->get_epoch() << dendl;
 473   }
 474   write_if_dirty(rctx.transaction);
 475
 476   if (get_osdmap()->check_new_blocklist_entries()) {
 477     pl->check_blocklisted_watchers();
 478   }
 479 }
 480
 481 void PeeringState::set_last_peering_reset()
 482 {
 483   psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
 484   if (last_peering_reset != get_osdmap_epoch()) {
 485     last_peering_reset = get_osdmap_epoch();
 486     psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
 487     clear_blocked_outgoing();
 488     if (!pl->try_flush_or_schedule_async()) {
 489       psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
 490       begin_block_outgoing();
 491     } else {
 492       psdout(10) << "Not blocking outgoing recovery messages" << dendl;
 493     }
 494   }
 495 }
 496
 497 void PeeringState::complete_flush()
 498 {
 499   flushes_in_progress--;
 500   if (flushes_in_progress == 0) {
 501     pl->on_flushed();
 502   }
 503 }
 504
 505 void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
 506 {
 507   const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
 508   if (!pi) {
 509     return; // pool deleted
 510   }
 511   bool changed = false;
 512   if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
 513     const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
 514     if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
 515       psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
 516       changed = true;
 517     }
 518   }
 519   if (changed) {
 520     info.history.last_epoch_marked_full = osdmap->get_epoch();
 521     dirty_info = true;
 522   }
 523 }
 524
 525 bool PeeringState::should_restart_peering(
 526   int newupprimary,
 527   int newactingprimary,
 528   const vector<int>& newup,
 529   const vector<int>& newacting,
 530   OSDMapRef lastmap,
 531   OSDMapRef osdmap)
 532 {
 533   if (PastIntervals::is_new_interval(
 534         primary.osd,
 535         newactingprimary,
 536         acting,
 537         newacting,
 538         up_primary.osd,
 539         newupprimary,
 540         up,
 541         newup,
 542         osdmap.get(),
 543         lastmap.get(),
 544         info.pgid.pgid)) {
 545     psdout(20) << "new interval newup " << newup
 546                << " newacting " << newacting << dendl;
 547     return true;
 548   }
 549   if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
 550     psdout(10) << __func__ << " osd transitioned from down -> up"
 551                << dendl;
 552     return true;
 553   }
 554   return false;
 555 }
 556
 557 /* Called before initializing peering during advance_map */
 558 void PeeringState::start_peering_interval(
 559   const OSDMapRef lastmap,
 560   const vector<int>& newup, int new_up_primary,
 561   const vector<int>& newacting, int new_acting_primary,
 562   ObjectStore::Transaction &t)
 563 {
 564   const OSDMapRef osdmap = get_osdmap();
 565
 566   set_last_peering_reset();
 567
 568   vector<int> oldacting, oldup;
 569   int oldrole = get_role();
 570
 571   if (is_primary()) {
 572     pl->clear_ready_to_merge();
 573   }
 574
 575
 576   pg_shard_t old_acting_primary = get_primary();
 577   pg_shard_t old_up_primary = up_primary;
 578   bool was_old_primary = is_primary();
 579   bool was_old_nonprimary = is_nonprimary();
 580
 581   acting.swap(oldacting);
 582   up.swap(oldup);
 583   init_primary_up_acting(
 584     newup,
 585     newacting,
 586     new_up_primary,
 587     new_acting_primary);
 588
 589   if (info.stats.up != up ||
 590       info.stats.acting != acting ||
 591       info.stats.up_primary != new_up_primary ||
 592       info.stats.acting_primary != new_acting_primary) {
 593     info.stats.up = up;
 594     info.stats.up_primary = new_up_primary;
 595     info.stats.acting = acting;
 596     info.stats.acting_primary = new_acting_primary;
 597     info.stats.mapping_epoch = osdmap->get_epoch();
 598   }
 599
 600   pl->clear_publish_stats();
 601
 602   // This will now be remapped during a backfill in cases
 603   // that it would not have been before.
 604   if (up != acting)
 605     state_set(PG_STATE_REMAPPED);
 606   else
 607     state_clear(PG_STATE_REMAPPED);
 608
 609   int role = osdmap->calc_pg_role(pg_whoami, acting);
 610   set_role(role);
 611
 612   // did acting, up, primary|acker change?
 613   if (!lastmap) {
 614     psdout(10) << " no lastmap" << dendl;
 615     dirty_info = true;
 616     dirty_big_info = true;
 617     info.history.same_interval_since = osdmap->get_epoch();
 618   } else {
 619     std::stringstream debug;
 620     ceph_assert(info.history.same_interval_since != 0);
 621     bool new_interval = PastIntervals::check_new_interval(
 622       old_acting_primary.osd,
 623       new_acting_primary,
 624       oldacting, newacting,
 625       old_up_primary.osd,
 626       new_up_primary,
 627       oldup, newup,
 628       info.history.same_interval_since,
 629       info.history.last_epoch_clean,
 630       osdmap.get(),
 631       lastmap.get(),
 632       info.pgid.pgid,
 633       missing_loc.get_recoverable_predicate(),
 634       &past_intervals,
 635       &debug);
 636     psdout(10) << __func__ << ": check_new_interval output: "
 637                << debug.str() << dendl;
 638     if (new_interval) {
 639       if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
 640           info.history.last_epoch_clean < osdmap->get_epoch()) {
 641         psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
 642         // our information is incomplete and useless; someone else was clean
 643         // after everything we know if osdmaps were trimmed.
 644         past_intervals.clear();
 645       } else {
 646         psdout(10) << " noting past " << past_intervals << dendl;
 647       }
 648       dirty_info = true;
 649       dirty_big_info = true;
 650       info.history.same_interval_since = osdmap->get_epoch();
 651       if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
 652           info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
 653                                   osdmap->get_pg_num(info.pgid.pgid.pool()),
 654                                   nullptr)) {
 655         info.history.last_epoch_split = osdmap->get_epoch();
 656       }
 657     }
 658   }
 659
 660   if (old_up_primary != up_primary ||
 661       oldup != up) {
 662     info.history.same_up_since = osdmap->get_epoch();
 663   }
 664   // this comparison includes primary rank via pg_shard_t
 665   if (old_acting_primary != get_primary()) {
 666     info.history.same_primary_since = osdmap->get_epoch();
 667   }
 668
 669   on_new_interval();
 670   pl->on_info_history_change();
 671
 672   psdout(1) << __func__ << " up " << oldup << " -> " << up
 673             << ", acting " << oldacting << " -> " << acting
 674             << ", acting_primary " << old_acting_primary << " -> "
 675             << new_acting_primary
 676             << ", up_primary " << old_up_primary << " -> " << new_up_primary
 677             << ", role " << oldrole << " -> " << role
 678             << ", features acting " << acting_features
 679             << " upacting " << upacting_features
 680             << dendl;
 681
 682   // deactivate.
 683   state_clear(PG_STATE_ACTIVE);
 684   state_clear(PG_STATE_PEERED);
 685   state_clear(PG_STATE_PREMERGE);
 686   state_clear(PG_STATE_DOWN);
 687   state_clear(PG_STATE_RECOVERY_WAIT);
 688   state_clear(PG_STATE_RECOVERY_TOOFULL);
 689   state_clear(PG_STATE_RECOVERING);
 690
 691   peer_purged.clear();
 692   acting_recovery_backfill.clear();
 693
 694   // reset primary/replica state?
 695   if (was_old_primary || is_primary()) {
 696     pl->clear_want_pg_temp();
 697   } else if (was_old_nonprimary || is_nonprimary()) {
 698     pl->clear_want_pg_temp();
 699   }
 700   clear_primary_state();
 701
 702   pl->on_change(t);
 703
 704   ceph_assert(!deleting);
 705
 706   // should we tell the primary we are here?
 707   send_notify = !is_primary();
 708
 709   if (role != oldrole ||
 710       was_old_primary != is_primary()) {
 711     // did primary change?
 712     if (was_old_primary != is_primary()) {
 713       state_clear(PG_STATE_CLEAN);
 714       // queue/dequeue the scrubber
 715       pl->on_primary_status_change(was_old_primary, is_primary());
 716     }
 717
 718     pl->on_role_change();
 719   } else {
 720     // no role change.
 721     // did primary change?
 722     if (get_primary() != old_acting_primary) {
 723       psdout(10) << oldacting << " -> " << acting
 724                << ", acting primary "
 725                << old_acting_primary << " -> " << get_primary()
 726                << dendl;
 727     } else {
 728       // primary is the same.
 729       if (is_primary()) {
 730         // i am (still) primary. but my replica set changed.
 731         state_clear(PG_STATE_CLEAN);
 732
 733         psdout(10) << oldacting << " -> " << acting
 734                  << ", replicas changed" << dendl;
 735       }
 736     }
 737   }
 738
 739   if (is_primary() && was_old_primary) {
 740     pl->reschedule_scrub();
 741   }
 742
 743   if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
 744     psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
 745     pl->queue_want_pg_temp(acting);
 746   }
 747 }
 748
 749 void PeeringState::on_new_interval()
 750 {
 751   dout(20) << __func__ << dendl;
 752   const OSDMapRef osdmap = get_osdmap();
 753
 754   // initialize features
 755   acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
 756   upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
 757   for (auto p = acting.begin(); p != acting.end(); ++p) {
 758     if (*p == CRUSH_ITEM_NONE)
 759       continue;
 760     uint64_t f = osdmap->get_xinfo(*p).features;
 761     acting_features &= f;
 762     upacting_features &= f;
 763   }
 764   for (auto p = up.begin(); p != up.end(); ++p) {
 765     if (*p == CRUSH_ITEM_NONE)
 766       continue;
 767     upacting_features &= osdmap->get_xinfo(*p).features;
 768   }
 769   psdout(20) << __func__ << " upacting_features 0x" << std::hex
 770              << upacting_features << std::dec
 771              << " from " << acting << "+" << up << dendl;
 772
 773   psdout(20) << __func__ << " checking missing set deletes flag. missing = "
 774              << get_pg_log().get_missing() << dendl;
 775
 776   if (!pg_log.get_missing().may_include_deletes &&
 777       !perform_deletes_during_peering()) {
 778     pl->rebuild_missing_set_with_deletes(pg_log);
 779   }
 780   ceph_assert(
 781     pg_log.get_missing().may_include_deletes ==
 782     !perform_deletes_during_peering());
 783
 784   init_hb_stamps();
 785
 786   // update lease bounds for a new interval
 787   auto mnow = pl->get_mnow();
 788   prior_readable_until_ub = std::max(prior_readable_until_ub,
 789                                      readable_until_ub);
 790   prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
 791     mnow, prior_readable_until_ub);
 792   psdout(10) << __func__ << " prior_readable_until_ub "
 793              << prior_readable_until_ub << " (mnow " << mnow << " + "
 794              << info.history.prior_readable_until_ub << ")" << dendl;
 795   prior_readable_down_osds.clear(); // we populate this when we build the priorset
 796
 797   readable_until =
 798     readable_until_ub =
 799     readable_until_ub_sent =
 800     readable_until_ub_from_primary = ceph::signedspan::zero();
 801
 802   acting_readable_until_ub.clear();
 803   if (is_primary()) {
 804     acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
 805   }
 806
 807   pl->on_new_interval();
 808 }
 809
 810 void PeeringState::init_primary_up_acting(
 811   const vector<int> &newup,
 812   const vector<int> &newacting,
 813   int new_up_primary,
 814   int new_acting_primary)
 815 {
 816   actingset.clear();
 817   acting = newacting;
 818   for (uint8_t i = 0; i < acting.size(); ++i) {
 819     if (acting[i] != CRUSH_ITEM_NONE)
 820       actingset.insert(
 821         pg_shard_t(
 822           acting[i],
 823           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
 824   }
 825   upset.clear();
 826   up = newup;
 827   for (uint8_t i = 0; i < up.size(); ++i) {
 828     if (up[i] != CRUSH_ITEM_NONE)
 829       upset.insert(
 830         pg_shard_t(
 831           up[i],
 832           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
 833   }
 834   if (!pool.info.is_erasure()) {
 835     // replicated
 836     up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
 837     primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
 838   } else {
 839     // erasure
 840     up_primary = pg_shard_t();
 841     primary = pg_shard_t();
 842     for (uint8_t i = 0; i < up.size(); ++i) {
 843       if (up[i] == new_up_primary) {
 844         up_primary = pg_shard_t(up[i], shard_id_t(i));
 845         break;
 846       }
 847     }
 848     for (uint8_t i = 0; i < acting.size(); ++i) {
 849       if (acting[i] == new_acting_primary) {
 850         primary = pg_shard_t(acting[i], shard_id_t(i));
 851         break;
 852       }
 853     }
 854     ceph_assert(up_primary.osd == new_up_primary);
 855     ceph_assert(primary.osd == new_acting_primary);
 856   }
 857 }
 858
 859 void PeeringState::init_hb_stamps()
 860 {
 861   if (is_primary()) {
 862     // we care about all other osds in the acting set
 863     hb_stamps.resize(acting.size() - 1);
 864     unsigned i = 0;
 865     for (auto p : acting) {
 866       if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
 867         continue;
 868       }
 869       hb_stamps[i++] = pl->get_hb_stamps(p);
 870     }
 871     hb_stamps.resize(i);
 872   } else if (is_nonprimary()) {
 873     // we care about just the primary
 874     hb_stamps.resize(1);
 875     hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
 876   } else {
 877     hb_stamps.clear();
 878   }
 879   dout(10) << __func__ << " now " << hb_stamps << dendl;
 880 }
 881
 882
 883 void PeeringState::clear_recovery_state()
 884 {
 885   async_recovery_targets.clear();
 886   backfill_targets.clear();
 887 }
 888
 889 void PeeringState::clear_primary_state()
 890 {
 891   psdout(10) << "clear_primary_state" << dendl;
 892
 893   // clear peering state
 894   stray_set.clear();
 895   peer_log_requested.clear();
 896   peer_missing_requested.clear();
 897   peer_info.clear();
 898   peer_bytes.clear();
 899   peer_missing.clear();
 900   peer_last_complete_ondisk.clear();
 901   peer_activated.clear();
 902   min_last_complete_ondisk = eversion_t();
 903   pg_trim_to = eversion_t();
 904   might_have_unfound.clear();
 905   need_up_thru = false;
 906   missing_loc.clear();
 907   pg_log.reset_recovery_pointers();
 908
 909   clear_recovery_state();
 910
 911   last_update_ondisk = eversion_t();
 912   missing_loc.clear();
 913   pl->clear_primary_state();
 914 }
 915
 916 /// return [start,end) bounds for required past_intervals
 917 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
 918   const pg_info_t &info,
 919   epoch_t oldest_map) {
 920   epoch_t start = std::max(
 921     info.history.last_epoch_clean ? info.history.last_epoch_clean :
 922     info.history.epoch_pool_created,
 923     oldest_map);
 924   epoch_t end = std::max(
 925     info.history.same_interval_since,
 926     info.history.epoch_pool_created);
 927   return make_pair(start, end);
 928 }
 929
 930
 931 void PeeringState::check_past_interval_bounds() const
 932 {
 933   auto oldest_epoch = pl->oldest_stored_osdmap();
 934   auto rpib = get_required_past_interval_bounds(
 935     info,
 936     oldest_epoch);
 937   if (rpib.first >= rpib.second) {
 938     // do not warn if the start bound is dictated by oldest_map; the
 939     // past intervals are presumably appropriate given the pg info.
 940     if (!past_intervals.empty() &&
 941         rpib.first > oldest_epoch) {
 942       pl->get_clog_error() << info.pgid << " required past_interval bounds are"
 943                              << " empty [" << rpib << ") but past_intervals is not: "
 944                              << past_intervals;
 945       derr << info.pgid << " required past_interval bounds are"
 946            << " empty [" << rpib << ") but past_intervals is not: "
 947            << past_intervals << dendl;
 948     }
 949   } else {
 950     if (past_intervals.empty()) {
 951       pl->get_clog_error() << info.pgid << " required past_interval bounds are"
 952                              << " not empty [" << rpib << ") but past_intervals "
 953                              << past_intervals << " is empty";
 954       derr << info.pgid << " required past_interval bounds are"
 955            << " not empty [" << rpib << ") but past_intervals "
 956            << past_intervals << " is empty" << dendl;
 957       ceph_assert(!past_intervals.empty());
 958     }
 959
 960     auto apib = past_intervals.get_bounds();
 961     if (apib.first > rpib.first) {
 962       pl->get_clog_error() << info.pgid << " past_intervals [" << apib
 963                              << ") start interval does not contain the required"
 964                              << " bound [" << rpib << ") start";
 965       derr << info.pgid << " past_intervals [" << apib
 966            << ") start interval does not contain the required"
 967            << " bound [" << rpib << ") start" << dendl;
 968       ceph_abort_msg("past_interval start interval mismatch");
 969     }
 970     if (apib.second != rpib.second) {
 971       pl->get_clog_error() << info.pgid << " past_interal bound [" << apib
 972                              << ") end does not match required [" << rpib
 973                              << ") end";
 974       derr << info.pgid << " past_interal bound [" << apib
 975            << ") end does not match required [" << rpib
 976            << ") end" << dendl;
 977       ceph_abort_msg("past_interval end mismatch");
 978     }
 979   }
 980 }
 981
 982 int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
 983 {
 984   static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
 985   static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
 986
 987   ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
 988
 989   // User can't set this too high anymore, but might be a legacy value
 990   if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
 991     pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
 992   if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
 993     pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
 994   // Shift range from min to max to 0 to max - min
 995   pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
 996   ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
 997
 998   priority += pool_recovery_priority;
 999
1000   // Clamp to valid range
1001   if (priority > max) {
1002     return max;
1003   } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
1004     return OSD_RECOVERY_PRIORITY_MIN;
1005   } else {
1006     return priority;
1007   }
1008 }
1009
1010 unsigned PeeringState::get_recovery_priority()
1011 {
1012   // a higher value -> a higher priority
1013   int ret = OSD_RECOVERY_PRIORITY_BASE;
1014   int base = ret;
1015
1016   if (state & PG_STATE_FORCED_RECOVERY) {
1017     ret = OSD_RECOVERY_PRIORITY_FORCED;
1018   } else {
1019     // XXX: This priority boost isn't so much about inactive, but about data-at-risk
1020     if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
1021       base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
1022       // inactive: no. of replicas < min_size, highest priority since it blocks IO
1023       ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
1024     }
1025
1026     int64_t pool_recovery_priority = 0;
1027     pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
1028
1029     ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
1030   }
1031   psdout(20) << __func__ << " recovery priority is " << ret << dendl;
1032   return static_cast<unsigned>(ret);
1033 }
1034
1035 unsigned PeeringState::get_backfill_priority()
1036 {
1037   // a higher value -> a higher priority
1038   int ret = OSD_BACKFILL_PRIORITY_BASE;
1039   int base = ret;
1040
1041   if (state & PG_STATE_FORCED_BACKFILL) {
1042     ret = OSD_BACKFILL_PRIORITY_FORCED;
1043   } else {
1044     if (actingset.size() < pool.info.min_size) {
1045       base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
1046       // inactive: no. of replicas < min_size, highest priority since it blocks IO
1047       ret = base + (pool.info.min_size - actingset.size());
1048
1049     } else if (is_undersized()) {
1050       // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
1051       ceph_assert(pool.info.size > actingset.size());
1052       base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1053       ret = base + (pool.info.size - actingset.size());
1054
1055     } else if (is_degraded()) {
1056       // degraded: baseline degraded
1057       base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1058     }
1059
1060     // Adjust with pool's recovery priority
1061     int64_t pool_recovery_priority = 0;
1062     pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
1063
1064     ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
1065   }
1066
1067   psdout(20) << __func__ << " backfill priority is " << ret << dendl;
1068   return static_cast<unsigned>(ret);
1069 }
1070
1071 unsigned PeeringState::get_delete_priority()
1072 {
1073   auto state = get_osdmap()->get_state(pg_whoami.osd);
1074   if (state & (CEPH_OSD_BACKFILLFULL |
1075                CEPH_OSD_FULL)) {
1076     return OSD_DELETE_PRIORITY_FULL;
1077   } else if (state & CEPH_OSD_NEARFULL) {
1078     return OSD_DELETE_PRIORITY_FULLISH;
1079   } else {
1080     return OSD_DELETE_PRIORITY_NORMAL;
1081   }
1082 }
1083
1084 bool PeeringState::set_force_recovery(bool b)
1085 {
1086   bool did = false;
1087   if (b) {
1088     if (!(state & PG_STATE_FORCED_RECOVERY) &&
1089         (state & (PG_STATE_DEGRADED |
1090                   PG_STATE_RECOVERY_WAIT |
1091                   PG_STATE_RECOVERING))) {
1092       psdout(20) << __func__ << " set" << dendl;
1093       state_set(PG_STATE_FORCED_RECOVERY);
1094       pl->publish_stats_to_osd();
1095       did = true;
1096     }
1097   } else if (state & PG_STATE_FORCED_RECOVERY) {
1098     psdout(20) << __func__ << " clear" << dendl;
1099     state_clear(PG_STATE_FORCED_RECOVERY);
1100     pl->publish_stats_to_osd();
1101     did = true;
1102   }
1103   if (did) {
1104     psdout(20) << __func__ << " state " << get_current_state()
1105              << dendl;
1106     pl->update_local_background_io_priority(get_recovery_priority());
1107   }
1108   return did;
1109 }
1110
1111 bool PeeringState::set_force_backfill(bool b)
1112 {
1113   bool did = false;
1114   if (b) {
1115     if (!(state & PG_STATE_FORCED_BACKFILL) &&
1116         (state & (PG_STATE_DEGRADED |
1117                   PG_STATE_BACKFILL_WAIT |
1118                   PG_STATE_BACKFILLING))) {
1119       psdout(10) << __func__ << " set" << dendl;
1120       state_set(PG_STATE_FORCED_BACKFILL);
1121       pl->publish_stats_to_osd();
1122       did = true;
1123     }
1124   } else if (state & PG_STATE_FORCED_BACKFILL) {
1125     psdout(10) << __func__ << " clear" << dendl;
1126     state_clear(PG_STATE_FORCED_BACKFILL);
1127     pl->publish_stats_to_osd();
1128     did = true;
1129   }
1130   if (did) {
1131     psdout(20) << __func__ << " state " << get_current_state()
1132              << dendl;
1133     pl->update_local_background_io_priority(get_backfill_priority());
1134   }
1135   return did;
1136 }
1137
1138 void PeeringState::schedule_renew_lease()
1139 {
1140   pl->schedule_renew_lease(
1141     last_peering_reset,
1142     readable_interval / 2);
1143 }
1144
1145 void PeeringState::send_lease()
1146 {
1147   epoch_t epoch = pl->get_osdmap_epoch();
1148   for (auto peer : actingset) {
1149     if (peer == pg_whoami) {
1150       continue;
1151     }
1152     pl->send_cluster_message(
1153       peer.osd,
1154       TOPNSPC::make_message<MOSDPGLease>(epoch,
1155                       spg_t(spgid.pgid, peer.shard),
1156                       get_lease()),
1157       epoch);
1158   }
1159 }
1160
1161 void PeeringState::proc_lease(const pg_lease_t& l)
1162 {
1163   assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
1164   if (!is_nonprimary()) {
1165     psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
1166     return;
1167   }
1168   psdout(10) << __func__ << " " << l << dendl;
1169   if (l.readable_until_ub > readable_until_ub_from_primary) {
1170     readable_until_ub_from_primary = l.readable_until_ub;
1171   }
1172
1173   ceph::signedspan ru = ceph::signedspan::zero();
1174   if (l.readable_until != ceph::signedspan::zero() &&
1175       hb_stamps[0]->peer_clock_delta_ub) {
1176     ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
1177     psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
1178                << " -> ru " << ru << dendl;
1179   }
1180   if (ru > readable_until) {
1181     readable_until = ru;
1182     psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
1183     // NOTE: if we ever decide to block/queue ops on the replica,
1184     // we'll need to wake them up here.
1185   }
1186
1187   ceph::signedspan ruub;
1188   if (hb_stamps[0]->peer_clock_delta_lb) {
1189     ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
1190     psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
1191                << " -> ruub " << ruub << dendl;
1192   } else {
1193     ruub = pl->get_mnow() + l.interval;
1194     psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
1195   }
1196   if (ruub > readable_until_ub) {
1197     readable_until_ub = ruub;
1198     psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
1199                << dendl;
1200   }
1201 }
1202
1203 void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
1204 {
1205   assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
1206   auto now = pl->get_mnow();
1207   bool was_min = false;
1208   for (unsigned i = 0; i < acting.size(); ++i) {
1209     if (from == acting[i]) {
1210       // the lease_ack value is based on the primary's clock
1211       if (a.readable_until_ub > acting_readable_until_ub[i]) {
1212         if (acting_readable_until_ub[i] == readable_until) {
1213           was_min = true;
1214         }
1215         acting_readable_until_ub[i] = a.readable_until_ub;
1216         break;
1217       }
1218     }
1219   }
1220   if (was_min) {
1221     auto old_ru = readable_until;
1222     recalc_readable_until();
1223     if (now < old_ru) {
1224       pl->recheck_readable();
1225     }
1226   }
1227 }
1228
1229 void PeeringState::proc_renew_lease()
1230 {
1231   assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
1232   renew_lease(pl->get_mnow());
1233   send_lease();
1234   schedule_renew_lease();
1235 }
1236
1237 void PeeringState::recalc_readable_until()
1238 {
1239   assert(is_primary());
1240   ceph::signedspan min = readable_until_ub_sent;
1241   for (unsigned i = 0; i < acting.size(); ++i) {
1242     if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
1243       continue;
1244     }
1245     dout(20) << __func__ << " peer osd." << acting[i]
1246              << " ruub " << acting_readable_until_ub[i] << dendl;
1247     if (acting_readable_until_ub[i] < min) {
1248       min = acting_readable_until_ub[i];
1249     }
1250   }
1251   readable_until = min;
1252   readable_until_ub = min;
1253   dout(20) << __func__ << " readable_until[_ub] " << readable_until
1254            << " (sent " << readable_until_ub_sent << ")" << dendl;
1255 }
1256
1257 bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
1258 {
1259   assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
1260   bool changed = false;
1261   auto p = prior_readable_down_osds.begin();
1262   while (p != prior_readable_down_osds.end()) {
1263     if (map->is_dead(*p)) {
1264       dout(10) << __func__ << " prior_readable_down_osds osd." << *p
1265                << " is dead as of epoch " << map->get_epoch()
1266                << dendl;
1267       p = prior_readable_down_osds.erase(p);
1268       changed = true;
1269     } else {
1270       ++p;
1271     }
1272   }
1273   if (changed && prior_readable_down_osds.empty()) {
1274     psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
1275     clear_prior_readable_until_ub();
1276     return true;
1277   }
1278   return false;
1279 }
1280
1281 bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
1282 {
1283   epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
1284   if (need_up_thru &&
1285       up_thru >= info.history.same_interval_since) {
1286     psdout(10) << "adjust_need_up_thru now "
1287                << up_thru << ", need_up_thru now false" << dendl;
1288     need_up_thru = false;
1289     return true;
1290   }
1291   return false;
1292 }
1293
1294 PastIntervals::PriorSet PeeringState::build_prior()
1295 {
1296   if (1) {
1297     // sanity check
1298     for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
1299       ceph_assert(info.history.last_epoch_started >=
1300                   it->second.history.last_epoch_started);
1301     }
1302   }
1303
1304   const OSDMap &osdmap = *get_osdmap();
1305   PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1306     pool.info.is_erasure(),
1307     info.history.last_epoch_started,
1308     &missing_loc.get_recoverable_predicate(),
1309     [&](epoch_t start, int osd, epoch_t *lost_at) {
1310       const osd_info_t *pinfo = 0;
1311       if (osdmap.exists(osd)) {
1312         pinfo = &osdmap.get_info(osd);
1313         if (lost_at)
1314           *lost_at = pinfo->lost_at;
1315       }
1316
1317       if (osdmap.is_up(osd)) {
1318         return PastIntervals::UP;
1319       } else if (!pinfo) {
1320         return PastIntervals::DNE;
1321       } else if (pinfo->lost_at > start) {
1322         return PastIntervals::LOST;
1323       } else {
1324         return PastIntervals::DOWN;
1325       }
1326     },
1327     up,
1328     acting,
1329     dpp);
1330
1331   if (prior.pg_down) {
1332     state_set(PG_STATE_DOWN);
1333   }
1334
1335   if (get_osdmap()->get_up_thru(pg_whoami.osd) <
1336       info.history.same_interval_since) {
1337     psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1338                << " < same_since " << info.history.same_interval_since
1339                << ", must notify monitor" << dendl;
1340     need_up_thru = true;
1341   } else {
1342     psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1343                << " >= same_since " << info.history.same_interval_since
1344                << ", all is well" << dendl;
1345     need_up_thru = false;
1346   }
1347   pl->set_probe_targets(prior.probe);
1348   return prior;
1349 }
1350
1351 bool PeeringState::needs_recovery() const
1352 {
1353   ceph_assert(is_primary());
1354
1355   auto &missing = pg_log.get_missing();
1356
1357   if (missing.num_missing()) {
1358     psdout(10) << __func__ << " primary has " << missing.num_missing()
1359                << " missing" << dendl;
1360     return true;
1361   }
1362
1363   ceph_assert(!acting_recovery_backfill.empty());
1364   for (const pg_shard_t& peer : acting_recovery_backfill) {
1365     if (peer == get_primary()) {
1366       continue;
1367     }
1368     auto pm = peer_missing.find(peer);
1369     if (pm == peer_missing.end()) {
1370       psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
1371                  << dendl;
1372       continue;
1373     }
1374     if (pm->second.num_missing()) {
1375       psdout(10) << __func__ << " osd." << peer << " has "
1376                  << pm->second.num_missing() << " missing" << dendl;
1377       return true;
1378     }
1379   }
1380
1381   psdout(10) << __func__ << " is recovered" << dendl;
1382   return false;
1383 }
1384
1385 bool PeeringState::needs_backfill() const
1386 {
1387   ceph_assert(is_primary());
1388
1389   // We can assume that only possible osds that need backfill
1390   // are on the backfill_targets vector nodes.
1391   for (const pg_shard_t& peer : backfill_targets) {
1392     auto pi = peer_info.find(peer);
1393     ceph_assert(pi != peer_info.end());
1394     if (!pi->second.last_backfill.is_max()) {
1395       psdout(10) << __func__ << " osd." << peer
1396                  << " has last_backfill " << pi->second.last_backfill << dendl;
1397       return true;
1398     }
1399   }
1400
1401   psdout(10) << __func__ << " does not need backfill" << dendl;
1402   return false;
1403 }
1404
1405 /*
1406  * Returns true unless there is a non-lost OSD in might_have_unfound.
1407  */
1408 bool PeeringState::all_unfound_are_queried_or_lost(
1409   const OSDMapRef osdmap) const
1410 {
1411   ceph_assert(is_primary());
1412
1413   auto peer = might_have_unfound.begin();
1414   auto mend = might_have_unfound.end();
1415   for (; peer != mend; ++peer) {
1416     if (peer_missing.count(*peer))
1417       continue;
1418     auto iter = peer_info.find(*peer);
1419     if (iter != peer_info.end() &&
1420         (iter->second.is_empty() || iter->second.dne()))
1421       continue;
1422     if (!osdmap->exists(peer->osd))
1423       continue;
1424     const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1425     if (osd_info.lost_at <= osd_info.up_from) {
1426       // If there is even one OSD in might_have_unfound that isn't lost, we
1427       // still might retrieve our unfound.
1428       return false;
1429     }
1430   }
1431   psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
1432              << might_have_unfound
1433              << " have been queried or are marked lost" << dendl;
1434   return true;
1435 }
1436
1437
1438 void PeeringState::reject_reservation()
1439 {
1440   pl->unreserve_recovery_space();
1441   pl->send_cluster_message(
1442     primary.osd,
1443     TOPNSPC::make_message<MBackfillReserve>(
1444       MBackfillReserve::REJECT_TOOFULL,
1445       spg_t(info.pgid.pgid, primary.shard),
1446       get_osdmap_epoch()),
1447     get_osdmap_epoch());
1448 }
1449
1450 /**
1451  * find_best_info
1452  *
1453  * Returns an iterator to the best info in infos sorted by:
1454  *  1) Prefer newer last_update
1455  *  2) Prefer longer tail if it brings another info into contiguity
1456  *  3) Prefer current primary
1457  */
1458 map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
1459   const map<pg_shard_t, pg_info_t> &infos,
1460   bool restrict_to_up_acting,
1461   bool *history_les_bound) const
1462 {
1463   ceph_assert(history_les_bound);
1464   /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1465    * to make changes to this process.  Also, make sure to update it
1466    * when you find bugs! */
1467   epoch_t max_last_epoch_started_found = 0;
1468   for (auto i = infos.begin(); i != infos.end(); ++i) {
1469     if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1470         max_last_epoch_started_found < i->second.history.last_epoch_started) {
1471       *history_les_bound = true;
1472       max_last_epoch_started_found = i->second.history.last_epoch_started;
1473     }
1474     if (!i->second.is_incomplete() &&
1475         max_last_epoch_started_found < i->second.last_epoch_started) {
1476       *history_les_bound = false;
1477       max_last_epoch_started_found = i->second.last_epoch_started;
1478     }
1479   }
1480   eversion_t min_last_update_acceptable = eversion_t::max();
1481   for (auto i = infos.begin(); i != infos.end(); ++i) {
1482     if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1483       if (min_last_update_acceptable > i->second.last_update)
1484         min_last_update_acceptable = i->second.last_update;
1485     }
1486   }
1487   if (min_last_update_acceptable == eversion_t::max())
1488     return infos.end();
1489
1490   auto best = infos.end();
1491   // find osd with newest last_update (oldest for ec_pool).
1492   // if there are multiples, prefer
1493   //  - a longer tail, if it brings another peer into log contiguity
1494   //  - the current primary
1495   for (auto p = infos.begin(); p != infos.end(); ++p) {
1496     if (restrict_to_up_acting && !is_up(p->first) &&
1497         !is_acting(p->first))
1498       continue;
1499     // Only consider peers with last_update >= min_last_update_acceptable
1500     if (p->second.last_update < min_last_update_acceptable)
1501       continue;
1502     // Disqualify anyone with a too old last_epoch_started
1503     if (p->second.last_epoch_started < max_last_epoch_started_found)
1504       continue;
1505     // Disqualify anyone who is incomplete (not fully backfilled)
1506     if (p->second.is_incomplete())
1507       continue;
1508     if (best == infos.end()) {
1509       best = p;
1510       continue;
1511     }
1512     // Prefer newer last_update
1513     if (pool.info.require_rollback()) {
1514       if (p->second.last_update > best->second.last_update)
1515         continue;
1516       if (p->second.last_update < best->second.last_update) {
1517         best = p;
1518         continue;
1519       }
1520     } else {
1521       if (p->second.last_update < best->second.last_update)
1522         continue;
1523       if (p->second.last_update > best->second.last_update) {
1524         best = p;
1525         continue;
1526       }
1527     }
1528
1529     // Prefer longer tail
1530     if (p->second.log_tail > best->second.log_tail) {
1531       continue;
1532     } else if (p->second.log_tail < best->second.log_tail) {
1533       best = p;
1534       continue;
1535     }
1536
1537     if (!p->second.has_missing() && best->second.has_missing()) {
1538       psdout(10) << __func__ << " prefer osd." << p->first
1539                << " because it is complete while best has missing"
1540                << dendl;
1541       best = p;
1542       continue;
1543     } else if (p->second.has_missing() && !best->second.has_missing()) {
1544       psdout(10) << __func__ << " skipping osd." << p->first
1545                << " because it has missing while best is complete"
1546                << dendl;
1547       continue;
1548     } else {
1549       // both are complete or have missing
1550       // fall through
1551     }
1552
1553     // prefer current primary (usually the caller), all things being equal
1554     if (p->first == pg_whoami) {
1555       psdout(10) << "calc_acting prefer osd." << p->first
1556                  << " because it is current primary" << dendl;
1557       best = p;
1558       continue;
1559     }
1560   }
1561   return best;
1562 }
1563
1564 void PeeringState::calc_ec_acting(
1565   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1566   unsigned size,
1567   const vector<int> &acting,
1568   const vector<int> &up,
1569   const map<pg_shard_t, pg_info_t> &all_info,
1570   bool restrict_to_up_acting,
1571   vector<int> *_want,
1572   set<pg_shard_t> *backfill,
1573   set<pg_shard_t> *acting_backfill,
1574   ostream &ss)
1575 {
1576   vector<int> want(size, CRUSH_ITEM_NONE);
1577   map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1578   for (auto i = all_info.begin();
1579        i != all_info.end();
1580        ++i) {
1581     all_info_by_shard[i->first.shard].insert(i->first);
1582   }
1583   for (uint8_t i = 0; i < want.size(); ++i) {
1584     ss << "For position " << (unsigned)i << ": ";
1585     if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1586         !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1587         all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1588         auth_log_shard->second.log_tail) {
1589       ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1590       want[i] = up[i];
1591       continue;
1592     }
1593     if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1594       ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1595          << " and ";
1596       backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1597     }
1598
1599     if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1600         !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1601         all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1602         auth_log_shard->second.log_tail) {
1603       ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1604       want[i] = acting[i];
1605     } else if (!restrict_to_up_acting) {
1606       for (auto j = all_info_by_shard[shard_id_t(i)].begin();
1607            j != all_info_by_shard[shard_id_t(i)].end();
1608            ++j) {
1609         ceph_assert(j->shard == i);
1610         if (!all_info.find(*j)->second.is_incomplete() &&
1611             all_info.find(*j)->second.last_update >=
1612             auth_log_shard->second.log_tail) {
1613           ss << " selecting stray: " << *j << std::endl;
1614           want[i] = j->osd;
1615           break;
1616         }
1617       }
1618       if (want[i] == CRUSH_ITEM_NONE)
1619         ss << " failed to fill position " << (int)i << std::endl;
1620     }
1621   }
1622
1623   for (uint8_t i = 0; i < want.size(); ++i) {
1624     if (want[i] != CRUSH_ITEM_NONE) {
1625       acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1626     }
1627   }
1628   acting_backfill->insert(backfill->begin(), backfill->end());
1629   _want->swap(want);
1630 }
1631
1632 std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
1633 PeeringState::select_replicated_primary(
1634   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1635   uint64_t force_auth_primary_missing_objects,
1636   const std::vector<int> &up,
1637   pg_shard_t up_primary,
1638   const map<pg_shard_t, pg_info_t> &all_info,
1639   const OSDMapRef osdmap,
1640   ostream &ss)
1641 {
1642   pg_shard_t auth_log_shard_id = auth_log_shard->first;
1643
1644   ss << __func__ << " newest update on osd." << auth_log_shard_id
1645      << " with " << auth_log_shard->second << std::endl;
1646
1647   // select primary
1648   auto primary = all_info.find(up_primary);
1649   if (up.size() &&
1650       !primary->second.is_incomplete() &&
1651       primary->second.last_update >=
1652         auth_log_shard->second.log_tail) {
1653     assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
1654     auto approx_missing_objects =
1655       primary->second.stats.stats.sum.num_objects_missing;
1656     auto auth_version = auth_log_shard->second.last_update.version;
1657     auto primary_version = primary->second.last_update.version;
1658     if (auth_version > primary_version) {
1659       approx_missing_objects += auth_version - primary_version;
1660     } else {
1661       approx_missing_objects += primary_version - auth_version;
1662     }
1663     if ((uint64_t)approx_missing_objects >
1664         force_auth_primary_missing_objects) {
1665       primary = auth_log_shard;
1666       ss << "up_primary: " << up_primary << ") has approximate "
1667          << approx_missing_objects
1668          << "(>" << force_auth_primary_missing_objects <<") "
1669          << "missing objects, osd." << auth_log_shard_id
1670          << " selected as primary instead"
1671          << std::endl;
1672     } else {
1673       ss << "up_primary: " << up_primary << ") selected as primary"
1674          << std::endl;
1675     }
1676   } else {
1677     ceph_assert(!auth_log_shard->second.is_incomplete());
1678     ss << "up[0] needs backfill, osd." << auth_log_shard_id
1679        << " selected as primary instead" << std::endl;
1680     primary = auth_log_shard;
1681   }
1682
1683   ss << __func__ << " primary is osd." << primary->first
1684      << " with " << primary->second << std::endl;
1685
1686   /* We include auth_log_shard->second.log_tail because in GetLog,
1687    * we will request logs back to the min last_update over our
1688    * acting_backfill set, which will result in our log being extended
1689    * as far backwards as necessary to pick up any peers which can
1690    * be log recovered by auth_log_shard's log */
1691   eversion_t oldest_auth_log_entry =
1692     std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1693
1694   return std::make_pair(primary, oldest_auth_log_entry);
1695 }
1696
1697
1698 /**
1699  * calculate the desired acting set.
1700  *
1701  * Choose an appropriate acting set.  Prefer up[0], unless it is
1702  * incomplete, or another osd has a longer tail that allows us to
1703  * bring other up nodes up to date.
1704  */
1705 void PeeringState::calc_replicated_acting(
1706   map<pg_shard_t, pg_info_t>::const_iterator primary,
1707   eversion_t oldest_auth_log_entry,
1708   unsigned size,
1709   const vector<int> &acting,
1710   const vector<int> &up,
1711   pg_shard_t up_primary,
1712   const map<pg_shard_t, pg_info_t> &all_info,
1713   bool restrict_to_up_acting,
1714   vector<int> *want,
1715   set<pg_shard_t> *backfill,
1716   set<pg_shard_t> *acting_backfill,
1717   const OSDMapRef osdmap,
1718   const PGPool& pool,
1719   ostream &ss)
1720 {
1721   ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1722      << std::endl;
1723
1724   want->push_back(primary->first.osd);
1725   acting_backfill->insert(primary->first);
1726
1727   // select replicas that have log contiguity with primary.
1728   // prefer up, then acting, then any peer_info osds
1729   for (auto i : up) {
1730     pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1731     if (up_cand == primary->first)
1732       continue;
1733     const pg_info_t &cur_info = all_info.find(up_cand)->second;
1734     if (cur_info.is_incomplete() ||
1735         cur_info.last_update < oldest_auth_log_entry) {
1736       ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1737       backfill->insert(up_cand);
1738       acting_backfill->insert(up_cand);
1739     } else {
1740       want->push_back(i);
1741       acting_backfill->insert(up_cand);
1742       ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1743     }
1744   }
1745
1746   if (want->size() >= size) {
1747     return;
1748   }
1749
1750   std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1751   candidate_by_last_update.reserve(acting.size());
1752   // This no longer has backfill OSDs, but they are covered above.
1753   for (auto i : acting) {
1754     pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1755     // skip up osds we already considered above
1756     if (acting_cand == primary->first)
1757       continue;
1758     auto up_it = find(up.begin(), up.end(), i);
1759     if (up_it != up.end())
1760       continue;
1761
1762     const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1763     if (cur_info.is_incomplete() ||
1764         cur_info.last_update < oldest_auth_log_entry) {
1765       ss << " shard " << acting_cand << " (acting) REJECTED "
1766          << cur_info << std::endl;
1767     } else {
1768       candidate_by_last_update.emplace_back(cur_info.last_update, i);
1769     }
1770   }
1771
1772   auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1773                             const std::pair<eversion_t, int> &rhs) {
1774     return lhs.first > rhs.first;
1775   };
1776   // sort by last_update, in descending order.
1777   std::sort(candidate_by_last_update.begin(),
1778             candidate_by_last_update.end(), sort_by_eversion);
1779   for (auto &p: candidate_by_last_update) {
1780     ceph_assert(want->size() < size);
1781     want->push_back(p.second);
1782     pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1783     acting_backfill->insert(s);
1784     ss << " shard " << s << " (acting) accepted "
1785        << all_info.find(s)->second << std::endl;
1786     if (want->size() >= size) {
1787       return;
1788     }
1789   }
1790
1791   if (restrict_to_up_acting) {
1792     return;
1793   }
1794   candidate_by_last_update.clear();
1795   candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1796   // continue to search stray to find more suitable peers
1797   for (auto &i : all_info) {
1798     // skip up osds we already considered above
1799     if (i.first == primary->first)
1800       continue;
1801     auto up_it = find(up.begin(), up.end(), i.first.osd);
1802     if (up_it != up.end())
1803       continue;
1804     auto acting_it = find(
1805       acting.begin(), acting.end(), i.first.osd);
1806     if (acting_it != acting.end())
1807       continue;
1808
1809     if (i.second.is_incomplete() ||
1810         i.second.last_update < oldest_auth_log_entry) {
1811       ss << " shard " << i.first << " (stray) REJECTED " << i.second
1812          << std::endl;
1813     } else {
1814       candidate_by_last_update.emplace_back(
1815         i.second.last_update, i.first.osd);
1816     }
1817   }
1818
1819   if (candidate_by_last_update.empty()) {
1820     // save us some effort
1821     return;
1822   }
1823
1824   // sort by last_update, in descending order.
1825   std::sort(candidate_by_last_update.begin(),
1826             candidate_by_last_update.end(), sort_by_eversion);
1827
1828   for (auto &p: candidate_by_last_update) {
1829     ceph_assert(want->size() < size);
1830     want->push_back(p.second);
1831     pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1832     acting_backfill->insert(s);
1833     ss << " shard " << s << " (stray) accepted "
1834        << all_info.find(s)->second << std::endl;
1835     if (want->size() >= size) {
1836       return;
1837     }
1838   }
1839 }
1840
1841 // Defines osd preference order: acting set, then larger last_update
1842 using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
1843 using osd_id_t = int;
1844
1845 class bucket_candidates_t {
1846   std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
1847   int selected = 0;
1848
1849 public:
1850   void add_osd(osd_ord_t ord, osd_id_t osd) {
1851     // osds will be added in smallest to largest order
1852     assert(osds.empty() || osds.back().first <= ord);
1853     osds.push_back(std::make_pair(ord, osd));
1854   }
1855   osd_id_t pop_osd() {
1856     ceph_assert(!is_empty());
1857     auto ret = osds.front();
1858     osds.pop_front();
1859     return ret.second;
1860   }
1861
1862   void inc_selected() { selected++; }
1863   unsigned get_num_selected() const { return selected; }
1864
1865   osd_ord_t get_ord() const {
1866     return osds.empty() ? std::make_tuple(false, eversion_t())
1867       : osds.front().first;
1868   }
1869
1870   bool is_empty() const { return osds.empty(); }
1871
1872   bool operator<(const bucket_candidates_t &rhs) const {
1873     return std::make_tuple(-selected, get_ord()) <
1874       std::make_tuple(-rhs.selected, rhs.get_ord());
1875   }
1876
1877   friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
1878 };
1879
1880 std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
1881 {
1882   return lhs << "candidates[" << cand.osds << "]";
1883 }
1884
1885 class bucket_heap_t {
1886   using elem_t = std::reference_wrapper<bucket_candidates_t>;
1887   std::vector<elem_t> heap;
1888
1889   // Max heap -- should emit buckets in order of preference
1890   struct comp {
1891     bool operator()(const elem_t &lhs, const elem_t &rhs) {
1892       return lhs.get() < rhs.get();
1893     }
1894   };
1895 public:
1896   void push_if_nonempty(elem_t e) {
1897     if (!e.get().is_empty()) {
1898       heap.push_back(e);
1899       std::push_heap(heap.begin(), heap.end(), comp());
1900     }
1901   }
1902   elem_t pop() {
1903     std::pop_heap(heap.begin(), heap.end(), comp());
1904     auto ret = heap.back();
1905     heap.pop_back();
1906     return ret;
1907   }
1908
1909   bool is_empty() const { return heap.empty(); }
1910 };
1911
1912 /**
1913  * calc_replicated_acting_stretch
1914  *
1915  * Choose an acting set using as much of the up set as possible; filling
1916  * in the remaining slots so as to maximize the number of crush buckets at
1917  * level pool.info.peering_crush_bucket_barrier represented.
1918  *
1919  * Stretch clusters are a bit special: while they have a "size" the
1920  * same way as normal pools, if we happen to lose a data center
1921  * (we call it a "stretch bucket", but really it'll be a data center or
1922  * a cloud availability zone), we don't actually want to shove
1923  * 2 DC's worth of replication into a single site -- it won't fit!
1924  * So we locally calculate a bucket_max, based
1925  * on the targeted number of stretch buckets for the pool and
1926  * its size. Then we won't pull more than bucket_max from any
1927  * given ancestor even if it leaves us undersized.
1928
1929  * There are two distinct phases: (commented below)
1930  */
1931 void PeeringState::calc_replicated_acting_stretch(
1932   map<pg_shard_t, pg_info_t>::const_iterator primary,
1933   eversion_t oldest_auth_log_entry,
1934   unsigned size,
1935   const vector<int> &acting,
1936   const vector<int> &up,
1937   pg_shard_t up_primary,
1938   const map<pg_shard_t, pg_info_t> &all_info,
1939   bool restrict_to_up_acting,
1940   vector<int> *want,
1941   set<pg_shard_t> *backfill,
1942   set<pg_shard_t> *acting_backfill,
1943   const OSDMapRef osdmap,
1944   const PGPool& pool,
1945   ostream &ss)
1946 {
1947   ceph_assert(want);
1948   ceph_assert(acting_backfill);
1949   ceph_assert(backfill);
1950   ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1951      << std::endl;
1952
1953   auto used = [want](int osd) {
1954     return std::find(want->begin(), want->end(), osd) != want->end();
1955   };
1956
1957   auto usable_info = [&](const auto &cur_info) mutable {
1958     return !(cur_info.is_incomplete() ||
1959              cur_info.last_update < oldest_auth_log_entry);
1960   };
1961
1962   auto osd_info = [&](int osd) mutable -> const pg_info_t & {
1963     pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
1964     const pg_info_t &cur_info = all_info.find(cand)->second;
1965     return cur_info;
1966   };
1967
1968   auto usable_osd = [&](int osd) mutable {
1969     return usable_info(osd_info(osd));
1970   };
1971
1972   std::map<int, bucket_candidates_t> ancestors;
1973   auto get_ancestor = [&](int osd) mutable {
1974     int ancestor = osdmap->crush->get_parent_of_type(
1975       osd,
1976       pool.info.peering_crush_bucket_barrier,
1977       pool.info.crush_rule);
1978     return &ancestors[ancestor];
1979   };
1980
1981   unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
1982   if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
1983     ++bucket_max;
1984   }
1985
1986   /* 1) Select all usable osds from the up set as well as the primary
1987    *
1988    * We also stash any unusable osds from up into backfill.
1989    */
1990   auto add_required = [&](int osd) {
1991     if (!used(osd)) {
1992       want->push_back(osd);
1993       acting_backfill->insert(
1994         pg_shard_t(osd, shard_id_t::NO_SHARD));
1995       get_ancestor(osd)->inc_selected();
1996     }
1997   };
1998   add_required(primary->first.osd);
1999   ss << " osd " << primary->first.osd << " primary accepted "
2000      << osd_info(primary->first.osd) << std::endl;
2001   for (auto upcand: up) {
2002     auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
2003     auto &curinfo = osd_info(upcand);
2004     if (usable_osd(upcand)) {
2005       ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
2006       add_required(upcand);
2007     } else {
2008       ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
2009       backfill->insert(upshard);
2010       acting_backfill->insert(upshard);
2011     }
2012   }
2013
2014   if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
2015     ss << " up set sufficient" << std::endl;
2016     return;
2017   }
2018   ss << " up set insufficient, considering remaining osds" << std::endl;
2019
2020   /* 2) Fill out remaining slots from usable osds in all_info
2021    *    while maximizing the number of ancestor nodes at the
2022    *    barrier_id crush level.
2023    */
2024   {
2025     std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
2026     /* To do this, we first filter the set of usable osd into an ordered
2027      * list of usable osds
2028      */
2029     auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
2030       return std::make_tuple(
2031         !is_acting /* acting should sort first */,
2032         info.last_update);
2033     };
2034     for (auto &cand : acting) {
2035       auto &cand_info = osd_info(cand);
2036       if (!used(cand) && usable_info(cand_info)) {
2037         ss << " acting candidate " << cand << " " << cand_info << std::endl;
2038         candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
2039       }
2040     }
2041     if (!restrict_to_up_acting) {
2042       for (auto &[cand, info] : all_info) {
2043         if (!used(cand.osd) && usable_info(info) &&
2044             (std::find(acting.begin(), acting.end(), cand.osd)
2045              == acting.end())) {
2046           ss << " other candidate " << cand << " " << info << std::endl;
2047           candidates.push_back(
2048             std::make_pair(get_osd_ord(false, info), cand.osd));
2049         }
2050       }
2051     }
2052     std::sort(candidates.begin(), candidates.end());
2053
2054     // We then filter these candidates by ancestor
2055     std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
2056       get_ancestor(cand.second)->add_osd(cand.first, cand.second);
2057     });
2058   }
2059
2060   auto pop_ancestor = [&](auto &ancestor) {
2061     ceph_assert(!ancestor.is_empty());
2062     auto osd = ancestor.pop_osd();
2063
2064     ss << " accepting candidate " << osd << std::endl;
2065
2066     ceph_assert(!used(osd));
2067     ceph_assert(usable_osd(osd));
2068
2069     want->push_back(osd);
2070     acting_backfill->insert(
2071       pg_shard_t(osd, shard_id_t::NO_SHARD));
2072     ancestor.inc_selected();
2073   };
2074
2075   /* Next, we use the ancestors map to grab a descendant of the
2076    * peering_crush_mandatory_member if not already represented.
2077    *
2078    * TODO: using 0 here to match other users.  Prior to merge, I
2079    * expect that this and other users should instead check against
2080    * CRUSH_ITEM_NONE.
2081    */
2082   if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
2083     auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
2084     if (aiter != ancestors.end() &&
2085         !aiter->second.get_num_selected()) {
2086       ss << " adding required ancestor " << aiter->first << std::endl;
2087       ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
2088       pop_ancestor(aiter->second);
2089     }
2090   }
2091
2092   /* We then place the ancestors in a heap ordered by fewest selected
2093    * and then by the ordering token of the next osd */
2094   bucket_heap_t aheap;
2095   std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
2096     aheap.push_if_nonempty(anc.second);
2097   });
2098
2099   /* and pull from this heap until it's empty or we have enough.
2100    * "We have enough" is a sufficient check here for
2101    * stretch_set_can_peer() because our heap sorting always
2102    * pulls from ancestors with the least number of included OSDs,
2103    * so if it is possible to satisfy the bucket_count constraints we
2104    * will do so.
2105    */
2106   while (!aheap.is_empty() && want->size() < pool.info.size) {
2107     auto next = aheap.pop();
2108     pop_ancestor(next.get());
2109     if (next.get().get_num_selected() < bucket_max) {
2110       aheap.push_if_nonempty(next);
2111     }
2112   }
2113
2114   /* The end result is that we should have as many buckets covered as
2115    * possible while respecting up, the primary selection,
2116    * the pool size (given bucket count constraints),
2117    * and the mandatory member.
2118    */
2119 }
2120
2121
2122 bool PeeringState::recoverable(const vector<int> &want) const
2123 {
2124   unsigned num_want_acting = 0;
2125   set<pg_shard_t> have;
2126   for (int i = 0; i < (int)want.size(); ++i) {
2127     if (want[i] != CRUSH_ITEM_NONE) {
2128       ++num_want_acting;
2129       have.insert(
2130         pg_shard_t(
2131           want[i],
2132           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2133     }
2134   }
2135
2136   if (num_want_acting < pool.info.min_size) {
2137     if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
2138       psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
2139       return false;
2140     }
2141   }
2142   if (missing_loc.get_recoverable_predicate()(have)) {
2143     return true;
2144   } else {
2145     psdout(10) << __func__ << " failed, not recoverable " << dendl;
2146     return false;
2147   }
2148 }
2149
2150 void PeeringState::choose_async_recovery_ec(
2151   const map<pg_shard_t, pg_info_t> &all_info,
2152   const pg_info_t &auth_info,
2153   vector<int> *want,
2154   set<pg_shard_t> *async_recovery,
2155   const OSDMapRef osdmap) const
2156 {
2157   set<pair<int, pg_shard_t> > candidates_by_cost;
2158   for (uint8_t i = 0; i < want->size(); ++i) {
2159     if ((*want)[i] == CRUSH_ITEM_NONE)
2160       continue;
2161
2162     // Considering log entries to recover is accurate enough for
2163     // now. We could use minimum_to_decode_with_cost() later if
2164     // necessary.
2165     pg_shard_t shard_i((*want)[i], shard_id_t(i));
2166     // do not include strays
2167     if (stray_set.find(shard_i) != stray_set.end())
2168       continue;
2169     // Do not include an osd that is not up, since choosing it as
2170     // an async_recovery_target will move it out of the acting set.
2171     // This results in it being identified as a stray during peering,
2172     // because it is no longer in the up or acting set.
2173     if (!is_up(shard_i))
2174       continue;
2175     auto shard_info = all_info.find(shard_i)->second;
2176     // for ec pools we rollback all entries past the authoritative
2177     // last_update *before* activation. This is relatively inexpensive
2178     // compared to recovery, since it is purely local, so treat shards
2179     // past the authoritative last_update the same as those equal to it.
2180     version_t auth_version = auth_info.last_update.version;
2181     version_t candidate_version = shard_info.last_update.version;
2182     assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
2183     auto approx_missing_objects =
2184       shard_info.stats.stats.sum.num_objects_missing;
2185     if (auth_version > candidate_version) {
2186       approx_missing_objects += auth_version - candidate_version;
2187     }
2188     if (static_cast<uint64_t>(approx_missing_objects) >
2189        cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2190       candidates_by_cost.emplace(approx_missing_objects, shard_i);
2191     }
2192   }
2193
2194   psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2195              << dendl;
2196
2197   // take out as many osds as we can for async recovery, in order of cost
2198   for (auto rit = candidates_by_cost.rbegin();
2199        rit != candidates_by_cost.rend(); ++rit) {
2200     pg_shard_t cur_shard = rit->second;
2201     vector<int> candidate_want(*want);
2202     candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
2203     if (recoverable(candidate_want)) {
2204       want->swap(candidate_want);
2205       async_recovery->insert(cur_shard);
2206     }
2207   }
2208   psdout(20) << __func__ << " result want=" << *want
2209              << " async_recovery=" << *async_recovery << dendl;
2210 }
2211
2212 void PeeringState::choose_async_recovery_replicated(
2213   const map<pg_shard_t, pg_info_t> &all_info,
2214   const pg_info_t &auth_info,
2215   vector<int> *want,
2216   set<pg_shard_t> *async_recovery,
2217   const OSDMapRef osdmap) const
2218 {
2219   set<pair<int, pg_shard_t> > candidates_by_cost;
2220   for (auto osd_num : *want) {
2221     pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
2222     // do not include strays
2223     if (stray_set.find(shard_i) != stray_set.end())
2224       continue;
2225     // Do not include an osd that is not up, since choosing it as
2226     // an async_recovery_target will move it out of the acting set.
2227     // This results in it being identified as a stray during peering,
2228     // because it is no longer in the up or acting set.
2229     if (!is_up(shard_i))
2230       continue;
2231     auto shard_info = all_info.find(shard_i)->second;
2232     // use the approximate magnitude of the difference in length of
2233     // logs plus historical missing objects as the cost of recovery
2234     version_t auth_version = auth_info.last_update.version;
2235     version_t candidate_version = shard_info.last_update.version;
2236     assert(HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS));
2237     auto approx_missing_objects =
2238       shard_info.stats.stats.sum.num_objects_missing;
2239     if (auth_version > candidate_version) {
2240       approx_missing_objects += auth_version - candidate_version;
2241     } else {
2242       approx_missing_objects += candidate_version - auth_version;
2243     }
2244     if (static_cast<uint64_t>(approx_missing_objects)  >
2245        cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2246       candidates_by_cost.emplace(approx_missing_objects, shard_i);
2247     }
2248   }
2249
2250   psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2251              << dendl;
2252   // take out as many osds as we can for async recovery, in order of cost
2253   for (auto rit = candidates_by_cost.rbegin();
2254        rit != candidates_by_cost.rend(); ++rit) {
2255     if (want->size() <= pool.info.min_size) {
2256       break;
2257     }
2258     pg_shard_t cur_shard = rit->second;
2259     vector<int> candidate_want(*want);
2260     for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
2261       if (*it == cur_shard.osd) {
2262         candidate_want.erase(it);
2263         if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
2264           // if we're in stretch mode, we can only remove the osd if it doesn't
2265           // break peering limits.
2266           want->swap(candidate_want);
2267           async_recovery->insert(cur_shard);
2268         }
2269         break;
2270       }
2271     }
2272   }
2273
2274   psdout(20) << __func__ << " result want=" << *want
2275              << " async_recovery=" << *async_recovery << dendl;
2276 }
2277
2278 /**
2279  * choose acting
2280  *
2281  * calculate the desired acting, and request a change with the monitor
2282  * if it differs from the current acting.
2283  *
2284  * if restrict_to_up_acting=true, we filter out anything that's not in
2285  * up/acting.  in order to lift this restriction, we need to
2286  *  1) check whether it's worth switching the acting set any time we get
2287  *     a new pg info (not just here, when recovery finishes)
2288  *  2) check whether anything in want_acting went down on each new map
2289  *     (and, if so, calculate a new want_acting)
2290  *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2291  * TODO!
2292  */
2293 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2294                                  bool restrict_to_up_acting,
2295                                  bool *history_les_bound,
2296                                  bool request_pg_temp_change_only)
2297 {
2298   map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2299   all_info[pg_whoami] = info;
2300
2301   if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2302     for (auto p = all_info.begin(); p != all_info.end(); ++p) {
2303       psdout(10) << __func__ << " all_info osd." << p->first << " "
2304                  << p->second << dendl;
2305     }
2306   }
2307
2308   auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
2309                                        history_les_bound);
2310
2311   if (auth_log_shard == all_info.end()) {
2312     if (up != acting) {
2313       psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2314                  << " reverting to up" << dendl;
2315       want_acting = up;
2316       vector<int> empty;
2317       pl->queue_want_pg_temp(empty);
2318     } else {
2319       psdout(10) << __func__ << " failed" << dendl;
2320       ceph_assert(want_acting.empty());
2321     }
2322     return false;
2323   }
2324
2325   ceph_assert(!auth_log_shard->second.is_incomplete());
2326   auth_log_shard_id = auth_log_shard->first;
2327
2328   set<pg_shard_t> want_backfill, want_acting_backfill;
2329   vector<int> want;
2330   stringstream ss;
2331   if (pool.info.is_replicated()) {
2332     auto [primary_shard, oldest_log] = select_replicated_primary(
2333       auth_log_shard,
2334       cct->_conf.get_val<uint64_t>(
2335         "osd_force_auth_primary_missing_objects"),
2336       up,
2337       up_primary,
2338       all_info,
2339       get_osdmap(),
2340       ss);
2341     if (pool.info.is_stretch_pool()) {
2342       calc_replicated_acting_stretch(
2343         primary_shard,
2344         oldest_log,
2345         get_osdmap()->get_pg_size(info.pgid.pgid),
2346         acting,
2347         up,
2348         up_primary,
2349         all_info,
2350         restrict_to_up_acting,
2351         &want,
2352         &want_backfill,
2353         &want_acting_backfill,
2354         get_osdmap(),
2355         pool,
2356         ss);
2357     } else {
2358       calc_replicated_acting(
2359         primary_shard,
2360         oldest_log,
2361         get_osdmap()->get_pg_size(info.pgid.pgid),
2362         acting,
2363         up,
2364         up_primary,
2365         all_info,
2366         restrict_to_up_acting,
2367         &want,
2368         &want_backfill,
2369         &want_acting_backfill,
2370         get_osdmap(),
2371         pool,
2372         ss);
2373     }
2374   } else {
2375     calc_ec_acting(
2376       auth_log_shard,
2377       get_osdmap()->get_pg_size(info.pgid.pgid),
2378       acting,
2379       up,
2380       all_info,
2381       restrict_to_up_acting,
2382       &want,
2383       &want_backfill,
2384       &want_acting_backfill,
2385       ss);
2386   }
2387   psdout(10) << ss.str() << dendl;
2388
2389   if (!recoverable(want)) {
2390     want_acting.clear();
2391     return false;
2392   }
2393
2394   set<pg_shard_t> want_async_recovery;
2395   if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2396     if (pool.info.is_erasure()) {
2397       choose_async_recovery_ec(
2398         all_info, auth_log_shard->second, &want, &want_async_recovery,
2399         get_osdmap());
2400     } else {
2401       choose_async_recovery_replicated(
2402         all_info, auth_log_shard->second, &want, &want_async_recovery,
2403         get_osdmap());
2404     }
2405   }
2406   while (want.size() > pool.info.size) {
2407     // async recovery should have taken out as many osds as it can.
2408     // if not, then always evict the last peer
2409     // (will get synchronously recovered later)
2410     psdout(10) << __func__ << " evicting osd." << want.back()
2411                << " from oversized want " << want << dendl;
2412     want.pop_back();
2413   }
2414   if (want != acting) {
2415     psdout(10) << __func__ << " want " << want << " != acting " << acting
2416                << ", requesting pg_temp change" << dendl;
2417     want_acting = want;
2418
2419     if (!cct->_conf->osd_debug_no_acting_change) {
2420       if (want_acting == up) {
2421         // There can't be any pending backfill if
2422         // want is the same as crush map up OSDs.
2423         ceph_assert(want_backfill.empty());
2424         vector<int> empty;
2425         pl->queue_want_pg_temp(empty);
2426       } else
2427         pl->queue_want_pg_temp(want);
2428     }
2429     return false;
2430   }
2431
2432   if (request_pg_temp_change_only)
2433     return true;
2434   want_acting.clear();
2435   acting_recovery_backfill = want_acting_backfill;
2436   psdout(10) << "acting_recovery_backfill is "
2437              << acting_recovery_backfill << dendl;
2438   ceph_assert(
2439     backfill_targets.empty() ||
2440     backfill_targets == want_backfill);
2441   if (backfill_targets.empty()) {
2442     // Caller is GetInfo
2443     backfill_targets = want_backfill;
2444   }
2445   // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2446   ceph_assert(
2447     async_recovery_targets.empty() ||
2448     async_recovery_targets == want_async_recovery ||
2449     !needs_recovery());
2450   if (async_recovery_targets.empty() || !needs_recovery()) {
2451     async_recovery_targets = want_async_recovery;
2452   }
2453   // Will not change if already set because up would have had to change
2454   // Verify that nothing in backfill is in stray_set
2455   for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
2456     ceph_assert(stray_set.find(*i) == stray_set.end());
2457   }
2458   psdout(10) << "choose_acting want=" << want << " backfill_targets="
2459            << want_backfill << " async_recovery_targets="
2460            << async_recovery_targets << dendl;
2461   return true;
2462 }
2463
2464 void PeeringState::log_weirdness()
2465 {
2466   if (pg_log.get_tail() != info.log_tail)
2467     pl->get_clog_error() << info.pgid
2468                            << " info mismatch, log.tail " << pg_log.get_tail()
2469                            << " != info.log_tail " << info.log_tail;
2470   if (pg_log.get_head() != info.last_update)
2471     pl->get_clog_error() << info.pgid
2472                            << " info mismatch, log.head " << pg_log.get_head()
2473                            << " != info.last_update " << info.last_update;
2474
2475   if (!pg_log.get_log().empty()) {
2476     // sloppy check
2477     if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
2478       pl->get_clog_error() << info.pgid
2479                              << " log bound mismatch, info (tail,head] ("
2480                              << pg_log.get_tail() << ","
2481                              << pg_log.get_head() << "]"
2482                              << " actual ["
2483                              << pg_log.get_log().log.begin()->version << ","
2484                              << pg_log.get_log().log.rbegin()->version << "]";
2485   }
2486
2487   if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2488     pl->get_clog_error() << info.pgid
2489                            << " caller_ops.size "
2490                            << pg_log.get_log().caller_ops.size()
2491                            << " > log size " << pg_log.get_log().log.size();
2492   }
2493 }
2494
2495 /*
2496  * Process information from a replica to determine if it could have any
2497  * objects that i need.
2498  *
2499  * TODO: if the missing set becomes very large, this could get expensive.
2500  * Instead, we probably want to just iterate over our unfound set.
2501  */
2502 bool PeeringState::search_for_missing(
2503   const pg_info_t &oinfo, const pg_missing_t &omissing,
2504   pg_shard_t from,
2505   PeeringCtxWrapper &ctx)
2506 {
2507   uint64_t num_unfound_before = missing_loc.num_unfound();
2508   bool found_missing = missing_loc.add_source_info(
2509     from, oinfo, omissing, ctx.handle);
2510   if (found_missing && num_unfound_before != missing_loc.num_unfound())
2511     pl->publish_stats_to_osd();
2512   // avoid doing this if the peer is empty.  This is abit of paranoia
2513   // to avoid doing something rash if add_source_info() above
2514   // incorrectly decided we found something new. (if the peer has
2515   // last_update=0'0 that's impossible.)
2516   if (found_missing &&
2517       oinfo.last_update != eversion_t()) {
2518     pg_info_t tinfo(oinfo);
2519     tinfo.pgid.shard = pg_whoami.shard;
2520     ctx.send_info(
2521       from.osd,
2522       spg_t(info.pgid.pgid, from.shard),
2523       get_osdmap_epoch(),  // fixme: use lower epoch?
2524       get_osdmap_epoch(),
2525       tinfo);
2526   }
2527   return found_missing;
2528 }
2529
2530 bool PeeringState::discover_all_missing(
2531   BufferedRecoveryMessages &rctx)
2532 {
2533   auto &missing = pg_log.get_missing();
2534   uint64_t unfound = get_num_unfound();
2535   bool any = false;  // did we start any queries
2536
2537   psdout(10) << __func__ << " "
2538              << missing.num_missing() << " missing, "
2539              << unfound << " unfound"
2540              << dendl;
2541
2542   auto m = might_have_unfound.begin();
2543   auto mend = might_have_unfound.end();
2544   for (; m != mend; ++m) {
2545     pg_shard_t peer(*m);
2546
2547     if (!get_osdmap()->is_up(peer.osd)) {
2548       psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2549       continue;
2550     }
2551
2552     if (peer_purged.count(peer)) {
2553       psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2554       continue;
2555     }
2556
2557     auto iter = peer_info.find(peer);
2558     if (iter != peer_info.end() &&
2559         (iter->second.is_empty() || iter->second.dne())) {
2560       // ignore empty peers
2561       continue;
2562     }
2563
2564     // If we've requested any of this stuff, the pg_missing_t information
2565     // should be on its way.
2566     // TODO: coalsce requested_* into a single data structure
2567     if (peer_missing.find(peer) != peer_missing.end()) {
2568       psdout(20) << __func__ << ": osd." << peer
2569                  << ": we already have pg_missing_t" << dendl;
2570       continue;
2571     }
2572     if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2573       psdout(20) << __func__ << ": osd." << peer
2574                  << ": in peer_log_requested" << dendl;
2575       continue;
2576     }
2577     if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2578       psdout(20) << __func__ << ": osd." << peer
2579                  << ": in peer_missing_requested" << dendl;
2580       continue;
2581     }
2582
2583     // Request missing
2584     psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2585                << dendl;
2586     peer_missing_requested.insert(peer);
2587     rctx.send_query(
2588       peer.osd,
2589       spg_t(info.pgid.pgid, peer.shard),
2590       pg_query_t(
2591         pg_query_t::FULLLOG,
2592         peer.shard, pg_whoami.shard,
2593         info.history, get_osdmap_epoch()));
2594     any = true;
2595   }
2596   return any;
2597 }
2598
2599 /* Build the might_have_unfound set.
2600  *
2601  * This is used by the primary OSD during recovery.
2602  *
2603  * This set tracks the OSDs which might have unfound objects that the primary
2604  * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2605  * will remove the OSD from the set.
2606  */
2607 void PeeringState::build_might_have_unfound()
2608 {
2609   ceph_assert(might_have_unfound.empty());
2610   ceph_assert(is_primary());
2611
2612   psdout(10) << __func__ << dendl;
2613
2614   check_past_interval_bounds();
2615
2616   might_have_unfound = past_intervals.get_might_have_unfound(
2617     pg_whoami,
2618     pool.info.is_erasure());
2619
2620   // include any (stray) peers
2621   for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
2622     might_have_unfound.insert(p->first);
2623
2624   psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2625 }
2626
2627 void PeeringState::activate(
2628   ObjectStore::Transaction& t,
2629   epoch_t activation_epoch,
2630   PeeringCtxWrapper &ctx)
2631 {
2632   ceph_assert(!is_peered());
2633
2634   // twiddle pg state
2635   state_clear(PG_STATE_DOWN);
2636
2637   send_notify = false;
2638
2639   if (is_primary()) {
2640     // only update primary last_epoch_started if we will go active
2641     if (acting_set_writeable()) {
2642       ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2643              info.last_epoch_started <= activation_epoch);
2644       info.last_epoch_started = activation_epoch;
2645       info.last_interval_started = info.history.same_interval_since;
2646     }
2647   } else if (is_acting(pg_whoami)) {
2648     /* update last_epoch_started on acting replica to whatever the primary sent
2649      * unless it's smaller (could happen if we are going peered rather than
2650      * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2651     if (info.last_epoch_started < activation_epoch) {
2652       info.last_epoch_started = activation_epoch;
2653       info.last_interval_started = info.history.same_interval_since;
2654     }
2655   }
2656
2657   auto &missing = pg_log.get_missing();
2658
2659   min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
2660   if (is_primary()) {
2661     last_update_ondisk = info.last_update;
2662   }
2663   last_update_applied = info.last_update;
2664   last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2665
2666   need_up_thru = false;
2667
2668   // write pg info, log
2669   dirty_info = true;
2670   dirty_big_info = true; // maybe
2671
2672   pl->schedule_event_on_commit(
2673     t,
2674     std::make_shared<PGPeeringEvent>(
2675       get_osdmap_epoch(),
2676       get_osdmap_epoch(),
2677       ActivateCommitted(
2678         get_osdmap_epoch(),
2679         activation_epoch)));
2680
2681   // init complete pointer
2682   if (missing.num_missing() == 0) {
2683     psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2684              << " -> " << info.last_update << dendl;
2685     info.last_complete = info.last_update;
2686     info.stats.stats.sum.num_objects_missing = 0;
2687     pg_log.reset_recovery_pointers();
2688   } else {
2689     psdout(10) << "activate - not complete, " << missing << dendl;
2690     info.stats.stats.sum.num_objects_missing = missing.num_missing();
2691     pg_log.activate_not_complete(info);
2692   }
2693
2694   log_weirdness();
2695
2696   if (is_primary()) {
2697     // initialize snap_trimq
2698     interval_set<snapid_t> to_trim;
2699     auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2700     auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2701     if (p != removed_snaps_queue.end()) {
2702       dout(20) << "activate - purged_snaps " << info.purged_snaps
2703                << " removed_snaps " << p->second
2704                << dendl;
2705       for (auto q : p->second) {
2706         to_trim.insert(q.first, q.second);
2707       }
2708     }
2709     interval_set<snapid_t> purged;
2710     purged.intersection_of(to_trim, info.purged_snaps);
2711     to_trim.subtract(purged);
2712
2713     assert(HAVE_FEATURE(upacting_features, SERVER_OCTOPUS));
2714     renew_lease(pl->get_mnow());
2715     // do not schedule until we are actually activated
2716
2717     // adjust purged_snaps: PG may have been inactive while snaps were pruned
2718     // from the removed_snaps_queue in the osdmap.  update local purged_snaps
2719     // reflect only those snaps that we thought were pruned and were still in
2720     // the queue.
2721     info.purged_snaps.swap(purged);
2722
2723     // start up replicas
2724     if (prior_readable_down_osds.empty()) {
2725       dout(10) << __func__ << " no prior_readable_down_osds to wait on, clearing ub"
2726                << dendl;
2727       clear_prior_readable_until_ub();
2728     }
2729     info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2730                                                  prior_readable_until_ub);
2731
2732     ceph_assert(!acting_recovery_backfill.empty());
2733     for (auto i = acting_recovery_backfill.begin();
2734          i != acting_recovery_backfill.end();
2735          ++i) {
2736       if (*i == pg_whoami) continue;
2737       pg_shard_t peer = *i;
2738       ceph_assert(peer_info.count(peer));
2739       pg_info_t& pi = peer_info[peer];
2740
2741       psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2742
2743       #if defined(WITH_SEASTAR)
2744       MURef<MOSDPGLog> m;
2745       #else
2746       MRef<MOSDPGLog> m;
2747       #endif
2748       ceph_assert(peer_missing.count(peer));
2749       pg_missing_t& pm = peer_missing[peer];
2750
2751       bool needs_past_intervals = pi.dne();
2752
2753       // Save num_bytes for backfill reservation request, can't be negative
2754       peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2755
2756       if (pi.last_update == info.last_update) {
2757         // empty log
2758         if (!pi.last_backfill.is_max())
2759           pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2760                                 << peer
2761                                 << " from (" << pi.log_tail << "," << pi.last_update
2762                                 << "] " << pi.last_backfill
2763                                 << " to " << info.last_update;
2764         if (!pi.is_empty()) {
2765           psdout(10) << "activate peer osd." << peer
2766                      << " is up to date, queueing in pending_activators" << dendl;
2767           ctx.send_info(
2768             peer.osd,
2769             spg_t(info.pgid.pgid, peer.shard),
2770             get_osdmap_epoch(), // fixme: use lower epoch?
2771             get_osdmap_epoch(),
2772             info,
2773             get_lease());
2774         } else {
2775           psdout(10) << "activate peer osd." << peer
2776                      << " is up to date, but sending pg_log anyway" << dendl;
2777           m = TOPNSPC::make_message<MOSDPGLog>(
2778             i->shard, pg_whoami.shard,
2779             get_osdmap_epoch(), info,
2780             last_peering_reset);
2781         }
2782       } else if (
2783         pg_log.get_tail() > pi.last_update ||
2784         pi.last_backfill == hobject_t() ||
2785         (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2786         /* ^ This last case covers a situation where a replica is not contiguous
2787          * with the auth_log, but is contiguous with this replica.  Reshuffling
2788          * the active set to handle this would be tricky, so instead we just go
2789          * ahead and backfill it anyway.  This is probably preferrable in any
2790          * case since the replica in question would have to be significantly
2791          * behind.
2792          */
2793         // backfill
2794         pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2795                                << " from (" << pi.log_tail << "," << pi.last_update
2796                                << "] " << pi.last_backfill
2797                                << " to " << info.last_update;
2798
2799         pi.last_update = info.last_update;
2800         pi.last_complete = info.last_update;
2801         pi.set_last_backfill(hobject_t());
2802         pi.last_epoch_started = info.last_epoch_started;
2803         pi.last_interval_started = info.last_interval_started;
2804         pi.history = info.history;
2805         pi.hit_set = info.hit_set;
2806         pi.stats.stats.clear();
2807         pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2808
2809         // initialize peer with our purged_snaps.
2810         pi.purged_snaps = info.purged_snaps;
2811
2812         m = TOPNSPC::make_message<MOSDPGLog>(
2813           i->shard, pg_whoami.shard,
2814           get_osdmap_epoch(), pi,
2815           last_peering_reset /* epoch to create pg at */);
2816
2817         // send some recent log, so that op dup detection works well.
2818         m->log.copy_up_to(cct, pg_log.get_log(),
2819                           cct->_conf->osd_max_pg_log_entries);
2820         m->info.log_tail = m->log.tail;
2821         pi.log_tail = m->log.tail;  // sigh...
2822
2823         pm.clear();
2824       } else {
2825         // catch up
2826         ceph_assert(pg_log.get_tail() <= pi.last_update);
2827         m = TOPNSPC::make_message<MOSDPGLog>(
2828           i->shard, pg_whoami.shard,
2829           get_osdmap_epoch(), info,
2830           last_peering_reset /* epoch to create pg at */);
2831         // send new stuff to append to replicas log
2832         m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2833       }
2834
2835       // share past_intervals if we are creating the pg on the replica
2836       // based on whether our info for that peer was dne() *before*
2837       // updating pi.history in the backfill block above.
2838       if (m && needs_past_intervals)
2839         m->past_intervals = past_intervals;
2840
2841       // update local version of peer's missing list!
2842       if (m && pi.last_backfill != hobject_t()) {
2843         for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
2844           if (p->soid <= pi.last_backfill &&
2845               !p->is_error()) {
2846             if (perform_deletes_during_peering() && p->is_delete()) {
2847               pm.rm(p->soid, p->version);
2848             } else {
2849               pm.add_next_event(*p);
2850             }
2851           }
2852         }
2853       }
2854
2855       if (m) {
2856         dout(10) << "activate peer osd." << peer << " sending " << m->log
2857                  << dendl;
2858         m->lease = get_lease();
2859         pl->send_cluster_message(peer.osd, std::move(m), get_osdmap_epoch());
2860       }
2861
2862       // peer now has
2863       pi.last_update = info.last_update;
2864
2865       // update our missing
2866       if (pm.num_missing() == 0) {
2867         pi.last_complete = pi.last_update;
2868         psdout(10) << "activate peer osd." << peer << " " << pi
2869                    << " uptodate" << dendl;
2870       } else {
2871         psdout(10) << "activate peer osd." << peer << " " << pi
2872                    << " missing " << pm << dendl;
2873       }
2874     }
2875
2876     // Set up missing_loc
2877     set<pg_shard_t> complete_shards;
2878     for (auto i = acting_recovery_backfill.begin();
2879          i != acting_recovery_backfill.end();
2880          ++i) {
2881       psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2882                  << " " << dendl;
2883       if (*i == get_primary()) {
2884         missing_loc.add_active_missing(missing);
2885         if (!missing.have_missing())
2886           complete_shards.insert(*i);
2887       } else {
2888         auto peer_missing_entry = peer_missing.find(*i);
2889         ceph_assert(peer_missing_entry != peer_missing.end());
2890         missing_loc.add_active_missing(peer_missing_entry->second);
2891         if (!peer_missing_entry->second.have_missing() &&
2892             peer_info[*i].last_backfill.is_max())
2893           complete_shards.insert(*i);
2894       }
2895     }
2896
2897     // If necessary, create might_have_unfound to help us find our unfound objects.
2898     // NOTE: It's important that we build might_have_unfound before trimming the
2899     // past intervals.
2900     might_have_unfound.clear();
2901     if (needs_recovery()) {
2902       // If only one shard has missing, we do a trick to add all others as recovery
2903       // source, this is considered safe since the PGLogs have been merged locally,
2904       // and covers vast majority of the use cases, like one OSD/host is down for
2905       // a while for hardware repairing
2906       if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2907         missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2908       } else {
2909         missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2910                                     ctx.handle);
2911         for (auto i = acting_recovery_backfill.begin();
2912              i != acting_recovery_backfill.end();
2913              ++i) {
2914           if (*i == pg_whoami) continue;
2915           psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2916           ceph_assert(peer_missing.count(*i));
2917           ceph_assert(peer_info.count(*i));
2918           missing_loc.add_source_info(
2919             *i,
2920             peer_info[*i],
2921             peer_missing[*i],
2922             ctx.handle);
2923         }
2924       }
2925       for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
2926         if (is_acting_recovery_backfill(i->first))
2927           continue;
2928         ceph_assert(peer_info.count(i->first));
2929         search_for_missing(
2930           peer_info[i->first],
2931           i->second,
2932           i->first,
2933           ctx);
2934       }
2935
2936       build_might_have_unfound();
2937
2938       // Always call now so update_calc_stats() will be accurate
2939       discover_all_missing(ctx.msgs);
2940
2941     }
2942
2943     // num_objects_degraded if calculated should reflect this too, unless no
2944     // missing and we are about to go clean.
2945     if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2946       state_set(PG_STATE_UNDERSIZED);
2947     }
2948
2949     state_set(PG_STATE_ACTIVATING);
2950     pl->on_activate(std::move(to_trim));
2951   }
2952   if (acting_set_writeable()) {
2953     PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2954     pg_log.roll_forward(rollbacker.get());
2955   }
2956 }
2957
2958 void PeeringState::share_pg_info()
2959 {
2960   psdout(10) << "share_pg_info" << dendl;
2961
2962   info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2963                                                prior_readable_until_ub);
2964
2965   // share new pg_info_t with replicas
2966   ceph_assert(!acting_recovery_backfill.empty());
2967   for (auto pg_shard : acting_recovery_backfill) {
2968     if (pg_shard == pg_whoami) continue;
2969     if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2970       peer->second.last_epoch_started = info.last_epoch_started;
2971       peer->second.last_interval_started = info.last_interval_started;
2972       peer->second.history.merge(info.history);
2973     }
2974     auto m = TOPNSPC::make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
2975                           info,
2976                           get_osdmap_epoch(),
2977                           get_osdmap_epoch(),
2978                           std::optional<pg_lease_t>{get_lease()},
2979                           std::nullopt);
2980     pl->send_cluster_message(pg_shard.osd, std::move(m), get_osdmap_epoch());
2981   }
2982 }
2983
2984 void PeeringState::merge_log(
2985   ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog,
2986   pg_shard_t from)
2987 {
2988   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2989   pg_log.merge_log(
2990     oinfo, std::move(olog), from, info, rollbacker.get(),
2991     dirty_info, dirty_big_info);
2992 }
2993
2994 void PeeringState::rewind_divergent_log(
2995   ObjectStore::Transaction& t, eversion_t newhead)
2996 {
2997   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2998   pg_log.rewind_divergent_log(
2999     newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
3000 }
3001
3002
3003 void PeeringState::proc_primary_info(
3004   ObjectStore::Transaction &t, const pg_info_t &oinfo)
3005 {
3006   ceph_assert(!is_primary());
3007
3008   update_history(oinfo.history);
3009   if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
3010     info.stats.stats.sum.num_scrub_errors = 0;
3011     info.stats.stats.sum.num_shallow_scrub_errors = 0;
3012     info.stats.stats.sum.num_deep_scrub_errors = 0;
3013     dirty_info = true;
3014   }
3015
3016   if (!(info.purged_snaps == oinfo.purged_snaps)) {
3017     psdout(10) << __func__ << " updating purged_snaps to "
3018                << oinfo.purged_snaps
3019                << dendl;
3020     info.purged_snaps = oinfo.purged_snaps;
3021     dirty_info = true;
3022     dirty_big_info = true;
3023   }
3024 }
3025
3026 void PeeringState::proc_master_log(
3027   ObjectStore::Transaction& t, pg_info_t &oinfo,
3028   pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
3029 {
3030   psdout(10) << "proc_master_log for osd." << from << ": "
3031              << olog << " " << omissing << dendl;
3032   ceph_assert(!is_peered() && is_primary());
3033
3034   // merge log into our own log to build master log.  no need to
3035   // make any adjustments to their missing map; we are taking their
3036   // log to be authoritative (i.e., their entries are by definitely
3037   // non-divergent).
3038   merge_log(t, oinfo, std::move(olog), from);
3039   peer_info[from] = oinfo;
3040   psdout(10) << " peer osd." << from << " now " << oinfo
3041              << " " << omissing << dendl;
3042   might_have_unfound.insert(from);
3043
3044   // See doc/dev/osd_internals/last_epoch_started
3045   if (oinfo.last_epoch_started > info.last_epoch_started) {
3046     info.last_epoch_started = oinfo.last_epoch_started;
3047     dirty_info = true;
3048   }
3049   if (oinfo.last_interval_started > info.last_interval_started) {
3050     info.last_interval_started = oinfo.last_interval_started;
3051     dirty_info = true;
3052   }
3053   update_history(oinfo.history);
3054   ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
3055          info.last_epoch_started >= info.history.last_epoch_started);
3056
3057   peer_missing[from].claim(std::move(omissing));
3058 }
3059
3060 void PeeringState::proc_replica_log(
3061   pg_info_t &oinfo,
3062   const pg_log_t &olog,
3063   pg_missing_t&& omissing,
3064   pg_shard_t from)
3065 {
3066   psdout(10) << "proc_replica_log for osd." << from << ": "
3067              << oinfo << " " << olog << " " << omissing << dendl;
3068
3069   pg_log.proc_replica_log(oinfo, olog, omissing, from);
3070
3071   peer_info[from] = oinfo;
3072   psdout(10) << " peer osd." << from << " now "
3073              << oinfo << " " << omissing << dendl;
3074   might_have_unfound.insert(from);
3075
3076   for (auto i = omissing.get_items().begin();
3077        i != omissing.get_items().end();
3078        ++i) {
3079     psdout(20) << " after missing " << i->first
3080                << " need " << i->second.need
3081                << " have " << i->second.have << dendl;
3082   }
3083   peer_missing[from].claim(std::move(omissing));
3084 }
3085
3086 void PeeringState::fulfill_info(
3087   pg_shard_t from, const pg_query_t &query,
3088   pair<pg_shard_t, pg_info_t> &notify_info)
3089 {
3090   ceph_assert(from == primary);
3091   ceph_assert(query.type == pg_query_t::INFO);
3092
3093   // info
3094   psdout(10) << "sending info" << dendl;
3095   notify_info = make_pair(from, info);
3096 }
3097
3098 void PeeringState::fulfill_log(
3099   pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
3100 {
3101   psdout(10) << "log request from " << from << dendl;
3102   ceph_assert(from == primary);
3103   ceph_assert(query.type != pg_query_t::INFO);
3104
3105   auto mlog = TOPNSPC::make_message<MOSDPGLog>(
3106     from.shard, pg_whoami.shard,
3107     get_osdmap_epoch(),
3108     info, query_epoch);
3109   mlog->missing = pg_log.get_missing();
3110
3111   // primary -> other, when building master log
3112   if (query.type == pg_query_t::LOG) {
3113     psdout(10) << " sending info+missing+log since " << query.since
3114                << dendl;
3115     if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
3116       pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
3117                              << query.since
3118                              << " when my log.tail is " << pg_log.get_tail()
3119                              << ", sending full log instead";
3120       mlog->log = pg_log.get_log();           // primary should not have requested this!!
3121     } else
3122       mlog->log.copy_after(cct, pg_log.get_log(), query.since);
3123   }
3124   else if (query.type == pg_query_t::FULLLOG) {
3125     psdout(10) << " sending info+missing+full log" << dendl;
3126     mlog->log = pg_log.get_log();
3127   }
3128
3129   psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
3130
3131   pl->send_cluster_message(from.osd, std::move(mlog), get_osdmap_epoch(), true);
3132 }
3133
3134 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
3135 {
3136   if (query.query.type == pg_query_t::INFO) {
3137     pair<pg_shard_t, pg_info_t> notify_info;
3138     // note this refreshes our prior_readable_until_ub value
3139     update_history(query.query.history);
3140     fulfill_info(query.from, query.query, notify_info);
3141     rctx.send_notify(
3142       notify_info.first.osd,
3143       pg_notify_t(
3144         notify_info.first.shard, pg_whoami.shard,
3145         query.query_epoch,
3146         get_osdmap_epoch(),
3147         notify_info.second,
3148         past_intervals));
3149   } else {
3150     update_history(query.query.history);
3151     fulfill_log(query.from, query.query, query.query_epoch);
3152   }
3153 }
3154
3155 void PeeringState::try_mark_clean()
3156 {
3157   if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
3158     state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
3159     state_set(PG_STATE_CLEAN);
3160     info.history.last_epoch_clean = get_osdmap_epoch();
3161     info.history.last_interval_clean = info.history.same_interval_since;
3162     past_intervals.clear();
3163     dirty_big_info = true;
3164     dirty_info = true;
3165   }
3166
3167   if (!is_active() && is_peered()) {
3168     if (is_clean()) {
3169       bool target;
3170       if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
3171         if (target) {
3172           psdout(10) << "ready to merge (target)" << dendl;
3173           pl->set_ready_to_merge_target(
3174             info.last_update,
3175             info.history.last_epoch_started,
3176             info.history.last_epoch_clean);
3177         } else {
3178           psdout(10) << "ready to merge (source)" << dendl;
3179           pl->set_ready_to_merge_source(info.last_update);
3180         }
3181       }
3182     } else {
3183       psdout(10) << "not clean, not ready to merge" << dendl;
3184       // we should have notified OSD in Active state entry point
3185     }
3186   }
3187
3188   state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
3189
3190   share_pg_info();
3191   pl->publish_stats_to_osd();
3192   clear_recovery_state();
3193 }
3194
3195 void PeeringState::split_into(
3196   pg_t child_pgid, PeeringState *child, unsigned split_bits)
3197 {
3198   child->update_osdmap_ref(get_osdmap());
3199   child->pool = pool;
3200
3201   // Log
3202   pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
3203   child->info.last_complete = info.last_complete;
3204
3205   info.last_update = pg_log.get_head();
3206   child->info.last_update = child->pg_log.get_head();
3207
3208   child->info.last_user_version = info.last_user_version;
3209
3210   info.log_tail = pg_log.get_tail();
3211   child->info.log_tail = child->pg_log.get_tail();
3212
3213   // reset last_complete, we might have modified pg_log & missing above
3214   pg_log.reset_complete_to(&info);
3215   child->pg_log.reset_complete_to(&child->info);
3216
3217   // Info
3218   child->info.history = info.history;
3219   child->info.history.epoch_created = get_osdmap_epoch();
3220   child->info.purged_snaps = info.purged_snaps;
3221
3222   if (info.last_backfill.is_max()) {
3223     child->info.set_last_backfill(hobject_t::get_max());
3224   } else {
3225     // restart backfill on parent and child to be safe.  we could
3226     // probably do better in the bitwise sort case, but it's more
3227     // fragile (there may be special work to do on backfill completion
3228     // in the future).
3229     info.set_last_backfill(hobject_t());
3230     child->info.set_last_backfill(hobject_t());
3231     // restarting backfill implies that the missing set is empty,
3232     // since it is only used for objects prior to last_backfill
3233     pg_log.reset_backfill();
3234     child->pg_log.reset_backfill();
3235   }
3236
3237   child->info.stats = info.stats;
3238   child->info.stats.parent_split_bits = split_bits;
3239   info.stats.stats_invalid = true;
3240   child->info.stats.stats_invalid = true;
3241   child->info.last_epoch_started = info.last_epoch_started;
3242   child->info.last_interval_started = info.last_interval_started;
3243
3244   // There can't be recovery/backfill going on now
3245   int primary, up_primary;
3246   vector<int> newup, newacting;
3247   get_osdmap()->pg_to_up_acting_osds(
3248     child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
3249   child->init_primary_up_acting(
3250     newup,
3251     newacting,
3252     up_primary,
3253     primary);
3254   child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
3255
3256   // this comparison includes primary rank via pg_shard_t
3257   if (get_primary() != child->get_primary())
3258     child->info.history.same_primary_since = get_osdmap_epoch();
3259
3260   child->info.stats.up = newup;
3261   child->info.stats.up_primary = up_primary;
3262   child->info.stats.acting = newacting;
3263   child->info.stats.acting_primary = primary;
3264   child->info.stats.mapping_epoch = get_osdmap_epoch();
3265
3266   // History
3267   child->past_intervals = past_intervals;
3268
3269   child->on_new_interval();
3270
3271   child->send_notify = !child->is_primary();
3272
3273   child->dirty_info = true;
3274   child->dirty_big_info = true;
3275   dirty_info = true;
3276   dirty_big_info = true;
3277 }
3278
3279 void PeeringState::merge_from(
3280   map<spg_t,PeeringState *>& sources,
3281   PeeringCtx &rctx,
3282   unsigned split_bits,
3283   const pg_merge_meta_t& last_pg_merge_meta)
3284 {
3285   bool incomplete = false;
3286   if (info.last_complete != info.last_update ||
3287       info.is_incomplete() ||
3288       info.dne()) {
3289     psdout(10) << __func__ << " target incomplete" << dendl;
3290     incomplete = true;
3291   }
3292   if (last_pg_merge_meta.source_pgid != pg_t()) {
3293     if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
3294       psdout(10) << __func__ << " target doesn't match expected parent "
3295                  << last_pg_merge_meta.source_pgid.get_parent()
3296                  << " of source_pgid " << last_pg_merge_meta.source_pgid
3297                  << dendl;
3298       incomplete = true;
3299     }
3300     if (info.last_update != last_pg_merge_meta.target_version) {
3301       psdout(10) << __func__ << " target version doesn't match expected "
3302                << last_pg_merge_meta.target_version << dendl;
3303       incomplete = true;
3304     }
3305   }
3306
3307   PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3308   pg_log.roll_forward(handler.get());
3309
3310   info.last_complete = info.last_update;  // to fake out trim()
3311   pg_log.reset_recovery_pointers();
3312   pg_log.trim(info.last_update, info);
3313
3314   vector<PGLog*> log_from;
3315   for (auto& i : sources) {
3316     auto& source = i.second;
3317     if (!source) {
3318       psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3319       incomplete = true;
3320       continue;
3321     }
3322     if (source->info.last_complete != source->info.last_update ||
3323         source->info.is_incomplete() ||
3324         source->info.dne()) {
3325       psdout(10) << __func__ << " source " << source->pg_whoami
3326                  << " incomplete"
3327                  << dendl;
3328       incomplete = true;
3329     }
3330     if (last_pg_merge_meta.source_pgid != pg_t()) {
3331       if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3332         dout(10) << __func__ << " source " << source->info.pgid.pgid
3333                  << " doesn't match expected source pgid "
3334                  << last_pg_merge_meta.source_pgid << dendl;
3335         incomplete = true;
3336       }
3337       if (source->info.last_update != last_pg_merge_meta.source_version) {
3338         dout(10) << __func__ << " source version doesn't match expected "
3339                  << last_pg_merge_meta.target_version << dendl;
3340         incomplete = true;
3341       }
3342     }
3343
3344     // prepare log
3345     PGLog::LogEntryHandlerRef handler{
3346       source->pl->get_log_handler(rctx.transaction)};
3347     source->pg_log.roll_forward(handler.get());
3348     source->info.last_complete = source->info.last_update;  // to fake out trim()
3349     source->pg_log.reset_recovery_pointers();
3350     source->pg_log.trim(source->info.last_update, source->info);
3351     log_from.push_back(&source->pg_log);
3352
3353     // combine stats
3354     info.stats.add(source->info.stats);
3355
3356     // pull up last_update
3357     info.last_update = std::max(info.last_update, source->info.last_update);
3358
3359     // adopt source's PastIntervals if target has none.  we can do this since
3360     // pgp_num has been reduced prior to the merge, so the OSD mappings for
3361     // the PGs are identical.
3362     if (past_intervals.empty() && !source->past_intervals.empty()) {
3363       psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3364       past_intervals = source->past_intervals;
3365     }
3366   }
3367
3368   info.last_complete = info.last_update;
3369   info.log_tail = info.last_update;
3370   if (incomplete) {
3371     info.last_backfill = hobject_t();
3372   }
3373
3374   // merge logs
3375   pg_log.merge_from(log_from, info.last_update);
3376
3377   // make sure we have a meaningful last_epoch_started/clean (if we were a
3378   // placeholder)
3379   if (info.history.epoch_created == 0) {
3380     // start with (a) source's history, since these PGs *should* have been
3381     // remapped in concert with each other...
3382     info.history = sources.begin()->second->info.history;
3383
3384     // we use the last_epoch_{started,clean} we got from
3385     // the caller, which are the epochs that were reported by the PGs were
3386     // found to be ready for merge.
3387     info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3388     info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3389     info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3390     psdout(10) << __func__
3391                << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3392                << last_pg_merge_meta.last_epoch_clean
3393                << " from pool last_dec_*, source pg history was "
3394                << sources.begin()->second->info.history
3395                << dendl;
3396
3397     // above we have pulled down source's history and we need to check
3398     // history.epoch_created again to confirm that source is not a placeholder
3399     // too. (peering requires a sane history.same_interval_since value for any
3400     // non-newly created pg and below here we know we are basically iterating
3401     // back a series of past maps to fake a merge process, hence we need to
3402     // fix history.same_interval_since first so that start_peering_interval()
3403     // will not complain)
3404     if (info.history.epoch_created == 0) {
3405       dout(10) << __func__ << " both merge target and source are placeholders,"
3406                << " set sis to lec " << info.history.last_epoch_clean
3407                << dendl;
3408       info.history.same_interval_since = info.history.last_epoch_clean;
3409     }
3410
3411     // if the past_intervals start is later than last_epoch_clean, it
3412     // implies the source repeered again but the target didn't, or
3413     // that the source became clean in a later epoch than the target.
3414     // avoid the discrepancy but adjusting the interval start
3415     // backwards to match so that check_past_interval_bounds() will
3416     // not complain.
3417     auto pib = past_intervals.get_bounds();
3418     if (info.history.last_epoch_clean < pib.first) {
3419       psdout(10) << __func__ << " last_epoch_clean "
3420                  << info.history.last_epoch_clean << " < past_interval start "
3421                  << pib.first << ", adjusting start backwards" << dendl;
3422       past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3423     }
3424
3425     // Similarly, if the same_interval_since value is later than
3426     // last_epoch_clean, the next interval change will result in a
3427     // past_interval start that is later than last_epoch_clean.  This
3428     // can happen if we use the pg_history values from the merge
3429     // source.  Adjust the same_interval_since value backwards if that
3430     // happens.  (We trust the les and lec values more because they came from
3431     // the real target, whereas the history value we stole from the source.)
3432     if (info.history.last_epoch_started < info.history.same_interval_since) {
3433       psdout(10) << __func__ << " last_epoch_started "
3434                  << info.history.last_epoch_started << " < same_interval_since "
3435                  << info.history.same_interval_since
3436                  << ", adjusting pg_history backwards" << dendl;
3437       info.history.same_interval_since = info.history.last_epoch_clean;
3438       // make sure same_{up,primary}_since are <= same_interval_since
3439       info.history.same_up_since = std::min(
3440         info.history.same_up_since, info.history.same_interval_since);
3441       info.history.same_primary_since = std::min(
3442         info.history.same_primary_since, info.history.same_interval_since);
3443     }
3444   }
3445
3446   dirty_info = true;
3447   dirty_big_info = true;
3448 }
3449
3450 void PeeringState::start_split_stats(
3451   const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3452 {
3453   out->resize(childpgs.size() + 1);
3454   info.stats.stats.sum.split(*out);
3455 }
3456
3457 void PeeringState::finish_split_stats(
3458   const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3459 {
3460   info.stats.stats.sum = stats;
3461   write_if_dirty(t);
3462 }
3463
3464 void PeeringState::update_blocked_by()
3465 {
3466   // set a max on the number of blocking peers we report. if we go
3467   // over, report a random subset.  keep the result sorted.
3468   unsigned keep = std::min<unsigned>(
3469     blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3470   unsigned skip = blocked_by.size() - keep;
3471   info.stats.blocked_by.clear();
3472   info.stats.blocked_by.resize(keep);
3473   unsigned pos = 0;
3474   for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
3475     if (skip > 0 && (rand() % (skip + keep) < skip)) {
3476       --skip;
3477     } else {
3478       info.stats.blocked_by[pos++] = *p;
3479       --keep;
3480     }
3481   }
3482 }
3483
3484 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3485 {
3486     for (auto&p : pgs)
3487       if (p.shard == shard)
3488         return true;
3489     return false;
3490 }
3491
3492 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3493 {
3494     for (auto&p : pgs) {
3495       if (p == skip)
3496         continue;
3497       if (p.shard == shard)
3498         return p;
3499     }
3500     return pg_shard_t();
3501 }
3502
3503 void PeeringState::update_calc_stats()
3504 {
3505   info.stats.version = info.last_update;
3506   info.stats.created = info.history.epoch_created;
3507   info.stats.last_scrub = info.history.last_scrub;
3508   info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3509   info.stats.last_deep_scrub = info.history.last_deep_scrub;
3510   info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3511   info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3512   info.stats.last_epoch_clean = info.history.last_epoch_clean;
3513
3514   info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3515   info.stats.ondisk_log_size = info.stats.log_size;
3516   info.stats.log_start = pg_log.get_tail();
3517   info.stats.ondisk_log_start = pg_log.get_tail();
3518   info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3519
3520   unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3521
3522   // In rare case that upset is too large (usually transient), use as target
3523   // for calculations below.
3524   unsigned target = std::max(num_shards, (unsigned)upset.size());
3525   // For undersized actingset may be larger with OSDs out
3526   unsigned nrep = std::max(actingset.size(), upset.size());
3527   // calc num_object_copies
3528   info.stats.stats.calc_copies(std::max(target, nrep));
3529   info.stats.stats.sum.num_objects_degraded = 0;
3530   info.stats.stats.sum.num_objects_unfound = 0;
3531   info.stats.stats.sum.num_objects_misplaced = 0;
3532   info.stats.avail_no_missing.clear();
3533   info.stats.object_location_counts.clear();
3534
3535   // We should never hit this condition, but if end up hitting it,
3536   // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3537   if (info.stats.stats.sum.num_objects < 0) {
3538     psdout(0) << __func__ << " negative num_objects = "
3539               << info.stats.stats.sum.num_objects << " setting it to 0 "
3540               << dendl;
3541     info.stats.stats.sum.num_objects = 0;
3542     state_set(PG_STATE_INCONSISTENT);
3543   }
3544
3545   if ((is_remapped() || is_undersized() || !is_clean()) &&
3546       (is_peered()|| is_activating())) {
3547     psdout(20) << __func__ << " actingset " << actingset << " upset "
3548                << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3549
3550     ceph_assert(!acting_recovery_backfill.empty());
3551
3552     bool estimate = false;
3553
3554     // NOTE: we only generate degraded, misplaced and unfound
3555     // values for the summation, not individual stat categories.
3556     int64_t num_objects = info.stats.stats.sum.num_objects;
3557
3558     // Objects missing from up nodes, sorted by # objects.
3559     boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3560     // Objects missing from nodes not in up, sort by # objects
3561     boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3562
3563     // Fill missing_target_objects/acting_source_objects
3564
3565     {
3566       int64_t missing;
3567
3568       // Primary first
3569       missing = pg_log.get_missing().num_missing();
3570       ceph_assert(acting_recovery_backfill.count(pg_whoami));
3571       if (upset.count(pg_whoami)) {
3572         missing_target_objects.emplace(missing, pg_whoami);
3573       } else {
3574         acting_source_objects.emplace(missing, pg_whoami);
3575       }
3576       info.stats.stats.sum.num_objects_missing_on_primary = missing;
3577       if (missing == 0)
3578         info.stats.avail_no_missing.push_back(pg_whoami);
3579       psdout(20) << __func__ << " shard " << pg_whoami
3580                  << " primary objects " << num_objects
3581                  << " missing " << missing
3582                  << dendl;
3583     }
3584
3585     // All other peers
3586     for (auto& peer : peer_info) {
3587       // Primary should not be in the peer_info, skip if it is.
3588       if (peer.first == pg_whoami) continue;
3589       int64_t missing = 0;
3590       int64_t peer_num_objects =
3591         std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3592       // Backfill targets always track num_objects accurately
3593       // all other peers track missing accurately.
3594       if (is_backfill_target(peer.first)) {
3595         missing = std::max((int64_t)0, num_objects - peer_num_objects);
3596       } else {
3597         if (peer_missing.count(peer.first)) {
3598           missing = peer_missing[peer.first].num_missing();
3599         } else {
3600           psdout(20) << __func__ << " no peer_missing found for "
3601                      << peer.first << dendl;
3602           if (is_recovering()) {
3603             estimate = true;
3604           }
3605           missing = std::max((int64_t)0, num_objects - peer_num_objects);
3606         }
3607       }
3608       if (upset.count(peer.first)) {
3609         missing_target_objects.emplace(missing, peer.first);
3610       } else if (actingset.count(peer.first)) {
3611         acting_source_objects.emplace(missing, peer.first);
3612       }
3613       peer.second.stats.stats.sum.num_objects_missing = missing;
3614       if (missing == 0)
3615         info.stats.avail_no_missing.push_back(peer.first);
3616       psdout(20) << __func__ << " shard " << peer.first
3617                  << " objects " << peer_num_objects
3618                  << " missing " << missing
3619                  << dendl;
3620     }
3621
3622     // Compute object_location_counts
3623     for (auto& ml: missing_loc.get_missing_locs()) {
3624       info.stats.object_location_counts[ml.second]++;
3625       psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3626                  << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3627                  << dendl;
3628     }
3629     int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3630     if (not_missing) {
3631         // During recovery we know upset == actingset and is being populated
3632         // During backfill we know that all non-missing objects are in the actingset
3633         info.stats.object_location_counts[actingset] = not_missing;
3634     }
3635     psdout(30) << __func__ << " object_location_counts["
3636                << upset << "]=" << info.stats.object_location_counts[upset]
3637                << dendl;
3638     psdout(20) << __func__ << " object_location_counts "
3639                << info.stats.object_location_counts << dendl;
3640
3641     // A misplaced object is not stored on the correct OSD
3642     int64_t misplaced = 0;
3643     // a degraded objects has fewer replicas or EC shards than the pool specifies.
3644     int64_t degraded = 0;
3645
3646     if (is_recovering()) {
3647       for (auto& sml: missing_loc.get_missing_by_count()) {
3648         for (auto& ml: sml.second) {
3649           int missing_shards;
3650           if (sml.first == shard_id_t::NO_SHARD) {
3651             psdout(20) << __func__ << " ml " << ml.second
3652                        << " upset size " << upset.size()
3653                        << " up " << ml.first.up << dendl;
3654             missing_shards = (int)upset.size() - ml.first.up;
3655           } else {
3656             // Handle shards not even in upset below
3657             if (!find_shard(upset, sml.first))
3658               continue;
3659             missing_shards = std::max(0, 1 - ml.first.up);
3660             psdout(20) << __func__
3661                        << " shard " << sml.first
3662                        << " ml " << ml.second
3663                        << " missing shards " << missing_shards << dendl;
3664           }
3665           int odegraded = ml.second * missing_shards;
3666           // Copies on other osds but limited to the possible degraded
3667           int more_osds = std::min(missing_shards, ml.first.other);
3668           int omisplaced = ml.second * more_osds;
3669           ceph_assert(omisplaced <= odegraded);
3670           odegraded -= omisplaced;
3671
3672           misplaced += omisplaced;
3673           degraded += odegraded;
3674         }
3675       }
3676
3677       psdout(20) << __func__ << " missing based degraded "
3678                  << degraded << dendl;
3679       psdout(20) << __func__ << " missing based misplaced "
3680                  << misplaced << dendl;
3681
3682       // Handle undersized case
3683       if (pool.info.is_replicated()) {
3684         // Add degraded for missing targets (num_objects missing)
3685         ceph_assert(target >= upset.size());
3686         unsigned needed = target - upset.size();
3687         degraded += num_objects * needed;
3688       } else {
3689         for (unsigned i = 0 ; i < num_shards; ++i) {
3690           shard_id_t shard(i);
3691
3692           if (!find_shard(upset, shard)) {
3693             pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3694
3695             if (pgs != pg_shard_t()) {
3696               int64_t missing;
3697
3698               if (pgs == pg_whoami)
3699                 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3700               else
3701                 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3702
3703               degraded += missing;
3704               misplaced += std::max((int64_t)0, num_objects - missing);
3705             } else {
3706               // No shard anywhere
3707               degraded += num_objects;
3708             }
3709           }
3710         }
3711       }
3712       goto out;
3713     }
3714
3715     // Handle undersized case
3716     if (pool.info.is_replicated()) {
3717       // Add to missing_target_objects
3718       ceph_assert(target >= missing_target_objects.size());
3719       unsigned needed = target - missing_target_objects.size();
3720       if (needed)
3721         missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3722     } else {
3723       for (unsigned i = 0 ; i < num_shards; ++i) {
3724         shard_id_t shard(i);
3725         bool found = false;
3726         for (const auto& t : missing_target_objects) {
3727           if (std::get<1>(t).shard == shard) {
3728             found = true;
3729             break;
3730           }
3731         }
3732         if (!found)
3733           missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3734       }
3735     }
3736
3737     for (const auto& item : missing_target_objects)
3738       psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3739                  << " missing= " << std::get<0>(item) << dendl;
3740     for (const auto& item : acting_source_objects)
3741       psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3742                  << " missing= " << std::get<0>(item) << dendl;
3743
3744     // Handle all objects not in missing for remapped
3745     // or backfill
3746     for (auto m = missing_target_objects.rbegin();
3747         m != missing_target_objects.rend(); ++m) {
3748
3749       int64_t extra_missing = -1;
3750
3751       if (pool.info.is_replicated()) {
3752         if (!acting_source_objects.empty()) {
3753           auto extra_copy = acting_source_objects.begin();
3754           extra_missing = std::get<0>(*extra_copy);
3755           acting_source_objects.erase(extra_copy);
3756         }
3757       } else {  // Erasure coded
3758         // Use corresponding shard
3759         for (const auto& a : acting_source_objects) {
3760           if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3761             extra_missing = std::get<0>(a);
3762             acting_source_objects.erase(a);
3763             break;
3764           }
3765         }
3766       }
3767
3768       if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3769         // We don't know which of the objects on the target
3770         // are part of extra_missing so assume are all degraded.
3771         misplaced += std::get<0>(*m) - extra_missing;
3772         degraded += extra_missing;
3773       } else {
3774         // 1. extra_missing == -1, more targets than sources so degraded
3775         // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
3776         //    previously degraded are now present on the target.
3777         degraded += std::get<0>(*m);
3778       }
3779     }
3780     // If there are still acting that haven't been accounted for
3781     // then they are misplaced
3782     for (const auto& a : acting_source_objects) {
3783       int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3784       psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3785                  << dendl;
3786       misplaced += extra_misplaced;
3787     }
3788 out:
3789     // NOTE: Tests use these messages to verify this code
3790     psdout(20) << __func__ << " degraded " << degraded
3791                << (estimate ? " (est)": "") << dendl;
3792     psdout(20) << __func__ << " misplaced " << misplaced
3793                << (estimate ? " (est)": "")<< dendl;
3794
3795     info.stats.stats.sum.num_objects_degraded = degraded;
3796     info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3797     info.stats.stats.sum.num_objects_misplaced = misplaced;
3798   }
3799 }
3800
3801 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3802   const std::optional<pg_stat_t> &pg_stats_publish,
3803   const object_stat_collection_t &unstable_stats)
3804 {
3805   if (info.stats.stats.sum.num_scrub_errors) {
3806     psdout(10) << __func__ << " inconsistent due to " <<
3807       info.stats.stats.sum.num_scrub_errors << " scrub errors" << dendl;
3808     state_set(PG_STATE_INCONSISTENT);
3809   } else {
3810     state_clear(PG_STATE_INCONSISTENT);
3811     state_clear(PG_STATE_FAILED_REPAIR);
3812   }
3813
3814   utime_t now = ceph_clock_now();
3815   if (info.stats.state != state) {
3816     info.stats.last_change = now;
3817     // Optimistic estimation, if we just find out an inactive PG,
3818     // assume it is active till now.
3819     if (!(state & PG_STATE_ACTIVE) &&
3820         (info.stats.state & PG_STATE_ACTIVE))
3821       info.stats.last_active = now;
3822
3823     if ((state & PG_STATE_ACTIVE) &&
3824         !(info.stats.state & PG_STATE_ACTIVE))
3825       info.stats.last_became_active = now;
3826     if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3827         !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3828       info.stats.last_became_peered = now;
3829     info.stats.state = state;
3830   }
3831
3832   update_calc_stats();
3833   if (info.stats.stats.sum.num_objects_degraded) {
3834     state_set(PG_STATE_DEGRADED);
3835   } else {
3836     state_clear(PG_STATE_DEGRADED);
3837   }
3838   update_blocked_by();
3839
3840   pg_stat_t pre_publish = info.stats;
3841   pre_publish.stats.add(unstable_stats);
3842   utime_t cutoff = now;
3843   cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3844
3845   // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3846   // because we don't want to make the pg_stat_t structures too expensive.
3847   unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3848   unsigned num = 0;
3849   auto i = info.purged_snaps.begin();
3850   while (num < max && i != info.purged_snaps.end()) {
3851     pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3852     ++num;
3853     ++i;
3854   }
3855   psdout(20) << __func__ << " reporting purged_snaps "
3856              << pre_publish.purged_snaps << dendl;
3857
3858   if (pg_stats_publish && pre_publish == *pg_stats_publish &&
3859       info.stats.last_fresh > cutoff) {
3860     psdout(15) << "publish_stats_to_osd " << pg_stats_publish->reported_epoch
3861                << ": no change since " << info.stats.last_fresh << dendl;
3862     return std::nullopt;
3863   } else {
3864     // update our stat summary and timestamps
3865     info.stats.reported_epoch = get_osdmap_epoch();
3866     ++info.stats.reported_seq;
3867
3868     info.stats.last_fresh = now;
3869
3870     if (info.stats.state & PG_STATE_CLEAN)
3871       info.stats.last_clean = now;
3872     if (info.stats.state & PG_STATE_ACTIVE)
3873       info.stats.last_active = now;
3874     if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3875       info.stats.last_peered = now;
3876     info.stats.last_unstale = now;
3877     if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3878       info.stats.last_undegraded = now;
3879     if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3880       info.stats.last_fullsized = now;
3881
3882     psdout(15) << "publish_stats_to_osd " << pre_publish.reported_epoch
3883                << ":" << pre_publish.reported_seq << dendl;
3884     return std::make_optional(std::move(pre_publish));
3885   }
3886 }
3887
3888 void PeeringState::init(
3889   int role,
3890   const vector<int>& newup, int new_up_primary,
3891   const vector<int>& newacting, int new_acting_primary,
3892   const pg_history_t& history,
3893   const PastIntervals& pi,
3894   ObjectStore::Transaction &t)
3895 {
3896   psdout(10) << "init role " << role << " up "
3897              << newup << " acting " << newacting
3898              << " history " << history
3899              << " past_intervals " << pi
3900              << dendl;
3901
3902   set_role(role);
3903   init_primary_up_acting(
3904     newup,
3905     newacting,
3906     new_up_primary,
3907     new_acting_primary);
3908
3909   info.history = history;
3910   past_intervals = pi;
3911
3912   info.stats.up = up;
3913   info.stats.up_primary = new_up_primary;
3914   info.stats.acting = acting;
3915   info.stats.acting_primary = new_acting_primary;
3916   info.stats.mapping_epoch = info.history.same_interval_since;
3917
3918   if (!perform_deletes_during_peering()) {
3919     pg_log.set_missing_may_contain_deletes();
3920   }
3921
3922   on_new_interval();
3923
3924   dirty_info = true;
3925   dirty_big_info = true;
3926   write_if_dirty(t);
3927 }
3928
3929 void PeeringState::dump_peering_state(Formatter *f)
3930 {
3931   f->dump_string("state", get_pg_state_string());
3932   f->dump_unsigned("epoch", get_osdmap_epoch());
3933   f->open_array_section("up");
3934   for (auto p = up.begin(); p != up.end(); ++p)
3935     f->dump_unsigned("osd", *p);
3936   f->close_section();
3937   f->open_array_section("acting");
3938   for (auto p = acting.begin(); p != acting.end(); ++p)
3939     f->dump_unsigned("osd", *p);
3940   f->close_section();
3941   if (!backfill_targets.empty()) {
3942     f->open_array_section("backfill_targets");
3943     for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
3944       f->dump_stream("shard") << *p;
3945     f->close_section();
3946   }
3947   if (!async_recovery_targets.empty()) {
3948     f->open_array_section("async_recovery_targets");
3949     for (auto p = async_recovery_targets.begin();
3950          p != async_recovery_targets.end();
3951          ++p)
3952       f->dump_stream("shard") << *p;
3953     f->close_section();
3954   }
3955   if (!acting_recovery_backfill.empty()) {
3956     f->open_array_section("acting_recovery_backfill");
3957     for (auto p = acting_recovery_backfill.begin();
3958          p != acting_recovery_backfill.end();
3959          ++p)
3960       f->dump_stream("shard") << *p;
3961     f->close_section();
3962   }
3963   f->open_object_section("info");
3964   update_calc_stats();
3965   info.dump(f);
3966   f->close_section();
3967
3968   f->open_array_section("peer_info");
3969   for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
3970     f->open_object_section("info");
3971     f->dump_stream("peer") << p->first;
3972     p->second.dump(f);
3973     f->close_section();
3974   }
3975   f->close_section();
3976 }
3977
3978 void PeeringState::update_stats(
3979   std::function<bool(pg_history_t &, pg_stat_t &)> f,
3980   ObjectStore::Transaction *t) {
3981   if (f(info.history, info.stats)) {
3982     pl->publish_stats_to_osd();
3983   }
3984   pl->reschedule_scrub();
3985
3986   if (t) {
3987     dirty_info = true;
3988     write_if_dirty(*t);
3989   }
3990 }
3991
3992 void PeeringState::update_stats_wo_resched(
3993   std::function<void(pg_history_t &, pg_stat_t &)> f)
3994 {
3995   f(info.history, info.stats);
3996 }
3997
3998 bool PeeringState::append_log_entries_update_missing(
3999   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4000   ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
4001   std::optional<eversion_t> roll_forward_to)
4002 {
4003   ceph_assert(!entries.empty());
4004   ceph_assert(entries.begin()->version > info.last_update);
4005
4006   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
4007   bool invalidate_stats =
4008     pg_log.append_new_log_entries(
4009       info.last_backfill,
4010       entries,
4011       rollbacker.get());
4012
4013   if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
4014     pg_log.roll_forward(rollbacker.get());
4015   }
4016   if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
4017     pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
4018     last_rollback_info_trimmed_to_applied = *roll_forward_to;
4019   }
4020
4021   info.last_update = pg_log.get_head();
4022
4023   if (pg_log.get_missing().num_missing() == 0) {
4024     // advance last_complete since nothing else is missing!
4025     info.last_complete = info.last_update;
4026   }
4027   info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
4028
4029   psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
4030              << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
4031   if (trim_to)
4032     pg_log.trim(*trim_to, info);
4033   dirty_info = true;
4034   write_if_dirty(t);
4035   return invalidate_stats;
4036 }
4037
4038 void PeeringState::merge_new_log_entries(
4039   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4040   ObjectStore::Transaction &t,
4041   std::optional<eversion_t> trim_to,
4042   std::optional<eversion_t> roll_forward_to)
4043 {
4044   psdout(10) << __func__ << " " << entries << dendl;
4045   ceph_assert(is_primary());
4046
4047   bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
4048   for (auto i = acting_recovery_backfill.begin();
4049        i != acting_recovery_backfill.end();
4050        ++i) {
4051     pg_shard_t peer(*i);
4052     if (peer == pg_whoami) continue;
4053     ceph_assert(peer_missing.count(peer));
4054     ceph_assert(peer_info.count(peer));
4055     pg_missing_t& pmissing(peer_missing[peer]);
4056     psdout(20) << __func__ << " peer_missing for " << peer
4057                << " = " << pmissing << dendl;
4058     pg_info_t& pinfo(peer_info[peer]);
4059     bool invalidate_stats = PGLog::append_log_entries_update_missing(
4060       pinfo.last_backfill,
4061       entries,
4062       true,
4063       NULL,
4064       pmissing,
4065       NULL,
4066       dpp);
4067     pinfo.last_update = info.last_update;
4068     pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
4069     rebuild_missing = rebuild_missing || invalidate_stats;
4070   }
4071
4072   if (!rebuild_missing) {
4073     return;
4074   }
4075
4076   for (auto &&i: entries) {
4077     missing_loc.rebuild(
4078       i.soid,
4079       pg_whoami,
4080       acting_recovery_backfill,
4081       info,
4082       pg_log.get_missing(),
4083       peer_missing,
4084       peer_info);
4085   }
4086 }
4087
4088 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
4089 {
4090   // raise last_complete only if we were previously up to date
4091   if (info.last_complete == info.last_update)
4092     info.last_complete = e.version;
4093
4094   // raise last_update.
4095   ceph_assert(e.version > info.last_update);
4096   info.last_update = e.version;
4097
4098   // raise user_version, if it increased (it may have not get bumped
4099   // by all logged updates)
4100   if (e.user_version > info.last_user_version)
4101     info.last_user_version = e.user_version;
4102
4103   // log mutation
4104   pg_log.add(e, applied);
4105   psdout(10) << "add_log_entry " << e << dendl;
4106 }
4107
4108
4109 void PeeringState::append_log(
4110   vector<pg_log_entry_t>&& logv,
4111   eversion_t trim_to,
4112   eversion_t roll_forward_to,
4113   eversion_t mlcod,
4114   ObjectStore::Transaction &t,
4115   bool transaction_applied,
4116   bool async)
4117 {
4118   /* The primary has sent an info updating the history, but it may not
4119    * have arrived yet.  We want to make sure that we cannot remember this
4120    * write without remembering that it happened in an interval which went
4121    * active in epoch history.last_epoch_started.
4122    */
4123   if (info.last_epoch_started != info.history.last_epoch_started) {
4124     info.history.last_epoch_started = info.last_epoch_started;
4125   }
4126   if (info.last_interval_started != info.history.last_interval_started) {
4127     info.history.last_interval_started = info.last_interval_started;
4128   }
4129   psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
4130
4131   PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4132   if (!transaction_applied) {
4133      /* We must be a backfill or async recovery peer, so it's ok if we apply
4134       * out-of-turn since we won't be considered when
4135       * determining a min possible last_update.
4136       *
4137       * We skip_rollforward() here, which advances the crt, without
4138       * doing an actual rollforward. This avoids cleaning up entries
4139       * from the backend and we do not end up in a situation, where the
4140       * object is deleted before we can _merge_object_divergent_entries().
4141       */
4142     pg_log.skip_rollforward();
4143   }
4144
4145   for (auto p = logv.begin(); p != logv.end(); ++p) {
4146     add_log_entry(*p, transaction_applied);
4147
4148     /* We don't want to leave the rollforward artifacts around
4149      * here past last_backfill.  It's ok for the same reason as
4150      * above */
4151     if (transaction_applied &&
4152         p->soid > info.last_backfill) {
4153       pg_log.roll_forward(handler.get());
4154     }
4155   }
4156   if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4157     pg_log.roll_forward_to(
4158       roll_forward_to,
4159       handler.get());
4160     last_rollback_info_trimmed_to_applied = roll_forward_to;
4161   }
4162
4163   psdout(10) << __func__ << " approx pg log length =  "
4164              << pg_log.get_log().approx_size() << dendl;
4165   psdout(10) << __func__ << " transaction_applied = "
4166              << transaction_applied << dendl;
4167   if (!transaction_applied || async)
4168     psdout(10) << __func__ << " " << pg_whoami
4169                << " is async_recovery or backfill target" << dendl;
4170   pg_log.trim(trim_to, info, transaction_applied, async);
4171
4172   // update the local pg, pg log
4173   dirty_info = true;
4174   write_if_dirty(t);
4175
4176   if (!is_primary())
4177     min_last_complete_ondisk = mlcod;
4178 }
4179
4180 void PeeringState::recover_got(
4181   const hobject_t &oid, eversion_t v,
4182   bool is_delete,
4183   ObjectStore::Transaction &t)
4184 {
4185   if (v > pg_log.get_can_rollback_to()) {
4186     /* This can only happen during a repair, and even then, it would
4187      * be one heck of a race.  If we are repairing the object, the
4188      * write in question must be fully committed, so it's not valid
4189      * to roll it back anyway (and we'll be rolled forward shortly
4190      * anyway) */
4191     PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4192     pg_log.roll_forward_to(v, handler.get());
4193   }
4194
4195   psdout(10) << "got missing " << oid << " v " << v << dendl;
4196   pg_log.recover_got(oid, v, info);
4197   if (pg_log.get_log().log.empty()) {
4198     psdout(10) << "last_complete now " << info.last_complete
4199                << " while log is empty" << dendl;
4200   } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
4201     psdout(10) << "last_complete now " << info.last_complete
4202                << " log.complete_to " << pg_log.get_log().complete_to->version
4203                << dendl;
4204   } else {
4205     psdout(10) << "last_complete now " << info.last_complete
4206                << " log.complete_to at end" << dendl;
4207     //below is not true in the repair case.
4208     //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
4209     ceph_assert(info.last_complete == info.last_update);
4210   }
4211
4212   if (is_primary()) {
4213     ceph_assert(missing_loc.needs_recovery(oid));
4214     if (!is_delete)
4215       missing_loc.add_location(oid, pg_whoami);
4216   }
4217
4218   // update pg
4219   dirty_info = true;
4220   write_if_dirty(t);
4221 }
4222
4223 void PeeringState::update_backfill_progress(
4224   const hobject_t &updated_backfill,
4225   const pg_stat_t &updated_stats,
4226   bool preserve_local_num_bytes,
4227   ObjectStore::Transaction &t) {
4228   info.set_last_backfill(updated_backfill);
4229   if (preserve_local_num_bytes) {
4230     psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
4231                << " local " << info.stats.stats.sum.num_bytes << dendl;
4232     int64_t bytes = info.stats.stats.sum.num_bytes;
4233     info.stats = updated_stats;
4234     info.stats.stats.sum.num_bytes = bytes;
4235   } else {
4236     psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
4237                << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
4238     info.stats = updated_stats;
4239   }
4240
4241   dirty_info = true;
4242   write_if_dirty(t);
4243 }
4244
4245 void PeeringState::adjust_purged_snaps(
4246   std::function<void(interval_set<snapid_t> &snaps)> f) {
4247   f(info.purged_snaps);
4248   dirty_info = true;
4249   dirty_big_info = true;
4250 }
4251
4252 void PeeringState::on_peer_recover(
4253   pg_shard_t peer,
4254   const hobject_t &soid,
4255   const eversion_t &version)
4256 {
4257   pl->publish_stats_to_osd();
4258   // done!
4259   peer_missing[peer].got(soid, version);
4260   missing_loc.add_location(soid, peer);
4261 }
4262
4263 void PeeringState::begin_peer_recover(
4264   pg_shard_t peer,
4265   const hobject_t soid)
4266 {
4267   peer_missing[peer].revise_have(soid, eversion_t());
4268 }
4269
4270 void PeeringState::force_object_missing(
4271   const set<pg_shard_t> &peers,
4272   const hobject_t &soid,
4273   eversion_t version)
4274 {
4275   for (auto &&peer : peers) {
4276     if (peer != primary) {
4277       peer_missing[peer].add(soid, version, eversion_t(), false);
4278     } else {
4279       pg_log.missing_add(soid, version, eversion_t());
4280       pg_log.reset_complete_to(&info);
4281       pg_log.set_last_requested(0);
4282     }
4283   }
4284
4285   missing_loc.rebuild(
4286     soid,
4287     pg_whoami,
4288     acting_recovery_backfill,
4289     info,
4290     pg_log.get_missing(),
4291     peer_missing,
4292     peer_info);
4293 }
4294
4295 void PeeringState::pre_submit_op(
4296   const hobject_t &hoid,
4297   const vector<pg_log_entry_t>& logv,
4298   eversion_t at_version)
4299 {
4300   if (at_version > eversion_t()) {
4301     for (auto &&i : get_acting_recovery_backfill()) {
4302       if (i == primary) continue;
4303       pg_info_t &pinfo = peer_info[i];
4304       // keep peer_info up to date
4305       if (pinfo.last_complete == pinfo.last_update)
4306         pinfo.last_complete = at_version;
4307       pinfo.last_update = at_version;
4308     }
4309   }
4310
4311   bool requires_missing_loc = false;
4312   for (auto &&i : get_async_recovery_targets()) {
4313     if (i == primary || !get_peer_missing(i).is_missing(hoid))
4314       continue;
4315     requires_missing_loc = true;
4316     for (auto &&entry: logv) {
4317       peer_missing[i].add_next_event(entry);
4318     }
4319   }
4320
4321   if (requires_missing_loc) {
4322     for (auto &&entry: logv) {
4323       psdout(30) << __func__ << " missing_loc before: "
4324                  << missing_loc.get_locations(entry.soid) << dendl;
4325       missing_loc.add_missing(entry.soid, entry.version,
4326                               eversion_t(), entry.is_delete());
4327       // clear out missing_loc
4328       missing_loc.clear_location(entry.soid);
4329       for (auto &i: get_actingset()) {
4330         if (!get_peer_missing(i).is_missing(entry.soid))
4331           missing_loc.add_location(entry.soid, i);
4332       }
4333       psdout(30) << __func__ << " missing_loc after: "
4334                  << missing_loc.get_locations(entry.soid) << dendl;
4335     }
4336   }
4337 }
4338
4339 void PeeringState::recovery_committed_to(eversion_t version)
4340 {
4341   psdout(10) << __func__ << " version " << version
4342              << " now ondisk" << dendl;
4343   last_complete_ondisk = version;
4344
4345   if (last_complete_ondisk == info.last_update) {
4346     if (!is_primary()) {
4347       // Either we are a replica or backfill target.
4348       // we are fully up to date.  tell the primary!
4349       pl->send_cluster_message(
4350         get_primary().osd,
4351         TOPNSPC::make_message<MOSDPGTrim>(
4352           get_osdmap_epoch(),
4353           spg_t(info.pgid.pgid, primary.shard),
4354           last_complete_ondisk),
4355         get_osdmap_epoch());
4356     } else {
4357       calc_min_last_complete_ondisk();
4358     }
4359   }
4360 }
4361
4362 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4363 {
4364   last_update_ondisk = v;
4365   last_complete_ondisk = lc;
4366   calc_min_last_complete_ondisk();
4367 }
4368
4369 void PeeringState::calc_trim_to()
4370 {
4371   size_t target = pl->get_target_pg_log_entries();
4372
4373   eversion_t limit = std::min(
4374     min_last_complete_ondisk,
4375     pg_log.get_can_rollback_to());
4376   if (limit != eversion_t() &&
4377       limit != pg_trim_to &&
4378       pg_log.get_log().approx_size() > target) {
4379     size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4380                              cct->_conf->osd_pg_log_trim_max);
4381     if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4382         cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4383       return;
4384     }
4385     auto it = pg_log.get_log().log.begin();
4386     eversion_t new_trim_to;
4387     for (size_t i = 0; i < num_to_trim; ++i) {
4388       new_trim_to = it->version;
4389       ++it;
4390       if (new_trim_to > limit) {
4391         new_trim_to = limit;
4392         psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4393         break;
4394       }
4395     }
4396     psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4397     pg_trim_to = new_trim_to;
4398     assert(pg_trim_to <= pg_log.get_head());
4399     assert(pg_trim_to <= min_last_complete_ondisk);
4400   }
4401 }
4402
4403 void PeeringState::calc_trim_to_aggressive()
4404 {
4405   size_t target = pl->get_target_pg_log_entries();
4406
4407   // limit pg log trimming up to the can_rollback_to value
4408   eversion_t limit = std::min({
4409     pg_log.get_head(),
4410     pg_log.get_can_rollback_to(),
4411     last_update_ondisk});
4412   psdout(10) << __func__ << " limit = " << limit << dendl;
4413
4414   if (limit != eversion_t() &&
4415       limit != pg_trim_to &&
4416       pg_log.get_log().approx_size() > target) {
4417     psdout(10) << __func__ << " approx pg log length =  "
4418              << pg_log.get_log().approx_size() << dendl;
4419     uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4420                                               cct->_conf->osd_pg_log_trim_max);
4421     psdout(10) << __func__ << " num_to_trim =  " << num_to_trim << dendl;
4422     if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4423         cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4424       return;
4425     }
4426     auto it = pg_log.get_log().log.begin(); // oldest log entry
4427     auto rit = pg_log.get_log().log.rbegin();
4428     eversion_t by_n_to_keep; // start from tail
4429     eversion_t by_n_to_trim = eversion_t::max(); // start from head
4430     for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4431       i++;
4432       if (i > target && by_n_to_keep == eversion_t()) {
4433         by_n_to_keep = rit->version;
4434       }
4435       if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4436         by_n_to_trim = it->version;
4437       }
4438       if (by_n_to_keep != eversion_t() &&
4439           by_n_to_trim != eversion_t::max()) {
4440         break;
4441       }
4442     }
4443
4444     if (by_n_to_keep == eversion_t()) {
4445       return;
4446     }
4447
4448     pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4449     psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4450     ceph_assert(pg_trim_to <= pg_log.get_head());
4451   }
4452 }
4453
4454 void PeeringState::apply_op_stats(
4455   const hobject_t &soid,
4456   const object_stat_sum_t &delta_stats)
4457 {
4458   info.stats.stats.add(delta_stats);
4459   info.stats.stats.floor(0);
4460
4461   for (auto i = get_backfill_targets().begin();
4462        i != get_backfill_targets().end();
4463        ++i) {
4464     pg_shard_t bt = *i;
4465     pg_info_t& pinfo = peer_info[bt];
4466     if (soid <= pinfo.last_backfill)
4467       pinfo.stats.stats.add(delta_stats);
4468   }
4469 }
4470
4471 void PeeringState::update_complete_backfill_object_stats(
4472   const hobject_t &hoid,
4473   const pg_stat_t &stats)
4474 {
4475   for (auto &&bt: get_backfill_targets()) {
4476     pg_info_t& pinfo = peer_info[bt];
4477     //Add stats to all peers that were missing object
4478     if (hoid > pinfo.last_backfill)
4479       pinfo.stats.add(stats);
4480   }
4481 }
4482
4483 void PeeringState::update_peer_last_backfill(
4484   pg_shard_t peer,
4485   const hobject_t &new_last_backfill)
4486 {
4487   pg_info_t &pinfo = peer_info[peer];
4488   pinfo.last_backfill = new_last_backfill;
4489   if (new_last_backfill.is_max()) {
4490     /* pinfo.stats might be wrong if we did log-based recovery on the
4491      * backfilled portion in addition to continuing backfill.
4492      */
4493     pinfo.stats = info.stats;
4494   }
4495 }
4496
4497 void PeeringState::set_revert_with_targets(
4498   const hobject_t &soid,
4499   const set<pg_shard_t> &good_peers)
4500 {
4501   for (auto &&peer: good_peers) {
4502     missing_loc.add_location(soid, peer);
4503   }
4504 }
4505
4506 void PeeringState::prepare_backfill_for_missing(
4507   const hobject_t &soid,
4508   const eversion_t &version,
4509   const vector<pg_shard_t> &targets) {
4510   for (auto &&peer: targets) {
4511     peer_missing[peer].add(soid, version, eversion_t(), false);
4512   }
4513 }
4514
4515 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4516 {
4517   info.hit_set = hset_history;
4518 }
4519
4520 /*------------ Peering State Machine----------------*/
4521 #undef dout_prefix
4522 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4523                     << "state<" << get_state_name() << ">: ")
4524 #undef psdout
4525 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4526
4527 #define DECLARE_LOCALS                                  \
4528   PeeringState *ps = context< PeeringMachine >().state; \
4529   std::ignore = ps;                                     \
4530   PeeringListener *pl = context< PeeringMachine >().pl; \
4531   std::ignore = pl
4532
4533
4534 /*------Crashed-------*/
4535 PeeringState::Crashed::Crashed(my_context ctx)
4536   : my_base(ctx),
4537     NamedState(context< PeeringMachine >().state_history, "Crashed")
4538 {
4539   context< PeeringMachine >().log_enter(state_name);
4540   ceph_abort_msg("we got a bad state machine event");
4541 }
4542
4543
4544 /*------Initial-------*/
4545 PeeringState::Initial::Initial(my_context ctx)
4546   : my_base(ctx),
4547     NamedState(context< PeeringMachine >().state_history, "Initial")
4548 {
4549   context< PeeringMachine >().log_enter(state_name);
4550 }
4551
4552 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4553 {
4554   DECLARE_LOCALS;
4555   ps->proc_replica_info(
4556     notify.from, notify.notify.info, notify.notify.epoch_sent);
4557   ps->set_last_peering_reset();
4558   return transit< Primary >();
4559 }
4560
4561 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4562 {
4563   DECLARE_LOCALS;
4564   ceph_assert(!ps->is_primary());
4565   post_event(i);
4566   return transit< Stray >();
4567 }
4568
4569 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4570 {
4571   DECLARE_LOCALS;
4572   ceph_assert(!ps->is_primary());
4573   post_event(i);
4574   return transit< Stray >();
4575 }
4576
4577 void PeeringState::Initial::exit()
4578 {
4579   context< PeeringMachine >().log_exit(state_name, enter_time);
4580   DECLARE_LOCALS;
4581   utime_t dur = ceph_clock_now() - enter_time;
4582   pl->get_peering_perf().tinc(rs_initial_latency, dur);
4583 }
4584
4585 /*------Started-------*/
4586 PeeringState::Started::Started(my_context ctx)
4587   : my_base(ctx),
4588     NamedState(context< PeeringMachine >().state_history, "Started")
4589 {
4590   context< PeeringMachine >().log_enter(state_name);
4591 }
4592
4593 boost::statechart::result
4594 PeeringState::Started::react(const IntervalFlush&)
4595 {
4596   psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4597   context< PeeringMachine >().state->end_block_outgoing();
4598   return discard_event();
4599 }
4600
4601 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4602 {
4603   DECLARE_LOCALS;
4604   psdout(10) << "Started advmap" << dendl;
4605   ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4606   if (ps->should_restart_peering(
4607         advmap.up_primary,
4608         advmap.acting_primary,
4609         advmap.newup,
4610         advmap.newacting,
4611         advmap.lastmap,
4612         advmap.osdmap)) {
4613     psdout(10) << "should_restart_peering, transitioning to Reset"
4614                        << dendl;
4615     post_event(advmap);
4616     return transit< Reset >();
4617   }
4618   ps->remove_down_peer_info(advmap.osdmap);
4619   return discard_event();
4620 }
4621
4622 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4623 {
4624   q.f->open_object_section("state");
4625   q.f->dump_string("name", state_name);
4626   q.f->dump_stream("enter_time") << enter_time;
4627   q.f->close_section();
4628   return discard_event();
4629 }
4630
4631 boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
4632 {
4633   q.f->dump_string("state", "Started");
4634   q.f->dump_bool("available_might_have_unfound", false);
4635   return discard_event();
4636 }
4637
4638 void PeeringState::Started::exit()
4639 {
4640   context< PeeringMachine >().log_exit(state_name, enter_time);
4641   DECLARE_LOCALS;
4642   utime_t dur = ceph_clock_now() - enter_time;
4643   pl->get_peering_perf().tinc(rs_started_latency, dur);
4644   ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4645 }
4646
4647 /*--------Reset---------*/
4648 PeeringState::Reset::Reset(my_context ctx)
4649   : my_base(ctx),
4650     NamedState(context< PeeringMachine >().state_history, "Reset")
4651 {
4652   context< PeeringMachine >().log_enter(state_name);
4653   DECLARE_LOCALS;
4654
4655   ps->flushes_in_progress = 0;
4656   ps->set_last_peering_reset();
4657   ps->log_weirdness();
4658 }
4659
4660 boost::statechart::result
4661 PeeringState::Reset::react(const IntervalFlush&)
4662 {
4663   psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4664   context< PeeringMachine >().state->end_block_outgoing();
4665   return discard_event();
4666 }
4667
4668 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4669 {
4670   DECLARE_LOCALS;
4671   psdout(10) << "Reset advmap" << dendl;
4672
4673   ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4674
4675   if (ps->should_restart_peering(
4676         advmap.up_primary,
4677         advmap.acting_primary,
4678         advmap.newup,
4679         advmap.newacting,
4680         advmap.lastmap,
4681         advmap.osdmap)) {
4682     psdout(10) << "should restart peering, calling start_peering_interval again"
4683                        << dendl;
4684     ps->start_peering_interval(
4685       advmap.lastmap,
4686       advmap.newup, advmap.up_primary,
4687       advmap.newacting, advmap.acting_primary,
4688       context< PeeringMachine >().get_cur_transaction());
4689   }
4690   ps->remove_down_peer_info(advmap.osdmap);
4691   ps->check_past_interval_bounds();
4692   return discard_event();
4693 }
4694
4695 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4696 {
4697   DECLARE_LOCALS;
4698   if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4699     ps->info.history.refresh_prior_readable_until_ub(
4700       pl->get_mnow(),
4701       ps->prior_readable_until_ub);
4702     context< PeeringMachine >().send_notify(
4703       ps->get_primary().osd,
4704       pg_notify_t(
4705         ps->get_primary().shard, ps->pg_whoami.shard,
4706         ps->get_osdmap_epoch(),
4707         ps->get_osdmap_epoch(),
4708         ps->info,
4709         ps->past_intervals));
4710   }
4711
4712   ps->update_heartbeat_peers();
4713
4714   return transit< Started >();
4715 }
4716
4717 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4718 {
4719   q.f->open_object_section("state");
4720   q.f->dump_string("name", state_name);
4721   q.f->dump_stream("enter_time") << enter_time;
4722   q.f->close_section();
4723   return discard_event();
4724 }
4725
4726 boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
4727 {
4728   q.f->dump_string("state", "Reset");
4729   q.f->dump_bool("available_might_have_unfound", false);
4730   return discard_event();
4731 }
4732
4733 void PeeringState::Reset::exit()
4734 {
4735   context< PeeringMachine >().log_exit(state_name, enter_time);
4736   DECLARE_LOCALS;
4737   utime_t dur = ceph_clock_now() - enter_time;
4738   pl->get_peering_perf().tinc(rs_reset_latency, dur);
4739 }
4740
4741 /*-------Start---------*/
4742 PeeringState::Start::Start(my_context ctx)
4743   : my_base(ctx),
4744     NamedState(context< PeeringMachine >().state_history, "Start")
4745 {
4746   context< PeeringMachine >().log_enter(state_name);
4747
4748   DECLARE_LOCALS;
4749   if (ps->is_primary()) {
4750     psdout(1) << "transitioning to Primary" << dendl;
4751     post_event(MakePrimary());
4752   } else { //is_stray
4753     psdout(1) << "transitioning to Stray" << dendl;
4754     post_event(MakeStray());
4755   }
4756 }
4757
4758 void PeeringState::Start::exit()
4759 {
4760   context< PeeringMachine >().log_exit(state_name, enter_time);
4761   DECLARE_LOCALS;
4762   utime_t dur = ceph_clock_now() - enter_time;
4763   pl->get_peering_perf().tinc(rs_start_latency, dur);
4764 }
4765
4766 /*---------Primary--------*/
4767 PeeringState::Primary::Primary(my_context ctx)
4768   : my_base(ctx),
4769     NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4770 {
4771   context< PeeringMachine >().log_enter(state_name);
4772   DECLARE_LOCALS;
4773   ceph_assert(ps->want_acting.empty());
4774
4775   // set CREATING bit until we have peered for the first time.
4776   if (ps->info.history.last_epoch_started == 0) {
4777     ps->state_set(PG_STATE_CREATING);
4778     // use the history timestamp, which ultimately comes from the
4779     // monitor in the create case.
4780     utime_t t = ps->info.history.last_scrub_stamp;
4781     ps->info.stats.last_fresh = t;
4782     ps->info.stats.last_active = t;
4783     ps->info.stats.last_change = t;
4784     ps->info.stats.last_peered = t;
4785     ps->info.stats.last_clean = t;
4786     ps->info.stats.last_unstale = t;
4787     ps->info.stats.last_undegraded = t;
4788     ps->info.stats.last_fullsized = t;
4789     ps->info.stats.last_scrub_stamp = t;
4790     ps->info.stats.last_deep_scrub_stamp = t;
4791     ps->info.stats.last_clean_scrub_stamp = t;
4792   }
4793 }
4794
4795 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4796 {
4797   DECLARE_LOCALS;
4798   psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4799   ps->proc_replica_info(
4800     notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4801   return discard_event();
4802 }
4803
4804 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4805 {
4806   DECLARE_LOCALS;
4807   psdout(7) << "handle ActMap primary" << dendl;
4808   pl->publish_stats_to_osd();
4809   return discard_event();
4810 }
4811
4812 boost::statechart::result PeeringState::Primary::react(
4813   const SetForceRecovery&)
4814 {
4815   DECLARE_LOCALS;
4816   ps->set_force_recovery(true);
4817   return discard_event();
4818 }
4819
4820 boost::statechart::result PeeringState::Primary::react(
4821   const UnsetForceRecovery&)
4822 {
4823   DECLARE_LOCALS;
4824   ps->set_force_recovery(false);
4825   return discard_event();
4826 }
4827
4828 boost::statechart::result PeeringState::Primary::react(
4829   const RequestScrub& evt)
4830 {
4831   DECLARE_LOCALS;
4832   if (ps->is_primary()) {
4833     pl->scrub_requested(evt.deep, evt.repair);
4834     psdout(10) << "marking for scrub" << dendl;
4835   }
4836   return discard_event();
4837 }
4838
4839 boost::statechart::result PeeringState::Primary::react(
4840   const SetForceBackfill&)
4841 {
4842   DECLARE_LOCALS;
4843   ps->set_force_backfill(true);
4844   return discard_event();
4845 }
4846
4847 boost::statechart::result PeeringState::Primary::react(
4848   const UnsetForceBackfill&)
4849 {
4850   DECLARE_LOCALS;
4851   ps->set_force_backfill(false);
4852   return discard_event();
4853 }
4854
4855 void PeeringState::Primary::exit()
4856 {
4857   context< PeeringMachine >().log_exit(state_name, enter_time);
4858   DECLARE_LOCALS;
4859   ps->want_acting.clear();
4860   utime_t dur = ceph_clock_now() - enter_time;
4861   pl->get_peering_perf().tinc(rs_primary_latency, dur);
4862   pl->clear_primary_state();
4863   ps->state_clear(PG_STATE_CREATING);
4864 }
4865
4866 /*---------Peering--------*/
4867 PeeringState::Peering::Peering(my_context ctx)
4868   : my_base(ctx),
4869     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4870     history_les_bound(false)
4871 {
4872   context< PeeringMachine >().log_enter(state_name);
4873   DECLARE_LOCALS;
4874
4875   ceph_assert(!ps->is_peered());
4876   ceph_assert(!ps->is_peering());
4877   ceph_assert(ps->is_primary());
4878   ps->state_set(PG_STATE_PEERING);
4879 }
4880
4881 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4882 {
4883   DECLARE_LOCALS;
4884   psdout(10) << "Peering advmap" << dendl;
4885   if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4886     psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4887     post_event(advmap);
4888     return transit< Reset >();
4889   }
4890
4891   ps->adjust_need_up_thru(advmap.osdmap);
4892   ps->check_prior_readable_down_osds(advmap.osdmap);
4893
4894   return forward_event();
4895 }
4896
4897 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4898 {
4899   DECLARE_LOCALS;
4900
4901   q.f->open_object_section("state");
4902   q.f->dump_string("name", state_name);
4903   q.f->dump_stream("enter_time") << enter_time;
4904
4905   q.f->open_array_section("past_intervals");
4906   ps->past_intervals.dump(q.f);
4907   q.f->close_section();
4908
4909   q.f->open_array_section("probing_osds");
4910   for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
4911     q.f->dump_stream("osd") << *p;
4912   q.f->close_section();
4913
4914   if (prior_set.pg_down)
4915     q.f->dump_string("blocked", "peering is blocked due to down osds");
4916
4917   q.f->open_array_section("down_osds_we_would_probe");
4918   for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
4919     q.f->dump_int("osd", *p);
4920   q.f->close_section();
4921
4922   q.f->open_array_section("peering_blocked_by");
4923   for (auto p = prior_set.blocked_by.begin();
4924        p != prior_set.blocked_by.end();
4925        ++p) {
4926     q.f->open_object_section("osd");
4927     q.f->dump_int("osd", p->first);
4928     q.f->dump_int("current_lost_at", p->second);
4929     q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4930     q.f->close_section();
4931   }
4932   q.f->close_section();
4933
4934   if (history_les_bound) {
4935     q.f->open_array_section("peering_blocked_by_detail");
4936     q.f->open_object_section("item");
4937     q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4938     q.f->close_section();
4939     q.f->close_section();
4940   }
4941
4942   q.f->close_section();
4943   return forward_event();
4944 }
4945
4946 boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
4947 {
4948   q.f->dump_string("state", "Peering");
4949   q.f->dump_bool("available_might_have_unfound", false);
4950   return discard_event();
4951 }
4952
4953 void PeeringState::Peering::exit()
4954 {
4955
4956   DECLARE_LOCALS;
4957   psdout(10) << "Leaving Peering" << dendl;
4958   context< PeeringMachine >().log_exit(state_name, enter_time);
4959   ps->state_clear(PG_STATE_PEERING);
4960   pl->clear_probe_targets();
4961
4962   utime_t dur = ceph_clock_now() - enter_time;
4963   pl->get_peering_perf().tinc(rs_peering_latency, dur);
4964 }
4965
4966
4967 /*------Backfilling-------*/
4968 PeeringState::Backfilling::Backfilling(my_context ctx)
4969   : my_base(ctx),
4970     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4971 {
4972   context< PeeringMachine >().log_enter(state_name);
4973
4974
4975   DECLARE_LOCALS;
4976   ps->backfill_reserved = true;
4977   pl->on_backfill_reserved();
4978   ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4979   ps->state_clear(PG_STATE_BACKFILL_WAIT);
4980   ps->state_set(PG_STATE_BACKFILLING);
4981   pl->publish_stats_to_osd();
4982 }
4983
4984 void PeeringState::Backfilling::backfill_release_reservations()
4985 {
4986   DECLARE_LOCALS;
4987   pl->cancel_local_background_io_reservation();
4988   for (auto it = ps->backfill_targets.begin();
4989        it != ps->backfill_targets.end();
4990        ++it) {
4991     ceph_assert(*it != ps->pg_whoami);
4992     pl->send_cluster_message(
4993       it->osd,
4994       TOPNSPC::make_message<MBackfillReserve>(
4995         MBackfillReserve::RELEASE,
4996         spg_t(ps->info.pgid.pgid, it->shard),
4997         ps->get_osdmap_epoch()),
4998       ps->get_osdmap_epoch());
4999   }
5000 }
5001
5002 void PeeringState::Backfilling::cancel_backfill()
5003 {
5004   DECLARE_LOCALS;
5005   backfill_release_reservations();
5006   pl->on_backfill_canceled();
5007 }
5008
5009 boost::statechart::result
5010 PeeringState::Backfilling::react(const Backfilled &c)
5011 {
5012   backfill_release_reservations();
5013   return transit<Recovered>();
5014 }
5015
5016 boost::statechart::result
5017 PeeringState::Backfilling::react(const DeferBackfill &c)
5018 {
5019   DECLARE_LOCALS;
5020
5021   psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
5022   ps->state_set(PG_STATE_BACKFILL_WAIT);
5023   ps->state_clear(PG_STATE_BACKFILLING);
5024   cancel_backfill();
5025
5026   pl->schedule_event_after(
5027     std::make_shared<PGPeeringEvent>(
5028       ps->get_osdmap_epoch(),
5029       ps->get_osdmap_epoch(),
5030       RequestBackfill()),
5031     c.delay);
5032   return transit<NotBackfilling>();
5033 }
5034
5035 boost::statechart::result
5036 PeeringState::Backfilling::react(const UnfoundBackfill &c)
5037 {
5038   DECLARE_LOCALS;
5039   psdout(10) << "backfill has unfound, can't continue" << dendl;
5040   ps->state_set(PG_STATE_BACKFILL_UNFOUND);
5041   ps->state_clear(PG_STATE_BACKFILLING);
5042   cancel_backfill();
5043   return transit<NotBackfilling>();
5044 }
5045
5046 boost::statechart::result
5047 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
5048 {
5049   DECLARE_LOCALS;
5050
5051   ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5052   ps->state_clear(PG_STATE_BACKFILLING);
5053   cancel_backfill();
5054
5055   pl->schedule_event_after(
5056     std::make_shared<PGPeeringEvent>(
5057       ps->get_osdmap_epoch(),
5058       ps->get_osdmap_epoch(),
5059       RequestBackfill()),
5060     ps->cct->_conf->osd_backfill_retry_interval);
5061
5062   return transit<NotBackfilling>();
5063 }
5064
5065 boost::statechart::result
5066 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
5067 {
5068   DECLARE_LOCALS;
5069   ps->state_set(PG_STATE_BACKFILL_WAIT);
5070   cancel_backfill();
5071   if (ps->needs_backfill()) {
5072     return transit<WaitLocalBackfillReserved>();
5073   } else {
5074     // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
5075     return discard_event();
5076   }
5077 }
5078
5079 void PeeringState::Backfilling::exit()
5080 {
5081   context< PeeringMachine >().log_exit(state_name, enter_time);
5082   DECLARE_LOCALS;
5083   ps->backfill_reserved = false;
5084   ps->state_clear(PG_STATE_BACKFILLING);
5085   ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5086   utime_t dur = ceph_clock_now() - enter_time;
5087   pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
5088 }
5089
5090 /*--WaitRemoteBackfillReserved--*/
5091
5092 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
5093   : my_base(ctx),
5094     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
5095     backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
5096 {
5097   context< PeeringMachine >().log_enter(state_name);
5098   DECLARE_LOCALS;
5099
5100   ps->state_set(PG_STATE_BACKFILL_WAIT);
5101   pl->publish_stats_to_osd();
5102   post_event(RemoteBackfillReserved());
5103 }
5104
5105 boost::statechart::result
5106 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
5107 {
5108   DECLARE_LOCALS;
5109
5110   int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
5111   psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
5112   if (backfill_osd_it !=
5113       context< Active >().remote_shards_to_reserve_backfill.end()) {
5114     // The primary never backfills itself
5115     ceph_assert(*backfill_osd_it != ps->pg_whoami);
5116     pl->send_cluster_message(
5117       backfill_osd_it->osd,
5118       TOPNSPC::make_message<MBackfillReserve>(
5119         MBackfillReserve::REQUEST,
5120         spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
5121         ps->get_osdmap_epoch(),
5122         ps->get_backfill_priority(),
5123         num_bytes,
5124         ps->peer_bytes[*backfill_osd_it]),
5125       ps->get_osdmap_epoch());
5126     ++backfill_osd_it;
5127   } else {
5128     ps->peer_bytes.clear();
5129     post_event(AllBackfillsReserved());
5130   }
5131   return discard_event();
5132 }
5133
5134 void PeeringState::WaitRemoteBackfillReserved::exit()
5135 {
5136   context< PeeringMachine >().log_exit(state_name, enter_time);
5137   DECLARE_LOCALS;
5138
5139   utime_t dur = ceph_clock_now() - enter_time;
5140   pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
5141 }
5142
5143 void PeeringState::WaitRemoteBackfillReserved::retry()
5144 {
5145   DECLARE_LOCALS;
5146   pl->cancel_local_background_io_reservation();
5147
5148   // Send CANCEL to all previously acquired reservations
5149   set<pg_shard_t>::const_iterator it, begin, end;
5150   begin = context< Active >().remote_shards_to_reserve_backfill.begin();
5151   end = context< Active >().remote_shards_to_reserve_backfill.end();
5152   ceph_assert(begin != end);
5153   for (it = begin; it != backfill_osd_it; ++it) {
5154     // The primary never backfills itself
5155     ceph_assert(*it != ps->pg_whoami);
5156     pl->send_cluster_message(
5157       it->osd,
5158       TOPNSPC::make_message<MBackfillReserve>(
5159         MBackfillReserve::RELEASE,
5160         spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
5161         ps->get_osdmap_epoch()),
5162       ps->get_osdmap_epoch());
5163   }
5164
5165   ps->state_clear(PG_STATE_BACKFILL_WAIT);
5166   pl->publish_stats_to_osd();
5167
5168   pl->schedule_event_after(
5169     std::make_shared<PGPeeringEvent>(
5170       ps->get_osdmap_epoch(),
5171       ps->get_osdmap_epoch(),
5172       RequestBackfill()),
5173     ps->cct->_conf->osd_backfill_retry_interval);
5174 }
5175
5176 boost::statechart::result
5177 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
5178 {
5179   DECLARE_LOCALS;
5180   ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5181   retry();
5182   return transit<NotBackfilling>();
5183 }
5184
5185 boost::statechart::result
5186 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
5187 {
5188   retry();
5189   return transit<NotBackfilling>();
5190 }
5191
5192 /*--WaitLocalBackfillReserved--*/
5193 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
5194   : my_base(ctx),
5195     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
5196 {
5197   context< PeeringMachine >().log_enter(state_name);
5198   DECLARE_LOCALS;
5199
5200   ps->state_set(PG_STATE_BACKFILL_WAIT);
5201   pl->request_local_background_io_reservation(
5202     ps->get_backfill_priority(),
5203     std::make_unique<PGPeeringEvent>(
5204       ps->get_osdmap_epoch(),
5205       ps->get_osdmap_epoch(),
5206       LocalBackfillReserved()),
5207     std::make_unique<PGPeeringEvent>(
5208       ps->get_osdmap_epoch(),
5209       ps->get_osdmap_epoch(),
5210       DeferBackfill(0.0)));
5211   pl->publish_stats_to_osd();
5212 }
5213
5214 void PeeringState::WaitLocalBackfillReserved::exit()
5215 {
5216   context< PeeringMachine >().log_exit(state_name, enter_time);
5217   DECLARE_LOCALS;
5218   utime_t dur = ceph_clock_now() - enter_time;
5219   pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
5220 }
5221
5222 /*----NotBackfilling------*/
5223 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
5224   : my_base(ctx),
5225     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
5226 {
5227   context< PeeringMachine >().log_enter(state_name);
5228   DECLARE_LOCALS;
5229   ps->state_clear(PG_STATE_REPAIR);
5230   pl->publish_stats_to_osd();
5231 }
5232
5233 boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
5234 {
5235   DECLARE_LOCALS;
5236
5237   ps->query_unfound(q.f, "NotBackfilling");
5238   return discard_event();
5239 }
5240
5241 boost::statechart::result
5242 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
5243 {
5244   return discard_event();
5245 }
5246
5247 boost::statechart::result
5248 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
5249 {
5250   return discard_event();
5251 }
5252
5253 void PeeringState::NotBackfilling::exit()
5254 {
5255   context< PeeringMachine >().log_exit(state_name, enter_time);
5256
5257   DECLARE_LOCALS;
5258   ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
5259   utime_t dur = ceph_clock_now() - enter_time;
5260   pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
5261 }
5262
5263 /*----NotRecovering------*/
5264 PeeringState::NotRecovering::NotRecovering(my_context ctx)
5265   : my_base(ctx),
5266     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
5267 {
5268   context< PeeringMachine >().log_enter(state_name);
5269   DECLARE_LOCALS;
5270   ps->state_clear(PG_STATE_REPAIR);
5271   pl->publish_stats_to_osd();
5272 }
5273
5274 boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
5275 {
5276   DECLARE_LOCALS;
5277
5278   ps->query_unfound(q.f, "NotRecovering");
5279   return discard_event();
5280 }
5281
5282 void PeeringState::NotRecovering::exit()
5283 {
5284   context< PeeringMachine >().log_exit(state_name, enter_time);
5285
5286   DECLARE_LOCALS;
5287   ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
5288   utime_t dur = ceph_clock_now() - enter_time;
5289   pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
5290 }
5291
5292 /*---RepNotRecovering----*/
5293 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
5294   : my_base(ctx),
5295     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
5296 {
5297   context< PeeringMachine >().log_enter(state_name);
5298 }
5299
5300 boost::statechart::result
5301 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
5302 {
5303   DECLARE_LOCALS;
5304   ps->reject_reservation();
5305   post_event(RemoteReservationRejectedTooFull());
5306   return discard_event();
5307 }
5308
5309 void PeeringState::RepNotRecovering::exit()
5310 {
5311   context< PeeringMachine >().log_exit(state_name, enter_time);
5312   DECLARE_LOCALS;
5313   utime_t dur = ceph_clock_now() - enter_time;
5314   pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
5315 }
5316
5317 /*---RepWaitRecoveryReserved--*/
5318 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
5319   : my_base(ctx),
5320     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
5321 {
5322   context< PeeringMachine >().log_enter(state_name);
5323 }
5324
5325 boost::statechart::result
5326 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
5327 {
5328   DECLARE_LOCALS;
5329   pl->send_cluster_message(
5330     ps->primary.osd,
5331     TOPNSPC::make_message<MRecoveryReserve>(
5332       MRecoveryReserve::GRANT,
5333       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5334       ps->get_osdmap_epoch()),
5335     ps->get_osdmap_epoch());
5336   return transit<RepRecovering>();
5337 }
5338
5339 boost::statechart::result
5340 PeeringState::RepWaitRecoveryReserved::react(
5341   const RemoteReservationCanceled &evt)
5342 {
5343   DECLARE_LOCALS;
5344   pl->unreserve_recovery_space();
5345
5346   pl->cancel_remote_recovery_reservation();
5347   return transit<RepNotRecovering>();
5348 }
5349
5350 void PeeringState::RepWaitRecoveryReserved::exit()
5351 {
5352   context< PeeringMachine >().log_exit(state_name, enter_time);
5353   DECLARE_LOCALS;
5354   utime_t dur = ceph_clock_now() - enter_time;
5355   pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5356 }
5357
5358 /*-RepWaitBackfillReserved*/
5359 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5360   : my_base(ctx),
5361     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5362 {
5363   context< PeeringMachine >().log_enter(state_name);
5364 }
5365
5366 boost::statechart::result
5367 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5368 {
5369
5370   DECLARE_LOCALS;
5371
5372   if (!pl->try_reserve_recovery_space(
5373         evt.primary_num_bytes, evt.local_num_bytes)) {
5374     post_event(RejectTooFullRemoteReservation());
5375   } else {
5376     PGPeeringEventURef preempt;
5377     if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5378       // older peers will interpret preemption as TOOFULL
5379       preempt = std::make_unique<PGPeeringEvent>(
5380         pl->get_osdmap_epoch(),
5381         pl->get_osdmap_epoch(),
5382         RemoteBackfillPreempted());
5383     }
5384     pl->request_remote_recovery_reservation(
5385       evt.priority,
5386       std::make_unique<PGPeeringEvent>(
5387         pl->get_osdmap_epoch(),
5388         pl->get_osdmap_epoch(),
5389         RemoteBackfillReserved()),
5390       std::move(preempt));
5391   }
5392   return transit<RepWaitBackfillReserved>();
5393 }
5394
5395 boost::statechart::result
5396 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5397 {
5398   DECLARE_LOCALS;
5399
5400   // fall back to a local reckoning of priority of primary doesn't pass one
5401   // (pre-mimic compat)
5402   int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5403
5404   PGPeeringEventURef preempt;
5405   if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5406     // older peers can't handle this
5407     preempt = std::make_unique<PGPeeringEvent>(
5408       ps->get_osdmap_epoch(),
5409       ps->get_osdmap_epoch(),
5410       RemoteRecoveryPreempted());
5411   }
5412
5413   pl->request_remote_recovery_reservation(
5414     prio,
5415     std::make_unique<PGPeeringEvent>(
5416       ps->get_osdmap_epoch(),
5417       ps->get_osdmap_epoch(),
5418       RemoteRecoveryReserved()),
5419     std::move(preempt));
5420   return transit<RepWaitRecoveryReserved>();
5421 }
5422
5423 void PeeringState::RepWaitBackfillReserved::exit()
5424 {
5425   context< PeeringMachine >().log_exit(state_name, enter_time);
5426   DECLARE_LOCALS;
5427   utime_t dur = ceph_clock_now() - enter_time;
5428   pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5429 }
5430
5431 boost::statechart::result
5432 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5433 {
5434   DECLARE_LOCALS;
5435
5436
5437   pl->send_cluster_message(
5438       ps->primary.osd,
5439       TOPNSPC::make_message<MBackfillReserve>(
5440         MBackfillReserve::GRANT,
5441         spg_t(ps->info.pgid.pgid, ps->primary.shard),
5442         ps->get_osdmap_epoch()),
5443       ps->get_osdmap_epoch());
5444   return transit<RepRecovering>();
5445 }
5446
5447 boost::statechart::result
5448 PeeringState::RepWaitBackfillReserved::react(
5449   const RejectTooFullRemoteReservation &evt)
5450 {
5451   DECLARE_LOCALS;
5452   ps->reject_reservation();
5453   post_event(RemoteReservationRejectedTooFull());
5454   return discard_event();
5455 }
5456
5457 boost::statechart::result
5458 PeeringState::RepWaitBackfillReserved::react(
5459   const RemoteReservationRejectedTooFull &evt)
5460 {
5461   DECLARE_LOCALS;
5462   pl->unreserve_recovery_space();
5463
5464   pl->cancel_remote_recovery_reservation();
5465   return transit<RepNotRecovering>();
5466 }
5467
5468 boost::statechart::result
5469 PeeringState::RepWaitBackfillReserved::react(
5470   const RemoteReservationCanceled &evt)
5471 {
5472   DECLARE_LOCALS;
5473   pl->unreserve_recovery_space();
5474
5475   pl->cancel_remote_recovery_reservation();
5476   return transit<RepNotRecovering>();
5477 }
5478
5479 /*---RepRecovering-------*/
5480 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5481   : my_base(ctx),
5482     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5483 {
5484   context< PeeringMachine >().log_enter(state_name);
5485 }
5486
5487 boost::statechart::result
5488 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5489 {
5490   DECLARE_LOCALS;
5491
5492
5493   pl->unreserve_recovery_space();
5494   pl->send_cluster_message(
5495     ps->primary.osd,
5496     TOPNSPC::make_message<MRecoveryReserve>(
5497       MRecoveryReserve::REVOKE,
5498       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5499       ps->get_osdmap_epoch()),
5500     ps->get_osdmap_epoch());
5501   return discard_event();
5502 }
5503
5504 boost::statechart::result
5505 PeeringState::RepRecovering::react(const BackfillTooFull &)
5506 {
5507   DECLARE_LOCALS;
5508
5509
5510   pl->unreserve_recovery_space();
5511   pl->send_cluster_message(
5512     ps->primary.osd,
5513     TOPNSPC::make_message<MBackfillReserve>(
5514       MBackfillReserve::REVOKE_TOOFULL,
5515       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5516       ps->get_osdmap_epoch()),
5517     ps->get_osdmap_epoch());
5518   return discard_event();
5519 }
5520
5521 boost::statechart::result
5522 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5523 {
5524   DECLARE_LOCALS;
5525
5526
5527   pl->unreserve_recovery_space();
5528   pl->send_cluster_message(
5529     ps->primary.osd,
5530     TOPNSPC::make_message<MBackfillReserve>(
5531       MBackfillReserve::REVOKE,
5532       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5533       ps->get_osdmap_epoch()),
5534     ps->get_osdmap_epoch());
5535   return discard_event();
5536 }
5537
5538 void PeeringState::RepRecovering::exit()
5539 {
5540   context< PeeringMachine >().log_exit(state_name, enter_time);
5541   DECLARE_LOCALS;
5542   pl->unreserve_recovery_space();
5543
5544   pl->cancel_remote_recovery_reservation();
5545   utime_t dur = ceph_clock_now() - enter_time;
5546   pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5547 }
5548
5549 /*------Activating--------*/
5550 PeeringState::Activating::Activating(my_context ctx)
5551   : my_base(ctx),
5552     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5553 {
5554   context< PeeringMachine >().log_enter(state_name);
5555 }
5556
5557 void PeeringState::Activating::exit()
5558 {
5559   context< PeeringMachine >().log_exit(state_name, enter_time);
5560   DECLARE_LOCALS;
5561   utime_t dur = ceph_clock_now() - enter_time;
5562   pl->get_peering_perf().tinc(rs_activating_latency, dur);
5563 }
5564
5565 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5566   : my_base(ctx),
5567     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5568 {
5569   context< PeeringMachine >().log_enter(state_name);
5570   DECLARE_LOCALS;
5571
5572   // Make sure all nodes that part of the recovery aren't full
5573   if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5574       ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5575     post_event(RecoveryTooFull());
5576     return;
5577   }
5578
5579   ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5580   ps->state_set(PG_STATE_RECOVERY_WAIT);
5581   pl->request_local_background_io_reservation(
5582     ps->get_recovery_priority(),
5583     std::make_unique<PGPeeringEvent>(
5584       ps->get_osdmap_epoch(),
5585       ps->get_osdmap_epoch(),
5586       LocalRecoveryReserved()),
5587     std::make_unique<PGPeeringEvent>(
5588       ps->get_osdmap_epoch(),
5589       ps->get_osdmap_epoch(),
5590       DeferRecovery(0.0)));
5591   pl->publish_stats_to_osd();
5592 }
5593
5594 boost::statechart::result
5595 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5596 {
5597   DECLARE_LOCALS;
5598   ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5599   pl->schedule_event_after(
5600     std::make_shared<PGPeeringEvent>(
5601       ps->get_osdmap_epoch(),
5602       ps->get_osdmap_epoch(),
5603       DoRecovery()),
5604     ps->cct->_conf->osd_recovery_retry_interval);
5605   return transit<NotRecovering>();
5606 }
5607
5608 void PeeringState::WaitLocalRecoveryReserved::exit()
5609 {
5610   context< PeeringMachine >().log_exit(state_name, enter_time);
5611   DECLARE_LOCALS;
5612   utime_t dur = ceph_clock_now() - enter_time;
5613   pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5614 }
5615
5616 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5617   : my_base(ctx),
5618     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5619     remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5620 {
5621   context< PeeringMachine >().log_enter(state_name);
5622   post_event(RemoteRecoveryReserved());
5623 }
5624
5625 boost::statechart::result
5626 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5627   DECLARE_LOCALS;
5628
5629   if (remote_recovery_reservation_it !=
5630       context< Active >().remote_shards_to_reserve_recovery.end()) {
5631     ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5632     pl->send_cluster_message(
5633       remote_recovery_reservation_it->osd,
5634       TOPNSPC::make_message<MRecoveryReserve>(
5635         MRecoveryReserve::REQUEST,
5636         spg_t(context< PeeringMachine >().spgid.pgid,
5637               remote_recovery_reservation_it->shard),
5638         ps->get_osdmap_epoch(),
5639         ps->get_recovery_priority()),
5640       ps->get_osdmap_epoch());
5641     ++remote_recovery_reservation_it;
5642   } else {
5643     post_event(AllRemotesReserved());
5644   }
5645   return discard_event();
5646 }
5647
5648 void PeeringState::WaitRemoteRecoveryReserved::exit()
5649 {
5650   context< PeeringMachine >().log_exit(state_name, enter_time);
5651   DECLARE_LOCALS;
5652   utime_t dur = ceph_clock_now() - enter_time;
5653   pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5654 }
5655
5656 PeeringState::Recovering::Recovering(my_context ctx)
5657   : my_base(ctx),
5658     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5659 {
5660   context< PeeringMachine >().log_enter(state_name);
5661
5662   DECLARE_LOCALS;
5663   ps->state_clear(PG_STATE_RECOVERY_WAIT);
5664   ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5665   ps->state_set(PG_STATE_RECOVERING);
5666   pl->on_recovery_reserved();
5667   ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5668   pl->publish_stats_to_osd();
5669 }
5670
5671 void PeeringState::Recovering::release_reservations(bool cancel)
5672 {
5673   DECLARE_LOCALS;
5674   ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5675
5676   // release remote reservations
5677   for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
5678        i != context< Active >().remote_shards_to_reserve_recovery.end();
5679        ++i) {
5680     if (*i == ps->pg_whoami) // skip myself
5681       continue;
5682     pl->send_cluster_message(
5683       i->osd,
5684       TOPNSPC::make_message<MRecoveryReserve>(
5685         MRecoveryReserve::RELEASE,
5686         spg_t(ps->info.pgid.pgid, i->shard),
5687         ps->get_osdmap_epoch()),
5688       ps->get_osdmap_epoch());
5689   }
5690 }
5691
5692 boost::statechart::result
5693 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5694 {
5695   DECLARE_LOCALS;
5696   ps->state_clear(PG_STATE_FORCED_RECOVERY);
5697   release_reservations();
5698   pl->cancel_local_background_io_reservation();
5699   return transit<Recovered>();
5700 }
5701
5702 boost::statechart::result
5703 PeeringState::Recovering::react(const RequestBackfill &evt)
5704 {
5705   DECLARE_LOCALS;
5706
5707   release_reservations();
5708
5709   ps->state_clear(PG_STATE_FORCED_RECOVERY);
5710   pl->cancel_local_background_io_reservation();
5711   pl->publish_stats_to_osd();
5712   // transit any async_recovery_targets back into acting
5713   // so pg won't have to stay undersized for long
5714   // as backfill might take a long time to complete..
5715   if (!ps->async_recovery_targets.empty()) {
5716     pg_shard_t auth_log_shard;
5717     bool history_les_bound = false;
5718     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5719     ps->choose_acting(auth_log_shard, true, &history_les_bound);
5720   }
5721   return transit<WaitLocalBackfillReserved>();
5722 }
5723
5724 boost::statechart::result
5725 PeeringState::Recovering::react(const DeferRecovery &evt)
5726 {
5727   DECLARE_LOCALS;
5728   if (!ps->state_test(PG_STATE_RECOVERING)) {
5729     // we may have finished recovery and have an AllReplicasRecovered
5730     // event queued to move us to the next state.
5731     psdout(10) << "got defer recovery but not recovering" << dendl;
5732     return discard_event();
5733   }
5734   psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5735   ps->state_set(PG_STATE_RECOVERY_WAIT);
5736   pl->cancel_local_background_io_reservation();
5737   release_reservations(true);
5738   pl->schedule_event_after(
5739     std::make_shared<PGPeeringEvent>(
5740       ps->get_osdmap_epoch(),
5741       ps->get_osdmap_epoch(),
5742       DoRecovery()),
5743     evt.delay);
5744   return transit<NotRecovering>();
5745 }
5746
5747 boost::statechart::result
5748 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5749 {
5750   DECLARE_LOCALS;
5751   psdout(10) << "recovery has unfound, can't continue" << dendl;
5752   ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5753   pl->cancel_local_background_io_reservation();
5754   release_reservations(true);
5755   return transit<NotRecovering>();
5756 }
5757
5758 void PeeringState::Recovering::exit()
5759 {
5760   context< PeeringMachine >().log_exit(state_name, enter_time);
5761
5762   DECLARE_LOCALS;
5763   utime_t dur = ceph_clock_now() - enter_time;
5764   ps->state_clear(PG_STATE_RECOVERING);
5765   pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5766 }
5767
5768 PeeringState::Recovered::Recovered(my_context ctx)
5769   : my_base(ctx),
5770     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5771 {
5772   pg_shard_t auth_log_shard;
5773
5774   context< PeeringMachine >().log_enter(state_name);
5775
5776   DECLARE_LOCALS;
5777
5778   ceph_assert(!ps->needs_recovery());
5779
5780   // if we finished backfill, all acting are active; recheck if
5781   // DEGRADED | UNDERSIZED is appropriate.
5782   ceph_assert(!ps->acting_recovery_backfill.empty());
5783   if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5784       ps->acting_recovery_backfill.size()) {
5785     ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5786     pl->publish_stats_to_osd();
5787   }
5788
5789   // adjust acting set?  (e.g. because backfill completed...)
5790   bool history_les_bound = false;
5791   if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5792                                                  true, &history_les_bound)) {
5793     ceph_assert(ps->want_acting.size());
5794   } else if (!ps->async_recovery_targets.empty()) {
5795     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5796     ps->choose_acting(auth_log_shard, true, &history_les_bound);
5797   }
5798
5799   if (context< Active >().all_replicas_activated  &&
5800       ps->async_recovery_targets.empty())
5801     post_event(GoClean());
5802 }
5803
5804 void PeeringState::Recovered::exit()
5805 {
5806   context< PeeringMachine >().log_exit(state_name, enter_time);
5807   DECLARE_LOCALS;
5808
5809   utime_t dur = ceph_clock_now() - enter_time;
5810   pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5811 }
5812
5813 PeeringState::Clean::Clean(my_context ctx)
5814   : my_base(ctx),
5815     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5816 {
5817   context< PeeringMachine >().log_enter(state_name);
5818
5819   DECLARE_LOCALS;
5820
5821   if (ps->info.last_complete != ps->info.last_update) {
5822     ceph_abort();
5823   }
5824
5825
5826   ps->try_mark_clean();
5827
5828   context< PeeringMachine >().get_cur_transaction().register_on_commit(
5829     pl->on_clean());
5830 }
5831
5832 void PeeringState::Clean::exit()
5833 {
5834   context< PeeringMachine >().log_exit(state_name, enter_time);
5835
5836   DECLARE_LOCALS;
5837   ps->state_clear(PG_STATE_CLEAN);
5838   utime_t dur = ceph_clock_now() - enter_time;
5839   pl->get_peering_perf().tinc(rs_clean_latency, dur);
5840 }
5841
5842 template <typename T>
5843 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5844 {
5845   set<int> osds_found;
5846   set<pg_shard_t> out;
5847   for (auto i = in.begin(); i != in.end(); ++i) {
5848     if (*i != skip && !osds_found.count(i->osd)) {
5849       osds_found.insert(i->osd);
5850       out.insert(*i);
5851     }
5852   }
5853   return out;
5854 }
5855
5856 /*---------Active---------*/
5857 PeeringState::Active::Active(my_context ctx)
5858   : my_base(ctx),
5859     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5860     remote_shards_to_reserve_recovery(
5861       unique_osd_shard_set(
5862         context< PeeringMachine >().state->pg_whoami,
5863         context< PeeringMachine >().state->acting_recovery_backfill)),
5864     remote_shards_to_reserve_backfill(
5865       unique_osd_shard_set(
5866         context< PeeringMachine >().state->pg_whoami,
5867         context< PeeringMachine >().state->backfill_targets)),
5868     all_replicas_activated(false)
5869 {
5870   context< PeeringMachine >().log_enter(state_name);
5871
5872
5873   DECLARE_LOCALS;
5874
5875   ceph_assert(!ps->backfill_reserved);
5876   ceph_assert(ps->is_primary());
5877   psdout(10) << "In Active, about to call activate" << dendl;
5878   ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5879   ps->activate(context< PeeringMachine >().get_cur_transaction(),
5880                ps->get_osdmap_epoch(),
5881                context< PeeringMachine >().get_recovery_ctx());
5882
5883   // everyone has to commit/ack before we are truly active
5884   ps->blocked_by.clear();
5885   for (auto p = ps->acting_recovery_backfill.begin();
5886        p != ps->acting_recovery_backfill.end();
5887        ++p) {
5888     if (p->shard != ps->pg_whoami.shard) {
5889       ps->blocked_by.insert(p->shard);
5890     }
5891   }
5892   pl->publish_stats_to_osd();
5893   psdout(10) << "Activate Finished" << dendl;
5894 }
5895
5896 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5897 {
5898   DECLARE_LOCALS;
5899
5900   if (ps->should_restart_peering(
5901         advmap.up_primary,
5902         advmap.acting_primary,
5903         advmap.newup,
5904         advmap.newacting,
5905         advmap.lastmap,
5906         advmap.osdmap)) {
5907     psdout(10) << "Active advmap interval change, fast return" << dendl;
5908     return forward_event();
5909   }
5910   psdout(10) << "Active advmap" << dendl;
5911   bool need_publish = false;
5912
5913   pl->on_active_advmap(advmap.osdmap);
5914   if (ps->dirty_big_info) {
5915     // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5916     // purged snaps and (b) perhaps share more snaps that we have purged
5917     // but didn't fit in pg_stat_t.
5918     need_publish = true;
5919     ps->share_pg_info();
5920   }
5921
5922   bool need_acting_change = false;
5923   for (size_t i = 0; i < ps->want_acting.size(); i++) {
5924     int osd = ps->want_acting[i];
5925     if (!advmap.osdmap->is_up(osd)) {
5926       pg_shard_t osd_with_shard(osd, shard_id_t(i));
5927       if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5928         psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5929                    << dendl;
5930         need_acting_change = true;
5931       }
5932     }
5933   }
5934   if (need_acting_change) {
5935      psdout(10) << "Active need acting change, call choose_acting again"
5936                 << dendl;
5937     // possibly because we re-add some strays into the acting set and
5938     // some of them then go down in a subsequent map before we could see
5939     // the map changing the pg temp.
5940     // call choose_acting again to clear them out.
5941     // note that we leave restrict_to_up_acting to false in order to
5942     // not overkill any chosen stray that is still alive.
5943     pg_shard_t auth_log_shard;
5944     bool history_les_bound = false;
5945     ps->remove_down_peer_info(advmap.osdmap);
5946     ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5947   }
5948
5949   /* Check for changes in pool size (if the acting set changed as a result,
5950    * this does not matter) */
5951   if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5952       ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5953     if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5954         ps->actingset.size()) {
5955       ps->state_clear(PG_STATE_UNDERSIZED);
5956     } else {
5957       ps->state_set(PG_STATE_UNDERSIZED);
5958     }
5959     // degraded changes will be detected by call from publish_stats_to_osd()
5960     need_publish = true;
5961   }
5962
5963   // if we haven't reported our PG stats in a long time, do so now.
5964   if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5965     psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5966                        << " epochs" << dendl;
5967     need_publish = true;
5968   }
5969
5970   if (need_publish)
5971     pl->publish_stats_to_osd();
5972
5973   if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5974     pl->recheck_readable();
5975   }
5976
5977   return forward_event();
5978 }
5979
5980 boost::statechart::result PeeringState::Active::react(const ActMap&)
5981 {
5982   DECLARE_LOCALS;
5983   psdout(10) << "Active: handling ActMap" << dendl;
5984   ceph_assert(ps->is_primary());
5985
5986   pl->on_active_actmap();
5987
5988   if (ps->have_unfound()) {
5989     // object may have become unfound
5990     ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5991   }
5992
5993   uint64_t unfound = ps->missing_loc.num_unfound();
5994   if (unfound > 0 &&
5995       ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5996     if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5997       pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5998                             << " objects unfound and apparently lost, would automatically "
5999                             << "mark these objects lost but this feature is not yet implemented "
6000                             << "(osd_auto_mark_unfound_lost)";
6001     } else
6002       pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
6003                              << unfound << " objects unfound and apparently lost";
6004   }
6005
6006   return forward_event();
6007 }
6008
6009 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
6010 {
6011
6012   DECLARE_LOCALS;
6013   ceph_assert(ps->is_primary());
6014   if (ps->peer_info.count(notevt.from)) {
6015     psdout(10) << "Active: got notify from " << notevt.from
6016                        << ", already have info from that osd, ignoring"
6017                        << dendl;
6018   } else if (ps->peer_purged.count(notevt.from)) {
6019     psdout(10) << "Active: got notify from " << notevt.from
6020                        << ", already purged that peer, ignoring"
6021                        << dendl;
6022   } else {
6023     psdout(10) << "Active: got notify from " << notevt.from
6024                        << ", calling proc_replica_info and discover_all_missing"
6025                        << dendl;
6026     ps->proc_replica_info(
6027       notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6028     if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
6029       ps->discover_all_missing(
6030         context<PeeringMachine>().get_recovery_ctx().msgs);
6031     }
6032     // check if it is a previous down acting member that's coming back.
6033     // if so, request pg_temp change to trigger a new interval transition
6034     pg_shard_t auth_log_shard;
6035     bool history_les_bound = false;
6036     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
6037     ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
6038     if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
6039       psdout(10) << "Active: got notify from previous acting member "
6040                  << notevt.from << ", requesting pg_temp change"
6041                  << dendl;
6042     }
6043   }
6044   return discard_event();
6045 }
6046
6047 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
6048 {
6049   DECLARE_LOCALS;
6050   ceph_assert(ps->is_primary());
6051
6052   // peer is informing us of their last_complete_ondisk
6053   ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
6054   ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
6055                                        trim.trim_to);
6056   // trim log when the pg is recovered
6057   ps->calc_min_last_complete_ondisk();
6058   return discard_event();
6059 }
6060
6061 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
6062 {
6063   DECLARE_LOCALS;
6064   ceph_assert(ps->is_primary());
6065
6066   ceph_assert(!ps->acting_recovery_backfill.empty());
6067   if (infoevt.lease_ack) {
6068     ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
6069   }
6070   // don't update history (yet) if we are active and primary; the replica
6071   // may be telling us they have activated (and committed) but we can't
6072   // share that until _everyone_ does the same.
6073   if (ps->is_acting_recovery_backfill(infoevt.from) &&
6074       ps->peer_activated.count(infoevt.from) == 0) {
6075     psdout(10) << " peer osd." << infoevt.from
6076                << " activated and committed" << dendl;
6077     ps->peer_activated.insert(infoevt.from);
6078     ps->blocked_by.erase(infoevt.from.shard);
6079     pl->publish_stats_to_osd();
6080     if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
6081       all_activated_and_committed();
6082     }
6083   }
6084   return discard_event();
6085 }
6086
6087 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
6088 {
6089   DECLARE_LOCALS;
6090   psdout(10) << "searching osd." << logevt.from
6091                      << " log for unfound items" << dendl;
6092   ps->proc_replica_log(
6093     logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
6094   bool got_missing = ps->search_for_missing(
6095     ps->peer_info[logevt.from],
6096     ps->peer_missing[logevt.from],
6097     logevt.from,
6098     context< PeeringMachine >().get_recovery_ctx());
6099   // If there are missing AND we are "fully" active then start recovery now
6100   if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
6101     post_event(DoRecovery());
6102   }
6103   return discard_event();
6104 }
6105
6106 boost::statechart::result PeeringState::Active::react(const QueryState& q)
6107 {
6108   DECLARE_LOCALS;
6109
6110   q.f->open_object_section("state");
6111   q.f->dump_string("name", state_name);
6112   q.f->dump_stream("enter_time") << enter_time;
6113
6114   {
6115     q.f->open_array_section("might_have_unfound");
6116     for (auto p = ps->might_have_unfound.begin();
6117          p != ps->might_have_unfound.end();
6118          ++p) {
6119       q.f->open_object_section("osd");
6120       q.f->dump_stream("osd") << *p;
6121       if (ps->peer_missing.count(*p)) {
6122         q.f->dump_string("status", "already probed");
6123       } else if (ps->peer_missing_requested.count(*p)) {
6124         q.f->dump_string("status", "querying");
6125       } else if (!ps->get_osdmap()->is_up(p->osd)) {
6126         q.f->dump_string("status", "osd is down");
6127       } else {
6128         q.f->dump_string("status", "not queried");
6129       }
6130       q.f->close_section();
6131     }
6132     q.f->close_section();
6133   }
6134   {
6135     q.f->open_object_section("recovery_progress");
6136     q.f->open_array_section("backfill_targets");
6137     for (auto p = ps->backfill_targets.begin();
6138          p != ps->backfill_targets.end(); ++p)
6139       q.f->dump_stream("replica") << *p;
6140     q.f->close_section();
6141     pl->dump_recovery_info(q.f);
6142     q.f->close_section();
6143   }
6144
6145   q.f->close_section();
6146   return forward_event();
6147 }
6148
6149 boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
6150 {
6151   DECLARE_LOCALS;
6152
6153   ps->query_unfound(q.f, "Active");
6154   return discard_event();
6155 }
6156
6157 boost::statechart::result PeeringState::Active::react(
6158   const ActivateCommitted &evt)
6159 {
6160   DECLARE_LOCALS;
6161   ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
6162   ps->peer_activated.insert(ps->pg_whoami);
6163   psdout(10) << "_activate_committed " << evt.epoch
6164              << " peer_activated now " << ps->peer_activated
6165              << " last_interval_started "
6166              << ps->info.history.last_interval_started
6167              << " last_epoch_started "
6168              << ps->info.history.last_epoch_started
6169              << " same_interval_since "
6170              << ps->info.history.same_interval_since
6171              << dendl;
6172   ceph_assert(!ps->acting_recovery_backfill.empty());
6173   if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
6174     all_activated_and_committed();
6175   return discard_event();
6176 }
6177
6178 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
6179 {
6180
6181   DECLARE_LOCALS;
6182   pg_t pgid = context< PeeringMachine >().spgid.pgid;
6183
6184   all_replicas_activated = true;
6185
6186   ps->state_clear(PG_STATE_ACTIVATING);
6187   ps->state_clear(PG_STATE_CREATING);
6188   ps->state_clear(PG_STATE_PREMERGE);
6189
6190   bool merge_target;
6191   if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
6192     ps->state_set(PG_STATE_PEERED);
6193     ps->state_set(PG_STATE_PREMERGE);
6194
6195     if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
6196       if (merge_target) {
6197         pg_t src = pgid;
6198         src.set_ps(ps->pool.info.get_pg_num_pending());
6199         assert(src.get_parent() == pgid);
6200         pl->set_not_ready_to_merge_target(pgid, src);
6201       } else {
6202         pl->set_not_ready_to_merge_source(pgid);
6203       }
6204     }
6205   } else if (!ps->acting_set_writeable()) {
6206     ps->state_set(PG_STATE_PEERED);
6207   } else {
6208     ps->state_set(PG_STATE_ACTIVE);
6209   }
6210
6211   auto mnow = pl->get_mnow();
6212   if (ps->prior_readable_until_ub > mnow) {
6213     psdout(10) << " waiting for prior_readable_until_ub "
6214                << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
6215     ps->state_set(PG_STATE_WAIT);
6216     pl->queue_check_readable(
6217       ps->last_peering_reset,
6218       ps->prior_readable_until_ub - mnow);
6219   } else {
6220     psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
6221                << ps->prior_readable_until_ub << dendl;
6222   }
6223
6224   if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
6225     pl->send_pg_created(pgid);
6226   }
6227
6228   ps->info.history.last_epoch_started = ps->info.last_epoch_started;
6229   ps->info.history.last_interval_started = ps->info.last_interval_started;
6230   ps->dirty_info = true;
6231
6232   ps->share_pg_info();
6233   pl->publish_stats_to_osd();
6234
6235   pl->on_activate_complete();
6236
6237   return discard_event();
6238 }
6239
6240 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
6241 {
6242   DECLARE_LOCALS;
6243   ps->proc_renew_lease();
6244   return discard_event();
6245 }
6246
6247 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
6248 {
6249   DECLARE_LOCALS;
6250   ps->proc_lease_ack(la.from, la.lease_ack);
6251   return discard_event();
6252 }
6253
6254
6255 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
6256 {
6257   DECLARE_LOCALS;
6258   pl->recheck_readable();
6259   return discard_event();
6260 }
6261
6262 /*
6263  * update info.history.last_epoch_started ONLY after we and all
6264  * replicas have activated AND committed the activate transaction
6265  * (i.e. the peering results are stable on disk).
6266  */
6267 void PeeringState::Active::all_activated_and_committed()
6268 {
6269   DECLARE_LOCALS;
6270   psdout(10) << "all_activated_and_committed" << dendl;
6271   ceph_assert(ps->is_primary());
6272   ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
6273   ceph_assert(!ps->acting_recovery_backfill.empty());
6274   ceph_assert(ps->blocked_by.empty());
6275
6276   assert(HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS));
6277   // this is overkill when the activation is quick, but when it is slow it
6278   // is important, because the lease was renewed by the activate itself but we
6279   // don't know how long ago that was, and simply scheduling now may leave
6280   // a gap in lease coverage.  keep it simple and aggressively renew.
6281   ps->renew_lease(pl->get_mnow());
6282   ps->send_lease();
6283   ps->schedule_renew_lease();
6284
6285   // Degraded?
6286   ps->update_calc_stats();
6287   if (ps->info.stats.stats.sum.num_objects_degraded) {
6288     ps->state_set(PG_STATE_DEGRADED);
6289   } else {
6290     ps->state_clear(PG_STATE_DEGRADED);
6291   }
6292
6293   post_event(PeeringState::AllReplicasActivated());
6294 }
6295
6296
6297 void PeeringState::Active::exit()
6298 {
6299   context< PeeringMachine >().log_exit(state_name, enter_time);
6300
6301
6302   DECLARE_LOCALS;
6303   pl->cancel_local_background_io_reservation();
6304
6305   ps->blocked_by.clear();
6306   ps->backfill_reserved = false;
6307   ps->state_clear(PG_STATE_ACTIVATING);
6308   ps->state_clear(PG_STATE_DEGRADED);
6309   ps->state_clear(PG_STATE_UNDERSIZED);
6310   ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
6311   ps->state_clear(PG_STATE_BACKFILL_WAIT);
6312   ps->state_clear(PG_STATE_RECOVERY_WAIT);
6313   ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
6314   utime_t dur = ceph_clock_now() - enter_time;
6315   pl->get_peering_perf().tinc(rs_active_latency, dur);
6316   pl->on_active_exit();
6317 }
6318
6319 /*------ReplicaActive-----*/
6320 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
6321   : my_base(ctx),
6322     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
6323 {
6324   context< PeeringMachine >().log_enter(state_name);
6325
6326   DECLARE_LOCALS;
6327   ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6328 }
6329
6330
6331 boost::statechart::result PeeringState::ReplicaActive::react(
6332   const Activate& actevt) {
6333   DECLARE_LOCALS;
6334   psdout(10) << "In ReplicaActive, about to call activate" << dendl;
6335   ps->activate(
6336     context< PeeringMachine >().get_cur_transaction(),
6337     actevt.activation_epoch,
6338     context< PeeringMachine >().get_recovery_ctx());
6339   psdout(10) << "Activate Finished" << dendl;
6340   return discard_event();
6341 }
6342
6343 boost::statechart::result PeeringState::ReplicaActive::react(
6344   const ActivateCommitted &evt)
6345 {
6346   DECLARE_LOCALS;
6347   psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
6348
6349   auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6350   auto epoch = ps->get_osdmap_epoch();
6351   pg_info_t i = ps->info;
6352   i.history.last_epoch_started = evt.activation_epoch;
6353   i.history.last_interval_started = i.history.same_interval_since;
6354   rctx.send_info(
6355     ps->get_primary().osd,
6356     spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6357     epoch,
6358     epoch,
6359     i,
6360     {}, /* lease */
6361     ps->get_lease_ack());
6362
6363   if (ps->acting_set_writeable()) {
6364     ps->state_set(PG_STATE_ACTIVE);
6365   } else {
6366     ps->state_set(PG_STATE_PEERED);
6367   }
6368   pl->on_activate_committed();
6369
6370   return discard_event();
6371 }
6372
6373 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6374 {
6375   DECLARE_LOCALS;
6376   spg_t spgid = context< PeeringMachine >().spgid;
6377   epoch_t epoch = pl->get_osdmap_epoch();
6378
6379   ps->proc_lease(l.lease);
6380   pl->send_cluster_message(
6381     ps->get_primary().osd,
6382     TOPNSPC::make_message<MOSDPGLeaseAck>(epoch,
6383                        spg_t(spgid.pgid, ps->get_primary().shard),
6384                        ps->get_lease_ack()),
6385     epoch);
6386   return discard_event();
6387 }
6388
6389 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6390 {
6391   DECLARE_LOCALS;
6392   ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6393                         infoevt.info);
6394   return discard_event();
6395 }
6396
6397 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6398 {
6399   DECLARE_LOCALS;
6400   psdout(10) << "received log from " << logevt.from << dendl;
6401   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6402   ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
6403   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6404   if (logevt.msg->lease) {
6405     ps->proc_lease(*logevt.msg->lease);
6406   }
6407
6408   return discard_event();
6409 }
6410
6411 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6412 {
6413   DECLARE_LOCALS;
6414   // primary is instructing us to trim
6415   ps->pg_log.trim(trim.trim_to, ps->info);
6416   ps->dirty_info = true;
6417   return discard_event();
6418 }
6419
6420 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6421 {
6422   DECLARE_LOCALS;
6423   if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6424     ps->info.history.refresh_prior_readable_until_ub(
6425       pl->get_mnow(), ps->prior_readable_until_ub);
6426     context< PeeringMachine >().send_notify(
6427       ps->get_primary().osd,
6428       pg_notify_t(
6429         ps->get_primary().shard, ps->pg_whoami.shard,
6430         ps->get_osdmap_epoch(),
6431         ps->get_osdmap_epoch(),
6432         ps->info,
6433         ps->past_intervals));
6434   }
6435   return discard_event();
6436 }
6437
6438 boost::statechart::result PeeringState::ReplicaActive::react(
6439   const MQuery& query)
6440 {
6441   DECLARE_LOCALS;
6442   ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6443   return discard_event();
6444 }
6445
6446 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6447 {
6448   q.f->open_object_section("state");
6449   q.f->dump_string("name", state_name);
6450   q.f->dump_stream("enter_time") << enter_time;
6451   q.f->close_section();
6452   return forward_event();
6453 }
6454
6455 boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
6456 {
6457   q.f->dump_string("state", "ReplicaActive");
6458   q.f->dump_bool("available_might_have_unfound", false);
6459   return discard_event();
6460 }
6461
6462 void PeeringState::ReplicaActive::exit()
6463 {
6464   context< PeeringMachine >().log_exit(state_name, enter_time);
6465   DECLARE_LOCALS;
6466   pl->unreserve_recovery_space();
6467
6468   pl->cancel_remote_recovery_reservation();
6469   utime_t dur = ceph_clock_now() - enter_time;
6470   pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6471
6472   ps->min_last_complete_ondisk = eversion_t();
6473 }
6474
6475 /*-------Stray---*/
6476 PeeringState::Stray::Stray(my_context ctx)
6477   : my_base(ctx),
6478     NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6479 {
6480   context< PeeringMachine >().log_enter(state_name);
6481
6482
6483   DECLARE_LOCALS;
6484   ceph_assert(!ps->is_peered());
6485   ceph_assert(!ps->is_peering());
6486   ceph_assert(!ps->is_primary());
6487
6488   if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6489     ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6490     post_event(DeleteStart());
6491   } else {
6492     ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6493   }
6494 }
6495
6496 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6497 {
6498   DECLARE_LOCALS;
6499   MOSDPGLog *msg = logevt.msg.get();
6500   psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6501
6502   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6503   if (msg->info.last_backfill == hobject_t()) {
6504     // restart backfill
6505     ps->info = msg->info;
6506     pl->on_info_history_change();
6507     ps->dirty_info = true;
6508     ps->dirty_big_info = true;  // maybe.
6509
6510     PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6511     ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6512
6513     ps->pg_log.reset_backfill();
6514   } else {
6515     ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
6516   }
6517   if (logevt.msg->lease) {
6518     ps->proc_lease(*logevt.msg->lease);
6519   }
6520
6521   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6522
6523   post_event(Activate(logevt.msg->info.last_epoch_started));
6524   return transit<ReplicaActive>();
6525 }
6526
6527 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6528 {
6529   DECLARE_LOCALS;
6530   psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6531
6532   if (ps->info.last_update > infoevt.info.last_update) {
6533     // rewind divergent log entries
6534     ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6535     ps->rewind_divergent_log(t, infoevt.info.last_update);
6536     ps->info.stats = infoevt.info.stats;
6537     ps->info.hit_set = infoevt.info.hit_set;
6538   }
6539
6540   if (infoevt.lease) {
6541     ps->proc_lease(*infoevt.lease);
6542   }
6543
6544   ceph_assert(infoevt.info.last_update == ps->info.last_update);
6545   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6546
6547   post_event(Activate(infoevt.info.last_epoch_started));
6548   return transit<ReplicaActive>();
6549 }
6550
6551 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6552 {
6553   DECLARE_LOCALS;
6554   ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6555   return discard_event();
6556 }
6557
6558 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6559 {
6560   DECLARE_LOCALS;
6561   if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6562     ps->info.history.refresh_prior_readable_until_ub(
6563       pl->get_mnow(), ps->prior_readable_until_ub);
6564     context< PeeringMachine >().send_notify(
6565       ps->get_primary().osd,
6566       pg_notify_t(
6567         ps->get_primary().shard, ps->pg_whoami.shard,
6568         ps->get_osdmap_epoch(),
6569         ps->get_osdmap_epoch(),
6570         ps->info,
6571         ps->past_intervals));
6572   }
6573   return discard_event();
6574 }
6575
6576 void PeeringState::Stray::exit()
6577 {
6578   context< PeeringMachine >().log_exit(state_name, enter_time);
6579   DECLARE_LOCALS;
6580   utime_t dur = ceph_clock_now() - enter_time;
6581   pl->get_peering_perf().tinc(rs_stray_latency, dur);
6582 }
6583
6584
6585 /*--------ToDelete----------*/
6586 PeeringState::ToDelete::ToDelete(my_context ctx)
6587   : my_base(ctx),
6588     NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6589 {
6590   context< PeeringMachine >().log_enter(state_name);
6591   DECLARE_LOCALS;
6592   pl->get_perf_logger().inc(l_osd_pg_removing);
6593 }
6594
6595 void PeeringState::ToDelete::exit()
6596 {
6597   context< PeeringMachine >().log_exit(state_name, enter_time);
6598   DECLARE_LOCALS;
6599   // note: on a successful removal, this path doesn't execute. see
6600   // do_delete_work().
6601   pl->get_perf_logger().dec(l_osd_pg_removing);
6602
6603   pl->cancel_local_background_io_reservation();
6604 }
6605
6606 /*----WaitDeleteReserved----*/
6607 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6608   : my_base(ctx),
6609     NamedState(context< PeeringMachine >().state_history,
6610                "Started/ToDelete/WaitDeleteReseved")
6611 {
6612   context< PeeringMachine >().log_enter(state_name);
6613   DECLARE_LOCALS;
6614   context< ToDelete >().priority = ps->get_delete_priority();
6615
6616   pl->cancel_local_background_io_reservation();
6617   pl->request_local_background_io_reservation(
6618     context<ToDelete>().priority,
6619     std::make_unique<PGPeeringEvent>(
6620       ps->get_osdmap_epoch(),
6621       ps->get_osdmap_epoch(),
6622       DeleteReserved()),
6623     std::make_unique<PGPeeringEvent>(
6624       ps->get_osdmap_epoch(),
6625       ps->get_osdmap_epoch(),
6626       DeleteInterrupted()));
6627 }
6628
6629 boost::statechart::result PeeringState::ToDelete::react(
6630   const ActMap& evt)
6631 {
6632   DECLARE_LOCALS;
6633   if (ps->get_delete_priority() != priority) {
6634     psdout(10) << __func__ << " delete priority changed, resetting"
6635                    << dendl;
6636     return transit<ToDelete>();
6637   }
6638   return discard_event();
6639 }
6640
6641 void PeeringState::WaitDeleteReserved::exit()
6642 {
6643   context< PeeringMachine >().log_exit(state_name, enter_time);
6644 }
6645
6646 /*----Deleting-----*/
6647 PeeringState::Deleting::Deleting(my_context ctx)
6648   : my_base(ctx),
6649     NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6650 {
6651   context< PeeringMachine >().log_enter(state_name);
6652
6653   DECLARE_LOCALS;
6654   ps->deleting = true;
6655   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6656
6657   // clear log
6658   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6659   ps->pg_log.roll_forward(rollbacker.get());
6660
6661   // adjust info to backfill
6662   ps->info.set_last_backfill(hobject_t());
6663   ps->pg_log.reset_backfill();
6664   ps->dirty_info = true;
6665
6666   pl->on_removal(t);
6667 }
6668
6669 boost::statechart::result PeeringState::Deleting::react(
6670   const DeleteSome& evt)
6671 {
6672   DECLARE_LOCALS;
6673   std::pair<ghobject_t, bool> p;
6674   p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6675     next);
6676   next = p.first;
6677   return p.second ? discard_event() : terminate();
6678 }
6679
6680 void PeeringState::Deleting::exit()
6681 {
6682   context< PeeringMachine >().log_exit(state_name, enter_time);
6683   DECLARE_LOCALS;
6684   ps->deleting = false;
6685   pl->cancel_local_background_io_reservation();
6686 }
6687
6688 /*--------GetInfo---------*/
6689 PeeringState::GetInfo::GetInfo(my_context ctx)
6690   : my_base(ctx),
6691     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6692 {
6693   context< PeeringMachine >().log_enter(state_name);
6694
6695
6696   DECLARE_LOCALS;
6697   ps->check_past_interval_bounds();
6698   ps->log_weirdness();
6699   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6700
6701   ceph_assert(ps->blocked_by.empty());
6702
6703   prior_set = ps->build_prior();
6704   ps->prior_readable_down_osds = prior_set.down;
6705
6706   if (ps->prior_readable_down_osds.empty()) {
6707     psdout(10) << " no prior_set down osds, will clear prior_readable_until_ub before activating"
6708                << dendl;
6709   }
6710
6711   ps->reset_min_peer_features();
6712   get_infos();
6713   if (prior_set.pg_down) {
6714     post_event(IsDown());
6715   } else if (peer_info_requested.empty()) {
6716     post_event(GotInfo());
6717   }
6718 }
6719
6720 void PeeringState::GetInfo::get_infos()
6721 {
6722   DECLARE_LOCALS;
6723   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6724
6725   ps->blocked_by.clear();
6726   for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
6727     pg_shard_t peer = *it;
6728     if (peer == ps->pg_whoami) {
6729       continue;
6730     }
6731     if (ps->peer_info.count(peer)) {
6732       psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6733       continue;
6734     }
6735     if (peer_info_requested.count(peer)) {
6736       psdout(10) << " already requested info from osd." << peer << dendl;
6737       ps->blocked_by.insert(peer.osd);
6738     } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6739       psdout(10) << " not querying info from down osd." << peer << dendl;
6740     } else {
6741       psdout(10) << " querying info from osd." << peer << dendl;
6742       context< PeeringMachine >().send_query(
6743         peer.osd,
6744         pg_query_t(pg_query_t::INFO,
6745                    it->shard, ps->pg_whoami.shard,
6746                    ps->info.history,
6747                    ps->get_osdmap_epoch()));
6748       peer_info_requested.insert(peer);
6749       ps->blocked_by.insert(peer.osd);
6750     }
6751   }
6752
6753   ps->check_prior_readable_down_osds(ps->get_osdmap());
6754
6755   pl->publish_stats_to_osd();
6756 }
6757
6758 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6759 {
6760
6761   DECLARE_LOCALS;
6762
6763   auto p = peer_info_requested.find(infoevt.from);
6764   if (p != peer_info_requested.end()) {
6765     peer_info_requested.erase(p);
6766     ps->blocked_by.erase(infoevt.from.osd);
6767   }
6768
6769   epoch_t old_start = ps->info.history.last_epoch_started;
6770   if (ps->proc_replica_info(
6771         infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6772     // we got something new ...
6773     PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6774     if (old_start < ps->info.history.last_epoch_started) {
6775       psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6776       prior_set = ps->build_prior();
6777       ps->prior_readable_down_osds = prior_set.down;
6778
6779       // filter out any osds that got dropped from the probe set from
6780       // peer_info_requested.  this is less expensive than restarting
6781       // peering (which would re-probe everyone).
6782       auto p = peer_info_requested.begin();
6783       while (p != peer_info_requested.end()) {
6784         if (prior_set.probe.count(*p) == 0) {
6785           psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6786           peer_info_requested.erase(p++);
6787         } else {
6788           ++p;
6789         }
6790       }
6791       get_infos();
6792     }
6793     psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6794                        << hex << infoevt.features << dec << dendl;
6795     ps->apply_peer_features(infoevt.features);
6796
6797     // are we done getting everything?
6798     if (peer_info_requested.empty() && !prior_set.pg_down) {
6799       psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6800       psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6801       psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6802       post_event(GotInfo());
6803     }
6804   }
6805   return discard_event();
6806 }
6807
6808 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6809 {
6810   DECLARE_LOCALS;
6811   q.f->open_object_section("state");
6812   q.f->dump_string("name", state_name);
6813   q.f->dump_stream("enter_time") << enter_time;
6814
6815   q.f->open_array_section("requested_info_from");
6816   for (auto p = peer_info_requested.begin();
6817        p != peer_info_requested.end();
6818        ++p) {
6819     q.f->open_object_section("osd");
6820     q.f->dump_stream("osd") << *p;
6821     if (ps->peer_info.count(*p)) {
6822       q.f->open_object_section("got_info");
6823       ps->peer_info[*p].dump(q.f);
6824       q.f->close_section();
6825     }
6826     q.f->close_section();
6827   }
6828   q.f->close_section();
6829
6830   q.f->close_section();
6831   return forward_event();
6832 }
6833
6834 boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
6835 {
6836   q.f->dump_string("state", "GetInfo");
6837   q.f->dump_bool("available_might_have_unfound", false);
6838   return discard_event();
6839 }
6840
6841 void PeeringState::GetInfo::exit()
6842 {
6843   context< PeeringMachine >().log_exit(state_name, enter_time);
6844
6845   DECLARE_LOCALS;
6846   utime_t dur = ceph_clock_now() - enter_time;
6847   pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6848   ps->blocked_by.clear();
6849 }
6850
6851 /*------GetLog------------*/
6852 PeeringState::GetLog::GetLog(my_context ctx)
6853   : my_base(ctx),
6854     NamedState(
6855       context< PeeringMachine >().state_history,
6856       "Started/Primary/Peering/GetLog"),
6857     msg(0)
6858 {
6859   context< PeeringMachine >().log_enter(state_name);
6860
6861   DECLARE_LOCALS;
6862
6863   ps->log_weirdness();
6864
6865   // adjust acting?
6866   if (!ps->choose_acting(auth_log_shard, false,
6867                          &context< Peering >().history_les_bound)) {
6868     if (!ps->want_acting.empty()) {
6869       post_event(NeedActingChange());
6870     } else {
6871       post_event(IsIncomplete());
6872     }
6873     return;
6874   }
6875
6876   // am i the best?
6877   if (auth_log_shard == ps->pg_whoami) {
6878     post_event(GotLog());
6879     return;
6880   }
6881
6882   const pg_info_t& best = ps->peer_info[auth_log_shard];
6883
6884   // am i broken?
6885   if (ps->info.last_update < best.log_tail) {
6886     psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6887     post_event(IsIncomplete());
6888     return;
6889   }
6890
6891   // how much log to request?
6892   eversion_t request_log_from = ps->info.last_update;
6893   ceph_assert(!ps->acting_recovery_backfill.empty());
6894   for (auto p = ps->acting_recovery_backfill.begin();
6895        p != ps->acting_recovery_backfill.end();
6896        ++p) {
6897     if (*p == ps->pg_whoami) continue;
6898     pg_info_t& ri = ps->peer_info[*p];
6899     if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6900         ri.last_update < request_log_from)
6901       request_log_from = ri.last_update;
6902   }
6903
6904   // how much?
6905   psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6906   context<PeeringMachine>().send_query(
6907     auth_log_shard.osd,
6908     pg_query_t(
6909       pg_query_t::LOG,
6910       auth_log_shard.shard, ps->pg_whoami.shard,
6911       request_log_from, ps->info.history,
6912       ps->get_osdmap_epoch()));
6913
6914   ceph_assert(ps->blocked_by.empty());
6915   ps->blocked_by.insert(auth_log_shard.osd);
6916   pl->publish_stats_to_osd();
6917 }
6918
6919 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6920 {
6921   // make sure our log source didn't go down.  we need to check
6922   // explicitly because it may not be part of the prior set, which
6923   // means the Peering state check won't catch it going down.
6924   if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6925     psdout(10) << "GetLog: auth_log_shard osd."
6926                        << auth_log_shard.osd << " went down" << dendl;
6927     post_event(advmap);
6928     return transit< Reset >();
6929   }
6930
6931   // let the Peering state do its checks.
6932   return forward_event();
6933 }
6934
6935 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6936 {
6937   ceph_assert(!msg);
6938   if (logevt.from != auth_log_shard) {
6939     psdout(10) << "GetLog: discarding log from "
6940                        << "non-auth_log_shard osd." << logevt.from << dendl;
6941     return discard_event();
6942   }
6943   psdout(10) << "GetLog: received master log from osd."
6944                      << logevt.from << dendl;
6945   msg = logevt.msg;
6946   post_event(GotLog());
6947   return discard_event();
6948 }
6949
6950 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6951 {
6952
6953   DECLARE_LOCALS;
6954   psdout(10) << "leaving GetLog" << dendl;
6955   if (msg) {
6956     psdout(10) << "processing master log" << dendl;
6957     ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6958                         msg->info, std::move(msg->log), std::move(msg->missing),
6959                         auth_log_shard);
6960   }
6961   ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6962   return transit< GetMissing >();
6963 }
6964
6965 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6966 {
6967   q.f->open_object_section("state");
6968   q.f->dump_string("name", state_name);
6969   q.f->dump_stream("enter_time") << enter_time;
6970   q.f->dump_stream("auth_log_shard") << auth_log_shard;
6971   q.f->close_section();
6972   return forward_event();
6973 }
6974
6975 boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
6976 {
6977   q.f->dump_string("state", "GetLog");
6978   q.f->dump_bool("available_might_have_unfound", false);
6979   return discard_event();
6980 }
6981
6982 void PeeringState::GetLog::exit()
6983 {
6984   context< PeeringMachine >().log_exit(state_name, enter_time);
6985
6986   DECLARE_LOCALS;
6987   utime_t dur = ceph_clock_now() - enter_time;
6988   pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6989   ps->blocked_by.clear();
6990 }
6991
6992 /*------WaitActingChange--------*/
6993 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6994   : my_base(ctx),
6995     NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6996 {
6997   context< PeeringMachine >().log_enter(state_name);
6998 }
6999
7000 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
7001 {
7002   DECLARE_LOCALS;
7003   OSDMapRef osdmap = advmap.osdmap;
7004
7005   psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
7006   for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
7007     if (!osdmap->is_up(*p)) {
7008       psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7009       post_event(advmap);
7010       return transit< Reset >();
7011     }
7012   }
7013   return forward_event();
7014 }
7015
7016 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
7017 {
7018   psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
7019   return discard_event();
7020 }
7021
7022 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
7023 {
7024   psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7025   return discard_event();
7026 }
7027
7028 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
7029 {
7030   psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7031   return discard_event();
7032 }
7033
7034 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
7035 {
7036   q.f->open_object_section("state");
7037   q.f->dump_string("name", state_name);
7038   q.f->dump_stream("enter_time") << enter_time;
7039   q.f->dump_string("comment", "waiting for pg acting set to change");
7040   q.f->close_section();
7041   return forward_event();
7042 }
7043
7044 boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
7045 {
7046   q.f->dump_string("state", "WaitActingChange");
7047   q.f->dump_bool("available_might_have_unfound", false);
7048   return discard_event();
7049 }
7050
7051 void PeeringState::WaitActingChange::exit()
7052 {
7053   context< PeeringMachine >().log_exit(state_name, enter_time);
7054   DECLARE_LOCALS;
7055   utime_t dur = ceph_clock_now() - enter_time;
7056   pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
7057 }
7058
7059 /*------Down--------*/
7060 PeeringState::Down::Down(my_context ctx)
7061   : my_base(ctx),
7062     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
7063 {
7064   context< PeeringMachine >().log_enter(state_name);
7065   DECLARE_LOCALS;
7066
7067   ps->state_clear(PG_STATE_PEERING);
7068   ps->state_set(PG_STATE_DOWN);
7069
7070   auto &prior_set = context< Peering >().prior_set;
7071   ceph_assert(ps->blocked_by.empty());
7072   ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7073   pl->publish_stats_to_osd();
7074 }
7075
7076 void PeeringState::Down::exit()
7077 {
7078   context< PeeringMachine >().log_exit(state_name, enter_time);
7079
7080   DECLARE_LOCALS;
7081
7082   ps->state_clear(PG_STATE_DOWN);
7083   utime_t dur = ceph_clock_now() - enter_time;
7084   pl->get_peering_perf().tinc(rs_down_latency, dur);
7085
7086   ps->blocked_by.clear();
7087 }
7088
7089 boost::statechart::result PeeringState::Down::react(const QueryState& q)
7090 {
7091   q.f->open_object_section("state");
7092   q.f->dump_string("name", state_name);
7093   q.f->dump_stream("enter_time") << enter_time;
7094   q.f->dump_string("comment",
7095                    "not enough up instances of this PG to go active");
7096   q.f->close_section();
7097   return forward_event();
7098 }
7099
7100 boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
7101 {
7102   q.f->dump_string("state", "Down");
7103   q.f->dump_bool("available_might_have_unfound", false);
7104   return discard_event();
7105 }
7106
7107 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
7108 {
7109   DECLARE_LOCALS;
7110
7111   ceph_assert(ps->is_primary());
7112   epoch_t old_start = ps->info.history.last_epoch_started;
7113   if (!ps->peer_info.count(infoevt.from) &&
7114       ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
7115     ps->update_history(infoevt.notify.info.history);
7116   }
7117   // if we got something new to make pg escape down state
7118   if (ps->info.history.last_epoch_started > old_start) {
7119       psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
7120     ps->state_clear(PG_STATE_DOWN);
7121     ps->state_set(PG_STATE_PEERING);
7122     return transit< GetInfo >();
7123   }
7124
7125   return discard_event();
7126 }
7127
7128
7129 /*------Incomplete--------*/
7130 PeeringState::Incomplete::Incomplete(my_context ctx)
7131   : my_base(ctx),
7132     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
7133 {
7134   context< PeeringMachine >().log_enter(state_name);
7135   DECLARE_LOCALS;
7136
7137   ps->state_clear(PG_STATE_PEERING);
7138   ps->state_set(PG_STATE_INCOMPLETE);
7139
7140   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7141   ceph_assert(ps->blocked_by.empty());
7142   ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7143   pl->publish_stats_to_osd();
7144 }
7145
7146 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
7147   DECLARE_LOCALS;
7148   int64_t poolnum = ps->info.pgid.pool();
7149
7150   // Reset if min_size turn smaller than previous value, pg might now be able to go active
7151   if (!advmap.osdmap->have_pg_pool(poolnum) ||
7152       advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7153       advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7154     post_event(advmap);
7155     return transit< Reset >();
7156   }
7157
7158   return forward_event();
7159 }
7160
7161 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
7162   DECLARE_LOCALS;
7163   psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
7164   if (ps->proc_replica_info(
7165     notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7166     // We got something new, try again!
7167     return transit< GetLog >();
7168   } else {
7169     return discard_event();
7170   }
7171 }
7172
7173 boost::statechart::result PeeringState::Incomplete::react(
7174   const QueryState& q)
7175 {
7176   q.f->open_object_section("state");
7177   q.f->dump_string("name", state_name);
7178   q.f->dump_stream("enter_time") << enter_time;
7179   q.f->dump_string("comment", "not enough complete instances of this PG");
7180   q.f->close_section();
7181   return forward_event();
7182 }
7183
7184 boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
7185 {
7186   q.f->dump_string("state", "Incomplete");
7187   q.f->dump_bool("available_might_have_unfound", false);
7188   return discard_event();
7189 }
7190
7191 void PeeringState::Incomplete::exit()
7192 {
7193   context< PeeringMachine >().log_exit(state_name, enter_time);
7194
7195   DECLARE_LOCALS;
7196
7197   ps->state_clear(PG_STATE_INCOMPLETE);
7198   utime_t dur = ceph_clock_now() - enter_time;
7199   pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
7200
7201   ps->blocked_by.clear();
7202 }
7203
7204 /*------GetMissing--------*/
7205 PeeringState::GetMissing::GetMissing(my_context ctx)
7206   : my_base(ctx),
7207     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
7208 {
7209   context< PeeringMachine >().log_enter(state_name);
7210
7211   DECLARE_LOCALS;
7212   ps->log_weirdness();
7213   ceph_assert(!ps->acting_recovery_backfill.empty());
7214   eversion_t since;
7215   for (auto i = ps->acting_recovery_backfill.begin();
7216        i != ps->acting_recovery_backfill.end();
7217        ++i) {
7218     if (*i == ps->get_primary()) continue;
7219     const pg_info_t& pi = ps->peer_info[*i];
7220     // reset this so to make sure the pg_missing_t is initialized and
7221     // has the correct semantics even if we don't need to get a
7222     // missing set from a shard. This way later additions due to
7223     // lost+unfound delete work properly.
7224     ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
7225
7226     if (pi.is_empty())
7227       continue;                                // no pg data, nothing divergent
7228
7229     if (pi.last_update < ps->pg_log.get_tail()) {
7230       psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
7231       ps->peer_missing[*i].clear();
7232       continue;
7233     }
7234     if (pi.last_backfill == hobject_t()) {
7235       psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
7236       ps->peer_missing[*i].clear();
7237       continue;
7238     }
7239
7240     if (pi.last_update == pi.last_complete &&  // peer has no missing
7241         pi.last_update == ps->info.last_update) {  // peer is up to date
7242       // replica has no missing and identical log as us.  no need to
7243       // pull anything.
7244       // FIXME: we can do better here.  if last_update==last_complete we
7245       //        can infer the rest!
7246       psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
7247       ps->peer_missing[*i].clear();
7248       continue;
7249     }
7250
7251     // We pull the log from the peer's last_epoch_started to ensure we
7252     // get enough log to detect divergent updates.
7253     since.epoch = pi.last_epoch_started;
7254     ceph_assert(pi.last_update >= ps->info.log_tail);  // or else choose_acting() did a bad thing
7255     if (pi.log_tail <= since) {
7256       psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
7257       context< PeeringMachine >().send_query(
7258         i->osd,
7259         pg_query_t(
7260           pg_query_t::LOG,
7261           i->shard, ps->pg_whoami.shard,
7262           since, ps->info.history,
7263           ps->get_osdmap_epoch()));
7264     } else {
7265       psdout(10) << " requesting fulllog+missing from osd." << *i
7266                          << " (want since " << since << " < log.tail "
7267                          << pi.log_tail << ")" << dendl;
7268       context< PeeringMachine >().send_query(
7269         i->osd, pg_query_t(
7270           pg_query_t::FULLLOG,
7271           i->shard, ps->pg_whoami.shard,
7272           ps->info.history, ps->get_osdmap_epoch()));
7273     }
7274     peer_missing_requested.insert(*i);
7275     ps->blocked_by.insert(i->osd);
7276   }
7277
7278   if (peer_missing_requested.empty()) {
7279     if (ps->need_up_thru) {
7280       psdout(10) << " still need up_thru update before going active"
7281                          << dendl;
7282       post_event(NeedUpThru());
7283       return;
7284     }
7285
7286     // all good!
7287     post_event(Activate(ps->get_osdmap_epoch()));
7288   } else {
7289     pl->publish_stats_to_osd();
7290   }
7291 }
7292
7293 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
7294 {
7295   DECLARE_LOCALS;
7296
7297   peer_missing_requested.erase(logevt.from);
7298   ps->proc_replica_log(logevt.msg->info,
7299                        logevt.msg->log,
7300                        std::move(logevt.msg->missing),
7301                        logevt.from);
7302
7303   if (peer_missing_requested.empty()) {
7304     if (ps->need_up_thru) {
7305       psdout(10) << " still need up_thru update before going active"
7306                          << dendl;
7307       post_event(NeedUpThru());
7308     } else {
7309       psdout(10) << "Got last missing, don't need missing "
7310                          << "posting Activate" << dendl;
7311       post_event(Activate(ps->get_osdmap_epoch()));
7312     }
7313   }
7314   return discard_event();
7315 }
7316
7317 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
7318 {
7319   DECLARE_LOCALS;
7320   q.f->open_object_section("state");
7321   q.f->dump_string("name", state_name);
7322   q.f->dump_stream("enter_time") << enter_time;
7323
7324   q.f->open_array_section("peer_missing_requested");
7325   for (auto p = peer_missing_requested.begin();
7326        p != peer_missing_requested.end();
7327        ++p) {
7328     q.f->open_object_section("osd");
7329     q.f->dump_stream("osd") << *p;
7330     if (ps->peer_missing.count(*p)) {
7331       q.f->open_object_section("got_missing");
7332       ps->peer_missing[*p].dump(q.f);
7333       q.f->close_section();
7334     }
7335     q.f->close_section();
7336   }
7337   q.f->close_section();
7338
7339   q.f->close_section();
7340   return forward_event();
7341 }
7342
7343 boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
7344 {
7345   q.f->dump_string("state", "GetMising");
7346   q.f->dump_bool("available_might_have_unfound", false);
7347   return discard_event();
7348 }
7349
7350 void PeeringState::GetMissing::exit()
7351 {
7352   context< PeeringMachine >().log_exit(state_name, enter_time);
7353
7354   DECLARE_LOCALS;
7355   utime_t dur = ceph_clock_now() - enter_time;
7356   pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
7357   ps->blocked_by.clear();
7358 }
7359
7360 /*------WaitUpThru--------*/
7361 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
7362   : my_base(ctx),
7363     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
7364 {
7365   context< PeeringMachine >().log_enter(state_name);
7366 }
7367
7368 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
7369 {
7370   DECLARE_LOCALS;
7371   if (!ps->need_up_thru) {
7372     post_event(Activate(ps->get_osdmap_epoch()));
7373   }
7374   return forward_event();
7375 }
7376
7377 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
7378 {
7379   DECLARE_LOCALS;
7380   psdout(10) << "Noting missing from osd." << logevt.from << dendl;
7381   ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
7382   ps->peer_info[logevt.from] = logevt.msg->info;
7383   return discard_event();
7384 }
7385
7386 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7387 {
7388   q.f->open_object_section("state");
7389   q.f->dump_string("name", state_name);
7390   q.f->dump_stream("enter_time") << enter_time;
7391   q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7392   q.f->close_section();
7393   return forward_event();
7394 }
7395
7396 boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
7397 {
7398   q.f->dump_string("state", "WaitUpThru");
7399   q.f->dump_bool("available_might_have_unfound", false);
7400   return discard_event();
7401 }
7402
7403 void PeeringState::WaitUpThru::exit()
7404 {
7405   context< PeeringMachine >().log_exit(state_name, enter_time);
7406   DECLARE_LOCALS;
7407   utime_t dur = ceph_clock_now() - enter_time;
7408   pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7409 }
7410
7411 /*----PeeringState::PeeringMachine Methods-----*/
7412 #undef dout_prefix
7413 #define dout_prefix dpp->gen_prefix(*_dout)
7414
7415 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7416 {
7417   DECLARE_LOCALS;
7418   psdout(5) << "enter " << state_name << dendl;
7419   pl->log_state_enter(state_name);
7420 }
7421
7422 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7423 {
7424   DECLARE_LOCALS;
7425   utime_t dur = ceph_clock_now() - enter_time;
7426   psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7427   pl->log_state_exit(state_name, enter_time, event_count, event_time);
7428   event_count = 0;
7429   event_time = utime_t();
7430 }
7431
7432 ostream &operator<<(ostream &out, const PeeringState &ps) {
7433   out << "pg[" << ps.info
7434       << " " << pg_vector_string(ps.up);
7435   if (ps.acting != ps.up)
7436     out << "/" << pg_vector_string(ps.acting);
7437   if (ps.is_ec_pg())
7438     out << "p" << ps.get_primary();
7439   if (!ps.async_recovery_targets.empty())
7440     out << " async=[" << ps.async_recovery_targets << "]";
7441   if (!ps.backfill_targets.empty())
7442     out << " backfill=[" << ps.backfill_targets << "]";
7443   out << " r=" << ps.get_role();
7444   out << " lpr=" << ps.get_last_peering_reset();
7445
7446   if (ps.deleting)
7447     out << " DELETING";
7448
7449   if (!ps.past_intervals.empty()) {
7450     out << " pi=[" << ps.past_intervals.get_bounds()
7451         << ")/" << ps.past_intervals.size();
7452   }
7453
7454   if (ps.is_peered()) {
7455     if (ps.last_update_ondisk != ps.info.last_update)
7456       out << " luod=" << ps.last_update_ondisk;
7457     if (ps.last_update_applied != ps.info.last_update)
7458       out << " lua=" << ps.last_update_applied;
7459   }
7460
7461   if (ps.pg_log.get_tail() != ps.info.log_tail ||
7462       ps.pg_log.get_head() != ps.info.last_update)
7463     out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7464
7465   if (!ps.pg_log.get_log().empty()) {
7466     if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7467       out << " (log bound mismatch, actual=["
7468           << ps.pg_log.get_log().log.begin()->version << ","
7469           << ps.pg_log.get_log().log.rbegin()->version << "]";
7470       out << ")";
7471     }
7472   }
7473
7474   out << " crt=" << ps.pg_log.get_can_rollback_to();
7475
7476   if (ps.last_complete_ondisk != ps.info.last_complete)
7477     out << " lcod " << ps.last_complete_ondisk;
7478
7479   out << " mlcod " << ps.min_last_complete_ondisk;
7480
7481   out << " " << pg_state_string(ps.get_state());
7482   if (ps.should_send_notify())
7483     out << " NOTIFY";
7484
7485   if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7486     out << " pruub " << ps.prior_readable_until_ub
7487         << "@" << ps.get_prior_readable_down_osds();
7488   }
7489   return out;
7490 }
7491
7492 std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
7493 {
7494   std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
7495     async_by_num_missing;
7496   replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
7497   for (auto &p : get_acting_recovery_backfill()) {
7498     if (p == get_primary()) {
7499       continue;
7500     }
7501     auto pm = get_peer_missing().find(p);
7502     assert(pm != get_peer_missing().end());
7503     auto nm = pm->second.num_missing();
7504     if (nm != 0) {
7505       if (is_async_recovery_target(p)) {
7506         async_by_num_missing.push_back(make_pair(nm, p));
7507       } else {
7508         replicas_by_num_missing.push_back(make_pair(nm, p));
7509       }
7510     }
7511   }
7512   // sort by number of missing objects, in ascending order.
7513   auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
7514                  const std::pair<unsigned int, pg_shard_t> &rhs) {
7515     return lhs.first < rhs.first;
7516   };
7517   // acting goes first
7518   std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
7519   // then async_recovery_targets
7520   std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
7521   replicas_by_num_missing.insert(replicas_by_num_missing.end(),
7522     async_by_num_missing.begin(), async_by_num_missing.end());
7523
7524   std::vector<pg_shard_t> ret;
7525   ret.reserve(replicas_by_num_missing.size());
7526   for (auto p : replicas_by_num_missing) {
7527     ret.push_back(p.second);
7528   }
7529   return ret;
7530 }
7531
7532