1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "PGPeeringEvent.h"
5 #include "common/ceph_releases.h"
6 #include "common/dout.h"
7 #include "PeeringState.h"
8
9 #include "messages/MOSDPGRemove.h"
10 #include "messages/MBackfillReserve.h"
11 #include "messages/MRecoveryReserve.h"
12 #include "messages/MOSDScrubReserve.h"
13 #include "messages/MOSDPGInfo.h"
14 #include "messages/MOSDPGInfo2.h"
15 #include "messages/MOSDPGTrim.h"
16 #include "messages/MOSDPGLog.h"
17 #include "messages/MOSDPGNotify.h"
18 #include "messages/MOSDPGNotify2.h"
19 #include "messages/MOSDPGQuery.h"
20 #include "messages/MOSDPGQuery2.h"
21 #include "messages/MOSDPGLease.h"
22 #include "messages/MOSDPGLeaseAck.h"
23
24 #define dout_context cct
25 #define dout_subsys ceph_subsys_osd
26
27 BufferedRecoveryMessages::BufferedRecoveryMessages(
28 ceph_release_t r,
29 PeeringCtx &ctx)
30 : require_osd_release(r) {
31 // steal messages from ctx
32 message_map.swap(ctx.message_map);
33 }
34
35 void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
36 {
37 if (require_osd_release >= ceph_release_t::octopus) {
38 spg_t pgid(n.info.pgid.pgid, n.to);
39 send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
40 } else {
41 send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
42 }
43 }
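// A minimal sketch of the release gating above (the PeeringCtx `ctx` and
// notify `n` below are illustrative, not taken from real call sites):
//
//   BufferedRecoveryMessages buf(ceph_release_t::octopus, ctx);
//   buf.send_notify(3 /* osd.3 */, n);  // queues an MOSDPGNotify2
//
// With require_osd_release older than octopus, the same call would instead
// queue a legacy MOSDPGNotify carrying a one-element vector of notifies.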
44
45 void BufferedRecoveryMessages::send_query(
46 int to,
47 spg_t to_spgid,
48 const pg_query_t &q)
49 {
50 if (require_osd_release >= ceph_release_t::octopus) {
51 send_osd_message(to,
52 make_message<MOSDPGQuery2>(to_spgid, q));
53 } else {
54 auto m = make_message<MOSDPGQuery>(
55 q.epoch_sent,
56 MOSDPGQuery::pg_list_t{{to_spgid, q}});
57 send_osd_message(to, m);
58 }
59 }
60
61 void BufferedRecoveryMessages::send_info(
62 int to,
63 spg_t to_spgid,
64 epoch_t min_epoch,
65 epoch_t cur_epoch,
66 const pg_info_t &info,
67 std::optional<pg_lease_t> lease,
68 std::optional<pg_lease_ack_t> lease_ack)
69 {
70 if (require_osd_release >= ceph_release_t::octopus) {
71 send_osd_message(
72 to,
73 make_message<MOSDPGInfo2>(
74 to_spgid,
75 info,
76 cur_epoch,
77 min_epoch,
78 lease,
79 lease_ack)
80 );
81 } else {
82 send_osd_message(
83 to,
84 make_message<MOSDPGInfo>(
85 cur_epoch,
86 vector{pg_notify_t{to_spgid.shard,
87 info.pgid.shard,
88 min_epoch, cur_epoch,
89 info, PastIntervals{}}})
90 );
91 }
92 }
93
94 void PGPool::update(CephContext *cct, OSDMapRef map)
95 {
96 const pg_pool_t *pi = map->get_pg_pool(id);
97 if (!pi) {
98 return; // pool has been deleted
99 }
100 info = *pi;
101 name = map->get_pool_name(id);
102
103 bool updated = false;
104 if ((map->get_epoch() != cached_epoch + 1) ||
105 (pi->get_snap_epoch() == map->get_epoch())) {
106 updated = true;
107 }
108
109 if (info.is_pool_snaps_mode() && updated) {
110 snapc = pi->get_snap_context();
111 }
112 cached_epoch = map->get_epoch();
113 }
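// Worked example of the refresh condition above (map_at_101/map_at_105 are
// illustrative OSDMapRefs): with cached_epoch == 100,
//
//   pool.update(cct, map_at_101);  // refresh snapc only if snap_epoch == 101
//   pool.update(cct, map_at_105);  // epoch gap (102..104 skipped): always refresh
//
// The unconditional refresh on a gap ensures snap changes made in the
// skipped epochs are not missed.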
114
115 /*-------------Peering State Helpers----------------*/
116 #undef dout_prefix
117 #define dout_prefix (dpp->gen_prefix(*_dout))
118 #undef psdout
119 #define psdout(x) ldout(cct, x)
120
121 PeeringState::PeeringState(
122 CephContext *cct,
123 pg_shard_t pg_whoami,
124 spg_t spgid,
125 const PGPool &_pool,
126 OSDMapRef curmap,
127 DoutPrefixProvider *dpp,
128 PeeringListener *pl)
129 : state_history(*pl),
130 cct(cct),
131 spgid(spgid),
132 dpp(dpp),
133 pl(pl),
134 orig_ctx(0),
135 osdmap_ref(curmap),
136 pool(_pool),
137 pg_whoami(pg_whoami),
138 info(spgid),
139 pg_log(cct),
140 missing_loc(spgid, this, dpp, cct),
141 machine(this, cct, spgid, dpp, pl, &state_history)
142 {
143 machine.initiate();
144 }
145
146 void PeeringState::start_handle(PeeringCtx *new_ctx) {
147 ceph_assert(!rctx);
148 ceph_assert(!orig_ctx);
149 orig_ctx = new_ctx;
150 if (new_ctx) {
151 if (messages_pending_flush) {
152 rctx.emplace(*messages_pending_flush, *new_ctx);
153 } else {
154 rctx.emplace(*new_ctx);
155 }
156 rctx->start_time = ceph_clock_now();
157 }
158 }
159
160 void PeeringState::begin_block_outgoing() {
161 ceph_assert(!messages_pending_flush);
162 ceph_assert(orig_ctx);
163 ceph_assert(rctx);
164 messages_pending_flush = BufferedRecoveryMessages(
165 orig_ctx->require_osd_release);
166 rctx.emplace(*messages_pending_flush, *orig_ctx);
167 }
168
169 void PeeringState::clear_blocked_outgoing() {
170 ceph_assert(orig_ctx);
171 ceph_assert(rctx);
172 messages_pending_flush = std::optional<BufferedRecoveryMessages>();
173 }
174
175 void PeeringState::end_block_outgoing() {
176 ceph_assert(messages_pending_flush);
177 ceph_assert(orig_ctx);
178 ceph_assert(rctx);
179
180 orig_ctx->accept_buffered_messages(*messages_pending_flush);
181 rctx.emplace(*orig_ctx);
182 messages_pending_flush = std::optional<BufferedRecoveryMessages>();
183 }
184
185 void PeeringState::end_handle() {
186 if (rctx) {
187 utime_t dur = ceph_clock_now() - rctx->start_time;
188 machine.event_time += dur;
189 }
190
191 machine.event_count++;
192 rctx = std::nullopt;
193 orig_ctx = NULL;
194 }
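// Sketch of how these helpers bracket a peering event (the event-loop shape
// here is illustrative; the real call sites live in the OSD code):
//
//   PeeringCtx ctx(...);
//   ps.start_handle(&ctx);     // wrap ctx, stamp rctx->start_time
//   // ... deliver the event to `machine` ...
//   // during a peering reset, set_last_peering_reset() may call
//   // begin_block_outgoing() to divert messages into
//   // messages_pending_flush; end_block_outgoing() later folds them
//   // back into the original ctx.
//   ps.end_handle();           // accumulate event_time / event_count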
195
196 void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
197 {
198 /*
199  * check that any peers we are planning to pull (or are currently
200  * pulling) objects from are dealt with.
201 */
202 missing_loc.check_recovery_sources(osdmap);
203 pl->check_recovery_sources(osdmap);
204
205 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
206 i != peer_log_requested.end();
207 ) {
208 if (!osdmap->is_up(i->osd)) {
209 psdout(10) << "peer_log_requested removing " << *i << dendl;
210 peer_log_requested.erase(i++);
211 } else {
212 ++i;
213 }
214 }
215
216 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
217 i != peer_missing_requested.end();
218 ) {
219 if (!osdmap->is_up(i->osd)) {
220 psdout(10) << "peer_missing_requested removing " << *i << dendl;
221 peer_missing_requested.erase(i++);
222 } else {
223 ++i;
224 }
225 }
226 }
227
228 void PeeringState::update_history(const pg_history_t& new_history)
229 {
230 auto mnow = pl->get_mnow();
231 info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
232 if (info.history.merge(new_history)) {
233 psdout(20) << __func__ << " advanced history from " << new_history << dendl;
234 dirty_info = true;
235 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
236 psdout(20) << __func__ << " clearing past_intervals" << dendl;
237 past_intervals.clear();
238 dirty_big_info = true;
239 }
240 prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
241 if (prior_readable_until_ub != ceph::signedspan::zero()) {
242 dout(20) << __func__
243 << " prior_readable_until_ub " << prior_readable_until_ub
244 << " (mnow " << mnow << " + "
245 << info.history.prior_readable_until_ub << ")" << dendl;
246 }
247 }
248 pl->on_info_history_change();
249 }
250
251 void PeeringState::purge_strays()
252 {
253 if (is_premerge()) {
254 psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
255 << dendl;
256 return;
257 }
258 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
259 return;
260 }
261 psdout(10) << "purge_strays " << stray_set << dendl;
262
263 bool removed = false;
264 for (set<pg_shard_t>::iterator p = stray_set.begin();
265 p != stray_set.end();
266 ++p) {
267 ceph_assert(!is_acting_recovery_backfill(*p));
268 if (get_osdmap()->is_up(p->osd)) {
269 psdout(10) << "sending PGRemove to osd." << *p << dendl;
270 vector<spg_t> to_remove;
271 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
272 MOSDPGRemove *m = new MOSDPGRemove(
273 get_osdmap_epoch(),
274 to_remove);
275 pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
276 } else {
277 psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
278 }
279 peer_missing.erase(*p);
280 peer_info.erase(*p);
281 missing_loc.remove_stray_recovery_sources(*p);
282 peer_purged.insert(*p);
283 removed = true;
284 }
285
286 // if we removed anyone, update peers (which include peer_info)
287 if (removed)
288 update_heartbeat_peers();
289
290 stray_set.clear();
291
292 // clear _requested maps; we may have to peer() again if we discover
293 // (more) stray content
294 peer_log_requested.clear();
295 peer_missing_requested.clear();
296 }
297
298
299 bool PeeringState::proc_replica_info(
300 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
301 {
302 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
303 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
304 psdout(10) << " got dup osd." << from << " info "
305 << oinfo << ", identical to ours" << dendl;
306 return false;
307 }
308
309 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
310 psdout(10) << " got info " << oinfo << " from down osd." << from
311 << " discarding" << dendl;
312 return false;
313 }
314
315 psdout(10) << " got osd." << from << " " << oinfo << dendl;
316 ceph_assert(is_primary());
317 peer_info[from] = oinfo;
318 might_have_unfound.insert(from);
319
320 update_history(oinfo.history);
321
322 // stray?
323 if (!is_up(from) && !is_acting(from)) {
324 psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
325 stray_set.insert(from);
326 if (is_clean()) {
327 purge_strays();
328 }
329 }
330
331 // was this a new info? if so, update peers!
332 if (p == peer_info.end())
333 update_heartbeat_peers();
334
335 return true;
336 }
337
338
339 void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
340 {
341 // Remove any downed osds from peer_info
342 bool removed = false;
343 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
344 while (p != peer_info.end()) {
345 if (!osdmap->is_up(p->first.osd)) {
346 psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
347 peer_missing.erase(p->first);
348 peer_log_requested.erase(p->first);
349 peer_missing_requested.erase(p->first);
350 peer_purged.erase(p->first);
351 peer_info.erase(p++);
352 removed = true;
353 } else
354 ++p;
355 }
356
357 // if we removed anyone, update peers (which include peer_info)
358 if (removed)
359 update_heartbeat_peers();
360
361 check_recovery_sources(osdmap);
362 }
363
364 void PeeringState::update_heartbeat_peers()
365 {
366 if (!is_primary())
367 return;
368
369 set<int> new_peers;
370 for (unsigned i=0; i<acting.size(); i++) {
371 if (acting[i] != CRUSH_ITEM_NONE)
372 new_peers.insert(acting[i]);
373 }
374 for (unsigned i=0; i<up.size(); i++) {
375 if (up[i] != CRUSH_ITEM_NONE)
376 new_peers.insert(up[i]);
377 }
378 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
379 p != peer_info.end();
380 ++p) {
381 new_peers.insert(p->first.osd);
382 }
383 pl->update_heartbeat_peers(std::move(new_peers));
384 }
385
386 void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
387 {
388 pl->prepare_write(
389 info,
390 last_written_info,
391 past_intervals,
392 pg_log,
393 dirty_info,
394 dirty_big_info,
395 last_persisted_osdmap < get_osdmap_epoch(),
396 t);
397 if (dirty_info || dirty_big_info) {
398 last_persisted_osdmap = get_osdmap_epoch();
399 last_written_info = info;
400 dirty_info = false;
401 dirty_big_info = false;
402 }
403 }
404
405 void PeeringState::advance_map(
406 OSDMapRef osdmap, OSDMapRef lastmap,
407 vector<int>& newup, int up_primary,
408 vector<int>& newacting, int acting_primary,
409 PeeringCtx &rctx)
410 {
411 ceph_assert(lastmap == osdmap_ref);
412 psdout(10) << "handle_advance_map "
413 << newup << "/" << newacting
414 << " -- " << up_primary << "/" << acting_primary
415 << dendl;
416
417 update_osdmap_ref(osdmap);
418 pool.update(cct, osdmap);
419
420 AdvMap evt(
421 osdmap, lastmap, newup, up_primary,
422 newacting, acting_primary);
423 handle_event(evt, &rctx);
424 if (pool.info.last_change == osdmap_ref->get_epoch()) {
425 pl->on_pool_change();
426 }
427 readable_interval = pool.get_readable_interval();
428 last_require_osd_release = osdmap->require_osd_release;
429 }
430
431 void PeeringState::activate_map(PeeringCtx &rctx)
432 {
433 psdout(10) << __func__ << dendl;
434 ActMap evt;
435 handle_event(evt, &rctx);
436 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
437 cct->_conf->osd_pg_epoch_persisted_max_stale) {
438 psdout(20) << __func__ << ": Dirtying info: last_persisted is "
439 << last_persisted_osdmap
440 << " while current is " << osdmap_ref->get_epoch() << dendl;
441 dirty_info = true;
442 } else {
443 psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
444 << last_persisted_osdmap
445 << " while current is " << osdmap_ref->get_epoch() << dendl;
446 }
447 write_if_dirty(rctx.transaction);
448
449 if (get_osdmap()->check_new_blacklist_entries()) {
450 pl->check_blacklisted_watchers();
451 }
452 }
453
454 void PeeringState::set_last_peering_reset()
455 {
456 psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
457 if (last_peering_reset != get_osdmap_epoch()) {
458 last_peering_reset = get_osdmap_epoch();
459 psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
460 clear_blocked_outgoing();
461 if (!pl->try_flush_or_schedule_async()) {
462 psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
463 begin_block_outgoing();
464 } else {
465 psdout(10) << "Not blocking outgoing recovery messages" << dendl;
466 }
467 }
468 }
469
470 void PeeringState::complete_flush()
471 {
472 flushes_in_progress--;
473 if (flushes_in_progress == 0) {
474 pl->on_flushed();
475 }
476 }
477
478 void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
479 {
480 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
481 if (!pi) {
482 return; // pool deleted
483 }
484 bool changed = false;
485 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
486 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
487 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
488 psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
489 changed = true;
490 }
491 }
492 if (changed) {
493 info.history.last_epoch_marked_full = osdmap->get_epoch();
494 dirty_info = true;
495 }
496 }
497
498 bool PeeringState::should_restart_peering(
499 int newupprimary,
500 int newactingprimary,
501 const vector<int>& newup,
502 const vector<int>& newacting,
503 OSDMapRef lastmap,
504 OSDMapRef osdmap)
505 {
506 if (PastIntervals::is_new_interval(
507 primary.osd,
508 newactingprimary,
509 acting,
510 newacting,
511 up_primary.osd,
512 newupprimary,
513 up,
514 newup,
515 osdmap.get(),
516 lastmap.get(),
517 info.pgid.pgid)) {
518 psdout(20) << "new interval newup " << newup
519 << " newacting " << newacting << dendl;
520 return true;
521 }
522 if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
523 psdout(10) << __func__ << " osd transitioned from down -> up"
524 << dendl;
525 return true;
526 }
527 return false;
528 }
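// Example (illustrative values): with acting == [3,1,2] and a new map where
// newacting == [1,3,2], PastIntervals::is_new_interval() sees a different
// acting vector and this returns true, restarting peering. A map change
// that touches only unrelated OSDs returns false here, unless this OSD
// itself just transitioned from down to up.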
529
530 /* Called before initializing peering during advance_map */
531 void PeeringState::start_peering_interval(
532 const OSDMapRef lastmap,
533 const vector<int>& newup, int new_up_primary,
534 const vector<int>& newacting, int new_acting_primary,
535 ObjectStore::Transaction &t)
536 {
537 const OSDMapRef osdmap = get_osdmap();
538
539 set_last_peering_reset();
540
541 vector<int> oldacting, oldup;
542 int oldrole = get_role();
543
544 if (is_primary()) {
545 pl->clear_ready_to_merge();
546 }
547
548
549 pg_shard_t old_acting_primary = get_primary();
550 pg_shard_t old_up_primary = up_primary;
551 bool was_old_primary = is_primary();
552 bool was_old_nonprimary = is_nonprimary();
553
554 acting.swap(oldacting);
555 up.swap(oldup);
556 init_primary_up_acting(
557 newup,
558 newacting,
559 new_up_primary,
560 new_acting_primary);
561
562 if (info.stats.up != up ||
563 info.stats.acting != acting ||
564 info.stats.up_primary != new_up_primary ||
565 info.stats.acting_primary != new_acting_primary) {
566 info.stats.up = up;
567 info.stats.up_primary = new_up_primary;
568 info.stats.acting = acting;
569 info.stats.acting_primary = new_acting_primary;
570 info.stats.mapping_epoch = osdmap->get_epoch();
571 }
572
573 pl->clear_publish_stats();
574
575 // This will now be remapped during a backfill in cases
576 // where it would not have been before.
577 if (up != acting)
578 state_set(PG_STATE_REMAPPED);
579 else
580 state_clear(PG_STATE_REMAPPED);
581
582 int role = osdmap->calc_pg_role(pg_whoami, acting);
583 set_role(role);
584
585 // did acting, up, primary|acker change?
586 if (!lastmap) {
587 psdout(10) << " no lastmap" << dendl;
588 dirty_info = true;
589 dirty_big_info = true;
590 info.history.same_interval_since = osdmap->get_epoch();
591 } else {
592 std::stringstream debug;
593 ceph_assert(info.history.same_interval_since != 0);
594 bool new_interval = PastIntervals::check_new_interval(
595 old_acting_primary.osd,
596 new_acting_primary,
597 oldacting, newacting,
598 old_up_primary.osd,
599 new_up_primary,
600 oldup, newup,
601 info.history.same_interval_since,
602 info.history.last_epoch_clean,
603 osdmap.get(),
604 lastmap.get(),
605 info.pgid.pgid,
606 missing_loc.get_recoverable_predicate(),
607 &past_intervals,
608 &debug);
609 psdout(10) << __func__ << ": check_new_interval output: "
610 << debug.str() << dendl;
611 if (new_interval) {
612 if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
613 info.history.last_epoch_clean < osdmap->get_epoch()) {
614 psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
615 // our information is incomplete and useless; someone else was clean
616 // after everything we know about, since the osdmaps have been trimmed.
617 past_intervals.clear();
618 } else {
619 psdout(10) << " noting past " << past_intervals << dendl;
620 }
621 dirty_info = true;
622 dirty_big_info = true;
623 info.history.same_interval_since = osdmap->get_epoch();
624 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
625 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
626 osdmap->get_pg_num(info.pgid.pgid.pool()),
627 nullptr)) {
628 info.history.last_epoch_split = osdmap->get_epoch();
629 }
630 }
631 }
632
633 if (old_up_primary != up_primary ||
634 oldup != up) {
635 info.history.same_up_since = osdmap->get_epoch();
636 }
637 // this comparison includes primary rank via pg_shard_t
638 if (old_acting_primary != get_primary()) {
639 info.history.same_primary_since = osdmap->get_epoch();
640 }
641
642 on_new_interval();
643 pl->on_info_history_change();
644
645 psdout(1) << __func__ << " up " << oldup << " -> " << up
646 << ", acting " << oldacting << " -> " << acting
647 << ", acting_primary " << old_acting_primary << " -> "
648 << new_acting_primary
649 << ", up_primary " << old_up_primary << " -> " << new_up_primary
650 << ", role " << oldrole << " -> " << role
651 << ", features acting " << acting_features
652 << " upacting " << upacting_features
653 << dendl;
654
655 // deactivate.
656 state_clear(PG_STATE_ACTIVE);
657 state_clear(PG_STATE_PEERED);
658 state_clear(PG_STATE_PREMERGE);
659 state_clear(PG_STATE_DOWN);
660 state_clear(PG_STATE_RECOVERY_WAIT);
661 state_clear(PG_STATE_RECOVERY_TOOFULL);
662 state_clear(PG_STATE_RECOVERING);
663
664 peer_purged.clear();
665 acting_recovery_backfill.clear();
666
667 // reset primary/replica state?
668 if (was_old_primary || is_primary()) {
669 pl->clear_want_pg_temp();
670 } else if (was_old_nonprimary || is_nonprimary()) {
671 pl->clear_want_pg_temp();
672 }
673 clear_primary_state();
674
675 pl->on_change(t);
676
677 ceph_assert(!deleting);
678
679 // should we tell the primary we are here?
680 send_notify = !is_primary();
681
682 if (role != oldrole ||
683 was_old_primary != is_primary()) {
684 // did primary change?
685 if (was_old_primary != is_primary()) {
686 state_clear(PG_STATE_CLEAN);
687 }
688
689 pl->on_role_change();
690 } else {
691 // no role change.
692 // did primary change?
693 if (get_primary() != old_acting_primary) {
694 psdout(10) << oldacting << " -> " << acting
695 << ", acting primary "
696 << old_acting_primary << " -> " << get_primary()
697 << dendl;
698 } else {
699 // primary is the same.
700 if (is_primary()) {
701 // i am (still) primary. but my replica set changed.
702 state_clear(PG_STATE_CLEAN);
703
704 psdout(10) << oldacting << " -> " << acting
705 << ", replicas changed" << dendl;
706 }
707 }
708 }
709
710 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
711 psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
712 pl->queue_want_pg_temp(acting);
713 }
714 }
715
716 void PeeringState::on_new_interval()
717 {
718 dout(20) << __func__ << dendl;
719 const OSDMapRef osdmap = get_osdmap();
720
721 // initialize features
722 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
723 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
724 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
725 if (*p == CRUSH_ITEM_NONE)
726 continue;
727 uint64_t f = osdmap->get_xinfo(*p).features;
728 acting_features &= f;
729 upacting_features &= f;
730 }
731 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
732 if (*p == CRUSH_ITEM_NONE)
733 continue;
734 upacting_features &= osdmap->get_xinfo(*p).features;
735 }
736 psdout(20) << __func__ << " upacting_features 0x" << std::hex
737 << upacting_features << std::dec
738 << " from " << acting << "+" << up << dendl;
739
740 psdout(20) << __func__ << " checking missing set deletes flag. missing = "
741 << get_pg_log().get_missing() << dendl;
742
743 if (!pg_log.get_missing().may_include_deletes &&
744 !perform_deletes_during_peering()) {
745 pl->rebuild_missing_set_with_deletes(pg_log);
746 }
747 ceph_assert(
748 pg_log.get_missing().may_include_deletes ==
749 !perform_deletes_during_peering());
750
751 init_hb_stamps();
752
753 // update lease bounds for a new interval
754 auto mnow = pl->get_mnow();
755 prior_readable_until_ub = std::max(prior_readable_until_ub,
756 readable_until_ub);
757 prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
758 mnow, prior_readable_until_ub);
759 psdout(10) << __func__ << " prior_readable_until_ub "
760 << prior_readable_until_ub << " (mnow " << mnow << " + "
761 << info.history.prior_readable_until_ub << ")" << dendl;
762 prior_readable_down_osds.clear(); // we populate this when we build the priorset
763
764 readable_until =
765 readable_until_ub =
766 readable_until_ub_sent =
767 readable_until_ub_from_primary = ceph::signedspan::zero();
768
769 acting_readable_until_ub.clear();
770 if (is_primary()) {
771 acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
772 }
773
774 pl->on_new_interval();
775 }
776
777 void PeeringState::init_primary_up_acting(
778 const vector<int> &newup,
779 const vector<int> &newacting,
780 int new_up_primary,
781 int new_acting_primary)
782 {
783 actingset.clear();
784 acting = newacting;
785 for (uint8_t i = 0; i < acting.size(); ++i) {
786 if (acting[i] != CRUSH_ITEM_NONE)
787 actingset.insert(
788 pg_shard_t(
789 acting[i],
790 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
791 }
792 upset.clear();
793 up = newup;
794 for (uint8_t i = 0; i < up.size(); ++i) {
795 if (up[i] != CRUSH_ITEM_NONE)
796 upset.insert(
797 pg_shard_t(
798 up[i],
799 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
800 }
801 if (!pool.info.is_erasure()) {
802 // replicated
803 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
804 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
805 } else {
806 // erasure
807 up_primary = pg_shard_t();
808 primary = pg_shard_t();
809 for (uint8_t i = 0; i < up.size(); ++i) {
810 if (up[i] == new_up_primary) {
811 up_primary = pg_shard_t(up[i], shard_id_t(i));
812 break;
813 }
814 }
815 for (uint8_t i = 0; i < acting.size(); ++i) {
816 if (acting[i] == new_acting_primary) {
817 primary = pg_shard_t(acting[i], shard_id_t(i));
818 break;
819 }
820 }
821 ceph_assert(up_primary.osd == new_up_primary);
822 ceph_assert(primary.osd == new_acting_primary);
823 }
824 }
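// Example (illustrative values): for an erasure-coded pool with
// newacting == [4, CRUSH_ITEM_NONE, 7] and new_acting_primary == 4,
// actingset becomes { 4(0), 7(2) } (shard ids follow vector positions) and
// primary == pg_shard_t(4, shard_id_t(0)). For a replicated pool the same
// inputs would use shard_id_t::NO_SHARD throughout, giving
// primary == pg_shard_t(4, NO_SHARD).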
825
826 void PeeringState::init_hb_stamps()
827 {
828 if (is_primary()) {
829 // we care about all other osds in the acting set
830 hb_stamps.resize(acting.size() - 1);
831 unsigned i = 0;
832 for (auto p : acting) {
833 if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
834 continue;
835 }
836 hb_stamps[i++] = pl->get_hb_stamps(p);
837 }
838 hb_stamps.resize(i);
839 } else if (is_nonprimary()) {
840 // we care about just the primary
841 hb_stamps.resize(1);
842 hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
843 } else {
844 hb_stamps.clear();
845 }
846 dout(10) << __func__ << " now " << hb_stamps << dendl;
847 }
848
849
850 void PeeringState::clear_recovery_state()
851 {
852 async_recovery_targets.clear();
853 backfill_targets.clear();
854 }
855
856 void PeeringState::clear_primary_state()
857 {
858 psdout(10) << "clear_primary_state" << dendl;
859
860 // clear peering state
861 stray_set.clear();
862 peer_log_requested.clear();
863 peer_missing_requested.clear();
864 peer_info.clear();
865 peer_bytes.clear();
866 peer_missing.clear();
867 peer_last_complete_ondisk.clear();
868 peer_activated.clear();
869 min_last_complete_ondisk = eversion_t();
870 pg_trim_to = eversion_t();
871 might_have_unfound.clear();
872 need_up_thru = false;
873 missing_loc.clear();
874 pg_log.reset_recovery_pointers();
875
876 clear_recovery_state();
877
878 last_update_ondisk = eversion_t();
879 missing_loc.clear();
880 pl->clear_primary_state();
881 }
882
883 /// return [start,end) bounds for required past_intervals
884 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
885 const pg_info_t &info,
886 epoch_t oldest_map) {
887 epoch_t start = std::max(
888 info.history.last_epoch_clean ? info.history.last_epoch_clean :
889 info.history.epoch_pool_created,
890 oldest_map);
891 epoch_t end = std::max(
892 info.history.same_interval_since,
893 info.history.epoch_pool_created);
894 return make_pair(start, end);
895 }
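// Worked example: with last_epoch_clean == 880, epoch_pool_created == 500,
// same_interval_since == 900 and oldest_map == 700, this returns
// [max(880, 700), max(900, 500)) == [880, 900). Had the OSD trimmed maps
// up through epoch 890, the start would be clamped up to 890 instead.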
896
897
898 void PeeringState::check_past_interval_bounds() const
899 {
900 auto oldest_epoch = pl->oldest_stored_osdmap();
901 auto rpib = get_required_past_interval_bounds(
902 info,
903 oldest_epoch);
904 if (rpib.first >= rpib.second) {
905 // do not warn if the start bound is dictated by oldest_map; the
906 // past intervals are presumably appropriate given the pg info.
907 if (!past_intervals.empty() &&
908 rpib.first > oldest_epoch) {
909 pl->get_clog_error() << info.pgid << " required past_interval bounds are"
910 << " empty [" << rpib << ") but past_intervals is not: "
911 << past_intervals;
912 derr << info.pgid << " required past_interval bounds are"
913 << " empty [" << rpib << ") but past_intervals is not: "
914 << past_intervals << dendl;
915 }
916 } else {
917 if (past_intervals.empty()) {
918 pl->get_clog_error() << info.pgid << " required past_interval bounds are"
919 << " not empty [" << rpib << ") but past_intervals "
920 << past_intervals << " is empty";
921 derr << info.pgid << " required past_interval bounds are"
922 << " not empty [" << rpib << ") but past_intervals "
923 << past_intervals << " is empty" << dendl;
924 ceph_assert(!past_intervals.empty());
925 }
926
927 auto apib = past_intervals.get_bounds();
928 if (apib.first > rpib.first) {
929 pl->get_clog_error() << info.pgid << " past_intervals [" << apib
930 << ") start interval does not contain the required"
931 << " bound [" << rpib << ") start";
932 derr << info.pgid << " past_intervals [" << apib
933 << ") start interval does not contain the required"
934 << " bound [" << rpib << ") start" << dendl;
935 ceph_abort_msg("past_interval start interval mismatch");
936 }
937 if (apib.second != rpib.second) {
938     pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
939 				 << ") end does not match required [" << rpib
940 				 << ") end";
941     derr << info.pgid << " past_interval bound [" << apib
942 	 << ") end does not match required [" << rpib
943 	 << ") end" << dendl;
944 ceph_abort_msg("past_interval end mismatch");
945 }
946 }
947 }
948
949 int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
950 {
951 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
952 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
953
954 ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
955
956 // User can't set this too high anymore, but might be a legacy value
957 if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
958 pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
959 if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
960 pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
961   // Shift the range [min, max] to [0, max - min]
962 pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
963 ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
964
965 priority += pool_recovery_priority;
966
967 // Clamp to valid range
968 if (priority > max) {
969 return max;
970 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
971 return OSD_RECOVERY_PRIORITY_MIN;
972 } else {
973 return priority;
974 }
975 }
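// Worked example (assuming the usual constants: OSD_POOL_PRIORITY_MIN == -10
// and OSD_RECOVERY_PRIORITY_MIN == 0, so the shift adds 10): a pool
// recovery_priority of -3 becomes 7, and
//
//   clamp_recovery_priority(180, -3, 253);  // 180 + 7 = 187, within range
//
// returns 187. A legacy pool value of 50 is first clamped to
// OSD_POOL_PRIORITY_MAX before the shift is applied.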
976
977 unsigned PeeringState::get_recovery_priority()
978 {
979 // a higher value -> a higher priority
980 int ret = OSD_RECOVERY_PRIORITY_BASE;
981 int base = ret;
982
983 if (state & PG_STATE_FORCED_RECOVERY) {
984 ret = OSD_RECOVERY_PRIORITY_FORCED;
985 } else {
986 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
987 if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
988 base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
989 // inactive: no. of replicas < min_size, highest priority since it blocks IO
990 ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
991 }
992
993 int64_t pool_recovery_priority = 0;
994 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
995
996 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
997 }
998 psdout(20) << __func__ << " recovery priority is " << ret << dendl;
999 return static_cast<unsigned>(ret);
1000 }
1001
1002 unsigned PeeringState::get_backfill_priority()
1003 {
1004 // a higher value -> a higher priority
1005 int ret = OSD_BACKFILL_PRIORITY_BASE;
1006 int base = ret;
1007
1008 if (state & PG_STATE_FORCED_BACKFILL) {
1009 ret = OSD_BACKFILL_PRIORITY_FORCED;
1010 } else {
1011 if (acting.size() < pool.info.min_size) {
1012 base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
1013 // inactive: no. of replicas < min_size, highest priority since it blocks IO
1014 ret = base + (pool.info.min_size - acting.size());
1015
1016 } else if (is_undersized()) {
1017 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
1018 ceph_assert(pool.info.size > actingset.size());
1019 base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1020 ret = base + (pool.info.size - actingset.size());
1021
1022 } else if (is_degraded()) {
1023 // degraded: baseline degraded
1024 base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1025 }
1026
1027 // Adjust with pool's recovery priority
1028 int64_t pool_recovery_priority = 0;
1029 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
1030
1031 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
1032 }
1033
1034 psdout(20) << __func__ << " backfill priority is " << ret << dendl;
1035 return static_cast<unsigned>(ret);
1036 }
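// Example of the resulting tiers (assuming the usual constants, e.g.
// OSD_BACKFILL_PRIORITY_BASE == 100, OSD_BACKFILL_DEGRADED_PRIORITY_BASE
// == 140, OSD_BACKFILL_INACTIVE_PRIORITY_BASE == 220): a size-3/min_size-2
// pool down to one acting member scores 220 + (2 - 1) = 221; merely
// undersized with two of three shards scores 140 + (3 - 2) = 141; degraded
// but full-sized scores 140; plain backfill stays at 100 — all before the
// pool's recovery-priority adjustment and clamping above.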
1037
1038 unsigned PeeringState::get_delete_priority()
1039 {
1040 auto state = get_osdmap()->get_state(pg_whoami.osd);
1041 if (state & (CEPH_OSD_BACKFILLFULL |
1042 CEPH_OSD_FULL)) {
1043 return OSD_DELETE_PRIORITY_FULL;
1044 } else if (state & CEPH_OSD_NEARFULL) {
1045 return OSD_DELETE_PRIORITY_FULLISH;
1046 } else {
1047 return OSD_DELETE_PRIORITY_NORMAL;
1048 }
1049 }
1050
1051 bool PeeringState::set_force_recovery(bool b)
1052 {
1053 bool did = false;
1054 if (b) {
1055 if (!(state & PG_STATE_FORCED_RECOVERY) &&
1056 (state & (PG_STATE_DEGRADED |
1057 PG_STATE_RECOVERY_WAIT |
1058 PG_STATE_RECOVERING))) {
1059 psdout(20) << __func__ << " set" << dendl;
1060 state_set(PG_STATE_FORCED_RECOVERY);
1061 pl->publish_stats_to_osd();
1062 did = true;
1063 }
1064 } else if (state & PG_STATE_FORCED_RECOVERY) {
1065 psdout(20) << __func__ << " clear" << dendl;
1066 state_clear(PG_STATE_FORCED_RECOVERY);
1067 pl->publish_stats_to_osd();
1068 did = true;
1069 }
1070 if (did) {
1071 psdout(20) << __func__ << " state " << get_current_state()
1072 << dendl;
1073 pl->update_local_background_io_priority(get_recovery_priority());
1074 }
1075 return did;
1076 }
1077
1078 bool PeeringState::set_force_backfill(bool b)
1079 {
1080 bool did = false;
1081 if (b) {
1082 if (!(state & PG_STATE_FORCED_BACKFILL) &&
1083 (state & (PG_STATE_DEGRADED |
1084 PG_STATE_BACKFILL_WAIT |
1085 PG_STATE_BACKFILLING))) {
1086 psdout(10) << __func__ << " set" << dendl;
1087 state_set(PG_STATE_FORCED_BACKFILL);
1088 pl->publish_stats_to_osd();
1089 did = true;
1090 }
1091 } else if (state & PG_STATE_FORCED_BACKFILL) {
1092 psdout(10) << __func__ << " clear" << dendl;
1093 state_clear(PG_STATE_FORCED_BACKFILL);
1094 pl->publish_stats_to_osd();
1095 did = true;
1096 }
1097 if (did) {
1098 psdout(20) << __func__ << " state " << get_current_state()
1099 << dendl;
1100 pl->update_local_background_io_priority(get_backfill_priority());
1101 }
1102 return did;
1103 }
1104
1105 void PeeringState::schedule_renew_lease()
1106 {
1107 pl->schedule_renew_lease(
1108 last_peering_reset,
1109 readable_interval / 2);
1110 }
1111
1112 void PeeringState::send_lease()
1113 {
1114 epoch_t epoch = pl->get_osdmap_epoch();
1115 for (auto peer : actingset) {
1116 if (peer == pg_whoami) {
1117 continue;
1118 }
1119 pl->send_cluster_message(
1120 peer.osd,
1121 new MOSDPGLease(epoch,
1122 spg_t(spgid.pgid, peer.shard),
1123 get_lease()),
1124 epoch);
1125 }
1126 }
1127
1128 void PeeringState::proc_lease(const pg_lease_t& l)
1129 {
1130 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1131 psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
1132 << upacting_features << std::dec
1133 << " does not include SERVER_OCTOPUS" << dendl;
1134 return;
1135 }
1136 if (!is_nonprimary()) {
1137 psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
1138 return;
1139 }
1140 psdout(10) << __func__ << " " << l << dendl;
1141 if (l.readable_until_ub > readable_until_ub_from_primary) {
1142 readable_until_ub_from_primary = l.readable_until_ub;
1143 }
1144
1145 ceph::signedspan ru = ceph::signedspan::zero();
1146 if (l.readable_until != ceph::signedspan::zero() &&
1147 hb_stamps[0]->peer_clock_delta_ub) {
1148 ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
1149 psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
1150 << " -> ru " << ru << dendl;
1151 }
1152 if (ru > readable_until) {
1153 readable_until = ru;
1154 psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
1155 // NOTE: if we ever decide to block/queue ops on the replica,
1156 // we'll need to wake them up here.
1157 }
1158
1159 ceph::signedspan ruub;
1160 if (hb_stamps[0]->peer_clock_delta_lb) {
1161 ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
1162 psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
1163 << " -> ruub " << ruub << dendl;
1164 } else {
1165 ruub = pl->get_mnow() + l.interval;
1166 psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
1167 }
1168 if (ruub > readable_until_ub) {
1169 readable_until_ub = ruub;
1170 psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
1171 << dendl;
1172 }
1173 }
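// Example of the clock-delta arithmetic above (illustrative numbers): if
// the primary's lease says readable_until == 125s on its own clock, and our
// upper bound on (primary_clock - local_clock) is +2s, the conservative
// local value is ru = 125s - 2s = 123s. The upper bound ruub instead
// subtracts the lower bound on the delta (or, lacking one, falls back to
// mnow + interval), so it errs on the large side.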
1174
1175 void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
1176 {
1177 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1178 return;
1179 }
1180 auto now = pl->get_mnow();
1181 bool was_min = false;
1182 for (unsigned i = 0; i < acting.size(); ++i) {
1183 if (from == acting[i]) {
1184 // the lease_ack value is based on the primary's clock
1185 if (a.readable_until_ub > acting_readable_until_ub[i]) {
1186 if (acting_readable_until_ub[i] == readable_until) {
1187 was_min = true;
1188 }
1189 acting_readable_until_ub[i] = a.readable_until_ub;
1190 break;
1191 }
1192 }
1193 }
1194 if (was_min) {
1195 auto old_ru = readable_until;
1196 recalc_readable_until();
1197 if (now < old_ru) {
1198 pl->recheck_readable();
1199 }
1200 }
1201 }
1202
1203 void PeeringState::proc_renew_lease()
1204 {
1205 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1206 return;
1207 }
1208 renew_lease(pl->get_mnow());
1209 send_lease();
1210 schedule_renew_lease();
1211 }
1212
1213 void PeeringState::recalc_readable_until()
1214 {
1215   ceph_assert(is_primary());
1216 ceph::signedspan min = readable_until_ub_sent;
1217 for (unsigned i = 0; i < acting.size(); ++i) {
1218 if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
1219 continue;
1220 }
1221 dout(20) << __func__ << " peer osd." << acting[i]
1222 << " ruub " << acting_readable_until_ub[i] << dendl;
1223 if (acting_readable_until_ub[i] < min) {
1224 min = acting_readable_until_ub[i];
1225 }
1226 }
1227 readable_until = min;
1228 readable_until_ub = min;
1229 dout(20) << __func__ << " readable_until[_ub] " << readable_until
1230 << " (sent " << readable_until_ub_sent << ")" << dendl;
1231 }
1232
1233 bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
1234 {
1235 if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1236 return false;
1237 }
1238 bool changed = false;
1239 auto p = prior_readable_down_osds.begin();
1240 while (p != prior_readable_down_osds.end()) {
1241 if (map->is_dead(*p)) {
1242 dout(10) << __func__ << " prior_readable_down_osds osd." << *p
1243 << " is dead as of epoch " << map->get_epoch()
1244 << dendl;
1245 p = prior_readable_down_osds.erase(p);
1246 changed = true;
1247 } else {
1248 ++p;
1249 }
1250 }
1251 if (changed && prior_readable_down_osds.empty()) {
1252 psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
1253 clear_prior_readable_until_ub();
1254 return true;
1255 }
1256 return false;
1257 }
1258
1259 bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
1260 {
1261 epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
1262 if (need_up_thru &&
1263 up_thru >= info.history.same_interval_since) {
1264 psdout(10) << "adjust_need_up_thru now "
1265 << up_thru << ", need_up_thru now false" << dendl;
1266 need_up_thru = false;
1267 return true;
1268 }
1269 return false;
1270 }
1271
1272 PastIntervals::PriorSet PeeringState::build_prior()
1273 {
1274 if (1) {
1275 // sanity check
1276 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1277 it != peer_info.end();
1278 ++it) {
1279 ceph_assert(info.history.last_epoch_started >=
1280 it->second.history.last_epoch_started);
1281 }
1282 }
1283
1284 const OSDMap &osdmap = *get_osdmap();
1285 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1286 pool.info.is_erasure(),
1287 info.history.last_epoch_started,
1288 &missing_loc.get_recoverable_predicate(),
1289 [&](epoch_t start, int osd, epoch_t *lost_at) {
1290 const osd_info_t *pinfo = 0;
1291 if (osdmap.exists(osd)) {
1292 pinfo = &osdmap.get_info(osd);
1293 if (lost_at)
1294 *lost_at = pinfo->lost_at;
1295 }
1296
1297 if (osdmap.is_up(osd)) {
1298 return PastIntervals::UP;
1299 } else if (!pinfo) {
1300 return PastIntervals::DNE;
1301 } else if (pinfo->lost_at > start) {
1302 return PastIntervals::LOST;
1303 } else {
1304 return PastIntervals::DOWN;
1305 }
1306 },
1307 up,
1308 acting,
1309 dpp);
1310
1311 if (prior.pg_down) {
1312 state_set(PG_STATE_DOWN);
1313 }
1314
1315 if (get_osdmap()->get_up_thru(pg_whoami.osd) <
1316 info.history.same_interval_since) {
1317 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1318 << " < same_since " << info.history.same_interval_since
1319 << ", must notify monitor" << dendl;
1320 need_up_thru = true;
1321 } else {
1322 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1323 << " >= same_since " << info.history.same_interval_since
1324 << ", all is well" << dendl;
1325 need_up_thru = false;
1326 }
1327 pl->set_probe_targets(prior.probe);
1328 return prior;
1329 }
1330
1331 bool PeeringState::needs_recovery() const
1332 {
1333 ceph_assert(is_primary());
1334
1335 auto &missing = pg_log.get_missing();
1336
1337 if (missing.num_missing()) {
1338 psdout(10) << __func__ << " primary has " << missing.num_missing()
1339 << " missing" << dendl;
1340 return true;
1341 }
1342
1343 ceph_assert(!acting_recovery_backfill.empty());
1344 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
1345 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
1346 for (; a != end; ++a) {
1347 if (*a == get_primary()) continue;
1348 pg_shard_t peer = *a;
1349 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
1350 if (pm == peer_missing.end()) {
1351 psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
1352 << dendl;
1353 continue;
1354 }
1355 if (pm->second.num_missing()) {
1356 psdout(10) << __func__ << " osd." << peer << " has "
1357 << pm->second.num_missing() << " missing" << dendl;
1358 return true;
1359 }
1360 }
1361
1362 psdout(10) << __func__ << " is recovered" << dendl;
1363 return false;
1364 }
1365
1366 bool PeeringState::needs_backfill() const
1367 {
1368 ceph_assert(is_primary());
1369
1370   // We can assume that the only OSDs that might need backfill
1371   // are the ones in the backfill_targets set.
1372 set<pg_shard_t>::const_iterator end = backfill_targets.end();
1373 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
1374 for (; a != end; ++a) {
1375 pg_shard_t peer = *a;
1376 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
1377 if (!pi->second.last_backfill.is_max()) {
1378 psdout(10) << __func__ << " osd." << peer
1379 << " has last_backfill " << pi->second.last_backfill << dendl;
1380 return true;
1381 }
1382 }
1383
1384 psdout(10) << __func__ << " does not need backfill" << dendl;
1385 return false;
1386 }
1387
1388 /*
1389 * Returns true unless there is a non-lost OSD in might_have_unfound.
1390 */
1391 bool PeeringState::all_unfound_are_queried_or_lost(
1392 const OSDMapRef osdmap) const
1393 {
1394 ceph_assert(is_primary());
1395
1396 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1397 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1398 for (; peer != mend; ++peer) {
1399 if (peer_missing.count(*peer))
1400 continue;
1401 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1402 if (iter != peer_info.end() &&
1403 (iter->second.is_empty() || iter->second.dne()))
1404 continue;
1405 if (!osdmap->exists(peer->osd))
1406 continue;
1407 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1408 if (osd_info.lost_at <= osd_info.up_from) {
1409 // If there is even one OSD in might_have_unfound that isn't lost, we
1410 // still might retrieve our unfound.
1411 return false;
1412 }
1413 }
1414 psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
1415 << might_have_unfound
1416 << " have been queried or are marked lost" << dendl;
1417 return true;
1418 }
1419
1420
1421 void PeeringState::reject_reservation()
1422 {
1423 pl->unreserve_recovery_space();
1424 pl->send_cluster_message(
1425 primary.osd,
1426 new MBackfillReserve(
1427 MBackfillReserve::REJECT_TOOFULL,
1428 spg_t(info.pgid.pgid, primary.shard),
1429 get_osdmap_epoch()),
1430 get_osdmap_epoch());
1431 }
1432
1433 /**
1434 * find_best_info
1435 *
1436  * Returns an iterator to the best info in infos, chosen by:
1437 * 1) Prefer newer last_update
1438 * 2) Prefer longer tail if it brings another info into contiguity
1439 * 3) Prefer current primary
1440 */
1441 map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
1442 const map<pg_shard_t, pg_info_t> &infos,
1443 bool restrict_to_up_acting,
1444 bool *history_les_bound) const
1445 {
1446 ceph_assert(history_les_bound);
1447 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1448 * to make changes to this process. Also, make sure to update it
1449 * when you find bugs! */
1450 eversion_t min_last_update_acceptable = eversion_t::max();
1451 epoch_t max_last_epoch_started_found = 0;
1452 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1453 i != infos.end();
1454 ++i) {
1455 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1456 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1457 *history_les_bound = true;
1458 max_last_epoch_started_found = i->second.history.last_epoch_started;
1459 }
1460 if (!i->second.is_incomplete() &&
1461 max_last_epoch_started_found < i->second.last_epoch_started) {
1462 *history_les_bound = false;
1463 max_last_epoch_started_found = i->second.last_epoch_started;
1464 }
1465 }
1466 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1467 i != infos.end();
1468 ++i) {
1469 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1470 if (min_last_update_acceptable > i->second.last_update)
1471 min_last_update_acceptable = i->second.last_update;
1472 }
1473 }
1474 if (min_last_update_acceptable == eversion_t::max())
1475 return infos.end();
1476
1477 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1478 // find osd with newest last_update (oldest for ec_pool).
1479 // if there are multiples, prefer
1480 // - a longer tail, if it brings another peer into log contiguity
1481 // - the current primary
1482 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1483 p != infos.end();
1484 ++p) {
1485 if (restrict_to_up_acting && !is_up(p->first) &&
1486 !is_acting(p->first))
1487 continue;
1488 // Only consider peers with last_update >= min_last_update_acceptable
1489 if (p->second.last_update < min_last_update_acceptable)
1490 continue;
1491 // Disqualify anyone with a too old last_epoch_started
1492 if (p->second.last_epoch_started < max_last_epoch_started_found)
1493 continue;
1494 // Disqualify anyone who is incomplete (not fully backfilled)
1495 if (p->second.is_incomplete())
1496 continue;
1497 if (best == infos.end()) {
1498 best = p;
1499 continue;
1500 }
1501 // Prefer newer last_update
1502 if (pool.info.require_rollback()) {
1503 if (p->second.last_update > best->second.last_update)
1504 continue;
1505 if (p->second.last_update < best->second.last_update) {
1506 best = p;
1507 continue;
1508 }
1509 } else {
1510 if (p->second.last_update < best->second.last_update)
1511 continue;
1512 if (p->second.last_update > best->second.last_update) {
1513 best = p;
1514 continue;
1515 }
1516 }
1517
1518 // Prefer longer tail
1519 if (p->second.log_tail > best->second.log_tail) {
1520 continue;
1521 } else if (p->second.log_tail < best->second.log_tail) {
1522 best = p;
1523 continue;
1524 }
1525
1526 if (!p->second.has_missing() && best->second.has_missing()) {
1527 psdout(10) << __func__ << " prefer osd." << p->first
1528 << " because it is complete while best has missing"
1529 << dendl;
1530 best = p;
1531 continue;
1532 } else if (p->second.has_missing() && !best->second.has_missing()) {
1533 psdout(10) << __func__ << " skipping osd." << p->first
1534 << " because it has missing while best is complete"
1535 << dendl;
1536 continue;
1537 } else {
1538 // both are complete or have missing
1539 // fall through
1540 }
1541
1542 // prefer current primary (usually the caller), all things being equal
1543 if (p->first == pg_whoami) {
1544 psdout(10) << "calc_acting prefer osd." << p->first
1545 << " because it is current primary" << dendl;
1546 best = p;
1547 continue;
1548 }
1549 }
1550 return best;
1551 }
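// Worked example of the tie-breaking above for a replicated pool
// (illustrative eversions, printed epoch'version): two complete candidates
// with last_update 50'200 tie on criterion 1; the one with log_tail 40'100
// beats one with 45'100 (longer log, criterion 2); if tails also match, a
// candidate without missing objects is preferred, and finally the current
// primary wins.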
1552
1553 void PeeringState::calc_ec_acting(
1554 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1555 unsigned size,
1556 const vector<int> &acting,
1557 const vector<int> &up,
1558 const map<pg_shard_t, pg_info_t> &all_info,
1559 bool restrict_to_up_acting,
1560 vector<int> *_want,
1561 set<pg_shard_t> *backfill,
1562 set<pg_shard_t> *acting_backfill,
1563 ostream &ss)
1564 {
1565 vector<int> want(size, CRUSH_ITEM_NONE);
1566 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1567 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1568 i != all_info.end();
1569 ++i) {
1570 all_info_by_shard[i->first.shard].insert(i->first);
1571 }
1572 for (uint8_t i = 0; i < want.size(); ++i) {
1573 ss << "For position " << (unsigned)i << ": ";
1574 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1575 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1576 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1577 auth_log_shard->second.log_tail) {
1578 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1579 want[i] = up[i];
1580 continue;
1581 }
1582 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1583 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1584 << " and ";
1585 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1586 }
1587
1588 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1589 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1590 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1591 auth_log_shard->second.log_tail) {
1592 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1593 want[i] = acting[i];
1594 } else if (!restrict_to_up_acting) {
1595 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1596 j != all_info_by_shard[shard_id_t(i)].end();
1597 ++j) {
1598 ceph_assert(j->shard == i);
1599 if (!all_info.find(*j)->second.is_incomplete() &&
1600 all_info.find(*j)->second.last_update >=
1601 auth_log_shard->second.log_tail) {
1602 ss << " selecting stray: " << *j << std::endl;
1603 want[i] = j->osd;
1604 break;
1605 }
1606 }
1607 if (want[i] == CRUSH_ITEM_NONE)
1608 ss << " failed to fill position " << (int)i << std::endl;
1609 }
1610 }
1611
1612 for (uint8_t i = 0; i < want.size(); ++i) {
1613 if (want[i] != CRUSH_ITEM_NONE) {
1614 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1615 }
1616 }
1617 acting_backfill->insert(backfill->begin(), backfill->end());
1618 _want->swap(want);
1619 }
1620
1621 /**
1622 * calculate the desired acting set.
1623 *
1624 * Choose an appropriate acting set. Prefer up[0], unless it is
1625 * incomplete, or another osd has a longer tail that allows us to
1626 * bring other up nodes up to date.
1627 */
1628 void PeeringState::calc_replicated_acting(
1629 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1630 uint64_t force_auth_primary_missing_objects,
1631 unsigned size,
1632 const vector<int> &acting,
1633 const vector<int> &up,
1634 pg_shard_t up_primary,
1635 const map<pg_shard_t, pg_info_t> &all_info,
1636 bool restrict_to_up_acting,
1637 vector<int> *want,
1638 set<pg_shard_t> *backfill,
1639 set<pg_shard_t> *acting_backfill,
1640 const OSDMapRef osdmap,
1641 ostream &ss)
1642 {
1643 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1644
1645 ss << __func__ << " newest update on osd." << auth_log_shard_id
1646 << " with " << auth_log_shard->second
1647 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1648
1649 // select primary
1650 auto primary = all_info.find(up_primary);
1651 if (up.size() &&
1652 !primary->second.is_incomplete() &&
1653 primary->second.last_update >=
1654 auth_log_shard->second.log_tail) {
1655 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1656 auto approx_missing_objects =
1657 primary->second.stats.stats.sum.num_objects_missing;
1658 auto auth_version = auth_log_shard->second.last_update.version;
1659 auto primary_version = primary->second.last_update.version;
1660 if (auth_version > primary_version) {
1661 approx_missing_objects += auth_version - primary_version;
1662 } else {
1663 approx_missing_objects += primary_version - auth_version;
1664 }
1665 if ((uint64_t)approx_missing_objects >
1666 force_auth_primary_missing_objects) {
1667 primary = auth_log_shard;
1668 ss << "up_primary: " << up_primary << ") has approximate "
1669 << approx_missing_objects
1670 << "(>" << force_auth_primary_missing_objects <<") "
1671 << "missing objects, osd." << auth_log_shard_id
1672 << " selected as primary instead"
1673 << std::endl;
1674 } else {
1675 ss << "up_primary: " << up_primary << ") selected as primary"
1676 << std::endl;
1677 }
1678 } else {
1679 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1680 }
1681 } else {
1682 ceph_assert(!auth_log_shard->second.is_incomplete());
1683 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1684 << " selected as primary instead" << std::endl;
1685 primary = auth_log_shard;
1686 }
1687
1688 ss << __func__ << " primary is osd." << primary->first
1689 << " with " << primary->second << std::endl;
1690 want->push_back(primary->first.osd);
1691 acting_backfill->insert(primary->first);
1692
1693 /* We include auth_log_shard->second.log_tail because in GetLog,
1694 * we will request logs back to the min last_update over our
1695 * acting_backfill set, which will result in our log being extended
1696 * as far backwards as necessary to pick up any peers which can
1697 * be log recovered by auth_log_shard's log */
1698 eversion_t oldest_auth_log_entry =
1699 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1700
1701 // select replicas that have log contiguity with primary.
1702 // prefer up, then acting, then any peer_info osds
1703 for (auto i : up) {
1704 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1705 if (up_cand == primary->first)
1706 continue;
1707 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1708 if (cur_info.is_incomplete() ||
1709 cur_info.last_update < oldest_auth_log_entry) {
1710 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1711 backfill->insert(up_cand);
1712 acting_backfill->insert(up_cand);
1713 } else {
1714 want->push_back(i);
1715 acting_backfill->insert(up_cand);
1716 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1717 }
1718 }
1719
1720 if (want->size() >= size) {
1721 return;
1722 }
1723
1724 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1725 candidate_by_last_update.reserve(acting.size());
1726   // The acting set no longer contains backfill OSDs; those were handled
1726   // in the up loop above.
1727 for (auto i : acting) {
1728 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1729 // skip up osds we already considered above
1730 if (acting_cand == primary->first)
1731 continue;
1732 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1733 if (up_it != up.end())
1734 continue;
1735
1736 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1737 if (cur_info.is_incomplete() ||
1738 cur_info.last_update < oldest_auth_log_entry) {
1739 ss << " shard " << acting_cand << " (acting) REJECTED "
1740 << cur_info << std::endl;
1741 } else {
1742 candidate_by_last_update.emplace_back(cur_info.last_update, i);
1743 }
1744 }
1745
1746   auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
1747 const std::pair<eversion_t, int> &rhs) {
1748 return lhs.first > rhs.first;
1749 };
1750 // sort by last_update, in descending order.
1751 std::sort(candidate_by_last_update.begin(),
1752 candidate_by_last_update.end(), sort_by_eversion);
1753 for (auto &p: candidate_by_last_update) {
1754 ceph_assert(want->size() < size);
1755 want->push_back(p.second);
1756 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1757 acting_backfill->insert(s);
1758 ss << " shard " << s << " (acting) accepted "
1759 << all_info.find(s)->second << std::endl;
1760 if (want->size() >= size) {
1761 return;
1762 }
1763 }
1764
1765 if (restrict_to_up_acting) {
1766 return;
1767 }
1768 candidate_by_last_update.clear();
1769 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1770 // continue to search stray to find more suitable peers
1771 for (auto &i : all_info) {
1772 // skip up osds we already considered above
1773 if (i.first == primary->first)
1774 continue;
1775 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1776 if (up_it != up.end())
1777 continue;
1778 vector<int>::const_iterator acting_it = find(
1779 acting.begin(), acting.end(), i.first.osd);
1780 if (acting_it != acting.end())
1781 continue;
1782
1783 if (i.second.is_incomplete() ||
1784 i.second.last_update < oldest_auth_log_entry) {
1785 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1786 << std::endl;
1787 } else {
1788 candidate_by_last_update.emplace_back(
1789 i.second.last_update, i.first.osd);
1790 }
1791 }
1792
1793 if (candidate_by_last_update.empty()) {
1794 // save us some effort
1795 return;
1796 }
1797
1798 // sort by last_update, in descending order.
1799 std::sort(candidate_by_last_update.begin(),
1800 candidate_by_last_update.end(), sort_by_eversion);
1801
1802 for (auto &p: candidate_by_last_update) {
1803 ceph_assert(want->size() < size);
1804 want->push_back(p.second);
1805 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1806 acting_backfill->insert(s);
1807 ss << " shard " << s << " (stray) accepted "
1808 << all_info.find(s)->second << std::endl;
1809 if (want->size() >= size) {
1810 return;
1811 }
1812 }
1813 }
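/* Editor's sketch (illustrative only, not part of the build): the selection
 * above proceeds in three tiers -- up, then acting, then stray -- and within
 * the last two tiers candidates are ranked by last_update, newest first.
 * The ranking step in isolation, with hypothetical inputs:
 *
 *   std::vector<std::pair<eversion_t, int>> cand = {
 *     {eversion_t(10, 5), 3},   // osd.3, oldest log head
 *     {eversion_t(12, 9), 1},   // osd.1, newest log head
 *     {eversion_t(11, 2), 7},   // osd.7
 *   };
 *   std::sort(cand.begin(), cand.end(),
 *             [](auto &l, auto &r) { return l.first > r.first; });
 *   // cand is now osd.1, osd.7, osd.3; osds are appended to *want in this
 *   // order until want->size() reaches the pool size.
 */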
1814
1815 bool PeeringState::recoverable(const vector<int> &want) const
1816 {
1817 unsigned num_want_acting = 0;
1818 set<pg_shard_t> have;
1819 for (int i = 0; i < (int)want.size(); ++i) {
1820 if (want[i] != CRUSH_ITEM_NONE) {
1821 ++num_want_acting;
1822 have.insert(
1823 pg_shard_t(
1824 want[i],
1825 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1826 }
1827 }
1828
1829 if (num_want_acting < pool.info.min_size) {
1830 const bool recovery_ec_pool_below_min_size =
1831 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
1832
1833 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
1834 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus OSDs" << dendl;
1835 return false;
1836 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
1837 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
1838 return false;
1839 }
1840 }
1841 if (missing_loc.get_recoverable_predicate()(have)) {
1842 return true;
1843 } else {
1844 psdout(10) << __func__ << " failed, not recoverable " << dendl;
1845 return false;
1846 }
1847 }
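/* Editor's note (worked example, hypothetical values): how the want vector
 * above maps to shards and interacts with min_size. For a replicated pool
 * with size=3, min_size=2 and want={0, CRUSH_ITEM_NONE, 2}, num_want_acting
 * is 2 and have = {pg_shard_t(0), pg_shard_t(2)} (NO_SHARD, since the pool
 * is not erasure coded); the PG is at min_size, so the below-min-size
 * checks are not consulted. With want={0, CRUSH_ITEM_NONE, CRUSH_ITEM_NONE},
 * num_want_acting drops to 1 < min_size and the feature/config checks above
 * decide whether recovery may proceed.
 */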
1848
1849 void PeeringState::choose_async_recovery_ec(
1850 const map<pg_shard_t, pg_info_t> &all_info,
1851 const pg_info_t &auth_info,
1852 vector<int> *want,
1853 set<pg_shard_t> *async_recovery,
1854 const OSDMapRef osdmap) const
1855 {
1856 set<pair<int, pg_shard_t> > candidates_by_cost;
1857 for (uint8_t i = 0; i < want->size(); ++i) {
1858 if ((*want)[i] == CRUSH_ITEM_NONE)
1859 continue;
1860
1861 // Considering log entries to recover is accurate enough for
1862 // now. We could use minimum_to_decode_with_cost() later if
1863 // necessary.
1864 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1865 // do not include strays
1866 if (stray_set.find(shard_i) != stray_set.end())
1867 continue;
1868 // Do not include an osd that is not up, since choosing it as
1869 // an async_recovery_target will move it out of the acting set.
1870 // This results in it being identified as a stray during peering,
1871 // because it is no longer in the up or acting set.
1872 if (!is_up(shard_i))
1873 continue;
1874 auto shard_info = all_info.find(shard_i)->second;
1875 // for ec pools we rollback all entries past the authoritative
1876 // last_update *before* activation. This is relatively inexpensive
1877 // compared to recovery, since it is purely local, so treat shards
1878 // past the authoritative last_update the same as those equal to it.
1879 version_t auth_version = auth_info.last_update.version;
1880 version_t candidate_version = shard_info.last_update.version;
1881 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1882 auto approx_missing_objects =
1883 shard_info.stats.stats.sum.num_objects_missing;
1884 if (auth_version > candidate_version) {
1885 approx_missing_objects += auth_version - candidate_version;
1886 }
1887 if (static_cast<uint64_t>(approx_missing_objects) >
1888 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1889 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1890 }
1891 } else {
1892 if (auth_version > candidate_version &&
1893 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1894 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1895 }
1896 }
1897 }
1898
1899 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1900 << dendl;
1901
1902 // take out as many osds as we can for async recovery, in order of cost
1903 for (auto rit = candidates_by_cost.rbegin();
1904 rit != candidates_by_cost.rend(); ++rit) {
1905 pg_shard_t cur_shard = rit->second;
1906 vector<int> candidate_want(*want);
1907 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1908 if (recoverable(candidate_want)) {
1909 want->swap(candidate_want);
1910 async_recovery->insert(cur_shard);
1911 }
1912 }
1913 psdout(20) << __func__ << " result want=" << *want
1914 << " async_recovery=" << *async_recovery << dendl;
1915 }
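/* Editor's note: a worked example of the cost heuristic above, assuming
 * nautilus-or-later peers and osd_async_recovery_min_cost = 100 (the
 * shipped default may differ). If the authoritative shard is at
 * last_update.version = 1500 and a candidate shard is at version 1350 with
 * num_objects_missing = 20, then
 *
 *   approx_missing_objects = 20 + (1500 - 1350) = 170 > 100
 *
 * so the shard becomes a candidate; it is actually moved to async recovery
 * only if recoverable() still holds for the reduced want set.
 */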
1916
1917 void PeeringState::choose_async_recovery_replicated(
1918 const map<pg_shard_t, pg_info_t> &all_info,
1919 const pg_info_t &auth_info,
1920 vector<int> *want,
1921 set<pg_shard_t> *async_recovery,
1922 const OSDMapRef osdmap) const
1923 {
1924 set<pair<int, pg_shard_t> > candidates_by_cost;
1925 for (auto osd_num : *want) {
1926 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1927 // do not include strays
1928 if (stray_set.find(shard_i) != stray_set.end())
1929 continue;
1930 // Do not include an osd that is not up, since choosing it as
1931 // an async_recovery_target will move it out of the acting set.
1932 // This results in it being identified as a stray during peering,
1933 // because it is no longer in the up or acting set.
1934 if (!is_up(shard_i))
1935 continue;
1936 auto shard_info = all_info.find(shard_i)->second;
1937 // use the approximate magnitude of the difference in length of
1938 // logs plus historical missing objects as the cost of recovery
1939 version_t auth_version = auth_info.last_update.version;
1940 version_t candidate_version = shard_info.last_update.version;
1941 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1942 auto approx_missing_objects =
1943 shard_info.stats.stats.sum.num_objects_missing;
1944 if (auth_version > candidate_version) {
1945 approx_missing_objects += auth_version - candidate_version;
1946 } else {
1947 approx_missing_objects += candidate_version - auth_version;
1948 }
1949 if (static_cast<uint64_t>(approx_missing_objects) >
1950 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1951 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1952 }
1953 } else {
1954 size_t approx_entries;
1955 if (auth_version > candidate_version) {
1956 approx_entries = auth_version - candidate_version;
1957 } else {
1958 approx_entries = candidate_version - auth_version;
1959 }
1960 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1962 }
1963 }
1964 }
1965
1966 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1967 << dendl;
1968 // take out as many osds as we can for async recovery, in order of cost
1969 for (auto rit = candidates_by_cost.rbegin();
1970 rit != candidates_by_cost.rend(); ++rit) {
1971 if (want->size() <= pool.info.min_size) {
1972 break;
1973 }
1974 pg_shard_t cur_shard = rit->second;
1975 vector<int> candidate_want(*want);
1976 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1977 if (*it == cur_shard.osd) {
1978 candidate_want.erase(it);
1979 want->swap(candidate_want);
1980 async_recovery->insert(cur_shard);
1981 break;
1982 }
1983 }
1984 }
1985 psdout(20) << __func__ << " result want=" << *want
1986 << " async_recovery=" << *async_recovery << dendl;
1987 }
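/* Editor's note: unlike the EC variant above, the replicated cost counts
 * the version delta in both directions (a replica ahead of the
 * authoritative head still has entries to reconcile), and an OSD is only
 * erased from want while want->size() stays above pool.info.min_size,
 * since a replicated want list has no per-shard positions to preserve
 * with CRUSH_ITEM_NONE.
 */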
1988
1989
1990
1991 /**
1992 * choose acting
1993 *
1994 * calculate the desired acting, and request a change with the monitor
1995 * if it differs from the current acting.
1996 *
1997 * if restrict_to_up_acting=true, we filter out anything that's not in
1998 * up/acting. in order to lift this restriction, we need to
1999 * 1) check whether it's worth switching the acting set any time we get
2000 * a new pg info (not just here, when recovery finishes)
2001 * 2) check whether anything in want_acting went down on each new map
2002 * (and, if so, calculate a new want_acting)
2003 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2004 * TODO!
2005 */
2006 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2007 bool restrict_to_up_acting,
2008 bool *history_les_bound,
2009 bool request_pg_temp_change_only)
2010 {
2011 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2012 all_info[pg_whoami] = info;
2013
2014 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2015 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
2016 p != all_info.end();
2017 ++p) {
2018 psdout(10) << __func__ << " all_info osd." << p->first << " "
2019 << p->second << dendl;
2020 }
2021 }
2022
2023 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
2024 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
2025
2026 if (auth_log_shard == all_info.end()) {
2027 if (up != acting) {
2028 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2029 << " reverting to up" << dendl;
2030 want_acting = up;
2031 vector<int> empty;
2032 pl->queue_want_pg_temp(empty);
2033 } else {
2034 psdout(10) << __func__ << " failed" << dendl;
2035 ceph_assert(want_acting.empty());
2036 }
2037 return false;
2038 }
2039
2040 ceph_assert(!auth_log_shard->second.is_incomplete());
2041 auth_log_shard_id = auth_log_shard->first;
2042
2043 set<pg_shard_t> want_backfill, want_acting_backfill;
2044 vector<int> want;
2045 stringstream ss;
2046 if (pool.info.is_replicated())
2047 calc_replicated_acting(
2048 auth_log_shard,
2049 cct->_conf.get_val<uint64_t>(
2050 "osd_force_auth_primary_missing_objects"),
2051 get_osdmap()->get_pg_size(info.pgid.pgid),
2052 acting,
2053 up,
2054 up_primary,
2055 all_info,
2056 restrict_to_up_acting,
2057 &want,
2058 &want_backfill,
2059 &want_acting_backfill,
2060 get_osdmap(),
2061 ss);
2062 else
2063 calc_ec_acting(
2064 auth_log_shard,
2065 get_osdmap()->get_pg_size(info.pgid.pgid),
2066 acting,
2067 up,
2068 all_info,
2069 restrict_to_up_acting,
2070 &want,
2071 &want_backfill,
2072 &want_acting_backfill,
2073 ss);
2074 psdout(10) << ss.str() << dendl;
2075
2076 if (!recoverable(want)) {
2077 want_acting.clear();
2078 return false;
2079 }
2080
2081 set<pg_shard_t> want_async_recovery;
2082 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2083 if (pool.info.is_erasure()) {
2084 choose_async_recovery_ec(
2085 all_info, auth_log_shard->second, &want, &want_async_recovery,
2086 get_osdmap());
2087 } else {
2088 choose_async_recovery_replicated(
2089 all_info, auth_log_shard->second, &want, &want_async_recovery,
2090 get_osdmap());
2091 }
2092 }
2093 while (want.size() > pool.info.size) {
2094 // async recovery should have taken out as many osds as it can.
2095 // if not, then always evict the last peer
2096 // (will get synchronously recovered later)
2097 psdout(10) << __func__ << " evicting osd." << want.back()
2098 << " from oversized want " << want << dendl;
2099 want.pop_back();
2100 }
2101 if (want != acting) {
2102 psdout(10) << __func__ << " want " << want << " != acting " << acting
2103 << ", requesting pg_temp change" << dendl;
2104 want_acting = want;
2105
2106 if (!cct->_conf->osd_debug_no_acting_change) {
2107 if (want_acting == up) {
2108 // There can't be any pending backfill if
2109 // want is the same as crush map up OSDs.
2110 ceph_assert(want_backfill.empty());
2111 vector<int> empty;
2112 pl->queue_want_pg_temp(empty);
2113 } else
2114 pl->queue_want_pg_temp(want);
2115 }
2116 return false;
2117 }
2118 if (request_pg_temp_change_only)
2119 return true;
2120 want_acting.clear();
2121 acting_recovery_backfill = want_acting_backfill;
2122 psdout(10) << "acting_recovery_backfill is "
2123 << acting_recovery_backfill << dendl;
2124 ceph_assert(
2125 backfill_targets.empty() ||
2126 backfill_targets == want_backfill);
2127 if (backfill_targets.empty()) {
2128 // Caller is GetInfo
2129 backfill_targets = want_backfill;
2130 }
2131 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2132 ceph_assert(
2133 async_recovery_targets.empty() ||
2134 async_recovery_targets == want_async_recovery ||
2135 !needs_recovery());
2136 if (async_recovery_targets.empty() || !needs_recovery()) {
2137 async_recovery_targets = want_async_recovery;
2138 }
2139 // Will not change if already set because up would have had to change
2140 // Verify that nothing in backfill is in stray_set
2141 for (set<pg_shard_t>::iterator i = want_backfill.begin();
2142 i != want_backfill.end();
2143 ++i) {
2144 ceph_assert(stray_set.find(*i) == stray_set.end());
2145 }
2146 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2147 << want_backfill << " async_recovery_targets="
2148 << async_recovery_targets << dendl;
2149 return true;
2150 }
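/* Editor's note: a compact summary of choose_acting()'s outcomes, derived
 * from the code above. (1) No authoritative info: want_acting is set to up
 * (or left empty) and false is returned. (2) want differs from acting: want
 * is recorded in want_acting, a pg_temp change is queued with the monitor
 * via queue_want_pg_temp(), and false is returned; peering restarts when
 * the new map arrives. (3) want == acting: the backfill and async-recovery
 * target sets are committed and true is returned.
 */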
2151
2152 void PeeringState::log_weirdness()
2153 {
2154 if (pg_log.get_tail() != info.log_tail)
2155 pl->get_clog_error() << info.pgid
2156 << " info mismatch, log.tail " << pg_log.get_tail()
2157 << " != info.log_tail " << info.log_tail;
2158 if (pg_log.get_head() != info.last_update)
2159 pl->get_clog_error() << info.pgid
2160 << " info mismatch, log.head " << pg_log.get_head()
2161 << " != info.last_update " << info.last_update;
2162
2163 if (!pg_log.get_log().empty()) {
2164 // sloppy check
2165 if (pg_log.get_log().log.begin()->version <= pg_log.get_tail())
2166 pl->get_clog_error() << info.pgid
2167 << " log bound mismatch, info (tail,head] ("
2168 << pg_log.get_tail() << ","
2169 << pg_log.get_head() << "]"
2170 << " actual ["
2171 << pg_log.get_log().log.begin()->version << ","
2172 << pg_log.get_log().log.rbegin()->version << "]";
2173 }
2174
2175 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2176 pl->get_clog_error() << info.pgid
2177 << " caller_ops.size "
2178 << pg_log.get_log().caller_ops.size()
2179 << " > log size " << pg_log.get_log().log.size();
2180 }
2181 }
2182
2183 /*
2184 * Process information from a replica to determine if it could have any
2185 * objects that I need.
2186 *
2187 * TODO: if the missing set becomes very large, this could get expensive.
2188 * Instead, we probably want to just iterate over our unfound set.
2189 */
2190 bool PeeringState::search_for_missing(
2191 const pg_info_t &oinfo, const pg_missing_t &omissing,
2192 pg_shard_t from,
2193 PeeringCtxWrapper &ctx)
2194 {
2195 uint64_t num_unfound_before = missing_loc.num_unfound();
2196 bool found_missing = missing_loc.add_source_info(
2197 from, oinfo, omissing, ctx.handle);
2198 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2199 pl->publish_stats_to_osd();
2200 // avoid doing this if the peer is empty. This is a bit of paranoia
2201 // to avoid doing something rash if add_source_info() above
2202 // incorrectly decided we found something new. (If the peer has
2203 // last_update=0'0, that's impossible.)
2204 if (found_missing &&
2205 oinfo.last_update != eversion_t()) {
2206 pg_info_t tinfo(oinfo);
2207 tinfo.pgid.shard = pg_whoami.shard;
2208 ctx.send_info(
2209 from.osd,
2210 spg_t(info.pgid.pgid, from.shard),
2211 get_osdmap_epoch(), // fixme: use lower epoch?
2212 get_osdmap_epoch(),
2213 tinfo);
2214 }
2215 return found_missing;
2216 }
2217
2218 bool PeeringState::discover_all_missing(
2219 BufferedRecoveryMessages &rctx)
2220 {
2221 auto &missing = pg_log.get_missing();
2222 uint64_t unfound = get_num_unfound();
2223 bool any = false; // did we start any queries
2224
2225 psdout(10) << __func__ << " "
2226 << missing.num_missing() << " missing, "
2227 << unfound << " unfound"
2228 << dendl;
2229
2230 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
2231 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
2232 for (; m != mend; ++m) {
2233 pg_shard_t peer(*m);
2234
2235 if (!get_osdmap()->is_up(peer.osd)) {
2236 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2237 continue;
2238 }
2239
2240 if (peer_purged.count(peer)) {
2241 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2242 continue;
2243 }
2244
2245 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
2246 if (iter != peer_info.end() &&
2247 (iter->second.is_empty() || iter->second.dne())) {
2248 // ignore empty peers
2249 continue;
2250 }
2251
2252 // If we've requested any of this stuff, the pg_missing_t information
2253 // should be on its way.
2254 // TODO: coalesce requested_* into a single data structure
2255 if (peer_missing.find(peer) != peer_missing.end()) {
2256 psdout(20) << __func__ << ": osd." << peer
2257 << ": we already have pg_missing_t" << dendl;
2258 continue;
2259 }
2260 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2261 psdout(20) << __func__ << ": osd." << peer
2262 << ": in peer_log_requested" << dendl;
2263 continue;
2264 }
2265 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2266 psdout(20) << __func__ << ": osd." << peer
2267 << ": in peer_missing_requested" << dendl;
2268 continue;
2269 }
2270
2271 // Request missing
2272 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2273 << dendl;
2274 peer_missing_requested.insert(peer);
2275 rctx.send_query(
2276 peer.osd,
2277 spg_t(info.pgid.pgid, peer.shard),
2278 pg_query_t(
2279 pg_query_t::FULLLOG,
2280 peer.shard, pg_whoami.shard,
2281 info.history, get_osdmap_epoch()));
2282 any = true;
2283 }
2284 return any;
2285 }
2286
2287 /* Build the might_have_unfound set.
2288 *
2289 * This is used by the primary OSD during recovery.
2290 *
2291 * This set tracks the OSDs which might have unfound objects that the primary
2292 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2293 * will remove the OSD from the set.
2294 */
2295 void PeeringState::build_might_have_unfound()
2296 {
2297 ceph_assert(might_have_unfound.empty());
2298 ceph_assert(is_primary());
2299
2300 psdout(10) << __func__ << dendl;
2301
2302 check_past_interval_bounds();
2303
2304 might_have_unfound = past_intervals.get_might_have_unfound(
2305 pg_whoami,
2306 pool.info.is_erasure());
2307
2308 // include any (stray) peers
2309 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
2310 p != peer_info.end();
2311 ++p)
2312 might_have_unfound.insert(p->first);
2313
2314 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2315 }
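/* Editor's note: might_have_unfound is thus the union of (a) OSDs that
 * PastIntervals believes may have served this PG in a prior interval and
 * (b) every peer that has sent us a pg_info_t, including strays. Each
 * entry is a candidate source that discover_all_missing() may query for a
 * full log.
 */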
2316
2317 void PeeringState::activate(
2318 ObjectStore::Transaction& t,
2319 epoch_t activation_epoch,
2320 PeeringCtxWrapper &ctx)
2321 {
2322 ceph_assert(!is_peered());
2323
2324 // twiddle pg state
2325 state_clear(PG_STATE_DOWN);
2326
2327 send_notify = false;
2328
2329 if (is_primary()) {
2330 // only update primary last_epoch_started if we will go active
2331 if (acting.size() >= pool.info.min_size) {
2332 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2333 info.last_epoch_started <= activation_epoch);
2334 info.last_epoch_started = activation_epoch;
2335 info.last_interval_started = info.history.same_interval_since;
2336 }
2337 } else if (is_acting(pg_whoami)) {
2338 /* update last_epoch_started on acting replica to whatever the primary sent
2339 * unless it's smaller (could happen if we are going peered rather than
2340 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2341 if (info.last_epoch_started < activation_epoch) {
2342 info.last_epoch_started = activation_epoch;
2343 info.last_interval_started = info.history.same_interval_since;
2344 }
2345 }
2346
2347 auto &missing = pg_log.get_missing();
2348
2349 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2350 if (is_primary()) {
2351 last_update_ondisk = info.last_update;
2352 }
2353 last_update_applied = info.last_update;
2354 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2355
2356 need_up_thru = false;
2357
2358 // write pg info, log
2359 dirty_info = true;
2360 dirty_big_info = true; // maybe
2361
2362 pl->schedule_event_on_commit(
2363 t,
2364 std::make_shared<PGPeeringEvent>(
2365 get_osdmap_epoch(),
2366 get_osdmap_epoch(),
2367 ActivateCommitted(
2368 get_osdmap_epoch(),
2369 activation_epoch)));
2370
2371 // init complete pointer
2372 if (missing.num_missing() == 0) {
2373 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2374 << " -> " << info.last_update << dendl;
2375 info.last_complete = info.last_update;
2376 info.stats.stats.sum.num_objects_missing = 0;
2377 pg_log.reset_recovery_pointers();
2378 } else {
2379 psdout(10) << "activate - not complete, " << missing << dendl;
2380 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2381 pg_log.activate_not_complete(info);
2382 }
2383
2384 log_weirdness();
2385
2386 if (is_primary()) {
2387 // initialize snap_trimq
2388 interval_set<snapid_t> to_trim;
2389 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2390 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2391 if (p != removed_snaps_queue.end()) {
2392 dout(20) << "activate - purged_snaps " << info.purged_snaps
2393 << " removed_snaps " << p->second
2394 << dendl;
2395 for (auto q : p->second) {
2396 to_trim.insert(q.first, q.second);
2397 }
2398 }
2399 interval_set<snapid_t> purged;
2400 purged.intersection_of(to_trim, info.purged_snaps);
2401 to_trim.subtract(purged);
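/* Editor's note: a worked example of the interval arithmetic above, with
 * hypothetical snap ids. If the osdmap's removed_snaps_queue yields
 * to_trim = [4..10] and this PG had already recorded purged_snaps = [4..6],
 * then purged = [4..6] and to_trim becomes [7..10]: we re-report only what
 * we actually purged and still queue the remainder for trimming.
 */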
2402
2403 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2404 renew_lease(pl->get_mnow());
2405 // do not schedule until we are actually activated
2406 }
2407
2408 // adjust purged_snaps: the PG may have been inactive while snaps were
2409 // pruned from the removed_snaps_queue in the osdmap. update local
2410 // purged_snaps to reflect only those snaps that we thought were pruned
2411 // and were still in the queue.
2412 info.purged_snaps.swap(purged);
2413
2414 // start up replicas
2415 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2416 prior_readable_until_ub);
2417
2418 ceph_assert(!acting_recovery_backfill.empty());
2419 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2420 i != acting_recovery_backfill.end();
2421 ++i) {
2422 if (*i == pg_whoami) continue;
2423 pg_shard_t peer = *i;
2424 ceph_assert(peer_info.count(peer));
2425 pg_info_t& pi = peer_info[peer];
2426
2427 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2428
2429 MOSDPGLog *m = 0;
2430 ceph_assert(peer_missing.count(peer));
2431 pg_missing_t& pm = peer_missing[peer];
2432
2433 bool needs_past_intervals = pi.dne();
2434
2435 if (pi.last_update == info.last_update) {
2436 // empty log
2437 if (!pi.last_backfill.is_max())
2438 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2439 << peer
2440 << " from (" << pi.log_tail << "," << pi.last_update
2441 << "] " << pi.last_backfill
2442 << " to " << info.last_update;
2443 if (!pi.is_empty()) {
2444 psdout(10) << "activate peer osd." << peer
2445 << " is up to date, queueing in pending_activators" << dendl;
2446 ctx.send_info(
2447 peer.osd,
2448 spg_t(info.pgid.pgid, peer.shard),
2449 get_osdmap_epoch(), // fixme: use lower epoch?
2450 get_osdmap_epoch(),
2451 info,
2452 get_lease());
2453 } else {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, but sending pg_log anyway" << dendl;
2456 m = new MOSDPGLog(
2457 i->shard, pg_whoami.shard,
2458 get_osdmap_epoch(), info,
2459 last_peering_reset);
2460 }
2461 } else if (
2462 pg_log.get_tail() > pi.last_update ||
2463 pi.last_backfill == hobject_t() ||
2464 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2465 /* ^ This last case covers a situation where a replica is not contiguous
2466 * with the auth_log, but is contiguous with this primary's log.
2467 * Reshuffling the acting set to handle this would be tricky, so instead
2468 * we just go ahead and backfill it anyway. This is probably preferable
2469 * in any case, since the replica in question would have to be
2470 * significantly behind.
2471 */
2472 // backfill
2473 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2474 << " from (" << pi.log_tail << "," << pi.last_update
2475 << "] " << pi.last_backfill
2476 << " to " << info.last_update;
2477
2478 pi.last_update = info.last_update;
2479 pi.last_complete = info.last_update;
2480 pi.set_last_backfill(hobject_t());
2481 pi.last_epoch_started = info.last_epoch_started;
2482 pi.last_interval_started = info.last_interval_started;
2483 pi.history = info.history;
2484 pi.hit_set = info.hit_set;
2485 // Save num_bytes for reservation request, can't be negative
2486 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2487 pi.stats.stats.clear();
2488 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2489
2490 // initialize peer with our purged_snaps.
2491 pi.purged_snaps = info.purged_snaps;
2492
2493 m = new MOSDPGLog(
2494 i->shard, pg_whoami.shard,
2495 get_osdmap_epoch(), pi,
2496 last_peering_reset /* epoch to create pg at */);
2497
2498 // send some recent log, so that op dup detection works well.
2499 m->log.copy_up_to(cct, pg_log.get_log(),
2500 cct->_conf->osd_max_pg_log_entries);
2501 m->info.log_tail = m->log.tail;
2502 pi.log_tail = m->log.tail; // sigh...
2503
2504 pm.clear();
2505 } else {
2506 // catch up
2507 ceph_assert(pg_log.get_tail() <= pi.last_update);
2508 m = new MOSDPGLog(
2509 i->shard, pg_whoami.shard,
2510 get_osdmap_epoch(), info,
2511 last_peering_reset /* epoch to create pg at */);
2512 // send new stuff to append to replicas log
2513 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2514 }
2515
2516 // share past_intervals if we are creating the pg on the replica
2517 // based on whether our info for that peer was dne() *before*
2518 // updating pi.history in the backfill block above.
2519 if (m && needs_past_intervals)
2520 m->past_intervals = past_intervals;
2521
2522 // update local version of peer's missing list!
2523 if (m && pi.last_backfill != hobject_t()) {
2524 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2525 p != m->log.log.end();
2526 ++p) {
2527 if (p->soid <= pi.last_backfill &&
2528 !p->is_error()) {
2529 if (perform_deletes_during_peering() && p->is_delete()) {
2530 pm.rm(p->soid, p->version);
2531 } else {
2532 pm.add_next_event(*p);
2533 }
2534 }
2535 }
2536 }
2537
2538 if (m) {
2539 dout(10) << "activate peer osd." << peer << " sending " << m->log
2540 << dendl;
2541 m->lease = get_lease();
2542 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2543 }
2544
2545 // peer now has
2546 pi.last_update = info.last_update;
2547
2548 // update our missing
2549 if (pm.num_missing() == 0) {
2550 pi.last_complete = pi.last_update;
2551 psdout(10) << "activate peer osd." << peer << " " << pi
2552 << " uptodate" << dendl;
2553 } else {
2554 psdout(10) << "activate peer osd." << peer << " " << pi
2555 << " missing " << pm << dendl;
2556 }
2557 }
2558
2559 // Set up missing_loc
2560 set<pg_shard_t> complete_shards;
2561 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2562 i != acting_recovery_backfill.end();
2563 ++i) {
2564 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2565 << " " << dendl;
2566 if (*i == get_primary()) {
2567 missing_loc.add_active_missing(missing);
2568 if (!missing.have_missing())
2569 complete_shards.insert(*i);
2570 } else {
2571 auto peer_missing_entry = peer_missing.find(*i);
2572 ceph_assert(peer_missing_entry != peer_missing.end());
2573 missing_loc.add_active_missing(peer_missing_entry->second);
2574 if (!peer_missing_entry->second.have_missing() &&
2575 peer_info[*i].last_backfill.is_max())
2576 complete_shards.insert(*i);
2577 }
2578 }
2579
2580 // If necessary, create might_have_unfound to help us find our unfound objects.
2581 // NOTE: It's important that we build might_have_unfound before trimming the
2582 // past intervals.
2583 might_have_unfound.clear();
2584 if (needs_recovery()) {
2585 // If only one shard has missing objects, we add all the others as recovery
2586 // sources. This is considered safe since the PGLogs have been merged
2587 // locally, and it covers the vast majority of use cases, such as one
2588 // OSD/host being down for a while for hardware repair.
2589 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2590 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2591 } else {
2592 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2593 ctx.handle);
2594 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2595 i != acting_recovery_backfill.end();
2596 ++i) {
2597 if (*i == pg_whoami) continue;
2598 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2599 ceph_assert(peer_missing.count(*i));
2600 ceph_assert(peer_info.count(*i));
2601 missing_loc.add_source_info(
2602 *i,
2603 peer_info[*i],
2604 peer_missing[*i],
2605 ctx.handle);
2606 }
2607 }
2608 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2609 i != peer_missing.end();
2610 ++i) {
2611 if (is_acting_recovery_backfill(i->first))
2612 continue;
2613 ceph_assert(peer_info.count(i->first));
2614 search_for_missing(
2615 peer_info[i->first],
2616 i->second,
2617 i->first,
2618 ctx);
2619 }
2620
2621 build_might_have_unfound();
2622
2623 // Always call now so update_calc_stats() will be accurate
2624 discover_all_missing(ctx.msgs);
2625
2626 }
2627
2628 // num_objects_degraded, if calculated, should reflect this too, unless
2629 // nothing is missing and we are about to go clean.
2630 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2631 state_set(PG_STATE_UNDERSIZED);
2632 }
2633
2634 state_set(PG_STATE_ACTIVATING);
2635 pl->on_activate(std::move(to_trim));
2636 }
2637 if (acting.size() >= pool.info.min_size) {
2638 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2639 pg_log.roll_forward(rollbacker.get());
2640 }
2641 }
2642
2643 void PeeringState::share_pg_info()
2644 {
2645 psdout(10) << "share_pg_info" << dendl;
2646
2647 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2648 prior_readable_until_ub);
2649
2650 // share new pg_info_t with replicas
2651 ceph_assert(!acting_recovery_backfill.empty());
2652 for (auto pg_shard : acting_recovery_backfill) {
2653 if (pg_shard == pg_whoami) continue;
2654 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2655 peer->second.last_epoch_started = info.last_epoch_started;
2656 peer->second.last_interval_started = info.last_interval_started;
2657 peer->second.history.merge(info.history);
2658 }
2659 Message* m = nullptr;
2660 if (last_require_osd_release >= ceph_release_t::octopus) {
2661 m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
2662 info,
2663 get_osdmap_epoch(),
2664 get_osdmap_epoch(),
2665 get_lease(), {}};
2666 } else {
2667 m = new MOSDPGInfo{get_osdmap_epoch(),
2668 {pg_notify_t{pg_shard.shard,
2669 pg_whoami.shard,
2670 get_osdmap_epoch(),
2671 get_osdmap_epoch(),
2672 info,
2673 past_intervals}}};
2674 }
2675 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
2676 }
2677 }
2678
2679 void PeeringState::merge_log(
2680 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
2681 pg_shard_t from)
2682 {
2683 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2684 pg_log.merge_log(
2685 oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
2686 }
2687
2688 void PeeringState::rewind_divergent_log(
2689 ObjectStore::Transaction& t, eversion_t newhead)
2690 {
2691 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2692 pg_log.rewind_divergent_log(
2693 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
2694 }
2695
2696
2697 void PeeringState::proc_primary_info(
2698 ObjectStore::Transaction &t, const pg_info_t &oinfo)
2699 {
2700 ceph_assert(!is_primary());
2701
2702 update_history(oinfo.history);
2703 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
2704 info.stats.stats.sum.num_scrub_errors = 0;
2705 info.stats.stats.sum.num_shallow_scrub_errors = 0;
2706 info.stats.stats.sum.num_deep_scrub_errors = 0;
2707 dirty_info = true;
2708 }
2709
2710 if (!(info.purged_snaps == oinfo.purged_snaps)) {
2711 psdout(10) << __func__ << " updating purged_snaps to "
2712 << oinfo.purged_snaps
2713 << dendl;
2714 info.purged_snaps = oinfo.purged_snaps;
2715 dirty_info = true;
2716 dirty_big_info = true;
2717 }
2718 }
2719
2720 void PeeringState::proc_master_log(
2721 ObjectStore::Transaction& t, pg_info_t &oinfo,
2722 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
2723 {
2724 psdout(10) << "proc_master_log for osd." << from << ": "
2725 << olog << " " << omissing << dendl;
2726 ceph_assert(!is_peered() && is_primary());
2727
2728 // merge log into our own log to build master log. no need to
2729 // make any adjustments to their missing map; we are taking their
2730 // log to be authoritative (i.e., their entries are by definition
2731 // non-divergent).
2732 merge_log(t, oinfo, olog, from);
2733 peer_info[from] = oinfo;
2734 psdout(10) << " peer osd." << from << " now " << oinfo
2735 << " " << omissing << dendl;
2736 might_have_unfound.insert(from);
2737
2738 // See doc/dev/osd_internals/last_epoch_started
2739 if (oinfo.last_epoch_started > info.last_epoch_started) {
2740 info.last_epoch_started = oinfo.last_epoch_started;
2741 dirty_info = true;
2742 }
2743 if (oinfo.last_interval_started > info.last_interval_started) {
2744 info.last_interval_started = oinfo.last_interval_started;
2745 dirty_info = true;
2746 }
2747 update_history(oinfo.history);
2748 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2749 info.last_epoch_started >= info.history.last_epoch_started);
2750
2751 peer_missing[from].claim(omissing);
2752 }
2753
2754 void PeeringState::proc_replica_log(
2755 pg_info_t &oinfo,
2756 const pg_log_t &olog,
2757 pg_missing_t& omissing,
2758 pg_shard_t from)
2759 {
2760 psdout(10) << "proc_replica_log for osd." << from << ": "
2761 << oinfo << " " << olog << " " << omissing << dendl;
2762
2763 pg_log.proc_replica_log(oinfo, olog, omissing, from);
2764
2765 peer_info[from] = oinfo;
2766 psdout(10) << " peer osd." << from << " now "
2767 << oinfo << " " << omissing << dendl;
2768 might_have_unfound.insert(from);
2769
2770 for (map<hobject_t, pg_missing_item>::const_iterator i =
2771 omissing.get_items().begin();
2772 i != omissing.get_items().end();
2773 ++i) {
2774 psdout(20) << " after missing " << i->first
2775 << " need " << i->second.need
2776 << " have " << i->second.have << dendl;
2777 }
2778 peer_missing[from].claim(omissing);
2779 }
2780
2781 void PeeringState::fulfill_info(
2782 pg_shard_t from, const pg_query_t &query,
2783 pair<pg_shard_t, pg_info_t> &notify_info)
2784 {
2785 ceph_assert(from == primary);
2786 ceph_assert(query.type == pg_query_t::INFO);
2787
2788 // info
2789 psdout(10) << "sending info" << dendl;
2790 notify_info = make_pair(from, info);
2791 }
2792
2793 void PeeringState::fulfill_log(
2794 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
2795 {
2796 psdout(10) << "log request from " << from << dendl;
2797 ceph_assert(from == primary);
2798 ceph_assert(query.type != pg_query_t::INFO);
2799
2800 MOSDPGLog *mlog = new MOSDPGLog(
2801 from.shard, pg_whoami.shard,
2802 get_osdmap_epoch(),
2803 info, query_epoch);
2804 mlog->missing = pg_log.get_missing();
2805
2806 // primary -> other, when building master log
2807 if (query.type == pg_query_t::LOG) {
2808 psdout(10) << " sending info+missing+log since " << query.since
2809 << dendl;
2810 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
2811 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
2812 << query.since
2813 << " when my log.tail is " << pg_log.get_tail()
2814 << ", sending full log instead";
2815 mlog->log = pg_log.get_log(); // primary should not have requested this!!
2816 } else
2817 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
2818 }
2819 else if (query.type == pg_query_t::FULLLOG) {
2820 psdout(10) << " sending info+missing+full log" << dendl;
2821 mlog->log = pg_log.get_log();
2822 }
2823
2824 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
2825
2826 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
2827 }
2828
2829 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
2830 {
2831 if (query.query.type == pg_query_t::INFO) {
2832 pair<pg_shard_t, pg_info_t> notify_info;
2833 // note this refreshes our prior_readable_until_ub value
2834 update_history(query.query.history);
2835 fulfill_info(query.from, query.query, notify_info);
2836 rctx.send_notify(
2837 notify_info.first.osd,
2838 pg_notify_t(
2839 notify_info.first.shard, pg_whoami.shard,
2840 query.query_epoch,
2841 get_osdmap_epoch(),
2842 notify_info.second,
2843 past_intervals));
2844 } else {
2845 update_history(query.query.history);
2846 fulfill_log(query.from, query.query, query.query_epoch);
2847 }
2848 }
2849
2850 void PeeringState::try_mark_clean()
2851 {
2852 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2853 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2854 state_set(PG_STATE_CLEAN);
2855 info.history.last_epoch_clean = get_osdmap_epoch();
2856 info.history.last_interval_clean = info.history.same_interval_since;
2857 past_intervals.clear();
2858 dirty_big_info = true;
2859 dirty_info = true;
2860 }
2861
2862 if (!is_active() && is_peered()) {
2863 if (is_clean()) {
2864 bool target;
2865 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2866 if (target) {
2867 psdout(10) << "ready to merge (target)" << dendl;
2868 pl->set_ready_to_merge_target(
2869 info.last_update,
2870 info.history.last_epoch_started,
2871 info.history.last_epoch_clean);
2872 } else {
2873 psdout(10) << "ready to merge (source)" << dendl;
2874 pl->set_ready_to_merge_source(info.last_update);
2875 }
2876 }
2877 } else {
2878 psdout(10) << "not clean, not ready to merge" << dendl;
2879 // we should have notified OSD in Active state entry point
2880 }
2881 }
2882
2883 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2884
2885 share_pg_info();
2886 pl->publish_stats_to_osd();
2887 clear_recovery_state();
2888 }
2889
2890 void PeeringState::split_into(
2891 pg_t child_pgid, PeeringState *child, unsigned split_bits)
2892 {
2893 child->update_osdmap_ref(get_osdmap());
2894 child->pool = pool;
2895
2896 // Log
2897 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2898 child->info.last_complete = info.last_complete;
2899
2900 info.last_update = pg_log.get_head();
2901 child->info.last_update = child->pg_log.get_head();
2902
2903 child->info.last_user_version = info.last_user_version;
2904
2905 info.log_tail = pg_log.get_tail();
2906 child->info.log_tail = child->pg_log.get_tail();
2907
2908 // reset last_complete, we might have modified pg_log & missing above
2909 pg_log.reset_complete_to(&info);
2910 child->pg_log.reset_complete_to(&child->info);
2911
2912 // Info
2913 child->info.history = info.history;
2914 child->info.history.epoch_created = get_osdmap_epoch();
2915 child->info.purged_snaps = info.purged_snaps;
2916
2917 if (info.last_backfill.is_max()) {
2918 child->info.set_last_backfill(hobject_t::get_max());
2919 } else {
2920 // restart backfill on parent and child to be safe. we could
2921 // probably do better in the bitwise sort case, but it's more
2922 // fragile (there may be special work to do on backfill completion
2923 // in the future).
2924 info.set_last_backfill(hobject_t());
2925 child->info.set_last_backfill(hobject_t());
2926 // restarting backfill implies that the missing set is empty,
2927 // since it is only used for objects prior to last_backfill
2928 pg_log.reset_backfill();
2929 child->pg_log.reset_backfill();
2930 }
2931
2932 child->info.stats = info.stats;
2933 child->info.stats.parent_split_bits = split_bits;
2934 info.stats.stats_invalid = true;
2935 child->info.stats.stats_invalid = true;
2936 child->info.last_epoch_started = info.last_epoch_started;
2937 child->info.last_interval_started = info.last_interval_started;
2938
2939 // There can't be recovery/backfill going on now
2940 int primary, up_primary;
2941 vector<int> newup, newacting;
2942 get_osdmap()->pg_to_up_acting_osds(
2943 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2944 child->init_primary_up_acting(
2945 newup,
2946 newacting,
2947 up_primary,
2948 primary);
2949 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
2950
2951 // this comparison includes primary rank via pg_shard_t
2952 if (get_primary() != child->get_primary())
2953 child->info.history.same_primary_since = get_osdmap_epoch();
2954
2955 child->info.stats.up = up;
2956 child->info.stats.up_primary = up_primary;
2957 child->info.stats.acting = acting;
2958 child->info.stats.acting_primary = primary;
2959 child->info.stats.mapping_epoch = get_osdmap_epoch();
2960
2961 // History
2962 child->past_intervals = past_intervals;
2963
2964 child->on_new_interval();
2965
2966 child->send_notify = !child->is_primary();
2967
2968 child->dirty_info = true;
2969 child->dirty_big_info = true;
2970 dirty_info = true;
2971 dirty_big_info = true;
2972 }
2973
2974 void PeeringState::merge_from(
2975 map<spg_t,PeeringState *>& sources,
2976 PeeringCtx &rctx,
2977 unsigned split_bits,
2978 const pg_merge_meta_t& last_pg_merge_meta)
2979 {
2980 bool incomplete = false;
2981 if (info.last_complete != info.last_update ||
2982 info.is_incomplete() ||
2983 info.dne()) {
2984 psdout(10) << __func__ << " target incomplete" << dendl;
2985 incomplete = true;
2986 }
2987 if (last_pg_merge_meta.source_pgid != pg_t()) {
2988 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2989 psdout(10) << __func__ << " target doesn't match expected parent "
2990 << last_pg_merge_meta.source_pgid.get_parent()
2991 << " of source_pgid " << last_pg_merge_meta.source_pgid
2992 << dendl;
2993 incomplete = true;
2994 }
2995 if (info.last_update != last_pg_merge_meta.target_version) {
2996 psdout(10) << __func__ << " target version doesn't match expected "
2997 << last_pg_merge_meta.target_version << dendl;
2998 incomplete = true;
2999 }
3000 }
3001
3002 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3003 pg_log.roll_forward(handler.get());
3004
3005 info.last_complete = info.last_update; // to fake out trim()
3006 pg_log.reset_recovery_pointers();
3007 pg_log.trim(info.last_update, info);
3008
3009 vector<PGLog*> log_from;
3010 for (auto& i : sources) {
3011 auto& source = i.second;
3012 if (!source) {
3013 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3014 incomplete = true;
3015 continue;
3016 }
3017 if (source->info.last_complete != source->info.last_update ||
3018 source->info.is_incomplete() ||
3019 source->info.dne()) {
3020 psdout(10) << __func__ << " source " << source->pg_whoami
3021 << " incomplete"
3022 << dendl;
3023 incomplete = true;
3024 }
3025 if (last_pg_merge_meta.source_pgid != pg_t()) {
3026 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3027 dout(10) << __func__ << " source " << source->info.pgid.pgid
3028 << " doesn't match expected source pgid "
3029 << last_pg_merge_meta.source_pgid << dendl;
3030 incomplete = true;
3031 }
3032 if (source->info.last_update != last_pg_merge_meta.source_version) {
3033 dout(10) << __func__ << " source version doesn't match expected "
3034 << last_pg_merge_meta.source_version << dendl;
3035 incomplete = true;
3036 }
3037 }
3038
3039 // prepare log
3040 PGLog::LogEntryHandlerRef handler{
3041 source->pl->get_log_handler(rctx.transaction)};
3042 source->pg_log.roll_forward(handler.get());
3043 source->info.last_complete = source->info.last_update; // to fake out trim()
3044 source->pg_log.reset_recovery_pointers();
3045 source->pg_log.trim(source->info.last_update, source->info);
3046 log_from.push_back(&source->pg_log);
3047
3048 // combine stats
3049 info.stats.add(source->info.stats);
3050
3051 // pull up last_update
3052 info.last_update = std::max(info.last_update, source->info.last_update);
3053
3054 // adopt source's PastIntervals if target has none. we can do this since
3055 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3056 // the PGs are identical.
3057 if (past_intervals.empty() && !source->past_intervals.empty()) {
3058 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3059 past_intervals = source->past_intervals;
3060 }
3061 }
3062
3063 info.last_complete = info.last_update;
3064 info.log_tail = info.last_update;
3065 if (incomplete) {
3066 info.last_backfill = hobject_t();
3067 }
3068
3069 // merge logs
3070 pg_log.merge_from(log_from, info.last_update);
3071
3072 // make sure we have a meaningful last_epoch_started/clean (if we were a
3073 // placeholder)
3074 if (info.history.epoch_created == 0) {
3075 // start with (a) source's history, since these PGs *should* have been
3076 // remapped in concert with each other...
3077 info.history = sources.begin()->second->info.history;
3078
3079 // we use the last_epoch_{started,clean} we got from
3080 // the caller, which are the epochs that were reported when the PGs
3081 // were found to be ready for merge.
3082 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3083 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3084 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3085 psdout(10) << __func__
3086 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3087 << last_pg_merge_meta.last_epoch_clean
3088 << " from pool last_dec_*, source pg history was "
3089 << sources.begin()->second->info.history
3090 << dendl;
3091
3092 // above we pulled down the source's history, so we need to check
3093 // history.epoch_created again to confirm that the source is not a
3094 // placeholder too. (peering requires a sane history.same_interval_since
3095 // value for any non-newly-created pg, and below here we are basically
3096 // iterating back through a series of past maps to fake a merge process,
3097 // hence we need to fix history.same_interval_since first so that
3098 // start_peering_interval() will not complain)
3099 if (info.history.epoch_created == 0) {
3100 dout(10) << __func__ << " both merge target and source are placeholders,"
3101 << " set sis to lec " << info.history.last_epoch_clean
3102 << dendl;
3103 info.history.same_interval_since = info.history.last_epoch_clean;
3104 }
3105
3106 // if the past_intervals start is later than last_epoch_clean, it
3107 // implies the source re-peered again but the target didn't, or
3108 // that the source became clean in a later epoch than the target.
3109 // avoid the discrepancy by adjusting the interval start
3110 // backwards to match so that check_past_interval_bounds() will
3111 // not complain.
3112 auto pib = past_intervals.get_bounds();
3113 if (info.history.last_epoch_clean < pib.first) {
3114 psdout(10) << __func__ << " last_epoch_clean "
3115 << info.history.last_epoch_clean << " < past_interval start "
3116 << pib.first << ", adjusting start backwards" << dendl;
3117 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3118 }
3119
3120 // Similarly, if the same_interval_since value is later than
3121 // last_epoch_clean, the next interval change will result in a
3122 // past_interval start that is later than last_epoch_clean. This
3123 // can happen if we use the pg_history values from the merge
3124 // source. Adjust the same_interval_since value backwards if that
3125 // happens. (We trust the les and lec values more because they came from
3126 // the real target, whereas the history value we stole from the source.)
3127 if (info.history.last_epoch_started < info.history.same_interval_since) {
3128 psdout(10) << __func__ << " last_epoch_started "
3129 << info.history.last_epoch_started << " < same_interval_since "
3130 << info.history.same_interval_since
3131 << ", adjusting pg_history backwards" << dendl;
3132 info.history.same_interval_since = info.history.last_epoch_clean;
3133 // make sure same_{up,primary}_since are <= same_interval_since
3134 info.history.same_up_since = std::min(
3135 info.history.same_up_since, info.history.same_interval_since);
3136 info.history.same_primary_since = std::min(
3137 info.history.same_primary_since, info.history.same_interval_since);
3138 }
3139 }
3140
3141 dirty_info = true;
3142 dirty_big_info = true;
3143 }
3144
3145 void PeeringState::start_split_stats(
3146 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3147 {
3148 out->resize(childpgs.size() + 1);
3149 info.stats.stats.sum.split(*out);
3150 }
3151
3152 void PeeringState::finish_split_stats(
3153 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3154 {
3155 info.stats.stats.sum = stats;
3156 write_if_dirty(t);
3157 }
3158
3159 void PeeringState::update_blocked_by()
3160 {
3161 // set a max on the number of blocking peers we report. if we go
3162 // over, report a random subset. keep the result sorted.
3163 unsigned keep = std::min<unsigned>(
3164 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3165 unsigned skip = blocked_by.size() - keep;
3166 info.stats.blocked_by.clear();
3167 info.stats.blocked_by.resize(keep);
3168 unsigned pos = 0;
3169 for (set<int>::iterator p = blocked_by.begin();
3170 p != blocked_by.end() && keep > 0;
3171 ++p) {
3172 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3173 --skip;
3174 } else {
3175 info.stats.blocked_by[pos++] = *p;
3176 --keep;
3177 }
3178 }
3179 }
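/* Editor's sketch (illustrative only, not part of the build): the loop
 * above is selection sampling -- walk the ordered set once and keep each
 * element with probability keep/(skip+keep), yielding a uniformly random,
 * still-sorted subset of the requested size. The same idea on a plain
 * vector, with hypothetical names:
 *
 *   #include <cstdlib>
 *   #include <vector>
 *   std::vector<int> sample_sorted(const std::vector<int> &in, unsigned keep)
 *   {
 *     unsigned skip = in.size() > keep ? in.size() - keep : 0;
 *     std::vector<int> out;
 *     for (int v : in) {
 *       if (keep == 0)
 *         break;
 *       if (skip > 0 && (rand() % (skip + keep) < skip)) {
 *         --skip;           // v not chosen; one fewer element to skip
 *       } else {
 *         out.push_back(v); // v chosen; result stays in input order
 *         --keep;
 *       }
 *     }
 *     return out;
 *   }
 */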
3180
3181 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3182 {
3183 for (auto &p : pgs)
3184 if (p.shard == shard)
3185 return true;
3186 return false;
3187 }
3188
3189 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3190 {
3191 for (auto &p : pgs) {
3192 if (p == skip)
3193 continue;
3194 if (p.shard == shard)
3195 return p;
3196 }
3197 return pg_shard_t();
3198 }
3199
3200 void PeeringState::update_calc_stats()
3201 {
3202 info.stats.version = info.last_update;
3203 info.stats.created = info.history.epoch_created;
3204 info.stats.last_scrub = info.history.last_scrub;
3205 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3206 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3207 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3208 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3209 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3210
3211 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3212 info.stats.ondisk_log_size = info.stats.log_size;
3213 info.stats.log_start = pg_log.get_tail();
3214 info.stats.ondisk_log_start = pg_log.get_tail();
3215 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3216
3217 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3218
3219 // In the rare case that upset is too large (usually transient), use it
3220 // as the target for the calculations below.
3221 unsigned target = std::max(num_shards, (unsigned)upset.size());
3222 // When undersized, actingset may be larger than upset, with OSDs out
3223 unsigned nrep = std::max(actingset.size(), upset.size());
3224 // calc num_object_copies
3225 info.stats.stats.calc_copies(std::max(target, nrep));
3226 info.stats.stats.sum.num_objects_degraded = 0;
3227 info.stats.stats.sum.num_objects_unfound = 0;
3228 info.stats.stats.sum.num_objects_misplaced = 0;
3229 info.stats.avail_no_missing.clear();
3230 info.stats.object_location_counts.clear();
3231
3232 // We should never hit this condition, but if we end up hitting it,
3233 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3234 if (info.stats.stats.sum.num_objects < 0) {
3235 psdout(0) << __func__ << " negative num_objects = "
3236 << info.stats.stats.sum.num_objects << " setting it to 0 "
3237 << dendl;
3238 info.stats.stats.sum.num_objects = 0;
3239 state_set(PG_STATE_INCONSISTENT);
3240 }
3241
3242 if ((is_remapped() || is_undersized() || !is_clean()) &&
3243 (is_peered()|| is_activating())) {
3244 psdout(20) << __func__ << " actingset " << actingset << " upset "
3245 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3246
3247 ceph_assert(!acting_recovery_backfill.empty());
3248
3249 bool estimate = false;
3250
3251 // NOTE: we only generate degraded, misplaced and unfound
3252 // values for the summation, not individual stat categories.
3253 int64_t num_objects = info.stats.stats.sum.num_objects;
3254
3255 // Objects missing from up nodes, sorted by # objects.
3256 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3257 // Objects missing from nodes not in up, sort by # objects
3258 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3259
3260 // Fill missing_target_objects/acting_source_objects
3261
3262 {
3263 int64_t missing;
3264
3265 // Primary first
3266 missing = pg_log.get_missing().num_missing();
3267 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3268 if (upset.count(pg_whoami)) {
3269 missing_target_objects.emplace(missing, pg_whoami);
3270 } else {
3271 acting_source_objects.emplace(missing, pg_whoami);
3272 }
3273 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3274 if (missing == 0)
3275 info.stats.avail_no_missing.push_back(pg_whoami);
3276 psdout(20) << __func__ << " shard " << pg_whoami
3277 << " primary objects " << num_objects
3278 << " missing " << missing
3279 << dendl;
3280 }
3281
3282 // All other peers
3283 for (auto& peer : peer_info) {
3284 // Primary should not be in the peer_info, skip if it is.
3285 if (peer.first == pg_whoami) continue;
3286 int64_t missing = 0;
3287 int64_t peer_num_objects =
3288 std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3289 // Backfill targets always track num_objects accurately;
3290 // all other peers track missing accurately.
3291 if (is_backfill_target(peer.first)) {
3292 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3293 } else {
3294 if (peer_missing.count(peer.first)) {
3295 missing = peer_missing[peer.first].num_missing();
3296 } else {
3297 psdout(20) << __func__ << " no peer_missing found for "
3298 << peer.first << dendl;
3299 if (is_recovering()) {
3300 estimate = true;
3301 }
3302 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3303 }
3304 }
3305 if (upset.count(peer.first)) {
3306 missing_target_objects.emplace(missing, peer.first);
3307 } else if (actingset.count(peer.first)) {
3308 acting_source_objects.emplace(missing, peer.first);
3309 }
3310 peer.second.stats.stats.sum.num_objects_missing = missing;
3311 if (missing == 0)
3312 info.stats.avail_no_missing.push_back(peer.first);
3313 psdout(20) << __func__ << " shard " << peer.first
3314 << " objects " << peer_num_objects
3315 << " missing " << missing
3316 << dendl;
3317 }
3318
3319 // Compute object_location_counts
3320 for (auto& ml: missing_loc.get_missing_locs()) {
3321 info.stats.object_location_counts[ml.second]++;
3322 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3323 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3324 << dendl;
3325 }
3326 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3327 if (not_missing) {
3328 // During recovery we know upset == actingset and that it is being populated.
3329 // During backfill we know that all non-missing objects are in the actingset.
3330 info.stats.object_location_counts[actingset] = not_missing;
3331 }
3332 psdout(30) << __func__ << " object_location_counts["
3333 << upset << "]=" << info.stats.object_location_counts[upset]
3334 << dendl;
3335 psdout(20) << __func__ << " object_location_counts "
3336 << info.stats.object_location_counts << dendl;
3337
3338 // A misplaced object is not stored on the correct OSD
3339 int64_t misplaced = 0;
3340 // A degraded object has fewer replicas or EC shards than the pool specifies.
3341 int64_t degraded = 0;
3342
3343 if (is_recovering()) {
3344 for (auto& sml: missing_loc.get_missing_by_count()) {
3345 for (auto& ml: sml.second) {
3346 int missing_shards;
3347 if (sml.first == shard_id_t::NO_SHARD) {
3348 psdout(20) << __func__ << " ml " << ml.second
3349 << " upset size " << upset.size()
3350 << " up " << ml.first.up << dendl;
3351 missing_shards = (int)upset.size() - ml.first.up;
3352 } else {
3353 // Handle shards not even in upset below
3354 if (!find_shard(upset, sml.first))
3355 continue;
3356 missing_shards = std::max(0, 1 - ml.first.up);
3357 psdout(20) << __func__
3358 << " shard " << sml.first
3359 << " ml " << ml.second
3360 << " missing shards " << missing_shards << dendl;
3361 }
3362 int odegraded = ml.second * missing_shards;
3363 // Copies on other osds, but limited to the possible degraded count
3364 int more_osds = std::min(missing_shards, ml.first.other);
3365 int omisplaced = ml.second * more_osds;
3366 ceph_assert(omisplaced <= odegraded);
3367 odegraded -= omisplaced;
3368
3369 misplaced += omisplaced;
3370 degraded += odegraded;
3371 }
3372 }
3373
3374 psdout(20) << __func__ << " missing based degraded "
3375 << degraded << dendl;
3376 psdout(20) << __func__ << " missing based misplaced "
3377 << misplaced << dendl;
3378
3379 // Handle undersized case
3380 if (pool.info.is_replicated()) {
3381 // Add degraded for missing targets (num_objects missing)
3382 ceph_assert(target >= upset.size());
3383 unsigned needed = target - upset.size();
3384 degraded += num_objects * needed;
3385 } else {
3386 for (unsigned i = 0 ; i < num_shards; ++i) {
3387 shard_id_t shard(i);
3388
3389 if (!find_shard(upset, shard)) {
3390 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3391
3392 if (pgs != pg_shard_t()) {
3393 int64_t missing;
3394
3395 if (pgs == pg_whoami)
3396 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3397 else
3398 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3399
3400 degraded += missing;
3401 misplaced += std::max((int64_t)0, num_objects - missing);
3402 } else {
3403 // No shard anywhere
3404 degraded += num_objects;
3405 }
3406 }
3407 }
3408 }
3409 goto out;
3410 }
3411
3412 // Handle undersized case
3413 if (pool.info.is_replicated()) {
3414 // Add to missing_target_objects
3415 ceph_assert(target >= missing_target_objects.size());
3416 unsigned needed = target - missing_target_objects.size();
3417 if (needed)
3418 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3419 } else {
3420 for (unsigned i = 0 ; i < num_shards; ++i) {
3421 shard_id_t shard(i);
3422 bool found = false;
3423 for (const auto& t : missing_target_objects) {
3424 if (std::get<1>(t).shard == shard) {
3425 found = true;
3426 break;
3427 }
3428 }
3429 if (!found)
3430 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3431 }
3432 }
3433
3434 for (const auto& item : missing_target_objects)
3435 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3436 << " missing= " << std::get<0>(item) << dendl;
3437 for (const auto& item : acting_source_objects)
3438 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3439 << " missing= " << std::get<0>(item) << dendl;
3440
3441 // Handle all objects not in missing, for the remapped
3442 // or backfill cases
3443 for (auto m = missing_target_objects.rbegin();
3444 m != missing_target_objects.rend(); ++m) {
3445
3446 int64_t extra_missing = -1;
3447
3448 if (pool.info.is_replicated()) {
3449 if (!acting_source_objects.empty()) {
3450 auto extra_copy = acting_source_objects.begin();
3451 extra_missing = std::get<0>(*extra_copy);
3452 acting_source_objects.erase(extra_copy);
3453 }
3454 } else { // Erasure coded
3455 // Use corresponding shard
3456 for (const auto& a : acting_source_objects) {
3457 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3458 extra_missing = std::get<0>(a);
3459 acting_source_objects.erase(a);
3460 break;
3461 }
3462 }
3463 }
3464
3465 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3466 // We don't know which of the objects on the target
3467 // are part of extra_missing, so assume they are all degraded.
3468 misplaced += std::get<0>(*m) - extra_missing;
3469 degraded += extra_missing;
3470 } else {
3471 // 1. extra_missing == -1: more targets than sources, so degraded
3472 // 2. extra_missing > std::get<0>(*m): some objects that were previously
3473 // degraded are now present on the target.
3474 degraded += std::get<0>(*m);
3475 }
3476 }
3477 // If there are still acting shards that haven't been accounted for,
3478 // then they are misplaced
3479 for (const auto& a : acting_source_objects) {
3480 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3481 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3482 << dendl;
3483 misplaced += extra_misplaced;
3484 }
3485 out:
3486 // NOTE: Tests use these messages to verify this code
3487 psdout(20) << __func__ << " degraded " << degraded
3488 << (estimate ? " (est)": "") << dendl;
3489 psdout(20) << __func__ << " misplaced " << misplaced
3490 << (estimate ? " (est)": "")<< dendl;
3491
3492 info.stats.stats.sum.num_objects_degraded = degraded;
3493 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3494 info.stats.stats.sum.num_objects_misplaced = misplaced;
3495 }
3496 }
3497
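// Worked example (hypothetical numbers, not part of the build) of the
// recovering-case arithmetic in update_calc_stats above: objects in one
// missing bucket are counted degraded once per shard absent from the up
// set, except that copies held on osds outside the up set can be moved
// into place and count as misplaced instead.
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t objects = 4;      // ml.second: objects sharing this missing pattern
  int missing_shards = 2;   // shards absent from the up set
  int other = 1;            // copies available on osds outside the up set
  int64_t odegraded = objects * missing_shards;     // 8
  int more_osds = std::min(missing_shards, other);  // 1
  int64_t omisplaced = objects * more_osds;         // 4
  odegraded -= omisplaced;                          // 4 remain degraded
  assert(omisplaced == 4 && odegraded == 4);
  return 0;
}
#endif
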
3498 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3499 bool pg_stats_publish_valid,
3500 const pg_stat_t &pg_stats_publish,
3501 const object_stat_collection_t &unstable_stats)
3502 {
3503 if (info.stats.stats.sum.num_scrub_errors) {
3504 state_set(PG_STATE_INCONSISTENT);
3505 } else {
3506 state_clear(PG_STATE_INCONSISTENT);
3507 state_clear(PG_STATE_FAILED_REPAIR);
3508 }
3509
3510 utime_t now = ceph_clock_now();
3511 if (info.stats.state != state) {
3512 info.stats.last_change = now;
3513 // Optimistic estimation: if we just found the PG to be inactive,
3514 // assume it was active until now.
3515 if (!(state & PG_STATE_ACTIVE) &&
3516 (info.stats.state & PG_STATE_ACTIVE))
3517 info.stats.last_active = now;
3518
3519 if ((state & PG_STATE_ACTIVE) &&
3520 !(info.stats.state & PG_STATE_ACTIVE))
3521 info.stats.last_became_active = now;
3522 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3523 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3524 info.stats.last_became_peered = now;
3525 info.stats.state = state;
3526 }
3527
3528 update_calc_stats();
3529 if (info.stats.stats.sum.num_objects_degraded) {
3530 state_set(PG_STATE_DEGRADED);
3531 } else {
3532 state_clear(PG_STATE_DEGRADED);
3533 }
3534 update_blocked_by();
3535
3536 pg_stat_t pre_publish = info.stats;
3537 pre_publish.stats.add(unstable_stats);
3538 utime_t cutoff = now;
3539 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3540
3541 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3542 // because we don't want to make the pg_stat_t structures too expensive.
3543 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3544 unsigned num = 0;
3545 auto i = info.purged_snaps.begin();
3546 while (num < max && i != info.purged_snaps.end()) {
3547 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3548 ++num;
3549 ++i;
3550 }
3551 psdout(20) << __func__ << " reporting purged_snaps "
3552 << pre_publish.purged_snaps << dendl;
3553
3554 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3555 info.stats.last_fresh > cutoff) {
3556 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3557 << ": no change since " << info.stats.last_fresh << dendl;
3558 return std::nullopt;
3559 } else {
3560 // update our stat summary and timestamps
3561 info.stats.reported_epoch = get_osdmap_epoch();
3562 ++info.stats.reported_seq;
3563
3564 info.stats.last_fresh = now;
3565
3566 if (info.stats.state & PG_STATE_CLEAN)
3567 info.stats.last_clean = now;
3568 if (info.stats.state & PG_STATE_ACTIVE)
3569 info.stats.last_active = now;
3570 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3571 info.stats.last_peered = now;
3572 info.stats.last_unstale = now;
3573 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3574 info.stats.last_undegraded = now;
3575 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3576 info.stats.last_fullsized = now;
3577
3578 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3579 << ":" << pg_stats_publish.reported_seq << dendl;
3580 return std::make_optional(std::move(pre_publish));
3581 }
3582 }
3583
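// Minimal sketch (simplified stand-in types, not part of the build) of the
// publish throttle in prepare_stats_for_publish above: a report is skipped
// only when the previous publication is still valid, nothing has changed,
// and last_fresh is newer than now - osd_pg_stat_report_interval_max.
#if 0
#include <optional>

struct SimpleStats {
  unsigned reported_seq = 0;
  bool operator==(const SimpleStats &o) const {
    return reported_seq == o.reported_seq;
  }
};

std::optional<SimpleStats> maybe_publish(
  bool publish_valid, const SimpleStats &published, SimpleStats current,
  double last_fresh, double now, double interval_max)
{
  double cutoff = now - interval_max;
  if (publish_valid && current == published && last_fresh > cutoff)
    return std::nullopt;      // unchanged and still fresh: skip the report
  ++current.reported_seq;     // otherwise bump the sequence and publish
  return current;
}
#endif
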
3584 void PeeringState::init(
3585 int role,
3586 const vector<int>& newup, int new_up_primary,
3587 const vector<int>& newacting, int new_acting_primary,
3588 const pg_history_t& history,
3589 const PastIntervals& pi,
3590 bool backfill,
3591 ObjectStore::Transaction &t)
3592 {
3593 psdout(10) << "init role " << role << " up "
3594 << newup << " acting " << newacting
3595 << " history " << history
3596 << " past_intervals " << pi
3597 << dendl;
3598
3599 set_role(role);
3600 init_primary_up_acting(
3601 newup,
3602 newacting,
3603 new_up_primary,
3604 new_acting_primary);
3605
3606 info.history = history;
3607 past_intervals = pi;
3608
3609 info.stats.up = up;
3610 info.stats.up_primary = new_up_primary;
3611 info.stats.acting = acting;
3612 info.stats.acting_primary = new_acting_primary;
3613 info.stats.mapping_epoch = info.history.same_interval_since;
3614
3615 if (!perform_deletes_during_peering()) {
3616 pg_log.set_missing_may_contain_deletes();
3617 }
3618
3619 if (backfill) {
3620 psdout(10) << __func__ << ": Setting backfill" << dendl;
3621 info.set_last_backfill(hobject_t());
3622 info.last_complete = info.last_update;
3623 pg_log.mark_log_for_rewrite();
3624 }
3625
3626 on_new_interval();
3627
3628 dirty_info = true;
3629 dirty_big_info = true;
3630 write_if_dirty(t);
3631 }
3632
3633 void PeeringState::dump_peering_state(Formatter *f)
3634 {
3635 f->dump_string("state", get_pg_state_string());
3636 f->dump_unsigned("epoch", get_osdmap_epoch());
3637 f->open_array_section("up");
3638 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3639 f->dump_unsigned("osd", *p);
3640 f->close_section();
3641 f->open_array_section("acting");
3642 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3643 f->dump_unsigned("osd", *p);
3644 f->close_section();
3645 if (!backfill_targets.empty()) {
3646 f->open_array_section("backfill_targets");
3647 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
3648 p != backfill_targets.end();
3649 ++p)
3650 f->dump_stream("shard") << *p;
3651 f->close_section();
3652 }
3653 if (!async_recovery_targets.empty()) {
3654 f->open_array_section("async_recovery_targets");
3655 for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
3656 p != async_recovery_targets.end();
3657 ++p)
3658 f->dump_stream("shard") << *p;
3659 f->close_section();
3660 }
3661 if (!acting_recovery_backfill.empty()) {
3662 f->open_array_section("acting_recovery_backfill");
3663 for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
3664 p != acting_recovery_backfill.end();
3665 ++p)
3666 f->dump_stream("shard") << *p;
3667 f->close_section();
3668 }
3669 f->open_object_section("info");
3670 update_calc_stats();
3671 info.dump(f);
3672 f->close_section();
3673
3674 f->open_array_section("peer_info");
3675 for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
3676 p != peer_info.end();
3677 ++p) {
3678 f->open_object_section("info");
3679 f->dump_stream("peer") << p->first;
3680 p->second.dump(f);
3681 f->close_section();
3682 }
3683 }
3684
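// For reference, a sketch of the JSON shape dump_peering_state produces via
// the Formatter calls above (field values hypothetical):
//   { "state": "active+clean", "epoch": 4213, "up": [3,1,2],
//     "acting": [3,1,2], "info": { ... }, "peer_info": [ ... ] }
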
3685 void PeeringState::update_stats(
3686 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3687 ObjectStore::Transaction *t) {
3688 if (f(info.history, info.stats)) {
3689 pl->publish_stats_to_osd();
3690 }
3691 pl->on_info_history_change();
3692
3693 if (t) {
3694 dirty_info = true;
3695 write_if_dirty(*t);
3696 }
3697 }
3698
3699 bool PeeringState::append_log_entries_update_missing(
3700 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3701 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
3702 std::optional<eversion_t> roll_forward_to)
3703 {
3704 ceph_assert(!entries.empty());
3705 ceph_assert(entries.begin()->version > info.last_update);
3706
3707 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3708 bool invalidate_stats =
3709 pg_log.append_new_log_entries(
3710 info.last_backfill,
3711 entries,
3712 rollbacker.get());
3713
3714 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
3715 pg_log.roll_forward(rollbacker.get());
3716 }
3717 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
3718 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
3719 last_rollback_info_trimmed_to_applied = *roll_forward_to;
3720 }
3721
3722 info.last_update = pg_log.get_head();
3723
3724 if (pg_log.get_missing().num_missing() == 0) {
3725 // advance last_complete since nothing else is missing!
3726 info.last_complete = info.last_update;
3727 }
3728 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
3729
3730 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
3731 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
3732 if (trim_to)
3733 pg_log.trim(*trim_to, info);
3734 dirty_info = true;
3735 write_if_dirty(t);
3736 return invalidate_stats;
3737 }
3738
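// Sketch (simplified stand-in types) of the completeness rule applied in
// append_log_entries_update_missing above: last_update always advances to
// the new log head, while last_complete may only catch up once nothing is
// missing.
#if 0
#include <cstdint>

struct simple_eversion { unsigned epoch = 0, version = 0; };

void advance_after_append(const simple_eversion &head, uint64_t num_missing,
                          simple_eversion &last_update,
                          simple_eversion &last_complete)
{
  last_update = head;             // info.last_update = pg_log.get_head()
  if (num_missing == 0)
    last_complete = last_update;  // nothing missing: fully complete
}
#endif
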
3739 void PeeringState::merge_new_log_entries(
3740 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3741 ObjectStore::Transaction &t,
3742 std::optional<eversion_t> trim_to,
3743 std::optional<eversion_t> roll_forward_to)
3744 {
3745 psdout(10) << __func__ << " " << entries << dendl;
3746 ceph_assert(is_primary());
3747
3748 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
3749 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
3750 i != acting_recovery_backfill.end();
3751 ++i) {
3752 pg_shard_t peer(*i);
3753 if (peer == pg_whoami) continue;
3754 ceph_assert(peer_missing.count(peer));
3755 ceph_assert(peer_info.count(peer));
3756 pg_missing_t& pmissing(peer_missing[peer]);
3757 psdout(20) << __func__ << " peer_missing for " << peer
3758 << " = " << pmissing << dendl;
3759 pg_info_t& pinfo(peer_info[peer]);
3760 bool invalidate_stats = PGLog::append_log_entries_update_missing(
3761 pinfo.last_backfill,
3762 entries,
3763 true,
3764 NULL,
3765 pmissing,
3766 NULL,
3767 dpp);
3768 pinfo.last_update = info.last_update;
3769 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
3770 rebuild_missing = rebuild_missing || invalidate_stats;
3771 }
3772
3773 if (!rebuild_missing) {
3774 return;
3775 }
3776
3777 for (auto &&i: entries) {
3778 missing_loc.rebuild(
3779 i.soid,
3780 pg_whoami,
3781 acting_recovery_backfill,
3782 info,
3783 pg_log.get_missing(),
3784 peer_missing,
3785 peer_info);
3786 }
3787 }
3788
3789 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
3790 {
3791 // raise last_complete only if we were previously up to date
3792 if (info.last_complete == info.last_update)
3793 info.last_complete = e.version;
3794
3795 // raise last_update.
3796 ceph_assert(e.version > info.last_update);
3797 info.last_update = e.version;
3798
3799 // raise user_version, if it increased (it may not have been bumped
3800 // by all logged updates)
3801 if (e.user_version > info.last_user_version)
3802 info.last_user_version = e.user_version;
3803
3804 // log mutation
3805 pg_log.add(e, applied);
3806 psdout(10) << "add_log_entry " << e << dendl;
3807 }
3808
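// Sketch of the invariant in add_log_entry above: last_complete only keeps
// pace with last_update while we were already fully caught up; once they
// diverge, only recovery can raise last_complete again.
#if 0
struct simple_ver { unsigned v = 0; };

void add_entry(const simple_ver &e, simple_ver &last_update,
               simple_ver &last_complete)
{
  if (last_complete.v == last_update.v)  // previously up to date
    last_complete = e;
  last_update = e;                       // the head always advances
}
#endif
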
3809
3810 void PeeringState::append_log(
3811 const vector<pg_log_entry_t>& logv,
3812 eversion_t trim_to,
3813 eversion_t roll_forward_to,
3814 eversion_t mlcod,
3815 ObjectStore::Transaction &t,
3816 bool transaction_applied,
3817 bool async)
3818 {
3819 /* The primary has sent an info updating the history, but it may not
3820 * have arrived yet. We want to make sure that we cannot remember this
3821 * write without remembering that it happened in an interval which went
3822 * active in epoch history.last_epoch_started.
3823 */
3824 if (info.last_epoch_started != info.history.last_epoch_started) {
3825 info.history.last_epoch_started = info.last_epoch_started;
3826 }
3827 if (info.last_interval_started != info.history.last_interval_started) {
3828 info.history.last_interval_started = info.last_interval_started;
3829 }
3830 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3831
3832 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3833 if (!transaction_applied) {
3834 /* We must be a backfill or async recovery peer, so it's ok if we apply
3835 * out-of-turn since we won't be considered when
3836 * determining a min possible last_update.
3837 *
3838 * We skip_rollforward() here, which advances the crt, without
3839 * doing an actual rollforward. This avoids cleaning up entries
3840 * from the backend and we do not end up in a situation, where the
3841 * object is deleted before we can _merge_object_divergent_entries().
3842 */
3843 pg_log.skip_rollforward();
3844 }
3845
3846 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3847 p != logv.end();
3848 ++p) {
3849 add_log_entry(*p, transaction_applied);
3850
3851 /* We don't want to leave the rollforward artifacts around
3852 * here past last_backfill. It's ok for the same reason as
3853 * above */
3854 if (transaction_applied &&
3855 p->soid > info.last_backfill) {
3856 pg_log.roll_forward(handler.get());
3857 }
3858 }
3859 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3860 pg_log.roll_forward_to(
3861 roll_forward_to,
3862 handler.get());
3863 last_rollback_info_trimmed_to_applied = roll_forward_to;
3864 }
3865
3866 psdout(10) << __func__ << " approx pg log length = "
3867 << pg_log.get_log().approx_size() << dendl;
3868 psdout(10) << __func__ << " transaction_applied = "
3869 << transaction_applied << dendl;
3870 if (!transaction_applied || async)
3871 psdout(10) << __func__ << " " << pg_whoami
3872 << " is async_recovery or backfill target" << dendl;
3873 pg_log.trim(trim_to, info, transaction_applied, async);
3874
3875 // update the local pg, pg log
3876 dirty_info = true;
3877 write_if_dirty(t);
3878
3879 if (!is_primary())
3880 min_last_complete_ondisk = mlcod;
3881 }
3882
3883 void PeeringState::recover_got(
3884 const hobject_t &oid, eversion_t v,
3885 bool is_delete,
3886 ObjectStore::Transaction &t)
3887 {
3888 if (v > pg_log.get_can_rollback_to()) {
3889 /* This can only happen during a repair, and even then, it would
3890 * be one heck of a race. If we are repairing the object, the
3891 * write in question must be fully committed, so it's not valid
3892 * to roll it back anyway (and we'll be rolled forward shortly
3893 * anyway) */
3894 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3895 pg_log.roll_forward_to(v, handler.get());
3896 }
3897
3898 psdout(10) << "got missing " << oid << " v " << v << dendl;
3899 pg_log.recover_got(oid, v, info);
3900 if (pg_log.get_log().log.empty()) {
3901 psdout(10) << "last_complete now " << info.last_complete
3902 << " while log is empty" << dendl;
3903 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
3904 psdout(10) << "last_complete now " << info.last_complete
3905 << " log.complete_to " << pg_log.get_log().complete_to->version
3906 << dendl;
3907 } else {
3908 psdout(10) << "last_complete now " << info.last_complete
3909 << " log.complete_to at end" << dendl;
3910 // below is not true in the repair case.
3911 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3912 ceph_assert(info.last_complete == info.last_update);
3913 }
3914
3915 if (is_primary()) {
3916 ceph_assert(missing_loc.needs_recovery(oid));
3917 if (!is_delete)
3918 missing_loc.add_location(oid, pg_whoami);
3919 }
3920
3921 // update pg
3922 dirty_info = true;
3923 write_if_dirty(t);
3924 }
3925
3926 void PeeringState::update_backfill_progress(
3927 const hobject_t &updated_backfill,
3928 const pg_stat_t &updated_stats,
3929 bool preserve_local_num_bytes,
3930 ObjectStore::Transaction &t) {
3931 info.set_last_backfill(updated_backfill);
3932 if (preserve_local_num_bytes) {
3933 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
3934 << " local " << info.stats.stats.sum.num_bytes << dendl;
3935 int64_t bytes = info.stats.stats.sum.num_bytes;
3936 info.stats = updated_stats;
3937 info.stats.stats.sum.num_bytes = bytes;
3938 } else {
3939 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
3940 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
3941 info.stats = updated_stats;
3942 }
3943
3944 dirty_info = true;
3945 write_if_dirty(t);
3946 }
3947
3948 void PeeringState::adjust_purged_snaps(
3949 std::function<void(interval_set<snapid_t> &snaps)> f) {
3950 f(info.purged_snaps);
3951 dirty_info = true;
3952 dirty_big_info = true;
3953 }
3954
3955 void PeeringState::on_peer_recover(
3956 pg_shard_t peer,
3957 const hobject_t &soid,
3958 const eversion_t &version)
3959 {
3960 pl->publish_stats_to_osd();
3961 // done!
3962 peer_missing[peer].got(soid, version);
3963 missing_loc.add_location(soid, peer);
3964 }
3965
3966 void PeeringState::begin_peer_recover(
3967 pg_shard_t peer,
3968 const hobject_t soid)
3969 {
3970 peer_missing[peer].revise_have(soid, eversion_t());
3971 }
3972
3973 void PeeringState::force_object_missing(
3974 const set<pg_shard_t> &peers,
3975 const hobject_t &soid,
3976 eversion_t version)
3977 {
3978 for (auto &&peer : peers) {
3979 if (peer != primary) {
3980 peer_missing[peer].add(soid, version, eversion_t(), false);
3981 } else {
3982 pg_log.missing_add(soid, version, eversion_t());
3983 pg_log.reset_complete_to(&info);
3984 pg_log.set_last_requested(0);
3985 }
3986 }
3987
3988 missing_loc.rebuild(
3989 soid,
3990 pg_whoami,
3991 acting_recovery_backfill,
3992 info,
3993 pg_log.get_missing(),
3994 peer_missing,
3995 peer_info);
3996 }
3997
3998 void PeeringState::pre_submit_op(
3999 const hobject_t &hoid,
4000 const vector<pg_log_entry_t>& logv,
4001 eversion_t at_version)
4002 {
4003 if (at_version > eversion_t()) {
4004 for (auto &&i : get_acting_recovery_backfill()) {
4005 if (i == primary) continue;
4006 pg_info_t &pinfo = peer_info[i];
4007 // keep peer_info up to date
4008 if (pinfo.last_complete == pinfo.last_update)
4009 pinfo.last_complete = at_version;
4010 pinfo.last_update = at_version;
4011 }
4012 }
4013
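// Async recovery targets are allowed to lag behind the acting set, so any
// op submitted while such a target is still missing this object also
// becomes missing on that target; pre-mark it here and refresh missing_loc
// below so reads/recovery know which shards actually hold each version.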
4014 bool requires_missing_loc = false;
4015 for (auto &&i : get_async_recovery_targets()) {
4016 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4017 continue;
4018 requires_missing_loc = true;
4019 for (auto &&entry: logv) {
4020 peer_missing[i].add_next_event(entry);
4021 }
4022 }
4023
4024 if (requires_missing_loc) {
4025 for (auto &&entry: logv) {
4026 psdout(30) << __func__ << " missing_loc before: "
4027 << missing_loc.get_locations(entry.soid) << dendl;
4028 missing_loc.add_missing(entry.soid, entry.version,
4029 eversion_t(), entry.is_delete());
4030 // clear out missing_loc
4031 missing_loc.clear_location(entry.soid);
4032 for (auto &i: get_actingset()) {
4033 if (!get_peer_missing(i).is_missing(entry.soid))
4034 missing_loc.add_location(entry.soid, i);
4035 }
4036 psdout(30) << __func__ << " missing_loc after: "
4037 << missing_loc.get_locations(entry.soid) << dendl;
4038 }
4039 }
4040 }
4041
4042 void PeeringState::recovery_committed_to(eversion_t version)
4043 {
4044 psdout(10) << __func__ << " version " << version
4045 << " now ondisk" << dendl;
4046 last_complete_ondisk = version;
4047
4048 if (last_complete_ondisk == info.last_update) {
4049 if (!is_primary()) {
4050 // Either we are a replica or a backfill target;
4051 // we are fully up to date. Tell the primary!
4052 pl->send_cluster_message(
4053 get_primary().osd,
4054 new MOSDPGTrim(
4055 get_osdmap_epoch(),
4056 spg_t(info.pgid.pgid, primary.shard),
4057 last_complete_ondisk),
4058 get_osdmap_epoch());
4059 } else {
4060 calc_min_last_complete_ondisk();
4061 }
4062 }
4063 }
4064
4065 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4066 {
4067 last_update_ondisk = v;
4068 last_complete_ondisk = lc;
4069 calc_min_last_complete_ondisk();
4070 }
4071
4072 void PeeringState::calc_trim_to()
4073 {
4074 size_t target = pl->get_target_pg_log_entries();
4075
4076 eversion_t limit = std::min(
4077 min_last_complete_ondisk,
4078 pg_log.get_can_rollback_to());
4079 if (limit != eversion_t() &&
4080 limit != pg_trim_to &&
4081 pg_log.get_log().approx_size() > target) {
4082 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4083 cct->_conf->osd_pg_log_trim_max);
4084 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4085 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4086 return;
4087 }
4088 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
4089 eversion_t new_trim_to;
4090 for (size_t i = 0; i < num_to_trim; ++i) {
4091 new_trim_to = it->version;
4092 ++it;
4093 if (new_trim_to > limit) {
4094 new_trim_to = limit;
4095 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4096 break;
4097 }
4098 }
4099 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4100 pg_trim_to = new_trim_to;
4101 ceph_assert(pg_trim_to <= pg_log.get_head());
4102 ceph_assert(pg_trim_to <= min_last_complete_ondisk);
4103 }
4104 }
4105
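// Worked example (hypothetical config values) of the sizing logic in
// calc_trim_to above: with 3100 log entries, a target of 3000,
// osd_pg_log_trim_max = 500 and osd_pg_log_trim_min = 100, num_to_trim =
// min(3100 - 3000, 500) = 100, which meets the minimum, so up to 100
// entries are trimmed -- but never past min_last_complete_ondisk or
// can_rollback_to.
#if 0
#include <algorithm>
#include <cstddef>

std::size_t entries_to_trim(std::size_t log_size, std::size_t target,
                            std::size_t trim_min, std::size_t trim_max)
{
  if (log_size <= target)
    return 0;                  // log within budget, nothing to trim
  std::size_t n = std::min(log_size - target, trim_max);
  if (n < trim_min && trim_max >= trim_min)
    return 0;                  // batch too small, wait for more entries
  return n;
}
#endif
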
4106 void PeeringState::calc_trim_to_aggressive()
4107 {
4108 size_t target = pl->get_target_pg_log_entries();
4109
4110 // limit pg log trimming up to the can_rollback_to value
4111 eversion_t limit = std::min({
4112 pg_log.get_head(),
4113 pg_log.get_can_rollback_to(),
4114 last_update_ondisk});
4115 psdout(10) << __func__ << " limit = " << limit << dendl;
4116
4117 if (limit != eversion_t() &&
4118 limit != pg_trim_to &&
4119 pg_log.get_log().approx_size() > target) {
4120 psdout(10) << __func__ << " approx pg log length = "
4121 << pg_log.get_log().approx_size() << dendl;
4122 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4123 cct->_conf->osd_pg_log_trim_max);
4124 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4125 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4126 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4127 return;
4128 }
4129 auto it = pg_log.get_log().log.begin(); // oldest log entry
4130 auto rit = pg_log.get_log().log.rbegin();
4131 eversion_t by_n_to_keep; // scanned back from the head (newest)
4132 eversion_t by_n_to_trim = eversion_t::max(); // scanned forward from the tail (oldest)
4133 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4134 i++;
4135 if (i > target && by_n_to_keep == eversion_t()) {
4136 by_n_to_keep = rit->version;
4137 }
4138 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4139 by_n_to_trim = it->version;
4140 }
4141 if (by_n_to_keep != eversion_t() &&
4142 by_n_to_trim != eversion_t::max()) {
4143 break;
4144 }
4145 }
4146
4147 if (by_n_to_keep == eversion_t()) {
4148 return;
4149 }
4150
4151 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4152 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4153 ceph_assert(pg_trim_to <= pg_log.get_head());
4154 }
4155 }
4156
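// Sketch (hypothetical versions; assumes log.size() > target and
// num_to_trim >= 1) of the two-ended scan in calc_trim_to_aggressive above:
// by_n_to_keep is found walking back from the head (keep the newest
// `target` entries), by_n_to_trim walking forward from the tail (drop the
// oldest num_to_trim), and the trim point is the minimum of both and the
// rollback/ondisk limit.
#if 0
#include <algorithm>
#include <cstddef>
#include <vector>

int aggressive_trim_point(const std::vector<int> &log,  // oldest..newest
                          std::size_t target, std::size_t num_to_trim,
                          int limit)
{
  int by_n_to_keep = log[log.size() - target - 1];  // just below newest `target`
  int by_n_to_trim = log[num_to_trim - 1];          // last of the oldest batch
  return std::min({by_n_to_keep, by_n_to_trim, limit});
}
#endif
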
4157 void PeeringState::apply_op_stats(
4158 const hobject_t &soid,
4159 const object_stat_sum_t &delta_stats)
4160 {
4161 info.stats.stats.add(delta_stats);
4162 info.stats.stats.floor(0);
4163
4164 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
4165 i != get_backfill_targets().end();
4166 ++i) {
4167 pg_shard_t bt = *i;
4168 pg_info_t& pinfo = peer_info[bt];
4169 if (soid <= pinfo.last_backfill)
4170 pinfo.stats.stats.add(delta_stats);
4171 }
4172 }
4173
4174 void PeeringState::update_complete_backfill_object_stats(
4175 const hobject_t &hoid,
4176 const pg_stat_t &stats)
4177 {
4178 for (auto &&bt: get_backfill_targets()) {
4179 pg_info_t& pinfo = peer_info[bt];
4180 // Add stats to all peers that were missing the object
4181 if (hoid > pinfo.last_backfill)
4182 pinfo.stats.add(stats);
4183 }
4184 }
4185
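// Sketch of the last_backfill split used by the two functions above: a
// backfill target's stats only cover objects at or below its last_backfill
// horizon, so op deltas are mirrored for objects it already has
// (apply_op_stats) while completed-backfill stats are added for objects
// beyond the horizon (update_complete_backfill_object_stats).
#if 0
bool target_accounts_for(const hobject_t &oid, const pg_info_t &pinfo)
{
  return oid <= pinfo.last_backfill;  // already backfilled to this target
}
#endif
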
4186 void PeeringState::update_peer_last_backfill(
4187 pg_shard_t peer,
4188 const hobject_t &new_last_backfill)
4189 {
4190 pg_info_t &pinfo = peer_info[peer];
4191 pinfo.last_backfill = new_last_backfill;
4192 if (new_last_backfill.is_max()) {
4193 /* pinfo.stats might be wrong if we did log-based recovery on the
4194 * backfilled portion in addition to continuing backfill.
4195 */
4196 pinfo.stats = info.stats;
4197 }
4198 }
4199
4200 void PeeringState::set_revert_with_targets(
4201 const hobject_t &soid,
4202 const set<pg_shard_t> &good_peers)
4203 {
4204 for (auto &&peer: good_peers) {
4205 missing_loc.add_location(soid, peer);
4206 }
4207 }
4208
4209 void PeeringState::prepare_backfill_for_missing(
4210 const hobject_t &soid,
4211 const eversion_t &version,
4212 const vector<pg_shard_t> &targets) {
4213 for (auto &&peer: targets) {
4214 peer_missing[peer].add(soid, version, eversion_t(), false);
4215 }
4216 }
4217
4218 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4219 {
4220 info.hit_set = hset_history;
4221 }
4222
4223 /*------------ Peering State Machine----------------*/
4224 #undef dout_prefix
4225 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4226 << "state<" << get_state_name() << ">: ")
4227 #undef psdout
4228 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4229
4230 #define DECLARE_LOCALS \
4231 PeeringState *ps = context< PeeringMachine >().state; \
4232 std::ignore = ps; \
4233 PeeringListener *pl = context< PeeringMachine >().pl; \
4234 std::ignore = pl
4235
4236
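// Usage sketch (hypothetical state and event names): every react handler
// below opens with DECLARE_LOCALS to pull `ps` (the shared PeeringState)
// and `pl` (the PeeringListener callbacks) out of the enclosing
// PeeringMachine context.
#if 0
boost::statechart::result PeeringState::SomeState::react(const SomeEvent &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_PEERING);  // mutate shared peering state
  pl->publish_stats_to_osd();       // call back into the owning PG/OSD
  return discard_event();
}
#endif
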
4237 /*------Crashed-------*/
4238 PeeringState::Crashed::Crashed(my_context ctx)
4239 : my_base(ctx),
4240 NamedState(context< PeeringMachine >().state_history, "Crashed")
4241 {
4242 context< PeeringMachine >().log_enter(state_name);
4243 ceph_abort_msg("we got a bad state machine event");
4244 }
4245
4246
4247 /*------Initial-------*/
4248 PeeringState::Initial::Initial(my_context ctx)
4249 : my_base(ctx),
4250 NamedState(context< PeeringMachine >().state_history, "Initial")
4251 {
4252 context< PeeringMachine >().log_enter(state_name);
4253 }
4254
4255 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4256 {
4257 DECLARE_LOCALS;
4258 ps->proc_replica_info(
4259 notify.from, notify.notify.info, notify.notify.epoch_sent);
4260 ps->set_last_peering_reset();
4261 return transit< Primary >();
4262 }
4263
4264 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4265 {
4266 DECLARE_LOCALS;
4267 ceph_assert(!ps->is_primary());
4268 post_event(i);
4269 return transit< Stray >();
4270 }
4271
4272 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4273 {
4274 DECLARE_LOCALS;
4275 ceph_assert(!ps->is_primary());
4276 post_event(i);
4277 return transit< Stray >();
4278 }
4279
4280 void PeeringState::Initial::exit()
4281 {
4282 context< PeeringMachine >().log_exit(state_name, enter_time);
4283 DECLARE_LOCALS;
4284 utime_t dur = ceph_clock_now() - enter_time;
4285 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4286 }
4287
4288 /*------Started-------*/
4289 PeeringState::Started::Started(my_context ctx)
4290 : my_base(ctx),
4291 NamedState(context< PeeringMachine >().state_history, "Started")
4292 {
4293 context< PeeringMachine >().log_enter(state_name);
4294 }
4295
4296 boost::statechart::result
4297 PeeringState::Started::react(const IntervalFlush&)
4298 {
4299 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4300 context< PeeringMachine >().state->end_block_outgoing();
4301 return discard_event();
4302 }
4303
4304 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4305 {
4306 DECLARE_LOCALS;
4307 psdout(10) << "Started advmap" << dendl;
4308 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4309 if (ps->should_restart_peering(
4310 advmap.up_primary,
4311 advmap.acting_primary,
4312 advmap.newup,
4313 advmap.newacting,
4314 advmap.lastmap,
4315 advmap.osdmap)) {
4316 psdout(10) << "should_restart_peering, transitioning to Reset"
4317 << dendl;
4318 post_event(advmap);
4319 return transit< Reset >();
4320 }
4321 ps->remove_down_peer_info(advmap.osdmap);
4322 return discard_event();
4323 }
4324
4325 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4326 {
4327 q.f->open_object_section("state");
4328 q.f->dump_string("name", state_name);
4329 q.f->dump_stream("enter_time") << enter_time;
4330 q.f->close_section();
4331 return discard_event();
4332 }
4333
4334 void PeeringState::Started::exit()
4335 {
4336 context< PeeringMachine >().log_exit(state_name, enter_time);
4337 DECLARE_LOCALS;
4338 utime_t dur = ceph_clock_now() - enter_time;
4339 pl->get_peering_perf().tinc(rs_started_latency, dur);
4340 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4341 }
4342
4343 /*--------Reset---------*/
4344 PeeringState::Reset::Reset(my_context ctx)
4345 : my_base(ctx),
4346 NamedState(context< PeeringMachine >().state_history, "Reset")
4347 {
4348 context< PeeringMachine >().log_enter(state_name);
4349 DECLARE_LOCALS;
4350
4351 ps->flushes_in_progress = 0;
4352 ps->set_last_peering_reset();
4353 ps->log_weirdness();
4354 }
4355
4356 boost::statechart::result
4357 PeeringState::Reset::react(const IntervalFlush&)
4358 {
4359 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4360 context< PeeringMachine >().state->end_block_outgoing();
4361 return discard_event();
4362 }
4363
4364 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4365 {
4366 DECLARE_LOCALS;
4367 psdout(10) << "Reset advmap" << dendl;
4368
4369 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4370
4371 if (ps->should_restart_peering(
4372 advmap.up_primary,
4373 advmap.acting_primary,
4374 advmap.newup,
4375 advmap.newacting,
4376 advmap.lastmap,
4377 advmap.osdmap)) {
4378 psdout(10) << "should restart peering, calling start_peering_interval again"
4379 << dendl;
4380 ps->start_peering_interval(
4381 advmap.lastmap,
4382 advmap.newup, advmap.up_primary,
4383 advmap.newacting, advmap.acting_primary,
4384 context< PeeringMachine >().get_cur_transaction());
4385 }
4386 ps->remove_down_peer_info(advmap.osdmap);
4387 ps->check_past_interval_bounds();
4388 return discard_event();
4389 }
4390
4391 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4392 {
4393 DECLARE_LOCALS;
4394 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4395 ps->info.history.refresh_prior_readable_until_ub(
4396 pl->get_mnow(),
4397 ps->prior_readable_until_ub);
4398 context< PeeringMachine >().send_notify(
4399 ps->get_primary().osd,
4400 pg_notify_t(
4401 ps->get_primary().shard, ps->pg_whoami.shard,
4402 ps->get_osdmap_epoch(),
4403 ps->get_osdmap_epoch(),
4404 ps->info,
4405 ps->past_intervals));
4406 }
4407
4408 ps->update_heartbeat_peers();
4409
4410 return transit< Started >();
4411 }
4412
4413 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4414 {
4415 q.f->open_object_section("state");
4416 q.f->dump_string("name", state_name);
4417 q.f->dump_stream("enter_time") << enter_time;
4418 q.f->close_section();
4419 return discard_event();
4420 }
4421
4422 void PeeringState::Reset::exit()
4423 {
4424 context< PeeringMachine >().log_exit(state_name, enter_time);
4425 DECLARE_LOCALS;
4426 utime_t dur = ceph_clock_now() - enter_time;
4427 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4428 }
4429
4430 /*-------Start---------*/
4431 PeeringState::Start::Start(my_context ctx)
4432 : my_base(ctx),
4433 NamedState(context< PeeringMachine >().state_history, "Start")
4434 {
4435 context< PeeringMachine >().log_enter(state_name);
4436
4437 DECLARE_LOCALS;
4438 if (ps->is_primary()) {
4439 psdout(1) << "transitioning to Primary" << dendl;
4440 post_event(MakePrimary());
4441 } else { //is_stray
4442 psdout(1) << "transitioning to Stray" << dendl;
4443 post_event(MakeStray());
4444 }
4445 }
4446
4447 void PeeringState::Start::exit()
4448 {
4449 context< PeeringMachine >().log_exit(state_name, enter_time);
4450 DECLARE_LOCALS;
4451 utime_t dur = ceph_clock_now() - enter_time;
4452 pl->get_peering_perf().tinc(rs_start_latency, dur);
4453 }
4454
4455 /*---------Primary--------*/
4456 PeeringState::Primary::Primary(my_context ctx)
4457 : my_base(ctx),
4458 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4459 {
4460 context< PeeringMachine >().log_enter(state_name);
4461 DECLARE_LOCALS;
4462 ceph_assert(ps->want_acting.empty());
4463
4464 // set CREATING bit until we have peered for the first time.
4465 if (ps->info.history.last_epoch_started == 0) {
4466 ps->state_set(PG_STATE_CREATING);
4467 // use the history timestamp, which ultimately comes from the
4468 // monitor in the create case.
4469 utime_t t = ps->info.history.last_scrub_stamp;
4470 ps->info.stats.last_fresh = t;
4471 ps->info.stats.last_active = t;
4472 ps->info.stats.last_change = t;
4473 ps->info.stats.last_peered = t;
4474 ps->info.stats.last_clean = t;
4475 ps->info.stats.last_unstale = t;
4476 ps->info.stats.last_undegraded = t;
4477 ps->info.stats.last_fullsized = t;
4478 ps->info.stats.last_scrub_stamp = t;
4479 ps->info.stats.last_deep_scrub_stamp = t;
4480 ps->info.stats.last_clean_scrub_stamp = t;
4481 }
4482 }
4483
4484 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4485 {
4486 DECLARE_LOCALS;
4487 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4488 ps->proc_replica_info(
4489 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4490 return discard_event();
4491 }
4492
4493 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4494 {
4495 DECLARE_LOCALS;
4496 psdout(7) << "handle ActMap primary" << dendl;
4497 pl->publish_stats_to_osd();
4498 return discard_event();
4499 }
4500
4501 boost::statechart::result PeeringState::Primary::react(
4502 const SetForceRecovery&)
4503 {
4504 DECLARE_LOCALS;
4505 ps->set_force_recovery(true);
4506 return discard_event();
4507 }
4508
4509 boost::statechart::result PeeringState::Primary::react(
4510 const UnsetForceRecovery&)
4511 {
4512 DECLARE_LOCALS;
4513 ps->set_force_recovery(false);
4514 return discard_event();
4515 }
4516
4517 boost::statechart::result PeeringState::Primary::react(
4518 const RequestScrub& evt)
4519 {
4520 DECLARE_LOCALS;
4521 if (ps->is_primary()) {
4522 pl->scrub_requested(evt.deep, evt.repair);
4523 psdout(10) << "marking for scrub" << dendl;
4524 }
4525 return discard_event();
4526 }
4527
4528 boost::statechart::result PeeringState::Primary::react(
4529 const SetForceBackfill&)
4530 {
4531 DECLARE_LOCALS;
4532 ps->set_force_backfill(true);
4533 return discard_event();
4534 }
4535
4536 boost::statechart::result PeeringState::Primary::react(
4537 const UnsetForceBackfill&)
4538 {
4539 DECLARE_LOCALS;
4540 ps->set_force_backfill(false);
4541 return discard_event();
4542 }
4543
4544 void PeeringState::Primary::exit()
4545 {
4546 context< PeeringMachine >().log_exit(state_name, enter_time);
4547 DECLARE_LOCALS;
4548 ps->want_acting.clear();
4549 utime_t dur = ceph_clock_now() - enter_time;
4550 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4551 pl->clear_primary_state();
4552 ps->state_clear(PG_STATE_CREATING);
4553 }
4554
4555 /*---------Peering--------*/
4556 PeeringState::Peering::Peering(my_context ctx)
4557 : my_base(ctx),
4558 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4559 history_les_bound(false)
4560 {
4561 context< PeeringMachine >().log_enter(state_name);
4562 DECLARE_LOCALS;
4563
4564 ceph_assert(!ps->is_peered());
4565 ceph_assert(!ps->is_peering());
4566 ceph_assert(ps->is_primary());
4567 ps->state_set(PG_STATE_PEERING);
4568 }
4569
4570 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4571 {
4572 DECLARE_LOCALS;
4573 psdout(10) << "Peering advmap" << dendl;
4574 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4575 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4576 post_event(advmap);
4577 return transit< Reset >();
4578 }
4579
4580 ps->adjust_need_up_thru(advmap.osdmap);
4581 ps->check_prior_readable_down_osds(advmap.osdmap);
4582
4583 return forward_event();
4584 }
4585
4586 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4587 {
4588 DECLARE_LOCALS;
4589
4590 q.f->open_object_section("state");
4591 q.f->dump_string("name", state_name);
4592 q.f->dump_stream("enter_time") << enter_time;
4593
4594 q.f->open_array_section("past_intervals");
4595 ps->past_intervals.dump(q.f);
4596 q.f->close_section();
4597
4598 q.f->open_array_section("probing_osds");
4599 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
4600 p != prior_set.probe.end();
4601 ++p)
4602 q.f->dump_stream("osd") << *p;
4603 q.f->close_section();
4604
4605 if (prior_set.pg_down)
4606 q.f->dump_string("blocked", "peering is blocked due to down osds");
4607
4608 q.f->open_array_section("down_osds_we_would_probe");
4609 for (set<int>::iterator p = prior_set.down.begin();
4610 p != prior_set.down.end();
4611 ++p)
4612 q.f->dump_int("osd", *p);
4613 q.f->close_section();
4614
4615 q.f->open_array_section("peering_blocked_by");
4616 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
4617 p != prior_set.blocked_by.end();
4618 ++p) {
4619 q.f->open_object_section("osd");
4620 q.f->dump_int("osd", p->first);
4621 q.f->dump_int("current_lost_at", p->second);
4622 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4623 q.f->close_section();
4624 }
4625 q.f->close_section();
4626
4627 if (history_les_bound) {
4628 q.f->open_array_section("peering_blocked_by_detail");
4629 q.f->open_object_section("item");
4630 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4631 q.f->close_section();
4632 q.f->close_section();
4633 }
4634
4635 q.f->close_section();
4636 return forward_event();
4637 }
4638
4639 void PeeringState::Peering::exit()
4640 {
4641
4642 DECLARE_LOCALS;
4643 psdout(10) << "Leaving Peering" << dendl;
4644 context< PeeringMachine >().log_exit(state_name, enter_time);
4645 ps->state_clear(PG_STATE_PEERING);
4646 pl->clear_probe_targets();
4647
4648 utime_t dur = ceph_clock_now() - enter_time;
4649 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4650 }
4651
4652
4653 /*------Backfilling-------*/
4654 PeeringState::Backfilling::Backfilling(my_context ctx)
4655 : my_base(ctx),
4656 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4657 {
4658 context< PeeringMachine >().log_enter(state_name);
4659
4660
4661 DECLARE_LOCALS;
4662 ps->backfill_reserved = true;
4663 pl->on_backfill_reserved();
4664 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4665 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4666 ps->state_set(PG_STATE_BACKFILLING);
4667 pl->publish_stats_to_osd();
4668 }
4669
4670 void PeeringState::Backfilling::backfill_release_reservations()
4671 {
4672 DECLARE_LOCALS;
4673 pl->cancel_local_background_io_reservation();
4674 for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
4675 it != ps->backfill_targets.end();
4676 ++it) {
4677 ceph_assert(*it != ps->pg_whoami);
4678 pl->send_cluster_message(
4679 it->osd,
4680 new MBackfillReserve(
4681 MBackfillReserve::RELEASE,
4682 spg_t(ps->info.pgid.pgid, it->shard),
4683 ps->get_osdmap_epoch()),
4684 ps->get_osdmap_epoch());
4685 }
4686 }
4687
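// Reservation lifecycle, as used above and in the Wait*Reserved states
// below: the primary sends MBackfillReserve::REQUEST to each target, the
// target answers with GRANT (or rejects when too full), and the primary
// sends RELEASE when backfill completes or is cancelled; REVOKE and
// REVOKE_TOOFULL travel the other way when a target must back out.
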
4688 void PeeringState::Backfilling::cancel_backfill()
4689 {
4690 DECLARE_LOCALS;
4691 backfill_release_reservations();
4692 pl->on_backfill_canceled();
4693 }
4694
4695 boost::statechart::result
4696 PeeringState::Backfilling::react(const Backfilled &c)
4697 {
4698 backfill_release_reservations();
4699 return transit<Recovered>();
4700 }
4701
4702 boost::statechart::result
4703 PeeringState::Backfilling::react(const DeferBackfill &c)
4704 {
4705 DECLARE_LOCALS;
4706
4707 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
4708 ps->state_set(PG_STATE_BACKFILL_WAIT);
4709 ps->state_clear(PG_STATE_BACKFILLING);
4710 cancel_backfill();
4711
4712 pl->schedule_event_after(
4713 std::make_shared<PGPeeringEvent>(
4714 ps->get_osdmap_epoch(),
4715 ps->get_osdmap_epoch(),
4716 RequestBackfill()),
4717 c.delay);
4718 return transit<NotBackfilling>();
4719 }
4720
4721 boost::statechart::result
4722 PeeringState::Backfilling::react(const UnfoundBackfill &c)
4723 {
4724 DECLARE_LOCALS;
4725 psdout(10) << "backfill has unfound, can't continue" << dendl;
4726 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
4727 ps->state_clear(PG_STATE_BACKFILLING);
4728 cancel_backfill();
4729 return transit<NotBackfilling>();
4730 }
4731
4732 boost::statechart::result
4733 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
4734 {
4735 DECLARE_LOCALS;
4736
4737 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4738 ps->state_clear(PG_STATE_BACKFILLING);
4739 cancel_backfill();
4740
4741 pl->schedule_event_after(
4742 std::make_shared<PGPeeringEvent>(
4743 ps->get_osdmap_epoch(),
4744 ps->get_osdmap_epoch(),
4745 RequestBackfill()),
4746 ps->cct->_conf->osd_backfill_retry_interval);
4747
4748 return transit<NotBackfilling>();
4749 }
4750
4751 boost::statechart::result
4752 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
4753 {
4754 DECLARE_LOCALS;
4755 ps->state_set(PG_STATE_BACKFILL_WAIT);
4756 cancel_backfill();
4757 if (ps->needs_backfill()) {
4758 return transit<WaitLocalBackfillReserved>();
4759 } else {
4760 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4761 return discard_event();
4762 }
4763 }
4764
4765 void PeeringState::Backfilling::exit()
4766 {
4767 context< PeeringMachine >().log_exit(state_name, enter_time);
4768 DECLARE_LOCALS;
4769 ps->backfill_reserved = false;
4770 ps->state_clear(PG_STATE_BACKFILLING);
4771 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
4772 utime_t dur = ceph_clock_now() - enter_time;
4773 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
4774 }
4775
4776 /*--WaitRemoteBackfillReserved--*/
4777
4778 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
4779 : my_base(ctx),
4780 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4781 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
4782 {
4783 context< PeeringMachine >().log_enter(state_name);
4784 DECLARE_LOCALS;
4785
4786 ps->state_set(PG_STATE_BACKFILL_WAIT);
4787 pl->publish_stats_to_osd();
4788 post_event(RemoteBackfillReserved());
4789 }
4790
4791 boost::statechart::result
4792 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
4793 {
4794 DECLARE_LOCALS;
4795
4796 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
4797 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
4798 if (backfill_osd_it !=
4799 context< Active >().remote_shards_to_reserve_backfill.end()) {
4800 // The primary never backfills itself
4801 ceph_assert(*backfill_osd_it != ps->pg_whoami);
4802 pl->send_cluster_message(
4803 backfill_osd_it->osd,
4804 new MBackfillReserve(
4805 MBackfillReserve::REQUEST,
4806 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
4807 ps->get_osdmap_epoch(),
4808 ps->get_backfill_priority(),
4809 num_bytes,
4810 ps->peer_bytes[*backfill_osd_it]),
4811 ps->get_osdmap_epoch());
4812 ++backfill_osd_it;
4813 } else {
4814 ps->peer_bytes.clear();
4815 post_event(AllBackfillsReserved());
4816 }
4817 return discard_event();
4818 }
4819
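// Simulation sketch (hypothetical two-target PG, not part of the build) of
// the iterator walk above: the constructor posts the first
// RemoteBackfillReserved to itself, and each grant re-enters the handler to
// request the next reservation until AllBackfillsReserved is posted.
#if 0
#include <cstdio>
#include <set>

int main() {
  std::set<int> targets{1, 2};  // remote_shards_to_reserve_backfill
  auto it = targets.begin();    // backfill_osd_it
  while (it != targets.end()) {
    std::printf("send MBackfillReserve::REQUEST to osd.%d\n", *it);
    ++it;                       // a GRANT arrives and the handler re-runs
  }
  std::printf("all reservations held -> AllBackfillsReserved\n");
  return 0;
}
#endif
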
4820 void PeeringState::WaitRemoteBackfillReserved::exit()
4821 {
4822 context< PeeringMachine >().log_exit(state_name, enter_time);
4823 DECLARE_LOCALS;
4824
4825 utime_t dur = ceph_clock_now() - enter_time;
4826 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
4827 }
4828
4829 void PeeringState::WaitRemoteBackfillReserved::retry()
4830 {
4831 DECLARE_LOCALS;
4832 pl->cancel_local_background_io_reservation();
4833
4834 // Send RELEASE to all previously acquired reservations
4835 set<pg_shard_t>::const_iterator it, begin, end;
4836 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
4837 end = context< Active >().remote_shards_to_reserve_backfill.end();
4838 ceph_assert(begin != end);
4839 for (it = begin; it != backfill_osd_it; ++it) {
4840 // The primary never backfills itself
4841 ceph_assert(*it != ps->pg_whoami);
4842 pl->send_cluster_message(
4843 it->osd,
4844 new MBackfillReserve(
4845 MBackfillReserve::RELEASE,
4846 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
4847 ps->get_osdmap_epoch()),
4848 ps->get_osdmap_epoch());
4849 }
4850
4851 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4852 pl->publish_stats_to_osd();
4853
4854 pl->schedule_event_after(
4855 std::make_shared<PGPeeringEvent>(
4856 ps->get_osdmap_epoch(),
4857 ps->get_osdmap_epoch(),
4858 RequestBackfill()),
4859 ps->cct->_conf->osd_backfill_retry_interval);
4860 }
4861
4862 boost::statechart::result
4863 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
4864 {
4865 DECLARE_LOCALS;
4866 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4867 retry();
4868 return transit<NotBackfilling>();
4869 }
4870
4871 boost::statechart::result
4872 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
4873 {
4874 retry();
4875 return transit<NotBackfilling>();
4876 }
4877
4878 /*--WaitLocalBackfillReserved--*/
4879 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
4880 : my_base(ctx),
4881 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
4882 {
4883 context< PeeringMachine >().log_enter(state_name);
4884 DECLARE_LOCALS;
4885
4886 ps->state_set(PG_STATE_BACKFILL_WAIT);
4887 pl->request_local_background_io_reservation(
4888 ps->get_backfill_priority(),
4889 std::make_shared<PGPeeringEvent>(
4890 ps->get_osdmap_epoch(),
4891 ps->get_osdmap_epoch(),
4892 LocalBackfillReserved()),
4893 std::make_shared<PGPeeringEvent>(
4894 ps->get_osdmap_epoch(),
4895 ps->get_osdmap_epoch(),
4896 DeferBackfill(0.0)));
4897 pl->publish_stats_to_osd();
4898 }
4899
4900 void PeeringState::WaitLocalBackfillReserved::exit()
4901 {
4902 context< PeeringMachine >().log_exit(state_name, enter_time);
4903 DECLARE_LOCALS;
4904 utime_t dur = ceph_clock_now() - enter_time;
4905 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
4906 }
4907
4908 /*----NotBackfilling------*/
4909 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
4910 : my_base(ctx),
4911 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
4912 {
4913 context< PeeringMachine >().log_enter(state_name);
4914 DECLARE_LOCALS;
4915 ps->state_clear(PG_STATE_REPAIR);
4916 pl->publish_stats_to_osd();
4917 }
4918
4919 boost::statechart::result
4920 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
4921 {
4922 return discard_event();
4923 }
4924
4925 boost::statechart::result
4926 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
4927 {
4928 return discard_event();
4929 }
4930
4931 void PeeringState::NotBackfilling::exit()
4932 {
4933 context< PeeringMachine >().log_exit(state_name, enter_time);
4934
4935 DECLARE_LOCALS;
4936 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
4937 utime_t dur = ceph_clock_now() - enter_time;
4938 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
4939 }
4940
4941 /*----NotRecovering------*/
4942 PeeringState::NotRecovering::NotRecovering(my_context ctx)
4943 : my_base(ctx),
4944 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
4945 {
4946 context< PeeringMachine >().log_enter(state_name);
4947 DECLARE_LOCALS;
4948 ps->state_clear(PG_STATE_REPAIR);
4949 pl->publish_stats_to_osd();
4950 }
4951
4952 void PeeringState::NotRecovering::exit()
4953 {
4954 context< PeeringMachine >().log_exit(state_name, enter_time);
4955
4956 DECLARE_LOCALS;
4957 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
4958 utime_t dur = ceph_clock_now() - enter_time;
4959 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
4960 }
4961
4962 /*---RepNotRecovering----*/
4963 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
4964 : my_base(ctx),
4965 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
4966 {
4967 context< PeeringMachine >().log_enter(state_name);
4968 }
4969
4970 boost::statechart::result
4971 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
4972 {
4973 DECLARE_LOCALS;
4974 ps->reject_reservation();
4975 post_event(RemoteReservationRejectedTooFull());
4976 return discard_event();
4977 }
4978
4979 void PeeringState::RepNotRecovering::exit()
4980 {
4981 context< PeeringMachine >().log_exit(state_name, enter_time);
4982 DECLARE_LOCALS;
4983 utime_t dur = ceph_clock_now() - enter_time;
4984 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
4985 }
4986
4987 /*---RepWaitRecoveryReserved--*/
4988 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
4989 : my_base(ctx),
4990 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
4991 {
4992 context< PeeringMachine >().log_enter(state_name);
4993 }
4994
4995 boost::statechart::result
4996 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
4997 {
4998 DECLARE_LOCALS;
4999 pl->send_cluster_message(
5000 ps->primary.osd,
5001 new MRecoveryReserve(
5002 MRecoveryReserve::GRANT,
5003 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5004 ps->get_osdmap_epoch()),
5005 ps->get_osdmap_epoch());
5006 return transit<RepRecovering>();
5007 }
5008
5009 boost::statechart::result
5010 PeeringState::RepWaitRecoveryReserved::react(
5011 const RemoteReservationCanceled &evt)
5012 {
5013 DECLARE_LOCALS;
5014 pl->unreserve_recovery_space();
5015
5016 pl->cancel_remote_recovery_reservation();
5017 return transit<RepNotRecovering>();
5018 }
5019
5020 void PeeringState::RepWaitRecoveryReserved::exit()
5021 {
5022 context< PeeringMachine >().log_exit(state_name, enter_time);
5023 DECLARE_LOCALS;
5024 utime_t dur = ceph_clock_now() - enter_time;
5025 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5026 }
5027
5028 /*-RepWaitBackfillReserved*/
5029 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5030 : my_base(ctx),
5031 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5032 {
5033 context< PeeringMachine >().log_enter(state_name);
5034 }
5035
5036 boost::statechart::result
5037 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5038 {
5039
5040 DECLARE_LOCALS;
5041
5042 if (!pl->try_reserve_recovery_space(
5043 evt.primary_num_bytes, evt.local_num_bytes)) {
5044 post_event(RejectTooFullRemoteReservation());
5045 } else {
5046 PGPeeringEventRef preempt;
5047 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5048 // older peers will interpret preemption as TOOFULL
5049 preempt = std::make_shared<PGPeeringEvent>(
5050 pl->get_osdmap_epoch(),
5051 pl->get_osdmap_epoch(),
5052 RemoteBackfillPreempted());
5053 }
5054 pl->request_remote_recovery_reservation(
5055 evt.priority,
5056 std::make_shared<PGPeeringEvent>(
5057 pl->get_osdmap_epoch(),
5058 pl->get_osdmap_epoch(),
5059 RemoteBackfillReserved()),
5060 preempt);
5061 }
5062 return transit<RepWaitBackfillReserved>();
5063 }
5064
5065 boost::statechart::result
5066 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5067 {
5068 DECLARE_LOCALS;
5069
5070 // fall back to a local reckoning of priority if the primary doesn't pass one
5071 // (pre-mimic compat)
5072 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5073
5074 PGPeeringEventRef preempt;
5075 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5076 // older peers can't handle this
5077 preempt = std::make_shared<PGPeeringEvent>(
5078 ps->get_osdmap_epoch(),
5079 ps->get_osdmap_epoch(),
5080 RemoteRecoveryPreempted());
5081 }
5082
5083 pl->request_remote_recovery_reservation(
5084 prio,
5085 std::make_shared<PGPeeringEvent>(
5086 ps->get_osdmap_epoch(),
5087 ps->get_osdmap_epoch(),
5088 RemoteRecoveryReserved()),
5089 preempt);
5090 return transit<RepWaitRecoveryReserved>();
5091 }
5092
5093 void PeeringState::RepWaitBackfillReserved::exit()
5094 {
5095 context< PeeringMachine >().log_exit(state_name, enter_time);
5096 DECLARE_LOCALS;
5097 utime_t dur = ceph_clock_now() - enter_time;
5098 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5099 }
5100
5101 boost::statechart::result
5102 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5103 {
5104 DECLARE_LOCALS;
5105
5106
5107 pl->send_cluster_message(
5108 ps->primary.osd,
5109 new MBackfillReserve(
5110 MBackfillReserve::GRANT,
5111 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5112 ps->get_osdmap_epoch()),
5113 ps->get_osdmap_epoch());
5114 return transit<RepRecovering>();
5115 }
5116
5117 boost::statechart::result
5118 PeeringState::RepWaitBackfillReserved::react(
5119 const RejectTooFullRemoteReservation &evt)
5120 {
5121 DECLARE_LOCALS;
5122 ps->reject_reservation();
5123 post_event(RemoteReservationRejectedTooFull());
5124 return discard_event();
5125 }
5126
5127 boost::statechart::result
5128 PeeringState::RepWaitBackfillReserved::react(
5129 const RemoteReservationRejectedTooFull &evt)
5130 {
5131 DECLARE_LOCALS;
5132 pl->unreserve_recovery_space();
5133
5134 pl->cancel_remote_recovery_reservation();
5135 return transit<RepNotRecovering>();
5136 }
5137
5138 boost::statechart::result
5139 PeeringState::RepWaitBackfillReserved::react(
5140 const RemoteReservationCanceled &evt)
5141 {
5142 DECLARE_LOCALS;
5143 pl->unreserve_recovery_space();
5144
5145 pl->cancel_remote_recovery_reservation();
5146 return transit<RepNotRecovering>();
5147 }
5148
5149 /*---RepRecovering-------*/
5150 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5151 : my_base(ctx),
5152 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5153 {
5154 context< PeeringMachine >().log_enter(state_name);
5155 }
5156
5157 boost::statechart::result
5158 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5159 {
5160 DECLARE_LOCALS;
5161
5162
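// Our in-progress recovery reservation was preempted by a higher
// priority pg: release the local space and send REVOKE so the primary
// knows to stop recovering through us and re-request later.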
5163 pl->unreserve_recovery_space();
5164 pl->send_cluster_message(
5165 ps->primary.osd,
5166 new MRecoveryReserve(
5167 MRecoveryReserve::REVOKE,
5168 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5169 ps->get_osdmap_epoch()),
5170 ps->get_osdmap_epoch());
5171 return discard_event();
5172 }
5173
5174 boost::statechart::result
5175 PeeringState::RepRecovering::react(const BackfillTooFull &)
5176 {
5177 DECLARE_LOCALS;
5178
5179
5180 pl->unreserve_recovery_space();
5181 pl->send_cluster_message(
5182 ps->primary.osd,
5183 new MBackfillReserve(
5184 MBackfillReserve::REVOKE_TOOFULL,
5185 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5186 ps->get_osdmap_epoch()),
5187 ps->get_osdmap_epoch());
5188 return discard_event();
5189 }
5190
5191 boost::statechart::result
5192 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5193 {
5194 DECLARE_LOCALS;
5195
5196
5197 pl->unreserve_recovery_space();
5198 pl->send_cluster_message(
5199 ps->primary.osd,
5200 new MBackfillReserve(
5201 MBackfillReserve::REVOKE,
5202 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5203 ps->get_osdmap_epoch()),
5204 ps->get_osdmap_epoch());
5205 return discard_event();
5206 }
5207
5208 void PeeringState::RepRecovering::exit()
5209 {
5210 context< PeeringMachine >().log_exit(state_name, enter_time);
5211 DECLARE_LOCALS;
5212 pl->unreserve_recovery_space();
5213
5214 pl->cancel_remote_recovery_reservation();
5215 utime_t dur = ceph_clock_now() - enter_time;
5216 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5217 }
5218
5219 /*------Activating--------*/
5220 PeeringState::Activating::Activating(my_context ctx)
5221 : my_base(ctx),
5222 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5223 {
5224 context< PeeringMachine >().log_enter(state_name);
5225 }
5226
5227 void PeeringState::Activating::exit()
5228 {
5229 context< PeeringMachine >().log_exit(state_name, enter_time);
5230 DECLARE_LOCALS;
5231 utime_t dur = ceph_clock_now() - enter_time;
5232 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5233 }
5234
5235 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5236 : my_base(ctx),
5237 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5238 {
5239 context< PeeringMachine >().log_enter(state_name);
5240 DECLARE_LOCALS;
5241
5242 // Make sure all nodes that are part of the recovery aren't full
5243 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5244 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5245 post_event(RecoveryTooFull());
5246 return;
5247 }
5248
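// Local half of the two-phase reservation: queue with this OSD's local
// reserver at our recovery priority. LocalRecoveryReserved fires when
// the slot is granted; DeferRecovery fires if we are preempted by a
// higher priority pg.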
5249 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5250 ps->state_set(PG_STATE_RECOVERY_WAIT);
5251 pl->request_local_background_io_reservation(
5252 ps->get_recovery_priority(),
5253 std::make_shared<PGPeeringEvent>(
5254 ps->get_osdmap_epoch(),
5255 ps->get_osdmap_epoch(),
5256 LocalRecoveryReserved()),
5257 std::make_shared<PGPeeringEvent>(
5258 ps->get_osdmap_epoch(),
5259 ps->get_osdmap_epoch(),
5260 DeferRecovery(0.0)));
5261 pl->publish_stats_to_osd();
5262 }
5263
5264 boost::statechart::result
5265 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5266 {
5267 DECLARE_LOCALS;
5268 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5269 pl->schedule_event_after(
5270 std::make_shared<PGPeeringEvent>(
5271 ps->get_osdmap_epoch(),
5272 ps->get_osdmap_epoch(),
5273 DoRecovery()),
5274 ps->cct->_conf->osd_recovery_retry_interval);
5275 return transit<NotRecovering>();
5276 }
5277
5278 void PeeringState::WaitLocalRecoveryReserved::exit()
5279 {
5280 context< PeeringMachine >().log_exit(state_name, enter_time);
5281 DECLARE_LOCALS;
5282 utime_t dur = ceph_clock_now() - enter_time;
5283 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5284 }
5285
5286 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5287 : my_base(ctx),
5288 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5289 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5290 {
5291 context< PeeringMachine >().log_enter(state_name);
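// Seed the reservation chain: the RemoteRecoveryReserved handler below
// sends a REQUEST to one remote shard per event, so posting the event
// to ourselves kicks off the first request (or AllRemotesReserved if
// the set is empty).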
5292 post_event(RemoteRecoveryReserved());
5293 }
5294
5295 boost::statechart::result
5296 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5297 DECLARE_LOCALS;
5298
5299 if (remote_recovery_reservation_it !=
5300 context< Active >().remote_shards_to_reserve_recovery.end()) {
5301 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5302 pl->send_cluster_message(
5303 remote_recovery_reservation_it->osd,
5304 new MRecoveryReserve(
5305 MRecoveryReserve::REQUEST,
5306 spg_t(context< PeeringMachine >().spgid.pgid,
5307 remote_recovery_reservation_it->shard),
5308 ps->get_osdmap_epoch(),
5309 ps->get_recovery_priority()),
5310 ps->get_osdmap_epoch());
5311 ++remote_recovery_reservation_it;
5312 } else {
5313 post_event(AllRemotesReserved());
5314 }
5315 return discard_event();
5316 }
5317
5318 void PeeringState::WaitRemoteRecoveryReserved::exit()
5319 {
5320 context< PeeringMachine >().log_exit(state_name, enter_time);
5321 DECLARE_LOCALS;
5322 utime_t dur = ceph_clock_now() - enter_time;
5323 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5324 }
5325
5326 PeeringState::Recovering::Recovering(my_context ctx)
5327 : my_base(ctx),
5328 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5329 {
5330 context< PeeringMachine >().log_enter(state_name);
5331
5332 DECLARE_LOCALS;
5333 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5334 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5335 ps->state_set(PG_STATE_RECOVERING);
5336 pl->on_recovery_reserved();
5337 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5338 pl->publish_stats_to_osd();
5339 }
5340
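// Hand back all remote recovery reservations. cancel=true means
// recovery is being aborted (we may still have missing objects);
// otherwise recovery must have completed.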
5341 void PeeringState::Recovering::release_reservations(bool cancel)
5342 {
5343 DECLARE_LOCALS;
5344 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5345
5346 // release remote reservations
5347 for (set<pg_shard_t>::const_iterator i =
5348 context< Active >().remote_shards_to_reserve_recovery.begin();
5349 i != context< Active >().remote_shards_to_reserve_recovery.end();
5350 ++i) {
5351 if (*i == ps->pg_whoami) // skip myself
5352 continue;
5353 pl->send_cluster_message(
5354 i->osd,
5355 new MRecoveryReserve(
5356 MRecoveryReserve::RELEASE,
5357 spg_t(ps->info.pgid.pgid, i->shard),
5358 ps->get_osdmap_epoch()),
5359 ps->get_osdmap_epoch());
5360 }
5361 }
5362
5363 boost::statechart::result
5364 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5365 {
5366 DECLARE_LOCALS;
5367 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5368 release_reservations();
5369 pl->cancel_local_background_io_reservation();
5370 return transit<Recovered>();
5371 }
5372
5373 boost::statechart::result
5374 PeeringState::Recovering::react(const RequestBackfill &evt)
5375 {
5376 DECLARE_LOCALS;
5377
5378 release_reservations();
5379
5380 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5381 pl->cancel_local_background_io_reservation();
5382 pl->publish_stats_to_osd();
5383 // transition any async_recovery_targets back into acting
5384 // so the pg won't have to stay undersized for long,
5385 // as backfill might take a long time to complete.
5386 if (!ps->async_recovery_targets.empty()) {
5387 pg_shard_t auth_log_shard;
5388 bool history_les_bound = false;
5389 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5390 }
5391 return transit<WaitLocalBackfillReserved>();
5392 }
5393
5394 boost::statechart::result
5395 PeeringState::Recovering::react(const DeferRecovery &evt)
5396 {
5397 DECLARE_LOCALS;
5398 if (!ps->state_test(PG_STATE_RECOVERING)) {
5399 // we may have finished recovery and have an AllReplicasRecovered
5400 // event queued to move us to the next state.
5401 psdout(10) << "got defer recovery but not recovering" << dendl;
5402 return discard_event();
5403 }
5404 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5405 ps->state_set(PG_STATE_RECOVERY_WAIT);
5406 pl->cancel_local_background_io_reservation();
5407 release_reservations(true);
5408 pl->schedule_event_after(
5409 std::make_shared<PGPeeringEvent>(
5410 ps->get_osdmap_epoch(),
5411 ps->get_osdmap_epoch(),
5412 DoRecovery()),
5413 evt.delay);
5414 return transit<NotRecovering>();
5415 }
5416
5417 boost::statechart::result
5418 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5419 {
5420 DECLARE_LOCALS;
5421 psdout(10) << "recovery has unfound, can't continue" << dendl;
5422 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5423 pl->cancel_local_background_io_reservation();
5424 release_reservations(true);
5425 return transit<NotRecovering>();
5426 }
5427
5428 void PeeringState::Recovering::exit()
5429 {
5430 context< PeeringMachine >().log_exit(state_name, enter_time);
5431
5432 DECLARE_LOCALS;
5433 utime_t dur = ceph_clock_now() - enter_time;
5434 ps->state_clear(PG_STATE_RECOVERING);
5435 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5436 }
5437
5438 PeeringState::Recovered::Recovered(my_context ctx)
5439 : my_base(ctx),
5440 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5441 {
5442 pg_shard_t auth_log_shard;
5443
5444 context< PeeringMachine >().log_enter(state_name);
5445
5446 DECLARE_LOCALS;
5447
5448 ceph_assert(!ps->needs_recovery());
5449
5450 // if we finished backfill, all acting are active; recheck if
5451 // DEGRADED | UNDERSIZED is appropriate.
5452 ceph_assert(!ps->acting_recovery_backfill.empty());
5453 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5454 ps->acting_recovery_backfill.size()) {
5455 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5456 pl->publish_stats_to_osd();
5457 }
5458
5459 // adjust acting set? (e.g. because backfill completed...)
5460 bool history_les_bound = false;
5461 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5462 true, &history_les_bound)) {
5463 ceph_assert(ps->want_acting.size());
5464 } else if (!ps->async_recovery_targets.empty()) {
5465 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5466 }
5467
5468 if (context< Active >().all_replicas_activated &&
5469 ps->async_recovery_targets.empty())
5470 post_event(GoClean());
5471 }
5472
5473 void PeeringState::Recovered::exit()
5474 {
5475 context< PeeringMachine >().log_exit(state_name, enter_time);
5476 DECLARE_LOCALS;
5477
5478 utime_t dur = ceph_clock_now() - enter_time;
5479 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5480 }
5481
5482 PeeringState::Clean::Clean(my_context ctx)
5483 : my_base(ctx),
5484 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5485 {
5486 context< PeeringMachine >().log_enter(state_name);
5487
5488 DECLARE_LOCALS;
5489
5490 if (ps->info.last_complete != ps->info.last_update) {
5491 ceph_abort();
5492 }
5493
5494
5495 ps->try_mark_clean();
5496
5497 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5498 pl->on_clean());
5499 }
5500
5501 void PeeringState::Clean::exit()
5502 {
5503 context< PeeringMachine >().log_exit(state_name, enter_time);
5504
5505 DECLARE_LOCALS;
5506 ps->state_clear(PG_STATE_CLEAN);
5507 utime_t dur = ceph_clock_now() - enter_time;
5508 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5509 }
5510
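// Collapse a collection of pg_shard_t to at most one shard per OSD,
// skipping `skip` (normally ourselves): an OSD that hosts several
// shards of an EC pg only needs a single reservation.
// e.g. in = { 0(0), 0(1), 1(0) }, skip = 1(0)  =>  out = { 0(0) }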
5511 template <typename T>
5512 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5513 {
5514 set<int> osds_found;
5515 set<pg_shard_t> out;
5516 for (typename T::const_iterator i = in.begin();
5517 i != in.end();
5518 ++i) {
5519 if (*i != skip && !osds_found.count(i->osd)) {
5520 osds_found.insert(i->osd);
5521 out.insert(*i);
5522 }
5523 }
5524 return out;
5525 }
5526
5527 /*---------Active---------*/
5528 PeeringState::Active::Active(my_context ctx)
5529 : my_base(ctx),
5530 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5531 remote_shards_to_reserve_recovery(
5532 unique_osd_shard_set(
5533 context< PeeringMachine >().state->pg_whoami,
5534 context< PeeringMachine >().state->acting_recovery_backfill)),
5535 remote_shards_to_reserve_backfill(
5536 unique_osd_shard_set(
5537 context< PeeringMachine >().state->pg_whoami,
5538 context< PeeringMachine >().state->backfill_targets)),
5539 all_replicas_activated(false)
5540 {
5541 context< PeeringMachine >().log_enter(state_name);
5542
5543
5544 DECLARE_LOCALS;
5545
5546 ceph_assert(!ps->backfill_reserved);
5547 ceph_assert(ps->is_primary());
5548 psdout(10) << "In Active, about to call activate" << dendl;
5549 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5550 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5551 ps->get_osdmap_epoch(),
5552 context< PeeringMachine >().get_recovery_ctx());
5553
5554 // everyone has to commit/ack before we are truly active
5555 ps->blocked_by.clear();
5556 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
5557 p != ps->acting_recovery_backfill.end();
5558 ++p) {
5559 if (p->shard != ps->pg_whoami.shard) {
5560 ps->blocked_by.insert(p->shard);
5561 }
5562 }
5563 pl->publish_stats_to_osd();
5564 psdout(10) << "Activate Finished" << dendl;
5565 }
5566
5567 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5568 {
5569 DECLARE_LOCALS;
5570
5571 if (ps->should_restart_peering(
5572 advmap.up_primary,
5573 advmap.acting_primary,
5574 advmap.newup,
5575 advmap.newacting,
5576 advmap.lastmap,
5577 advmap.osdmap)) {
5578 psdout(10) << "Active advmap interval change, fast return" << dendl;
5579 return forward_event();
5580 }
5581 psdout(10) << "Active advmap" << dendl;
5582 bool need_publish = false;
5583
5584 pl->on_active_advmap(advmap.osdmap);
5585 if (ps->dirty_big_info) {
5586 // share updated purged_snaps with the mgr/mon so that we (a) stop reporting
5587 // already-purged snaps and (b) perhaps share more snaps that we have purged
5588 // but that didn't fit in pg_stat_t.
5589 need_publish = true;
5590 ps->share_pg_info();
5591 }
5592
5593 bool need_acting_change = false;
5594 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5595 int osd = ps->want_acting[i];
5596 if (!advmap.osdmap->is_up(osd)) {
5597 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5598 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5599 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5600 << dendl;
5601 need_acting_change = true;
5602 }
5603 }
5604 }
5605 if (need_acting_change) {
5606 psdout(10) << "Active need acting change, call choose_acting again"
5607 << dendl;
5608 // possibly because we re-added some strays into the acting set and
5609 // some of them then went down in a subsequent map before we could see
5610 // the map changing the pg_temp.
5611 // call choose_acting again to clear them out.
5612 // note that we leave restrict_to_up_acting false so that we do not
5613 // kick out any chosen stray that is still alive.
5614 pg_shard_t auth_log_shard;
5615 bool history_les_bound = false;
5616 ps->remove_down_peer_info(advmap.osdmap);
5617 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5618 }
5619
5620 /* Check for changes in pool size (if the acting set changed as a result,
5621 * this does not matter) */
5622 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5623 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5624 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5625 ps->actingset.size()) {
5626 ps->state_clear(PG_STATE_UNDERSIZED);
5627 } else {
5628 ps->state_set(PG_STATE_UNDERSIZED);
5629 }
5630 // degraded changes will be detected by the call to publish_stats_to_osd()
5631 need_publish = true;
5632 }
5633
5634 // if we haven't reported our PG stats in a long time, do so now.
5635 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5636 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5637 << " epochs" << dendl;
5638 need_publish = true;
5639 }
5640
5641 if (need_publish)
5642 pl->publish_stats_to_osd();
5643
5644 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5645 pl->recheck_readable();
5646 }
5647
5648 return forward_event();
5649 }
5650
5651 boost::statechart::result PeeringState::Active::react(const ActMap&)
5652 {
5653 DECLARE_LOCALS;
5654 psdout(10) << "Active: handling ActMap" << dendl;
5655 ceph_assert(ps->is_primary());
5656
5657 pl->on_active_actmap();
5658
5659 if (ps->have_unfound()) {
5660 // object may have become unfound
5661 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5662 }
5663
5664 uint64_t unfound = ps->missing_loc.num_unfound();
5665 if (unfound > 0 &&
5666 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5667 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5668 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5669 << " objects unfound and apparently lost, would automatically "
5670 << "mark these objects lost but this feature is not yet implemented "
5671 << "(osd_auto_mark_unfound_lost)";
5672 } else
5673 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
5674 << unfound << " objects unfound and apparently lost";
5675 }
5676
5677 return forward_event();
5678 }
5679
5680 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
5681 {
5682
5683 DECLARE_LOCALS;
5684 ceph_assert(ps->is_primary());
5685 if (ps->peer_info.count(notevt.from)) {
5686 psdout(10) << "Active: got notify from " << notevt.from
5687 << ", already have info from that osd, ignoring"
5688 << dendl;
5689 } else if (ps->peer_purged.count(notevt.from)) {
5690 psdout(10) << "Active: got notify from " << notevt.from
5691 << ", already purged that peer, ignoring"
5692 << dendl;
5693 } else {
5694 psdout(10) << "Active: got notify from " << notevt.from
5695 << ", calling proc_replica_info and discover_all_missing"
5696 << dendl;
5697 ps->proc_replica_info(
5698 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
5699 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
5700 ps->discover_all_missing(
5701 context<PeeringMachine>().get_recovery_ctx().msgs);
5702 }
5703 // check if it is a previously-down acting member that's coming back.
5704 // if so, request a pg_temp change to trigger a new interval transition
5705 pg_shard_t auth_log_shard;
5706 bool history_les_bound = false;
5707 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5708 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
5709 psdout(10) << "Active: got notify from previous acting member "
5710 << notevt.from << ", requesting pg_temp change"
5711 << dendl;
5712 }
5713 }
5714 return discard_event();
5715 }
5716
5717 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
5718 {
5719 DECLARE_LOCALS;
5720 ceph_assert(ps->is_primary());
5721
5722 // peer is informing us of their last_complete_ondisk
5723 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
5724 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
5725 trim.trim_to);
5726 // trim log when the pg is recovered
5727 ps->calc_min_last_complete_ondisk();
5728 return discard_event();
5729 }
5730
5731 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
5732 {
5733 DECLARE_LOCALS;
5734 ceph_assert(ps->is_primary());
5735
5736 ceph_assert(!ps->acting_recovery_backfill.empty());
5737 if (infoevt.lease_ack) {
5738 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
5739 }
5740 // don't update history (yet) if we are active and primary; the replica
5741 // may be telling us they have activated (and committed) but we can't
5742 // share that until _everyone_ does the same.
5743 if (ps->is_acting_recovery_backfill(infoevt.from) &&
5744 ps->peer_activated.count(infoevt.from) == 0) {
5745 psdout(10) << " peer osd." << infoevt.from
5746 << " activated and committed" << dendl;
5747 ps->peer_activated.insert(infoevt.from);
5748 ps->blocked_by.erase(infoevt.from.shard);
5749 pl->publish_stats_to_osd();
5750 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
5751 all_activated_and_committed();
5752 }
5753 }
5754 return discard_event();
5755 }
5756
5757 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
5758 {
5759 DECLARE_LOCALS;
5760 psdout(10) << "searching osd." << logevt.from
5761 << " log for unfound items" << dendl;
5762 ps->proc_replica_log(
5763 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
5764 bool got_missing = ps->search_for_missing(
5765 ps->peer_info[logevt.from],
5766 ps->peer_missing[logevt.from],
5767 logevt.from,
5768 context< PeeringMachine >().get_recovery_ctx());
5769 // If there are missing objects AND we are "fully" active then start recovery now
5770 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
5771 post_event(DoRecovery());
5772 }
5773 return discard_event();
5774 }
5775
5776 boost::statechart::result PeeringState::Active::react(const QueryState& q)
5777 {
5778 DECLARE_LOCALS;
5779
5780 q.f->open_object_section("state");
5781 q.f->dump_string("name", state_name);
5782 q.f->dump_stream("enter_time") << enter_time;
5783
5784 {
5785 q.f->open_array_section("might_have_unfound");
5786 for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
5787 p != ps->might_have_unfound.end();
5788 ++p) {
5789 q.f->open_object_section("osd");
5790 q.f->dump_stream("osd") << *p;
5791 if (ps->peer_missing.count(*p)) {
5792 q.f->dump_string("status", "already probed");
5793 } else if (ps->peer_missing_requested.count(*p)) {
5794 q.f->dump_string("status", "querying");
5795 } else if (!ps->get_osdmap()->is_up(p->osd)) {
5796 q.f->dump_string("status", "osd is down");
5797 } else {
5798 q.f->dump_string("status", "not queried");
5799 }
5800 q.f->close_section();
5801 }
5802 q.f->close_section();
5803 }
5804 {
5805 q.f->open_object_section("recovery_progress");
5806 q.f->open_array_section("backfill_targets");
5807 for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
5808 p != ps->backfill_targets.end(); ++p)
5809 q.f->dump_stream("replica") << *p;
5810 q.f->close_section();
5811 pl->dump_recovery_info(q.f);
5812 q.f->close_section();
5813 }
5814
5815 q.f->close_section();
5816 return forward_event();
5817 }
5818
5819 boost::statechart::result PeeringState::Active::react(
5820 const ActivateCommitted &evt)
5821 {
5822 DECLARE_LOCALS;
5823 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
5824 ps->peer_activated.insert(ps->pg_whoami);
5825 psdout(10) << "_activate_committed " << evt.epoch
5826 << " peer_activated now " << ps->peer_activated
5827 << " last_interval_started "
5828 << ps->info.history.last_interval_started
5829 << " last_epoch_started "
5830 << ps->info.history.last_epoch_started
5831 << " same_interval_since "
5832 << ps->info.history.same_interval_since
5833 << dendl;
5834 ceph_assert(!ps->acting_recovery_backfill.empty());
5835 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
5836 all_activated_and_committed();
5837 return discard_event();
5838 }
5839
5840 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
5841 {
5842
5843 DECLARE_LOCALS;
5844 pg_t pgid = context< PeeringMachine >().spgid.pgid;
5845
5846 all_replicas_activated = true;
5847
5848 ps->state_clear(PG_STATE_ACTIVATING);
5849 ps->state_clear(PG_STATE_CREATING);
5850 ps->state_clear(PG_STATE_PREMERGE);
5851
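// Merge handling: a pg that is about to merge only goes PEERED (plus
// PREMERGE), and we report to the OSD whether the merge source/target
// is fully ready (its acting set is at full size) so the merge can be
// coordinated cluster-wide.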
5852 bool merge_target;
5853 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
5854 ps->state_set(PG_STATE_PEERED);
5855 ps->state_set(PG_STATE_PREMERGE);
5856
5857 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
5858 if (merge_target) {
5859 pg_t src = pgid;
5860 src.set_ps(ps->pool.info.get_pg_num_pending());
5861 ceph_assert(src.get_parent() == pgid);
5862 pl->set_not_ready_to_merge_target(pgid, src);
5863 } else {
5864 pl->set_not_ready_to_merge_source(pgid);
5865 }
5866 }
5867 } else if (ps->acting.size() < ps->pool.info.min_size) {
5868 ps->state_set(PG_STATE_PEERED);
5869 } else {
5870 ps->state_set(PG_STATE_ACTIVE);
5871 }
5872
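// Read-lease gate: we cannot claim readability until any lease granted
// by the prior interval has provably expired, so if
// prior_readable_until_ub is still in the future, set WAIT and schedule
// a recheck.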
5873 auto mnow = pl->get_mnow();
5874 if (ps->prior_readable_until_ub > mnow) {
5875 psdout(10) << " waiting for prior_readable_until_ub "
5876 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
5877 ps->state_set(PG_STATE_WAIT);
5878 pl->queue_check_readable(
5879 ps->last_peering_reset,
5880 ps->prior_readable_until_ub - mnow);
5881 } else {
5882 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
5883 << ps->prior_readable_until_ub << dendl;
5884 }
5885
5886 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
5887 pl->send_pg_created(pgid);
5888 }
5889
5890 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
5891 ps->info.history.last_interval_started = ps->info.last_interval_started;
5892 ps->dirty_info = true;
5893
5894 ps->share_pg_info();
5895 pl->publish_stats_to_osd();
5896
5897 pl->on_activate_complete();
5898
5899 return discard_event();
5900 }
5901
5902 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
5903 {
5904 DECLARE_LOCALS;
5905 ps->proc_renew_lease();
5906 return discard_event();
5907 }
5908
5909 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
5910 {
5911 DECLARE_LOCALS;
5912 ps->proc_lease_ack(la.from, la.lease_ack);
5913 return discard_event();
5914 }
5915
5916
5917 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
5918 {
5919 DECLARE_LOCALS;
5920 pl->recheck_readable();
5921 return discard_event();
5922 }
5923
5924 /*
5925 * update info.history.last_epoch_started ONLY after we and all
5926 * replicas have activated AND committed the activate transaction
5927 * (i.e. the peering results are stable on disk).
5928 */
5929 void PeeringState::Active::all_activated_and_committed()
5930 {
5931 DECLARE_LOCALS;
5932 psdout(10) << "all_activated_and_committed" << dendl;
5933 ceph_assert(ps->is_primary());
5934 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
5935 ceph_assert(!ps->acting_recovery_backfill.empty());
5936 ceph_assert(ps->blocked_by.empty());
5937
5938 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
5939 // this is overkill when the activation is quick, but when it is slow it
5940 // is important, because the lease was renewed by the activate itself but we
5941 // don't know how long ago that was, and simply scheduling now may leave
5942 // a gap in lease coverage. keep it simple and aggressively renew.
5943 ps->renew_lease(pl->get_mnow());
5944 ps->send_lease();
5945 ps->schedule_renew_lease();
5946 }
5947
5948 // Degraded?
5949 ps->update_calc_stats();
5950 if (ps->info.stats.stats.sum.num_objects_degraded) {
5951 ps->state_set(PG_STATE_DEGRADED);
5952 } else {
5953 ps->state_clear(PG_STATE_DEGRADED);
5954 }
5955
5956 post_event(PeeringState::AllReplicasActivated());
5957 }
5958
5959
5960 void PeeringState::Active::exit()
5961 {
5962 context< PeeringMachine >().log_exit(state_name, enter_time);
5963
5964
5965 DECLARE_LOCALS;
5966 pl->cancel_local_background_io_reservation();
5967
5968 ps->blocked_by.clear();
5969 ps->backfill_reserved = false;
5970 ps->state_clear(PG_STATE_ACTIVATING);
5971 ps->state_clear(PG_STATE_DEGRADED);
5972 ps->state_clear(PG_STATE_UNDERSIZED);
5973 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5974 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5975 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5976 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5977 utime_t dur = ceph_clock_now() - enter_time;
5978 pl->get_peering_perf().tinc(rs_active_latency, dur);
5979 pl->on_active_exit();
5980 }
5981
5982 /*------ReplicaActive-----*/
5983 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
5984 : my_base(ctx),
5985 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
5986 {
5987 context< PeeringMachine >().log_enter(state_name);
5988
5989 DECLARE_LOCALS;
5990 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5991 }
5992
5993
5994 boost::statechart::result PeeringState::ReplicaActive::react(
5995 const Activate& actevt) {
5996 DECLARE_LOCALS;
5997 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
5998 ps->activate(
5999 context< PeeringMachine >().get_cur_transaction(),
6000 actevt.activation_epoch,
6001 context< PeeringMachine >().get_recovery_ctx());
6002 psdout(10) << "Activate Finished" << dendl;
6003 return discard_event();
6004 }
6005
6006 boost::statechart::result PeeringState::ReplicaActive::react(
6007 const ActivateCommitted &evt)
6008 {
6009 DECLARE_LOCALS;
6010 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
6011
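// Our activate transaction has committed; send the primary an info
// whose history reflects the activation epoch so it can count us in
// peer_activated (see Active::react(const MInfoRec&)).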
6012 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6013 auto epoch = ps->get_osdmap_epoch();
6014 pg_info_t i = ps->info;
6015 i.history.last_epoch_started = evt.activation_epoch;
6016 i.history.last_interval_started = i.history.same_interval_since;
6017 rctx.send_info(
6018 ps->get_primary().osd,
6019 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6020 epoch,
6021 epoch,
6022 i,
6023 {}, /* lease */
6024 ps->get_lease_ack());
6025
6026 if (ps->acting.size() >= ps->pool.info.min_size) {
6027 ps->state_set(PG_STATE_ACTIVE);
6028 } else {
6029 ps->state_set(PG_STATE_PEERED);
6030 }
6031 pl->on_activate_committed();
6032
6033 return discard_event();
6034 }
6035
6036 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6037 {
6038 DECLARE_LOCALS;
6039 spg_t spgid = context< PeeringMachine >().spgid;
6040 epoch_t epoch = pl->get_osdmap_epoch();
6041
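// The primary is renewing our read lease: record it and ack, allowing
// the primary to extend readable_until for the acting set.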
6042 ps->proc_lease(l.lease);
6043 pl->send_cluster_message(
6044 ps->get_primary().osd,
6045 new MOSDPGLeaseAck(epoch,
6046 spg_t(spgid.pgid, ps->get_primary().shard),
6047 ps->get_lease_ack()),
6048 epoch);
6049 return discard_event();
6050 }
6051
6052 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6053 {
6054 DECLARE_LOCALS;
6055 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6056 infoevt.info);
6057 return discard_event();
6058 }
6059
6060 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6061 {
6062 DECLARE_LOCALS;
6063 psdout(10) << "received log from " << logevt.from << dendl;
6064 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6065 ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
6066 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6067 if (logevt.msg->lease) {
6068 ps->proc_lease(*logevt.msg->lease);
6069 }
6070
6071 return discard_event();
6072 }
6073
6074 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6075 {
6076 DECLARE_LOCALS;
6077 // primary is instructing us to trim
6078 ps->pg_log.trim(trim.trim_to, ps->info);
6079 ps->dirty_info = true;
6080 return discard_event();
6081 }
6082
6083 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6084 {
6085 DECLARE_LOCALS;
6086 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6087 ps->info.history.refresh_prior_readable_until_ub(
6088 pl->get_mnow(), ps->prior_readable_until_ub);
6089 context< PeeringMachine >().send_notify(
6090 ps->get_primary().osd,
6091 pg_notify_t(
6092 ps->get_primary().shard, ps->pg_whoami.shard,
6093 ps->get_osdmap_epoch(),
6094 ps->get_osdmap_epoch(),
6095 ps->info,
6096 ps->past_intervals));
6097 }
6098 return discard_event();
6099 }
6100
6101 boost::statechart::result PeeringState::ReplicaActive::react(
6102 const MQuery& query)
6103 {
6104 DECLARE_LOCALS;
6105 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6106 return discard_event();
6107 }
6108
6109 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6110 {
6111 q.f->open_object_section("state");
6112 q.f->dump_string("name", state_name);
6113 q.f->dump_stream("enter_time") << enter_time;
6114 q.f->close_section();
6115 return forward_event();
6116 }
6117
6118 void PeeringState::ReplicaActive::exit()
6119 {
6120 context< PeeringMachine >().log_exit(state_name, enter_time);
6121 DECLARE_LOCALS;
6122 pl->unreserve_recovery_space();
6123
6124 pl->cancel_remote_recovery_reservation();
6125 utime_t dur = ceph_clock_now() - enter_time;
6126 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6127
6128 ps->min_last_complete_ondisk = eversion_t();
6129 }
6130
6131 /*-------Stray---*/
6132 PeeringState::Stray::Stray(my_context ctx)
6133 : my_base(ctx),
6134 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6135 {
6136 context< PeeringMachine >().log_enter(state_name);
6137
6138
6139 DECLARE_LOCALS;
6140 ceph_assert(!ps->is_peered());
6141 ceph_assert(!ps->is_peering());
6142 ceph_assert(!ps->is_primary());
6143
6144 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6145 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6146 post_event(DeleteStart());
6147 } else {
6148 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6149 }
6150 }
6151
6152 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6153 {
6154 DECLARE_LOCALS;
6155 MOSDPGLog *msg = logevt.msg.get();
6156 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6157
6158 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6159 if (msg->info.last_backfill == hobject_t()) {
6160 // restart backfill
6161 ps->info = msg->info;
6162 pl->on_info_history_change();
6163 ps->dirty_info = true;
6164 ps->dirty_big_info = true; // maybe.
6165
6166 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6167 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6168
6169 ps->pg_log.reset_backfill();
6170 } else {
6171 ps->merge_log(t, msg->info, msg->log, logevt.from);
6172 }
6173 if (logevt.msg->lease) {
6174 ps->proc_lease(*logevt.msg->lease);
6175 }
6176
6177 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6178
6179 post_event(Activate(logevt.msg->info.last_epoch_started));
6180 return transit<ReplicaActive>();
6181 }
6182
6183 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6184 {
6185 DECLARE_LOCALS;
6186 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6187
6188 if (ps->info.last_update > infoevt.info.last_update) {
6189 // rewind divergent log entries
6190 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6191 ps->rewind_divergent_log(t, infoevt.info.last_update);
6192 ps->info.stats = infoevt.info.stats;
6193 ps->info.hit_set = infoevt.info.hit_set;
6194 }
6195
6196 if (infoevt.lease) {
6197 ps->proc_lease(*infoevt.lease);
6198 }
6199
6200 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6201 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6202
6203 post_event(Activate(infoevt.info.last_epoch_started));
6204 return transit<ReplicaActive>();
6205 }
6206
6207 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6208 {
6209 DECLARE_LOCALS;
6210 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6211 return discard_event();
6212 }
6213
6214 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6215 {
6216 DECLARE_LOCALS;
6217 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6218 ps->info.history.refresh_prior_readable_until_ub(
6219 pl->get_mnow(), ps->prior_readable_until_ub);
6220 context< PeeringMachine >().send_notify(
6221 ps->get_primary().osd,
6222 pg_notify_t(
6223 ps->get_primary().shard, ps->pg_whoami.shard,
6224 ps->get_osdmap_epoch(),
6225 ps->get_osdmap_epoch(),
6226 ps->info,
6227 ps->past_intervals));
6228 }
6229 return discard_event();
6230 }
6231
6232 void PeeringState::Stray::exit()
6233 {
6234 context< PeeringMachine >().log_exit(state_name, enter_time);
6235 DECLARE_LOCALS;
6236 utime_t dur = ceph_clock_now() - enter_time;
6237 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6238 }
6239
6240
6241 /*--------ToDelete----------*/
6242 PeeringState::ToDelete::ToDelete(my_context ctx)
6243 : my_base(ctx),
6244 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6245 {
6246 context< PeeringMachine >().log_enter(state_name);
6247 DECLARE_LOCALS;
6248 pl->get_perf_logger().inc(l_osd_pg_removing);
6249 }
6250
6251 void PeeringState::ToDelete::exit()
6252 {
6253 context< PeeringMachine >().log_exit(state_name, enter_time);
6254 DECLARE_LOCALS;
6255 // note: on a successful removal, this path doesn't execute. see
6256 // _delete_some().
6257 pl->get_perf_logger().dec(l_osd_pg_removing);
6258
6259 pl->cancel_local_background_io_reservation();
6260 }
6261
6262 /*----WaitDeleteReserved----*/
6263 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6264 : my_base(ctx),
6265 NamedState(context< PeeringMachine >().state_history,
6266 "Started/ToDelete/WaitDeleteReseved")
6267 {
6268 context< PeeringMachine >().log_enter(state_name);
6269 DECLARE_LOCALS;
6270 context< ToDelete >().priority = ps->get_delete_priority();
6271
6272 pl->cancel_local_background_io_reservation();
6273 pl->request_local_background_io_reservation(
6274 context<ToDelete>().priority,
6275 std::make_shared<PGPeeringEvent>(
6276 ps->get_osdmap_epoch(),
6277 ps->get_osdmap_epoch(),
6278 DeleteReserved()),
6279 std::make_shared<PGPeeringEvent>(
6280 ps->get_osdmap_epoch(),
6281 ps->get_osdmap_epoch(),
6282 DeleteInterrupted()));
6283 }
6284
6285 boost::statechart::result PeeringState::ToDelete::react(
6286 const ActMap& evt)
6287 {
6288 DECLARE_LOCALS;
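// The delete priority can change as the OSD fills up; restart ToDelete
// so the reservation is re-queued at the new priority.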
6289 if (ps->get_delete_priority() != priority) {
6290 psdout(10) << __func__ << " delete priority changed, resetting"
6291 << dendl;
6292 return transit<ToDelete>();
6293 }
6294 return discard_event();
6295 }
6296
6297 void PeeringState::WaitDeleteReserved::exit()
6298 {
6299 context< PeeringMachine >().log_exit(state_name, enter_time);
6300 }
6301
6302 /*----Deleting-----*/
6303 PeeringState::Deleting::Deleting(my_context ctx)
6304 : my_base(ctx),
6305 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6306 {
6307 start = ceph::mono_clock::now();
6308
6309 context< PeeringMachine >().log_enter(state_name);
6310
6311 DECLARE_LOCALS;
6312 ps->deleting = true;
6313 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6314
6315 // clear log
6316 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6317 ps->pg_log.roll_forward(rollbacker.get());
6318
6319 // adjust info to backfill
6320 ps->info.set_last_backfill(hobject_t());
6321 ps->pg_log.reset_backfill();
6322 ps->dirty_info = true;
6323
6324 pl->on_removal(t);
6325 }
6326
6327 boost::statechart::result PeeringState::Deleting::react(
6328 const DeleteSome& evt)
6329 {
6330 DECLARE_LOCALS;
6331 next = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6332 next);
6333 return discard_event();
6334 }
6335
6336 void PeeringState::Deleting::exit()
6337 {
6338 context< PeeringMachine >().log_exit(state_name, enter_time);
6339 DECLARE_LOCALS;
6340 ps->deleting = false;
6341 pl->cancel_local_background_io_reservation();
6342 psdout(20) << "Deleting::" << __func__ << this <<" finished in "
6343 << ceph::mono_clock::now() - start
6344 << dendl;
6345 }
6346
6347 /*--------GetInfo---------*/
6348 PeeringState::GetInfo::GetInfo(my_context ctx)
6349 : my_base(ctx),
6350 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6351 {
6352 context< PeeringMachine >().log_enter(state_name);
6353
6354
6355 DECLARE_LOCALS;
6356 ps->check_past_interval_bounds();
6357 ps->log_weirdness();
6358 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6359
6360 ceph_assert(ps->blocked_by.empty());
6361
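// The prior set is the collection of peers from recent past intervals
// that may hold a more authoritative log; each one must be probed (or
// ruled out) before we can pick an auth log shard.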
6362 prior_set = ps->build_prior();
6363 ps->prior_readable_down_osds = prior_set.down;
6364 if (ps->prior_readable_down_osds.empty()) {
6365 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6366 << dendl;
6367 ps->clear_prior_readable_until_ub();
6368 }
6369
6370 ps->reset_min_peer_features();
6371 get_infos();
6372 if (prior_set.pg_down) {
6373 post_event(IsDown());
6374 } else if (peer_info_requested.empty()) {
6375 post_event(GotInfo());
6376 }
6377 }
6378
6379 void PeeringState::GetInfo::get_infos()
6380 {
6381 DECLARE_LOCALS;
6382 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6383
6384 ps->blocked_by.clear();
6385 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
6386 it != prior_set.probe.end();
6387 ++it) {
6388 pg_shard_t peer = *it;
6389 if (peer == ps->pg_whoami) {
6390 continue;
6391 }
6392 if (ps->peer_info.count(peer)) {
6393 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6394 continue;
6395 }
6396 if (peer_info_requested.count(peer)) {
6397 psdout(10) << " already requested info from osd." << peer << dendl;
6398 ps->blocked_by.insert(peer.osd);
6399 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6400 psdout(10) << " not querying info from down osd." << peer << dendl;
6401 } else {
6402 psdout(10) << " querying info from osd." << peer << dendl;
6403 context< PeeringMachine >().send_query(
6404 peer.osd,
6405 pg_query_t(pg_query_t::INFO,
6406 it->shard, ps->pg_whoami.shard,
6407 ps->info.history,
6408 ps->get_osdmap_epoch()));
6409 peer_info_requested.insert(peer);
6410 ps->blocked_by.insert(peer.osd);
6411 }
6412 }
6413
6414 ps->check_prior_readable_down_osds(ps->get_osdmap());
6415
6416 pl->publish_stats_to_osd();
6417 }
6418
6419 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6420 {
6421
6422 DECLARE_LOCALS;
6423
6424 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
6425 if (p != peer_info_requested.end()) {
6426 peer_info_requested.erase(p);
6427 ps->blocked_by.erase(infoevt.from.osd);
6428 }
6429
6430 epoch_t old_start = ps->info.history.last_epoch_started;
6431 if (ps->proc_replica_info(
6432 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6433 // we got something new ...
6434 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6435 if (old_start < ps->info.history.last_epoch_started) {
6436 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6437 prior_set = ps->build_prior();
6438 ps->prior_readable_down_osds = prior_set.down;
6439
6440 // filter out any osds that got dropped from the probe set from
6441 // peer_info_requested. this is less expensive than restarting
6442 // peering (which would re-probe everyone).
6443 set<pg_shard_t>::iterator p = peer_info_requested.begin();
6444 while (p != peer_info_requested.end()) {
6445 if (prior_set.probe.count(*p) == 0) {
6446 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6447 peer_info_requested.erase(p++);
6448 } else {
6449 ++p;
6450 }
6451 }
6452 get_infos();
6453 }
6454 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6455 << hex << infoevt.features << dec << dendl;
6456 ps->apply_peer_features(infoevt.features);
6457
6458 // are we done getting everything?
6459 if (peer_info_requested.empty() && !prior_set.pg_down) {
6460 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6461 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6462 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6463 post_event(GotInfo());
6464 }
6465 }
6466 return discard_event();
6467 }
6468
6469 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6470 {
6471 DECLARE_LOCALS;
6472 q.f->open_object_section("state");
6473 q.f->dump_string("name", state_name);
6474 q.f->dump_stream("enter_time") << enter_time;
6475
6476 q.f->open_array_section("requested_info_from");
6477 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
6478 p != peer_info_requested.end();
6479 ++p) {
6480 q.f->open_object_section("osd");
6481 q.f->dump_stream("osd") << *p;
6482 if (ps->peer_info.count(*p)) {
6483 q.f->open_object_section("got_info");
6484 ps->peer_info[*p].dump(q.f);
6485 q.f->close_section();
6486 }
6487 q.f->close_section();
6488 }
6489 q.f->close_section();
6490
6491 q.f->close_section();
6492 return forward_event();
6493 }
6494
6495 void PeeringState::GetInfo::exit()
6496 {
6497 context< PeeringMachine >().log_exit(state_name, enter_time);
6498
6499 DECLARE_LOCALS;
6500 utime_t dur = ceph_clock_now() - enter_time;
6501 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6502 ps->blocked_by.clear();
6503 }
6504
6505 /*------GetLog------------*/
6506 PeeringState::GetLog::GetLog(my_context ctx)
6507 : my_base(ctx),
6508 NamedState(
6509 context< PeeringMachine >().state_history,
6510 "Started/Primary/Peering/GetLog"),
6511 msg(0)
6512 {
6513 context< PeeringMachine >().log_enter(state_name);
6514
6515 DECLARE_LOCALS;
6516
6517 ps->log_weirdness();
6518
6519 // adjust acting?
6520 if (!ps->choose_acting(auth_log_shard, false,
6521 &context< Peering >().history_les_bound)) {
6522 if (!ps->want_acting.empty()) {
6523 post_event(NeedActingChange());
6524 } else {
6525 post_event(IsIncomplete());
6526 }
6527 return;
6528 }
6529
6530 // am i the best?
6531 if (auth_log_shard == ps->pg_whoami) {
6532 post_event(GotLog());
6533 return;
6534 }
6535
6536 const pg_info_t& best = ps->peer_info[auth_log_shard];
6537
6538 // am i broken?
6539 if (ps->info.last_update < best.log_tail) {
6540 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6541 post_event(IsIncomplete());
6542 return;
6543 }
6544
6545 // how much log to request?
6546 eversion_t request_log_from = ps->info.last_update;
6547 ceph_assert(!ps->acting_recovery_backfill.empty());
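// Walk the peers to lower request_log_from far enough that the fetched
// auth log will also cover any peer whose log we ourselves cannot roll
// forward (their last_update is behind our log_tail).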
6548 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
6549 p != ps->acting_recovery_backfill.end();
6550 ++p) {
6551 if (*p == ps->pg_whoami) continue;
6552 pg_info_t& ri = ps->peer_info[*p];
6553 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6554 ri.last_update < request_log_from)
6555 request_log_from = ri.last_update;
6556 }
6557
6558 // request the log from the auth shard
6559 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6560 context<PeeringMachine>().send_query(
6561 auth_log_shard.osd,
6562 pg_query_t(
6563 pg_query_t::LOG,
6564 auth_log_shard.shard, ps->pg_whoami.shard,
6565 request_log_from, ps->info.history,
6566 ps->get_osdmap_epoch()));
6567
6568 ceph_assert(ps->blocked_by.empty());
6569 ps->blocked_by.insert(auth_log_shard.osd);
6570 pl->publish_stats_to_osd();
6571 }
6572
6573 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6574 {
6575 // make sure our log source didn't go down. we need to check
6576 // explicitly because it may not be part of the prior set, which
6577 // means the Peering state check won't catch it going down.
6578 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6579 psdout(10) << "GetLog: auth_log_shard osd."
6580 << auth_log_shard.osd << " went down" << dendl;
6581 post_event(advmap);
6582 return transit< Reset >();
6583 }
6584
6585 // let the Peering state do its checks.
6586 return forward_event();
6587 }
6588
6589 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6590 {
6591 ceph_assert(!msg);
6592 if (logevt.from != auth_log_shard) {
6593 psdout(10) << "GetLog: discarding log from "
6594 << "non-auth_log_shard osd." << logevt.from << dendl;
6595 return discard_event();
6596 }
6597 psdout(10) << "GetLog: received master log from osd."
6598 << logevt.from << dendl;
6599 msg = logevt.msg;
6600 post_event(GotLog());
6601 return discard_event();
6602 }
6603
6604 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6605 {
6606
6607 DECLARE_LOCALS;
6608 psdout(10) << "leaving GetLog" << dendl;
6609 if (msg) {
6610 psdout(10) << "processing master log" << dendl;
6611 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6612 msg->info, msg->log, msg->missing,
6613 auth_log_shard);
6614 }
6615 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6616 return transit< GetMissing >();
6617 }
6618
6619 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6620 {
6621 q.f->open_object_section("state");
6622 q.f->dump_string("name", state_name);
6623 q.f->dump_stream("enter_time") << enter_time;
6624 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6625 q.f->close_section();
6626 return forward_event();
6627 }
6628
6629 void PeeringState::GetLog::exit()
6630 {
6631 context< PeeringMachine >().log_exit(state_name, enter_time);
6632
6633 DECLARE_LOCALS;
6634 utime_t dur = ceph_clock_now() - enter_time;
6635 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6636 ps->blocked_by.clear();
6637 }
6638
6639 /*------WaitActingChange--------*/
6640 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6641 : my_base(ctx),
6642 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6643 {
6644 context< PeeringMachine >().log_enter(state_name);
6645 }
6646
6647 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
6648 {
6649 DECLARE_LOCALS;
6650 OSDMapRef osdmap = advmap.osdmap;
6651
6652 psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
6653 for (vector<int>::iterator p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
6654 if (!osdmap->is_up(*p)) {
6655 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
6656 post_event(advmap);
6657 return transit< Reset >();
6658 }
6659 }
6660 return forward_event();
6661 }
6662
6663 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
6664 {
6665 psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
6666 return discard_event();
6667 }
6668
6669 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
6670 {
6671 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
6672 return discard_event();
6673 }
6674
6675 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
6676 {
6677 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
6678 return discard_event();
6679 }
6680
6681 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
6682 {
6683 q.f->open_object_section("state");
6684 q.f->dump_string("name", state_name);
6685 q.f->dump_stream("enter_time") << enter_time;
6686 q.f->dump_string("comment", "waiting for pg acting set to change");
6687 q.f->close_section();
6688 return forward_event();
6689 }
6690
6691 void PeeringState::WaitActingChange::exit()
6692 {
6693 context< PeeringMachine >().log_exit(state_name, enter_time);
6694 DECLARE_LOCALS;
6695 utime_t dur = ceph_clock_now() - enter_time;
6696 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
6697 }
6698
6699 /*------Down--------*/
6700 PeeringState::Down::Down(my_context ctx)
6701 : my_base(ctx),
6702 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
6703 {
6704 context< PeeringMachine >().log_enter(state_name);
6705 DECLARE_LOCALS;
6706
6707 ps->state_clear(PG_STATE_PEERING);
6708 ps->state_set(PG_STATE_DOWN);
6709
6710 auto &prior_set = context< Peering >().prior_set;
6711 ceph_assert(ps->blocked_by.empty());
6712 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6713 pl->publish_stats_to_osd();
6714 }
6715
6716 void PeeringState::Down::exit()
6717 {
6718 context< PeeringMachine >().log_exit(state_name, enter_time);
6719
6720 DECLARE_LOCALS;
6721
6722 ps->state_clear(PG_STATE_DOWN);
6723 utime_t dur = ceph_clock_now() - enter_time;
6724 pl->get_peering_perf().tinc(rs_down_latency, dur);
6725
6726 ps->blocked_by.clear();
6727 }
6728
6729 boost::statechart::result PeeringState::Down::react(const QueryState& q)
6730 {
6731 q.f->open_object_section("state");
6732 q.f->dump_string("name", state_name);
6733 q.f->dump_stream("enter_time") << enter_time;
6734 q.f->dump_string("comment",
6735 "not enough up instances of this PG to go active");
6736 q.f->close_section();
6737 return forward_event();
6738 }
6739
6740 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
6741 {
6742 DECLARE_LOCALS;
6743
6744 ceph_assert(ps->is_primary());
6745 epoch_t old_start = ps->info.history.last_epoch_started;
6746 if (!ps->peer_info.count(infoevt.from) &&
6747 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
6748 ps->update_history(infoevt.notify.info.history);
6749 }
6750 // if we got something new to make pg escape down state
6751 if (ps->info.history.last_epoch_started > old_start) {
6752 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
6753 ps->state_clear(PG_STATE_DOWN);
6754 ps->state_set(PG_STATE_PEERING);
6755 return transit< GetInfo >();
6756 }
6757
6758 return discard_event();
6759 }
6760
6761
6762 /*------Incomplete--------*/
6763 PeeringState::Incomplete::Incomplete(my_context ctx)
6764 : my_base(ctx),
6765 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
6766 {
6767 context< PeeringMachine >().log_enter(state_name);
6768 DECLARE_LOCALS;
6769
6770 ps->state_clear(PG_STATE_PEERING);
6771 ps->state_set(PG_STATE_INCOMPLETE);
6772
6773 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6774 ceph_assert(ps->blocked_by.empty());
6775 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6776 pl->publish_stats_to_osd();
6777 }
6778
6779 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
6780 DECLARE_LOCALS;
6781 int64_t poolnum = ps->info.pgid.pool();
6782
6783 // Reset if min_size became smaller than the previous value; the pg might now be able to go active
6784 if (!advmap.osdmap->have_pg_pool(poolnum) ||
6785 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
6786 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
6787 post_event(advmap);
6788 return transit< Reset >();
6789 }
6790
6791 return forward_event();
6792 }
6793
6794 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
6795 DECLARE_LOCALS;
6796 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
6797 if (ps->proc_replica_info(
6798 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
6799 // We got something new, try again!
6800 return transit< GetLog >();
6801 } else {
6802 return discard_event();
6803 }
6804 }
6805
6806 boost::statechart::result PeeringState::Incomplete::react(
6807 const QueryState& q)
6808 {
6809 q.f->open_object_section("state");
6810 q.f->dump_string("name", state_name);
6811 q.f->dump_stream("enter_time") << enter_time;
6812 q.f->dump_string("comment", "not enough complete instances of this PG");
6813 q.f->close_section();
6814 return forward_event();
6815 }
6816
6817 void PeeringState::Incomplete::exit()
6818 {
6819 context< PeeringMachine >().log_exit(state_name, enter_time);
6820
6821 DECLARE_LOCALS;
6822
6823 ps->state_clear(PG_STATE_INCOMPLETE);
6824 utime_t dur = ceph_clock_now() - enter_time;
6825 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
6826
6827 ps->blocked_by.clear();
6828 }
6829
6830 /*------GetMissing--------*/
6831 PeeringState::GetMissing::GetMissing(my_context ctx)
6832 : my_base(ctx),
6833 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
6834 {
6835 context< PeeringMachine >().log_enter(state_name);
6836
6837 DECLARE_LOCALS;
6838 ps->log_weirdness();
6839 ceph_assert(!ps->acting_recovery_backfill.empty());
6840 eversion_t since;
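// Walk every shard in the acting/recovery/backfill set and decide
// whether its missing set can be inferred locally or must be fetched
// with a LOG/FULLLOG query.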
6841 for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
6842 i != ps->acting_recovery_backfill.end();
6843 ++i) {
6844 if (*i == ps->get_primary()) continue;
6845 const pg_info_t& pi = ps->peer_info[*i];
6846 // Reset this to make sure the pg_missing_t is initialized and
6847 // has the correct semantics even if we don't need to get a
6848 // missing set from a shard. This way later additions due to
6849 // lost+unfound deletes work properly.
6850 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
6851
6852 if (pi.is_empty())
6853 continue; // no pg data, nothing divergent
6854
6855 if (pi.last_update < ps->pg_log.get_tail()) {
6856 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
6857 ps->peer_missing[*i].clear();
6858 continue;
6859 }
6860 if (pi.last_backfill == hobject_t()) {
6861 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
6862 ps->peer_missing[*i].clear();
6863 continue;
6864 }
6865
6866 if (pi.last_update == pi.last_complete && // peer has no missing
6867 pi.last_update == ps->info.last_update) { // peer is up to date
6868 // The replica has no missing objects and an identical log to ours;
6869 // there is nothing to pull.
6870 // FIXME: we can do better here. if last_update==last_complete we
6871 // can infer the rest!
6872 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
6873 ps->peer_missing[*i].clear();
6874 continue;
6875 }
6876
6877 // We pull the log from the peer's last_epoch_started to ensure we
6878 // get enough log to detect divergent updates.
6879 since.epoch = pi.last_epoch_started;
6880 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
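// 'since' has version 0, so pi.log_tail <= since means the peer's log
// already extends back to the start of last_epoch_started and an
// incremental log is enough; otherwise ask for the full log.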
6881 if (pi.log_tail <= since) {
6882 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
6883 context< PeeringMachine >().send_query(
6884 i->osd,
6885 pg_query_t(
6886 pg_query_t::LOG,
6887 i->shard, ps->pg_whoami.shard,
6888 since, ps->info.history,
6889 ps->get_osdmap_epoch()));
6890 } else {
6891 psdout(10) << " requesting fulllog+missing from osd." << *i
6892 << " (want since " << since << " < log.tail "
6893 << pi.log_tail << ")" << dendl;
6894 context< PeeringMachine >().send_query(
6895 i->osd, pg_query_t(
6896 pg_query_t::FULLLOG,
6897 i->shard, ps->pg_whoami.shard,
6898 ps->info.history, ps->get_osdmap_epoch()));
6899 }
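// Either way we are now waiting on this shard: record the outstanding
// request and mark the OSD as blocking peering.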
6900 peer_missing_requested.insert(*i);
6901 ps->blocked_by.insert(i->osd);
6902 }
6903
6904 if (peer_missing_requested.empty()) {
6905 if (ps->need_up_thru) {
6906 psdout(10) << " still need up_thru update before going active"
6907 << dendl;
6908 post_event(NeedUpThru());
6909 return;
6910 }
6911
6912 // all good!
6913 post_event(Activate(ps->get_osdmap_epoch()));
6914 } else {
6915 pl->publish_stats_to_osd();
6916 }
6917 }
6918
6919 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
6920 {
6921 DECLARE_LOCALS;
6922
6923 peer_missing_requested.erase(logevt.from);
6924 ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
6925
6926 if (peer_missing_requested.empty()) {
6927 if (ps->need_up_thru) {
6928 psdout(10) << " still need up_thru update before going active"
6929 << dendl;
6930 post_event(NeedUpThru());
6931 } else {
6932 psdout(10) << "Got last missing, don't need any more; "
6933 << "posting Activate" << dendl;
6934 post_event(Activate(ps->get_osdmap_epoch()));
6935 }
6936 }
6937 return discard_event();
6938 }
6939
6940 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
6941 {
6942 DECLARE_LOCALS;
6943 q.f->open_object_section("state");
6944 q.f->dump_string("name", state_name);
6945 q.f->dump_stream("enter_time") << enter_time;
6946
6947 q.f->open_array_section("peer_missing_requested");
6948 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
6949 p != peer_missing_requested.end();
6950 ++p) {
6951 q.f->open_object_section("osd");
6952 q.f->dump_stream("osd") << *p;
6953 if (ps->peer_missing.count(*p)) {
6954 q.f->open_object_section("got_missing");
6955 ps->peer_missing[*p].dump(q.f);
6956 q.f->close_section();
6957 }
6958 q.f->close_section();
6959 }
6960 q.f->close_section();
6961
6962 q.f->close_section();
6963 return forward_event();
6964 }
6965
6966 void PeeringState::GetMissing::exit()
6967 {
6968 context< PeeringMachine >().log_exit(state_name, enter_time);
6969
6970 DECLARE_LOCALS;
6971 utime_t dur = ceph_clock_now() - enter_time;
6972 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
6973 ps->blocked_by.clear();
6974 }
6975
6976 /*------WaitUpThru--------*/
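// up_thru is the monitors' record of the most recent epoch through
// which this OSD may have served PGs. Peering cannot complete until the
// osdmap reflects an up_thru covering the current interval; otherwise a
// later peering attempt could wrongly conclude that this interval never
// went active.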
6977 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
6978 : my_base(ctx),
6979 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
6980 {
6981 context< PeeringMachine >().log_enter(state_name);
6982 }
6983
6984 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
6985 {
6986 DECLARE_LOCALS;
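// ActMap signals a new osdmap; once it reflects our up_thru,
// need_up_thru is cleared and we can finally activate.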
6987 if (!ps->need_up_thru) {
6988 post_event(Activate(ps->get_osdmap_epoch()));
6989 }
6990 return forward_event();
6991 }
6992
6993 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
6994 {
6995 DECLARE_LOCALS;
6996 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
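// A log+missing reply can still arrive while we wait on up_thru; stash
// the peer's missing set and info so they are ready once we activate.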
6997 ps->peer_missing[logevt.from].claim(logevt.msg->missing);
6998 ps->peer_info[logevt.from] = logevt.msg->info;
6999 return discard_event();
7000 }
7001
7002 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7003 {
7004 q.f->open_object_section("state");
7005 q.f->dump_string("name", state_name);
7006 q.f->dump_stream("enter_time") << enter_time;
7007 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7008 q.f->close_section();
7009 return forward_event();
7010 }
7011
7012 void PeeringState::WaitUpThru::exit()
7013 {
7014 context< PeeringMachine >().log_exit(state_name, enter_time);
7015 DECLARE_LOCALS;
7016 utime_t dur = ceph_clock_now() - enter_time;
7017 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7018 }
7019
7020 /*----PeeringState::PeeringMachine Methods-----*/
7021 #undef dout_prefix
7022 #define dout_prefix dpp->gen_prefix(*_dout)
7023
7024 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7025 {
7026 DECLARE_LOCALS;
7027 psdout(5) << "enter " << state_name << dendl;
7028 pl->log_state_enter(state_name);
7029 }
7030
7031 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7032 {
7033 DECLARE_LOCALS;
7034 utime_t dur = ceph_clock_now() - enter_time;
7035 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7036 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7037 event_count = 0;
7038 event_time = utime_t();
7039 }
7040
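// Compact one-line summary of the PG's peering state. Abbreviations:
// r=role, lpr=last_peering_reset, pi=past_intervals,
// luod=last_update_ondisk, lua=last_update_applied,
// crt=can_rollback_to, lcod=last_complete_ondisk,
// mlcod=min_last_complete_ondisk, pruub=prior_readable_until_ub.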
7041 ostream &operator<<(ostream &out, const PeeringState &ps) {
7042 out << "pg[" << ps.info
7043 << " " << pg_vector_string(ps.up);
7044 if (ps.acting != ps.up)
7045 out << "/" << pg_vector_string(ps.acting);
7046 if (ps.is_ec_pg())
7047 out << "p" << ps.get_primary();
7048 if (!ps.async_recovery_targets.empty())
7049 out << " async=[" << ps.async_recovery_targets << "]";
7050 if (!ps.backfill_targets.empty())
7051 out << " backfill=[" << ps.backfill_targets << "]";
7052 out << " r=" << ps.get_role();
7053 out << " lpr=" << ps.get_last_peering_reset();
7054
7055 if (ps.deleting)
7056 out << " DELETING";
7057
7058 if (!ps.past_intervals.empty()) {
7059 out << " pi=[" << ps.past_intervals.get_bounds()
7060 << ")/" << ps.past_intervals.size();
7061 }
7062
7063 if (ps.is_peered()) {
7064 if (ps.last_update_ondisk != ps.info.last_update)
7065 out << " luod=" << ps.last_update_ondisk;
7066 if (ps.last_update_applied != ps.info.last_update)
7067 out << " lua=" << ps.last_update_applied;
7068 }
7069
7070 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7071 ps.pg_log.get_head() != ps.info.last_update)
7072 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7073
7074 if (!ps.pg_log.get_log().empty()) {
7075 if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7076 out << " (log bound mismatch, actual=["
7077 << ps.pg_log.get_log().log.begin()->version << ","
7078 << ps.pg_log.get_log().log.rbegin()->version << "]";
7079 out << ")";
7080 }
7081 }
7082
7083 out << " crt=" << ps.pg_log.get_can_rollback_to();
7084
7085 if (ps.last_complete_ondisk != ps.info.last_complete)
7086 out << " lcod " << ps.last_complete_ondisk;
7087
7088 out << " mlcod " << ps.min_last_complete_ondisk;
7089
7090 out << " " << pg_state_string(ps.get_state());
7091 if (ps.should_send_notify())
7092 out << " NOTIFY";
7093
7094 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7095 out << " pruub " << ps.prior_readable_until_ub
7096 << "@" << ps.get_prior_readable_down_osds();
7097 }
7098 return out;
7099 }