// ceph/src/osd/PeeringState.cc: Ceph 15.2.2 (octopus) source
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd

BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}

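// Wire compatibility note: octopus and later peers understand the
// spg_t-addressed v2 messages (MOSDPGNotify2, MOSDPGQuery2, MOSDPGInfo2),
// while older peers get the legacy per-epoch container forms;
// require_osd_release selects the encoding in the senders below.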
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}

void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(to,
                     make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}

void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack)
      );
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}})
      );
  }
}

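// Refresh the cached pool metadata from a new osdmap. The snap context is
// rebuilt only when we may have skipped an epoch (cached_epoch is stale) or
// when the pool's snaps changed in this very epoch; otherwise the cached
// snapc is still valid.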
void PGPool::update(CephContext *cct, OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}

/*-------------Peering State Helpers----------------*/
#undef dout_prefix
#define dout_prefix (dpp->gen_prefix(*_dout))
#undef psdout
#define psdout(x) ldout(cct, x)

PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    info(spgid),
    pg_log(cct),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}

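// Event-handling context lifecycle: every peering event is bracketed by
// start_handle()/end_handle(). While a flush is pending, outgoing messages
// are diverted into messages_pending_flush (begin_block_outgoing) and later
// either merged back into the caller's PeeringCtx (end_block_outgoing) or
// dropped (clear_blocked_outgoing). A rough sketch of a caller, under the
// assumption that handle_event() wraps these calls:
//
//   state.start_handle(&rctx);
//   machine.process_event(evt);   // may begin/end blocking internally
//   state.end_handle();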
void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}

void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}

void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}

void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}

void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to (or currently) pull
   * objects from have been dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
       i != peer_log_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
       i != peer_missing_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}

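// Merge a peer's pg_history_t into ours. If the merge advances anything, the
// info is marked dirty; and once last_epoch_clean has caught up to the
// current interval, past_intervals are no longer needed and are dropped.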
void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}

void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
       ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}

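// Process a pg_info_t received from a replica. Returns false (and ignores
// the info) if it duplicates what we already have or if the sender has not
// stayed up since send_epoch; otherwise records it, merges its history, and
// flags the sender as a stray if it is in neither up nor acting.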
bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}


void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_purged.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}

void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}

void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    pg_log,
    dirty_info,
    dirty_big_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}

void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(cct, osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval();
  last_require_osd_release = osdmap->require_osd_release;
}

void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blacklist_entries()) {
    pl->check_blacklisted_watchers();
  }
}

void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}

void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}

void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}

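// Peering must restart whenever the map transition constitutes a new
// interval (an up/acting/primary change, as judged by
// PastIntervals::is_new_interval) or when this OSD itself just came back up.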
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}

/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary()) {
    pl->clear_ready_to_merge();
  }


  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // where it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know, if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}

void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}

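// Rebuild actingset/upset and the primary shards from the raw OSD vectors.
// For erasure-coded pools the position in the vector is the shard id, so
// e.g. acting = [3, CRUSH_ITEM_NONE, 7] yields actingset = {3(0), 7(2)};
// replicated pools always use NO_SHARD.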
void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}

void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}


void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}

void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_bytes.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;
  missing_loc.clear();
  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}

/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}

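// Sanity-check that the stored past_intervals cover exactly the required
// [start, end) range computed above; an inconsistency here means the PG
// metadata is corrupt, so we log to the cluster log and assert/abort rather
// than limp along.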
void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}

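// Worked example (illustrative; assumes the usual Ceph constants
// OSD_POOL_PRIORITY_MIN = -10, OSD_POOL_PRIORITY_MAX = 10,
// OSD_RECOVERY_PRIORITY_MIN = 0):
//   clamp_recovery_priority(180, 25, 210)
//     pool priority 25 -> clamped to 10 -> shifted by +10 -> 20
//     180 + 20 = 200, already within [0, 210] -> returns 200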
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}

unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

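// Backfill priority tiers, highest first: forced > inactive (acting below
// min_size, which blocks IO) > undersized/degraded > baseline, each adjusted
// by the pool's RECOVERY_PRIORITY option via clamp_recovery_priority().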
unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (acting.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - acting.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}

unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}

bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}

bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}

void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}

void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      new MOSDPGLease(epoch,
                      spg_t(spgid.pgid, peer.shard),
                      get_lease()),
      epoch);
  }
}

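// A replica receiving a lease converts the primary's clock values into its
// own time frame using the heartbeat clock-delta bounds: the conservative
// readable_until uses peer_clock_delta_ub, while the upper bound uses
// peer_clock_delta_lb, falling back to mnow + interval when no lower bound
// on the clock delta is known.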
void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}

void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}

void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}

void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}

bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}

bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

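// Build the prior set from past_intervals: the set of OSDs we must probe
// before the PG can go active, plus whether any past interval is blocked on
// a down OSD (pg_down). As a side effect, decides whether we must first
// publish up_thru to the monitor before peering can complete.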
PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }
  pl->set_probe_targets(prior.probe);
  return prior;
}

bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
  set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that the only OSDs that might need backfill
  // are those in backfill_targets.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}

/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}


void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}

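// EC acting-set selection is positional: each shard slot i is filled
// independently, preferring up[i], then acting[i], then any stray that holds
// the shard, as long as the candidate is complete and its log overlaps the
// authoritative log (last_update >= auth log_tail); a present but stale
// up[i] becomes a backfill target instead.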
void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second
     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
      auth_log_shard->second.log_tail) {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version) {
        approx_missing_objects += auth_version - primary_version;
      } else {
        approx_missing_objects += primary_version - auth_version;
      }
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects) {
        primary = auth_log_shard;
        ss << "up_primary: " << up_primary << " has approximately "
           << approx_missing_objects
           << " (>" << force_auth_primary_missing_objects << ") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      } else {
        ss << "up_primary: " << up_primary << " selected as primary"
           << std::endl;
      }
    } else {
      ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;
  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
                             const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    vector<int>::const_iterator acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p: candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}

1815 bool PeeringState::recoverable(const vector<int> &want) const
1816 {
1817 unsigned num_want_acting = 0;
1818 set<pg_shard_t> have;
1819 for (int i = 0; i < (int)want.size(); ++i) {
1820 if (want[i] != CRUSH_ITEM_NONE) {
1821 ++num_want_acting;
1822 have.insert(
1823 pg_shard_t(
1824 want[i],
1825 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1826 }
1827 }
1828
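  // For illustration (assumed values): in a k=2,m=1 EC pool,
  // want = [3, CRUSH_ITEM_NONE, 5] yields num_want_acting = 2 and
  // have = { 3(0), 5(2) }; for a replicated pool the shard is always
  // NO_SHARD.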
1829 if (num_want_acting < pool.info.min_size) {
1830 const bool recovery_ec_pool_below_min_size =
1831 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
1832
1833 if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
1834 psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
1835 return false;
1836 } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
1837 psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
1838 return false;
1839 }
1840 }
1841 if (missing_loc.get_recoverable_predicate()(have)) {
1842 return true;
1843 } else {
1844 psdout(10) << __func__ << " failed, not recoverable " << dendl;
1845 return false;
1846 }
1847 }
1848
1849 void PeeringState::choose_async_recovery_ec(
1850 const map<pg_shard_t, pg_info_t> &all_info,
1851 const pg_info_t &auth_info,
1852 vector<int> *want,
1853 set<pg_shard_t> *async_recovery,
1854 const OSDMapRef osdmap) const
1855 {
1856 set<pair<int, pg_shard_t> > candidates_by_cost;
1857 for (uint8_t i = 0; i < want->size(); ++i) {
1858 if ((*want)[i] == CRUSH_ITEM_NONE)
1859 continue;
1860
1861 // Considering log entries to recover is accurate enough for
1862 // now. We could use minimum_to_decode_with_cost() later if
1863 // necessary.
1864 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1865 // do not include strays
1866 if (stray_set.find(shard_i) != stray_set.end())
1867 continue;
1868 // Do not include an osd that is not up, since choosing it as
1869 // an async_recovery_target will move it out of the acting set.
1870 // This results in it being identified as a stray during peering,
1871 // because it is no longer in the up or acting set.
1872 if (!is_up(shard_i))
1873 continue;
1874 auto shard_info = all_info.find(shard_i)->second;
1875 // for ec pools we roll back all entries past the authoritative
1876 // last_update *before* activation. This is relatively inexpensive
1877 // compared to recovery, since it is purely local, so treat shards
1878 // past the authoritative last_update the same as those equal to it.
1879 version_t auth_version = auth_info.last_update.version;
1880 version_t candidate_version = shard_info.last_update.version;
1881 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1882 auto approx_missing_objects =
1883 shard_info.stats.stats.sum.num_objects_missing;
1884 if (auth_version > candidate_version) {
1885 approx_missing_objects += auth_version - candidate_version;
1886 }
1887 if (static_cast<uint64_t>(approx_missing_objects) >
1888 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1889 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1890 }
1891 } else {
1892 if (auth_version > candidate_version &&
1893 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1894 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1895 }
1896 }
1897 }
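  // Worked example (illustrative): auth last_update 120'500 vs candidate
  // 120'460 with 15 missing objects gives an approximate cost of
  // 40 + 15 = 55; with osd_async_recovery_min_cost at its (assumed)
  // default of 100, that shard would not become an async recovery
  // candidate.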
1898
1899 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1900 << dendl;
1901
1902 // take out as many osds as we can for async recovery, in order of cost
1903 for (auto rit = candidates_by_cost.rbegin();
1904 rit != candidates_by_cost.rend(); ++rit) {
1905 pg_shard_t cur_shard = rit->second;
1906 vector<int> candidate_want(*want);
1907 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1908 if (recoverable(candidate_want)) {
1909 want->swap(candidate_want);
1910 async_recovery->insert(cur_shard);
1911 }
1912 }
1913 psdout(20) << __func__ << " result want=" << *want
1914 << " async_recovery=" << *async_recovery << dendl;
1915 }
1916
1917 void PeeringState::choose_async_recovery_replicated(
1918 const map<pg_shard_t, pg_info_t> &all_info,
1919 const pg_info_t &auth_info,
1920 vector<int> *want,
1921 set<pg_shard_t> *async_recovery,
1922 const OSDMapRef osdmap) const
1923 {
1924 set<pair<int, pg_shard_t> > candidates_by_cost;
1925 for (auto osd_num : *want) {
1926 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1927 // do not include strays
1928 if (stray_set.find(shard_i) != stray_set.end())
1929 continue;
1930 // Do not include an osd that is not up, since choosing it as
1931 // an async_recovery_target will move it out of the acting set.
1932 // This results in it being identified as a stray during peering,
1933 // because it is no longer in the up or acting set.
1934 if (!is_up(shard_i))
1935 continue;
1936 auto shard_info = all_info.find(shard_i)->second;
1937 // use the approximate magnitude of the difference in length of
1938 // logs plus historical missing objects as the cost of recovery
1939 version_t auth_version = auth_info.last_update.version;
1940 version_t candidate_version = shard_info.last_update.version;
1941 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1942 auto approx_missing_objects =
1943 shard_info.stats.stats.sum.num_objects_missing;
1944 if (auth_version > candidate_version) {
1945 approx_missing_objects += auth_version - candidate_version;
1946 } else {
1947 approx_missing_objects += candidate_version - auth_version;
1948 }
1949 if (static_cast<uint64_t>(approx_missing_objects) >
1950 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1951 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1952 }
1953 } else {
1954 size_t approx_entries;
1955 if (auth_version > candidate_version) {
1956 approx_entries = auth_version - candidate_version;
1957 } else {
1958 approx_entries = candidate_version - auth_version;
1959 }
1960 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1962 }
1963 }
1964 }
1965
1966 psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1967 << dendl;
1968 // take out as many osds as we can for async recovery, in order of cost
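  // Note (illustrative): the min_size guard below means that with, e.g.,
  // size=3 and min_size=2, at most one OSD can be moved to async recovery.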
1969 for (auto rit = candidates_by_cost.rbegin();
1970 rit != candidates_by_cost.rend(); ++rit) {
1971 if (want->size() <= pool.info.min_size) {
1972 break;
1973 }
1974 pg_shard_t cur_shard = rit->second;
1975 vector<int> candidate_want(*want);
1976 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1977 if (*it == cur_shard.osd) {
1978 candidate_want.erase(it);
1979 want->swap(candidate_want);
1980 async_recovery->insert(cur_shard);
1981 break;
1982 }
1983 }
1984 }
1985 psdout(20) << __func__ << " result want=" << *want
1986 << " async_recovery=" << *async_recovery << dendl;
1987 }
1988
1989
1990
1991 /**
1992 * choose acting
1993 *
1994 * calculate the desired acting, and request a change with the monitor
1995 * if it differs from the current acting.
1996 *
1997 * if restrict_to_up_acting=true, we filter out anything that's not in
1998 * up/acting. in order to lift this restriction, we need to
1999 * 1) check whether it's worth switching the acting set any time we get
2000 * a new pg info (not just here, when recovery finishes)
2001 * 2) check whether anything in want_acting went down on each new map
2002 * (and, if so, calculate a new want_acting)
2003 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2004 * TODO!
2005 */
2006 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2007 bool restrict_to_up_acting,
2008 bool *history_les_bound,
2009 bool request_pg_temp_change_only)
2010 {
2011 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2012 all_info[pg_whoami] = info;
2013
2014 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2015 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
2016 p != all_info.end();
2017 ++p) {
2018 psdout(10) << __func__ << " all_info osd." << p->first << " "
2019 << p->second << dendl;
2020 }
2021 }
2022
2023 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
2024 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
2025
2026 if (auth_log_shard == all_info.end()) {
2027 if (up != acting) {
2028 psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2029 << " reverting to up" << dendl;
2030 want_acting = up;
2031 vector<int> empty;
2032 pl->queue_want_pg_temp(empty);
2033 } else {
2034 psdout(10) << __func__ << " failed" << dendl;
2035 ceph_assert(want_acting.empty());
2036 }
2037 return false;
2038 }
2039
2040 ceph_assert(!auth_log_shard->second.is_incomplete());
2041 auth_log_shard_id = auth_log_shard->first;
2042
2043 set<pg_shard_t> want_backfill, want_acting_backfill;
2044 vector<int> want;
2045 stringstream ss;
2046 if (pool.info.is_replicated())
2047 calc_replicated_acting(
2048 auth_log_shard,
2049 cct->_conf.get_val<uint64_t>(
2050 "osd_force_auth_primary_missing_objects"),
2051 get_osdmap()->get_pg_size(info.pgid.pgid),
2052 acting,
2053 up,
2054 up_primary,
2055 all_info,
2056 restrict_to_up_acting,
2057 &want,
2058 &want_backfill,
2059 &want_acting_backfill,
2060 get_osdmap(),
2061 ss);
2062 else
2063 calc_ec_acting(
2064 auth_log_shard,
2065 get_osdmap()->get_pg_size(info.pgid.pgid),
2066 acting,
2067 up,
2068 all_info,
2069 restrict_to_up_acting,
2070 &want,
2071 &want_backfill,
2072 &want_acting_backfill,
2073 ss);
2074 psdout(10) << ss.str() << dendl;
2075
2076 if (!recoverable(want)) {
2077 want_acting.clear();
2078 return false;
2079 }
2080
2081 set<pg_shard_t> want_async_recovery;
2082 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2083 if (pool.info.is_erasure()) {
2084 choose_async_recovery_ec(
2085 all_info, auth_log_shard->second, &want, &want_async_recovery,
2086 get_osdmap());
2087 } else {
2088 choose_async_recovery_replicated(
2089 all_info, auth_log_shard->second, &want, &want_async_recovery,
2090 get_osdmap());
2091 }
2092 }
2093 while (want.size() > pool.info.size) {
2094 // async recovery should have taken out as many osds as it can.
2095 // if not, then always evict the last peer
2096 // (will get synchronously recovered later)
2097 psdout(10) << __func__ << " evicting osd." << want.back()
2098 << " from oversized want " << want << dendl;
2099 want.pop_back();
2100 }
2101 if (want != acting) {
2102 psdout(10) << __func__ << " want " << want << " != acting " << acting
2103 << ", requesting pg_temp change" << dendl;
2104 want_acting = want;
2105
2106 if (!cct->_conf->osd_debug_no_acting_change) {
2107 if (want_acting == up) {
2108 // There can't be any pending backfill if
2109 // want is the same as crush map up OSDs.
2110 ceph_assert(want_backfill.empty());
2111 vector<int> empty;
2112 pl->queue_want_pg_temp(empty);
2113 } else
2114 pl->queue_want_pg_temp(want);
2115 }
2116 return false;
2117 }
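  // Returning false here does not fail peering permanently: roughly, once
  // the monitor publishes an osdmap with the requested pg_temp mapping,
  // the acting set changes, peering restarts, and choose_acting runs
  // again (a sketch of the flow, not an exhaustive description).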
2118 if (request_pg_temp_change_only)
2119 return true;
2120 want_acting.clear();
2121 acting_recovery_backfill = want_acting_backfill;
2122 psdout(10) << "acting_recovery_backfill is "
2123 << acting_recovery_backfill << dendl;
2124 ceph_assert(
2125 backfill_targets.empty() ||
2126 backfill_targets == want_backfill);
2127 if (backfill_targets.empty()) {
2128 // Caller is GetInfo
2129 backfill_targets = want_backfill;
2130 }
2131 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2132 ceph_assert(
2133 async_recovery_targets.empty() ||
2134 async_recovery_targets == want_async_recovery ||
2135 !needs_recovery());
2136 if (async_recovery_targets.empty() || !needs_recovery()) {
2137 async_recovery_targets = want_async_recovery;
2138 }
2139 // Will not change if already set because up would have had to change
2140 // Verify that nothing in backfill is in stray_set
2141 for (set<pg_shard_t>::iterator i = want_backfill.begin();
2142 i != want_backfill.end();
2143 ++i) {
2144 ceph_assert(stray_set.find(*i) == stray_set.end());
2145 }
2146 psdout(10) << "choose_acting want=" << want << " backfill_targets="
2147 << want_backfill << " async_recovery_targets="
2148 << async_recovery_targets << dendl;
2149 return true;
2150 }
2151
2152 void PeeringState::log_weirdness()
2153 {
2154 if (pg_log.get_tail() != info.log_tail)
2155 pl->get_clog_error() << info.pgid
2156 << " info mismatch, log.tail " << pg_log.get_tail()
2157 << " != info.log_tail " << info.log_tail;
2158 if (pg_log.get_head() != info.last_update)
2159 pl->get_clog_error() << info.pgid
2160 << " info mismatch, log.head " << pg_log.get_head()
2161 << " != info.last_update " << info.last_update;
2162
2163 if (!pg_log.get_log().empty()) {
2164 // sloppy check
2165 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
2166 pl->get_clog_error() << info.pgid
2167 << " log bound mismatch, info (tail,head] ("
2168 << pg_log.get_tail() << ","
2169 << pg_log.get_head() << "]"
2170 << " actual ["
2171 << pg_log.get_log().log.begin()->version << ","
2172 << pg_log.get_log().log.rbegin()->version << "]";
2173 }
2174
2175 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2176 pl->get_clog_error() << info.pgid
2177 << " caller_ops.size "
2178 << pg_log.get_log().caller_ops.size()
2179 << " > log size " << pg_log.get_log().log.size();
2180 }
2181 }
2182
2183 /*
2184 * Process information from a replica to determine if it could have any
2185 * objects that I need.
2186 *
2187 * TODO: if the missing set becomes very large, this could get expensive.
2188 * Instead, we probably want to just iterate over our unfound set.
2189 */
2190 bool PeeringState::search_for_missing(
2191 const pg_info_t &oinfo, const pg_missing_t &omissing,
2192 pg_shard_t from,
2193 PeeringCtxWrapper &ctx)
2194 {
2195 uint64_t num_unfound_before = missing_loc.num_unfound();
2196 bool found_missing = missing_loc.add_source_info(
2197 from, oinfo, omissing, ctx.handle);
2198 if (found_missing && num_unfound_before != missing_loc.num_unfound())
2199 pl->publish_stats_to_osd();
2200 // avoid doing this if the peer is empty. This is a bit of paranoia
2201 // to avoid doing something rash if add_source_info() above
2202 // incorrectly decided we found something new. (if the peer has
2203 // last_update=0'0 that's impossible.)
2204 if (found_missing &&
2205 oinfo.last_update != eversion_t()) {
2206 pg_info_t tinfo(oinfo);
2207 tinfo.pgid.shard = pg_whoami.shard;
2208 ctx.send_info(
2209 from.osd,
2210 spg_t(info.pgid.pgid, from.shard),
2211 get_osdmap_epoch(), // fixme: use lower epoch?
2212 get_osdmap_epoch(),
2213 tinfo);
2214 }
2215 return found_missing;
2216 }
2217
2218 bool PeeringState::discover_all_missing(
2219 BufferedRecoveryMessages &rctx)
2220 {
2221 auto &missing = pg_log.get_missing();
2222 uint64_t unfound = get_num_unfound();
2223 bool any = false; // did we start any queries
2224
2225 psdout(10) << __func__ << " "
2226 << missing.num_missing() << " missing, "
2227 << unfound << " unfound"
2228 << dendl;
2229
2230 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
2231 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
2232 for (; m != mend; ++m) {
2233 pg_shard_t peer(*m);
2234
2235 if (!get_osdmap()->is_up(peer.osd)) {
2236 psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2237 continue;
2238 }
2239
2240 if (peer_purged.count(peer)) {
2241 psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2242 continue;
2243 }
2244
2245 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
2246 if (iter != peer_info.end() &&
2247 (iter->second.is_empty() || iter->second.dne())) {
2248 // ignore empty peers
2249 continue;
2250 }
2251
2252 // If we've requested any of this stuff, the pg_missing_t information
2253 // should be on its way.
2254 // TODO: coalesce requested_* into a single data structure
2255 if (peer_missing.find(peer) != peer_missing.end()) {
2256 psdout(20) << __func__ << ": osd." << peer
2257 << ": we already have pg_missing_t" << dendl;
2258 continue;
2259 }
2260 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2261 psdout(20) << __func__ << ": osd." << peer
2262 << ": in peer_log_requested" << dendl;
2263 continue;
2264 }
2265 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2266 psdout(20) << __func__ << ": osd." << peer
2267 << ": in peer_missing_requested" << dendl;
2268 continue;
2269 }
2270
2271 // Request missing
2272 psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2273 << dendl;
2274 peer_missing_requested.insert(peer);
2275 rctx.send_query(
2276 peer.osd,
2277 spg_t(info.pgid.pgid, peer.shard),
2278 pg_query_t(
2279 pg_query_t::FULLLOG,
2280 peer.shard, pg_whoami.shard,
2281 info.history, get_osdmap_epoch()));
2282 any = true;
2283 }
2284 return any;
2285 }
2286
2287 /* Build the might_have_unfound set.
2288 *
2289 * This is used by the primary OSD during recovery.
2290 *
2291 * This set tracks the OSDs which might have unfound objects that the primary
2292 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2293 * will remove the OSD from the set.
2294 */
2295 void PeeringState::build_might_have_unfound()
2296 {
2297 ceph_assert(might_have_unfound.empty());
2298 ceph_assert(is_primary());
2299
2300 psdout(10) << __func__ << dendl;
2301
2302 check_past_interval_bounds();
2303
2304 might_have_unfound = past_intervals.get_might_have_unfound(
2305 pg_whoami,
2306 pool.info.is_erasure());
2307
2308 // include any (stray) peers
2309 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
2310 p != peer_info.end();
2311 ++p)
2312 might_have_unfound.insert(p->first);
2313
2314 psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2315 }
2316
2317 void PeeringState::activate(
2318 ObjectStore::Transaction& t,
2319 epoch_t activation_epoch,
2320 PeeringCtxWrapper &ctx)
2321 {
2322 ceph_assert(!is_peered());
2323
2324 // twiddle pg state
2325 state_clear(PG_STATE_DOWN);
2326
2327 send_notify = false;
2328
2329 if (is_primary()) {
2330 // only update primary last_epoch_started if we will go active
2331 if (acting.size() >= pool.info.min_size) {
2332 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2333 info.last_epoch_started <= activation_epoch);
2334 info.last_epoch_started = activation_epoch;
2335 info.last_interval_started = info.history.same_interval_since;
2336 }
2337 } else if (is_acting(pg_whoami)) {
2338 /* update last_epoch_started on acting replica to whatever the primary sent
2339 * unless it's smaller (could happen if we are going peered rather than
2340 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2341 if (info.last_epoch_started < activation_epoch) {
2342 info.last_epoch_started = activation_epoch;
2343 info.last_interval_started = info.history.same_interval_since;
2344 }
2345 }
2346
2347 auto &missing = pg_log.get_missing();
2348
2349 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
2350 if (is_primary()) {
2351 last_update_ondisk = info.last_update;
2352 }
2353 last_update_applied = info.last_update;
2354 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
2355
2356 need_up_thru = false;
2357
2358 // write pg info, log
2359 dirty_info = true;
2360 dirty_big_info = true; // maybe
2361
2362 pl->schedule_event_on_commit(
2363 t,
2364 std::make_shared<PGPeeringEvent>(
2365 get_osdmap_epoch(),
2366 get_osdmap_epoch(),
2367 ActivateCommitted(
2368 get_osdmap_epoch(),
2369 activation_epoch)));
2370
2371 // init complete pointer
2372 if (missing.num_missing() == 0) {
2373 psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
2374 << " -> " << info.last_update << dendl;
2375 info.last_complete = info.last_update;
2376 info.stats.stats.sum.num_objects_missing = 0;
2377 pg_log.reset_recovery_pointers();
2378 } else {
2379 psdout(10) << "activate - not complete, " << missing << dendl;
2380 info.stats.stats.sum.num_objects_missing = missing.num_missing();
2381 pg_log.activate_not_complete(info);
2382 }
2383
2384 log_weirdness();
2385
2386 if (is_primary()) {
2387 // initialize snap_trimq
2388 interval_set<snapid_t> to_trim;
2389 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
2390 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
2391 if (p != removed_snaps_queue.end()) {
2392 dout(20) << "activate - purged_snaps " << info.purged_snaps
2393 << " removed_snaps " << p->second
2394 << dendl;
2395 for (auto q : p->second) {
2396 to_trim.insert(q.first, q.second);
2397 }
2398 }
2399 interval_set<snapid_t> purged;
2400 purged.intersection_of(to_trim, info.purged_snaps);
2401 to_trim.subtract(purged);
2402
2403 if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
2404 renew_lease(pl->get_mnow());
2405 // do not schedule until we are actually activated
2406 }
2407
2408 // adjust purged_snaps: the PG may have been inactive while snaps were
2409 // pruned from the removed_snaps_queue in the osdmap. update local
2410 // purged_snaps to reflect only those snaps that we thought were pruned
2411 // and were still in the queue.
2412 info.purged_snaps.swap(purged);
2413
2414 // start up replicas
2415 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2416 prior_readable_until_ub);
2417
2418 ceph_assert(!acting_recovery_backfill.empty());
2419 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2420 i != acting_recovery_backfill.end();
2421 ++i) {
2422 if (*i == pg_whoami) continue;
2423 pg_shard_t peer = *i;
2424 ceph_assert(peer_info.count(peer));
2425 pg_info_t& pi = peer_info[peer];
2426
2427 psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
2428
2429 MOSDPGLog *m = 0;
2430 ceph_assert(peer_missing.count(peer));
2431 pg_missing_t& pm = peer_missing[peer];
2432
2433 bool needs_past_intervals = pi.dne();
2434
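  // The peer falls into one of three cases (sketch): (1) identical
  // last_update -> send info (or an empty log), (2) too far behind our
  // log tail or mid-backfill -> restart/continue backfill, or
  // (3) contiguous with our log -> send just the missing log segment.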
2435 if (pi.last_update == info.last_update) {
2436 // empty log
2437 if (!pi.last_backfill.is_max())
2438 pl->get_clog_info() << info.pgid << " continuing backfill to osd."
2439 << peer
2440 << " from (" << pi.log_tail << "," << pi.last_update
2441 << "] " << pi.last_backfill
2442 << " to " << info.last_update;
2443 if (!pi.is_empty()) {
2444 psdout(10) << "activate peer osd." << peer
2445 << " is up to date, queueing in pending_activators" << dendl;
2446 ctx.send_info(
2447 peer.osd,
2448 spg_t(info.pgid.pgid, peer.shard),
2449 get_osdmap_epoch(), // fixme: use lower epoch?
2450 get_osdmap_epoch(),
2451 info,
2452 get_lease());
2453 } else {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, but sending pg_log anyway" << dendl;
2456 m = new MOSDPGLog(
2457 i->shard, pg_whoami.shard,
2458 get_osdmap_epoch(), info,
2459 last_peering_reset);
2460 }
2461 } else if (
2462 pg_log.get_tail() > pi.last_update ||
2463 pi.last_backfill == hobject_t() ||
2464 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2465 /* ^ This last case covers a situation where a replica is not contiguous
2466 * with the auth_log, but is contiguous with our (the primary's) log.
2467 * Reshuffling the active set to handle this would be tricky, so instead
2468 * we just go ahead and backfill it anyway. This is probably preferable
2469 * in any case since the replica in question would have to be
2470 * significantly behind.
2471 */
2472 // backfill
2473 pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
2474 << " from (" << pi.log_tail << "," << pi.last_update
2475 << "] " << pi.last_backfill
2476 << " to " << info.last_update;
2477
2478 pi.last_update = info.last_update;
2479 pi.last_complete = info.last_update;
2480 pi.set_last_backfill(hobject_t());
2481 pi.last_epoch_started = info.last_epoch_started;
2482 pi.last_interval_started = info.last_interval_started;
2483 pi.history = info.history;
2484 pi.hit_set = info.hit_set;
2485 // Save num_bytes for reservation request, can't be negative
2486 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2487 pi.stats.stats.clear();
2488 pi.stats.stats.sum.num_bytes = peer_bytes[peer];
2489
2490 // initialize peer with our purged_snaps.
2491 pi.purged_snaps = info.purged_snaps;
2492
2493 m = new MOSDPGLog(
2494 i->shard, pg_whoami.shard,
2495 get_osdmap_epoch(), pi,
2496 last_peering_reset /* epoch to create pg at */);
2497
2498 // send some recent log, so that op dup detection works well.
2499 m->log.copy_up_to(cct, pg_log.get_log(),
2500 cct->_conf->osd_max_pg_log_entries);
2501 m->info.log_tail = m->log.tail;
2502 pi.log_tail = m->log.tail; // sigh...
2503
2504 pm.clear();
2505 } else {
2506 // catch up
2507 ceph_assert(pg_log.get_tail() <= pi.last_update);
2508 m = new MOSDPGLog(
2509 i->shard, pg_whoami.shard,
2510 get_osdmap_epoch(), info,
2511 last_peering_reset /* epoch to create pg at */);
2512 // send new stuff to append to replicas log
2513 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2514 }
2515
2516 // share past_intervals if we are creating the pg on the replica
2517 // based on whether our info for that peer was dne() *before*
2518 // updating pi.history in the backfill block above.
2519 if (m && needs_past_intervals)
2520 m->past_intervals = past_intervals;
2521
2522 // update local version of peer's missing list!
2523 if (m && pi.last_backfill != hobject_t()) {
2524 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2525 p != m->log.log.end();
2526 ++p) {
2527 if (p->soid <= pi.last_backfill &&
2528 !p->is_error()) {
2529 if (perform_deletes_during_peering() && p->is_delete()) {
2530 pm.rm(p->soid, p->version);
2531 } else {
2532 pm.add_next_event(*p);
2533 }
2534 }
2535 }
2536 }
2537
2538 if (m) {
2539 dout(10) << "activate peer osd." << peer << " sending " << m->log
2540 << dendl;
2541 m->lease = get_lease();
2542 pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
2543 }
2544
2545 // peer now has
2546 pi.last_update = info.last_update;
2547
2548 // update our missing
2549 if (pm.num_missing() == 0) {
2550 pi.last_complete = pi.last_update;
2551 psdout(10) << "activate peer osd." << peer << " " << pi
2552 << " uptodate" << dendl;
2553 } else {
2554 psdout(10) << "activate peer osd." << peer << " " << pi
2555 << " missing " << pm << dendl;
2556 }
2557 }
2558
2559 // Set up missing_loc
2560 set<pg_shard_t> complete_shards;
2561 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2562 i != acting_recovery_backfill.end();
2563 ++i) {
2564 psdout(20) << __func__ << " setting up missing_loc from shard " << *i
2565 << " " << dendl;
2566 if (*i == get_primary()) {
2567 missing_loc.add_active_missing(missing);
2568 if (!missing.have_missing())
2569 complete_shards.insert(*i);
2570 } else {
2571 auto peer_missing_entry = peer_missing.find(*i);
2572 ceph_assert(peer_missing_entry != peer_missing.end());
2573 missing_loc.add_active_missing(peer_missing_entry->second);
2574 if (!peer_missing_entry->second.have_missing() &&
2575 peer_info[*i].last_backfill.is_max())
2576 complete_shards.insert(*i);
2577 }
2578 }
2579
2580 // If necessary, create might_have_unfound to help us find our unfound objects.
2581 // NOTE: It's important that we build might_have_unfound before trimming the
2582 // past intervals.
2583 might_have_unfound.clear();
2584 if (needs_recovery()) {
2585 // If only one shard has missing objects, we can add all of the others as
2586 // recovery sources. This is considered safe since the PGLogs have been
2587 // merged locally, and it covers the vast majority of use cases, such as
2588 // one OSD/host being down for a while for hardware repair.
2589 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2590 missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
2591 } else {
2592 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2593 ctx.handle);
2594 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2595 i != acting_recovery_backfill.end();
2596 ++i) {
2597 if (*i == pg_whoami) continue;
2598 psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2599 ceph_assert(peer_missing.count(*i));
2600 ceph_assert(peer_info.count(*i));
2601 missing_loc.add_source_info(
2602 *i,
2603 peer_info[*i],
2604 peer_missing[*i],
2605 ctx.handle);
2606 }
2607 }
2608 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2609 i != peer_missing.end();
2610 ++i) {
2611 if (is_acting_recovery_backfill(i->first))
2612 continue;
2613 ceph_assert(peer_info.count(i->first));
2614 search_for_missing(
2615 peer_info[i->first],
2616 i->second,
2617 i->first,
2618 ctx);
2619 }
2620
2621 build_might_have_unfound();
2622
2623 // Always call now so update_calc_stats() will be accurate
2624 discover_all_missing(ctx.msgs);
2625
2626 }
2627
2628 // num_objects_degraded, if calculated, should reflect this too, unless
2629 // nothing is missing and we are about to go clean.
2630 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2631 state_set(PG_STATE_UNDERSIZED);
2632 }
2633
2634 state_set(PG_STATE_ACTIVATING);
2635 pl->on_activate(std::move(to_trim));
2636 }
2637 if (acting.size() >= pool.info.min_size) {
2638 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2639 pg_log.roll_forward(rollbacker.get());
2640 }
2641 }
2642
2643 void PeeringState::share_pg_info()
2644 {
2645 psdout(10) << "share_pg_info" << dendl;
2646
2647 info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
2648 prior_readable_until_ub);
2649
2650 // share new pg_info_t with replicas
2651 ceph_assert(!acting_recovery_backfill.empty());
2652 for (auto pg_shard : acting_recovery_backfill) {
2653 if (pg_shard == pg_whoami) continue;
2654 if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
2655 peer->second.last_epoch_started = info.last_epoch_started;
2656 peer->second.last_interval_started = info.last_interval_started;
2657 peer->second.history.merge(info.history);
2658 }
2659 Message* m = nullptr;
2660 if (last_require_osd_release >= ceph_release_t::octopus) {
2661 m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
2662 info,
2663 get_osdmap_epoch(),
2664 get_osdmap_epoch(),
2665 get_lease(), {}};
2666 } else {
2667 m = new MOSDPGInfo{get_osdmap_epoch(),
2668 {pg_notify_t{pg_shard.shard,
2669 pg_whoami.shard,
2670 get_osdmap_epoch(),
2671 get_osdmap_epoch(),
2672 info,
2673 past_intervals}}};
2674 }
2675 pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
2676 }
2677 }
2678
2679 void PeeringState::merge_log(
2680 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
2681 pg_shard_t from)
2682 {
2683 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2684 pg_log.merge_log(
2685 oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
2686 }
2687
2688 void PeeringState::rewind_divergent_log(
2689 ObjectStore::Transaction& t, eversion_t newhead)
2690 {
2691 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
2692 pg_log.rewind_divergent_log(
2693 newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
2694 }
2695
2696
2697 void PeeringState::proc_primary_info(
2698 ObjectStore::Transaction &t, const pg_info_t &oinfo)
2699 {
2700 ceph_assert(!is_primary());
2701
2702 update_history(oinfo.history);
2703 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
2704 info.stats.stats.sum.num_scrub_errors = 0;
2705 info.stats.stats.sum.num_shallow_scrub_errors = 0;
2706 info.stats.stats.sum.num_deep_scrub_errors = 0;
2707 dirty_info = true;
2708 }
2709
2710 if (!(info.purged_snaps == oinfo.purged_snaps)) {
2711 psdout(10) << __func__ << " updating purged_snaps to "
2712 << oinfo.purged_snaps
2713 << dendl;
2714 info.purged_snaps = oinfo.purged_snaps;
2715 dirty_info = true;
2716 dirty_big_info = true;
2717 }
2718 }
2719
2720 void PeeringState::proc_master_log(
2721 ObjectStore::Transaction& t, pg_info_t &oinfo,
2722 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
2723 {
2724 psdout(10) << "proc_master_log for osd." << from << ": "
2725 << olog << " " << omissing << dendl;
2726 ceph_assert(!is_peered() && is_primary());
2727
2728 // merge log into our own log to build master log. no need to
2729 // make any adjustments to their missing map; we are taking their
2730 // log to be authoritative (i.e., their entries are by definition
2731 // non-divergent).
2732 merge_log(t, oinfo, olog, from);
2733 peer_info[from] = oinfo;
2734 psdout(10) << " peer osd." << from << " now " << oinfo
2735 << " " << omissing << dendl;
2736 might_have_unfound.insert(from);
2737
2738 // See doc/dev/osd_internals/last_epoch_started
2739 if (oinfo.last_epoch_started > info.last_epoch_started) {
2740 info.last_epoch_started = oinfo.last_epoch_started;
2741 dirty_info = true;
2742 }
2743 if (oinfo.last_interval_started > info.last_interval_started) {
2744 info.last_interval_started = oinfo.last_interval_started;
2745 dirty_info = true;
2746 }
2747 update_history(oinfo.history);
2748 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
2749 info.last_epoch_started >= info.history.last_epoch_started);
2750
2751 peer_missing[from].claim(omissing);
2752 }
2753
2754 void PeeringState::proc_replica_log(
2755 pg_info_t &oinfo,
2756 const pg_log_t &olog,
2757 pg_missing_t& omissing,
2758 pg_shard_t from)
2759 {
2760 psdout(10) << "proc_replica_log for osd." << from << ": "
2761 << oinfo << " " << olog << " " << omissing << dendl;
2762
2763 pg_log.proc_replica_log(oinfo, olog, omissing, from);
2764
2765 peer_info[from] = oinfo;
2766 psdout(10) << " peer osd." << from << " now "
2767 << oinfo << " " << omissing << dendl;
2768 might_have_unfound.insert(from);
2769
2770 for (map<hobject_t, pg_missing_item>::const_iterator i =
2771 omissing.get_items().begin();
2772 i != omissing.get_items().end();
2773 ++i) {
2774 psdout(20) << " after missing " << i->first
2775 << " need " << i->second.need
2776 << " have " << i->second.have << dendl;
2777 }
2778 peer_missing[from].claim(omissing);
2779 }
2780
2781 void PeeringState::fulfill_info(
2782 pg_shard_t from, const pg_query_t &query,
2783 pair<pg_shard_t, pg_info_t> &notify_info)
2784 {
2785 ceph_assert(from == primary);
2786 ceph_assert(query.type == pg_query_t::INFO);
2787
2788 // info
2789 psdout(10) << "sending info" << dendl;
2790 notify_info = make_pair(from, info);
2791 }
2792
2793 void PeeringState::fulfill_log(
2794 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
2795 {
2796 psdout(10) << "log request from " << from << dendl;
2797 ceph_assert(from == primary);
2798 ceph_assert(query.type != pg_query_t::INFO);
2799
2800 MOSDPGLog *mlog = new MOSDPGLog(
2801 from.shard, pg_whoami.shard,
2802 get_osdmap_epoch(),
2803 info, query_epoch);
2804 mlog->missing = pg_log.get_missing();
2805
2806 // primary -> other, when building master log
2807 if (query.type == pg_query_t::LOG) {
2808 psdout(10) << " sending info+missing+log since " << query.since
2809 << dendl;
2810 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
2811 pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
2812 << query.since
2813 << " when my log.tail is " << pg_log.get_tail()
2814 << ", sending full log instead";
2815 mlog->log = pg_log.get_log(); // primary should not have requested this!!
2816 } else
2817 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
2818 }
2819 else if (query.type == pg_query_t::FULLLOG) {
2820 psdout(10) << " sending info+missing+full log" << dendl;
2821 mlog->log = pg_log.get_log();
2822 }
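  // For illustration (assumed eversions): if our log spans (30'100, 30'250]
  // and a pg_query_t::LOG asks for since = 30'120, we copy the entries in
  // (30'120, 30'250]; a since of 30'050 (older than our tail) triggers the
  // full-log fallback above.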
2823
2824 psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
2825
2826 pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
2827 }
2828
2829 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
2830 {
2831 if (query.query.type == pg_query_t::INFO) {
2832 pair<pg_shard_t, pg_info_t> notify_info;
2833 // note this refreshes our prior_readable_until_ub value
2834 update_history(query.query.history);
2835 fulfill_info(query.from, query.query, notify_info);
2836 rctx.send_notify(
2837 notify_info.first.osd,
2838 pg_notify_t(
2839 notify_info.first.shard, pg_whoami.shard,
2840 query.query_epoch,
2841 get_osdmap_epoch(),
2842 notify_info.second,
2843 past_intervals));
2844 } else {
2845 update_history(query.query.history);
2846 fulfill_log(query.from, query.query, query.query_epoch);
2847 }
2848 }
2849
2850 void PeeringState::try_mark_clean()
2851 {
2852 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2853 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2854 state_set(PG_STATE_CLEAN);
2855 info.history.last_epoch_clean = get_osdmap_epoch();
2856 info.history.last_interval_clean = info.history.same_interval_since;
2857 past_intervals.clear();
2858 dirty_big_info = true;
2859 dirty_info = true;
2860 }
2861
2862 if (!is_active() && is_peered()) {
2863 if (is_clean()) {
2864 bool target;
2865 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2866 if (target) {
2867 psdout(10) << "ready to merge (target)" << dendl;
2868 pl->set_ready_to_merge_target(
2869 info.last_update,
2870 info.history.last_epoch_started,
2871 info.history.last_epoch_clean);
2872 } else {
2873 psdout(10) << "ready to merge (source)" << dendl;
2874 pl->set_ready_to_merge_source(info.last_update);
2875 }
2876 }
2877 } else {
2878 psdout(10) << "not clean, not ready to merge" << dendl;
2879 // we should have notified OSD in Active state entry point
2880 }
2881 }
2882
2883 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2884
2885 share_pg_info();
2886 pl->publish_stats_to_osd();
2887 clear_recovery_state();
2888 }
2889
2890 void PeeringState::split_into(
2891 pg_t child_pgid, PeeringState *child, unsigned split_bits)
2892 {
2893 child->update_osdmap_ref(get_osdmap());
2894 child->pool = pool;
2895
2896 // Log
2897 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2898 child->info.last_complete = info.last_complete;
2899
2900 info.last_update = pg_log.get_head();
2901 child->info.last_update = child->pg_log.get_head();
2902
2903 child->info.last_user_version = info.last_user_version;
2904
2905 info.log_tail = pg_log.get_tail();
2906 child->info.log_tail = child->pg_log.get_tail();
2907
2908 // reset last_complete, we might have modified pg_log & missing above
2909 pg_log.reset_complete_to(&info);
2910 child->pg_log.reset_complete_to(&child->info);
2911
2912 // Info
2913 child->info.history = info.history;
2914 child->info.history.epoch_created = get_osdmap_epoch();
2915 child->info.purged_snaps = info.purged_snaps;
2916
2917 if (info.last_backfill.is_max()) {
2918 child->info.set_last_backfill(hobject_t::get_max());
2919 } else {
2920 // restart backfill on parent and child to be safe. we could
2921 // probably do better in the bitwise sort case, but it's more
2922 // fragile (there may be special work to do on backfill completion
2923 // in the future).
2924 info.set_last_backfill(hobject_t());
2925 child->info.set_last_backfill(hobject_t());
2926 // restarting backfill implies that the missing set is empty,
2927 // since it is only used for objects prior to last_backfill
2928 pg_log.reset_backfill();
2929 child->pg_log.reset_backfill();
2930 }
2931
2932 child->info.stats = info.stats;
2933 child->info.stats.parent_split_bits = split_bits;
2934 info.stats.stats_invalid = true;
2935 child->info.stats.stats_invalid = true;
2936 child->info.last_epoch_started = info.last_epoch_started;
2937 child->info.last_interval_started = info.last_interval_started;
2938
2939 // There can't be recovery/backfill going on now
2940 int primary, up_primary;
2941 vector<int> newup, newacting;
2942 get_osdmap()->pg_to_up_acting_osds(
2943 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2944 child->init_primary_up_acting(
2945 newup,
2946 newacting,
2947 up_primary,
2948 primary);
2949 child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
2950
2951 // this comparison includes primary rank via pg_shard_t
2952 if (get_primary() != child->get_primary())
2953 child->info.history.same_primary_since = get_osdmap_epoch();
2954
2955 child->info.stats.up = up;
2956 child->info.stats.up_primary = up_primary;
2957 child->info.stats.acting = acting;
2958 child->info.stats.acting_primary = primary;
2959 child->info.stats.mapping_epoch = get_osdmap_epoch();
2960
2961 // History
2962 child->past_intervals = past_intervals;
2963
2964 child->on_new_interval();
2965
2966 child->send_notify = !child->is_primary();
2967
2968 child->dirty_info = true;
2969 child->dirty_big_info = true;
2970 dirty_info = true;
2971 dirty_big_info = true;
2972 }
2973
2974 void PeeringState::merge_from(
2975 map<spg_t,PeeringState *>& sources,
2976 PeeringCtx &rctx,
2977 unsigned split_bits,
2978 const pg_merge_meta_t& last_pg_merge_meta)
2979 {
2980 bool incomplete = false;
2981 if (info.last_complete != info.last_update ||
2982 info.is_incomplete() ||
2983 info.dne()) {
2984 psdout(10) << __func__ << " target incomplete" << dendl;
2985 incomplete = true;
2986 }
2987 if (last_pg_merge_meta.source_pgid != pg_t()) {
2988 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2989 psdout(10) << __func__ << " target doesn't match expected parent "
2990 << last_pg_merge_meta.source_pgid.get_parent()
2991 << " of source_pgid " << last_pg_merge_meta.source_pgid
2992 << dendl;
2993 incomplete = true;
2994 }
2995 if (info.last_update != last_pg_merge_meta.target_version) {
2996 psdout(10) << __func__ << " target version doesn't match expected "
2997 << last_pg_merge_meta.target_version << dendl;
2998 incomplete = true;
2999 }
3000 }
3001
3002 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3003 pg_log.roll_forward(handler.get());
3004
3005 info.last_complete = info.last_update; // to fake out trim()
3006 pg_log.reset_recovery_pointers();
3007 pg_log.trim(info.last_update, info);
3008
3009 vector<PGLog*> log_from;
3010 for (auto& i : sources) {
3011 auto& source = i.second;
3012 if (!source) {
3013 psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3014 incomplete = true;
3015 continue;
3016 }
3017 if (source->info.last_complete != source->info.last_update ||
3018 source->info.is_incomplete() ||
3019 source->info.dne()) {
3020 psdout(10) << __func__ << " source " << source->pg_whoami
3021 << " incomplete"
3022 << dendl;
3023 incomplete = true;
3024 }
3025 if (last_pg_merge_meta.source_pgid != pg_t()) {
3026 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3027 dout(10) << __func__ << " source " << source->info.pgid.pgid
3028 << " doesn't match expected source pgid "
3029 << last_pg_merge_meta.source_pgid << dendl;
3030 incomplete = true;
3031 }
3032 if (source->info.last_update != last_pg_merge_meta.source_version) {
3033 dout(10) << __func__ << " source version doesn't match expected "
3034 << last_pg_merge_meta.source_version << dendl;
3035 incomplete = true;
3036 }
3037 }
3038
3039 // prepare log
3040 PGLog::LogEntryHandlerRef handler{
3041 source->pl->get_log_handler(rctx.transaction)};
3042 source->pg_log.roll_forward(handler.get());
3043 source->info.last_complete = source->info.last_update; // to fake out trim()
3044 source->pg_log.reset_recovery_pointers();
3045 source->pg_log.trim(source->info.last_update, source->info);
3046 log_from.push_back(&source->pg_log);
3047
3048 // combine stats
3049 info.stats.add(source->info.stats);
3050
3051 // pull up last_update
3052 info.last_update = std::max(info.last_update, source->info.last_update);
3053
3054 // adopt source's PastIntervals if target has none. we can do this since
3055 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3056 // the PGs are identical.
3057 if (past_intervals.empty() && !source->past_intervals.empty()) {
3058 psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3059 past_intervals = source->past_intervals;
3060 }
3061 }
3062
3063 info.last_complete = info.last_update;
3064 info.log_tail = info.last_update;
3065 if (incomplete) {
3066 info.last_backfill = hobject_t();
3067 }
3068
3069 // merge logs
3070 pg_log.merge_from(log_from, info.last_update);
3071
3072 // make sure we have a meaningful last_epoch_started/clean (if we were a
3073 // placeholder)
3074 if (info.history.epoch_created == 0) {
3075 // start with (a) source's history, since these PGs *should* have been
3076 // remapped in concert with each other...
3077 info.history = sources.begin()->second->info.history;
3078
3079 // we use the last_epoch_{started,clean} we got from
3080 // the caller, which are the epochs that were reported when the PGs
3081 // were found to be ready for merge.
3082 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3083 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3084 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3085 psdout(10) << __func__
3086 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3087 << last_pg_merge_meta.last_epoch_clean
3088 << " from pool last_dec_*, source pg history was "
3089 << sources.begin()->second->info.history
3090 << dendl;
3091
3092 // if the past_intervals start is later than last_epoch_clean, it
3093 // implies the source repeered again but the target didn't, or
3094 // that the source became clean in a later epoch than the target.
3095 // avoid the discrepancy by adjusting the interval start
3096 // backwards to match so that check_past_interval_bounds() will
3097 // not complain.
3098 auto pib = past_intervals.get_bounds();
3099 if (info.history.last_epoch_clean < pib.first) {
3100 psdout(10) << __func__ << " last_epoch_clean "
3101 << info.history.last_epoch_clean << " < past_interval start "
3102 << pib.first << ", adjusting start backwards" << dendl;
3103 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3104 }
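  // e.g. (illustrative epochs): last_epoch_clean = 1000 with a
  // past_intervals start of 1005 would be rewound to start at 1000 so
  // that check_past_interval_bounds() does not complain.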
3105
3106 // Similarly, if the same_interval_since value is later than
3107 // last_epoch_clean, the next interval change will result in a
3108 // past_interval start that is later than last_epoch_clean. This
3109 // can happen if we use the pg_history values from the merge
3110 // source. Adjust the same_interval_since value backwards if that
3111 // happens. (We trust the les and lec values more because they came from
3112 // the real target, whereas the history value we stole from the source.)
3113 if (info.history.last_epoch_started < info.history.same_interval_since) {
3114 psdout(10) << __func__ << " last_epoch_started "
3115 << info.history.last_epoch_started << " < same_interval_since "
3116 << info.history.same_interval_since
3117 << ", adjusting pg_history backwards" << dendl;
3118 info.history.same_interval_since = info.history.last_epoch_clean;
3119 // make sure same_{up,primary}_since are <= same_interval_since
3120 info.history.same_up_since = std::min(
3121 info.history.same_up_since, info.history.same_interval_since);
3122 info.history.same_primary_since = std::min(
3123 info.history.same_primary_since, info.history.same_interval_since);
3124 }
3125 }
3126
3127 dirty_info = true;
3128 dirty_big_info = true;
3129 }
3130
3131 void PeeringState::start_split_stats(
3132 const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3133 {
3134 out->resize(childpgs.size() + 1);
3135 info.stats.stats.sum.split(*out);
3136 }
3137
3138 void PeeringState::finish_split_stats(
3139 const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3140 {
3141 info.stats.stats.sum = stats;
3142 write_if_dirty(t);
3143 }
3144
3145 void PeeringState::update_blocked_by()
3146 {
3147 // set a max on the number of blocking peers we report. if we go
3148 // over, report a random subset. keep the result sorted.
3149 unsigned keep = std::min<unsigned>(
3150 blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3151 unsigned skip = blocked_by.size() - keep;
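  // e.g. (illustrative): 50 blockers with osd_max_pg_blocked_by = 16
  // gives keep = 16, skip = 34; each peer is then kept with probability
  // keep / (skip + keep) as we scan, yielding a uniform random subset
  // while preserving sorted order.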
3152 info.stats.blocked_by.clear();
3153 info.stats.blocked_by.resize(keep);
3154 unsigned pos = 0;
3155 for (set<int>::iterator p = blocked_by.begin();
3156 p != blocked_by.end() && keep > 0;
3157 ++p) {
3158 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3159 --skip;
3160 } else {
3161 info.stats.blocked_by[pos++] = *p;
3162 --keep;
3163 }
3164 }
3165 }
3166
3167 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3168 {
3169 for (auto&p : pgs)
3170 if (p.shard == shard)
3171 return true;
3172 return false;
3173 }
3174
3175 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3176 {
3177 for (auto&p : pgs) {
3178 if (p == skip)
3179 continue;
3180 if (p.shard == shard)
3181 return p;
3182 }
3183 return pg_shard_t();
3184 }
3185
3186 void PeeringState::update_calc_stats()
3187 {
3188 info.stats.version = info.last_update;
3189 info.stats.created = info.history.epoch_created;
3190 info.stats.last_scrub = info.history.last_scrub;
3191 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3192 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3193 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3194 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3195 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3196
3197 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3198 info.stats.ondisk_log_size = info.stats.log_size;
3199 info.stats.log_start = pg_log.get_tail();
3200 info.stats.ondisk_log_start = pg_log.get_tail();
3201 info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3202
3203 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3204
3205 // In the rare case that upset is too large (usually transient), use it
3206 // as the target for the calculations below.
3207 unsigned target = std::max(num_shards, (unsigned)upset.size());
3208 // For the undersized case, actingset may be larger than upset with OSDs out
3209 unsigned nrep = std::max(actingset.size(), upset.size());
3210 // calc num_object_copies
3211 info.stats.stats.calc_copies(std::max(target, nrep));
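  // e.g. (illustrative): a size-3 pool with upset={0,1} and
  // actingset={0,1,4} gives target = 3 and nrep = 3, so copies are
  // calculated against 3 shards.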
3212 info.stats.stats.sum.num_objects_degraded = 0;
3213 info.stats.stats.sum.num_objects_unfound = 0;
3214 info.stats.stats.sum.num_objects_misplaced = 0;
3215 info.stats.avail_no_missing.clear();
3216 info.stats.object_location_counts.clear();
3217
3218 // We should never hit this condition, but if we end up hitting it,
3219 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3220 if (info.stats.stats.sum.num_objects < 0) {
3221 psdout(0) << __func__ << " negative num_objects = "
3222 << info.stats.stats.sum.num_objects << " setting it to 0 "
3223 << dendl;
3224 info.stats.stats.sum.num_objects = 0;
3225 state_set(PG_STATE_INCONSISTENT);
3226 }
3227
3228 if ((is_remapped() || is_undersized() || !is_clean()) &&
3229 (is_peered()|| is_activating())) {
3230 psdout(20) << __func__ << " actingset " << actingset << " upset "
3231 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3232
3233 ceph_assert(!acting_recovery_backfill.empty());
3234
3235 bool estimate = false;
3236
3237 // NOTE: we only generate degraded, misplaced and unfound
3238 // values for the summation, not individual stat categories.
3239 int64_t num_objects = info.stats.stats.sum.num_objects;
3240
3241 // Objects missing from up nodes, sorted by # objects.
3242 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3243 // Objects missing from nodes not in up, sorted by # objects.
3244 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3245
3246 // Fill missing_target_objects/acting_source_objects
3247
3248 {
3249 int64_t missing;
3250
3251 // Primary first
3252 missing = pg_log.get_missing().num_missing();
3253 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3254 if (upset.count(pg_whoami)) {
3255 missing_target_objects.emplace(missing, pg_whoami);
3256 } else {
3257 acting_source_objects.emplace(missing, pg_whoami);
3258 }
3259 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3260 if (missing == 0)
3261 info.stats.avail_no_missing.push_back(pg_whoami);
3262 psdout(20) << __func__ << " shard " << pg_whoami
3263 << " primary objects " << num_objects
3264 << " missing " << missing
3265 << dendl;
3266 }
3267
3268 // All other peers
3269 for (auto& peer : peer_info) {
3270 // Primary should not be in the peer_info, skip if it is.
3271 if (peer.first == pg_whoami) continue;
3272 int64_t missing = 0;
3273 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3274 // Backfill targets always track num_objects accurately;
3275 // all other peers track missing accurately.
3276 if (is_backfill_target(peer.first)) {
3277 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3278 } else {
3279 if (peer_missing.count(peer.first)) {
3280 missing = peer_missing[peer.first].num_missing();
3281 } else {
3282 psdout(20) << __func__ << " no peer_missing found for "
3283 << peer.first << dendl;
3284 if (is_recovering()) {
3285 estimate = true;
3286 }
3287 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3288 }
3289 }
3290 if (upset.count(peer.first)) {
3291 missing_target_objects.emplace(missing, peer.first);
3292 } else if (actingset.count(peer.first)) {
3293 acting_source_objects.emplace(missing, peer.first);
3294 }
3295 peer.second.stats.stats.sum.num_objects_missing = missing;
3296 if (missing == 0)
3297 info.stats.avail_no_missing.push_back(peer.first);
3298 psdout(20) << __func__ << " shard " << peer.first
3299 << " objects " << peer_num_objects
3300 << " missing " << missing
3301 << dendl;
3302 }
3303
3304 // Compute object_location_counts
3305 for (auto& ml: missing_loc.get_missing_locs()) {
3306 info.stats.object_location_counts[ml.second]++;
3307 psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3308 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3309 << dendl;
3310 }
3311 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3312 if (not_missing) {
3313 // During recovery we know upset == actingset and is being populated
3314 // During backfill we know that all non-missing objects are in the actingset
3315 info.stats.object_location_counts[actingset] = not_missing;
3316 }
3317 psdout(30) << __func__ << " object_location_counts["
3318 << upset << "]=" << info.stats.object_location_counts[upset]
3319 << dendl;
3320 psdout(20) << __func__ << " object_location_counts "
3321 << info.stats.object_location_counts << dendl;
3322
3323 // A misplaced object is not stored on the correct OSD
3324 int64_t misplaced = 0;
3325 // A degraded object has fewer replicas or EC shards than the pool specifies.
3326 int64_t degraded = 0;
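  // e.g. (illustrative, size-3 replicated pool): an object with only two
  // replicas, both on up OSDs, is degraded; an object with three replicas
  // where one sits on an OSD outside up is misplaced.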
3327
3328 if (is_recovering()) {
3329 for (auto& sml: missing_loc.get_missing_by_count()) {
3330 for (auto& ml: sml.second) {
3331 int missing_shards;
3332 if (sml.first == shard_id_t::NO_SHARD) {
3333 psdout(20) << __func__ << " ml " << ml.second
3334 << " upset size " << upset.size()
3335 << " up " << ml.first.up << dendl;
3336 missing_shards = (int)upset.size() - ml.first.up;
3337 } else {
3338 // Handle shards not even in upset below
3339 if (!find_shard(upset, sml.first))
3340 continue;
3341 missing_shards = std::max(0, 1 - ml.first.up);
3342 psdout(20) << __func__
3343 << " shard " << sml.first
3344 << " ml " << ml.second
3345 << " missing shards " << missing_shards << dendl;
3346 }
3347 int odegraded = ml.second * missing_shards;
3348 // Copies on other osds but limited to the possible degraded
3349 int more_osds = std::min(missing_shards, ml.first.other);
3350 int omisplaced = ml.second * more_osds;
3351 ceph_assert(omisplaced <= odegraded);
3352 odegraded -= omisplaced;
3353
3354 misplaced += omisplaced;
3355 degraded += odegraded;
3356 }
3357 }
3358
3359 psdout(20) << __func__ << " missing based degraded "
3360 << degraded << dendl;
3361 psdout(20) << __func__ << " missing based misplaced "
3362 << misplaced << dendl;
3363
3364 // Handle undersized case
3365 if (pool.info.is_replicated()) {
3366 // Add degraded for missing targets (num_objects missing)
3367 ceph_assert(target >= upset.size());
3368 unsigned needed = target - upset.size();
3369 degraded += num_objects * needed;
3370 } else {
3371 for (unsigned i = 0 ; i < num_shards; ++i) {
3372 shard_id_t shard(i);
3373
3374 if (!find_shard(upset, shard)) {
3375 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3376
3377 if (pgs != pg_shard_t()) {
3378 int64_t missing;
3379
3380 if (pgs == pg_whoami)
3381 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3382 else
3383 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3384
3385 degraded += missing;
3386 misplaced += std::max((int64_t)0, num_objects - missing);
3387 } else {
3388 // No shard anywhere
3389 degraded += num_objects;
3390 }
3391 }
3392 }
3393 }
3394 goto out;
3395 }
3396
3397 // Handle undersized case
3398 if (pool.info.is_replicated()) {
3399 // Add to missing_target_objects
3400 ceph_assert(target >= missing_target_objects.size());
3401 unsigned needed = target - missing_target_objects.size();
3402 if (needed)
3403 missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3404 } else {
3405 for (unsigned i = 0 ; i < num_shards; ++i) {
3406 shard_id_t shard(i);
3407 bool found = false;
3408 for (const auto& t : missing_target_objects) {
3409 if (std::get<1>(t).shard == shard) {
3410 found = true;
3411 break;
3412 }
3413 }
3414 if (!found)
3415 missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3416 }
3417 }
3418
3419 for (const auto& item : missing_target_objects)
3420 psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3421 << " missing= " << std::get<0>(item) << dendl;
3422 for (const auto& item : acting_source_objects)
3423 psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3424 << " missing= " << std::get<0>(item) << dendl;
3425
3426 // Handle all objects not in missing, for the remapped
3427 // or backfill cases
3428 for (auto m = missing_target_objects.rbegin();
3429 m != missing_target_objects.rend(); ++m) {
3430
3431 int64_t extra_missing = -1;
3432
3433 if (pool.info.is_replicated()) {
3434 if (!acting_source_objects.empty()) {
3435 auto extra_copy = acting_source_objects.begin();
3436 extra_missing = std::get<0>(*extra_copy);
3437 acting_source_objects.erase(extra_copy);
3438 }
3439 } else { // Erasure coded
3440 // Use corresponding shard
3441 for (const auto& a : acting_source_objects) {
3442 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3443 extra_missing = std::get<0>(a);
3444 acting_source_objects.erase(a);
3445 break;
3446 }
3447 }
3448 }
3449
3450 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3451 // We don't know which of the objects on the target
3452 // are part of extra_missing, so assume they are all degraded.
3453 misplaced += std::get<0>(*m) - extra_missing;
3454 degraded += extra_missing;
3455 } else {
3456 // 1. extra_missing == -1: more targets than sources, so degraded
3457 // 2. extra_missing > std::get<0>(*m): some of the previously degraded
3458 // objects are now present on the target.
3459 degraded += std::get<0>(*m);
3460 }
3461 }
3462 // If there are still acting shards that haven't been accounted
3463 // for, then they are misplaced
3464 for (const auto& a : acting_source_objects) {
3465 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3466 psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3467 << dendl;
3468 misplaced += extra_misplaced;
3469 }
3470 out:
3471 // NOTE: Tests use these messages to verify this code
3472 psdout(20) << __func__ << " degraded " << degraded
3473 << (estimate ? " (est)": "") << dendl;
3474 psdout(20) << __func__ << " misplaced " << misplaced
3475 << (estimate ? " (est)": "")<< dendl;
3476
3477 info.stats.stats.sum.num_objects_degraded = degraded;
3478 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3479 info.stats.stats.sum.num_objects_misplaced = misplaced;
3480 }
3481 }
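// (Annotation, not part of the original source.) A worked example of the
// degraded/misplaced definitions used above, with hypothetical numbers.
// Pool size 3, num_objects = 100:
//
//   // one of the three replicas is lost: every object still has 2 of 3
//   // copies, so each object contributes one missing shard copy
//   int64_t degraded  = 100 * (3 - 2);   // == 100
//
//   // instead, the PG is remapped and a new OSD awaits backfill: all
//   // copies exist, but one copy per object sits on the wrong OSD
//   int64_t misplaced = 100;             // wrong location, not lost
//
// The code above refines this rough picture by pairing each target
// shard's missing count with an acting source, so objects already copied
// to the target stop counting as degraded or misplaced.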
3482
3483 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3484 bool pg_stats_publish_valid,
3485 const pg_stat_t &pg_stats_publish,
3486 const object_stat_collection_t &unstable_stats)
3487 {
3488 if (info.stats.stats.sum.num_scrub_errors) {
3489 state_set(PG_STATE_INCONSISTENT);
3490 } else {
3491 state_clear(PG_STATE_INCONSISTENT);
3492 state_clear(PG_STATE_FAILED_REPAIR);
3493 }
3494
3495 utime_t now = ceph_clock_now();
3496 if (info.stats.state != state) {
3497 info.stats.last_change = now;
3498 // Optimistic estimation: if we just found out the PG is inactive,
3499 // assume it was active until now.
3500 if (!(state & PG_STATE_ACTIVE) &&
3501 (info.stats.state & PG_STATE_ACTIVE))
3502 info.stats.last_active = now;
3503
3504 if ((state & PG_STATE_ACTIVE) &&
3505 !(info.stats.state & PG_STATE_ACTIVE))
3506 info.stats.last_became_active = now;
3507 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3508 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3509 info.stats.last_became_peered = now;
3510 info.stats.state = state;
3511 }
3512
3513 update_calc_stats();
3514 if (info.stats.stats.sum.num_objects_degraded) {
3515 state_set(PG_STATE_DEGRADED);
3516 } else {
3517 state_clear(PG_STATE_DEGRADED);
3518 }
3519 update_blocked_by();
3520
3521 pg_stat_t pre_publish = info.stats;
3522 pre_publish.stats.add(unstable_stats);
3523 utime_t cutoff = now;
3524 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3525
3526 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3527 // because we don't want to make the pg_stat_t structures too expensive.
3528 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3529 unsigned num = 0;
3530 auto i = info.purged_snaps.begin();
3531 while (num < max && i != info.purged_snaps.end()) {
3532 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3533 ++num;
3534 ++i;
3535 }
3536 psdout(20) << __func__ << " reporting purged_snaps "
3537 << pre_publish.purged_snaps << dendl;
3538
3539 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3540 info.stats.last_fresh > cutoff) {
3541 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3542 << ": no change since " << info.stats.last_fresh << dendl;
3543 return std::nullopt;
3544 } else {
3545 // update our stat summary and timestamps
3546 info.stats.reported_epoch = get_osdmap_epoch();
3547 ++info.stats.reported_seq;
3548
3549 info.stats.last_fresh = now;
3550
3551 if (info.stats.state & PG_STATE_CLEAN)
3552 info.stats.last_clean = now;
3553 if (info.stats.state & PG_STATE_ACTIVE)
3554 info.stats.last_active = now;
3555 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3556 info.stats.last_peered = now;
3557 info.stats.last_unstale = now;
3558 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3559 info.stats.last_undegraded = now;
3560 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3561 info.stats.last_fullsized = now;
3562
3563 psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3564 << ":" << pg_stats_publish.reported_seq << dendl;
3565 return std::make_optional(std::move(pre_publish));
3566 }
3567 }
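// (Annotation, not part of the original source.) The std::nullopt return
// tells the caller there is nothing new to publish. A minimal sketch of
// the expected call site (names hypothetical):
//
//   if (auto stats = ps.prepare_stats_for_publish(valid, last_published,
//                                                 unstable)) {
//     last_published = *stats;       // remember what we sent
//     send_pg_stats_upward(*stats);  // hypothetical transport call
//   }
//
// Note that stats are re-sent even when unchanged once last_fresh falls
// behind now - osd_pg_stat_report_interval_max, so a PG keeps reporting
// periodically even when nothing changed.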
3568
3569 void PeeringState::init(
3570 int role,
3571 const vector<int>& newup, int new_up_primary,
3572 const vector<int>& newacting, int new_acting_primary,
3573 const pg_history_t& history,
3574 const PastIntervals& pi,
3575 bool backfill,
3576 ObjectStore::Transaction &t)
3577 {
3578 psdout(10) << "init role " << role << " up "
3579 << newup << " acting " << newacting
3580 << " history " << history
3581 << " past_intervals " << pi
3582 << dendl;
3583
3584 set_role(role);
3585 init_primary_up_acting(
3586 newup,
3587 newacting,
3588 new_up_primary,
3589 new_acting_primary);
3590
3591 info.history = history;
3592 past_intervals = pi;
3593
3594 info.stats.up = up;
3595 info.stats.up_primary = new_up_primary;
3596 info.stats.acting = acting;
3597 info.stats.acting_primary = new_acting_primary;
3598 info.stats.mapping_epoch = info.history.same_interval_since;
3599
3600 if (!perform_deletes_during_peering()) {
3601 pg_log.set_missing_may_contain_deletes();
3602 }
3603
3604 if (backfill) {
3605 psdout(10) << __func__ << ": Setting backfill" << dendl;
3606 info.set_last_backfill(hobject_t());
3607 info.last_complete = info.last_update;
3608 pg_log.mark_log_for_rewrite();
3609 }
3610
3611 on_new_interval();
3612
3613 dirty_info = true;
3614 dirty_big_info = true;
3615 write_if_dirty(t);
3616 }
3617
3618 void PeeringState::dump_peering_state(Formatter *f)
3619 {
3620 f->dump_string("state", get_pg_state_string());
3621 f->dump_unsigned("epoch", get_osdmap_epoch());
3622 f->open_array_section("up");
3623 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
3624 f->dump_unsigned("osd", *p);
3625 f->close_section();
3626 f->open_array_section("acting");
3627 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
3628 f->dump_unsigned("osd", *p);
3629 f->close_section();
3630 if (!backfill_targets.empty()) {
3631 f->open_array_section("backfill_targets");
3632 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
3633 p != backfill_targets.end();
3634 ++p)
3635 f->dump_stream("shard") << *p;
3636 f->close_section();
3637 }
3638 if (!async_recovery_targets.empty()) {
3639 f->open_array_section("async_recovery_targets");
3640 for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
3641 p != async_recovery_targets.end();
3642 ++p)
3643 f->dump_stream("shard") << *p;
3644 f->close_section();
3645 }
3646 if (!acting_recovery_backfill.empty()) {
3647 f->open_array_section("acting_recovery_backfill");
3648 for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
3649 p != acting_recovery_backfill.end();
3650 ++p)
3651 f->dump_stream("shard") << *p;
3652 f->close_section();
3653 }
3654 f->open_object_section("info");
3655 update_calc_stats();
3656 info.dump(f);
3657 f->close_section();
3658
3659 f->open_array_section("peer_info");
3660 for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
3661 p != peer_info.end();
3662 ++p) {
3663 f->open_object_section("info");
3664 f->dump_stream("peer") << p->first;
3665 p->second.dump(f);
3666 f->close_section();
3667 }
3668 }
3669
3670 void PeeringState::update_stats(
3671 std::function<bool(pg_history_t &, pg_stat_t &)> f,
3672 ObjectStore::Transaction *t) {
3673 if (f(info.history, info.stats)) {
3674 pl->publish_stats_to_osd();
3675 }
3676 pl->on_info_history_change();
3677
3678 if (t) {
3679 dirty_info = true;
3680 write_if_dirty(*t);
3681 }
3682 }
3683
3684 bool PeeringState::append_log_entries_update_missing(
3685 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3686 ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
3687 std::optional<eversion_t> roll_forward_to)
3688 {
3689 ceph_assert(!entries.empty());
3690 ceph_assert(entries.begin()->version > info.last_update);
3691
3692 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3693 bool invalidate_stats =
3694 pg_log.append_new_log_entries(
3695 info.last_backfill,
3696 entries,
3697 rollbacker.get());
3698
3699 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
3700 pg_log.roll_forward(rollbacker.get());
3701 }
3702 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
3703 pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
3704 last_rollback_info_trimmed_to_applied = *roll_forward_to;
3705 }
3706
3707 info.last_update = pg_log.get_head();
3708
3709 if (pg_log.get_missing().num_missing() == 0) {
3710 // advance last_complete since nothing else is missing!
3711 info.last_complete = info.last_update;
3712 }
3713 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
3714
3715 psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
3716 << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
3717 if (trim_to)
3718 pg_log.trim(*trim_to, info);
3719 dirty_info = true;
3720 write_if_dirty(t);
3721 return invalidate_stats;
3722 }
3723
3724 void PeeringState::merge_new_log_entries(
3725 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3726 ObjectStore::Transaction &t,
3727 std::optional<eversion_t> trim_to,
3728 std::optional<eversion_t> roll_forward_to)
3729 {
3730 psdout(10) << __func__ << " " << entries << dendl;
3731 ceph_assert(is_primary());
3732
3733 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
3734 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
3735 i != acting_recovery_backfill.end();
3736 ++i) {
3737 pg_shard_t peer(*i);
3738 if (peer == pg_whoami) continue;
3739 ceph_assert(peer_missing.count(peer));
3740 ceph_assert(peer_info.count(peer));
3741 pg_missing_t& pmissing(peer_missing[peer]);
3742 psdout(20) << __func__ << " peer_missing for " << peer
3743 << " = " << pmissing << dendl;
3744 pg_info_t& pinfo(peer_info[peer]);
3745 bool invalidate_stats = PGLog::append_log_entries_update_missing(
3746 pinfo.last_backfill,
3747 entries,
3748 true,
3749 NULL,
3750 pmissing,
3751 NULL,
3752 dpp);
3753 pinfo.last_update = info.last_update;
3754 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
3755 rebuild_missing = rebuild_missing || invalidate_stats;
3756 }
3757
3758 if (!rebuild_missing) {
3759 return;
3760 }
3761
3762 for (auto &&i: entries) {
3763 missing_loc.rebuild(
3764 i.soid,
3765 pg_whoami,
3766 acting_recovery_backfill,
3767 info,
3768 pg_log.get_missing(),
3769 peer_missing,
3770 peer_info);
3771 }
3772 }
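// (Annotation, not part of the original source.) Condensed primary-side
// flow of merge_new_log_entries(), as implemented above:
//
//   bool rebuild = append_log_entries_update_missing(entries, t, ...);
//   for (pg_shard_t peer : acting_recovery_backfill)     // skip pg_whoami
//     rebuild |= PGLog::append_log_entries_update_missing( // mirror into
//         pinfo.last_backfill, entries, ..., pmissing, ...);// peer state
//   if (rebuild)
//     for (auto &e : entries)
//       missing_loc.rebuild(e.soid, ...);  // re-derive object locations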
3773
3774 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
3775 {
3776 // raise last_complete only if we were previously up to date
3777 if (info.last_complete == info.last_update)
3778 info.last_complete = e.version;
3779
3780 // raise last_update.
3781 ceph_assert(e.version > info.last_update);
3782 info.last_update = e.version;
3783
3784 // raise user_version, if it increased (it may not have been bumped
3785 // by all logged updates)
3786 if (e.user_version > info.last_user_version)
3787 info.last_user_version = e.user_version;
3788
3789 // log mutation
3790 pg_log.add(e, applied);
3791 psdout(10) << "add_log_entry " << e << dendl;
3792 }
3793
3794
3795 void PeeringState::append_log(
3796 const vector<pg_log_entry_t>& logv,
3797 eversion_t trim_to,
3798 eversion_t roll_forward_to,
3799 eversion_t mlcod,
3800 ObjectStore::Transaction &t,
3801 bool transaction_applied,
3802 bool async)
3803 {
3804 /* The primary has sent an info updating the history, but it may not
3805 * have arrived yet. We want to make sure that we cannot remember this
3806 * write without remembering that it happened in an interval which went
3807 * active in epoch history.last_epoch_started.
3808 */
3809 if (info.last_epoch_started != info.history.last_epoch_started) {
3810 info.history.last_epoch_started = info.last_epoch_started;
3811 }
3812 if (info.last_interval_started != info.history.last_interval_started) {
3813 info.history.last_interval_started = info.last_interval_started;
3814 }
3815 psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3816
3817 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3818 if (!transaction_applied) {
3819 /* We must be a backfill or async recovery peer, so it's ok if we apply
3820 * out-of-turn since we won't be considered when
3821 * determining a min possible last_update.
3822 *
3823 * We skip_rollforward() here, which advances the crt without
3824 * doing an actual rollforward. This avoids cleaning up entries
3825 * from the backend, so we do not end up in a situation where the
3826 * object is deleted before we can _merge_object_divergent_entries().
3827 */
3828 pg_log.skip_rollforward();
3829 }
3830
3831 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3832 p != logv.end();
3833 ++p) {
3834 add_log_entry(*p, transaction_applied);
3835
3836 /* We don't want to leave the rollforward artifacts around
3837 * here past last_backfill. It's ok for the same reason as
3838 * above */
3839 if (transaction_applied &&
3840 p->soid > info.last_backfill) {
3841 pg_log.roll_forward(handler.get());
3842 }
3843 }
3844 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3845 pg_log.roll_forward_to(
3846 roll_forward_to,
3847 handler.get());
3848 last_rollback_info_trimmed_to_applied = roll_forward_to;
3849 }
3850
3851 psdout(10) << __func__ << " approx pg log length = "
3852 << pg_log.get_log().approx_size() << dendl;
3853 psdout(10) << __func__ << " transaction_applied = "
3854 << transaction_applied << dendl;
3855 if (!transaction_applied || async)
3856 psdout(10) << __func__ << " " << pg_whoami
3857 << " is async_recovery or backfill target" << dendl;
3858 pg_log.trim(trim_to, info, transaction_applied, async);
3859
3860 // update the local pg, pg log
3861 dirty_info = true;
3862 write_if_dirty(t);
3863
3864 if (!is_primary())
3865 min_last_complete_ondisk = mlcod;
3866 }
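// (Annotation, not part of the original source.) transaction_applied is
// false only on backfill / async-recovery peers, which may apply log
// entries out of turn. For them the code above advances the crt via
// skip_rollforward() instead of rolling forward, e.g.:
//
//   // replica, in-turn write:    add_log_entry(e, true);  roll forward
//   // backfill peer, same write: add_log_entry(e, false); skip_rollforward()
//
// so rollback state is retired without touching backend objects that may
// still need _merge_object_divergent_entries().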
3867
3868 void PeeringState::recover_got(
3869 const hobject_t &oid, eversion_t v,
3870 bool is_delete,
3871 ObjectStore::Transaction &t)
3872 {
3873 if (v > pg_log.get_can_rollback_to()) {
3874 /* This can only happen during a repair, and even then, it would
3875 * be one heck of a race. If we are repairing the object, the
3876 * write in question must be fully committed, so it's not valid
3877 * to roll it back (and we'll be rolled forward shortly
3878 * anyway) */
3879 PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
3880 pg_log.roll_forward_to(v, handler.get());
3881 }
3882
3883 psdout(10) << "got missing " << oid << " v " << v << dendl;
3884 pg_log.recover_got(oid, v, info);
3885 if (pg_log.get_log().log.empty()) {
3886 psdout(10) << "last_complete now " << info.last_complete
3887 << " while log is empty" << dendl;
3888 } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
3889 psdout(10) << "last_complete now " << info.last_complete
3890 << " log.complete_to " << pg_log.get_log().complete_to->version
3891 << dendl;
3892 } else {
3893 psdout(10) << "last_complete now " << info.last_complete
3894 << " log.complete_to at end" << dendl;
3895 // below is not true in the repair case.
3896 // assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3897 ceph_assert(info.last_complete == info.last_update);
3898 }
3899
3900 if (is_primary()) {
3901 ceph_assert(missing_loc.needs_recovery(oid));
3902 if (!is_delete)
3903 missing_loc.add_location(oid, pg_whoami);
3904 }
3905
3906 // update pg
3907 dirty_info = true;
3908 write_if_dirty(t);
3909 }
3910
3911 void PeeringState::update_backfill_progress(
3912 const hobject_t &updated_backfill,
3913 const pg_stat_t &updated_stats,
3914 bool preserve_local_num_bytes,
3915 ObjectStore::Transaction &t) {
3916 info.set_last_backfill(updated_backfill);
3917 if (preserve_local_num_bytes) {
3918 psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
3919 << " local " << info.stats.stats.sum.num_bytes << dendl;
3920 int64_t bytes = info.stats.stats.sum.num_bytes;
3921 info.stats = updated_stats;
3922 info.stats.stats.sum.num_bytes = bytes;
3923 } else {
3924 psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
3925 << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
3926 info.stats = updated_stats;
3927 }
3928
3929 dirty_info = true;
3930 write_if_dirty(t);
3931 }
3932
3933 void PeeringState::adjust_purged_snaps(
3934 std::function<void(interval_set<snapid_t> &snaps)> f) {
3935 f(info.purged_snaps);
3936 dirty_info = true;
3937 dirty_big_info = true;
3938 }
3939
3940 void PeeringState::on_peer_recover(
3941 pg_shard_t peer,
3942 const hobject_t &soid,
3943 const eversion_t &version)
3944 {
3945 pl->publish_stats_to_osd();
3946 // done!
3947 peer_missing[peer].got(soid, version);
3948 missing_loc.add_location(soid, peer);
3949 }
3950
3951 void PeeringState::begin_peer_recover(
3952 pg_shard_t peer,
3953 const hobject_t soid)
3954 {
3955 peer_missing[peer].revise_have(soid, eversion_t());
3956 }
3957
3958 void PeeringState::force_object_missing(
3959 const set<pg_shard_t> &peers,
3960 const hobject_t &soid,
3961 eversion_t version)
3962 {
3963 for (auto &&peer : peers) {
3964 if (peer != primary) {
3965 peer_missing[peer].add(soid, version, eversion_t(), false);
3966 } else {
3967 pg_log.missing_add(soid, version, eversion_t());
3968 pg_log.reset_complete_to(&info);
3969 pg_log.set_last_requested(0);
3970 }
3971 }
3972
3973 missing_loc.rebuild(
3974 soid,
3975 pg_whoami,
3976 acting_recovery_backfill,
3977 info,
3978 pg_log.get_missing(),
3979 peer_missing,
3980 peer_info);
3981 }
3982
3983 void PeeringState::pre_submit_op(
3984 const hobject_t &hoid,
3985 const vector<pg_log_entry_t>& logv,
3986 eversion_t at_version)
3987 {
3988 if (at_version > eversion_t()) {
3989 for (auto &&i : get_acting_recovery_backfill()) {
3990 if (i == primary) continue;
3991 pg_info_t &pinfo = peer_info[i];
3992 // keep peer_info up to date
3993 if (pinfo.last_complete == pinfo.last_update)
3994 pinfo.last_complete = at_version;
3995 pinfo.last_update = at_version;
3996 }
3997 }
3998
3999 bool requires_missing_loc = false;
4000 for (auto &&i : get_async_recovery_targets()) {
4001 if (i == primary || !get_peer_missing(i).is_missing(hoid))
4002 continue;
4003 requires_missing_loc = true;
4004 for (auto &&entry: logv) {
4005 peer_missing[i].add_next_event(entry);
4006 }
4007 }
4008
4009 if (requires_missing_loc) {
4010 for (auto &&entry: logv) {
4011 psdout(30) << __func__ << " missing_loc before: "
4012 << missing_loc.get_locations(entry.soid) << dendl;
4013 missing_loc.add_missing(entry.soid, entry.version,
4014 eversion_t(), entry.is_delete());
4015 // clear out missing_loc
4016 missing_loc.clear_location(entry.soid);
4017 for (auto &i: get_actingset()) {
4018 if (!get_peer_missing(i).is_missing(entry.soid))
4019 missing_loc.add_location(entry.soid, i);
4020 }
4021 psdout(30) << __func__ << " missing_loc after: "
4022 << missing_loc.get_locations(entry.soid) << dendl;
4023 }
4024 }
4025 }
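// (Annotation, not part of the original source.) The missing_loc upkeep
// above matters for reads during async recovery. Sketch of the effect of
// one log entry e on a PG with actingset {0,1,2} where osd.2 is an async
// recovery target still missing e.soid (shard ids simplified):
//
//   peer_missing[2].add_next_event(e);     // osd.2 now also misses e
//   missing_loc.add_missing(e.soid, ...);  // object needs recovery
//   missing_loc.clear_location(e.soid);    // forget stale locations
//   missing_loc.add_location(e.soid, 0);   // osds 0 and 1 hold the new
//   missing_loc.add_location(e.soid, 1);   // version once it commits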
4026
4027 void PeeringState::recovery_committed_to(eversion_t version)
4028 {
4029 psdout(10) << __func__ << " version " << version
4030 << " now ondisk" << dendl;
4031 last_complete_ondisk = version;
4032
4033 if (last_complete_ondisk == info.last_update) {
4034 if (!is_primary()) {
4035 // Either we are a replica or a backfill target.
4036 // We are fully up to date; tell the primary!
4037 pl->send_cluster_message(
4038 get_primary().osd,
4039 new MOSDPGTrim(
4040 get_osdmap_epoch(),
4041 spg_t(info.pgid.pgid, primary.shard),
4042 last_complete_ondisk),
4043 get_osdmap_epoch());
4044 } else {
4045 calc_min_last_complete_ondisk();
4046 }
4047 }
4048 }
4049
4050 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4051 {
4052 last_update_ondisk = v;
4053 last_complete_ondisk = lc;
4054 calc_min_last_complete_ondisk();
4055 }
4056
4057 void PeeringState::calc_trim_to()
4058 {
4059 size_t target = pl->get_target_pg_log_entries();
4060
4061 eversion_t limit = std::min(
4062 min_last_complete_ondisk,
4063 pg_log.get_can_rollback_to());
4064 if (limit != eversion_t() &&
4065 limit != pg_trim_to &&
4066 pg_log.get_log().approx_size() > target) {
4067 size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4068 cct->_conf->osd_pg_log_trim_max);
4069 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4070 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4071 return;
4072 }
4073 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
4074 eversion_t new_trim_to;
4075 for (size_t i = 0; i < num_to_trim; ++i) {
4076 new_trim_to = it->version;
4077 ++it;
4078 if (new_trim_to > limit) {
4079 new_trim_to = limit;
4080 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4081 break;
4082 }
4083 }
4084 psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4085 pg_trim_to = new_trim_to;
4086 ceph_assert(pg_trim_to <= pg_log.get_head());
4087 ceph_assert(pg_trim_to <= min_last_complete_ondisk);
4088 }
4089 }
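// (Annotation, not part of the original source.) A worked example of the
// sizing logic above, with hypothetical config values:
//
//   target              = 3000   // pl->get_target_pg_log_entries()
//   approx_size         = 3500
//   osd_pg_log_trim_max = 10000, osd_pg_log_trim_min = 100
//
//   num_to_trim = min(3500 - 3000, 10000) = 500  // >= trim_min, proceed
//
// The walk over the 500 oldest entries then clamps new_trim_to at `limit`
// (the min of min_last_complete_ondisk and can_rollback_to), so we never
// trim an entry some replica might still need.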
4090
4091 void PeeringState::calc_trim_to_aggressive()
4092 {
4093 size_t target = pl->get_target_pg_log_entries();
4094
4095 // limit pg log trimming up to the can_rollback_to value
4096 eversion_t limit = std::min({
4097 pg_log.get_head(),
4098 pg_log.get_can_rollback_to(),
4099 last_update_ondisk});
4100 psdout(10) << __func__ << " limit = " << limit << dendl;
4101
4102 if (limit != eversion_t() &&
4103 limit != pg_trim_to &&
4104 pg_log.get_log().approx_size() > target) {
4105 psdout(10) << __func__ << " approx pg log length = "
4106 << pg_log.get_log().approx_size() << dendl;
4107 uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4108 cct->_conf->osd_pg_log_trim_max);
4109 psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
4110 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4111 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4112 return;
4113 }
4114 auto it = pg_log.get_log().log.begin(); // oldest log entry
4115 auto rit = pg_log.get_log().log.rbegin();
4116 eversion_t by_n_to_keep; // start from tail
4117 eversion_t by_n_to_trim = eversion_t::max(); // start from head
4118 for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4119 i++;
4120 if (i > target && by_n_to_keep == eversion_t()) {
4121 by_n_to_keep = rit->version;
4122 }
4123 if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4124 by_n_to_trim = it->version;
4125 }
4126 if (by_n_to_keep != eversion_t() &&
4127 by_n_to_trim != eversion_t::max()) {
4128 break;
4129 }
4130 }
4131
4132 if (by_n_to_keep == eversion_t()) {
4133 return;
4134 }
4135
4136 pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4137 psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4138 ceph_assert(pg_trim_to <= pg_log.get_head());
4139 }
4140 }
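// (Annotation, not part of the original source.) The forward/reverse scan
// above computes two candidate cut points. For a log v1..v10
// (oldest..newest) with target = 4 and num_to_trim = 5:
//
//   by_n_to_keep = v6   // first i > target, counting from the head:
//                       // trimming to v6 keeps the 4 newest, v7..v10
//   by_n_to_trim = v5   // the 5th entry from the tail
//   pg_trim_to   = min(v6, v5, limit) = v5   // assuming limit >= v5
//
// i.e. trim the 5 oldest entries unless that would cut into the `target`
// newest entries or exceed `limit`.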
4141
4142 void PeeringState::apply_op_stats(
4143 const hobject_t &soid,
4144 const object_stat_sum_t &delta_stats)
4145 {
4146 info.stats.stats.add(delta_stats);
4147 info.stats.stats.floor(0);
4148
4149 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
4150 i != get_backfill_targets().end();
4151 ++i) {
4152 pg_shard_t bt = *i;
4153 pg_info_t& pinfo = peer_info[bt];
4154 if (soid <= pinfo.last_backfill)
4155 pinfo.stats.stats.add(delta_stats);
4156 }
4157 }
4158
4159 void PeeringState::update_complete_backfill_object_stats(
4160 const hobject_t &hoid,
4161 const pg_stat_t &stats)
4162 {
4163 for (auto &&bt: get_backfill_targets()) {
4164 pg_info_t& pinfo = peer_info[bt];
4165 // Add stats to all peers that were missing the object
4166 if (hoid > pinfo.last_backfill)
4167 pinfo.stats.add(stats);
4168 }
4169 }
4170
4171 void PeeringState::update_peer_last_backfill(
4172 pg_shard_t peer,
4173 const hobject_t &new_last_backfill)
4174 {
4175 pg_info_t &pinfo = peer_info[peer];
4176 pinfo.last_backfill = new_last_backfill;
4177 if (new_last_backfill.is_max()) {
4178 /* pinfo.stats might be wrong if we did log-based recovery on the
4179 * backfilled portion in addition to continuing backfill.
4180 */
4181 pinfo.stats = info.stats;
4182 }
4183 }
4184
4185 void PeeringState::set_revert_with_targets(
4186 const hobject_t &soid,
4187 const set<pg_shard_t> &good_peers)
4188 {
4189 for (auto &&peer: good_peers) {
4190 missing_loc.add_location(soid, peer);
4191 }
4192 }
4193
4194 void PeeringState::prepare_backfill_for_missing(
4195 const hobject_t &soid,
4196 const eversion_t &version,
4197 const vector<pg_shard_t> &targets) {
4198 for (auto &&peer: targets) {
4199 peer_missing[peer].add(soid, version, eversion_t(), false);
4200 }
4201 }
4202
4203 void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
4204 {
4205 info.hit_set = hset_history;
4206 }
4207
4208 /*------------ Peering State Machine----------------*/
4209 #undef dout_prefix
4210 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4211 << "state<" << get_state_name() << ">: ")
4212 #undef psdout
4213 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4214
4215 #define DECLARE_LOCALS \
4216 PeeringState *ps = context< PeeringMachine >().state; \
4217 std::ignore = ps; \
4218 PeeringListener *pl = context< PeeringMachine >().pl; \
4219 std::ignore = pl
4220
4221
4222 /*------Crashed-------*/
4223 PeeringState::Crashed::Crashed(my_context ctx)
4224 : my_base(ctx),
4225 NamedState(context< PeeringMachine >().state_history, "Crashed")
4226 {
4227 context< PeeringMachine >().log_enter(state_name);
4228 ceph_abort_msg("we got a bad state machine event");
4229 }
4230
4231
4232 /*------Initial-------*/
4233 PeeringState::Initial::Initial(my_context ctx)
4234 : my_base(ctx),
4235 NamedState(context< PeeringMachine >().state_history, "Initial")
4236 {
4237 context< PeeringMachine >().log_enter(state_name);
4238 }
4239
4240 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4241 {
4242 DECLARE_LOCALS;
4243 ps->proc_replica_info(
4244 notify.from, notify.notify.info, notify.notify.epoch_sent);
4245 ps->set_last_peering_reset();
4246 return transit< Primary >();
4247 }
4248
4249 boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
4250 {
4251 DECLARE_LOCALS;
4252 ceph_assert(!ps->is_primary());
4253 post_event(i);
4254 return transit< Stray >();
4255 }
4256
4257 boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
4258 {
4259 DECLARE_LOCALS;
4260 ceph_assert(!ps->is_primary());
4261 post_event(i);
4262 return transit< Stray >();
4263 }
4264
4265 void PeeringState::Initial::exit()
4266 {
4267 context< PeeringMachine >().log_exit(state_name, enter_time);
4268 DECLARE_LOCALS;
4269 utime_t dur = ceph_clock_now() - enter_time;
4270 pl->get_peering_perf().tinc(rs_initial_latency, dur);
4271 }
4272
4273 /*------Started-------*/
4274 PeeringState::Started::Started(my_context ctx)
4275 : my_base(ctx),
4276 NamedState(context< PeeringMachine >().state_history, "Started")
4277 {
4278 context< PeeringMachine >().log_enter(state_name);
4279 }
4280
4281 boost::statechart::result
4282 PeeringState::Started::react(const IntervalFlush&)
4283 {
4284 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4285 context< PeeringMachine >().state->end_block_outgoing();
4286 return discard_event();
4287 }
4288
4289 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4290 {
4291 DECLARE_LOCALS;
4292 psdout(10) << "Started advmap" << dendl;
4293 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4294 if (ps->should_restart_peering(
4295 advmap.up_primary,
4296 advmap.acting_primary,
4297 advmap.newup,
4298 advmap.newacting,
4299 advmap.lastmap,
4300 advmap.osdmap)) {
4301 psdout(10) << "should_restart_peering, transitioning to Reset"
4302 << dendl;
4303 post_event(advmap);
4304 return transit< Reset >();
4305 }
4306 ps->remove_down_peer_info(advmap.osdmap);
4307 return discard_event();
4308 }
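// (Annotation, not part of the original source.) post_event() before
// transit<>() is the recurring idiom here: the event is queued again so
// the destination state gets a chance to handle it. A minimal
// boost::statechart sketch of the pattern (hypothetical states):
//
//   #include <boost/statechart/event.hpp>
//   #include <boost/statechart/state_machine.hpp>
//   #include <boost/statechart/simple_state.hpp>
//   #include <boost/statechart/custom_reaction.hpp>
//   namespace sc = boost::statechart;
//
//   struct EvMap : sc::event<EvMap> {};
//   struct SStarted;
//   struct SReset;
//   struct Machine : sc::state_machine<Machine, SStarted> {};
//   struct SStarted : sc::simple_state<SStarted, Machine> {
//     typedef sc::custom_reaction<EvMap> reactions;
//     sc::result react(const EvMap &ev);
//   };
//   struct SReset : sc::simple_state<SReset, Machine> {
//     typedef sc::custom_reaction<EvMap> reactions;
//     sc::result react(const EvMap&) { return discard_event(); }
//   };
//   sc::result SStarted::react(const EvMap &ev) {
//     post_event(ev);            // requeue for the next state
//     return transit<SReset>();  // SReset::react(EvMap) runs next
//   }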
4309
4310 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4311 {
4312 q.f->open_object_section("state");
4313 q.f->dump_string("name", state_name);
4314 q.f->dump_stream("enter_time") << enter_time;
4315 q.f->close_section();
4316 return discard_event();
4317 }
4318
4319 void PeeringState::Started::exit()
4320 {
4321 context< PeeringMachine >().log_exit(state_name, enter_time);
4322 DECLARE_LOCALS;
4323 utime_t dur = ceph_clock_now() - enter_time;
4324 pl->get_peering_perf().tinc(rs_started_latency, dur);
4325 ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4326 }
4327
4328 /*--------Reset---------*/
4329 PeeringState::Reset::Reset(my_context ctx)
4330 : my_base(ctx),
4331 NamedState(context< PeeringMachine >().state_history, "Reset")
4332 {
4333 context< PeeringMachine >().log_enter(state_name);
4334 DECLARE_LOCALS;
4335
4336 ps->flushes_in_progress = 0;
4337 ps->set_last_peering_reset();
4338 ps->log_weirdness();
4339 }
4340
4341 boost::statechart::result
4342 PeeringState::Reset::react(const IntervalFlush&)
4343 {
4344 psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4345 context< PeeringMachine >().state->end_block_outgoing();
4346 return discard_event();
4347 }
4348
4349 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4350 {
4351 DECLARE_LOCALS;
4352 psdout(10) << "Reset advmap" << dendl;
4353
4354 ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4355
4356 if (ps->should_restart_peering(
4357 advmap.up_primary,
4358 advmap.acting_primary,
4359 advmap.newup,
4360 advmap.newacting,
4361 advmap.lastmap,
4362 advmap.osdmap)) {
4363 psdout(10) << "should restart peering, calling start_peering_interval again"
4364 << dendl;
4365 ps->start_peering_interval(
4366 advmap.lastmap,
4367 advmap.newup, advmap.up_primary,
4368 advmap.newacting, advmap.acting_primary,
4369 context< PeeringMachine >().get_cur_transaction());
4370 }
4371 ps->remove_down_peer_info(advmap.osdmap);
4372 ps->check_past_interval_bounds();
4373 return discard_event();
4374 }
4375
4376 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4377 {
4378 DECLARE_LOCALS;
4379 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4380 ps->info.history.refresh_prior_readable_until_ub(
4381 pl->get_mnow(),
4382 ps->prior_readable_until_ub);
4383 context< PeeringMachine >().send_notify(
4384 ps->get_primary().osd,
4385 pg_notify_t(
4386 ps->get_primary().shard, ps->pg_whoami.shard,
4387 ps->get_osdmap_epoch(),
4388 ps->get_osdmap_epoch(),
4389 ps->info,
4390 ps->past_intervals));
4391 }
4392
4393 ps->update_heartbeat_peers();
4394
4395 return transit< Started >();
4396 }
4397
4398 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4399 {
4400 q.f->open_object_section("state");
4401 q.f->dump_string("name", state_name);
4402 q.f->dump_stream("enter_time") << enter_time;
4403 q.f->close_section();
4404 return discard_event();
4405 }
4406
4407 void PeeringState::Reset::exit()
4408 {
4409 context< PeeringMachine >().log_exit(state_name, enter_time);
4410 DECLARE_LOCALS;
4411 utime_t dur = ceph_clock_now() - enter_time;
4412 pl->get_peering_perf().tinc(rs_reset_latency, dur);
4413 }
4414
4415 /*-------Start---------*/
4416 PeeringState::Start::Start(my_context ctx)
4417 : my_base(ctx),
4418 NamedState(context< PeeringMachine >().state_history, "Start")
4419 {
4420 context< PeeringMachine >().log_enter(state_name);
4421
4422 DECLARE_LOCALS;
4423 if (ps->is_primary()) {
4424 psdout(1) << "transitioning to Primary" << dendl;
4425 post_event(MakePrimary());
4426 } else { // is_stray
4427 psdout(1) << "transitioning to Stray" << dendl;
4428 post_event(MakeStray());
4429 }
4430 }
4431
4432 void PeeringState::Start::exit()
4433 {
4434 context< PeeringMachine >().log_exit(state_name, enter_time);
4435 DECLARE_LOCALS;
4436 utime_t dur = ceph_clock_now() - enter_time;
4437 pl->get_peering_perf().tinc(rs_start_latency, dur);
4438 }
4439
4440 /*---------Primary--------*/
4441 PeeringState::Primary::Primary(my_context ctx)
4442 : my_base(ctx),
4443 NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4444 {
4445 context< PeeringMachine >().log_enter(state_name);
4446 DECLARE_LOCALS;
4447 ceph_assert(ps->want_acting.empty());
4448
4449 // set CREATING bit until we have peered for the first time.
4450 if (ps->info.history.last_epoch_started == 0) {
4451 ps->state_set(PG_STATE_CREATING);
4452 // use the history timestamp, which ultimately comes from the
4453 // monitor in the create case.
4454 utime_t t = ps->info.history.last_scrub_stamp;
4455 ps->info.stats.last_fresh = t;
4456 ps->info.stats.last_active = t;
4457 ps->info.stats.last_change = t;
4458 ps->info.stats.last_peered = t;
4459 ps->info.stats.last_clean = t;
4460 ps->info.stats.last_unstale = t;
4461 ps->info.stats.last_undegraded = t;
4462 ps->info.stats.last_fullsized = t;
4463 ps->info.stats.last_scrub_stamp = t;
4464 ps->info.stats.last_deep_scrub_stamp = t;
4465 ps->info.stats.last_clean_scrub_stamp = t;
4466 }
4467 }
4468
4469 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4470 {
4471 DECLARE_LOCALS;
4472 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4473 ps->proc_replica_info(
4474 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4475 return discard_event();
4476 }
4477
4478 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4479 {
4480 DECLARE_LOCALS;
4481 psdout(7) << "handle ActMap primary" << dendl;
4482 pl->publish_stats_to_osd();
4483 return discard_event();
4484 }
4485
4486 boost::statechart::result PeeringState::Primary::react(
4487 const SetForceRecovery&)
4488 {
4489 DECLARE_LOCALS;
4490 ps->set_force_recovery(true);
4491 return discard_event();
4492 }
4493
4494 boost::statechart::result PeeringState::Primary::react(
4495 const UnsetForceRecovery&)
4496 {
4497 DECLARE_LOCALS;
4498 ps->set_force_recovery(false);
4499 return discard_event();
4500 }
4501
4502 boost::statechart::result PeeringState::Primary::react(
4503 const RequestScrub& evt)
4504 {
4505 DECLARE_LOCALS;
4506 if (ps->is_primary()) {
4507 pl->scrub_requested(evt.deep, evt.repair);
4508 psdout(10) << "marking for scrub" << dendl;
4509 }
4510 return discard_event();
4511 }
4512
4513 boost::statechart::result PeeringState::Primary::react(
4514 const SetForceBackfill&)
4515 {
4516 DECLARE_LOCALS;
4517 ps->set_force_backfill(true);
4518 return discard_event();
4519 }
4520
4521 boost::statechart::result PeeringState::Primary::react(
4522 const UnsetForceBackfill&)
4523 {
4524 DECLARE_LOCALS;
4525 ps->set_force_backfill(false);
4526 return discard_event();
4527 }
4528
4529 void PeeringState::Primary::exit()
4530 {
4531 context< PeeringMachine >().log_exit(state_name, enter_time);
4532 DECLARE_LOCALS;
4533 ps->want_acting.clear();
4534 utime_t dur = ceph_clock_now() - enter_time;
4535 pl->get_peering_perf().tinc(rs_primary_latency, dur);
4536 pl->clear_primary_state();
4537 ps->state_clear(PG_STATE_CREATING);
4538 }
4539
4540 /*---------Peering--------*/
4541 PeeringState::Peering::Peering(my_context ctx)
4542 : my_base(ctx),
4543 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4544 history_les_bound(false)
4545 {
4546 context< PeeringMachine >().log_enter(state_name);
4547 DECLARE_LOCALS;
4548
4549 ceph_assert(!ps->is_peered());
4550 ceph_assert(!ps->is_peering());
4551 ceph_assert(ps->is_primary());
4552 ps->state_set(PG_STATE_PEERING);
4553 }
4554
4555 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4556 {
4557 DECLARE_LOCALS;
4558 psdout(10) << "Peering advmap" << dendl;
4559 if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4560 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4561 post_event(advmap);
4562 return transit< Reset >();
4563 }
4564
4565 ps->adjust_need_up_thru(advmap.osdmap);
4566 ps->check_prior_readable_down_osds(advmap.osdmap);
4567
4568 return forward_event();
4569 }
4570
4571 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4572 {
4573 DECLARE_LOCALS;
4574
4575 q.f->open_object_section("state");
4576 q.f->dump_string("name", state_name);
4577 q.f->dump_stream("enter_time") << enter_time;
4578
4579 q.f->open_array_section("past_intervals");
4580 ps->past_intervals.dump(q.f);
4581 q.f->close_section();
4582
4583 q.f->open_array_section("probing_osds");
4584 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
4585 p != prior_set.probe.end();
4586 ++p)
4587 q.f->dump_stream("osd") << *p;
4588 q.f->close_section();
4589
4590 if (prior_set.pg_down)
4591 q.f->dump_string("blocked", "peering is blocked due to down osds");
4592
4593 q.f->open_array_section("down_osds_we_would_probe");
4594 for (set<int>::iterator p = prior_set.down.begin();
4595 p != prior_set.down.end();
4596 ++p)
4597 q.f->dump_int("osd", *p);
4598 q.f->close_section();
4599
4600 q.f->open_array_section("peering_blocked_by");
4601 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
4602 p != prior_set.blocked_by.end();
4603 ++p) {
4604 q.f->open_object_section("osd");
4605 q.f->dump_int("osd", p->first);
4606 q.f->dump_int("current_lost_at", p->second);
4607 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4608 q.f->close_section();
4609 }
4610 q.f->close_section();
4611
4612 if (history_les_bound) {
4613 q.f->open_array_section("peering_blocked_by_detail");
4614 q.f->open_object_section("item");
4615 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4616 q.f->close_section();
4617 q.f->close_section();
4618 }
4619
4620 q.f->close_section();
4621 return forward_event();
4622 }
4623
4624 void PeeringState::Peering::exit()
4625 {
4626
4627 DECLARE_LOCALS;
4628 psdout(10) << "Leaving Peering" << dendl;
4629 context< PeeringMachine >().log_exit(state_name, enter_time);
4630 ps->state_clear(PG_STATE_PEERING);
4631 pl->clear_probe_targets();
4632
4633 utime_t dur = ceph_clock_now() - enter_time;
4634 pl->get_peering_perf().tinc(rs_peering_latency, dur);
4635 }
4636
4637
4638 /*------Backfilling-------*/
4639 PeeringState::Backfilling::Backfilling(my_context ctx)
4640 : my_base(ctx),
4641 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
4642 {
4643 context< PeeringMachine >().log_enter(state_name);
4644
4646 DECLARE_LOCALS;
4647 ps->backfill_reserved = true;
4648 pl->on_backfill_reserved();
4649 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
4650 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4651 ps->state_set(PG_STATE_BACKFILLING);
4652 pl->publish_stats_to_osd();
4653 }
4654
4655 void PeeringState::Backfilling::backfill_release_reservations()
4656 {
4657 DECLARE_LOCALS;
4658 pl->cancel_local_background_io_reservation();
4659 for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
4660 it != ps->backfill_targets.end();
4661 ++it) {
4662 ceph_assert(*it != ps->pg_whoami);
4663 pl->send_cluster_message(
4664 it->osd,
4665 new MBackfillReserve(
4666 MBackfillReserve::RELEASE,
4667 spg_t(ps->info.pgid.pgid, it->shard),
4668 ps->get_osdmap_epoch()),
4669 ps->get_osdmap_epoch());
4670 }
4671 }
4672
4673 void PeeringState::Backfilling::cancel_backfill()
4674 {
4675 DECLARE_LOCALS;
4676 backfill_release_reservations();
4677 pl->on_backfill_canceled();
4678 }
4679
4680 boost::statechart::result
4681 PeeringState::Backfilling::react(const Backfilled &c)
4682 {
4683 backfill_release_reservations();
4684 return transit<Recovered>();
4685 }
4686
4687 boost::statechart::result
4688 PeeringState::Backfilling::react(const DeferBackfill &c)
4689 {
4690 DECLARE_LOCALS;
4691
4692 psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
4693 ps->state_set(PG_STATE_BACKFILL_WAIT);
4694 ps->state_clear(PG_STATE_BACKFILLING);
4695 cancel_backfill();
4696
4697 pl->schedule_event_after(
4698 std::make_shared<PGPeeringEvent>(
4699 ps->get_osdmap_epoch(),
4700 ps->get_osdmap_epoch(),
4701 RequestBackfill()),
4702 c.delay);
4703 return transit<NotBackfilling>();
4704 }
4705
4706 boost::statechart::result
4707 PeeringState::Backfilling::react(const UnfoundBackfill &c)
4708 {
4709 DECLARE_LOCALS;
4710 psdout(10) << "backfill has unfound, can't continue" << dendl;
4711 ps->state_set(PG_STATE_BACKFILL_UNFOUND);
4712 ps->state_clear(PG_STATE_BACKFILLING);
4713 cancel_backfill();
4714 return transit<NotBackfilling>();
4715 }
4716
4717 boost::statechart::result
4718 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
4719 {
4720 DECLARE_LOCALS;
4721
4722 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4723 ps->state_clear(PG_STATE_BACKFILLING);
4724 cancel_backfill();
4725
4726 pl->schedule_event_after(
4727 std::make_shared<PGPeeringEvent>(
4728 ps->get_osdmap_epoch(),
4729 ps->get_osdmap_epoch(),
4730 RequestBackfill()),
4731 ps->cct->_conf->osd_backfill_retry_interval);
4732
4733 return transit<NotBackfilling>();
4734 }
4735
4736 boost::statechart::result
4737 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
4738 {
4739 DECLARE_LOCALS;
4740 ps->state_set(PG_STATE_BACKFILL_WAIT);
4741 cancel_backfill();
4742 if (ps->needs_backfill()) {
4743 return transit<WaitLocalBackfillReserved>();
4744 } else {
4745 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4746 return discard_event();
4747 }
4748 }
4749
4750 void PeeringState::Backfilling::exit()
4751 {
4752 context< PeeringMachine >().log_exit(state_name, enter_time);
4753 DECLARE_LOCALS;
4754 ps->backfill_reserved = false;
4755 ps->state_clear(PG_STATE_BACKFILLING);
4756 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
4757 utime_t dur = ceph_clock_now() - enter_time;
4758 pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
4759 }
4760
4761 /*--WaitRemoteBackfillReserved--*/
4762
4763 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
4764 : my_base(ctx),
4765 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4766 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
4767 {
4768 context< PeeringMachine >().log_enter(state_name);
4769 DECLARE_LOCALS;
4770
4771 ps->state_set(PG_STATE_BACKFILL_WAIT);
4772 pl->publish_stats_to_osd();
4773 post_event(RemoteBackfillReserved());
4774 }
4775
4776 boost::statechart::result
4777 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
4778 {
4779 DECLARE_LOCALS;
4780
4781 int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
4782 psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
4783 if (backfill_osd_it !=
4784 context< Active >().remote_shards_to_reserve_backfill.end()) {
4785 // The primary never backfills itself
4786 ceph_assert(*backfill_osd_it != ps->pg_whoami);
4787 pl->send_cluster_message(
4788 backfill_osd_it->osd,
4789 new MBackfillReserve(
4790 MBackfillReserve::REQUEST,
4791 spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
4792 ps->get_osdmap_epoch(),
4793 ps->get_backfill_priority(),
4794 num_bytes,
4795 ps->peer_bytes[*backfill_osd_it]),
4796 ps->get_osdmap_epoch());
4797 ++backfill_osd_it;
4798 } else {
4799 ps->peer_bytes.clear();
4800 post_event(AllBackfillsReserved());
4801 }
4802 return discard_event();
4803 }
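// (Annotation, not part of the original source.) Reservations are
// acquired one backfill shard at a time. The constructor above posts a
// synthetic RemoteBackfillReserved to start the chain, then each GRANT
// re-enters react() and requests the next shard:
//
//   post_event(RemoteBackfillReserved())   // kick-start, no peer yet
//   react(): REQUEST -> shard[0], ++it     // wait for its GRANT
//   react(): REQUEST -> shard[1], ++it     // ... and so on ...
//   react(): it == end -> post_event(AllBackfillsReserved())
//
// A rejection or revocation at any point runs retry(), which RELEASEs
// everything acquired so far (the shards before backfill_osd_it).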
4804
4805 void PeeringState::WaitRemoteBackfillReserved::exit()
4806 {
4807 context< PeeringMachine >().log_exit(state_name, enter_time);
4808 DECLARE_LOCALS;
4809
4810 utime_t dur = ceph_clock_now() - enter_time;
4811 pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
4812 }
4813
4814 void PeeringState::WaitRemoteBackfillReserved::retry()
4815 {
4816 DECLARE_LOCALS;
4817 pl->cancel_local_background_io_reservation();
4818
4819 // Send CANCEL to all previously acquired reservations
4820 set<pg_shard_t>::const_iterator it, begin, end;
4821 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
4822 end = context< Active >().remote_shards_to_reserve_backfill.end();
4823 ceph_assert(begin != end);
4824 for (it = begin; it != backfill_osd_it; ++it) {
4825 // The primary never backfills itself
4826 ceph_assert(*it != ps->pg_whoami);
4827 pl->send_cluster_message(
4828 it->osd,
4829 new MBackfillReserve(
4830 MBackfillReserve::RELEASE,
4831 spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
4832 ps->get_osdmap_epoch()),
4833 ps->get_osdmap_epoch());
4834 }
4835
4836 ps->state_clear(PG_STATE_BACKFILL_WAIT);
4837 pl->publish_stats_to_osd();
4838
4839 pl->schedule_event_after(
4840 std::make_shared<PGPeeringEvent>(
4841 ps->get_osdmap_epoch(),
4842 ps->get_osdmap_epoch(),
4843 RequestBackfill()),
4844 ps->cct->_conf->osd_backfill_retry_interval);
4845 }
4846
4847 boost::statechart::result
4848 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
4849 {
4850 DECLARE_LOCALS;
4851 ps->state_set(PG_STATE_BACKFILL_TOOFULL);
4852 retry();
4853 return transit<NotBackfilling>();
4854 }
4855
4856 boost::statechart::result
4857 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
4858 {
4859 retry();
4860 return transit<NotBackfilling>();
4861 }
4862
4863 /*--WaitLocalBackfillReserved--*/
4864 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
4865 : my_base(ctx),
4866 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
4867 {
4868 context< PeeringMachine >().log_enter(state_name);
4869 DECLARE_LOCALS;
4870
4871 ps->state_set(PG_STATE_BACKFILL_WAIT);
4872 pl->request_local_background_io_reservation(
4873 ps->get_backfill_priority(),
4874 std::make_shared<PGPeeringEvent>(
4875 ps->get_osdmap_epoch(),
4876 ps->get_osdmap_epoch(),
4877 LocalBackfillReserved()),
4878 std::make_shared<PGPeeringEvent>(
4879 ps->get_osdmap_epoch(),
4880 ps->get_osdmap_epoch(),
4881 DeferBackfill(0.0)));
4882 pl->publish_stats_to_osd();
4883 }
4884
4885 void PeeringState::WaitLocalBackfillReserved::exit()
4886 {
4887 context< PeeringMachine >().log_exit(state_name, enter_time);
4888 DECLARE_LOCALS;
4889 utime_t dur = ceph_clock_now() - enter_time;
4890 pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
4891 }
4892
4893 /*----NotBackfilling------*/
4894 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
4895 : my_base(ctx),
4896 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
4897 {
4898 context< PeeringMachine >().log_enter(state_name);
4899 DECLARE_LOCALS;
4900 ps->state_clear(PG_STATE_REPAIR);
4901 pl->publish_stats_to_osd();
4902 }
4903
4904 boost::statechart::result
4905 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
4906 {
4907 return discard_event();
4908 }
4909
4910 boost::statechart::result
4911 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
4912 {
4913 return discard_event();
4914 }
4915
4916 void PeeringState::NotBackfilling::exit()
4917 {
4918 context< PeeringMachine >().log_exit(state_name, enter_time);
4919
4920 DECLARE_LOCALS;
4921 ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
4922 utime_t dur = ceph_clock_now() - enter_time;
4923 pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
4924 }
4925
4926 /*----NotRecovering------*/
4927 PeeringState::NotRecovering::NotRecovering(my_context ctx)
4928 : my_base(ctx),
4929 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
4930 {
4931 context< PeeringMachine >().log_enter(state_name);
4932 DECLARE_LOCALS;
4933 ps->state_clear(PG_STATE_REPAIR);
4934 pl->publish_stats_to_osd();
4935 }
4936
4937 void PeeringState::NotRecovering::exit()
4938 {
4939 context< PeeringMachine >().log_exit(state_name, enter_time);
4940
4941 DECLARE_LOCALS;
4942 ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
4943 utime_t dur = ceph_clock_now() - enter_time;
4944 pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
4945 }
4946
4947 /*---RepNotRecovering----*/
4948 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
4949 : my_base(ctx),
4950 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
4951 {
4952 context< PeeringMachine >().log_enter(state_name);
4953 }
4954
4955 boost::statechart::result
4956 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
4957 {
4958 DECLARE_LOCALS;
4959 ps->reject_reservation();
4960 post_event(RemoteReservationRejectedTooFull());
4961 return discard_event();
4962 }
4963
4964 void PeeringState::RepNotRecovering::exit()
4965 {
4966 context< PeeringMachine >().log_exit(state_name, enter_time);
4967 DECLARE_LOCALS;
4968 utime_t dur = ceph_clock_now() - enter_time;
4969 pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
4970 }
4971
4972 /*---RepWaitRecoveryReserved--*/
4973 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
4974 : my_base(ctx),
4975 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
4976 {
4977 context< PeeringMachine >().log_enter(state_name);
4978 }
4979
4980 boost::statechart::result
4981 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
4982 {
4983 DECLARE_LOCALS;
4984 pl->send_cluster_message(
4985 ps->primary.osd,
4986 new MRecoveryReserve(
4987 MRecoveryReserve::GRANT,
4988 spg_t(ps->info.pgid.pgid, ps->primary.shard),
4989 ps->get_osdmap_epoch()),
4990 ps->get_osdmap_epoch());
4991 return transit<RepRecovering>();
4992 }
4993
4994 boost::statechart::result
4995 PeeringState::RepWaitRecoveryReserved::react(
4996 const RemoteReservationCanceled &evt)
4997 {
4998 DECLARE_LOCALS;
4999 pl->unreserve_recovery_space();
5000
5001 pl->cancel_remote_recovery_reservation();
5002 return transit<RepNotRecovering>();
5003 }
5004
5005 void PeeringState::RepWaitRecoveryReserved::exit()
5006 {
5007 context< PeeringMachine >().log_exit(state_name, enter_time);
5008 DECLARE_LOCALS;
5009 utime_t dur = ceph_clock_now() - enter_time;
5010 pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5011 }
5012
5013 /*-RepWaitBackfillReserved*/
5014 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5015 : my_base(ctx),
5016 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5017 {
5018 context< PeeringMachine >().log_enter(state_name);
5019 }
5020
5021 boost::statechart::result
5022 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5023 {
5024
5025 DECLARE_LOCALS;
5026
5027 if (!pl->try_reserve_recovery_space(
5028 evt.primary_num_bytes, evt.local_num_bytes)) {
5029 post_event(RejectTooFullRemoteReservation());
5030 } else {
5031 PGPeeringEventRef preempt;
5032 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5033 // older peers will interpret preemption as TOOFULL
5034 preempt = std::make_shared<PGPeeringEvent>(
5035 pl->get_osdmap_epoch(),
5036 pl->get_osdmap_epoch(),
5037 RemoteBackfillPreempted());
5038 }
5039 pl->request_remote_recovery_reservation(
5040 evt.priority,
5041 std::make_shared<PGPeeringEvent>(
5042 pl->get_osdmap_epoch(),
5043 pl->get_osdmap_epoch(),
5044 RemoteBackfillReserved()),
5045 preempt);
5046 }
5047 return transit<RepWaitBackfillReserved>();
5048 }
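// (Annotation, not part of the original source.) The preempt handle is
// only wired up when every up/acting peer speaks RECOVERY_RESERVATION_2;
// otherwise preemption would be misread as TOOFULL. Shape of the call:
//
//   pl->request_remote_recovery_reservation(
//     evt.priority,
//     on_grant,   // queues RemoteBackfillReserved when space is granted
//     preempt);   // queues RemoteBackfillPreempted, or null for old peers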
5049
5050 boost::statechart::result
5051 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5052 {
5053 DECLARE_LOCALS;
5054
5055 // fall back to a local reckoning of priority if the primary doesn't pass one
5056 // (pre-mimic compat)
5057 int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5058
5059 PGPeeringEventRef preempt;
5060 if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5061 // older peers can't handle this
5062 preempt = std::make_shared<PGPeeringEvent>(
5063 ps->get_osdmap_epoch(),
5064 ps->get_osdmap_epoch(),
5065 RemoteRecoveryPreempted());
5066 }
5067
5068 pl->request_remote_recovery_reservation(
5069 prio,
5070 std::make_shared<PGPeeringEvent>(
5071 ps->get_osdmap_epoch(),
5072 ps->get_osdmap_epoch(),
5073 RemoteRecoveryReserved()),
5074 preempt);
5075 return transit<RepWaitRecoveryReserved>();
5076 }
5077
5078 void PeeringState::RepWaitBackfillReserved::exit()
5079 {
5080 context< PeeringMachine >().log_exit(state_name, enter_time);
5081 DECLARE_LOCALS;
5082 utime_t dur = ceph_clock_now() - enter_time;
5083 pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5084 }
5085
5086 boost::statechart::result
5087 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5088 {
5089 DECLARE_LOCALS;
5090
5092 pl->send_cluster_message(
5093 ps->primary.osd,
5094 new MBackfillReserve(
5095 MBackfillReserve::GRANT,
5096 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5097 ps->get_osdmap_epoch()),
5098 ps->get_osdmap_epoch());
5099 return transit<RepRecovering>();
5100 }
5101
5102 boost::statechart::result
5103 PeeringState::RepWaitBackfillReserved::react(
5104 const RejectTooFullRemoteReservation &evt)
5105 {
5106 DECLARE_LOCALS;
5107 ps->reject_reservation();
5108 post_event(RemoteReservationRejectedTooFull());
5109 return discard_event();
5110 }
5111
5112 boost::statechart::result
5113 PeeringState::RepWaitBackfillReserved::react(
5114 const RemoteReservationRejectedTooFull &evt)
5115 {
5116 DECLARE_LOCALS;
5117 pl->unreserve_recovery_space();
5118
5119 pl->cancel_remote_recovery_reservation();
5120 return transit<RepNotRecovering>();
5121 }
5122
5123 boost::statechart::result
5124 PeeringState::RepWaitBackfillReserved::react(
5125 const RemoteReservationCanceled &evt)
5126 {
5127 DECLARE_LOCALS;
5128 pl->unreserve_recovery_space();
5129
5130 pl->cancel_remote_recovery_reservation();
5131 return transit<RepNotRecovering>();
5132 }
5133
5134 /*---RepRecovering-------*/
5135 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5136 : my_base(ctx),
5137 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5138 {
5139 context< PeeringMachine >().log_enter(state_name);
5140 }
5141
5142 boost::statechart::result
5143 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5144 {
5145 DECLARE_LOCALS;
5146
5147
5148 pl->unreserve_recovery_space();
5149 pl->send_cluster_message(
5150 ps->primary.osd,
5151 new MRecoveryReserve(
5152 MRecoveryReserve::REVOKE,
5153 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5154 ps->get_osdmap_epoch()),
5155 ps->get_osdmap_epoch());
5156 return discard_event();
5157 }
5158
5159 boost::statechart::result
5160 PeeringState::RepRecovering::react(const BackfillTooFull &)
5161 {
5162 DECLARE_LOCALS;
5163
5164
5165 pl->unreserve_recovery_space();
5166 pl->send_cluster_message(
5167 ps->primary.osd,
5168 new MBackfillReserve(
5169 MBackfillReserve::REVOKE_TOOFULL,
5170 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5171 ps->get_osdmap_epoch()),
5172 ps->get_osdmap_epoch());
5173 return discard_event();
5174 }
5175
5176 boost::statechart::result
5177 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5178 {
5179 DECLARE_LOCALS;
5180
5181
5182 pl->unreserve_recovery_space();
5183 pl->send_cluster_message(
5184 ps->primary.osd,
5185 new MBackfillReserve(
5186 MBackfillReserve::REVOKE,
5187 spg_t(ps->info.pgid.pgid, ps->primary.shard),
5188 ps->get_osdmap_epoch()),
5189 ps->get_osdmap_epoch());
5190 return discard_event();
5191 }
5192
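// Each preemption/too-full handler in RepRecovering follows the same
// pattern: free the locally reserved space, send the matching REVOKE-style
// message back to the primary (MRecoveryReserve::REVOKE,
// MBackfillReserve::REVOKE_TOOFULL, or MBackfillReserve::REVOKE), and stay
// in this state until the primary reacts; exit() below then performs the
// final cleanup unconditionally.
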
5193 void PeeringState::RepRecovering::exit()
5194 {
5195 context< PeeringMachine >().log_exit(state_name, enter_time);
5196 DECLARE_LOCALS;
5197 pl->unreserve_recovery_space();
5198
5199 pl->cancel_remote_recovery_reservation();
5200 utime_t dur = ceph_clock_now() - enter_time;
5201 pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5202 }
5203
5204 /*------Activating--------*/
5205 PeeringState::Activating::Activating(my_context ctx)
5206 : my_base(ctx),
5207 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5208 {
5209 context< PeeringMachine >().log_enter(state_name);
5210 }
5211
5212 void PeeringState::Activating::exit()
5213 {
5214 context< PeeringMachine >().log_exit(state_name, enter_time);
5215 DECLARE_LOCALS;
5216 utime_t dur = ceph_clock_now() - enter_time;
5217 pl->get_peering_perf().tinc(rs_activating_latency, dur);
5218 }
5219
5220 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5221 : my_base(ctx),
5222 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5223 {
5224 context< PeeringMachine >().log_enter(state_name);
5225 DECLARE_LOCALS;
5226
5227   // Make sure none of the nodes taking part in the recovery are full
5228 if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5229 ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5230 post_event(RecoveryTooFull());
5231 return;
5232 }
5233
5234 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5235 ps->state_set(PG_STATE_RECOVERY_WAIT);
5236 pl->request_local_background_io_reservation(
5237 ps->get_recovery_priority(),
5238 std::make_shared<PGPeeringEvent>(
5239 ps->get_osdmap_epoch(),
5240 ps->get_osdmap_epoch(),
5241 LocalRecoveryReserved()),
5242 std::make_shared<PGPeeringEvent>(
5243 ps->get_osdmap_epoch(),
5244 ps->get_osdmap_epoch(),
5245 DeferRecovery(0.0)));
5246 pl->publish_stats_to_osd();
5247 }
5248
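// Local reservations use the same grant/preempt event pairing as the
// remote path: LocalRecoveryReserved on grant, DeferRecovery(0.0) if the
// local reserver preempts us.  A minimal sketch of rescheduling after a
// too-full bounce, mirroring the handler below:
//
//   pl->schedule_event_after(
//     std::make_shared<PGPeeringEvent>(epoch, epoch, DoRecovery()),
//     ps->cct->_conf->osd_recovery_retry_interval);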
5249 boost::statechart::result
5250 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5251 {
5252 DECLARE_LOCALS;
5253 ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5254 pl->schedule_event_after(
5255 std::make_shared<PGPeeringEvent>(
5256 ps->get_osdmap_epoch(),
5257 ps->get_osdmap_epoch(),
5258 DoRecovery()),
5259 ps->cct->_conf->osd_recovery_retry_interval);
5260 return transit<NotRecovering>();
5261 }
5262
5263 void PeeringState::WaitLocalRecoveryReserved::exit()
5264 {
5265 context< PeeringMachine >().log_exit(state_name, enter_time);
5266 DECLARE_LOCALS;
5267 utime_t dur = ceph_clock_now() - enter_time;
5268 pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5269 }
5270
5271 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5272 : my_base(ctx),
5273 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5274 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5275 {
5276 context< PeeringMachine >().log_enter(state_name);
5277 post_event(RemoteRecoveryReserved());
5278 }
5279
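// WaitRemoteRecoveryReserved serializes its requests: the constructor
// posts a synthetic RemoteRecoveryReserved to itself, and each grant
// (including that first synthetic one) sends a single
// MRecoveryReserve::REQUEST and advances the iterator, so at most one
// remote reservation is in flight at a time.  Once the iterator is
// exhausted, AllRemotesReserved moves the PG on to Recovering.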
5280 boost::statechart::result
5281 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5282 DECLARE_LOCALS;
5283
5284 if (remote_recovery_reservation_it !=
5285 context< Active >().remote_shards_to_reserve_recovery.end()) {
5286 ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5287 pl->send_cluster_message(
5288 remote_recovery_reservation_it->osd,
5289 new MRecoveryReserve(
5290 MRecoveryReserve::REQUEST,
5291 spg_t(context< PeeringMachine >().spgid.pgid,
5292 remote_recovery_reservation_it->shard),
5293 ps->get_osdmap_epoch(),
5294 ps->get_recovery_priority()),
5295 ps->get_osdmap_epoch());
5296 ++remote_recovery_reservation_it;
5297 } else {
5298 post_event(AllRemotesReserved());
5299 }
5300 return discard_event();
5301 }
5302
5303 void PeeringState::WaitRemoteRecoveryReserved::exit()
5304 {
5305 context< PeeringMachine >().log_exit(state_name, enter_time);
5306 DECLARE_LOCALS;
5307 utime_t dur = ceph_clock_now() - enter_time;
5308 pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5309 }
5310
5311 PeeringState::Recovering::Recovering(my_context ctx)
5312 : my_base(ctx),
5313 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5314 {
5315 context< PeeringMachine >().log_enter(state_name);
5316
5317 DECLARE_LOCALS;
5318 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5319 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5320 ps->state_set(PG_STATE_RECOVERING);
5321 pl->on_recovery_reserved();
5322 ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5323 pl->publish_stats_to_osd();
5324 }
5325
5326 void PeeringState::Recovering::release_reservations(bool cancel)
5327 {
5328 DECLARE_LOCALS;
5329 ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5330
5331 // release remote reservations
5332 for (set<pg_shard_t>::const_iterator i =
5333 context< Active >().remote_shards_to_reserve_recovery.begin();
5334 i != context< Active >().remote_shards_to_reserve_recovery.end();
5335 ++i) {
5336 if (*i == ps->pg_whoami) // skip myself
5337 continue;
5338 pl->send_cluster_message(
5339 i->osd,
5340 new MRecoveryReserve(
5341 MRecoveryReserve::RELEASE,
5342 spg_t(ps->info.pgid.pgid, i->shard),
5343 ps->get_osdmap_epoch()),
5344 ps->get_osdmap_epoch());
5345 }
5346 }
5347
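// release_reservations() distinguishes a clean release (cancel == false,
// only legal once nothing is missing, per the assert above) from a
// cancellation (cancel == true, used by DeferRecovery and UnfoundRecovery
// below, where objects may still be missing).  Either way, every remote
// shard holding a reservation receives an MRecoveryReserve::RELEASE.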
5348 boost::statechart::result
5349 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5350 {
5351 DECLARE_LOCALS;
5352 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5353 release_reservations();
5354 pl->cancel_local_background_io_reservation();
5355 return transit<Recovered>();
5356 }
5357
5358 boost::statechart::result
5359 PeeringState::Recovering::react(const RequestBackfill &evt)
5360 {
5361 DECLARE_LOCALS;
5362
5363 release_reservations();
5364
5365 ps->state_clear(PG_STATE_FORCED_RECOVERY);
5366 pl->cancel_local_background_io_reservation();
5367 pl->publish_stats_to_osd();
5368   // transition any async_recovery_targets back into acting
5369   // so the pg won't have to stay undersized for long,
5370   // as backfill might take a long time to complete.
5371 if (!ps->async_recovery_targets.empty()) {
5372 pg_shard_t auth_log_shard;
5373 bool history_les_bound = false;
5374 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5375 }
5376 return transit<WaitLocalBackfillReserved>();
5377 }
5378
5379 boost::statechart::result
5380 PeeringState::Recovering::react(const DeferRecovery &evt)
5381 {
5382 DECLARE_LOCALS;
5383 if (!ps->state_test(PG_STATE_RECOVERING)) {
5384 // we may have finished recovery and have an AllReplicasRecovered
5385 // event queued to move us to the next state.
5386 psdout(10) << "got defer recovery but not recovering" << dendl;
5387 return discard_event();
5388 }
5389 psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5390 ps->state_set(PG_STATE_RECOVERY_WAIT);
5391 pl->cancel_local_background_io_reservation();
5392 release_reservations(true);
5393 pl->schedule_event_after(
5394 std::make_shared<PGPeeringEvent>(
5395 ps->get_osdmap_epoch(),
5396 ps->get_osdmap_epoch(),
5397 DoRecovery()),
5398 evt.delay);
5399 return transit<NotRecovering>();
5400 }
5401
5402 boost::statechart::result
5403 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5404 {
5405 DECLARE_LOCALS;
5406 psdout(10) << "recovery has unfound, can't continue" << dendl;
5407 ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5408 pl->cancel_local_background_io_reservation();
5409 release_reservations(true);
5410 return transit<NotRecovering>();
5411 }
5412
5413 void PeeringState::Recovering::exit()
5414 {
5415 context< PeeringMachine >().log_exit(state_name, enter_time);
5416
5417 DECLARE_LOCALS;
5418 utime_t dur = ceph_clock_now() - enter_time;
5419 ps->state_clear(PG_STATE_RECOVERING);
5420 pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5421 }
5422
5423 PeeringState::Recovered::Recovered(my_context ctx)
5424 : my_base(ctx),
5425 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5426 {
5427 pg_shard_t auth_log_shard;
5428
5429 context< PeeringMachine >().log_enter(state_name);
5430
5431 DECLARE_LOCALS;
5432
5433 ceph_assert(!ps->needs_recovery());
5434
5435 // if we finished backfill, all acting are active; recheck if
5436 // DEGRADED | UNDERSIZED is appropriate.
5437 ceph_assert(!ps->acting_recovery_backfill.empty());
5438 if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5439 ps->acting_recovery_backfill.size()) {
5440 ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5441 pl->publish_stats_to_osd();
5442 }
5443
5444 // adjust acting set? (e.g. because backfill completed...)
5445 bool history_les_bound = false;
5446 if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5447 true, &history_les_bound)) {
5448 ceph_assert(ps->want_acting.size());
5449 } else if (!ps->async_recovery_targets.empty()) {
5450 ps->choose_acting(auth_log_shard, true, &history_les_bound);
5451 }
5452
5453 if (context< Active >().all_replicas_activated &&
5454 ps->async_recovery_targets.empty())
5455 post_event(GoClean());
5456 }
5457
5458 void PeeringState::Recovered::exit()
5459 {
5460 context< PeeringMachine >().log_exit(state_name, enter_time);
5461 DECLARE_LOCALS;
5462
5463 utime_t dur = ceph_clock_now() - enter_time;
5464 pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5465 }
5466
5467 PeeringState::Clean::Clean(my_context ctx)
5468 : my_base(ctx),
5469 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5470 {
5471 context< PeeringMachine >().log_enter(state_name);
5472
5473 DECLARE_LOCALS;
5474
5475 if (ps->info.last_complete != ps->info.last_update) {
5476 ceph_abort();
5477 }
5478
5479
5480 ps->try_mark_clean();
5481
5482 context< PeeringMachine >().get_cur_transaction().register_on_commit(
5483 pl->on_clean());
5484 }
5485
5486 void PeeringState::Clean::exit()
5487 {
5488 context< PeeringMachine >().log_exit(state_name, enter_time);
5489
5490 DECLARE_LOCALS;
5491 ps->state_clear(PG_STATE_CLEAN);
5492 utime_t dur = ceph_clock_now() - enter_time;
5493 pl->get_peering_perf().tinc(rs_clean_latency, dur);
5494 }
5495
5496 template <typename T>
5497 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5498 {
5499 set<int> osds_found;
5500 set<pg_shard_t> out;
5501 for (typename T::const_iterator i = in.begin();
5502 i != in.end();
5503 ++i) {
5504 if (*i != skip && !osds_found.count(i->osd)) {
5505 osds_found.insert(i->osd);
5506 out.insert(*i);
5507 }
5508 }
5509 return out;
5510 }
5511
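// unique_osd_shard_set() collapses multiple shards on one OSD to a single
// entry keyed by osd id, so an erasure-coded PG with several shards on the
// same OSD requests only one reservation from it.  Illustrative use with
// hypothetical shard values (not taken from this file):
//
//   // acting = { 1(0), 1(1), 2(2) }  ->  reserve from { 1(0), 2(2) }
//   set<pg_shard_t> shards = unique_osd_shard_set(pg_whoami, acting);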
5512 /*---------Active---------*/
5513 PeeringState::Active::Active(my_context ctx)
5514 : my_base(ctx),
5515 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5516 remote_shards_to_reserve_recovery(
5517 unique_osd_shard_set(
5518 context< PeeringMachine >().state->pg_whoami,
5519 context< PeeringMachine >().state->acting_recovery_backfill)),
5520 remote_shards_to_reserve_backfill(
5521 unique_osd_shard_set(
5522 context< PeeringMachine >().state->pg_whoami,
5523 context< PeeringMachine >().state->backfill_targets)),
5524 all_replicas_activated(false)
5525 {
5526 context< PeeringMachine >().log_enter(state_name);
5527
5528
5529 DECLARE_LOCALS;
5530
5531 ceph_assert(!ps->backfill_reserved);
5532 ceph_assert(ps->is_primary());
5533 psdout(10) << "In Active, about to call activate" << dendl;
5534 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5535 ps->activate(context< PeeringMachine >().get_cur_transaction(),
5536 ps->get_osdmap_epoch(),
5537 context< PeeringMachine >().get_recovery_ctx());
5538
5539 // everyone has to commit/ack before we are truly active
5540 ps->blocked_by.clear();
5541 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
5542 p != ps->acting_recovery_backfill.end();
5543 ++p) {
5544 if (p->shard != ps->pg_whoami.shard) {
5545 ps->blocked_by.insert(p->shard);
5546 }
5547 }
5548 pl->publish_stats_to_osd();
5549 psdout(10) << "Activate Finished" << dendl;
5550 }
5551
5552 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5553 {
5554 DECLARE_LOCALS;
5555
5556 if (ps->should_restart_peering(
5557 advmap.up_primary,
5558 advmap.acting_primary,
5559 advmap.newup,
5560 advmap.newacting,
5561 advmap.lastmap,
5562 advmap.osdmap)) {
5563 psdout(10) << "Active advmap interval change, fast return" << dendl;
5564 return forward_event();
5565 }
5566 psdout(10) << "Active advmap" << dendl;
5567 bool need_publish = false;
5568
5569 pl->on_active_advmap(advmap.osdmap);
5570 if (ps->dirty_big_info) {
5571 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5572 // purged snaps and (b) perhaps share more snaps that we have purged
5573 // but didn't fit in pg_stat_t.
5574 need_publish = true;
5575 ps->share_pg_info();
5576 }
5577
5578 bool need_acting_change = false;
5579 for (size_t i = 0; i < ps->want_acting.size(); i++) {
5580 int osd = ps->want_acting[i];
5581 if (!advmap.osdmap->is_up(osd)) {
5582 pg_shard_t osd_with_shard(osd, shard_id_t(i));
5583 if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5584 psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5585 << dendl;
5586 need_acting_change = true;
5587 }
5588 }
5589 }
5590 if (need_acting_change) {
5591 psdout(10) << "Active need acting change, call choose_acting again"
5592 << dendl;
5593 // possibly because we re-add some strays into the acting set and
5594 // some of them then go down in a subsequent map before we could see
5595 // the map changing the pg temp.
5596 // call choose_acting again to clear them out.
5597     // note that we leave restrict_to_up_acting false so that we
5598     // don't needlessly evict any chosen stray that is still alive.
5599 pg_shard_t auth_log_shard;
5600 bool history_les_bound = false;
5601 ps->remove_down_peer_info(advmap.osdmap);
5602 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5603 }
5604
5605 /* Check for changes in pool size (if the acting set changed as a result,
5606 * this does not matter) */
5607 if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
5608 ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
5609 if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
5610 ps->actingset.size()) {
5611 ps->state_clear(PG_STATE_UNDERSIZED);
5612 } else {
5613 ps->state_set(PG_STATE_UNDERSIZED);
5614 }
5615     // degraded changes will be detected by the call to publish_stats_to_osd()
5616 need_publish = true;
5617 }
5618
5619 // if we haven't reported our PG stats in a long time, do so now.
5620 if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
5621 psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
5622 << " epochs" << dendl;
5623 need_publish = true;
5624 }
5625
5626 if (need_publish)
5627 pl->publish_stats_to_osd();
5628
5629 if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
5630 pl->recheck_readable();
5631 }
5632
5633 return forward_event();
5634 }
5635
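// Active::react(AdvMap) batches its stat updates: need_publish is set by
// (a) dirty_big_info forcing a share_pg_info(), (b) a pool size change
// flipping PG_STATE_UNDERSIZED, or (c) hitting the
// osd_pg_stat_report_interval_max deadline, and one publish_stats_to_osd()
// at the end covers all three.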
5636 boost::statechart::result PeeringState::Active::react(const ActMap&)
5637 {
5638 DECLARE_LOCALS;
5639 psdout(10) << "Active: handling ActMap" << dendl;
5640 ceph_assert(ps->is_primary());
5641
5642 pl->on_active_actmap();
5643
5644 if (ps->have_unfound()) {
5645 // object may have become unfound
5646 ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
5647 }
5648
5649 uint64_t unfound = ps->missing_loc.num_unfound();
5650 if (unfound > 0 &&
5651 ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
5652 if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
5653 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
5654 << " objects unfound and apparently lost, would automatically "
5655 << "mark these objects lost but this feature is not yet implemented "
5656 << "(osd_auto_mark_unfound_lost)";
5657 } else
5658 pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
5659 << unfound << " objects unfound and apparently lost";
5660 }
5661
5662 return forward_event();
5663 }
5664
5665 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
5666 {
5667
5668 DECLARE_LOCALS;
5669 ceph_assert(ps->is_primary());
5670 if (ps->peer_info.count(notevt.from)) {
5671 psdout(10) << "Active: got notify from " << notevt.from
5672 << ", already have info from that osd, ignoring"
5673 << dendl;
5674 } else if (ps->peer_purged.count(notevt.from)) {
5675 psdout(10) << "Active: got notify from " << notevt.from
5676 << ", already purged that peer, ignoring"
5677 << dendl;
5678 } else {
5679 psdout(10) << "Active: got notify from " << notevt.from
5680 << ", calling proc_replica_info and discover_all_missing"
5681 << dendl;
5682 ps->proc_replica_info(
5683 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
5684 if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
5685 ps->discover_all_missing(
5686 context<PeeringMachine>().get_recovery_ctx().msgs);
5687 }
5688 // check if it is a previous down acting member that's coming back.
5689 // if so, request pg_temp change to trigger a new interval transition
5690 pg_shard_t auth_log_shard;
5691 bool history_les_bound = false;
5692 ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
5693 if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
5694 psdout(10) << "Active: got notify from previous acting member "
5695 << notevt.from << ", requesting pg_temp change"
5696 << dendl;
5697 }
5698 }
5699 return discard_event();
5700 }
5701
5702 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
5703 {
5704 DECLARE_LOCALS;
5705 ceph_assert(ps->is_primary());
5706
5707 // peer is informing us of their last_complete_ondisk
5708 ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
5709 ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
5710 trim.trim_to);
5711 // trim log when the pg is recovered
5712 ps->calc_min_last_complete_ondisk();
5713 return discard_event();
5714 }
5715
5716 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
5717 {
5718 DECLARE_LOCALS;
5719 ceph_assert(ps->is_primary());
5720
5721 ceph_assert(!ps->acting_recovery_backfill.empty());
5722 if (infoevt.lease_ack) {
5723 ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
5724 }
5725 // don't update history (yet) if we are active and primary; the replica
5726 // may be telling us they have activated (and committed) but we can't
5727 // share that until _everyone_ does the same.
5728 if (ps->is_acting_recovery_backfill(infoevt.from) &&
5729 ps->peer_activated.count(infoevt.from) == 0) {
5730 psdout(10) << " peer osd." << infoevt.from
5731 << " activated and committed" << dendl;
5732 ps->peer_activated.insert(infoevt.from);
5733 ps->blocked_by.erase(infoevt.from.shard);
5734 pl->publish_stats_to_osd();
5735 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
5736 all_activated_and_committed();
5737 }
5738 }
5739 return discard_event();
5740 }
5741
5742 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
5743 {
5744 DECLARE_LOCALS;
5745 psdout(10) << "searching osd." << logevt.from
5746 << " log for unfound items" << dendl;
5747 ps->proc_replica_log(
5748 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
5749 bool got_missing = ps->search_for_missing(
5750 ps->peer_info[logevt.from],
5751 ps->peer_missing[logevt.from],
5752 logevt.from,
5753 context< PeeringMachine >().get_recovery_ctx());
5754 // If there are missing AND we are "fully" active then start recovery now
5755 if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
5756 post_event(DoRecovery());
5757 }
5758 return discard_event();
5759 }
5760
5761 boost::statechart::result PeeringState::Active::react(const QueryState& q)
5762 {
5763 DECLARE_LOCALS;
5764
5765 q.f->open_object_section("state");
5766 q.f->dump_string("name", state_name);
5767 q.f->dump_stream("enter_time") << enter_time;
5768
5769 {
5770 q.f->open_array_section("might_have_unfound");
5771 for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
5772 p != ps->might_have_unfound.end();
5773 ++p) {
5774 q.f->open_object_section("osd");
5775 q.f->dump_stream("osd") << *p;
5776 if (ps->peer_missing.count(*p)) {
5777 q.f->dump_string("status", "already probed");
5778 } else if (ps->peer_missing_requested.count(*p)) {
5779 q.f->dump_string("status", "querying");
5780 } else if (!ps->get_osdmap()->is_up(p->osd)) {
5781 q.f->dump_string("status", "osd is down");
5782 } else {
5783 q.f->dump_string("status", "not queried");
5784 }
5785 q.f->close_section();
5786 }
5787 q.f->close_section();
5788 }
5789 {
5790 q.f->open_object_section("recovery_progress");
5791 q.f->open_array_section("backfill_targets");
5792 for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
5793 p != ps->backfill_targets.end(); ++p)
5794 q.f->dump_stream("replica") << *p;
5795 q.f->close_section();
5796 pl->dump_recovery_info(q.f);
5797 q.f->close_section();
5798 }
5799
5800 q.f->close_section();
5801 return forward_event();
5802 }
5803
5804 boost::statechart::result PeeringState::Active::react(
5805 const ActivateCommitted &evt)
5806 {
5807 DECLARE_LOCALS;
5808 ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
5809 ps->peer_activated.insert(ps->pg_whoami);
5810 psdout(10) << "_activate_committed " << evt.epoch
5811 << " peer_activated now " << ps->peer_activated
5812 << " last_interval_started "
5813 << ps->info.history.last_interval_started
5814 << " last_epoch_started "
5815 << ps->info.history.last_epoch_started
5816 << " same_interval_since "
5817 << ps->info.history.same_interval_since
5818 << dendl;
5819 ceph_assert(!ps->acting_recovery_backfill.empty());
5820 if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
5821 all_activated_and_committed();
5822 return discard_event();
5823 }
5824
5825 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
5826 {
5827
5828 DECLARE_LOCALS;
5829 pg_t pgid = context< PeeringMachine >().spgid.pgid;
5830
5831 all_replicas_activated = true;
5832
5833 ps->state_clear(PG_STATE_ACTIVATING);
5834 ps->state_clear(PG_STATE_CREATING);
5835 ps->state_clear(PG_STATE_PREMERGE);
5836
5837 bool merge_target;
5838 if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
5839 ps->state_set(PG_STATE_PEERED);
5840 ps->state_set(PG_STATE_PREMERGE);
5841
5842 if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
5843 if (merge_target) {
5844 pg_t src = pgid;
5845 src.set_ps(ps->pool.info.get_pg_num_pending());
5846 	  ceph_assert(src.get_parent() == pgid);
5847 pl->set_not_ready_to_merge_target(pgid, src);
5848 } else {
5849 pl->set_not_ready_to_merge_source(pgid);
5850 }
5851 }
5852 } else if (ps->acting.size() < ps->pool.info.min_size) {
5853 ps->state_set(PG_STATE_PEERED);
5854 } else {
5855 ps->state_set(PG_STATE_ACTIVE);
5856 }
5857
5858 auto mnow = pl->get_mnow();
5859 if (ps->prior_readable_until_ub > mnow) {
5860 psdout(10) << " waiting for prior_readable_until_ub "
5861 << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
5862 ps->state_set(PG_STATE_WAIT);
5863 pl->queue_check_readable(
5864 ps->last_peering_reset,
5865 ps->prior_readable_until_ub - mnow);
5866 } else {
5867 psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
5868 << ps->prior_readable_until_ub << dendl;
5869 }
5870
5871 if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
5872 pl->send_pg_created(pgid);
5873 }
5874
5875 ps->info.history.last_epoch_started = ps->info.last_epoch_started;
5876 ps->info.history.last_interval_started = ps->info.last_interval_started;
5877 ps->dirty_info = true;
5878
5879 ps->share_pg_info();
5880 pl->publish_stats_to_osd();
5881
5882 pl->on_activate_complete();
5883
5884 return discard_event();
5885 }
5886
5887 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
5888 {
5889 DECLARE_LOCALS;
5890 ps->proc_renew_lease();
5891 return discard_event();
5892 }
5893
5894 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
5895 {
5896 DECLARE_LOCALS;
5897 ps->proc_lease_ack(la.from, la.lease_ack);
5898 return discard_event();
5899 }
5900
5901
5902 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
5903 {
5904 DECLARE_LOCALS;
5905 pl->recheck_readable();
5906 return discard_event();
5907 }
5908
5909 /*
5910 * update info.history.last_epoch_started ONLY after we and all
5911 * replicas have activated AND committed the activate transaction
5912 * (i.e. the peering results are stable on disk).
5913 */
5914 void PeeringState::Active::all_activated_and_committed()
5915 {
5916 DECLARE_LOCALS;
5917 psdout(10) << "all_activated_and_committed" << dendl;
5918 ceph_assert(ps->is_primary());
5919 ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
5920 ceph_assert(!ps->acting_recovery_backfill.empty());
5921 ceph_assert(ps->blocked_by.empty());
5922
5923 if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
5924 // this is overkill when the activation is quick, but when it is slow it
5925 // is important, because the lease was renewed by the activate itself but we
5926 // don't know how long ago that was, and simply scheduling now may leave
5927 // a gap in lease coverage. keep it simple and aggressively renew.
5928 ps->renew_lease(pl->get_mnow());
5929 ps->send_lease();
5930 ps->schedule_renew_lease();
5931 }
5932
5933 // Degraded?
5934 ps->update_calc_stats();
5935 if (ps->info.stats.stats.sum.num_objects_degraded) {
5936 ps->state_set(PG_STATE_DEGRADED);
5937 } else {
5938 ps->state_clear(PG_STATE_DEGRADED);
5939 }
5940
5941 post_event(PeeringState::AllReplicasActivated());
5942 }
5943
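// Note that all_activated_and_committed() only posts AllReplicasActivated;
// the actual state flips (ACTIVE vs PEERED, PREMERGE handling, the
// readable-wait bookkeeping) happen in the AllReplicasActivated handler
// above once the event is dequeued.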
5944
5945 void PeeringState::Active::exit()
5946 {
5947 context< PeeringMachine >().log_exit(state_name, enter_time);
5948
5949
5950 DECLARE_LOCALS;
5951 pl->cancel_local_background_io_reservation();
5952
5953 ps->blocked_by.clear();
5954 ps->backfill_reserved = false;
5955 ps->state_clear(PG_STATE_ACTIVATING);
5956 ps->state_clear(PG_STATE_DEGRADED);
5957 ps->state_clear(PG_STATE_UNDERSIZED);
5958 ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5959 ps->state_clear(PG_STATE_BACKFILL_WAIT);
5960 ps->state_clear(PG_STATE_RECOVERY_WAIT);
5961 ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5962 utime_t dur = ceph_clock_now() - enter_time;
5963 pl->get_peering_perf().tinc(rs_active_latency, dur);
5964 pl->on_active_exit();
5965 }
5966
5967 /*------ReplicaActive-----*/
5968 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
5969 : my_base(ctx),
5970 NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
5971 {
5972 context< PeeringMachine >().log_enter(state_name);
5973
5974 DECLARE_LOCALS;
5975 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5976 }
5977
5978
5979 boost::statechart::result PeeringState::ReplicaActive::react(
5980 const Activate& actevt) {
5981 DECLARE_LOCALS;
5982 psdout(10) << "In ReplicaActive, about to call activate" << dendl;
5983 ps->activate(
5984 context< PeeringMachine >().get_cur_transaction(),
5985 actevt.activation_epoch,
5986 context< PeeringMachine >().get_recovery_ctx());
5987 psdout(10) << "Activate Finished" << dendl;
5988 return discard_event();
5989 }
5990
5991 boost::statechart::result PeeringState::ReplicaActive::react(
5992 const ActivateCommitted &evt)
5993 {
5994 DECLARE_LOCALS;
5995 psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
5996
5997 auto &rctx = context<PeeringMachine>().get_recovery_ctx();
5998 auto epoch = ps->get_osdmap_epoch();
5999 pg_info_t i = ps->info;
6000 i.history.last_epoch_started = evt.activation_epoch;
6001 i.history.last_interval_started = i.history.same_interval_since;
6002 rctx.send_info(
6003 ps->get_primary().osd,
6004 spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6005 epoch,
6006 epoch,
6007 i,
6008 {}, /* lease */
6009 ps->get_lease_ack());
6010
6011 if (ps->acting.size() >= ps->pool.info.min_size) {
6012 ps->state_set(PG_STATE_ACTIVE);
6013 } else {
6014 ps->state_set(PG_STATE_PEERED);
6015 }
6016 pl->on_activate_committed();
6017
6018 return discard_event();
6019 }
6020
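// On ActivateCommitted the replica reports back with a pg_info_t whose
// history claims last_epoch_started = activation_epoch; the primary's
// MInfoRec handler in Active counts these acks and fires
// all_activated_and_committed() once every member of
// acting_recovery_backfill has reported in.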
6021 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6022 {
6023 DECLARE_LOCALS;
6024 spg_t spgid = context< PeeringMachine >().spgid;
6025 epoch_t epoch = pl->get_osdmap_epoch();
6026
6027 ps->proc_lease(l.lease);
6028 pl->send_cluster_message(
6029 ps->get_primary().osd,
6030 new MOSDPGLeaseAck(epoch,
6031 spg_t(spgid.pgid, ps->get_primary().shard),
6032 ps->get_lease_ack()),
6033 epoch);
6034 return discard_event();
6035 }
6036
6037 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6038 {
6039 DECLARE_LOCALS;
6040 ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6041 infoevt.info);
6042 return discard_event();
6043 }
6044
6045 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6046 {
6047 DECLARE_LOCALS;
6048 psdout(10) << "received log from " << logevt.from << dendl;
6049 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6050 ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
6051 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6052 if (logevt.msg->lease) {
6053 ps->proc_lease(*logevt.msg->lease);
6054 }
6055
6056 return discard_event();
6057 }
6058
6059 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6060 {
6061 DECLARE_LOCALS;
6062 // primary is instructing us to trim
6063 ps->pg_log.trim(trim.trim_to, ps->info);
6064 ps->dirty_info = true;
6065 return discard_event();
6066 }
6067
6068 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6069 {
6070 DECLARE_LOCALS;
6071 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6072 ps->info.history.refresh_prior_readable_until_ub(
6073 pl->get_mnow(), ps->prior_readable_until_ub);
6074 context< PeeringMachine >().send_notify(
6075 ps->get_primary().osd,
6076 pg_notify_t(
6077 ps->get_primary().shard, ps->pg_whoami.shard,
6078 ps->get_osdmap_epoch(),
6079 ps->get_osdmap_epoch(),
6080 ps->info,
6081 ps->past_intervals));
6082 }
6083 return discard_event();
6084 }
6085
6086 boost::statechart::result PeeringState::ReplicaActive::react(
6087 const MQuery& query)
6088 {
6089 DECLARE_LOCALS;
6090 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6091 return discard_event();
6092 }
6093
6094 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6095 {
6096 q.f->open_object_section("state");
6097 q.f->dump_string("name", state_name);
6098 q.f->dump_stream("enter_time") << enter_time;
6099 q.f->close_section();
6100 return forward_event();
6101 }
6102
6103 void PeeringState::ReplicaActive::exit()
6104 {
6105 context< PeeringMachine >().log_exit(state_name, enter_time);
6106 DECLARE_LOCALS;
6107 pl->unreserve_recovery_space();
6108
6109 pl->cancel_remote_recovery_reservation();
6110 utime_t dur = ceph_clock_now() - enter_time;
6111 pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6112
6113 ps->min_last_complete_ondisk = eversion_t();
6114 }
6115
6116 /*-------Stray---*/
6117 PeeringState::Stray::Stray(my_context ctx)
6118 : my_base(ctx),
6119 NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6120 {
6121 context< PeeringMachine >().log_enter(state_name);
6122
6123
6124 DECLARE_LOCALS;
6125 ceph_assert(!ps->is_peered());
6126 ceph_assert(!ps->is_peering());
6127 ceph_assert(!ps->is_primary());
6128
6129 if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6130 ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6131 post_event(DeleteStart());
6132 } else {
6133 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6134 }
6135 }
6136
6137 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6138 {
6139 DECLARE_LOCALS;
6140 MOSDPGLog *msg = logevt.msg.get();
6141 psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6142
6143 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6144 if (msg->info.last_backfill == hobject_t()) {
6145 // restart backfill
6146 ps->info = msg->info;
6147 pl->on_info_history_change();
6148 ps->dirty_info = true;
6149 ps->dirty_big_info = true; // maybe.
6150
6151 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6152 ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6153
6154 ps->pg_log.reset_backfill();
6155 } else {
6156 ps->merge_log(t, msg->info, msg->log, logevt.from);
6157 }
6158 if (logevt.msg->lease) {
6159 ps->proc_lease(*logevt.msg->lease);
6160 }
6161
6162 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6163
6164 post_event(Activate(logevt.msg->info.last_epoch_started));
6165 return transit<ReplicaActive>();
6166 }
6167
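// Two paths through the MLogRec handler above: a last_backfill of
// hobject_t() marks a shard with no usable data, so it adopts the
// primary's info and claims the incoming log wholesale via
// reset_backfill_claim_log() before restarting backfill; otherwise the
// shard merges the log normally and any divergent entries are rewound.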
6168 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6169 {
6170 DECLARE_LOCALS;
6171 psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6172
6173 if (ps->info.last_update > infoevt.info.last_update) {
6174 // rewind divergent log entries
6175 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6176 ps->rewind_divergent_log(t, infoevt.info.last_update);
6177 ps->info.stats = infoevt.info.stats;
6178 ps->info.hit_set = infoevt.info.hit_set;
6179 }
6180
6181 if (infoevt.lease) {
6182 ps->proc_lease(*infoevt.lease);
6183 }
6184
6185 ceph_assert(infoevt.info.last_update == ps->info.last_update);
6186 ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6187
6188 post_event(Activate(infoevt.info.last_epoch_started));
6189 return transit<ReplicaActive>();
6190 }
6191
6192 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6193 {
6194 DECLARE_LOCALS;
6195 ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6196 return discard_event();
6197 }
6198
6199 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6200 {
6201 DECLARE_LOCALS;
6202 if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6203 ps->info.history.refresh_prior_readable_until_ub(
6204 pl->get_mnow(), ps->prior_readable_until_ub);
6205 context< PeeringMachine >().send_notify(
6206 ps->get_primary().osd,
6207 pg_notify_t(
6208 ps->get_primary().shard, ps->pg_whoami.shard,
6209 ps->get_osdmap_epoch(),
6210 ps->get_osdmap_epoch(),
6211 ps->info,
6212 ps->past_intervals));
6213 }
6214 return discard_event();
6215 }
6216
6217 void PeeringState::Stray::exit()
6218 {
6219 context< PeeringMachine >().log_exit(state_name, enter_time);
6220 DECLARE_LOCALS;
6221 utime_t dur = ceph_clock_now() - enter_time;
6222 pl->get_peering_perf().tinc(rs_stray_latency, dur);
6223 }
6224
6225
6226 /*--------ToDelete----------*/
6227 PeeringState::ToDelete::ToDelete(my_context ctx)
6228 : my_base(ctx),
6229 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6230 {
6231 context< PeeringMachine >().log_enter(state_name);
6232 DECLARE_LOCALS;
6233 pl->get_perf_logger().inc(l_osd_pg_removing);
6234 }
6235
6236 void PeeringState::ToDelete::exit()
6237 {
6238 context< PeeringMachine >().log_exit(state_name, enter_time);
6239 DECLARE_LOCALS;
6240 // note: on a successful removal, this path doesn't execute. see
6241 // _delete_some().
6242 pl->get_perf_logger().dec(l_osd_pg_removing);
6243
6244 pl->cancel_local_background_io_reservation();
6245 }
6246
6247 /*----WaitDeleteReserved----*/
6248 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6249 : my_base(ctx),
6250 NamedState(context< PeeringMachine >().state_history,
6251 "Started/ToDelete/WaitDeleteReseved")
6252 {
6253 context< PeeringMachine >().log_enter(state_name);
6254 DECLARE_LOCALS;
6255 context< ToDelete >().priority = ps->get_delete_priority();
6256
6257 pl->cancel_local_background_io_reservation();
6258 pl->request_local_background_io_reservation(
6259 context<ToDelete>().priority,
6260 std::make_shared<PGPeeringEvent>(
6261 ps->get_osdmap_epoch(),
6262 ps->get_osdmap_epoch(),
6263 DeleteReserved()),
6264 std::make_shared<PGPeeringEvent>(
6265 ps->get_osdmap_epoch(),
6266 ps->get_osdmap_epoch(),
6267 DeleteInterrupted()));
6268 }
6269
6270 boost::statechart::result PeeringState::ToDelete::react(
6271 const ActMap& evt)
6272 {
6273 DECLARE_LOCALS;
6274 if (ps->get_delete_priority() != priority) {
6275 psdout(10) << __func__ << " delete priority changed, resetting"
6276 << dendl;
6277 return transit<ToDelete>();
6278 }
6279 return discard_event();
6280 }
6281
6282 void PeeringState::WaitDeleteReserved::exit()
6283 {
6284 context< PeeringMachine >().log_exit(state_name, enter_time);
6285 }
6286
6287 /*----Deleting-----*/
6288 PeeringState::Deleting::Deleting(my_context ctx)
6289 : my_base(ctx),
6290 NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6291 {
6292 context< PeeringMachine >().log_enter(state_name);
6293 DECLARE_LOCALS;
6294 ps->deleting = true;
6295 ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6296
6297 // clear log
6298 PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6299 ps->pg_log.roll_forward(rollbacker.get());
6300
6301 // adjust info to backfill
6302 ps->info.set_last_backfill(hobject_t());
6303 ps->pg_log.reset_backfill();
6304 ps->dirty_info = true;
6305
6306 pl->on_removal(t);
6307 }
6308
6309 boost::statechart::result PeeringState::Deleting::react(
6310 const DeleteSome& evt)
6311 {
6312 DECLARE_LOCALS;
6313 pl->do_delete_work(context<PeeringMachine>().get_cur_transaction());
6314 return discard_event();
6315 }
6316
6317 void PeeringState::Deleting::exit()
6318 {
6319 context< PeeringMachine >().log_exit(state_name, enter_time);
6320 DECLARE_LOCALS;
6321 ps->deleting = false;
6322 pl->cancel_local_background_io_reservation();
6323 }
6324
6325 /*--------GetInfo---------*/
6326 PeeringState::GetInfo::GetInfo(my_context ctx)
6327 : my_base(ctx),
6328 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6329 {
6330 context< PeeringMachine >().log_enter(state_name);
6331
6332
6333 DECLARE_LOCALS;
6334 ps->check_past_interval_bounds();
6335 ps->log_weirdness();
6336 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6337
6338 ceph_assert(ps->blocked_by.empty());
6339
6340 prior_set = ps->build_prior();
6341 ps->prior_readable_down_osds = prior_set.down;
6342 if (ps->prior_readable_down_osds.empty()) {
6343 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6344 << dendl;
6345 ps->clear_prior_readable_until_ub();
6346 }
6347
6348 ps->reset_min_peer_features();
6349 get_infos();
6350 if (prior_set.pg_down) {
6351 post_event(IsDown());
6352 } else if (peer_info_requested.empty()) {
6353 post_event(GotInfo());
6354 }
6355 }
6356
6357 void PeeringState::GetInfo::get_infos()
6358 {
6359 DECLARE_LOCALS;
6360 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6361
6362 ps->blocked_by.clear();
6363 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
6364 it != prior_set.probe.end();
6365 ++it) {
6366 pg_shard_t peer = *it;
6367 if (peer == ps->pg_whoami) {
6368 continue;
6369 }
6370 if (ps->peer_info.count(peer)) {
6371 psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6372 continue;
6373 }
6374 if (peer_info_requested.count(peer)) {
6375 psdout(10) << " already requested info from osd." << peer << dendl;
6376 ps->blocked_by.insert(peer.osd);
6377 } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6378 psdout(10) << " not querying info from down osd." << peer << dendl;
6379 } else {
6380 psdout(10) << " querying info from osd." << peer << dendl;
6381 context< PeeringMachine >().send_query(
6382 peer.osd,
6383 pg_query_t(pg_query_t::INFO,
6384 it->shard, ps->pg_whoami.shard,
6385 ps->info.history,
6386 ps->get_osdmap_epoch()));
6387 peer_info_requested.insert(peer);
6388 ps->blocked_by.insert(peer.osd);
6389 }
6390 }
6391
6392 ps->check_prior_readable_down_osds(ps->get_osdmap());
6393
6394 pl->publish_stats_to_osd();
6395 }
6396
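// get_infos() is re-entrant: it runs again whenever a notify moves
// last_epoch_started forward and the prior set is rebuilt (see the
// MNotifyRec handler below).  blocked_by is recomputed from scratch on
// each pass so it always reflects only the peers we are still waiting on.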
6397 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6398 {
6399
6400 DECLARE_LOCALS;
6401
6402 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
6403 if (p != peer_info_requested.end()) {
6404 peer_info_requested.erase(p);
6405 ps->blocked_by.erase(infoevt.from.osd);
6406 }
6407
6408 epoch_t old_start = ps->info.history.last_epoch_started;
6409 if (ps->proc_replica_info(
6410 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6411 // we got something new ...
6412 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6413 if (old_start < ps->info.history.last_epoch_started) {
6414 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6415 prior_set = ps->build_prior();
6416 ps->prior_readable_down_osds = prior_set.down;
6417
6418 // filter out any osds that got dropped from the probe set from
6419 // peer_info_requested. this is less expensive than restarting
6420 // peering (which would re-probe everyone).
6421 set<pg_shard_t>::iterator p = peer_info_requested.begin();
6422 while (p != peer_info_requested.end()) {
6423 if (prior_set.probe.count(*p) == 0) {
6424 psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6425 peer_info_requested.erase(p++);
6426 } else {
6427 ++p;
6428 }
6429 }
6430 get_infos();
6431 }
6432 psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6433 << hex << infoevt.features << dec << dendl;
6434 ps->apply_peer_features(infoevt.features);
6435
6436 // are we done getting everything?
6437 if (peer_info_requested.empty() && !prior_set.pg_down) {
6438 psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6439 psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6440 psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6441 post_event(GotInfo());
6442 }
6443 }
6444 return discard_event();
6445 }
6446
6447 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6448 {
6449 DECLARE_LOCALS;
6450 q.f->open_object_section("state");
6451 q.f->dump_string("name", state_name);
6452 q.f->dump_stream("enter_time") << enter_time;
6453
6454 q.f->open_array_section("requested_info_from");
6455 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
6456 p != peer_info_requested.end();
6457 ++p) {
6458 q.f->open_object_section("osd");
6459 q.f->dump_stream("osd") << *p;
6460 if (ps->peer_info.count(*p)) {
6461 q.f->open_object_section("got_info");
6462 ps->peer_info[*p].dump(q.f);
6463 q.f->close_section();
6464 }
6465 q.f->close_section();
6466 }
6467 q.f->close_section();
6468
6469 q.f->close_section();
6470 return forward_event();
6471 }
6472
6473 void PeeringState::GetInfo::exit()
6474 {
6475 context< PeeringMachine >().log_exit(state_name, enter_time);
6476
6477 DECLARE_LOCALS;
6478 utime_t dur = ceph_clock_now() - enter_time;
6479 pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6480 ps->blocked_by.clear();
6481 }
6482
6483 /*------GetLog------------*/
6484 PeeringState::GetLog::GetLog(my_context ctx)
6485 : my_base(ctx),
6486 NamedState(
6487 context< PeeringMachine >().state_history,
6488 "Started/Primary/Peering/GetLog"),
6489 msg(0)
6490 {
6491 context< PeeringMachine >().log_enter(state_name);
6492
6493 DECLARE_LOCALS;
6494
6495 ps->log_weirdness();
6496
6497 // adjust acting?
6498 if (!ps->choose_acting(auth_log_shard, false,
6499 &context< Peering >().history_les_bound)) {
6500 if (!ps->want_acting.empty()) {
6501 post_event(NeedActingChange());
6502 } else {
6503 post_event(IsIncomplete());
6504 }
6505 return;
6506 }
6507
6508 // am i the best?
6509 if (auth_log_shard == ps->pg_whoami) {
6510 post_event(GotLog());
6511 return;
6512 }
6513
6514 const pg_info_t& best = ps->peer_info[auth_log_shard];
6515
6516 // am i broken?
6517 if (ps->info.last_update < best.log_tail) {
6518 psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6519 post_event(IsIncomplete());
6520 return;
6521 }
6522
6523 // how much log to request?
6524 eversion_t request_log_from = ps->info.last_update;
6525 ceph_assert(!ps->acting_recovery_backfill.empty());
6526 for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
6527 p != ps->acting_recovery_backfill.end();
6528 ++p) {
6529 if (*p == ps->pg_whoami) continue;
6530 pg_info_t& ri = ps->peer_info[*p];
6531 if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6532 ri.last_update < request_log_from)
6533 request_log_from = ri.last_update;
6534 }
6535
6536 // how much?
6537 psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6538 context<PeeringMachine>().send_query(
6539 auth_log_shard.osd,
6540 pg_query_t(
6541 pg_query_t::LOG,
6542 auth_log_shard.shard, ps->pg_whoami.shard,
6543 request_log_from, ps->info.history,
6544 ps->get_osdmap_epoch()));
6545
6546 ceph_assert(ps->blocked_by.empty());
6547 ps->blocked_by.insert(auth_log_shard.osd);
6548 pl->publish_stats_to_osd();
6549 }
6550
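// The loop in the constructor above widens the request: request_log_from
// starts at our own last_update and is pulled back to the oldest peer
// last_update that still falls inside the authoritative log
// (>= best.log_tail), so the single LOG query sent at the end returns
// enough history to reconcile every usable peer in
// acting_recovery_backfill.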
6551 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6552 {
6553 // make sure our log source didn't go down. we need to check
6554 // explicitly because it may not be part of the prior set, which
6555 // means the Peering state check won't catch it going down.
6556 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6557 psdout(10) << "GetLog: auth_log_shard osd."
6558 << auth_log_shard.osd << " went down" << dendl;
6559 post_event(advmap);
6560 return transit< Reset >();
6561 }
6562
6563 // let the Peering state do its checks.
6564 return forward_event();
6565 }
6566
6567 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6568 {
6569 ceph_assert(!msg);
6570 if (logevt.from != auth_log_shard) {
6571 psdout(10) << "GetLog: discarding log from "
6572 << "non-auth_log_shard osd." << logevt.from << dendl;
6573 return discard_event();
6574 }
6575 psdout(10) << "GetLog: received master log from osd."
6576 << logevt.from << dendl;
6577 msg = logevt.msg;
6578 post_event(GotLog());
6579 return discard_event();
6580 }
6581
6582 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
6583 {
6584
6585 DECLARE_LOCALS;
6586 psdout(10) << "leaving GetLog" << dendl;
6587 if (msg) {
6588 psdout(10) << "processing master log" << dendl;
6589 ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
6590 msg->info, msg->log, msg->missing,
6591 auth_log_shard);
6592 }
6593 ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6594 return transit< GetMissing >();
6595 }
6596
6597 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
6598 {
6599 q.f->open_object_section("state");
6600 q.f->dump_string("name", state_name);
6601 q.f->dump_stream("enter_time") << enter_time;
6602 q.f->dump_stream("auth_log_shard") << auth_log_shard;
6603 q.f->close_section();
6604 return forward_event();
6605 }
6606
6607 void PeeringState::GetLog::exit()
6608 {
6609 context< PeeringMachine >().log_exit(state_name, enter_time);
6610
6611 DECLARE_LOCALS;
6612 utime_t dur = ceph_clock_now() - enter_time;
6613 pl->get_peering_perf().tinc(rs_getlog_latency, dur);
6614 ps->blocked_by.clear();
6615 }
6616
6617 /*------WaitActingChange--------*/
6618 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
6619 : my_base(ctx),
6620 NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
6621 {
6622 context< PeeringMachine >().log_enter(state_name);
6623 }
6624
6625 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
6626 {
6627 DECLARE_LOCALS;
6628 OSDMapRef osdmap = advmap.osdmap;
6629
6630   psdout(10) << "verifying want_acting " << ps->want_acting << " targets are still up" << dendl;
6631 for (vector<int>::iterator p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
6632 if (!osdmap->is_up(*p)) {
6633 psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
6634 post_event(advmap);
6635 return transit< Reset >();
6636 }
6637 }
6638 return forward_event();
6639 }
6640
6641 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
6642 {
6643   psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
6644 return discard_event();
6645 }
6646
6647 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
6648 {
6649 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
6650 return discard_event();
6651 }
6652
6653 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
6654 {
6655 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
6656 return discard_event();
6657 }
6658
6659 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
6660 {
6661 q.f->open_object_section("state");
6662 q.f->dump_string("name", state_name);
6663 q.f->dump_stream("enter_time") << enter_time;
6664 q.f->dump_string("comment", "waiting for pg acting set to change");
6665 q.f->close_section();
6666 return forward_event();
6667 }
6668
6669 void PeeringState::WaitActingChange::exit()
6670 {
6671 context< PeeringMachine >().log_exit(state_name, enter_time);
6672 DECLARE_LOCALS;
6673 utime_t dur = ceph_clock_now() - enter_time;
6674 pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
6675 }
6676
6677 /*------Down--------*/
6678 PeeringState::Down::Down(my_context ctx)
6679 : my_base(ctx),
6680 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
6681 {
6682 context< PeeringMachine >().log_enter(state_name);
6683 DECLARE_LOCALS;
6684
6685 ps->state_clear(PG_STATE_PEERING);
6686 ps->state_set(PG_STATE_DOWN);
6687
6688 auto &prior_set = context< Peering >().prior_set;
6689 ceph_assert(ps->blocked_by.empty());
6690 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6691 pl->publish_stats_to_osd();
6692 }
6693
6694 void PeeringState::Down::exit()
6695 {
6696 context< PeeringMachine >().log_exit(state_name, enter_time);
6697
6698 DECLARE_LOCALS;
6699
6700 ps->state_clear(PG_STATE_DOWN);
6701 utime_t dur = ceph_clock_now() - enter_time;
6702 pl->get_peering_perf().tinc(rs_down_latency, dur);
6703
6704 ps->blocked_by.clear();
6705 }
6706
6707 boost::statechart::result PeeringState::Down::react(const QueryState& q)
6708 {
6709 q.f->open_object_section("state");
6710 q.f->dump_string("name", state_name);
6711 q.f->dump_stream("enter_time") << enter_time;
6712 q.f->dump_string("comment",
6713 "not enough up instances of this PG to go active");
6714 q.f->close_section();
6715 return forward_event();
6716 }
6717
6718 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
6719 {
6720 DECLARE_LOCALS;
6721
6722 ceph_assert(ps->is_primary());
6723 epoch_t old_start = ps->info.history.last_epoch_started;
6724 if (!ps->peer_info.count(infoevt.from) &&
6725 ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
6726 ps->update_history(infoevt.notify.info.history);
6727 }
6728 // if we got something new to make pg escape down state
6729 if (ps->info.history.last_epoch_started > old_start) {
6730 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
6731 ps->state_clear(PG_STATE_DOWN);
6732 ps->state_set(PG_STATE_PEERING);
6733 return transit< GetInfo >();
6734 }
6735
6736 return discard_event();
6737 }
6738
6739
6740 /*------Incomplete--------*/
6741 PeeringState::Incomplete::Incomplete(my_context ctx)
6742 : my_base(ctx),
6743 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
6744 {
6745 context< PeeringMachine >().log_enter(state_name);
6746 DECLARE_LOCALS;
6747
6748 ps->state_clear(PG_STATE_PEERING);
6749 ps->state_set(PG_STATE_INCOMPLETE);
6750
6751 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6752 ceph_assert(ps->blocked_by.empty());
6753 ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
6754 pl->publish_stats_to_osd();
6755 }
6756
6757 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
6758 DECLARE_LOCALS;
6759 int64_t poolnum = ps->info.pgid.pool();
6760
6761   // Reset if min_size has become smaller than its previous value; the pg might now be able to go active
6762 if (!advmap.osdmap->have_pg_pool(poolnum) ||
6763 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
6764 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
6765 post_event(advmap);
6766 return transit< Reset >();
6767 }
6768
6769 return forward_event();
6770 }
6771
6772 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
6773 DECLARE_LOCALS;
6774 psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
6775 if (ps->proc_replica_info(
6776 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
6777 // We got something new, try again!
6778 return transit< GetLog >();
6779 } else {
6780 return discard_event();
6781 }
6782 }
6783
6784 boost::statechart::result PeeringState::Incomplete::react(
6785 const QueryState& q)
6786 {
6787 q.f->open_object_section("state");
6788 q.f->dump_string("name", state_name);
6789 q.f->dump_stream("enter_time") << enter_time;
6790 q.f->dump_string("comment", "not enough complete instances of this PG");
6791 q.f->close_section();
6792 return forward_event();
6793 }
6794
6795 void PeeringState::Incomplete::exit()
6796 {
6797 context< PeeringMachine >().log_exit(state_name, enter_time);
6798
6799 DECLARE_LOCALS;
6800
6801 ps->state_clear(PG_STATE_INCOMPLETE);
6802 utime_t dur = ceph_clock_now() - enter_time;
6803 pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
6804
6805 ps->blocked_by.clear();
6806 }
6807
6808 /*------GetMissing--------*/
6809 PeeringState::GetMissing::GetMissing(my_context ctx)
6810 : my_base(ctx),
6811 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
6812 {
6813 context< PeeringMachine >().log_enter(state_name);
6814
6815 DECLARE_LOCALS;
6816 ps->log_weirdness();
6817 ceph_assert(!ps->acting_recovery_backfill.empty());
6818 eversion_t since;
6819 for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
6820 i != ps->acting_recovery_backfill.end();
6821 ++i) {
6822 if (*i == ps->get_primary()) continue;
6823 const pg_info_t& pi = ps->peer_info[*i];
6824     // reset this to make sure the pg_missing_t is initialized and
6825 // has the correct semantics even if we don't need to get a
6826 // missing set from a shard. This way later additions due to
6827 // lost+unfound delete work properly.
6828 ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
6829
6830 if (pi.is_empty())
6831 continue; // no pg data, nothing divergent
6832
6833 if (pi.last_update < ps->pg_log.get_tail()) {
6834 psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
6835 ps->peer_missing[*i].clear();
6836 continue;
6837 }
6838 if (pi.last_backfill == hobject_t()) {
6839 psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
6840 ps->peer_missing[*i].clear();
6841 continue;
6842 }
6843
6844 if (pi.last_update == pi.last_complete && // peer has no missing
6845 pi.last_update == ps->info.last_update) { // peer is up to date
6846 // replica has no missing and an identical log to ours; no need
6847 // to pull anything.
6848 // FIXME: we can do better here. if last_update==last_complete we
6849 // can infer the rest!
6850 psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
6851 ps->peer_missing[*i].clear();
6852 continue;
6853 }
6854
6855 // We pull the log from the peer's last_epoch_started to ensure we
6856 // get enough log to detect divergent updates.
6857 since.epoch = pi.last_epoch_started;
6858 ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
6859 if (pi.log_tail <= since) {
6860 psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
6861 context< PeeringMachine >().send_query(
6862 i->osd,
6863 pg_query_t(
6864 pg_query_t::LOG,
6865 i->shard, ps->pg_whoami.shard,
6866 since, ps->info.history,
6867 ps->get_osdmap_epoch()));
6868 } else {
6869 psdout(10) << " requesting fulllog+missing from osd." << *i
6870 << " (want since " << since << " < log.tail "
6871 << pi.log_tail << ")" << dendl;
6872 context< PeeringMachine >().send_query(
6873 i->osd, pg_query_t(
6874 pg_query_t::FULLLOG,
6875 i->shard, ps->pg_whoami.shard,
6876 ps->info.history, ps->get_osdmap_epoch()));
6877 }
6878 peer_missing_requested.insert(*i);
6879 ps->blocked_by.insert(i->osd);
6880 }
6881
6882 if (peer_missing_requested.empty()) {
6883 if (ps->need_up_thru) {
6884 psdout(10) << " still need up_thru update before going active"
6885 << dendl;
6886 post_event(NeedUpThru());
6887 return;
6888 }
6889
6890 // all good!
6891 post_event(Activate(ps->get_osdmap_epoch()));
6892 } else {
6893 pl->publish_stats_to_osd();
6894 }
6895 }
6896
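// A peer has answered one of the LOG/FULLLOG queries sent above: fold
// its log and missing set into our state, and once the last
// outstanding reply is in, either wait for up_thru or post Activate.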
6897 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
6898 {
6899 DECLARE_LOCALS;
6900
6901 peer_missing_requested.erase(logevt.from);
6902 ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
6903
6904 if (peer_missing_requested.empty()) {
6905 if (ps->need_up_thru) {
6906 psdout(10) << " still need up_thru update before going active"
6907 << dendl;
6908 post_event(NeedUpThru());
6909 } else {
6910 psdout(10) << "Got last missing, don't need missing "
6911 << "posting Activate" << dendl;
6912 post_event(Activate(ps->get_osdmap_epoch()));
6913 }
6914 }
6915 return discard_event();
6916 }
6917
6918 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
6919 {
6920 DECLARE_LOCALS;
6921 q.f->open_object_section("state");
6922 q.f->dump_string("name", state_name);
6923 q.f->dump_stream("enter_time") << enter_time;
6924
6925 q.f->open_array_section("peer_missing_requested");
6926 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
6927 p != peer_missing_requested.end();
6928 ++p) {
6929 q.f->open_object_section("osd");
6930 q.f->dump_stream("osd") << *p;
6931 if (ps->peer_missing.count(*p)) {
6932 q.f->open_object_section("got_missing");
6933 ps->peer_missing[*p].dump(q.f);
6934 q.f->close_section();
6935 }
6936 q.f->close_section();
6937 }
6938 q.f->close_section();
6939
6940 q.f->close_section();
6941 return forward_event();
6942 }
6943
6944 void PeeringState::GetMissing::exit()
6945 {
6946 context< PeeringMachine >().log_exit(state_name, enter_time);
6947
6948 DECLARE_LOCALS;
6949 utime_t dur = ceph_clock_now() - enter_time;
6950 pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
6951 ps->blocked_by.clear();
6952 }
6953
6954 /*------WaitUpThru--------*/
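// WaitUpThru parks an otherwise-ready PG until the osdmap records a
// new up_thru for this OSD (see the QueryState comment below). Every
// ActMap rechecks need_up_thru and posts Activate once the map has
// caught up; log messages arriving in the meantime are still noted.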
6955 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
6956 : my_base(ctx),
6957 NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
6958 {
6959 context< PeeringMachine >().log_enter(state_name);
6960 }
6961
6962 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
6963 {
6964 DECLARE_LOCALS;
6965 if (!ps->need_up_thru) {
6966 post_event(Activate(ps->get_osdmap_epoch()));
6967 }
6968 return forward_event();
6969 }
6970
6971 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
6972 {
6973 DECLARE_LOCALS;
6974 psdout(10) << "Noting missing from osd." << logevt.from << dendl;
6975 ps->peer_missing[logevt.from].claim(logevt.msg->missing);
6976 ps->peer_info[logevt.from] = logevt.msg->info;
6977 return discard_event();
6978 }
6979
6980 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
6981 {
6982 q.f->open_object_section("state");
6983 q.f->dump_string("name", state_name);
6984 q.f->dump_stream("enter_time") << enter_time;
6985 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
6986 q.f->close_section();
6987 return forward_event();
6988 }
6989
6990 void PeeringState::WaitUpThru::exit()
6991 {
6992 context< PeeringMachine >().log_exit(state_name, enter_time);
6993 DECLARE_LOCALS;
6994 utime_t dur = ceph_clock_now() - enter_time;
6995 pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
6996 }
6997
6998 /*----PeeringState::PeeringMachine Methods-----*/
6999 #undef dout_prefix
7000 #define dout_prefix dpp->gen_prefix(*_dout)
7001
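// log_enter/log_exit bracket every state transition: each emits a
// level-5 debug line, and log_exit additionally reports the state's
// residency time and event counts to the listener before resetting
// the per-state counters.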
7002 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7003 {
7004 DECLARE_LOCALS;
7005 psdout(5) << "enter " << state_name << dendl;
7006 pl->log_state_enter(state_name);
7007 }
7008
7009 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7010 {
7011 DECLARE_LOCALS;
7012 utime_t dur = ceph_clock_now() - enter_time;
7013 psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7014 pl->log_state_exit(state_name, enter_time, event_count, event_time);
7015 event_count = 0;
7016 event_time = utime_t();
7017 }
7018
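// One-line PeeringState summary used in OSD debug output. Role, last
// peering reset, crt, mlcod and the state string always appear;
// everything else (acting set, EC primary, async/backfill targets,
// DELETING, past intervals, luod/lua, log-mismatch notes, lcod,
// NOTIFY, pruub) is printed only when it differs from the default or
// from the info already shown.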
7019 ostream &operator<<(ostream &out, const PeeringState &ps) {
7020 out << "pg[" << ps.info
7021 << " " << pg_vector_string(ps.up);
7022 if (ps.acting != ps.up)
7023 out << "/" << pg_vector_string(ps.acting);
7024 if (ps.is_ec_pg())
7025 out << "p" << ps.get_primary();
7026 if (!ps.async_recovery_targets.empty())
7027 out << " async=[" << ps.async_recovery_targets << "]";
7028 if (!ps.backfill_targets.empty())
7029 out << " backfill=[" << ps.backfill_targets << "]";
7030 out << " r=" << ps.get_role();
7031 out << " lpr=" << ps.get_last_peering_reset();
7032
7033 if (ps.deleting)
7034 out << " DELETING";
7035
7036 if (!ps.past_intervals.empty()) {
7037 out << " pi=[" << ps.past_intervals.get_bounds()
7038 << ")/" << ps.past_intervals.size();
7039 }
7040
7041 if (ps.is_peered()) {
7042 if (ps.last_update_ondisk != ps.info.last_update)
7043 out << " luod=" << ps.last_update_ondisk;
7044 if (ps.last_update_applied != ps.info.last_update)
7045 out << " lua=" << ps.last_update_applied;
7046 }
7047
7048 if (ps.pg_log.get_tail() != ps.info.log_tail ||
7049 ps.pg_log.get_head() != ps.info.last_update)
7050 out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7051
7052 if (!ps.pg_log.get_log().empty()) {
7053 if (ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail()) {
7054 out << " (log bound mismatch, actual=["
7055 << ps.pg_log.get_log().log.begin()->version << ","
7056 << ps.pg_log.get_log().log.rbegin()->version << "]";
7057 out << ")";
7058 }
7059 }
7060
7061 out << " crt=" << ps.pg_log.get_can_rollback_to();
7062
7063 if (ps.last_complete_ondisk != ps.info.last_complete)
7064 out << " lcod " << ps.last_complete_ondisk;
7065
7066 out << " mlcod " << ps.min_last_complete_ondisk;
7067
7068 out << " " << pg_state_string(ps.get_state());
7069 if (ps.should_send_notify())
7070 out << " NOTIFY";
7071
7072 if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7073 out << " pruub " << ps.prior_readable_until_ub
7074 << "@" << ps.get_prior_readable_down_osds();
7075 }
7076 return out;
7077 }