// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "PGPeeringEvent.h"
#include "common/ceph_releases.h"
#include "common/dout.h"
#include "PeeringState.h"

#include "messages/MOSDPGRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLease.h"
#include "messages/MOSDPGLeaseAck.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd
BufferedRecoveryMessages::BufferedRecoveryMessages(
  ceph_release_t r,
  PeeringCtx &ctx)
  : require_osd_release(r) {
  // steal messages from ctx
  message_map.swap(ctx.message_map);
}
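// Note: the send_* helpers below choose a wire format based on the
// cluster's minimum OSD release: octopus and later peers get the v2
// message types (MOSDPGNotify2/MOSDPGQuery2/MOSDPGInfo2), while older
// peers get the legacy per-epoch message containers.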
void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    spg_t pgid(n.info.pgid.pgid, n.to);
    send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  } else {
    send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  }
}
void BufferedRecoveryMessages::send_query(
  int to,
  spg_t to_spgid,
  const pg_query_t &q)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGQuery2>(to_spgid, q));
  } else {
    auto m = make_message<MOSDPGQuery>(
      q.epoch_sent,
      MOSDPGQuery::pg_list_t{{to_spgid, q}});
    send_osd_message(to, m);
  }
}
void BufferedRecoveryMessages::send_info(
  int to,
  spg_t to_spgid,
  epoch_t min_epoch,
  epoch_t cur_epoch,
  const pg_info_t &info,
  std::optional<pg_lease_t> lease,
  std::optional<pg_lease_ack_t> lease_ack)
{
  if (require_osd_release >= ceph_release_t::octopus) {
    send_osd_message(
      to,
      make_message<MOSDPGInfo2>(
        to_spgid,
        info,
        cur_epoch,
        min_epoch,
        lease,
        lease_ack));
  } else {
    send_osd_message(
      to,
      make_message<MOSDPGInfo>(
        cur_epoch,
        vector{pg_notify_t{to_spgid.shard,
                           info.pgid.shard,
                           min_epoch, cur_epoch,
                           info, PastIntervals{}}}));
  }
}
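// PGPool::update refreshes the cached pool metadata from a newly published
// OSDMap; it returns early when the pool no longer exists, and only
// re-reads the snap context when the epoch sequence or snap epoch
// indicates it may have changed.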
void PGPool::update(CephContext *cct, OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  if (!pi) {
    return; // pool has been deleted
  }
  info = *pi;
  name = map->get_pool_name(id);

  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
  }

  assert(map->require_osd_release >= ceph_release_t::mimic);
  if (info.is_pool_snaps_mode() && updated) {
    snapc = pi->get_snap_context();
  }
  cached_epoch = map->get_epoch();
}
/*-------------Peering State Helpers----------------*/

#define dout_prefix (dpp->gen_prefix(*_dout))
#define psdout(x) ldout(cct, x)
PeeringState::PeeringState(
  CephContext *cct,
  pg_shard_t pg_whoami,
  spg_t spgid,
  const PGPool &_pool,
  OSDMapRef curmap,
  DoutPrefixProvider *dpp,
  PeeringListener *pl)
  : state_history(*pl),
    cct(cct),
    spgid(spgid),
    dpp(dpp),
    pl(pl),
    orig_ctx(0),
    osdmap_ref(curmap),
    pool(_pool),
    pg_whoami(pg_whoami),
    missing_loc(spgid, this, dpp, cct),
    machine(this, cct, spgid, dpp, pl, &state_history)
{
  machine.initiate();
}
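// start_handle/end_handle bracket the delivery of one peering event.
// While outgoing messages are blocked (begin_block_outgoing), they
// accumulate in messages_pending_flush rather than in the caller's
// PeeringCtx, and are handed back via accept_buffered_messages once
// end_block_outgoing runs.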
void PeeringState::start_handle(PeeringCtx *new_ctx) {
  ceph_assert(!rctx);
  ceph_assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx.emplace(*messages_pending_flush, *new_ctx);
    } else {
      rctx.emplace(*new_ctx);
    }
    rctx->start_time = ceph_clock_now();
  }
}
void PeeringState::begin_block_outgoing() {
  ceph_assert(!messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages(
    orig_ctx->require_osd_release);
  rctx.emplace(*messages_pending_flush, *orig_ctx);
}
void PeeringState::clear_blocked_outgoing() {
  ceph_assert(orig_ctx);
  ceph_assert(rctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}
void PeeringState::end_block_outgoing() {
  ceph_assert(messages_pending_flush);
  ceph_assert(orig_ctx);
  ceph_assert(rctx);

  orig_ctx->accept_buffered_messages(*messages_pending_flush);
  rctx.emplace(*orig_ctx);
  messages_pending_flush = std::optional<BufferedRecoveryMessages>();
}
void PeeringState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = std::nullopt;
  orig_ctx = NULL;
}
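// Drop recovery sources, and pending log/missing requests, that point at
// OSDs the new map reports as down.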
void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
{
  /*
   * check that any peers we are planning to (or currently) pulling
   * objects from are dealt with.
   */
  missing_loc.check_recovery_sources(osdmap);
  pl->check_recovery_sources(osdmap);

  for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
       i != peer_log_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_log_requested removing " << *i << dendl;
      peer_log_requested.erase(i++);
    } else {
      ++i;
    }
  }

  for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
       i != peer_missing_requested.end();
       ) {
    if (!osdmap->is_up(i->osd)) {
      psdout(10) << "peer_missing_requested removing " << *i << dendl;
      peer_missing_requested.erase(i++);
    } else {
      ++i;
    }
  }
}
void PeeringState::update_history(const pg_history_t& new_history)
{
  auto mnow = pl->get_mnow();
  info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
  if (info.history.merge(new_history)) {
    psdout(20) << __func__ << " advanced history from " << new_history << dendl;
    dirty_info = true;
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      psdout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
      dirty_big_info = true;
    }
    prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
    if (prior_readable_until_ub != ceph::signedspan::zero()) {
      dout(20) << __func__
               << " prior_readable_until_ub " << prior_readable_until_ub
               << " (mnow " << mnow << " + "
               << info.history.prior_readable_until_ub << ")" << dendl;
    }
  }
  pl->on_info_history_change();
}
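// purge_strays tells replicas that are no longer in up/acting to remove
// their copy of the PG, and clears the local bookkeeping about them.  It
// is skipped while a merge is pending and under the
// osd_debug_no_purge_strays debug option.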
void PeeringState::purge_strays()
{
  if (is_premerge()) {
    psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
               << dendl;
    return;
  }
  if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
    return;
  }
  psdout(10) << "purge_strays " << stray_set << dendl;

  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
       ++p) {
    ceph_assert(!is_acting_recovery_backfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      psdout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
        get_osdmap_epoch(),
        to_remove);
      pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
    } else {
      psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
    }
    peer_missing.erase(*p);
    peer_info.erase(*p);
    missing_loc.remove_stray_recovery_sources(*p);
    peer_purged.insert(*p);
    removed = true;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  stray_set.clear();

  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
}
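// Record a replica's pg_info_t received during peering.  Duplicates and
// infos from OSDs that have not stayed up since sending are discarded;
// otherwise the info is stored, history is merged, and the sender may be
// classified as a stray.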
bool PeeringState::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    psdout(10) << " got dup osd." << from << " info "
               << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    psdout(10) << " got info " << oinfo << " from down osd." << from
               << " discarding" << dendl;
    return false;
  }

  psdout(10) << " got osd." << from << " " << oinfo << dendl;
  ceph_assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}
void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_purged.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else {
      ++p;
    }
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();

  check_recovery_sources(osdmap);
}
void PeeringState::update_heartbeat_peers()
{
  if (!is_primary())
    return;

  set<int> new_peers;
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  }
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  }
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    new_peers.insert(p->first.osd);
  }
  pl->update_heartbeat_peers(std::move(new_peers));
}
void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
{
  pl->prepare_write(
    info,
    last_written_info,
    past_intervals,
    dirty_big_info,
    dirty_info,
    last_persisted_osdmap < get_osdmap_epoch(),
    t);
  if (dirty_info || dirty_big_info) {
    last_persisted_osdmap = get_osdmap_epoch();
    last_written_info = info;
    dirty_info = false;
    dirty_big_info = false;
  }
}
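// advance_map/activate_map drive the PG across consecutive OSDMap epochs:
// advance_map feeds an AdvMap event into the state machine for each new
// map, and activate_map follows with an ActMap event once the PG has
// caught up, persisting dirty info if too many epochs have elapsed.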
void PeeringState::advance_map(
  OSDMapRef osdmap, OSDMapRef lastmap,
  vector<int>& newup, int up_primary,
  vector<int>& newacting, int acting_primary,
  PeeringCtx &rctx)
{
  ceph_assert(lastmap == osdmap_ref);
  psdout(10) << "handle_advance_map "
             << newup << "/" << newacting
             << " -- " << up_primary << "/" << acting_primary
             << dendl;

  update_osdmap_ref(osdmap);
  pool.update(cct, osdmap);

  AdvMap evt(
    osdmap, lastmap, newup, up_primary,
    newacting, acting_primary);
  handle_event(evt, &rctx);
  if (pool.info.last_change == osdmap_ref->get_epoch()) {
    pl->on_pool_change();
  }
  readable_interval = pool.get_readable_interval();
  last_require_osd_release = osdmap->require_osd_release;
}
void PeeringState::activate_map(PeeringCtx &rctx)
{
  psdout(10) << __func__ << dendl;
  ActMap evt;
  handle_event(evt, &rctx);
  if (osdmap_ref->get_epoch() - last_persisted_osdmap >
      cct->_conf->osd_pg_epoch_persisted_max_stale) {
    psdout(20) << __func__ << ": Dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
    dirty_info = true;
  } else {
    psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
               << last_persisted_osdmap
               << " while current is " << osdmap_ref->get_epoch() << dendl;
  }
  write_if_dirty(rctx.transaction);

  if (get_osdmap()->check_new_blacklist_entries()) {
    pl->check_blacklisted_watchers();
  }
}
void PeeringState::set_last_peering_reset()
{
  psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
  if (last_peering_reset != get_osdmap_epoch()) {
    last_peering_reset = get_osdmap_epoch();
    psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
    clear_blocked_outgoing();
    if (!pl->try_flush_or_schedule_async()) {
      psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
      begin_block_outgoing();
    } else {
      psdout(10) << "Not blocking outgoing recovery messages" << dendl;
    }
  }
}
void PeeringState::complete_flush()
{
  flushes_in_progress--;
  if (flushes_in_progress == 0) {
    pl->on_flushed();
  }
}
void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
{
  const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
  if (!pi) {
    return; // pool deleted
  }
  bool changed = false;
  if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
    const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
    if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
      psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
      changed = true;
    }
  }
  if (changed) {
    info.history.last_epoch_marked_full = osdmap->get_epoch();
    dirty_info = true;
  }
}
bool PeeringState::should_restart_peering(
  int newupprimary,
  int newactingprimary,
  const vector<int>& newup,
  const vector<int>& newacting,
  OSDMapRef lastmap,
  OSDMapRef osdmap)
{
  if (PastIntervals::is_new_interval(
        primary.osd,
        newactingprimary,
        acting,
        newacting,
        up_primary.osd,
        newupprimary,
        up,
        newup,
        osdmap.get(),
        lastmap.get(),
        info.pgid.pgid)) {
    psdout(20) << "new interval newup " << newup
               << " newacting " << newacting << dendl;
    return true;
  }
  if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
    psdout(10) << __func__ << " osd transitioned from down -> up"
               << dendl;
    return true;
  }
  return false;
}
/* Called before initializing peering during advance_map */
void PeeringState::start_peering_interval(
  const OSDMapRef lastmap,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  ObjectStore::Transaction &t)
{
  const OSDMapRef osdmap = get_osdmap();

  set_last_peering_reset();

  vector<int> oldacting, oldup;
  int oldrole = get_role();

  if (is_primary())
    pl->clear_ready_to_merge();

  pg_shard_t old_acting_primary = get_primary();
  pg_shard_t old_up_primary = up_primary;
  bool was_old_primary = is_primary();
  bool was_old_nonprimary = is_nonprimary();

  acting.swap(oldacting);
  up.swap(oldup);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  if (info.stats.up != up ||
      info.stats.acting != acting ||
      info.stats.up_primary != new_up_primary ||
      info.stats.acting_primary != new_acting_primary) {
    info.stats.up = up;
    info.stats.up_primary = new_up_primary;
    info.stats.acting = acting;
    info.stats.acting_primary = new_acting_primary;
    info.stats.mapping_epoch = osdmap->get_epoch();
  }

  pl->clear_publish_stats();

  // This will now be remapped during a backfill in cases
  // that it would not have been before.
  if (up != acting)
    state_set(PG_STATE_REMAPPED);
  else
    state_clear(PG_STATE_REMAPPED);

  int role = osdmap->calc_pg_role(pg_whoami, acting);
  set_role(role);

  // did acting, up, primary|acker change?
  if (!lastmap) {
    psdout(10) << " no lastmap" << dendl;
    dirty_info = true;
    dirty_big_info = true;
    info.history.same_interval_since = osdmap->get_epoch();
  } else {
    std::stringstream debug;
    ceph_assert(info.history.same_interval_since != 0);
    bool new_interval = PastIntervals::check_new_interval(
      old_acting_primary.osd,
      new_acting_primary,
      oldacting, newacting,
      old_up_primary.osd,
      new_up_primary,
      oldup, newup,
      info.history.same_interval_since,
      info.history.last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      info.pgid.pgid,
      missing_loc.get_recoverable_predicate(),
      &past_intervals,
      &debug);
    psdout(10) << __func__ << ": check_new_interval output: "
               << debug.str() << dendl;
    if (new_interval) {
      if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
          info.history.last_epoch_clean < osdmap->get_epoch()) {
        psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
        // our information is incomplete and useless; someone else was clean
        // after everything we know if osdmaps were trimmed.
        past_intervals.clear();
      } else {
        psdout(10) << " noting past " << past_intervals << dendl;
      }
      dirty_info = true;
      dirty_big_info = true;
      info.history.same_interval_since = osdmap->get_epoch();
      if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
          info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
                                  osdmap->get_pg_num(info.pgid.pgid.pool()),
                                  nullptr)) {
        info.history.last_epoch_split = osdmap->get_epoch();
      }
    }
  }

  if (old_up_primary != up_primary ||
      oldup != up) {
    info.history.same_up_since = osdmap->get_epoch();
  }
  // this comparison includes primary rank via pg_shard_t
  if (old_acting_primary != get_primary()) {
    info.history.same_primary_since = osdmap->get_epoch();
  }

  on_new_interval();
  pl->on_info_history_change();

  psdout(1) << __func__ << " up " << oldup << " -> " << up
            << ", acting " << oldacting << " -> " << acting
            << ", acting_primary " << old_acting_primary << " -> "
            << new_acting_primary
            << ", up_primary " << old_up_primary << " -> " << new_up_primary
            << ", role " << oldrole << " -> " << role
            << ", features acting " << acting_features
            << " upacting " << upacting_features
            << dendl;

  // deactivate.
  state_clear(PG_STATE_ACTIVE);
  state_clear(PG_STATE_PEERED);
  state_clear(PG_STATE_PREMERGE);
  state_clear(PG_STATE_DOWN);
  state_clear(PG_STATE_RECOVERY_WAIT);
  state_clear(PG_STATE_RECOVERY_TOOFULL);
  state_clear(PG_STATE_RECOVERING);

  peer_purged.clear();
  acting_recovery_backfill.clear();

  // reset primary/replica state?
  if (was_old_primary || is_primary()) {
    pl->clear_want_pg_temp();
  } else if (was_old_nonprimary || is_nonprimary()) {
    pl->clear_want_pg_temp();
  }
  clear_primary_state();

  pl->on_change(t);

  ceph_assert(!deleting);

  // should we tell the primary we are here?
  send_notify = !is_primary();

  if (role != oldrole ||
      was_old_primary != is_primary()) {
    // did primary change?
    if (was_old_primary != is_primary()) {
      state_clear(PG_STATE_CLEAN);
    }

    pl->on_role_change();
  } else {
    // no role change.
    // did primary change?
    if (get_primary() != old_acting_primary) {
      psdout(10) << oldacting << " -> " << acting
                 << ", acting primary "
                 << old_acting_primary << " -> " << get_primary()
                 << dendl;
    } else {
      // primary is the same.
      if (is_primary()) {
        // i am (still) primary. but my replica set changed.
        state_clear(PG_STATE_CLEAN);

        psdout(10) << oldacting << " -> " << acting
                   << ", replicas changed" << dendl;
      }
    }
  }

  if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
    psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
    pl->queue_want_pg_temp(acting);
  }
}
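// on_new_interval resets per-interval state: it recomputes the feature
// masks of the new up/acting sets, rebuilds the missing set's deletes
// flag if needed, and re-derives the lease bounds
// (prior_readable_until_ub) for the interval that is starting.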
void PeeringState::on_new_interval()
{
  dout(20) << __func__ << dendl;
  const OSDMapRef osdmap = get_osdmap();

  // initialize features
  acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    uint64_t f = osdmap->get_xinfo(*p).features;
    acting_features &= f;
    upacting_features &= f;
  }
  for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
    if (*p == CRUSH_ITEM_NONE)
      continue;
    upacting_features &= osdmap->get_xinfo(*p).features;
  }
  psdout(20) << __func__ << " upacting_features 0x" << std::hex
             << upacting_features << std::dec
             << " from " << acting << "+" << up << dendl;

  psdout(20) << __func__ << " checking missing set deletes flag. missing = "
             << get_pg_log().get_missing() << dendl;

  if (!pg_log.get_missing().may_include_deletes &&
      !perform_deletes_during_peering()) {
    pl->rebuild_missing_set_with_deletes(pg_log);
  }
  ceph_assert(
    pg_log.get_missing().may_include_deletes ==
    !perform_deletes_during_peering());

  init_hb_stamps();

  // update lease bounds for a new interval
  auto mnow = pl->get_mnow();
  prior_readable_until_ub = std::max(prior_readable_until_ub,
                                     readable_until_ub);
  prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
    mnow, prior_readable_until_ub);
  psdout(10) << __func__ << " prior_readable_until_ub "
             << prior_readable_until_ub << " (mnow " << mnow << " + "
             << info.history.prior_readable_until_ub << ")" << dendl;
  prior_readable_down_osds.clear(); // we populate this when we build the priorset

  readable_until =
    readable_until_ub =
    readable_until_ub_sent =
    readable_until_ub_from_primary = ceph::signedspan::zero();

  acting_readable_until_ub.clear();
  if (is_primary()) {
    acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
  }

  pl->on_new_interval();
}
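// Rebuild up/acting, upset/actingset and the two primary shards from the
// raw OSD id vectors.  For erasure-coded pools the shard id is the
// position in the vector; for replicated pools it is NO_SHARD and the
// primaries come directly from the supplied ids.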
void PeeringState::init_primary_up_acting(
  const vector<int> &newup,
  const vector<int> &newacting,
  int new_up_primary,
  int new_acting_primary)
{
  actingset.clear();
  acting = newacting;
  for (uint8_t i = 0; i < acting.size(); ++i) {
    if (acting[i] != CRUSH_ITEM_NONE)
      actingset.insert(
        pg_shard_t(
          acting[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  upset.clear();
  up = newup;
  for (uint8_t i = 0; i < up.size(); ++i) {
    if (up[i] != CRUSH_ITEM_NONE)
      upset.insert(
        pg_shard_t(
          up[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
  }
  if (!pool.info.is_erasure()) {
    // replicated
    up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
    primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
  } else {
    // erasure
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
}
void PeeringState::init_hb_stamps()
{
  if (is_primary()) {
    // we care about all other osds in the acting set
    hb_stamps.resize(acting.size() - 1);
    unsigned i = 0;
    for (auto p : acting) {
      if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
        continue;
      }
      hb_stamps[i++] = pl->get_hb_stamps(p);
    }
    hb_stamps.resize(i);
  } else if (is_nonprimary()) {
    // we care about just the primary
    hb_stamps.resize(1);
    hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
  } else {
    hb_stamps.clear();
  }
  dout(10) << __func__ << " now " << hb_stamps << dendl;
}
void PeeringState::clear_recovery_state()
{
  async_recovery_targets.clear();
  backfill_targets.clear();
}
void PeeringState::clear_primary_state()
{
  psdout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_missing.clear();
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  need_up_thru = false;

  pg_log.reset_recovery_pointers();

  clear_recovery_state();

  last_update_ondisk = eversion_t();
  missing_loc.clear();
  pl->clear_primary_state();
}
/// return [start,end) bounds for required past_intervals
static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
  const pg_info_t &info,
  epoch_t oldest_map) {
  epoch_t start = std::max(
    info.history.last_epoch_clean ? info.history.last_epoch_clean :
    info.history.epoch_pool_created,
    oldest_map);
  epoch_t end = std::max(
    info.history.same_interval_since,
    info.history.epoch_pool_created);
  return make_pair(start, end);
}
void PeeringState::check_past_interval_bounds() const
{
  auto oldest_epoch = pl->oldest_stored_osdmap();
  auto rpib = get_required_past_interval_bounds(
    info,
    oldest_epoch);
  if (rpib.first >= rpib.second) {
    // do not warn if the start bound is dictated by oldest_map; the
    // past intervals are presumably appropriate given the pg info.
    if (!past_intervals.empty() &&
        rpib.first > oldest_epoch) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " empty [" << rpib << ") but past_intervals is not: "
                           << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      pl->get_clog_error() << info.pgid << " required past_interval bounds are"
                           << " not empty [" << rpib << ") but past_intervals "
                           << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      ceph_assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      pl->get_clog_error() << info.pgid << " past_intervals [" << apib
                           << ") start interval does not contain the required"
                           << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      ceph_abort_msg("past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      pl->get_clog_error() << info.pgid << " past_interval bound [" << apib
                           << ") end does not match required [" << rpib
                           << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      ceph_abort_msg("past_interval end mismatch");
    }
  }
}
int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
{
  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");

  ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);

  // User can't set this too high anymore, but might be a legacy value
  if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
    pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
  if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
    pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
  // Shift range from min to max to 0 to max - min
  pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
  ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));

  priority += pool_recovery_priority;

  // Clamp to valid range
  if (priority > max) {
    return max;
  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
    return OSD_RECOVERY_PRIORITY_MIN;
  } else {
    return priority;
  }
}
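// Worked example (the constant values here are illustrative, not quoted
// from the headers): with priority = 200, pool_recovery_priority = 3 and
// OSD_POOL_PRIORITY_MIN = -10, the pool value shifts to 13, yielding 213,
// which is then clamped into [OSD_RECOVERY_PRIORITY_MIN, max].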
unsigned PeeringState::get_recovery_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_RECOVERY_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_RECOVERY) {
    ret = OSD_RECOVERY_PRIORITY_FORCED;
  } else {
    // XXX: This priority boost isn't so much about inactive, but about data-at-risk
    if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
      base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
    }

    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }
  psdout(20) << __func__ << " recovery priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}
unsigned PeeringState::get_backfill_priority()
{
  // a higher value -> a higher priority
  int ret = OSD_BACKFILL_PRIORITY_BASE;
  int base = ret;

  if (state & PG_STATE_FORCED_BACKFILL) {
    ret = OSD_BACKFILL_PRIORITY_FORCED;
  } else {
    if (acting.size() < pool.info.min_size) {
      base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
      // inactive: no. of replicas < min_size, highest priority since it blocks IO
      ret = base + (pool.info.min_size - acting.size());

    } else if (is_undersized()) {
      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
      ceph_assert(pool.info.size > actingset.size());
      base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
      ret = base + (pool.info.size - actingset.size());

    } else if (is_degraded()) {
      // degraded: baseline degraded
      base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
    }

    // Adjust with pool's recovery priority
    int64_t pool_recovery_priority = 0;
    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);

    ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
  }

  psdout(20) << __func__ << " backfill priority is " << ret << dendl;
  return static_cast<unsigned>(ret);
}
unsigned PeeringState::get_delete_priority()
{
  auto state = get_osdmap()->get_state(pg_whoami.osd);
  if (state & (CEPH_OSD_BACKFILLFULL |
               CEPH_OSD_FULL)) {
    return OSD_DELETE_PRIORITY_FULL;
  } else if (state & CEPH_OSD_NEARFULL) {
    return OSD_DELETE_PRIORITY_FULLISH;
  } else {
    return OSD_DELETE_PRIORITY_NORMAL;
  }
}
bool PeeringState::set_force_recovery(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_RECOVERY) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_RECOVERING))) {
      psdout(20) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_RECOVERY);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_RECOVERY) {
    psdout(20) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_recovery_priority());
  }
  return did;
}
bool PeeringState::set_force_backfill(bool b)
{
  bool did = false;
  if (b) {
    if (!(state & PG_STATE_FORCED_BACKFILL) &&
        (state & (PG_STATE_DEGRADED |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILLING))) {
      psdout(10) << __func__ << " set" << dendl;
      state_set(PG_STATE_FORCED_BACKFILL);
      pl->publish_stats_to_osd();
      did = true;
    }
  } else if (state & PG_STATE_FORCED_BACKFILL) {
    psdout(10) << __func__ << " clear" << dendl;
    state_clear(PG_STATE_FORCED_BACKFILL);
    pl->publish_stats_to_osd();
    did = true;
  }
  if (did) {
    psdout(20) << __func__ << " state " << get_current_state()
               << dendl;
    pl->update_local_background_io_priority(get_backfill_priority());
  }
  return did;
}
void PeeringState::schedule_renew_lease()
{
  pl->schedule_renew_lease(
    last_peering_reset,
    readable_interval / 2);
}
void PeeringState::send_lease()
{
  epoch_t epoch = pl->get_osdmap_epoch();
  for (auto peer : actingset) {
    if (peer == pg_whoami) {
      continue;
    }
    pl->send_cluster_message(
      peer.osd,
      new MOSDPGLease(epoch,
                      spg_t(spgid.pgid, peer.shard),
                      get_lease()),
      epoch);
  }
}
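// proc_lease applies a lease from the primary on a replica.  The
// primary's clock values are mapped into the local clock domain using the
// heartbeat-derived peer clock delta bounds: subtracting the delta upper
// bound gives a conservative readable_until, subtracting the lower bound
// gives readable_until_ub.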
void PeeringState::proc_lease(const pg_lease_t& l)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
               << upacting_features << std::dec
               << " does not include SERVER_OCTOPUS" << dendl;
    return;
  }
  if (!is_nonprimary()) {
    psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
    return;
  }
  psdout(10) << __func__ << " " << l << dendl;
  if (l.readable_until_ub > readable_until_ub_from_primary) {
    readable_until_ub_from_primary = l.readable_until_ub;
  }

  ceph::signedspan ru = ceph::signedspan::zero();
  if (l.readable_until != ceph::signedspan::zero() &&
      hb_stamps[0]->peer_clock_delta_ub) {
    ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
    psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
               << " -> ru " << ru << dendl;
  }
  if (ru > readable_until) {
    readable_until = ru;
    psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
    // NOTE: if we ever decide to block/queue ops on the replica,
    // we'll need to wake them up here.
  }

  ceph::signedspan ruub;
  if (hb_stamps[0]->peer_clock_delta_lb) {
    ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
    psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
               << " -> ruub " << ruub << dendl;
  } else {
    ruub = pl->get_mnow() + l.interval;
    psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
  }
  if (ruub > readable_until_ub) {
    readable_until_ub = ruub;
    psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
               << dendl;
  }
}
void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  auto now = pl->get_mnow();
  bool was_min = false;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (from == acting[i]) {
      // the lease_ack value is based on the primary's clock
      if (a.readable_until_ub > acting_readable_until_ub[i]) {
        if (acting_readable_until_ub[i] == readable_until) {
          was_min = true;
        }
        acting_readable_until_ub[i] = a.readable_until_ub;
        break;
      }
    }
  }
  if (was_min) {
    auto old_ru = readable_until;
    recalc_readable_until();
    if (now < old_ru) {
      pl->recheck_readable();
    }
  }
}
void PeeringState::proc_renew_lease()
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return;
  }
  renew_lease(pl->get_mnow());
  send_lease();
  schedule_renew_lease();
}
void PeeringState::recalc_readable_until()
{
  assert(is_primary());
  ceph::signedspan min = readable_until_ub_sent;
  for (unsigned i = 0; i < acting.size(); ++i) {
    if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
      continue;
    }
    dout(20) << __func__ << " peer osd." << acting[i]
             << " ruub " << acting_readable_until_ub[i] << dendl;
    if (acting_readable_until_ub[i] < min) {
      min = acting_readable_until_ub[i];
    }
  }
  readable_until = min;
  readable_until_ub = min;
  dout(20) << __func__ << " readable_until[_ub] " << readable_until
           << " (sent " << readable_until_ub_sent << ")" << dendl;
}
bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
{
  if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
    return false;
  }
  bool changed = false;
  auto p = prior_readable_down_osds.begin();
  while (p != prior_readable_down_osds.end()) {
    if (map->is_dead(*p)) {
      dout(10) << __func__ << " prior_readable_down_osds osd." << *p
               << " is dead as of epoch " << map->get_epoch()
               << dendl;
      p = prior_readable_down_osds.erase(p);
      changed = true;
    } else {
      ++p;
    }
  }
  if (changed && prior_readable_down_osds.empty()) {
    psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
    clear_prior_readable_until_ub();
    return true;
  }
  return false;
}
bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    psdout(10) << "adjust_need_up_thru now "
               << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}
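// build_prior computes the set of OSDs to probe for authoritative PG
// info from past_intervals, classifying each interval member as UP, DNE,
// LOST or DOWN, and decides whether the monitor must record our up_thru
// before peering can proceed.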
PastIntervals::PriorSet PeeringState::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      ceph_assert(info.history.last_epoch_started >=
                  it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.is_erasure(),
    info.history.last_epoch_started,
    &missing_loc.get_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    dpp);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(pg_whoami.osd) <
      info.history.same_interval_since) {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " < same_since " << info.history.same_interval_since
               << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
               << " >= same_since " << info.history.same_interval_since
               << ", all is well" << dendl;
    need_up_thru = false;
  }

  pl->set_probe_targets(prior.probe);
  return prior;
}
bool PeeringState::needs_recovery() const
{
  ceph_assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    psdout(10) << __func__ << " primary has " << missing.num_missing()
               << " missing" << dendl;
    return true;
  }

  ceph_assert(!acting_recovery_backfill.empty());
  set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
  set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
                 << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      psdout(10) << __func__ << " osd." << peer << " has "
                 << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " is recovered" << dendl;
  return false;
}
bool PeeringState::needs_backfill() const
{
  ceph_assert(is_primary());

  // We can assume that only possible osds that need backfill
  // are on the backfill_targets vector nodes.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      psdout(10) << __func__ << " osd." << peer
                 << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  psdout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}
/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
bool PeeringState::all_unfound_are_queried_or_lost(
  const OSDMapRef osdmap) const
{
  ceph_assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
             << might_have_unfound
             << " have been queried or are marked lost" << dendl;
  return true;
}
void PeeringState::reject_reservation()
{
  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REJECT_TOOFULL,
      spg_t(info.pgid.pgid, primary.shard),
      get_osdmap_epoch()),
    get_osdmap_epoch());
}
/**
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
                 << " because it is complete while best has missing"
                 << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
                 << " because it has missing while best is complete"
                 << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}
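// For EC pools the acting set is positional: slot i can only be served by
// an OSD holding shard i.  Each slot prefers the up OSD, then the acting
// OSD, then (unless restricted) any stray whose log is contiguous with
// the authoritative log; an up OSD that cannot serve its slot becomes a
// backfill target.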
void PeeringState::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << std::endl;
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
    } else if (!restrict_to_up_acting) {
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        ceph_assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}
/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PeeringState::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  uint64_t force_auth_primary_missing_objects,
  unsigned size,
  const vector<int> &acting,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  const OSDMapRef osdmap,
  ostream &ss)
{
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  ss << __func__ << " newest update on osd." << auth_log_shard_id
     << " with " << auth_log_shard->second
     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;

  // select primary
  auto primary = all_info.find(up_primary);
  if (up.size() &&
      !primary->second.is_incomplete() &&
      primary->second.last_update >=
        auth_log_shard->second.log_tail) {
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        primary->second.stats.stats.sum.num_objects_missing;
      auto auth_version = auth_log_shard->second.last_update.version;
      auto primary_version = primary->second.last_update.version;
      if (auth_version > primary_version) {
        approx_missing_objects += auth_version - primary_version;
      } else {
        approx_missing_objects += primary_version - auth_version;
      }
      if ((uint64_t)approx_missing_objects >
          force_auth_primary_missing_objects) {
        primary = auth_log_shard;
        ss << "up_primary: " << up_primary << ") has approximate "
           << approx_missing_objects
           << "(>" << force_auth_primary_missing_objects <<") "
           << "missing objects, osd." << auth_log_shard_id
           << " selected as primary instead"
           << std::endl;
      } else {
        ss << "up_primary: " << up_primary << ") selected as primary"
           << std::endl;
      }
    } else {
      ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
    }
  } else {
    ceph_assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << __func__ << " primary is osd." << primary->first
     << " with " << primary->second << std::endl;
  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);

  /* We include auth_log_shard->second.log_tail because in GetLog,
   * we will request logs back to the min last_update over our
   * acting_backfill set, which will result in our log being extended
   * as far backwards as necessary to pick up any peers which can
   * be log recovered by auth_log_shard's log */
  eversion_t oldest_auth_log_entry =
    std::min(primary->second.log_tail, auth_log_shard->second.log_tail);

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (auto i : up) {
    pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(i);
      acting_backfill->insert(up_cand);
      ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
    }
  }

  if (want->size() >= size) {
    return;
  }

  std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
  candidate_by_last_update.reserve(acting.size());
  // This no longer has backfill OSDs, but they are covered above.
  for (auto i : acting) {
    pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < oldest_auth_log_entry) {
      ss << " shard " << acting_cand << " (acting) REJECTED "
         << cur_info << std::endl;
    } else {
      candidate_by_last_update.emplace_back(cur_info.last_update, i);
    }
  }

  auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
                            const std::pair<eversion_t, int> &rhs) {
    return lhs.first > rhs.first;
  };
  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);
  for (auto &p : candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (acting) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  candidate_by_last_update.clear();
  candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
  // continue to search stray to find more suitable peers
  for (auto &i : all_info) {
    // skip up osds we already considered above
    if (i.first == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
    if (up_it != up.end())
      continue;
    vector<int>::const_iterator acting_it = find(
      acting.begin(), acting.end(), i.first.osd);
    if (acting_it != acting.end())
      continue;

    if (i.second.is_incomplete() ||
        i.second.last_update < oldest_auth_log_entry) {
      ss << " shard " << i.first << " (stray) REJECTED " << i.second
         << std::endl;
    } else {
      candidate_by_last_update.emplace_back(
        i.second.last_update, i.first.osd);
    }
  }

  if (candidate_by_last_update.empty()) {
    // save us some effort
    return;
  }

  // sort by last_update, in descending order.
  std::sort(candidate_by_last_update.begin(),
            candidate_by_last_update.end(), sort_by_eversion);

  for (auto &p : candidate_by_last_update) {
    ceph_assert(want->size() < size);
    want->push_back(p.second);
    pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
    acting_backfill->insert(s);
    ss << " shard " << s << " (stray) accepted "
       << all_info.find(s)->second << std::endl;
    if (want->size() >= size) {
      return;
    }
  }
}
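// A proposed want set is usable only if the recoverable predicate is
// satisfied by the shards present.  Below min_size, EC pools additionally
// require octopus-capable OSDs, and all pools require the
// osd_allow_recovery_below_min_size option.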
bool PeeringState::recoverable(const vector<int> &want) const
{
  unsigned num_want_acting = 0;
  set<pg_shard_t> have;
  for (int i = 0; i < (int)want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      ++num_want_acting;
      have.insert(
        pg_shard_t(
          want[i],
          pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
  }

  if (num_want_acting < pool.info.min_size) {
    const bool recovery_ec_pool_below_min_size =
      HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);

    if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
      psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
      return false;
    } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
      psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
      return false;
    }
  }
  if (missing_loc.get_recoverable_predicate()(have)) {
    return true;
  } else {
    psdout(10) << __func__ << " failed, not recoverable " << dendl;
    return false;
  }
}
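// The choose_async_recovery_* helpers pull shards whose estimated
// recovery cost (approximate missing-object count, or log version delta
// on pre-nautilus maps) exceeds osd_async_recovery_min_cost out of the
// acting set, as long as the remainder stays recoverable.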
void PeeringState::choose_async_recovery_ec(
  const map<pg_shard_t, pg_info_t> &all_info,
  const pg_info_t &auth_info,
  vector<int> *want,
  set<pg_shard_t> *async_recovery,
  const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t> > candidates_by_cost;
  for (uint8_t i = 0; i < want->size(); ++i) {
    if ((*want)[i] == CRUSH_ITEM_NONE)
      continue;

    // Considering log entries to recover is accurate enough for
    // now. We could use minimum_to_decode_with_cost() later if
    // necessary.
    pg_shard_t shard_i((*want)[i], shard_id_t(i));
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // for ec pools we rollback all entries past the authoritative
    // last_update *before* activation. This is relatively inexpensive
    // compared to recovery, since it is purely local, so treat shards
    // past the authoritative last_update the same as those equal to it.
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        shard_info.stats.stats.sum.num_objects_missing;
      if (auth_version > candidate_version) {
        approx_missing_objects += auth_version - candidate_version;
      }
      if (static_cast<uint64_t>(approx_missing_objects) >
          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.emplace(approx_missing_objects, shard_i);
      }
    } else {
      if (auth_version > candidate_version &&
          (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
      }
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;

  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit) {
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
    if (recoverable(candidate_want)) {
      want->swap(candidate_want);
      async_recovery->insert(cur_shard);
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}
void PeeringState::choose_async_recovery_replicated(
  const map<pg_shard_t, pg_info_t> &all_info,
  const pg_info_t &auth_info,
  vector<int> *want,
  set<pg_shard_t> *async_recovery,
  const OSDMapRef osdmap) const
{
  set<pair<int, pg_shard_t> > candidates_by_cost;
  for (auto osd_num : *want) {
    pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
    // do not include strays
    if (stray_set.find(shard_i) != stray_set.end())
      continue;
    // Do not include an osd that is not up, since choosing it as
    // an async_recovery_target will move it out of the acting set.
    // This results in it being identified as a stray during peering,
    // because it is no longer in the up or acting set.
    if (!is_up(shard_i))
      continue;
    auto shard_info = all_info.find(shard_i)->second;
    // use the approximate magnitude of the difference in length of
    // logs plus historical missing objects as the cost of recovery
    version_t auth_version = auth_info.last_update.version;
    version_t candidate_version = shard_info.last_update.version;
    if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
      auto approx_missing_objects =
        shard_info.stats.stats.sum.num_objects_missing;
      if (auth_version > candidate_version) {
        approx_missing_objects += auth_version - candidate_version;
      } else {
        approx_missing_objects += candidate_version - auth_version;
      }
      if (static_cast<uint64_t>(approx_missing_objects) >
          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.emplace(approx_missing_objects, shard_i);
      }
    } else {
      size_t approx_entries;
      if (auth_version > candidate_version) {
        approx_entries = auth_version - candidate_version;
      } else {
        approx_entries = candidate_version - auth_version;
      }
      if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
        candidates_by_cost.insert(make_pair(approx_entries, shard_i));
      }
    }
  }

  psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
             << dendl;
  // take out as many osds as we can for async recovery, in order of cost
  for (auto rit = candidates_by_cost.rbegin();
       rit != candidates_by_cost.rend(); ++rit) {
    if (want->size() <= pool.info.min_size) {
      break;
    }
    pg_shard_t cur_shard = rit->second;
    vector<int> candidate_want(*want);
    for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
      if (*it == cur_shard.osd) {
        candidate_want.erase(it);
        want->swap(candidate_want);
        async_recovery->insert(cur_shard);
        break;
      }
    }
  }
  psdout(20) << __func__ << " result want=" << *want
             << " async_recovery=" << *async_recovery << dendl;
}
/*
 * calculate the desired acting, and request a change with the monitor
 * if it differs from the current acting.
 *
 * if restrict_to_up_acting=true, we filter out anything that's not in
 * up/acting.  in order to lift this restriction, we need to
 *  1) check whether it's worth switching the acting set any time we get
 *     a new pg info (not just here, when recovery finishes)
 *  2) check whether anything in want_acting went down on each new map
 *     (and, if so, calculate a new want_acting)
 *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
 */
bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
                                 bool restrict_to_up_acting,
                                 bool *history_les_bound,
                                 bool request_pg_temp_change_only)
{
  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
  all_info[pg_whoami] = info;

  if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
    for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
         p != all_info.end();
         ++p) {
      psdout(10) << __func__ << " all_info osd." << p->first << " "
                 << p->second << dendl;
    }
  }

  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
    find_best_info(all_info, restrict_to_up_acting, history_les_bound);

  if (auth_log_shard == all_info.end()) {
    if (up != acting) {
      psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
                 << " reverting to up" << dendl;
      want_acting = up;
      vector<int> empty;
      pl->queue_want_pg_temp(empty);
    } else {
      psdout(10) << __func__ << " failed" << dendl;
      ceph_assert(want_acting.empty());
    }
    return false;
  }

  ceph_assert(!auth_log_shard->second.is_incomplete());
  auth_log_shard_id = auth_log_shard->first;

  set<pg_shard_t> want_backfill, want_acting_backfill;
  vector<int> want;
  stringstream ss;
  if (pool.info.is_replicated())
    calc_replicated_acting(
      auth_log_shard,
      cct->_conf.get_val<uint64_t>(
        "osd_force_auth_primary_missing_objects"),
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      up,
      up_primary,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      get_osdmap(),
      ss);
  else
    calc_ec_acting(
      auth_log_shard,
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      up,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      ss);
  psdout(10) << ss.str() << dendl;

  if (!recoverable(want)) {
    want_acting.clear();
    return false;
  }

  set<pg_shard_t> want_async_recovery;
  if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
    if (pool.info.is_erasure()) {
      choose_async_recovery_ec(
        all_info, auth_log_shard->second, &want, &want_async_recovery,
        get_osdmap());
    } else {
      choose_async_recovery_replicated(
        all_info, auth_log_shard->second, &want, &want_async_recovery,
        get_osdmap());
    }
  }
  while (want.size() > pool.info.size) {
    // async recovery should have taken out as many osds as it can.
    // if not, then always evict the last peer
    // (will get synchronously recovered later)
    psdout(10) << __func__ << " evicting osd." << want.back()
               << " from oversized want " << want << dendl;
    want.pop_back();
  }
  if (want != acting) {
    psdout(10) << __func__ << " want " << want << " != acting " << acting
               << ", requesting pg_temp change" << dendl;
    want_acting = want;

    if (!cct->_conf->osd_debug_no_acting_change) {
      if (want_acting == up) {
        // There can't be any pending backfill if
        // want is the same as crush map up OSDs.
        ceph_assert(want_backfill.empty());
        vector<int> empty;
        pl->queue_want_pg_temp(empty);
      } else
        pl->queue_want_pg_temp(want);
    }
    return false;
  }

  if (request_pg_temp_change_only)
    return false;
  want_acting.clear();
  acting_recovery_backfill = want_acting_backfill;
  psdout(10) << "acting_recovery_backfill is "
             << acting_recovery_backfill << dendl;
  ceph_assert(
    backfill_targets.empty() ||
    backfill_targets == want_backfill);
  if (backfill_targets.empty()) {
    // Caller is GetInfo
    backfill_targets = want_backfill;
  }
  // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
  ceph_assert(
    async_recovery_targets.empty() ||
    async_recovery_targets == want_async_recovery ||
    !needs_recovery());
  if (async_recovery_targets.empty() || !needs_recovery()) {
    async_recovery_targets = want_async_recovery;
  }
  // Will not change if already set because up would have had to change
  // Verify that nothing in backfill is in stray_set
  for (set<pg_shard_t>::iterator i = want_backfill.begin();
       i != want_backfill.end();
       ++i) {
    ceph_assert(stray_set.find(*i) == stray_set.end());
  }
  psdout(10) << "choose_acting want=" << want << " backfill_targets="
             << want_backfill << " async_recovery_targets="
             << async_recovery_targets << dendl;
  return true;
}
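// Illustrative flow (hypothetical OSD ids): if the computed want is [3,1,2]
// while acting is [0,1,2], choose_acting() records want_acting, queues a
// pg_temp request (or an empty pg_temp when want matches the raw up set) and
// returns false; peering retries once the map reflects the change.  Only when
// want == acting are acting_recovery_backfill, backfill_targets and
// async_recovery_targets committed and true returned.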
void PeeringState::log_weirdness()
{
  if (pg_log.get_tail() != info.log_tail)
    pl->get_clog_error() << info.pgid
                         << " info mismatch, log.tail " << pg_log.get_tail()
                         << " != info.log_tail " << info.log_tail;
  if (pg_log.get_head() != info.last_update)
    pl->get_clog_error() << info.pgid
                         << " info mismatch, log.head " << pg_log.get_head()
                         << " != info.last_update " << info.last_update;

  if (!pg_log.get_log().empty()) {
    // sloppy check
    if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
      pl->get_clog_error() << info.pgid
                           << " log bound mismatch, info (tail,head] ("
                           << pg_log.get_tail() << ","
                           << pg_log.get_head() << "]"
                           << " actual ["
                           << pg_log.get_log().log.begin()->version << ","
                           << pg_log.get_log().log.rbegin()->version << "]";
  }

  if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
    pl->get_clog_error() << info.pgid
                         << " caller_ops.size "
                         << pg_log.get_log().caller_ops.size()
                         << " > log size " << pg_log.get_log().log.size();
  }
}
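// Reminder on the invariant being checked (hypothetical versions): the log
// covers (tail, head], so with tail 120'40 and head 120'55 the oldest
// in-memory entry must be strictly newer than 120'40; an entry at or before
// the tail, or more caller_ops than log entries, is reported to the cluster
// log above.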
/*
 * Process information from a replica to determine if it could have any
 * objects that i need.
 *
 * TODO: if the missing set becomes very large, this could get expensive.
 * Instead, we probably want to just iterate over our unfound set.
 */
bool PeeringState::search_for_missing(
  const pg_info_t &oinfo, const pg_missing_t &omissing,
  pg_shard_t from,
  PeeringCtxWrapper &ctx)
{
  uint64_t num_unfound_before = missing_loc.num_unfound();
  bool found_missing = missing_loc.add_source_info(
    from, oinfo, omissing, ctx.handle);
  if (found_missing && num_unfound_before != missing_loc.num_unfound())
    pl->publish_stats_to_osd();
  // avoid doing this if the peer is empty.  This is a bit of paranoia
  // to avoid doing something rash if add_source_info() above
  // incorrectly decided we found something new.  (if the peer has
  // last_update=0'0 that's impossible.)
  if (found_missing &&
      oinfo.last_update != eversion_t()) {
    pg_info_t tinfo(oinfo);
    tinfo.pgid.shard = pg_whoami.shard;
    ctx.send_info(
      from.osd,
      spg_t(info.pgid.pgid, from.shard),
      get_osdmap_epoch(),  // fixme: use lower epoch?
      get_osdmap_epoch(),
      tinfo);
  }
  return found_missing;
}
bool PeeringState::discover_all_missing(
  BufferedRecoveryMessages &rctx)
{
  auto &missing = pg_log.get_missing();
  uint64_t unfound = get_num_unfound();
  bool any = false;  // did we start any queries

  psdout(10) << __func__ << " "
             << missing.num_missing() << " missing, "
             << unfound << " unfound"
             << dendl;

  std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
  std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; m != mend; ++m) {
    pg_shard_t peer(*m);

    if (!get_osdmap()->is_up(peer.osd)) {
      psdout(20) << __func__ << " skipping down osd." << peer << dendl;
      continue;
    }

    if (peer_purged.count(peer)) {
      psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
      continue;
    }

    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne())) {
      // ignore empty peers
      continue;
    }

    // If we've requested any of this stuff, the pg_missing_t information
    // should be on its way.
    // TODO: coalesce requested_* into a single data structure
    if (peer_missing.find(peer) != peer_missing.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": we already have pg_missing_t" << dendl;
      continue;
    }
    if (peer_log_requested.find(peer) != peer_log_requested.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": in peer_log_requested" << dendl;
      continue;
    }
    if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
      psdout(20) << __func__ << ": osd." << peer
                 << ": in peer_missing_requested" << dendl;
      continue;
    }

    // Request missing
    psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
               << dendl;
    peer_missing_requested.insert(peer);
    rctx.send_query(
      peer.osd,
      spg_t(info.pgid.pgid, peer.shard),
      pg_query_t(
        pg_query_t::FULLLOG,
        peer.shard, pg_whoami.shard,
        info.history, get_osdmap_epoch()));
    any = true;
  }
  return any;
}
/* Build the might_have_unfound set.
 *
 * This is used by the primary OSD during recovery.
 *
 * This set tracks the OSDs which might have unfound objects that the primary
 * OSD needs.  As we receive pg_missing_t from each OSD in might_have_unfound,
 * we will remove the OSD from the set.
 */
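/* Example (hypothetical OSD ids): if past intervals were served by acting
 * sets {0,1,2} and {0,3,2}, and a stray osd.5 has also sent us its info, the
 * resulting set is {1,2,3,5} minus ourselves: every peer that could plausibly
 * hold a copy of an object we cannot find locally.
 */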
void PeeringState::build_might_have_unfound()
{
  ceph_assert(might_have_unfound.empty());
  ceph_assert(is_primary());

  psdout(10) << __func__ << dendl;

  check_past_interval_bounds();

  might_have_unfound = past_intervals.get_might_have_unfound(
    pg_whoami,
    pool.info.is_erasure());

  // include any (stray) peers
  for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p)
    might_have_unfound.insert(p->first);

  psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
}
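// The activate() path below handles each acting/backfill peer in one of three
// ways: a peer already at our last_update gets a fresh info (or an empty log
// message), a peer whose log no longer overlaps ours is reset and marked for
// backfill, and everyone else is caught up by appending the log segment past
// its own last_update via MOSDPGLog.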
void PeeringState::activate(
  ObjectStore::Transaction& t,
  epoch_t activation_epoch,
  PeeringCtxWrapper &ctx)
{
  ceph_assert(!is_peered());

  // twiddle pg state
  state_clear(PG_STATE_DOWN);

  send_notify = false;

  if (is_primary()) {
    // only update primary last_epoch_started if we will go active
    if (acting.size() >= pool.info.min_size) {
      ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
                  info.last_epoch_started <= activation_epoch);
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  } else if (is_acting(pg_whoami)) {
    /* update last_epoch_started on acting replica to whatever the primary sent
     * unless it's smaller (could happen if we are going peered rather than
     * active, see doc/dev/osd_internals/last_epoch_started.rst) */
    if (info.last_epoch_started < activation_epoch) {
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  }

  auto &missing = pg_log.get_missing();

  min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
  if (is_primary()) {
    last_update_ondisk = info.last_update;
  }
  last_update_applied = info.last_update;
  last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();

  need_up_thru = false;

  // write pg info, log
  dirty_info = true;
  dirty_big_info = true; // maybe

  pl->schedule_event_on_commit(
    t,
    std::make_shared<PGPeeringEvent>(
      get_osdmap_epoch(),
      get_osdmap_epoch(),
      ActivateCommitted(
        get_osdmap_epoch(),
        activation_epoch)));

  // init complete pointer
  if (missing.num_missing() == 0) {
    psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
               << " -> " << info.last_update << dendl;
    info.last_complete = info.last_update;
    info.stats.stats.sum.num_objects_missing = 0;
    pg_log.reset_recovery_pointers();
  } else {
    psdout(10) << "activate - not complete, " << missing << dendl;
    info.stats.stats.sum.num_objects_missing = missing.num_missing();
    pg_log.activate_not_complete(info);
  }

  log_weirdness();

  if (is_primary()) {
    // initialize snap_trimq
    interval_set<snapid_t> to_trim;
    auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
    auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
    if (p != removed_snaps_queue.end()) {
      dout(20) << "activate - purged_snaps " << info.purged_snaps
               << " removed_snaps " << p->second
               << dendl;
      for (auto q : p->second) {
        to_trim.insert(q.first, q.second);
      }
    }
    interval_set<snapid_t> purged;
    purged.intersection_of(to_trim, info.purged_snaps);
    to_trim.subtract(purged);

    if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
      renew_lease(pl->get_mnow());
      // do not schedule until we are actually activated
    }

    // adjust purged_snaps: PG may have been inactive while snaps were pruned
    // from the removed_snaps_queue in the osdmap.  update local purged_snaps to
    // reflect only those snaps that we thought were pruned and were still in
    // the queue.
    info.purged_snaps.swap(purged);

    // start up replicas
    info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                                 prior_readable_until_ub);

    ceph_assert(!acting_recovery_backfill.empty());
    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      if (*i == pg_whoami) continue;
      pg_shard_t peer = *i;
      ceph_assert(peer_info.count(peer));
      pg_info_t& pi = peer_info[peer];

      psdout(10) << "activate peer osd." << peer << " " << pi << dendl;

      MRef<MOSDPGLog> m;
      ceph_assert(peer_missing.count(peer));
      pg_missing_t& pm = peer_missing[peer];

      bool needs_past_intervals = pi.dne();

      if (pi.last_update == info.last_update) {
        // empty log
        if (!pi.last_backfill.is_max())
          pl->get_clog_info() << info.pgid << " continuing backfill to osd."
                              << peer
                              << " from (" << pi.log_tail << "," << pi.last_update
                              << "] " << pi.last_backfill
                              << " to " << info.last_update;
        if (!pi.is_empty()) {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, queueing in pending_activators" << dendl;
          ctx.send_info(
            peer.osd,
            spg_t(info.pgid.pgid, peer.shard),
            get_osdmap_epoch(), // fixme: use lower epoch?
            get_osdmap_epoch(),
            info,
            get_lease());
        } else {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, but sending pg_log anyway" << dendl;
          m = make_message<MOSDPGLog>(
            i->shard, pg_whoami.shard,
            get_osdmap_epoch(), info,
            last_peering_reset);
        }
      } else if (
        pg_log.get_tail() > pi.last_update ||
        pi.last_backfill == hobject_t() ||
        (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
        /* ^ This last case covers a situation where a replica is not contiguous
         * with the auth_log, but is contiguous with this replica.  Reshuffling
         * the active set to handle this would be tricky, so instead we just go
         * ahead and backfill it anyway.  This is probably preferable in any
         * case since the replica in question would have to be significantly
         * behind.
         */
        // backfill
        pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
                             << " from (" << pi.log_tail << "," << pi.last_update
                             << "] " << pi.last_backfill
                             << " to " << info.last_update;

        pi.last_update = info.last_update;
        pi.last_complete = info.last_update;
        pi.set_last_backfill(hobject_t());
        pi.last_epoch_started = info.last_epoch_started;
        pi.last_interval_started = info.last_interval_started;
        pi.history = info.history;
        pi.hit_set = info.hit_set;
        // Save num_bytes for reservation request, can't be negative
        peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
        pi.stats.stats.clear();
        pi.stats.stats.sum.num_bytes = peer_bytes[peer];

        // initialize peer with our purged_snaps.
        pi.purged_snaps = info.purged_snaps;

        m = make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), pi,
          last_peering_reset /* epoch to create pg at */);

        // send some recent log, so that op dup detection works well.
        m->log.copy_up_to(cct, pg_log.get_log(),
                          cct->_conf->osd_max_pg_log_entries);
        m->info.log_tail = m->log.tail;
        pi.log_tail = m->log.tail;  // sigh...

        pm.clear();
      } else {
        // catch up
        ceph_assert(pg_log.get_tail() <= pi.last_update);
        m = make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), info,
          last_peering_reset /* epoch to create pg at */);
        // send new stuff to append to replicas log
        m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
      }

      // share past_intervals if we are creating the pg on the replica
      // based on whether our info for that peer was dne() *before*
      // updating pi.history in the backfill block above.
      if (m && needs_past_intervals)
        m->past_intervals = past_intervals;

      // update local version of peer's missing list!
      if (m && pi.last_backfill != hobject_t()) {
        for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
             p != m->log.log.end();
             ++p) {
          if (p->soid <= pi.last_backfill &&
              !p->is_error()) {
            if (perform_deletes_during_peering() && p->is_delete()) {
              pm.rm(p->soid, p->version);
            } else {
              pm.add_next_event(*p);
            }
          }
        }
      }

      if (m) {
        dout(10) << "activate peer osd." << peer << " sending " << m->log
                 << dendl;
        m->lease = get_lease();
        pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
      }

      // peer now has
      pi.last_update = info.last_update;

      // update our missing
      if (pm.num_missing() == 0) {
        pi.last_complete = pi.last_update;
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " uptodate" << dendl;
      } else {
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " missing " << pm << dendl;
      }
    }

    // Set up missing_loc
    set<pg_shard_t> complete_shards;
    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      psdout(20) << __func__ << " setting up missing_loc from shard " << *i
                 << dendl;
      if (*i == get_primary()) {
        missing_loc.add_active_missing(missing);
        if (!missing.have_missing())
          complete_shards.insert(*i);
      } else {
        auto peer_missing_entry = peer_missing.find(*i);
        ceph_assert(peer_missing_entry != peer_missing.end());
        missing_loc.add_active_missing(peer_missing_entry->second);
        if (!peer_missing_entry->second.have_missing() &&
            peer_info[*i].last_backfill.is_max())
          complete_shards.insert(*i);
      }
    }

    // If necessary, create might_have_unfound to help us find our unfound objects.
    // NOTE: It's important that we build might_have_unfound before trimming the
    // past intervals.
    might_have_unfound.clear();
    if (needs_recovery()) {
      // If only one shard has missing, we do a trick to add all the others as
      // recovery sources; this is considered safe since the PGLogs have been
      // merged locally, and covers the vast majority of use cases, like one
      // OSD/host being down for a while for hardware repair
      if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
        missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
      } else {
        missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
                                    ctx.handle);
        for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
             i != acting_recovery_backfill.end();
             ++i) {
          if (*i == pg_whoami) continue;
          psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
          ceph_assert(peer_missing.count(*i));
          ceph_assert(peer_info.count(*i));
          missing_loc.add_source_info(
            *i,
            peer_info[*i],
            peer_missing[*i],
            ctx.handle);
        }
      }
      for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
           i != peer_missing.end();
           ++i) {
        if (is_acting_recovery_backfill(i->first))
          continue;
        ceph_assert(peer_info.count(i->first));
        search_for_missing(
          peer_info[i->first],
          i->second,
          i->first,
          ctx);
      }

      build_might_have_unfound();

      // Always call now so update_calc_stats() will be accurate
      discover_all_missing(ctx.msgs);
    }

    // num_objects_degraded if calculated should reflect this too, unless no
    // missing and we are about to go clean.
    if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
      state_set(PG_STATE_UNDERSIZED);
    }

    state_set(PG_STATE_ACTIVATING);
    pl->on_activate(std::move(to_trim));
  }
  if (acting.size() >= pool.info.min_size) {
    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    pg_log.roll_forward(rollbacker.get());
  }
}
void PeeringState::share_pg_info()
{
  psdout(10) << "share_pg_info" << dendl;

  info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                               prior_readable_until_ub);

  // share new pg_info_t with replicas
  ceph_assert(!acting_recovery_backfill.empty());
  for (auto pg_shard : acting_recovery_backfill) {
    if (pg_shard == pg_whoami) continue;
    if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
      peer->second.last_epoch_started = info.last_epoch_started;
      peer->second.last_interval_started = info.last_interval_started;
      peer->second.history.merge(info.history);
    }
    Message* m = nullptr;
    if (last_require_osd_release >= ceph_release_t::octopus) {
      m = new MOSDPGInfo2{spg_t{info.pgid.pgid, pg_shard.shard},
                          info,
                          get_osdmap_epoch(),
                          get_osdmap_epoch(),
                          std::optional<pg_lease_t>{get_lease()},
                          std::nullopt};
    } else {
      m = new MOSDPGInfo{get_osdmap_epoch(),
                         {pg_notify_t{pg_shard.shard,
                                      pg_whoami.shard,
                                      get_osdmap_epoch(),
                                      get_osdmap_epoch(),
                                      info,
                                      past_intervals}}};
    }
    pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
  }
}
void PeeringState::merge_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
  pg_shard_t from)
{
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  pg_log.merge_log(
    oinfo, olog, from, info, rollbacker.get(), dirty_info, dirty_big_info);
}
void PeeringState::rewind_divergent_log(
  ObjectStore::Transaction& t, eversion_t newhead)
{
  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  pg_log.rewind_divergent_log(
    newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
}
void PeeringState::proc_primary_info(
  ObjectStore::Transaction &t, const pg_info_t &oinfo)
{
  ceph_assert(!is_primary());

  update_history(oinfo.history);
  if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
    info.stats.stats.sum.num_scrub_errors = 0;
    info.stats.stats.sum.num_shallow_scrub_errors = 0;
    info.stats.stats.sum.num_deep_scrub_errors = 0;
    dirty_info = true;
  }

  if (!(info.purged_snaps == oinfo.purged_snaps)) {
    psdout(10) << __func__ << " updating purged_snaps to "
               << oinfo.purged_snaps
               << dendl;
    info.purged_snaps = oinfo.purged_snaps;
    dirty_info = true;
    dirty_big_info = true;
  }
}
void PeeringState::proc_master_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo,
  pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
{
  psdout(10) << "proc_master_log for osd." << from << ": "
             << olog << " " << omissing << dendl;
  ceph_assert(!is_peered() && is_primary());

  // merge log into our own log to build master log.  no need to
  // make any adjustments to their missing map; we are taking their
  // log to be authoritative (i.e., their entries are definitely
  // non-divergent).
  merge_log(t, oinfo, olog, from);
  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now " << oinfo
             << " " << omissing << dendl;
  might_have_unfound.insert(from);

  // See doc/dev/osd_internals/last_epoch_started
  if (oinfo.last_epoch_started > info.last_epoch_started) {
    info.last_epoch_started = oinfo.last_epoch_started;
    dirty_info = true;
  }
  if (oinfo.last_interval_started > info.last_interval_started) {
    info.last_interval_started = oinfo.last_interval_started;
    dirty_info = true;
  }

  update_history(oinfo.history);
  ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
              info.last_epoch_started >= info.history.last_epoch_started);

  peer_missing[from].claim(omissing);
}
void PeeringState::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from)
{
  psdout(10) << "proc_replica_log for osd." << from << ": "
             << oinfo << " " << olog << " " << omissing << dendl;

  pg_log.proc_replica_log(oinfo, olog, omissing, from);

  peer_info[from] = oinfo;
  psdout(10) << " peer osd." << from << " now "
             << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  for (map<hobject_t, pg_missing_item>::const_iterator i =
         omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    psdout(20) << " after missing " << i->first
               << " need " << i->second.need
               << " have " << i->second.have << dendl;
  }
  peer_missing[from].claim(omissing);
}
void PeeringState::fulfill_info(
  pg_shard_t from, const pg_query_t &query,
  pair<pg_shard_t, pg_info_t> &notify_info)
{
  ceph_assert(from == primary);
  ceph_assert(query.type == pg_query_t::INFO);

  // info
  psdout(10) << "sending info" << dendl;
  notify_info = make_pair(from, info);
}
void PeeringState::fulfill_log(
  pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
{
  psdout(10) << "log request from " << from << dendl;
  ceph_assert(from == primary);
  ceph_assert(query.type != pg_query_t::INFO);

  MOSDPGLog *mlog = new MOSDPGLog(
    from.shard, pg_whoami.shard,
    get_osdmap_epoch(),
    info, query_epoch);
  mlog->missing = pg_log.get_missing();

  // primary -> other, when building master log
  if (query.type == pg_query_t::LOG) {
    psdout(10) << " sending info+missing+log since " << query.since
               << dendl;
    if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
      pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
                           << query.since
                           << " when my log.tail is " << pg_log.get_tail()
                           << ", sending full log instead";
      mlog->log = pg_log.get_log();  // primary should not have requested this!!
    } else
      mlog->log.copy_after(cct, pg_log.get_log(), query.since);
  }
  else if (query.type == pg_query_t::FULLLOG) {
    psdout(10) << " sending info+missing+full log" << dendl;
    mlog->log = pg_log.get_log();
  }

  psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;

  pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
}
void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
{
  if (query.query.type == pg_query_t::INFO) {
    pair<pg_shard_t, pg_info_t> notify_info;
    // note this refreshes our prior_readable_until_ub value
    update_history(query.query.history);
    fulfill_info(query.from, query.query, notify_info);
    rctx.send_notify(
      notify_info.first.osd,
      pg_notify_t(
        notify_info.first.shard, pg_whoami.shard,
        query.query_epoch,
        get_osdmap_epoch(),
        notify_info.second,
        past_intervals));
  } else {
    update_history(query.query.history);
    fulfill_log(query.from, query.query, query.query_epoch);
  }
}
void PeeringState::try_mark_clean()
{
  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
    state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    state_set(PG_STATE_CLEAN);
    info.history.last_epoch_clean = get_osdmap_epoch();
    info.history.last_interval_clean = info.history.same_interval_since;
    past_intervals.clear();
    dirty_big_info = true;
    dirty_info = true;
  }

  if (!is_active() && is_peered()) {
    if (is_clean()) {
      bool target;
      if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
        if (target) {
          psdout(10) << "ready to merge (target)" << dendl;
          pl->set_ready_to_merge_target(
            info.last_update,
            info.history.last_epoch_started,
            info.history.last_epoch_clean);
        } else {
          psdout(10) << "ready to merge (source)" << dendl;
          pl->set_ready_to_merge_source(info.last_update);
        }
      }
    } else {
      psdout(10) << "not clean, not ready to merge" << dendl;
      // we should have notified OSD in Active state entry point
    }
  }

  state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);

  share_pg_info();
  pl->publish_stats_to_osd();
  clear_recovery_state();
}
void PeeringState::split_into(
  pg_t child_pgid, PeeringState *child, unsigned split_bits)
{
  child->update_osdmap_ref(get_osdmap());
  child->pool = pool;

  // Log
  pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
  child->info.last_complete = info.last_complete;

  info.last_update = pg_log.get_head();
  child->info.last_update = child->pg_log.get_head();

  child->info.last_user_version = info.last_user_version;

  info.log_tail = pg_log.get_tail();
  child->info.log_tail = child->pg_log.get_tail();

  // reset last_complete, we might have modified pg_log & missing above
  pg_log.reset_complete_to(&info);
  child->pg_log.reset_complete_to(&child->info);

  // Info
  child->info.history = info.history;
  child->info.history.epoch_created = get_osdmap_epoch();
  child->info.purged_snaps = info.purged_snaps;

  if (info.last_backfill.is_max()) {
    child->info.set_last_backfill(hobject_t::get_max());
  } else {
    // restart backfill on parent and child to be safe.  we could
    // probably do better in the bitwise sort case, but it's more
    // fragile (there may be special work to do on backfill completion
    // in the future).
    info.set_last_backfill(hobject_t());
    child->info.set_last_backfill(hobject_t());
    // restarting backfill implies that the missing set is empty,
    // since it is only used for objects prior to last_backfill
    pg_log.reset_backfill();
    child->pg_log.reset_backfill();
  }

  child->info.stats = info.stats;
  child->info.stats.parent_split_bits = split_bits;
  info.stats.stats_invalid = true;
  child->info.stats.stats_invalid = true;
  child->info.last_epoch_started = info.last_epoch_started;
  child->info.last_interval_started = info.last_interval_started;

  // There can't be recovery/backfill going on now
  int primary, up_primary;
  vector<int> newup, newacting;
  get_osdmap()->pg_to_up_acting_osds(
    child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
  child->init_primary_up_acting(
    newup,
    newacting,
    up_primary,
    primary);
  child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);

  // this comparison includes primary rank via pg_shard_t
  if (get_primary() != child->get_primary())
    child->info.history.same_primary_since = get_osdmap_epoch();

  child->info.stats.up = up;
  child->info.stats.up_primary = up_primary;
  child->info.stats.acting = acting;
  child->info.stats.acting_primary = primary;
  child->info.stats.mapping_epoch = get_osdmap_epoch();

  // History
  child->past_intervals = past_intervals;

  child->on_new_interval();

  child->send_notify = !child->is_primary();

  child->dirty_info = true;
  child->dirty_big_info = true;
  dirty_info = true;
  dirty_big_info = true;
}
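// Split example (hypothetical pg): when a parent splits into a child with
// split_bits=3, the child inherits the parent's history (with epoch_created
// reset), purged_snaps and stats, both halves mark their stats invalid, and
// backfill progress is kept only if the parent had already finished it
// (last_backfill at max); otherwise both sides restart backfill from scratch.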
void PeeringState::merge_from(
  map<spg_t,PeeringState *>& sources,
  PeeringCtx &rctx,
  unsigned split_bits,
  const pg_merge_meta_t& last_pg_merge_meta)
{
  bool incomplete = false;
  if (info.last_complete != info.last_update ||
      info.is_incomplete() ||
      info.dne()) {
    psdout(10) << __func__ << " target incomplete" << dendl;
    incomplete = true;
  }
  if (last_pg_merge_meta.source_pgid != pg_t()) {
    if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
      psdout(10) << __func__ << " target doesn't match expected parent "
                 << last_pg_merge_meta.source_pgid.get_parent()
                 << " of source_pgid " << last_pg_merge_meta.source_pgid
                 << dendl;
      incomplete = true;
    }
    if (info.last_update != last_pg_merge_meta.target_version) {
      psdout(10) << __func__ << " target version doesn't match expected "
                 << last_pg_merge_meta.target_version << dendl;
      incomplete = true;
    }
  }

  PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
  pg_log.roll_forward(handler.get());

  info.last_complete = info.last_update;  // to fake out trim()
  pg_log.reset_recovery_pointers();
  pg_log.trim(info.last_update, info);

  vector<PGLog*> log_from;
  for (auto& i : sources) {
    auto& source = i.second;
    if (!source) {
      psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
      incomplete = true;
      continue;
    }
    if (source->info.last_complete != source->info.last_update ||
        source->info.is_incomplete() ||
        source->info.dne()) {
      psdout(10) << __func__ << " source " << source->pg_whoami
                 << " incomplete"
                 << dendl;
      incomplete = true;
    }
    if (last_pg_merge_meta.source_pgid != pg_t()) {
      if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
        dout(10) << __func__ << " source " << source->info.pgid.pgid
                 << " doesn't match expected source pgid "
                 << last_pg_merge_meta.source_pgid << dendl;
        incomplete = true;
      }
      if (source->info.last_update != last_pg_merge_meta.source_version) {
        dout(10) << __func__ << " source version doesn't match expected "
                 << last_pg_merge_meta.target_version << dendl;
        incomplete = true;
      }
    }

    // prepare log
    PGLog::LogEntryHandlerRef handler{
      source->pl->get_log_handler(rctx.transaction)};
    source->pg_log.roll_forward(handler.get());
    source->info.last_complete = source->info.last_update;  // to fake out trim()
    source->pg_log.reset_recovery_pointers();
    source->pg_log.trim(source->info.last_update, source->info);
    log_from.push_back(&source->pg_log);

    // combine stats
    info.stats.add(source->info.stats);

    // pull up last_update
    info.last_update = std::max(info.last_update, source->info.last_update);

    // adopt source's PastIntervals if target has none.  we can do this since
    // pgp_num has been reduced prior to the merge, so the OSD mappings for
    // the PGs are identical.
    if (past_intervals.empty() && !source->past_intervals.empty()) {
      psdout(10) << __func__ << " taking source's past_intervals" << dendl;
      past_intervals = source->past_intervals;
    }
  }

  info.last_complete = info.last_update;
  info.log_tail = info.last_update;
  if (incomplete) {
    info.last_backfill = hobject_t();
  }

  // merge logs
  pg_log.merge_from(log_from, info.last_update);

  // make sure we have a meaningful last_epoch_started/clean (if we were a
  // placeholder)
  if (info.history.epoch_created == 0) {
    // start with (a) source's history, since these PGs *should* have been
    // remapped in concert with each other...
    info.history = sources.begin()->second->info.history;

    // we use the last_epoch_{started,clean} we got from
    // the caller, which are the epochs that were reported by the PGs when
    // they were found to be ready for merge.
    info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
    info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
    info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
    psdout(10) << __func__
               << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
               << last_pg_merge_meta.last_epoch_clean
               << " from pool last_dec_*, source pg history was "
               << sources.begin()->second->info.history
               << dendl;

    // if the past_intervals start is later than last_epoch_clean, it
    // implies the source repeered again but the target didn't, or
    // that the source became clean in a later epoch than the target.
    // avoid the discrepancy by adjusting the interval start
    // backwards to match so that check_past_interval_bounds() will
    // not complain.
    auto pib = past_intervals.get_bounds();
    if (info.history.last_epoch_clean < pib.first) {
      psdout(10) << __func__ << " last_epoch_clean "
                 << info.history.last_epoch_clean << " < past_interval start "
                 << pib.first << ", adjusting start backwards" << dendl;
      past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
    }

    // Similarly, if the same_interval_since value is later than
    // last_epoch_clean, the next interval change will result in a
    // past_interval start that is later than last_epoch_clean.  This
    // can happen if we use the pg_history values from the merge
    // source.  Adjust the same_interval_since value backwards if that
    // happens.  (We trust the les and lec values more because they came from
    // the real target, whereas the history value we stole from the source.)
    if (info.history.last_epoch_started < info.history.same_interval_since) {
      psdout(10) << __func__ << " last_epoch_started "
                 << info.history.last_epoch_started << " < same_interval_since "
                 << info.history.same_interval_since
                 << ", adjusting pg_history backwards" << dendl;
      info.history.same_interval_since = info.history.last_epoch_clean;
      // make sure same_{up,primary}_since are <= same_interval_since
      info.history.same_up_since = std::min(
        info.history.same_up_since, info.history.same_interval_since);
      info.history.same_primary_since = std::min(
        info.history.same_primary_since, info.history.same_interval_since);
    }
  }

  dirty_info = true;
  dirty_big_info = true;
}
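// Merge-target bookkeeping note: when the target was only a placeholder
// (epoch_created == 0) it adopts a source's pg_history, while the
// last_epoch_started/clean values come from last_pg_merge_meta; the two
// backward adjustments above only exist to keep the past_intervals start and
// same_interval_since from ending up later than last_epoch_clean, which
// check_past_interval_bounds() would otherwise flag.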
void PeeringState::start_split_stats(
  const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
{
  out->resize(childpgs.size() + 1);
  info.stats.stats.sum.split(*out);
}
void PeeringState::finish_split_stats(
  const object_stat_sum_t& stats, ObjectStore::Transaction &t)
{
  info.stats.stats.sum = stats;
  write_if_dirty(t);
}
void PeeringState::update_blocked_by()
{
  // set a max on the number of blocking peers we report.  if we go
  // over, report a random subset.  keep the result sorted.
  unsigned keep = std::min<unsigned>(
    blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
  unsigned skip = blocked_by.size() - keep;
  info.stats.blocked_by.clear();
  info.stats.blocked_by.resize(keep);
  unsigned pos = 0;
  for (set<int>::iterator p = blocked_by.begin();
       p != blocked_by.end() && keep > 0;
       ++p) {
    if (skip > 0 && (rand() % (skip + keep) < skip)) {
      --skip;
    } else {
      info.stats.blocked_by[pos++] = *p;
      --keep;
    }
  }
}
static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
{
  for (auto& p : pgs)
    if (p.shard == shard)
      return true;
  return false;
}

static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
{
  for (auto& p : pgs) {
    if (p == skip)
      continue;
    if (p.shard == shard)
      return p;
  }
  return pg_shard_t();
}
void PeeringState::update_calc_stats()
{
  info.stats.version = info.last_update;
  info.stats.created = info.history.epoch_created;
  info.stats.last_scrub = info.history.last_scrub;
  info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
  info.stats.last_deep_scrub = info.history.last_deep_scrub;
  info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
  info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
  info.stats.last_epoch_clean = info.history.last_epoch_clean;

  info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
  info.stats.ondisk_log_size = info.stats.log_size;
  info.stats.log_start = pg_log.get_tail();
  info.stats.ondisk_log_start = pg_log.get_tail();
  info.stats.snaptrimq_len = pl->get_snap_trimq_size();

  unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);

  // In rare case that upset is too large (usually transient), use as target
  // for calculations below.
  unsigned target = std::max(num_shards, (unsigned)upset.size());
  // For undersized actingset may be larger with OSDs out
  unsigned nrep = std::max(actingset.size(), upset.size());
  // calc num_object_copies
  info.stats.stats.calc_copies(std::max(target, nrep));
  info.stats.stats.sum.num_objects_degraded = 0;
  info.stats.stats.sum.num_objects_unfound = 0;
  info.stats.stats.sum.num_objects_misplaced = 0;
  info.stats.avail_no_missing.clear();
  info.stats.object_location_counts.clear();

  // We should never hit this condition, but if we end up hitting it,
  // make sure to update num_objects and set PG_STATE_INCONSISTENT.
  if (info.stats.stats.sum.num_objects < 0) {
    psdout(0) << __func__ << " negative num_objects = "
              << info.stats.stats.sum.num_objects << " setting it to 0 "
              << dendl;
    info.stats.stats.sum.num_objects = 0;
    state_set(PG_STATE_INCONSISTENT);
  }

  if ((is_remapped() || is_undersized() || !is_clean()) &&
      (is_peered() || is_activating())) {
    psdout(20) << __func__ << " actingset " << actingset << " upset "
               << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;

    ceph_assert(!acting_recovery_backfill.empty());

    bool estimate = false;

    // NOTE: we only generate degraded, misplaced and unfound
    // values for the summation, not individual stat categories.
    int64_t num_objects = info.stats.stats.sum.num_objects;

    // Objects missing from up nodes, sorted by # objects.
    boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
    // Objects missing from nodes not in up, sorted by # objects
    boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;

    // Fill missing_target_objects/acting_source_objects

    {
      int64_t missing;

      // Primary first
      missing = pg_log.get_missing().num_missing();
      ceph_assert(acting_recovery_backfill.count(pg_whoami));
      if (upset.count(pg_whoami)) {
        missing_target_objects.emplace(missing, pg_whoami);
      } else {
        acting_source_objects.emplace(missing, pg_whoami);
      }
      info.stats.stats.sum.num_objects_missing_on_primary = missing;
      if (missing == 0)
        info.stats.avail_no_missing.push_back(pg_whoami);
      psdout(20) << __func__ << " shard " << pg_whoami
                 << " primary objects " << num_objects
                 << " missing " << missing
                 << dendl;
    }

    // All other peers
    for (auto& peer : peer_info) {
      // Primary should not be in the peer_info, skip if it is.
      if (peer.first == pg_whoami) continue;
      int64_t missing = 0;
      int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
      // Backfill targets always track num_objects accurately
      // all other peers track missing accurately.
      if (is_backfill_target(peer.first)) {
        missing = std::max((int64_t)0, num_objects - peer_num_objects);
      } else {
        if (peer_missing.count(peer.first)) {
          missing = peer_missing[peer.first].num_missing();
        } else {
          psdout(20) << __func__ << " no peer_missing found for "
                     << peer.first << dendl;
          if (is_recovering()) {
            estimate = true;
          }
          missing = std::max((int64_t)0, num_objects - peer_num_objects);
        }
      }
      if (upset.count(peer.first)) {
        missing_target_objects.emplace(missing, peer.first);
      } else if (actingset.count(peer.first)) {
        acting_source_objects.emplace(missing, peer.first);
      }
      peer.second.stats.stats.sum.num_objects_missing = missing;
      if (missing == 0)
        info.stats.avail_no_missing.push_back(peer.first);
      psdout(20) << __func__ << " shard " << peer.first
                 << " objects " << peer_num_objects
                 << " missing " << missing
                 << dendl;
    }

    // Compute object_location_counts
    for (auto& ml : missing_loc.get_missing_locs()) {
      info.stats.object_location_counts[ml.second]++;
      psdout(30) << __func__ << " " << ml.first << " object_location_counts["
                 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
                 << dendl;
    }
    int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
    if (not_missing) {
      // During recovery we know upset == actingset and is being populated
      // During backfill we know that all non-missing objects are in the actingset
      info.stats.object_location_counts[actingset] = not_missing;
    }
    psdout(30) << __func__ << " object_location_counts["
               << upset << "]=" << info.stats.object_location_counts[upset]
               << dendl;
    psdout(20) << __func__ << " object_location_counts "
               << info.stats.object_location_counts << dendl;

    // A misplaced object is not stored on the correct OSD
    int64_t misplaced = 0;
    // a degraded object has fewer replicas or EC shards than the pool specifies.
    int64_t degraded = 0;

    if (is_recovering()) {
      for (auto& sml : missing_loc.get_missing_by_count()) {
        for (auto& ml : sml.second) {
          int missing_shards;
          if (sml.first == shard_id_t::NO_SHARD) {
            psdout(20) << __func__ << " ml " << ml.second
                       << " upset size " << upset.size()
                       << " up " << ml.first.up << dendl;
            missing_shards = (int)upset.size() - ml.first.up;
          } else {
            // Handle shards not even in upset below
            if (!find_shard(upset, sml.first))
              continue;
            missing_shards = std::max(0, 1 - ml.first.up);
            psdout(20) << __func__
                       << " shard " << sml.first
                       << " ml " << ml.second
                       << " missing shards " << missing_shards << dendl;
          }
          int odegraded = ml.second * missing_shards;
          // Copies on other osds but limited to the possible degraded
          int more_osds = std::min(missing_shards, ml.first.other);
          int omisplaced = ml.second * more_osds;
          ceph_assert(omisplaced <= odegraded);
          odegraded -= omisplaced;

          misplaced += omisplaced;
          degraded += odegraded;
        }
      }

      psdout(20) << __func__ << " missing based degraded "
                 << degraded << dendl;
      psdout(20) << __func__ << " missing based misplaced "
                 << misplaced << dendl;

      // Handle undersized case
      if (pool.info.is_replicated()) {
        // Add degraded for missing targets (num_objects missing)
        ceph_assert(target >= upset.size());
        unsigned needed = target - upset.size();
        degraded += num_objects * needed;
      } else {
        for (unsigned i = 0 ; i < num_shards; ++i) {
          shard_id_t shard(i);

          if (!find_shard(upset, shard)) {
            pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);

            if (pgs != pg_shard_t()) {
              int64_t missing;

              if (pgs == pg_whoami)
                missing = info.stats.stats.sum.num_objects_missing_on_primary;
              else
                missing = peer_info[pgs].stats.stats.sum.num_objects_missing;

              degraded += missing;
              misplaced += std::max((int64_t)0, num_objects - missing);
            } else {
              // No shard anywhere
              degraded += num_objects;
            }
          }
        }
      }
      goto out;
    }

    // Handle undersized case
    if (pool.info.is_replicated()) {
      // Add to missing_target_objects
      ceph_assert(target >= missing_target_objects.size());
      unsigned needed = target - missing_target_objects.size();
      if (needed)
        missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
    } else {
      for (unsigned i = 0 ; i < num_shards; ++i) {
        shard_id_t shard(i);
        bool found = false;
        for (const auto& t : missing_target_objects) {
          if (std::get<1>(t).shard == shard) {
            found = true;
            break;
          }
        }
        if (!found)
          missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD, shard));
      }
    }

    for (const auto& item : missing_target_objects)
      psdout(20) << __func__ << " missing shard " << std::get<1>(item)
                 << " missing= " << std::get<0>(item) << dendl;
    for (const auto& item : acting_source_objects)
      psdout(20) << __func__ << " acting shard " << std::get<1>(item)
                 << " missing= " << std::get<0>(item) << dendl;

    // Handle all objects not in missing for remapped
    // or backfill
    for (auto m = missing_target_objects.rbegin();
         m != missing_target_objects.rend(); ++m) {

      int64_t extra_missing = -1;

      if (pool.info.is_replicated()) {
        if (!acting_source_objects.empty()) {
          auto extra_copy = acting_source_objects.begin();
          extra_missing = std::get<0>(*extra_copy);
          acting_source_objects.erase(extra_copy);
        }
      } else {  // Erasure coded
        // Use corresponding shard
        for (const auto& a : acting_source_objects) {
          if (std::get<1>(a).shard == std::get<1>(*m).shard) {
            extra_missing = std::get<0>(a);
            acting_source_objects.erase(a);
            break;
          }
        }
      }

      if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
        // We don't know which of the objects on the target
        // are part of extra_missing so assume they are all degraded.
        misplaced += std::get<0>(*m) - extra_missing;
        degraded += extra_missing;
      } else {
        // 1. extra_missing == -1, more targets than sources so degraded
        // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
        //    previously degraded are now present on the target.
        degraded += std::get<0>(*m);
      }
    }
    // If there are still acting that haven't been accounted for
    // then they are misplaced
    for (const auto& a : acting_source_objects) {
      int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
      psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
                 << dendl;
      misplaced += extra_misplaced;
    }
 out:
    // NOTE: Tests use these messages to verify this code
    psdout(20) << __func__ << " degraded " << degraded
               << (estimate ? " (est)": "") << dendl;
    psdout(20) << __func__ << " misplaced " << misplaced
               << (estimate ? " (est)": "") << dendl;

    info.stats.stats.sum.num_objects_degraded = degraded;
    info.stats.stats.sum.num_objects_unfound = get_num_unfound();
    info.stats.stats.sum.num_objects_misplaced = misplaced;
  }
}
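// Worked example (hypothetical counts): a size-3 replicated PG with
// num_objects=100 and only two up shards, one of which is missing 20 objects,
// accumulates roughly 100 degraded object copies for the absent shard plus 20
// for the lagging one; copies that exist on an OSD outside the up set are
// counted as misplaced rather than degraded.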
std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
  bool pg_stats_publish_valid,
  const pg_stat_t &pg_stats_publish,
  const object_stat_collection_t &unstable_stats)
{
  if (info.stats.stats.sum.num_scrub_errors) {
    state_set(PG_STATE_INCONSISTENT);
  } else {
    state_clear(PG_STATE_INCONSISTENT);
    state_clear(PG_STATE_FAILED_REPAIR);
  }

  utime_t now = ceph_clock_now();
  if (info.stats.state != state) {
    info.stats.last_change = now;
    // Optimistic estimation: if we just found out a PG went inactive,
    // assume it was active until now.
    if (!(state & PG_STATE_ACTIVE) &&
        (info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_active = now;

    if ((state & PG_STATE_ACTIVE) &&
        !(info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_became_active = now;
    if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
        !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
      info.stats.last_became_peered = now;
    info.stats.state = state;
  }

  update_calc_stats();
  if (info.stats.stats.sum.num_objects_degraded) {
    state_set(PG_STATE_DEGRADED);
  } else {
    state_clear(PG_STATE_DEGRADED);
  }
  update_blocked_by();

  pg_stat_t pre_publish = info.stats;
  pre_publish.stats.add(unstable_stats);
  utime_t cutoff = now;
  cutoff -= cct->_conf->osd_pg_stat_report_interval_max;

  // share (some of) our purged_snaps via the pg_stats.  limit # of intervals
  // because we don't want to make the pg_stat_t structures too expensive.
  unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
  unsigned num = 0;
  auto i = info.purged_snaps.begin();
  while (num < max && i != info.purged_snaps.end()) {
    pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
    ++num;
    ++i;
  }
  psdout(20) << __func__ << " reporting purged_snaps "
             << pre_publish.purged_snaps << dendl;

  if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
      info.stats.last_fresh > cutoff) {
    psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
               << ": no change since " << info.stats.last_fresh << dendl;
    return std::nullopt;
  } else {
    // update our stat summary and timestamps
    info.stats.reported_epoch = get_osdmap_epoch();
    ++info.stats.reported_seq;

    info.stats.last_fresh = now;

    if (info.stats.state & PG_STATE_CLEAN)
      info.stats.last_clean = now;
    if (info.stats.state & PG_STATE_ACTIVE)
      info.stats.last_active = now;
    if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
      info.stats.last_peered = now;
    info.stats.last_unstale = now;
    if ((info.stats.state & PG_STATE_DEGRADED) == 0)
      info.stats.last_undegraded = now;
    if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
      info.stats.last_fullsized = now;

    psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
               << ":" << pg_stats_publish.reported_seq << dendl;
    return std::make_optional(std::move(pre_publish));
  }
}
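// Publication throttle: stats are republished when anything in pre_publish
// changed or when last_fresh has fallen behind now minus
// osd_pg_stat_report_interval_max; otherwise std::nullopt tells the caller to
// skip this reporting round.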
void PeeringState::init(
  int role,
  const vector<int>& newup, int new_up_primary,
  const vector<int>& newacting, int new_acting_primary,
  const pg_history_t& history,
  const PastIntervals& pi,
  bool backfill,
  ObjectStore::Transaction &t)
{
  psdout(10) << "init role " << role << " up "
             << newup << " acting " << newacting
             << " history " << history
             << " past_intervals " << pi
             << dendl;

  set_role(role);
  init_primary_up_acting(
    newup,
    newacting,
    new_up_primary,
    new_acting_primary);

  info.history = history;
  past_intervals = pi;

  info.stats.up = up;
  info.stats.up_primary = new_up_primary;
  info.stats.acting = acting;
  info.stats.acting_primary = new_acting_primary;
  info.stats.mapping_epoch = info.history.same_interval_since;

  if (!perform_deletes_during_peering()) {
    pg_log.set_missing_may_contain_deletes();
  }

  if (backfill) {
    psdout(10) << __func__ << ": Setting backfill" << dendl;
    info.set_last_backfill(hobject_t());
    info.last_complete = info.last_update;
    pg_log.mark_log_for_rewrite();
  }

  on_new_interval();

  dirty_info = true;
  dirty_big_info = true;
  write_if_dirty(t);
}
void PeeringState::dump_peering_state(Formatter *f)
{
  f->dump_string("state", get_pg_state_string());
  f->dump_unsigned("epoch", get_osdmap_epoch());
  f->open_array_section("up");
  for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
    f->dump_unsigned("osd", *p);
  f->close_section();
  f->open_array_section("acting");
  for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
    f->dump_unsigned("osd", *p);
  f->close_section();
  if (!backfill_targets.empty()) {
    f->open_array_section("backfill_targets");
    for (set<pg_shard_t>::iterator p = backfill_targets.begin();
         p != backfill_targets.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  if (!async_recovery_targets.empty()) {
    f->open_array_section("async_recovery_targets");
    for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
         p != async_recovery_targets.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  if (!acting_recovery_backfill.empty()) {
    f->open_array_section("acting_recovery_backfill");
    for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
         p != acting_recovery_backfill.end();
         ++p)
      f->dump_stream("shard") << *p;
    f->close_section();
  }
  f->open_object_section("info");
  update_calc_stats();
  info.dump(f);
  f->close_section();

  f->open_array_section("peer_info");
  for (map<pg_shard_t, pg_info_t>::const_iterator p = peer_info.begin();
       p != peer_info.end();
       ++p) {
    f->open_object_section("info");
    f->dump_stream("peer") << p->first;
    p->second.dump(f);
    f->close_section();
  }
  f->close_section();
}
void PeeringState::update_stats(
  std::function<bool(pg_history_t &, pg_stat_t &)> f,
  ObjectStore::Transaction *t) {
  if (f(info.history, info.stats)) {
    pl->publish_stats_to_osd();
  }
  pl->on_info_history_change();

  if (t) {
    dirty_info = true;
    write_if_dirty(*t);
  }
}
bool PeeringState::append_log_entries_update_missing(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
  std::optional<eversion_t> roll_forward_to)
{
  ceph_assert(!entries.empty());
  ceph_assert(entries.begin()->version > info.last_update);

  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  bool invalidate_stats =
    pg_log.append_new_log_entries(
      info.last_backfill,
      entries,
      rollbacker.get());

  if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
    pg_log.roll_forward(rollbacker.get());
  }
  if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
    last_rollback_info_trimmed_to_applied = *roll_forward_to;
  }

  info.last_update = pg_log.get_head();

  if (pg_log.get_missing().num_missing() == 0) {
    // advance last_complete since nothing else is missing!
    info.last_complete = info.last_update;
  }
  info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;

  psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
             << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
  if (trim_to)
    pg_log.trim(*trim_to, info);
  dirty_info = true;
  write_if_dirty(t);
  return invalidate_stats;
}
void PeeringState::merge_new_log_entries(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObjectStore::Transaction &t,
  std::optional<eversion_t> trim_to,
  std::optional<eversion_t> roll_forward_to)
{
  psdout(10) << __func__ << " " << entries << dendl;
  ceph_assert(is_primary());

  bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
  for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
       i != acting_recovery_backfill.end();
       ++i) {
    pg_shard_t peer(*i);
    if (peer == pg_whoami) continue;
    ceph_assert(peer_missing.count(peer));
    ceph_assert(peer_info.count(peer));
    pg_missing_t& pmissing(peer_missing[peer]);
    psdout(20) << __func__ << " peer_missing for " << peer
               << " = " << pmissing << dendl;
    pg_info_t& pinfo(peer_info[peer]);
    bool invalidate_stats = PGLog::append_log_entries_update_missing(
      pinfo.last_backfill,
      entries,
      true,
      NULL,
      pmissing,
      NULL,
      dpp);
    pinfo.last_update = info.last_update;
    pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
    rebuild_missing = rebuild_missing || invalidate_stats;
  }

  if (!rebuild_missing) {
    return;
  }

  for (auto &&i : entries) {
    missing_loc.rebuild(
      i.soid,
      pg_whoami,
      acting_recovery_backfill,
      info,
      pg_log.get_missing(),
      peer_missing,
      peer_info);
  }
}
void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
{
  // raise last_complete only if we were previously up to date
  if (info.last_complete == info.last_update)
    info.last_complete = e.version;

  // raise last_update.
  ceph_assert(e.version > info.last_update);
  info.last_update = e.version;

  // raise user_version, if it increased (it may not have been bumped
  // by all logged updates)
  if (e.user_version > info.last_user_version)
    info.last_user_version = e.user_version;

  // log mutation
  pg_log.add(e, applied);
  psdout(10) << "add_log_entry " << e << dendl;
}
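// append_log() is the write-side entry point: it first folds last_epoch_started
// and last_interval_started into info.history so this write cannot be
// remembered without the interval that made it possible, then appends each
// entry and trims the log.  Backfill / async-recovery peers
// (transaction_applied == false) only advance the crt via skip_rollforward()
// instead of doing a real rollforward.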
void PeeringState::append_log(
  const vector<pg_log_entry_t>& logv,
  eversion_t trim_to,
  eversion_t roll_forward_to,
  eversion_t mlcod,
  ObjectStore::Transaction &t,
  bool transaction_applied,
  bool async)
{
  /* The primary has sent an info updating the history, but it may not
   * have arrived yet.  We want to make sure that we cannot remember this
   * write without remembering that it happened in an interval which went
   * active in epoch history.last_epoch_started.
   */
  if (info.last_epoch_started != info.history.last_epoch_started) {
    info.history.last_epoch_started = info.last_epoch_started;
  }
  if (info.last_interval_started != info.history.last_interval_started) {
    info.history.last_interval_started = info.last_interval_started;
  }
  psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;

  PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
  if (!transaction_applied) {
    /* We must be a backfill or async recovery peer, so it's ok if we apply
     * out-of-turn since we won't be considered when
     * determining a min possible last_update.
     *
     * We skip_rollforward() here, which advances the crt, without
     * doing an actual rollforward. This avoids cleaning up entries
     * from the backend and we do not end up in a situation, where the
     * object is deleted before we can _merge_object_divergent_entries().
     */
    pg_log.skip_rollforward();
  }

  for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
       p != logv.end();
       ++p) {
    add_log_entry(*p, transaction_applied);

    /* We don't want to leave the rollforward artifacts around
     * here past last_backfill.  It's ok for the same reason as
     * above */
    if (transaction_applied &&
        p->soid > info.last_backfill) {
      pg_log.roll_forward(handler.get());
    }
  }

  if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(
      roll_forward_to,
      handler.get());
    last_rollback_info_trimmed_to_applied = roll_forward_to;
  }

  psdout(10) << __func__ << " approx pg log length = "
             << pg_log.get_log().approx_size() << dendl;
  psdout(10) << __func__ << " transaction_applied = "
             << transaction_applied << dendl;
  if (!transaction_applied || async)
    psdout(10) << __func__ << " " << pg_whoami
               << " is async_recovery or backfill target" << dendl;
  pg_log.trim(trim_to, info, transaction_applied, async);

  // update the local pg, pg log
  // ...
  min_last_complete_ondisk = mlcod;
}
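// recover_got() records that a missing object has been recovered locally:
// can_rollback_to is advanced past the recovered version if needed (a repair
// corner case), last_complete is recomputed via pg_log.recover_got(), and the
// object is registered as locally present in missing_loc.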
void PeeringState::recover_got(
  const hobject_t &oid, eversion_t v,
  bool is_delete,
  ObjectStore::Transaction &t)
{
  if (v > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
    pg_log.roll_forward_to(v, handler.get());
  }

  psdout(10) << "got missing " << oid << " v " << v << dendl;
  pg_log.recover_got(oid, v, info);
  if (pg_log.get_log().log.empty()) {
    psdout(10) << "last_complete now " << info.last_complete
               << " while log is empty" << dendl;
  } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
    psdout(10) << "last_complete now " << info.last_complete
               << " log.complete_to " << pg_log.get_log().complete_to->version
               << dendl;
  } else {
    psdout(10) << "last_complete now " << info.last_complete
               << " log.complete_to at end" << dendl;
    //below is not true in the repair case.
    //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
    ceph_assert(info.last_complete == info.last_update);
  }

  if (is_delete)
    return;

  ceph_assert(missing_loc.needs_recovery(oid));
  missing_loc.add_location(oid, pg_whoami);
}
void PeeringState::update_backfill_progress(
  const hobject_t &updated_backfill,
  const pg_stat_t &updated_stats,
  bool preserve_local_num_bytes,
  ObjectStore::Transaction &t) {
  info.set_last_backfill(updated_backfill);
  if (preserve_local_num_bytes) {
    psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
               << " local " << info.stats.stats.sum.num_bytes << dendl;
    int64_t bytes = info.stats.stats.sum.num_bytes;
    info.stats = updated_stats;
    info.stats.stats.sum.num_bytes = bytes;
  } else {
    psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
               << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
    info.stats = updated_stats;
  }
  // ...
}

void PeeringState::adjust_purged_snaps(
  std::function<void(interval_set<snapid_t> &snaps)> f) {
  f(info.purged_snaps);
  dirty_info = true;
  dirty_big_info = true;
}
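// The next three helpers keep per-peer missing sets in step with recovery
// traffic: on_peer_recover() marks an object recovered on a peer and records
// the new location, begin_peer_recover() temporarily revises the peer's "have"
// to zero while the push is in flight, and force_object_missing() marks an
// object missing on a set of shards (the primary's own copy goes through
// pg_log.missing_add) before rebuilding missing_loc.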
void PeeringState::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const eversion_t &version)
{
  pl->publish_stats_to_osd();
  peer_missing[peer].got(soid, version);
  missing_loc.add_location(soid, peer);
}

void PeeringState::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PeeringState::force_object_missing(
  const set<pg_shard_t> &peers,
  const hobject_t &soid,
  eversion_t version)
{
  for (auto &&peer : peers) {
    if (peer != primary) {
      peer_missing[peer].add(soid, version, eversion_t(), false);
    } else {
      pg_log.missing_add(soid, version, eversion_t());
      pg_log.reset_complete_to(&info);
      pg_log.set_last_requested(0);
    }
  }

  missing_loc.rebuild(
    soid,
    pg_whoami,
    acting_recovery_backfill,
    /* ... */
    pg_log.get_missing(),
    /* ... */);
}
void PeeringState::pre_submit_op(
  const hobject_t &hoid,
  const vector<pg_log_entry_t>& logv,
  eversion_t at_version)
{
  if (at_version > eversion_t()) {
    for (auto &&i : get_acting_recovery_backfill()) {
      if (i == primary) continue;
      pg_info_t &pinfo = peer_info[i];
      // keep peer_info up to date
      if (pinfo.last_complete == pinfo.last_update)
        pinfo.last_complete = at_version;
      pinfo.last_update = at_version;
    }
  }

  bool requires_missing_loc = false;
  for (auto &&i : get_async_recovery_targets()) {
    if (i == primary || !get_peer_missing(i).is_missing(hoid))
      continue;
    requires_missing_loc = true;
    for (auto &&entry : logv) {
      peer_missing[i].add_next_event(entry);
    }
  }

  if (requires_missing_loc) {
    for (auto &&entry : logv) {
      psdout(30) << __func__ << " missing_loc before: "
                 << missing_loc.get_locations(entry.soid) << dendl;
      missing_loc.add_missing(entry.soid, entry.version,
                              eversion_t(), entry.is_delete());
      // clear out missing_loc
      missing_loc.clear_location(entry.soid);
      for (auto &i : get_actingset()) {
        if (!get_peer_missing(i).is_missing(entry.soid))
          missing_loc.add_location(entry.soid, i);
      }
      psdout(30) << __func__ << " missing_loc after: "
                 << missing_loc.get_locations(entry.soid) << dendl;
    }
  }
}
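// recovery_committed_to()/complete_write() track what is durable on disk.
// When a replica's last_complete_ondisk catches up to last_update it reports
// that back to the primary (the cluster message below; the MOSDPGTrim
// constructor arguments are reconstructed and should be treated as an
// assumption), and the primary recomputes min_last_complete_ondisk, the lower
// bound used by the trim calculations that follow.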
void PeeringState::recovery_committed_to(eversion_t version)
{
  psdout(10) << __func__ << " version " << version
             << " now ondisk" << dendl;
  last_complete_ondisk = version;

  if (last_complete_ondisk == info.last_update) {
    if (!is_primary()) {
      // Either we are a replica or backfill target.
      // we are fully up to date.  tell the primary!
      pl->send_cluster_message(
        get_primary().osd,
        new MOSDPGTrim(
          get_osdmap_epoch(),
          spg_t(info.pgid.pgid, primary.shard),
          last_complete_ondisk),
        get_osdmap_epoch());
    } else {
      calc_min_last_complete_ondisk();
    }
  }
}

void PeeringState::complete_write(eversion_t v, eversion_t lc)
{
  last_update_ondisk = v;
  last_complete_ondisk = lc;
  calc_min_last_complete_ondisk();
}
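// calc_trim_to(): pick a new pg_trim_to.  Roughly: never trim past
// min(min_last_complete_ondisk, can_rollback_to), only trim when the log
// exceeds the target size, and skip trimming entirely when fewer than
// osd_pg_log_trim_min entries would be removed.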
void PeeringState::calc_trim_to()
{
  size_t target = pl->get_target_pg_log_entries();

  eversion_t limit = std::min(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
                                  cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
void PeeringState::calc_trim_to_aggressive()
{
  size_t target = pl->get_target_pg_log_entries();

  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min(
    /* ... */
    pg_log.get_can_rollback_to());
  psdout(10) << __func__ << " limit = " << limit << dendl;

  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    psdout(10) << __func__ << " approx pg log length = "
               << pg_log.get_log().approx_size() << dendl;
    uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
                                              cct->_conf->osd_pg_log_trim_max);
    psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    auto it = pg_log.get_log().log.begin(); // oldest log entry
    auto rit = pg_log.get_log().log.rbegin();
    eversion_t by_n_to_keep; // start from tail
    eversion_t by_n_to_trim = eversion_t::max(); // start from head
    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
      ++i;
      if (i > target && by_n_to_keep == eversion_t()) {
        by_n_to_keep = rit->version;
      }
      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
        by_n_to_trim = it->version;
      }
      if (by_n_to_keep != eversion_t() &&
          by_n_to_trim != eversion_t::max()) {
        break;
      }
    }

    if (by_n_to_keep == eversion_t()) {
      return;
    }

    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
    psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
    ceph_assert(pg_trim_to <= pg_log.get_head());
  }
}
void PeeringState::apply_op_stats(
  const hobject_t &soid,
  const object_stat_sum_t &delta_stats)
{
  info.stats.stats.add(delta_stats);
  info.stats.stats.floor(0);

  for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
       i != get_backfill_targets().end();
       ++i) {
    pg_shard_t bt = *i;
    pg_info_t& pinfo = peer_info[bt];
    if (soid <= pinfo.last_backfill)
      pinfo.stats.stats.add(delta_stats);
  }
}

void PeeringState::update_complete_backfill_object_stats(
  const hobject_t &hoid,
  const pg_stat_t &stats)
{
  for (auto &&bt : get_backfill_targets()) {
    pg_info_t& pinfo = peer_info[bt];
    //Add stats to all peers that were missing object
    if (hoid > pinfo.last_backfill)
      pinfo.stats.add(stats);
  }
}

void PeeringState::update_peer_last_backfill(
  pg_shard_t peer,
  const hobject_t &new_last_backfill)
{
  pg_info_t &pinfo = peer_info[peer];
  pinfo.last_backfill = new_last_backfill;
  if (new_last_backfill.is_max()) {
    /* pinfo.stats might be wrong if we did log-based recovery on the
     * backfilled portion in addition to continuing backfill.
     */
    pinfo.stats = info.stats;
  }
}

void PeeringState::set_revert_with_targets(
  const hobject_t &soid,
  const set<pg_shard_t> &good_peers)
{
  for (auto &&peer : good_peers) {
    missing_loc.add_location(soid, peer);
  }
}

void PeeringState::prepare_backfill_for_missing(
  const hobject_t &soid,
  const eversion_t &version,
  const vector<pg_shard_t> &targets) {
  for (auto &&peer : targets) {
    peer_missing[peer].add(soid, version, eversion_t(), false);
  }
}

void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
{
  info.hit_set = hset_history;
}
/*------------ Peering State Machine----------------*/

#undef dout_prefix
#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
                     << "state<" << get_state_name() << ">: ")
#undef psdout
#define psdout(x) ldout(context< PeeringMachine >().cct, x)

#define DECLARE_LOCALS                                    \
  PeeringState *ps = context< PeeringMachine >().state;   \
  PeeringListener *pl = context< PeeringMachine >().pl
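// Convention for the state implementations below: every state constructor
// calls log_enter() and every exit() calls log_exit() plus a per-state latency
// perf counter; DECLARE_LOCALS brings the owning PeeringState (ps) and
// PeeringListener (pl) into scope from the enclosing PeeringMachine context.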
/*------Crashed-------*/
PeeringState::Crashed::Crashed(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Crashed")
{
  context< PeeringMachine >().log_enter(state_name);
  ceph_abort_msg("we got a bad state machine event");
}

/*------Initial-------*/
PeeringState::Initial::Initial(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Initial")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
{
  DECLARE_LOCALS;
  ps->proc_replica_info(
    notify.from, notify.notify.info, notify.notify.epoch_sent);
  ps->set_last_peering_reset();
  return transit< Primary >();
}

boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  // ...
  return transit< Stray >();
}

boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  // ...
  return transit< Stray >();
}

void PeeringState::Initial::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_initial_latency, dur);
}

/*------Started-------*/
PeeringState::Started::Started(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::Started::react(const IntervalFlush&)
{
  psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
  context< PeeringMachine >().state->end_block_outgoing();
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Started advmap" << dendl;
  ps->check_full_transition(advmap.lastmap, advmap.osdmap);
  if (ps->should_restart_peering(
        /* ... */
        advmap.acting_primary,
        /* ... */)) {
    psdout(10) << "should_restart_peering, transitioning to Reset"
               << dendl;
    post_event(advmap);
    return transit< Reset >();
  }
  ps->remove_down_peer_info(advmap.osdmap);
  return discard_event();
}

boost::statechart::result PeeringState::Started::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return discard_event();
}

void PeeringState::Started::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_started_latency, dur);
  ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
}
/*--------Reset---------*/
PeeringState::Reset::Reset(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Reset")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->flushes_in_progress = 0;
  ps->set_last_peering_reset();
  ps->log_weirdness();
}

boost::statechart::result
PeeringState::Reset::react(const IntervalFlush&)
{
  psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
  context< PeeringMachine >().state->end_block_outgoing();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Reset advmap" << dendl;

  ps->check_full_transition(advmap.lastmap, advmap.osdmap);

  if (ps->should_restart_peering(
        /* ... */
        advmap.acting_primary,
        /* ... */)) {
    psdout(10) << "should restart peering, calling start_peering_interval again"
               << dendl;
    ps->start_peering_interval(
      advmap.lastmap,
      advmap.newup, advmap.up_primary,
      advmap.newacting, advmap.acting_primary,
      context< PeeringMachine >().get_cur_transaction());
  }
  ps->remove_down_peer_info(advmap.osdmap);
  ps->check_past_interval_bounds();
  return discard_event();
}

boost::statechart::result PeeringState::Reset::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(),
      ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }

  ps->update_heartbeat_peers();

  return transit< Started >();
}

boost::statechart::result PeeringState::Reset::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return discard_event();
}

void PeeringState::Reset::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_reset_latency, dur);
}

/*-------Start---------*/
PeeringState::Start::Start(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Start")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  if (ps->is_primary()) {
    psdout(1) << "transitioning to Primary" << dendl;
    post_event(MakePrimary());
  } else {
    psdout(1) << "transitioning to Stray" << dendl;
    post_event(MakeStray());
  }
}

void PeeringState::Start::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_start_latency, dur);
}
/*---------Primary--------*/
PeeringState::Primary::Primary(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ceph_assert(ps->want_acting.empty());

  // set CREATING bit until we have peered for the first time.
  if (ps->info.history.last_epoch_started == 0) {
    ps->state_set(PG_STATE_CREATING);
    // use the history timestamp, which ultimately comes from the
    // monitor in the create case.
    utime_t t = ps->info.history.last_scrub_stamp;
    ps->info.stats.last_fresh = t;
    ps->info.stats.last_active = t;
    ps->info.stats.last_change = t;
    ps->info.stats.last_peered = t;
    ps->info.stats.last_clean = t;
    ps->info.stats.last_unstale = t;
    ps->info.stats.last_undegraded = t;
    ps->info.stats.last_fullsized = t;
    ps->info.stats.last_scrub_stamp = t;
    ps->info.stats.last_deep_scrub_stamp = t;
    ps->info.stats.last_clean_scrub_stamp = t;
  }
}

boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
{
  DECLARE_LOCALS;
  psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
  ps->proc_replica_info(
    notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(const ActMap&)
{
  DECLARE_LOCALS;
  psdout(7) << "handle ActMap primary" << dendl;
  pl->publish_stats_to_osd();
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const SetForceRecovery&)
{
  DECLARE_LOCALS;
  ps->set_force_recovery(true);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const UnsetForceRecovery&)
{
  DECLARE_LOCALS;
  ps->set_force_recovery(false);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const RequestScrub& evt)
{
  DECLARE_LOCALS;
  if (ps->is_primary()) {
    pl->scrub_requested(evt.deep, evt.repair);
    psdout(10) << "marking for scrub" << dendl;
  }
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const SetForceBackfill&)
{
  DECLARE_LOCALS;
  ps->set_force_backfill(true);
  return discard_event();
}

boost::statechart::result PeeringState::Primary::react(
  const UnsetForceBackfill&)
{
  DECLARE_LOCALS;
  ps->set_force_backfill(false);
  return discard_event();
}

void PeeringState::Primary::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->want_acting.clear();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_primary_latency, dur);
  pl->clear_primary_state();
  ps->state_clear(PG_STATE_CREATING);
}
/*---------Peering--------*/
PeeringState::Peering::Peering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
    history_les_bound(false)
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ceph_assert(!ps->is_peered());
  ceph_assert(!ps->is_peering());
  ceph_assert(ps->is_primary());
  ps->state_set(PG_STATE_PEERING);
}

boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  psdout(10) << "Peering advmap" << dendl;
  if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
    psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  ps->adjust_need_up_thru(advmap.osdmap);
  ps->check_prior_readable_down_osds(advmap.osdmap);

  return forward_event();
}

boost::statechart::result PeeringState::Peering::react(const QueryState& q)
{
  DECLARE_LOCALS;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("past_intervals");
  ps->past_intervals.dump(q.f);
  q.f->close_section();

  q.f->open_array_section("probing_osds");
  for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
       p != prior_set.probe.end();
       ++p)
    q.f->dump_stream("osd") << *p;
  q.f->close_section();

  if (prior_set.pg_down)
    q.f->dump_string("blocked", "peering is blocked due to down osds");

  q.f->open_array_section("down_osds_we_would_probe");
  for (set<int>::iterator p = prior_set.down.begin();
       p != prior_set.down.end();
       ++p)
    q.f->dump_int("osd", *p);
  q.f->close_section();

  q.f->open_array_section("peering_blocked_by");
  for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
       p != prior_set.blocked_by.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_int("osd", p->first);
    q.f->dump_int("current_lost_at", p->second);
    q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
    q.f->close_section();
  }
  q.f->close_section();

  if (history_les_bound) {
    q.f->open_array_section("peering_blocked_by_detail");
    q.f->open_object_section("item");
    q.f->dump_string("detail","peering_blocked_by_history_les_bound");
    q.f->close_section();
    q.f->close_section();
  }

  q.f->close_section();
  return forward_event();
}

void PeeringState::Peering::exit()
{
  DECLARE_LOCALS;
  psdout(10) << "Leaving Peering" << dendl;
  context< PeeringMachine >().log_exit(state_name, enter_time);
  ps->state_clear(PG_STATE_PEERING);
  pl->clear_probe_targets();

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_peering_latency, dur);
}

/*------Backfilling-------*/
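// Backfilling is entered once local and remote backfill reservations are held.
// Paths that leave backfill early (defer, unfound, revoked reservation) drop
// back to NotBackfilling; backfill_release_reservations() sends
// MBackfillReserve RELEASE to every backfill target, and cancel_backfill()
// additionally notifies the listener.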
PeeringState::Backfilling::Backfilling(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->backfill_reserved = true;
  pl->on_backfill_reserved();
  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  ps->state_set(PG_STATE_BACKFILLING);
  pl->publish_stats_to_osd();
}

void PeeringState::Backfilling::backfill_release_reservations()
{
  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();
  for (set<pg_shard_t>::iterator it = ps->backfill_targets.begin();
       it != ps->backfill_targets.end();
       ++it) {
    ceph_assert(*it != ps->pg_whoami);
    pl->send_cluster_message(
      it->osd,
      new MBackfillReserve(
        MBackfillReserve::RELEASE,
        spg_t(ps->info.pgid.pgid, it->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }
}

void PeeringState::Backfilling::cancel_backfill()
{
  DECLARE_LOCALS;
  backfill_release_reservations();
  pl->on_backfill_canceled();
}

boost::statechart::result
PeeringState::Backfilling::react(const Backfilled &c)
{
  backfill_release_reservations();
  return transit<Recovered>();
}

boost::statechart::result
PeeringState::Backfilling::react(const DeferBackfill &c)
{
  DECLARE_LOCALS;
  psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
  ps->state_set(PG_STATE_BACKFILL_WAIT);
  ps->state_clear(PG_STATE_BACKFILLING);
  // ...

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    c.delay);
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const UnfoundBackfill &c)
{
  DECLARE_LOCALS;
  psdout(10) << "backfill has unfound, can't continue" << dendl;
  ps->state_set(PG_STATE_BACKFILL_UNFOUND);
  ps->state_clear(PG_STATE_BACKFILLING);
  // ...
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILLING);
  // ...

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    ps->cct->_conf->osd_backfill_retry_interval);

  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::Backfilling::react(const RemoteReservationRevoked &)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_WAIT);
  // ...
  if (ps->needs_backfill()) {
    return transit<WaitLocalBackfillReserved>();
  } else {
    // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
    return discard_event();
  }
}

void PeeringState::Backfilling::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->backfill_reserved = false;
  ps->state_clear(PG_STATE_BACKFILLING);
  ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
}
/*--WaitRemoteBackfillReserved--*/

PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
    backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_WAIT);
  pl->publish_stats_to_osd();
  post_event(RemoteBackfillReserved());
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
{
  DECLARE_LOCALS;

  int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
  psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
  if (backfill_osd_it !=
      context< Active >().remote_shards_to_reserve_backfill.end()) {
    // The primary never backfills itself
    ceph_assert(*backfill_osd_it != ps->pg_whoami);
    pl->send_cluster_message(
      backfill_osd_it->osd,
      new MBackfillReserve(
        MBackfillReserve::REQUEST,
        spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
        ps->get_osdmap_epoch(),
        ps->get_backfill_priority(),
        num_bytes,
        ps->peer_bytes[*backfill_osd_it]),
      ps->get_osdmap_epoch());
    ++backfill_osd_it;
  } else {
    ps->peer_bytes.clear();
    post_event(AllBackfillsReserved());
  }
  return discard_event();
}

void PeeringState::WaitRemoteBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
}

void PeeringState::WaitRemoteBackfillReserved::retry()
{
  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();

  // Send CANCEL to all previously acquired reservations
  set<pg_shard_t>::const_iterator it, begin, end;
  begin = context< Active >().remote_shards_to_reserve_backfill.begin();
  end = context< Active >().remote_shards_to_reserve_backfill.end();
  ceph_assert(begin != end);
  for (it = begin; it != backfill_osd_it; ++it) {
    // The primary never backfills itself
    ceph_assert(*it != ps->pg_whoami);
    pl->send_cluster_message(
      it->osd,
      new MBackfillReserve(
        MBackfillReserve::RELEASE,
        spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }

  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  pl->publish_stats_to_osd();

  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RequestBackfill()),
    ps->cct->_conf->osd_backfill_retry_interval);
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_BACKFILL_TOOFULL);
  retry();
  return transit<NotBackfilling>();
}

boost::statechart::result
PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
{
  DECLARE_LOCALS;
  retry();
  return transit<NotBackfilling>();
}

/*--WaitLocalBackfillReserved--*/
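// Reservation ordering: the local background-io reservation is taken first
// (WaitLocalBackfillReserved), then each remote shard is reserved one at a
// time (WaitRemoteBackfillReserved above) before Backfilling can begin.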
PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_set(PG_STATE_BACKFILL_WAIT);
  pl->request_local_background_io_reservation(
    ps->get_backfill_priority(),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      LocalBackfillReserved()),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeferBackfill(0.0)));
  pl->publish_stats_to_osd();
}

void PeeringState::WaitLocalBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
}
/*----NotBackfilling------*/
PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_REPAIR);
  pl->publish_stats_to_osd();
}

boost::statechart::result
PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
{
  return discard_event();
}

boost::statechart::result
PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
{
  return discard_event();
}

void PeeringState::NotBackfilling::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
}

/*----NotRecovering------*/
PeeringState::NotRecovering::NotRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_REPAIR);
  pl->publish_stats_to_osd();
}

void PeeringState::NotRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
}
/*---RepNotRecovering----*/
PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
{
  DECLARE_LOCALS;
  ps->reject_reservation();
  post_event(RemoteReservationRejectedTooFull());
  return discard_event();
}

void PeeringState::RepNotRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
}

/*---RepWaitRecoveryReserved--*/
PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
{
  DECLARE_LOCALS;
  pl->send_cluster_message(
    ps->primary.osd,
    new MRecoveryReserve(
      MRecoveryReserve::GRANT,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return transit<RepRecovering>();
}

boost::statechart::result
PeeringState::RepWaitRecoveryReserved::react(
  const RemoteReservationCanceled &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

void PeeringState::RepWaitRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
}

/*-RepWaitBackfillReserved*/
PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
{
  context< PeeringMachine >().log_enter(state_name);
}
boost::statechart::result
PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
{
  DECLARE_LOCALS;

  if (!pl->try_reserve_recovery_space(
        evt.primary_num_bytes, evt.local_num_bytes)) {
    post_event(RejectTooFullRemoteReservation());
  } else {
    PGPeeringEventRef preempt;
    if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
      // older peers will interpret preemption as TOOFULL
      preempt = std::make_shared<PGPeeringEvent>(
        pl->get_osdmap_epoch(),
        pl->get_osdmap_epoch(),
        RemoteBackfillPreempted());
    }
    pl->request_remote_recovery_reservation(
      evt.priority,
      std::make_shared<PGPeeringEvent>(
        pl->get_osdmap_epoch(),
        pl->get_osdmap_epoch(),
        RemoteBackfillReserved()),
      preempt);
  }
  return transit<RepWaitBackfillReserved>();
}

boost::statechart::result
PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
{
  DECLARE_LOCALS;

  // fall back to a local reckoning of priority if the primary doesn't pass one
  // (pre-mimic compat)
  int prio = evt.priority ? evt.priority : ps->get_recovery_priority();

  PGPeeringEventRef preempt;
  if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
    // older peers can't handle this
    preempt = std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RemoteRecoveryPreempted());
  }

  pl->request_remote_recovery_reservation(
    prio,
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      RemoteRecoveryReserved()),
    preempt);
  return transit<RepWaitRecoveryReserved>();
}
void PeeringState::RepWaitBackfillReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
{
  DECLARE_LOCALS;

  pl->send_cluster_message(
    ps->primary.osd,
    new MBackfillReserve(
      MBackfillReserve::GRANT,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return transit<RepRecovering>();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RejectTooFullRemoteReservation &evt)
{
  DECLARE_LOCALS;
  ps->reject_reservation();
  post_event(RemoteReservationRejectedTooFull());
  return discard_event();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RemoteReservationRejectedTooFull &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

boost::statechart::result
PeeringState::RepWaitBackfillReserved::react(
  const RemoteReservationCanceled &evt)
{
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  return transit<RepNotRecovering>();
}

/*---RepRecovering-------*/
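// RepRecovering: replica-side state while the primary holds our reservation.
// Preemption or a TOOFULL condition hands the grant back to the primary with
// a REVOKE / REVOKE_TOOFULL reservation message, and exit() always returns
// the reserved recovery space and the remote reservation slot.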
PeeringState::RepRecovering::RepRecovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result
PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    new MRecoveryReserve(
      MRecoveryReserve::REVOKE,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

boost::statechart::result
PeeringState::RepRecovering::react(const BackfillTooFull &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REVOKE_TOOFULL,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

boost::statechart::result
PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
{
  DECLARE_LOCALS;

  pl->unreserve_recovery_space();
  pl->send_cluster_message(
    ps->primary.osd,
    new MBackfillReserve(
      MBackfillReserve::REVOKE,
      spg_t(ps->info.pgid.pgid, ps->primary.shard),
      ps->get_osdmap_epoch()),
    ps->get_osdmap_epoch());
  return discard_event();
}

void PeeringState::RepRecovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
}
/*------Activating--------*/
PeeringState::Activating::Activating(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
{
  context< PeeringMachine >().log_enter(state_name);
}

void PeeringState::Activating::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_activating_latency, dur);
}
PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  // Make sure all nodes that are part of the recovery aren't full
  if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
      ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
    post_event(RecoveryTooFull());
    return;
  }

  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  ps->state_set(PG_STATE_RECOVERY_WAIT);
  pl->request_local_background_io_reservation(
    ps->get_recovery_priority(),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      LocalRecoveryReserved()),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeferRecovery(0.0)));
  pl->publish_stats_to_osd();
}

boost::statechart::result
PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
{
  DECLARE_LOCALS;
  ps->state_set(PG_STATE_RECOVERY_TOOFULL);
  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DoRecovery()),
    ps->cct->_conf->osd_recovery_retry_interval);
  return transit<NotRecovering>();
}

void PeeringState::WaitLocalRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
}
PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
    remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
{
  context< PeeringMachine >().log_enter(state_name);
  post_event(RemoteRecoveryReserved());
}

boost::statechart::result
PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
  DECLARE_LOCALS;

  if (remote_recovery_reservation_it !=
      context< Active >().remote_shards_to_reserve_recovery.end()) {
    ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
    pl->send_cluster_message(
      remote_recovery_reservation_it->osd,
      new MRecoveryReserve(
        MRecoveryReserve::REQUEST,
        spg_t(context< PeeringMachine >().spgid.pgid,
              remote_recovery_reservation_it->shard),
        ps->get_osdmap_epoch(),
        ps->get_recovery_priority()),
      ps->get_osdmap_epoch());
    ++remote_recovery_reservation_it;
  } else {
    post_event(AllRemotesReserved());
  }
  return discard_event();
}

void PeeringState::WaitRemoteRecoveryReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
}
PeeringState::Recovering::Recovering(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_RECOVERY_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  ps->state_set(PG_STATE_RECOVERING);
  pl->on_recovery_reserved();
  ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
  pl->publish_stats_to_osd();
}

void PeeringState::Recovering::release_reservations(bool cancel)
{
  DECLARE_LOCALS;
  ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());

  // release remote reservations
  for (set<pg_shard_t>::const_iterator i =
         context< Active >().remote_shards_to_reserve_recovery.begin();
       i != context< Active >().remote_shards_to_reserve_recovery.end();
       ++i) {
    if (*i == ps->pg_whoami) // skip myself
      continue;
    pl->send_cluster_message(
      i->osd,
      new MRecoveryReserve(
        MRecoveryReserve::RELEASE,
        spg_t(ps->info.pgid.pgid, i->shard),
        ps->get_osdmap_epoch()),
      ps->get_osdmap_epoch());
  }
}

boost::statechart::result
PeeringState::Recovering::react(const AllReplicasRecovered &evt)
{
  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_FORCED_RECOVERY);
  release_reservations();
  pl->cancel_local_background_io_reservation();
  return transit<Recovered>();
}

boost::statechart::result
PeeringState::Recovering::react(const RequestBackfill &evt)
{
  DECLARE_LOCALS;

  release_reservations();

  ps->state_clear(PG_STATE_FORCED_RECOVERY);
  pl->cancel_local_background_io_reservation();
  pl->publish_stats_to_osd();
  // transit any async_recovery_targets back into acting
  // so pg won't have to stay undersized for long
  // as backfill might take a long time to complete..
  if (!ps->async_recovery_targets.empty()) {
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    ps->choose_acting(auth_log_shard, true, &history_les_bound);
  }
  return transit<WaitLocalBackfillReserved>();
}

boost::statechart::result
PeeringState::Recovering::react(const DeferRecovery &evt)
{
  DECLARE_LOCALS;
  if (!ps->state_test(PG_STATE_RECOVERING)) {
    // we may have finished recovery and have an AllReplicasRecovered
    // event queued to move us to the next state.
    psdout(10) << "got defer recovery but not recovering" << dendl;
    return discard_event();
  }
  psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
  ps->state_set(PG_STATE_RECOVERY_WAIT);
  pl->cancel_local_background_io_reservation();
  release_reservations(true);
  pl->schedule_event_after(
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DoRecovery()),
    evt.delay);
  return transit<NotRecovering>();
}

boost::statechart::result
PeeringState::Recovering::react(const UnfoundRecovery &evt)
{
  DECLARE_LOCALS;
  psdout(10) << "recovery has unfound, can't continue" << dendl;
  ps->state_set(PG_STATE_RECOVERY_UNFOUND);
  pl->cancel_local_background_io_reservation();
  release_reservations(true);
  return transit<NotRecovering>();
}

void PeeringState::Recovering::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  ps->state_clear(PG_STATE_RECOVERING);
  pl->get_peering_perf().tinc(rs_recovering_latency, dur);
}
PeeringState::Recovered::Recovered(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
{
  pg_shard_t auth_log_shard;

  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ceph_assert(!ps->needs_recovery());

  // if we finished backfill, all acting are active; recheck if
  // DEGRADED | UNDERSIZED is appropriate.
  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
      ps->acting_recovery_backfill.size()) {
    ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    pl->publish_stats_to_osd();
  }

  // adjust acting set?  (e.g. because backfill completed...)
  bool history_les_bound = false;
  if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
                                                 true, &history_les_bound)) {
    ceph_assert(ps->want_acting.size());
  } else if (!ps->async_recovery_targets.empty()) {
    ps->choose_acting(auth_log_shard, true, &history_les_bound);
  }

  if (context< Active >().all_replicas_activated &&
      ps->async_recovery_targets.empty())
    post_event(GoClean());
}

void PeeringState::Recovered::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;

  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_recovered_latency, dur);
}
PeeringState::Clean::Clean(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  if (ps->info.last_complete != ps->info.last_update) {
    ceph_abort();
  }

  ps->try_mark_clean();

  context< PeeringMachine >().get_cur_transaction().register_on_commit(
    pl->on_clean());
}

void PeeringState::Clean::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  ps->state_clear(PG_STATE_CLEAN);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_clean_latency, dur);
}

template <typename T>
set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
{
  set<int> osds_found;
  set<pg_shard_t> out;
  for (typename T::const_iterator i = in.begin();
       i != in.end();
       ++i) {
    if (*i != skip && !osds_found.count(i->osd)) {
      osds_found.insert(i->osd);
      out.insert(*i);
    }
  }
  return out;
}

/*---------Active---------*/
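// Active is the primary's main state.  The remote_shards_to_reserve_recovery /
// _backfill sets are deduplicated per OSD with unique_osd_shard_set() above,
// and blocked_by holds every acting_recovery_backfill shard until that peer
// acks activation (see the ActivateCommitted / MInfoRec handling further down).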
PeeringState::Active::Active(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
    remote_shards_to_reserve_recovery(
      unique_osd_shard_set(
        context< PeeringMachine >().state->pg_whoami,
        context< PeeringMachine >().state->acting_recovery_backfill)),
    remote_shards_to_reserve_backfill(
      unique_osd_shard_set(
        context< PeeringMachine >().state->pg_whoami,
        context< PeeringMachine >().state->backfill_targets)),
    all_replicas_activated(false)
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;

  ceph_assert(!ps->backfill_reserved);
  ceph_assert(ps->is_primary());
  psdout(10) << "In Active, about to call activate" << dendl;
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  ps->activate(context< PeeringMachine >().get_cur_transaction(),
               ps->get_osdmap_epoch(),
               context< PeeringMachine >().get_recovery_ctx());

  // everyone has to commit/ack before we are truly active
  ps->blocked_by.clear();
  for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p) {
    if (p->shard != ps->pg_whoami.shard) {
      ps->blocked_by.insert(p->shard);
    }
  }
  pl->publish_stats_to_osd();
  psdout(10) << "Activate Finished" << dendl;
}
boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;

  if (ps->should_restart_peering(
        /* ... */
        advmap.acting_primary,
        /* ... */)) {
    psdout(10) << "Active advmap interval change, fast return" << dendl;
    return forward_event();
  }
  psdout(10) << "Active advmap" << dendl;
  bool need_publish = false;

  pl->on_active_advmap(advmap.osdmap);
  if (ps->dirty_big_info) {
    // share updated purged_snaps to mgr/mon so that we (a) stop reporting
    // purged snaps and (b) perhaps share more snaps that we have purged
    // but didn't fit in pg_stat_t.
    need_publish = true;
    ps->share_pg_info();
  }

  for (size_t i = 0; i < ps->want_acting.size(); i++) {
    int osd = ps->want_acting[i];
    if (!advmap.osdmap->is_up(osd)) {
      pg_shard_t osd_with_shard(osd, shard_id_t(i));
      ceph_assert(ps->is_acting(osd_with_shard) || ps->is_up(osd_with_shard));
    }
  }

  /* Check for changes in pool size (if the acting set changed as a result,
   * this does not matter) */
  if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
      ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
    if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
        ps->actingset.size()) {
      ps->state_clear(PG_STATE_UNDERSIZED);
    } else {
      ps->state_set(PG_STATE_UNDERSIZED);
    }
    // degraded changes will be detected by call from publish_stats_to_osd()
    need_publish = true;
  }

  // if we haven't reported our PG stats in a long time, do so now.
  if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
    psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
               << " epochs" << dendl;
    need_publish = true;
  }

  if (need_publish)
    pl->publish_stats_to_osd();

  if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
    pl->recheck_readable();
  }

  return forward_event();
}
boost::statechart::result PeeringState::Active::react(const ActMap&)
{
  DECLARE_LOCALS;
  psdout(10) << "Active: handling ActMap" << dendl;
  ceph_assert(ps->is_primary());

  pl->on_active_actmap();

  if (ps->have_unfound()) {
    // object may have become unfound
    ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
  }

  uint64_t unfound = ps->missing_loc.num_unfound();
  if (unfound > 0 &&
      ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
    if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
      pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
                           << " objects unfound and apparently lost, would automatically "
                           << "mark these objects lost but this feature is not yet implemented "
                           << "(osd_auto_mark_unfound_lost)";
    } else
      pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
                           << unfound << " objects unfound and apparently lost";
  }

  return forward_event();
}

boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());
  if (ps->peer_info.count(notevt.from)) {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", already have info from that osd, ignoring"
               << dendl;
  } else if (ps->peer_purged.count(notevt.from)) {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", already purged that peer, ignoring"
               << dendl;
  } else {
    psdout(10) << "Active: got notify from " << notevt.from
               << ", calling proc_replica_info and discover_all_missing"
               << dendl;
    ps->proc_replica_info(
      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
    if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
      ps->discover_all_missing(
        context<PeeringMachine>().get_recovery_ctx().msgs);
    }
    // check if it is a previous down acting member that's coming back.
    // if so, request pg_temp change to trigger a new interval transition
    pg_shard_t auth_log_shard;
    bool history_les_bound = false;
    ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
    if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
      psdout(10) << "Active: got notify from previous acting member "
                 << notevt.from << ", requesting pg_temp change"
                 << dendl;
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const MTrim& trim)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());

  // peer is informing us of their last_complete_ondisk
  ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
  ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
                                       trim.trim_to);
  // trim log when the pg is recovered
  ps->calc_min_last_complete_ondisk();
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  ceph_assert(ps->is_primary());

  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (infoevt.lease_ack) {
    ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
  }
  // don't update history (yet) if we are active and primary; the replica
  // may be telling us they have activated (and committed) but we can't
  // share that until _everyone_ does the same.
  if (ps->is_acting_recovery_backfill(infoevt.from) &&
      ps->peer_activated.count(infoevt.from) == 0) {
    psdout(10) << " peer osd." << infoevt.from
               << " activated and committed" << dendl;
    ps->peer_activated.insert(infoevt.from);
    ps->blocked_by.erase(infoevt.from.shard);
    pl->publish_stats_to_osd();
    if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
      all_activated_and_committed();
    }
  }
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "searching osd." << logevt.from
             << " log for unfound items" << dendl;
  ps->proc_replica_log(
    logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
  bool got_missing = ps->search_for_missing(
    ps->peer_info[logevt.from],
    ps->peer_missing[logevt.from],
    logevt.from,
    context< PeeringMachine >().get_recovery_ctx());
  // If there are missing AND we are "fully" active then start recovery now
  if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
    post_event(DoRecovery());
  }
  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const QueryState& q)
{
  DECLARE_LOCALS;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("might_have_unfound");
  for (set<pg_shard_t>::iterator p = ps->might_have_unfound.begin();
       p != ps->might_have_unfound.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_missing.count(*p)) {
      q.f->dump_string("status", "already probed");
    } else if (ps->peer_missing_requested.count(*p)) {
      q.f->dump_string("status", "querying");
    } else if (!ps->get_osdmap()->is_up(p->osd)) {
      q.f->dump_string("status", "osd is down");
    } else {
      q.f->dump_string("status", "not queried");
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->open_object_section("recovery_progress");
  q.f->open_array_section("backfill_targets");
  for (set<pg_shard_t>::const_iterator p = ps->backfill_targets.begin();
       p != ps->backfill_targets.end(); ++p)
    q.f->dump_stream("replica") << *p;
  q.f->close_section();
  pl->dump_recovery_info(q.f);
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}
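// ActivateCommitted on the primary: our own activate transaction has
// committed, so count ourselves in peer_activated alongside the
// replicas that report in via MInfoRec.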
boost::statechart::result PeeringState::Active::react(
  const ActivateCommitted &evt)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
  ps->peer_activated.insert(ps->pg_whoami);
  psdout(10) << "_activate_committed " << evt.epoch
             << " peer_activated now " << ps->peer_activated
             << " last_interval_started "
             << ps->info.history.last_interval_started
             << " last_epoch_started "
             << ps->info.history.last_epoch_started
             << " same_interval_since "
             << ps->info.history.same_interval_since
             << dendl;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
    all_activated_and_committed();
  return discard_event();
}
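// AllReplicasActivated: every member of acting_recovery_backfill has
// activated and committed.  The PG becomes ACTIVE (or only PEERED when
// below min_size or pending a merge) and may sit in WAIT until the
// prior interval's readable_until upper bound has passed.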
boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
{
  DECLARE_LOCALS;
  pg_t pgid = context< PeeringMachine >().spgid.pgid;

  all_replicas_activated = true;

  ps->state_clear(PG_STATE_ACTIVATING);
  ps->state_clear(PG_STATE_CREATING);
  ps->state_clear(PG_STATE_PREMERGE);

  bool merge_target;
  if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
    ps->state_set(PG_STATE_PEERED);
    ps->state_set(PG_STATE_PREMERGE);

    if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
      if (merge_target) {
        pg_t src = pgid;
        src.set_ps(ps->pool.info.get_pg_num_pending());
        assert(src.get_parent() == pgid);
        pl->set_not_ready_to_merge_target(pgid, src);
      } else {
        pl->set_not_ready_to_merge_source(pgid);
      }
    }
  } else if (ps->acting.size() < ps->pool.info.min_size) {
    ps->state_set(PG_STATE_PEERED);
  } else {
    ps->state_set(PG_STATE_ACTIVE);
  }

  auto mnow = pl->get_mnow();
  if (ps->prior_readable_until_ub > mnow) {
    psdout(10) << " waiting for prior_readable_until_ub "
               << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
    ps->state_set(PG_STATE_WAIT);
    pl->queue_check_readable(
      ps->last_peering_reset,
      ps->prior_readable_until_ub - mnow);
  } else {
    psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
               << ps->prior_readable_until_ub << dendl;
  }

  if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
    pl->send_pg_created(pgid);
  }

  ps->info.history.last_epoch_started = ps->info.last_epoch_started;
  ps->info.history.last_interval_started = ps->info.last_interval_started;
  ps->dirty_info = true;

  ps->share_pg_info();
  pl->publish_stats_to_osd();

  pl->on_activate_complete();

  return discard_event();
}
boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
{
  DECLARE_LOCALS;
  ps->proc_renew_lease();
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
{
  DECLARE_LOCALS;
  ps->proc_lease_ack(la.from, la.lease_ack);
  return discard_event();
}

boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
{
  DECLARE_LOCALS;
  pl->recheck_readable();
  return discard_event();
}
/*
 * update info.history.last_epoch_started ONLY after we and all
 * replicas have activated AND committed the activate transaction
 * (i.e. the peering results are stable on disk).
 */
void PeeringState::Active::all_activated_and_committed()
{
  DECLARE_LOCALS;
  psdout(10) << "all_activated_and_committed" << dendl;
  ceph_assert(ps->is_primary());
  ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
  ceph_assert(!ps->acting_recovery_backfill.empty());
  ceph_assert(ps->blocked_by.empty());

  if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
    // this is overkill when the activation is quick, but when it is slow it
    // is important, because the lease was renewed by the activate itself but we
    // don't know how long ago that was, and simply scheduling now may leave
    // a gap in lease coverage.  keep it simple and aggressively renew.
    ps->renew_lease(pl->get_mnow());
    ps->schedule_renew_lease();
  }

  ps->update_calc_stats();
  if (ps->info.stats.stats.sum.num_objects_degraded) {
    ps->state_set(PG_STATE_DEGRADED);
  } else {
    ps->state_clear(PG_STATE_DEGRADED);
  }

  post_event(PeeringState::AllReplicasActivated());
}
void PeeringState::Active::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  pl->cancel_local_background_io_reservation();

  ps->blocked_by.clear();
  ps->backfill_reserved = false;
  ps->state_clear(PG_STATE_ACTIVATING);
  ps->state_clear(PG_STATE_DEGRADED);
  ps->state_clear(PG_STATE_UNDERSIZED);
  ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
  ps->state_clear(PG_STATE_BACKFILL_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_WAIT);
  ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_active_latency, dur);
  pl->on_active_exit();
}
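// The states below mirror Active for non-primary shards: a replica
// activates on the primary's request, acknowledges leases, merges and
// trims its log as instructed, and re-notifies the primary on ActMap.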
/*------ReplicaActive-----*/
PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
}

boost::statechart::result PeeringState::ReplicaActive::react(
  const Activate& actevt) {
  DECLARE_LOCALS;
  psdout(10) << "In ReplicaActive, about to call activate" << dendl;
  ps->activate(
    context< PeeringMachine >().get_cur_transaction(),
    actevt.activation_epoch,
    context< PeeringMachine >().get_recovery_ctx());
  psdout(10) << "Activate Finished" << dendl;
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(
  const ActivateCommitted &evt)
{
  DECLARE_LOCALS;
  psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;

  auto &rctx = context<PeeringMachine>().get_recovery_ctx();
  auto epoch = ps->get_osdmap_epoch();
  pg_info_t i = ps->info;
  i.history.last_epoch_started = evt.activation_epoch;
  i.history.last_interval_started = i.history.same_interval_since;
  rctx.send_info(
    ps->get_primary().osd,
    spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
    epoch,
    epoch,
    i,
    {}, /* lease */
    ps->get_lease_ack());

  if (ps->acting.size() >= ps->pool.info.min_size) {
    ps->state_set(PG_STATE_ACTIVE);
  } else {
    ps->state_set(PG_STATE_PEERED);
  }
  pl->on_activate_committed();
  return discard_event();
}
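// Lease exchange on the replica: proc_lease() extends the local
// readable interval and an MOSDPGLeaseAck is sent back so the primary
// can account for this replica when computing readable_until.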
boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
{
  DECLARE_LOCALS;
  spg_t spgid = context< PeeringMachine >().spgid;
  epoch_t epoch = pl->get_osdmap_epoch();

  ps->proc_lease(l.lease);
  pl->send_cluster_message(
    ps->get_primary().osd,
    new MOSDPGLeaseAck(epoch,
                       spg_t(spgid.pgid, ps->get_primary().shard),
                       ps->get_lease_ack()),
    epoch);
  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
                        infoevt.info);
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "received log from " << logevt.from << dendl;
  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
  ps->merge_log(t, logevt.msg->info, logevt.msg->log, logevt.from);
  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
  if (logevt.msg->lease) {
    ps->proc_lease(*logevt.msg->lease);
  }

  return discard_event();
}

boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
{
  DECLARE_LOCALS;
  // primary is instructing us to trim
  ps->pg_log.trim(trim.trim_to, ps->info);
  ps->dirty_info = true;
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(), ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(
  const MQuery& query)
{
  DECLARE_LOCALS;
  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
  return discard_event();
}
boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return forward_event();
}
void PeeringState::ReplicaActive::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  pl->unreserve_recovery_space();

  pl->cancel_remote_recovery_reservation();
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);

  ps->min_last_complete_ondisk = eversion_t();
}
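/*------Stray------*/
// A Stray peer holds data for this PG but is not part of the acting
// set: it answers queries and waits for the primary either to activate
// it (via MLogRec/MInfoRec, transitioning to ReplicaActive) or to
// remove it; if the pool is gone it starts deleting immediately.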
PeeringState::Stray::Stray(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Stray")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ceph_assert(!ps->is_peered());
  ceph_assert(!ps->is_peering());
  ceph_assert(!ps->is_primary());

  if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
    ldout(ps->cct, 10) << __func__ << " pool is deleted" << dendl;
    post_event(DeleteStart());
  } else {
    ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  }
}
boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  MOSDPGLog *msg = logevt.msg.get();
  psdout(10) << "got info+log from osd." << logevt.from
             << " " << msg->info << " " << msg->log << dendl;

  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
  if (msg->info.last_backfill == hobject_t()) {
    ps->info = msg->info;
    pl->on_info_history_change();
    ps->dirty_info = true;
    ps->dirty_big_info = true;  // maybe.

    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());

    ps->pg_log.reset_backfill();
  } else {
    ps->merge_log(t, msg->info, msg->log, logevt.from);
  }
  if (logevt.msg->lease) {
    ps->proc_lease(*logevt.msg->lease);
  }

  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);

  post_event(Activate(logevt.msg->info.last_epoch_started));
  return transit<ReplicaActive>();
}
boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
{
  DECLARE_LOCALS;
  psdout(10) << "got info from osd." << infoevt.from
             << " " << infoevt.info << dendl;

  if (ps->info.last_update > infoevt.info.last_update) {
    // rewind divergent log entries
    ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
    ps->rewind_divergent_log(t, infoevt.info.last_update);
    ps->info.stats = infoevt.info.stats;
    ps->info.hit_set = infoevt.info.hit_set;
  }

  if (infoevt.lease) {
    ps->proc_lease(*infoevt.lease);
  }

  ceph_assert(infoevt.info.last_update == ps->info.last_update);
  ceph_assert(ps->pg_log.get_head() == ps->info.last_update);

  post_event(Activate(infoevt.info.last_epoch_started));
  return transit<ReplicaActive>();
}
boost::statechart::result PeeringState::Stray::react(const MQuery& query)
{
  DECLARE_LOCALS;
  ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
  return discard_event();
}

boost::statechart::result PeeringState::Stray::react(const ActMap&)
{
  DECLARE_LOCALS;
  if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
    ps->info.history.refresh_prior_readable_until_ub(
      pl->get_mnow(), ps->prior_readable_until_ub);
    context< PeeringMachine >().send_notify(
      ps->get_primary().osd,
      pg_notify_t(
        ps->get_primary().shard, ps->pg_whoami.shard,
        ps->get_osdmap_epoch(),
        ps->get_osdmap_epoch(),
        ps->info,
        ps->past_intervals));
  }
  return discard_event();
}
void PeeringState::Stray::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_stray_latency, dur);
}
/*--------ToDelete----------*/
PeeringState::ToDelete::ToDelete(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  pl->get_perf_logger().inc(l_osd_pg_removing);
}

void PeeringState::ToDelete::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  // note: on a successful removal, this path doesn't execute. see
  pl->get_perf_logger().dec(l_osd_pg_removing);

  pl->cancel_local_background_io_reservation();
}
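// PG deletion is throttled through the local background io reservation:
// WaitDeleteReserved queues at the current delete priority and, once
// granted, Deleting removes objects in DeleteSome-sized chunks.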
/*----WaitDeleteReserved----*/
PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history,
               "Started/ToDelete/WaitDeleteReseved")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  context< ToDelete >().priority = ps->get_delete_priority();

  pl->cancel_local_background_io_reservation();
  pl->request_local_background_io_reservation(
    context<ToDelete>().priority,
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeleteReserved()),
    std::make_shared<PGPeeringEvent>(
      ps->get_osdmap_epoch(),
      ps->get_osdmap_epoch(),
      DeleteInterrupted()));
}
boost::statechart::result PeeringState::ToDelete::react(
  const ActMap& evt)
{
  DECLARE_LOCALS;
  if (ps->get_delete_priority() != priority) {
    psdout(10) << __func__ << " delete priority changed, resetting"
               << dendl;
    return transit<ToDelete>();
  }
  return discard_event();
}

void PeeringState::WaitDeleteReserved::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
}
/*----Deleting-----*/
PeeringState::Deleting::Deleting(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;
  ps->deleting = true;
  ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();

  PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
  ps->pg_log.roll_forward(rollbacker.get());

  // adjust info to backfill
  ps->info.set_last_backfill(hobject_t());
  ps->pg_log.reset_backfill();
  ps->dirty_info = true;
}

boost::statechart::result PeeringState::Deleting::react(
  const DeleteSome& evt)
{
  DECLARE_LOCALS;
  pl->do_delete_work(context<PeeringMachine>().get_cur_transaction());
  return discard_event();
}

void PeeringState::Deleting::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  ps->deleting = false;
  pl->cancel_local_background_io_reservation();
}
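// Primary peering proceeds GetInfo -> GetLog -> GetMissing: gather
// pg_info_t from every prior-set OSD, fetch the authoritative log, then
// collect missing sets from the remaining acting/backfill peers.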
/*--------GetInfo---------*/
PeeringState::GetInfo::GetInfo(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->check_past_interval_bounds();
  ps->log_weirdness();
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  ceph_assert(ps->blocked_by.empty());

  prior_set = ps->build_prior();
  ps->prior_readable_down_osds = prior_set.down;
  if (ps->prior_readable_down_osds.empty()) {
    psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
               << dendl;
    ps->clear_prior_readable_until_ub();
  }

  ps->reset_min_peer_features();
  get_infos();
  if (prior_set.pg_down) {
    post_event(IsDown());
  } else if (peer_info_requested.empty()) {
    post_event(GotInfo());
  }
}
void PeeringState::GetInfo::get_infos()
{
  DECLARE_LOCALS;
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  ps->blocked_by.clear();
  for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
       it != prior_set.probe.end();
       ++it) {
    pg_shard_t peer = *it;
    if (peer == ps->pg_whoami) {
      continue;
    }
    if (ps->peer_info.count(peer)) {
      psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
      continue;
    }
    if (peer_info_requested.count(peer)) {
      psdout(10) << " already requested info from osd." << peer << dendl;
      ps->blocked_by.insert(peer.osd);
    } else if (!ps->get_osdmap()->is_up(peer.osd)) {
      psdout(10) << " not querying info from down osd." << peer << dendl;
    } else {
      psdout(10) << " querying info from osd." << peer << dendl;
      context< PeeringMachine >().send_query(
        peer.osd,
        pg_query_t(pg_query_t::INFO,
                   it->shard, ps->pg_whoami.shard,
                   ps->info.history,
                   ps->get_osdmap_epoch()));
      peer_info_requested.insert(peer);
      ps->blocked_by.insert(peer.osd);
    }
  }

  ps->check_prior_readable_down_osds(ps->get_osdmap());

  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
{
  DECLARE_LOCALS;

  set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
  if (p != peer_info_requested.end()) {
    peer_info_requested.erase(p);
    ps->blocked_by.erase(infoevt.from.osd);
  }

  epoch_t old_start = ps->info.history.last_epoch_started;
  if (ps->proc_replica_info(
        infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
    // we got something new ...
    PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
    if (old_start < ps->info.history.last_epoch_started) {
      psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
      prior_set = ps->build_prior();
      ps->prior_readable_down_osds = prior_set.down;

      // filter out any osds that got dropped from the probe set from
      // peer_info_requested.  this is less expensive than restarting
      // peering (which would re-probe everyone).
      set<pg_shard_t>::iterator p = peer_info_requested.begin();
      while (p != peer_info_requested.end()) {
        if (prior_set.probe.count(*p) == 0) {
          psdout(20) << " dropping osd." << *p
                     << " from info_requested, no longer in probe set" << dendl;
          peer_info_requested.erase(p++);
        } else {
          ++p;
        }
      }
      get_infos();
    }
    psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
               << hex << infoevt.features << dec << dendl;
    ps->apply_peer_features(infoevt.features);

    // are we done getting everything?
    if (peer_info_requested.empty() && !prior_set.pg_down) {
      psdout(20) << "Common peer features: "
                 << hex << ps->get_min_peer_features() << dec << dendl;
      psdout(20) << "Common acting features: "
                 << hex << ps->get_min_acting_features() << dec << dendl;
      psdout(20) << "Common upacting features: "
                 << hex << ps->get_min_upacting_features() << dec << dendl;
      post_event(GotInfo());
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
{
  DECLARE_LOCALS;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("requested_info_from");
  for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
       p != peer_info_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_info.count(*p)) {
      q.f->open_object_section("got_info");
      ps->peer_info[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PeeringState::GetInfo::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
  ps->blocked_by.clear();
}
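// GetLog: choose_acting() selects auth_log_shard as the owner of the
// authoritative log; if that is not this OSD, request enough of its log
// (back to the oldest last_update we must reconcile) before proceeding.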
/*------GetLog------------*/
PeeringState::GetLog::GetLog(my_context ctx)
  : my_base(ctx),
    NamedState(
      context< PeeringMachine >().state_history,
      "Started/Primary/Peering/GetLog"),
    msg(0)
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->log_weirdness();

  // adjust acting?
  if (!ps->choose_acting(auth_log_shard, false,
                         &context< Peering >().history_les_bound)) {
    if (!ps->want_acting.empty()) {
      post_event(NeedActingChange());
    } else {
      post_event(IsIncomplete());
    }
    return;
  }

  // am i the best?
  if (auth_log_shard == ps->pg_whoami) {
    post_event(GotLog());
    return;
  }

  const pg_info_t& best = ps->peer_info[auth_log_shard];

  // am i broken?
  if (ps->info.last_update < best.log_tail) {
    psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
    post_event(IsIncomplete());
    return;
  }

  // how much log to request?
  eversion_t request_log_from = ps->info.last_update;
  ceph_assert(!ps->acting_recovery_backfill.empty());
  for (set<pg_shard_t>::iterator p = ps->acting_recovery_backfill.begin();
       p != ps->acting_recovery_backfill.end();
       ++p) {
    if (*p == ps->pg_whoami) continue;
    pg_info_t& ri = ps->peer_info[*p];
    if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
        ri.last_update < request_log_from)
      request_log_from = ri.last_update;
  }

  psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
  context<PeeringMachine>().send_query(
    auth_log_shard.osd,
    pg_query_t(
      pg_query_t::LOG,
      auth_log_shard.shard, ps->pg_whoami.shard,
      request_log_from, ps->info.history,
      ps->get_osdmap_epoch()));

  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(auth_log_shard.osd);
  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  // make sure our log source didn't go down.  we need to check
  // explicitly because it may not be part of the prior set, which
  // means the Peering state check won't catch it going down.
  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
    psdout(10) << "GetLog: auth_log_shard osd."
               << auth_log_shard.osd << " went down" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  // let the Peering state do its checks.
  return forward_event();
}
boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
{
  if (logevt.from != auth_log_shard) {
    psdout(10) << "GetLog: discarding log from "
               << "non-auth_log_shard osd." << logevt.from << dendl;
    return discard_event();
  }
  psdout(10) << "GetLog: received master log from osd."
             << logevt.from << dendl;
  msg = logevt.msg;
  post_event(GotLog());
  return discard_event();
}

boost::statechart::result PeeringState::GetLog::react(const GotLog&)
{
  DECLARE_LOCALS;
  psdout(10) << "leaving GetLog" << dendl;
  if (msg) {
    psdout(10) << "processing master log" << dendl;
    ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
                        msg->info, msg->log, msg->missing,
                        auth_log_shard);
  }
  ps->start_flush(context< PeeringMachine >().get_cur_transaction());
  return transit< GetMissing >();
}
boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_stream("auth_log_shard") << auth_log_shard;
  q.f->close_section();
  return forward_event();
}

void PeeringState::GetLog::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getlog_latency, dur);
  ps->blocked_by.clear();
}
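// WaitActingChange: a pg_temp change has been requested, so wait for an
// osdmap that reflects it and ignore stray log/info/notify messages in
// the meantime; if a wanted target goes down, reset and re-peer.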
/*------WaitActingChange--------*/
PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
{
  context< PeeringMachine >().log_enter(state_name);
}
boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
{
  DECLARE_LOCALS;
  OSDMapRef osdmap = advmap.osdmap;

  psdout(10) << "verifying no want_acting " << ps->want_acting
             << " targets didn't go down" << dendl;
  for (vector<int>::iterator p = ps->want_acting.begin();
       p != ps->want_acting.end(); ++p) {
    if (!osdmap->is_up(*p)) {
      psdout(10) << " want_acting target osd." << *p
                 << " went down, resetting" << dendl;
      post_event(advmap);
      return transit< Reset >();
    }
  }
  return forward_event();
}
boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
{
  psdout(10) << "In WaitActingChange, ignoring MLogRec" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
{
  psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
  return discard_event();
}

boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
{
  psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
  return discard_event();
}
boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for pg acting set to change");
  q.f->close_section();
  return forward_event();
}

void PeeringState::WaitActingChange::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
}
/*------Down--------*/
PeeringState::Down::Down(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_PEERING);
  ps->state_set(PG_STATE_DOWN);

  auto &prior_set = context< Peering >().prior_set;
  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pl->publish_stats_to_osd();
}

void PeeringState::Down::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_DOWN);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_down_latency, dur);

  ps->blocked_by.clear();
}
boost::statechart::result PeeringState::Down::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment",
                   "not enough up instances of this PG to go active");
  q.f->close_section();
  return forward_event();
}
boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
{
  DECLARE_LOCALS;

  ceph_assert(ps->is_primary());
  epoch_t old_start = ps->info.history.last_epoch_started;
  if (!ps->peer_info.count(infoevt.from) &&
      ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
    ps->update_history(infoevt.notify.info.history);
  }
  // if we got something new to make pg escape down state
  if (ps->info.history.last_epoch_started > old_start) {
    psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
    ps->state_clear(PG_STATE_DOWN);
    ps->state_set(PG_STATE_PEERING);
    return transit< GetInfo >();
  }

  return discard_event();
}
/*------Incomplete--------*/
PeeringState::Incomplete::Incomplete(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
{
  context< PeeringMachine >().log_enter(state_name);
  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_PEERING);
  ps->state_set(PG_STATE_INCOMPLETE);

  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
  ceph_assert(ps->blocked_by.empty());
  ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pl->publish_stats_to_osd();
}
boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
  DECLARE_LOCALS;
  int64_t poolnum = ps->info.pgid.pool();

  // Reset if min_size has become smaller than it was in the previous map;
  // the pg might now be able to go active.
  if (!advmap.osdmap->have_pg_pool(poolnum) ||
      advmap.lastmap->get_pools().find(poolnum)->second.min_size >
      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
    post_event(advmap);
    return transit< Reset >();
  }

  return forward_event();
}

boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
  DECLARE_LOCALS;
  psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
  if (ps->proc_replica_info(
        notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
    // We got something new, try again!
    return transit< GetLog >();
  } else {
    return discard_event();
  }
}
boost::statechart::result PeeringState::Incomplete::react(
  const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "not enough complete instances of this PG");
  q.f->close_section();
  return forward_event();
}

void PeeringState::Incomplete::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;

  ps->state_clear(PG_STATE_INCOMPLETE);
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_incomplete_latency, dur);

  ps->blocked_by.clear();
}
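// GetMissing: with the authoritative log in place, ask each remaining
// acting/backfill peer whose log may diverge for its log+missing (or
// full log), so recovery knows exactly what every shard lacks.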
/*------GetMissing--------*/
PeeringState::GetMissing::GetMissing(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
{
  context< PeeringMachine >().log_enter(state_name);

  DECLARE_LOCALS;
  ps->log_weirdness();
  ceph_assert(!ps->acting_recovery_backfill.empty());
  eversion_t since;
  for (set<pg_shard_t>::iterator i = ps->acting_recovery_backfill.begin();
       i != ps->acting_recovery_backfill.end();
       ++i) {
    if (*i == ps->get_primary()) continue;
    const pg_info_t& pi = ps->peer_info[*i];
    // reset this so to make sure the pg_missing_t is initialized and
    // has the correct semantics even if we don't need to get a
    // missing set from a shard. This way later additions due to
    // lost+unfound delete work properly.
    ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();

    if (pi.is_empty())
      continue;                                // no pg data, nothing divergent

    if (pi.last_update < ps->pg_log.get_tail()) {
      psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }
    if (pi.last_backfill == hobject_t()) {
      psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }

    if (pi.last_update == pi.last_complete &&  // peer has no missing
        pi.last_update == ps->info.last_update) {  // peer is up to date
      // replica has no missing and identical log as us.  no need to
      // pull anything.
      // FIXME: we can do better here.  if last_update==last_complete we
      //        can infer the rest!
      psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
      ps->peer_missing[*i].clear();
      continue;
    }

    // We pull the log from the peer's last_epoch_started to ensure we
    // get enough log to detect divergent updates.
    since.epoch = pi.last_epoch_started;
    ceph_assert(pi.last_update >= ps->info.log_tail);  // or else choose_acting() did a bad thing
    if (pi.log_tail <= since) {
      psdout(10) << " requesting log+missing since " << since
                 << " from osd." << *i << dendl;
      context< PeeringMachine >().send_query(
        i->osd,
        pg_query_t(
          pg_query_t::LOG,
          i->shard, ps->pg_whoami.shard,
          since, ps->info.history,
          ps->get_osdmap_epoch()));
    } else {
      psdout(10) << " requesting fulllog+missing from osd." << *i
                 << " (want since " << since << " < log.tail "
                 << pi.log_tail << ")" << dendl;
      context< PeeringMachine >().send_query(
        i->osd,
        pg_query_t(
          pg_query_t::FULLLOG,
          i->shard, ps->pg_whoami.shard,
          ps->info.history, ps->get_osdmap_epoch()));
    }
    peer_missing_requested.insert(*i);
    ps->blocked_by.insert(i->osd);
  }

  if (peer_missing_requested.empty()) {
    if (ps->need_up_thru) {
      psdout(10) << " still need up_thru update before going active"
                 << dendl;
      post_event(NeedUpThru());
      return;
    }

    post_event(Activate(ps->get_osdmap_epoch()));
  } else {
    pl->publish_stats_to_osd();
  }
}
boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;

  peer_missing_requested.erase(logevt.from);
  ps->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);

  if (peer_missing_requested.empty()) {
    if (ps->need_up_thru) {
      psdout(10) << " still need up_thru update before going active"
                 << dendl;
      post_event(NeedUpThru());
    } else {
      psdout(10) << "Got last missing, don't need missing "
                 << "posting Activate" << dendl;
      post_event(Activate(ps->get_osdmap_epoch()));
    }
  }
  return discard_event();
}
boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
{
  DECLARE_LOCALS;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("peer_missing_requested");
  for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
       p != peer_missing_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (ps->peer_missing.count(*p)) {
      q.f->open_object_section("got_missing");
      ps->peer_missing[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PeeringState::GetMissing::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);

  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
  ps->blocked_by.clear();
}
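// WaitUpThru: activation is deferred until the osdmap records an
// up_thru for this OSD that is at least as new as this interval, so a
// later peering attempt cannot mistakenly ignore writes made here.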
/*------WaitUpThru--------*/
PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
  : my_base(ctx),
    NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
{
  context< PeeringMachine >().log_enter(state_name);
}

boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
{
  DECLARE_LOCALS;
  if (!ps->need_up_thru) {
    post_event(Activate(ps->get_osdmap_epoch()));
  }
  return forward_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
{
  DECLARE_LOCALS;
  psdout(10) << "Noting missing from osd." << logevt.from << dendl;
  ps->peer_missing[logevt.from].claim(logevt.msg->missing);
  ps->peer_info[logevt.from] = logevt.msg->info;
  return discard_event();
}

boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
  q.f->close_section();
  return forward_event();
}

void PeeringState::WaitUpThru::exit()
{
  context< PeeringMachine >().log_exit(state_name, enter_time);
  DECLARE_LOCALS;
  utime_t dur = ceph_clock_now() - enter_time;
  pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
}
/*----PeeringState::PeeringMachine Methods-----*/
#undef dout_prefix
#define dout_prefix dpp->gen_prefix(*_dout)

void PeeringState::PeeringMachine::log_enter(const char *state_name)
{
  psdout(5) << "enter " << state_name << dendl;
  pl->log_state_enter(state_name);
}

void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
{
  utime_t dur = ceph_clock_now() - enter_time;
  psdout(5) << "exit " << state_name << " " << dur << " " << event_count
            << " " << event_time << dendl;
  pl->log_state_exit(state_name, enter_time, event_count, event_time);
  event_count = 0;
  event_time = utime_t();
}
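// Debug helper: prints a compact one-line summary of the PeeringState
// (up/acting sets, role, past intervals, log bounds, last-complete
// markers and the current PG state flags) for dout logging.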
ostream &operator<<(ostream &out, const PeeringState &ps) {
  out << "pg[" << ps.info
      << " " << pg_vector_string(ps.up);
  if (ps.acting != ps.up)
    out << "/" << pg_vector_string(ps.acting);
  out << "p" << ps.get_primary();
  if (!ps.async_recovery_targets.empty())
    out << " async=[" << ps.async_recovery_targets << "]";
  if (!ps.backfill_targets.empty())
    out << " backfill=[" << ps.backfill_targets << "]";
  out << " r=" << ps.get_role();
  out << " lpr=" << ps.get_last_peering_reset();

  if (!ps.past_intervals.empty()) {
    out << " pi=[" << ps.past_intervals.get_bounds()
        << ")/" << ps.past_intervals.size();
  }

  if (ps.is_peered()) {
    if (ps.last_update_ondisk != ps.info.last_update)
      out << " luod=" << ps.last_update_ondisk;
    if (ps.last_update_applied != ps.info.last_update)
      out << " lua=" << ps.last_update_applied;
  }

  if (ps.pg_log.get_tail() != ps.info.log_tail ||
      ps.pg_log.get_head() != ps.info.last_update)
    out << " (info mismatch, " << ps.pg_log.get_log() << ")";

  if (!ps.pg_log.get_log().empty()) {
    if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
      out << " (log bound mismatch, actual=["
          << ps.pg_log.get_log().log.begin()->version << ","
          << ps.pg_log.get_log().log.rbegin()->version << "]";
    }
  }

  out << " crt=" << ps.pg_log.get_can_rollback_to();

  if (ps.last_complete_ondisk != ps.info.last_complete)
    out << " lcod " << ps.last_complete_ondisk;

  out << " mlcod " << ps.min_last_complete_ondisk;

  out << " " << pg_state_string(ps.get_state());
  if (ps.should_send_notify())
    out << " NOTIFY";

  if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
    out << " pruub " << ps.prior_readable_until_ub
        << "@" << ps.get_prior_readable_down_osds();