1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "PGPeeringEvent.h"
5 #include "common/ceph_releases.h"
6 #include "common/dout.h"
7 #include "PeeringState.h"
9 #include "messages/MOSDPGRemove.h"
10 #include "messages/MBackfillReserve.h"
11 #include "messages/MRecoveryReserve.h"
12 #include "messages/MOSDScrubReserve.h"
13 #include "messages/MOSDPGInfo.h"
14 #include "messages/MOSDPGInfo2.h"
15 #include "messages/MOSDPGTrim.h"
16 #include "messages/MOSDPGLog.h"
17 #include "messages/MOSDPGNotify.h"
18 #include "messages/MOSDPGNotify2.h"
19 #include "messages/MOSDPGQuery.h"
20 #include "messages/MOSDPGQuery2.h"
21 #include "messages/MOSDPGLease.h"
22 #include "messages/MOSDPGLeaseAck.h"
24 #define dout_context cct
25 #define dout_subsys ceph_subsys_osd
// Construct a buffered-message holder from a PeeringCtx, stealing (swapping
// out) the ctx's queued per-OSD messages so they can be flushed later.
// NOTE(review): garbled extraction — the parameter list (orig lines 28-29) and
// closing brace are missing here; restore from upstream before editing.
27 BufferedRecoveryMessages::BufferedRecoveryMessages(
30 : require_osd_release(r
) {
31 // steal messages from ctx
32 message_map
.swap(ctx
.message_map
);
// Buffer a pg_notify for osd `to`: MOSDPGNotify2 on octopus+ clusters,
// legacy MOSDPGNotify otherwise.
// NOTE(review): the `} else {` separating the two branches (orig line 40) and
// the closing braces are missing from this extraction — verify upstream.
35 void BufferedRecoveryMessages::send_notify(int to
, const pg_notify_t
&n
)
37 if (require_osd_release
>= ceph_release_t::octopus
) {
38 spg_t
pgid(n
.info
.pgid
.pgid
, n
.to
);
39 send_osd_message(to
, make_message
<MOSDPGNotify2
>(pgid
, n
));
41 send_osd_message(to
, make_message
<MOSDPGNotify
>(n
.epoch_sent
, vector
{n
}));
// Buffer a pg_query for osd `to`: MOSDPGQuery2 on octopus+, legacy
// MOSDPGQuery (keyed by spg_t list) otherwise.
// NOTE(review): parameter list and several statements (orig lines 46-51, 53,
// 55) are missing from this extraction — verify upstream before editing.
45 void BufferedRecoveryMessages::send_query(
50 if (require_osd_release
>= ceph_release_t::octopus
) {
52 make_message
<MOSDPGQuery2
>(to_spgid
, q
));
54 auto m
= make_message
<MOSDPGQuery
>(
56 MOSDPGQuery::pg_list_t
{{to_spgid
, q
}});
57 send_osd_message(to
, m
);
// Buffer a pg_info (with optional lease / lease-ack, octopus+ only) for osd
// `to`: MOSDPGInfo2 on octopus+, legacy MOSDPGInfo wrapping a pg_notify_t
// otherwise.
// NOTE(review): large gaps in this extraction (orig lines 62-65, 69, 71-72,
// 74-83, 85, 87-88, 90+) — the message construction arguments are mostly
// absent; restore from upstream before editing.
61 void BufferedRecoveryMessages::send_info(
66 const pg_info_t
&info
,
67 std::optional
<pg_lease_t
> lease
,
68 std::optional
<pg_lease_ack_t
> lease_ack
)
70 if (require_osd_release
>= ceph_release_t::octopus
) {
73 make_message
<MOSDPGInfo2
>(
84 make_message
<MOSDPGInfo
>(
86 vector
{pg_notify_t
{to_spgid
.shard
,
89 info
, PastIntervals
{}}})
// Refresh cached pool metadata (name, snap context, cached epoch) from a new
// OSDMap; returns early if the pool was deleted from the map.
// NOTE(review): extraction gaps (orig lines 95, 97, 99-100, 102, 106-108,
// 111) — the body of the epoch/snap-epoch condition and how `updated` is set
// are not visible here; verify upstream before editing.
94 void PGPool::update(CephContext
*cct
, OSDMapRef map
)
96 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
98 return; // pool has been deleted
101 name
= map
->get_pool_name(id
);
103 bool updated
= false;
104 if ((map
->get_epoch() != cached_epoch
+ 1) ||
105 (pi
->get_snap_epoch() == map
->get_epoch())) {
109 if (info
.is_pool_snaps_mode() && updated
) {
110 snapc
= pi
->get_snap_context();
112 cached_epoch
= map
->get_epoch();
115 /*-------------Peering State Helpers----------------*/
117 #define dout_prefix (dpp->gen_prefix(*_dout))
119 #define psdout(x) ldout(cct, x)
// PeeringState constructor: wires the state-machine, missing_loc tracker and
// listener (pl) together.
// NOTE(review): most of the parameter list and member-initializer list (orig
// lines 122, 124-126, 128, 130-136, 138-139) plus the constructor body are
// missing from this extraction — restore from upstream before editing.
121 PeeringState::PeeringState(
123 pg_shard_t pg_whoami
,
127 DoutPrefixProvider
*dpp
,
129 : state_history(*pl
),
137 pg_whoami(pg_whoami
),
140 missing_loc(spgid
, this, dpp
, cct
),
141 machine(this, cct
, spgid
, dpp
, pl
, &state_history
)
// Begin handling a peering event: bind rctx to new_ctx (routing buffered
// messages through messages_pending_flush when outgoing is blocked) and
// stamp the start time for event accounting in end_handle().
146 void PeeringState::start_handle(PeeringCtx
*new_ctx
) {
148 ceph_assert(!orig_ctx
);
151 if (messages_pending_flush
) {
152 rctx
.emplace(*messages_pending_flush
, *new_ctx
);
154 rctx
.emplace(*new_ctx
);
156 rctx
->start_time
= ceph_clock_now();
// Start buffering outgoing recovery messages: create messages_pending_flush
// (tagged with the ctx's require_osd_release) and point rctx at it so
// subsequent sends are queued instead of going out.
160 void PeeringState::begin_block_outgoing() {
161 ceph_assert(!messages_pending_flush
);
162 ceph_assert(orig_ctx
);
164 messages_pending_flush
= BufferedRecoveryMessages(
165 orig_ctx
->require_osd_release
);
166 rctx
.emplace(*messages_pending_flush
, *orig_ctx
);
// Drop any buffered (not-yet-flushed) outgoing messages without sending them.
169 void PeeringState::clear_blocked_outgoing() {
170 ceph_assert(orig_ctx
);
172 messages_pending_flush
= std::optional
<BufferedRecoveryMessages
>();
// Stop buffering: hand the accumulated messages back to the original ctx for
// delivery, rebind rctx to it, and reset the buffer.
175 void PeeringState::end_block_outgoing() {
176 ceph_assert(messages_pending_flush
);
177 ceph_assert(orig_ctx
);
180 orig_ctx
->accept_buffered_messages(*messages_pending_flush
);
181 rctx
.emplace(*orig_ctx
);
182 messages_pending_flush
= std::optional
<BufferedRecoveryMessages
>();
// Finish handling a peering event: accumulate elapsed time and event count
// into the state machine's stats (rctx teardown lines elided in extraction).
185 void PeeringState::end_handle() {
187 utime_t dur
= ceph_clock_now() - rctx
->start_time
;
188 machine
.event_time
+= dur
;
191 machine
.event_count
++;
// Drop recovery/log/missing requests that target OSDs which are no longer up
// in the given map; delegates source pruning to missing_loc and the listener.
196 void PeeringState::check_recovery_sources(const OSDMapRef
& osdmap
)
199 * check that any peers we are planning to (or currently) pulling
200 * objects from are dealt with.
202 missing_loc
.check_recovery_sources(osdmap
);
203 pl
->check_recovery_sources(osdmap
);
// Prune peer_log_requested entries whose OSD went down (post-increment erase
// keeps the iterator valid).
205 for (set
<pg_shard_t
>::iterator i
= peer_log_requested
.begin();
206 i
!= peer_log_requested
.end();
208 if (!osdmap
->is_up(i
->osd
)) {
209 psdout(10) << "peer_log_requested removing " << *i
<< dendl
;
210 peer_log_requested
.erase(i
++);
// Same pruning for peer_missing_requested.
216 for (set
<pg_shard_t
>::iterator i
= peer_missing_requested
.begin();
217 i
!= peer_missing_requested
.end();
219 if (!osdmap
->is_up(i
->osd
)) {
220 psdout(10) << "peer_missing_requested removing " << *i
<< dendl
;
221 peer_missing_requested
.erase(i
++);
// Merge a peer's pg_history_t into ours; if the merge advanced anything,
// refresh the prior-readable-until upper bound and, when last_epoch_clean
// covers the current interval, drop now-useless past_intervals.
228 void PeeringState::update_history(const pg_history_t
& new_history
)
230 auto mnow
= pl
->get_mnow();
231 info
.history
.refresh_prior_readable_until_ub(mnow
, prior_readable_until_ub
);
232 if (info
.history
.merge(new_history
)) {
233 psdout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
// past_intervals before last_epoch_clean are no longer needed.
235 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
236 psdout(20) << __func__
<< " clearing past_intervals" << dendl
;
237 past_intervals
.clear();
238 dirty_big_info
= true;
240 prior_readable_until_ub
= info
.history
.get_prior_readable_until_ub(mnow
);
241 if (prior_readable_until_ub
!= ceph::signedspan::zero()) {
243 << " prior_readable_until_ub " << prior_readable_until_ub
244 << " (mnow " << mnow
<< " + "
245 << info
.history
.prior_readable_until_ub
<< ")" << dendl
;
248 pl
->on_info_history_change();
// Tell stray OSDs (holding stale copies of this PG) to remove them: send
// MOSDPGRemove to up strays, forget down ones, record them in peer_purged,
// and clear per-peer request state.  No-ops during premerge or when the
// osd_debug_no_purge_strays debug option is set.
251 void PeeringState::purge_strays()
254 psdout(10) << "purge_strays " << stray_set
<< " but premerge, doing nothing"
258 if (cct
->_conf
.get_val
<bool>("osd_debug_no_purge_strays")) {
261 psdout(10) << "purge_strays " << stray_set
<< dendl
;
263 bool removed
= false;
264 for (set
<pg_shard_t
>::iterator p
= stray_set
.begin();
265 p
!= stray_set
.end();
267 ceph_assert(!is_acting_recovery_backfill(*p
));
268 if (get_osdmap()->is_up(p
->osd
)) {
269 psdout(10) << "sending PGRemove to osd." << *p
<< dendl
;
270 vector
<spg_t
> to_remove
;
271 to_remove
.push_back(spg_t(info
.pgid
.pgid
, p
->shard
));
// NOTE(review): MOSDPGRemove ctor args (orig lines 273-274) elided here.
272 MOSDPGRemove
*m
= new MOSDPGRemove(
275 pl
->send_cluster_message(p
->osd
, m
, get_osdmap_epoch());
277 psdout(10) << "not sending PGRemove to down osd." << *p
<< dendl
;
279 peer_missing
.erase(*p
);
281 missing_loc
.remove_stray_recovery_sources(*p
);
282 peer_purged
.insert(*p
);
286 // if we removed anyone, update peers (which include peer_info)
288 update_heartbeat_peers();
292 // clear _requested maps; we may have to peer() again if we discover
293 // (more) stray content
294 peer_log_requested
.clear();
295 peer_missing_requested
.clear();
// Primary-side handling of a replica's pg_info_t.  Discards duplicates and
// info from OSDs that weren't continuously up since send_epoch; otherwise
// records it in peer_info / might_have_unfound, merges history, and marks
// the sender stray if it's no longer in up/acting.  Return value (bool)
// indicates whether the info was new — the return statements themselves are
// elided in this extraction.
299 bool PeeringState::proc_replica_info(
300 pg_shard_t from
, const pg_info_t
&oinfo
, epoch_t send_epoch
)
302 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.find(from
);
303 if (p
!= peer_info
.end() && p
->second
.last_update
== oinfo
.last_update
) {
304 psdout(10) << " got dup osd." << from
<< " info "
305 << oinfo
<< ", identical to ours" << dendl
;
// Guard against stale info from an OSD that restarted since sending.
309 if (!get_osdmap()->has_been_up_since(from
.osd
, send_epoch
)) {
310 psdout(10) << " got info " << oinfo
<< " from down osd." << from
311 << " discarding" << dendl
;
315 psdout(10) << " got osd." << from
<< " " << oinfo
<< dendl
;
316 ceph_assert(is_primary());
317 peer_info
[from
] = oinfo
;
318 might_have_unfound
.insert(from
);
320 update_history(oinfo
.history
);
323 if (!is_up(from
) && !is_acting(from
)) {
324 psdout(10) << " osd." << from
<< " has stray content: " << oinfo
<< dendl
;
325 stray_set
.insert(from
);
331 // was this a new info? if so, update peers!
332 if (p
== peer_info
.end())
333 update_heartbeat_peers();
// Forget per-peer state (info, missing, pending requests, purged markers)
// for OSDs that are down in the given map, then refresh heartbeat peers and
// recheck recovery sources.
339 void PeeringState::remove_down_peer_info(const OSDMapRef
&osdmap
)
341 // Remove any downed osds from peer_info
342 bool removed
= false;
343 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
344 while (p
!= peer_info
.end()) {
345 if (!osdmap
->is_up(p
->first
.osd
)) {
346 psdout(10) << " dropping down osd." << p
->first
<< " info " << p
->second
<< dendl
;
347 peer_missing
.erase(p
->first
);
348 peer_log_requested
.erase(p
->first
);
349 peer_missing_requested
.erase(p
->first
);
350 peer_info
.erase(p
++);
356 // Remove any downed osds from peer_purged so we can re-purge if necessary
357 auto it
= peer_purged
.begin();
358 while (it
!= peer_purged
.end()) {
359 if (!osdmap
->is_up(it
->osd
)) {
360 psdout(10) << " dropping down osd." << *it
<< " from peer_purged" << dendl
;
361 peer_purged
.erase(it
++);
367 // if we removed anyone, update peers (which include peer_info)
369 update_heartbeat_peers();
371 check_recovery_sources(osdmap
);
// Rebuild the heartbeat peer set from acting + up (skipping CRUSH_ITEM_NONE
// holes) plus every OSD we hold peer_info for, and hand it to the listener.
374 void PeeringState::update_heartbeat_peers()
380 for (unsigned i
=0; i
<acting
.size(); i
++) {
381 if (acting
[i
] != CRUSH_ITEM_NONE
)
382 new_peers
.insert(acting
[i
]);
384 for (unsigned i
=0; i
<up
.size(); i
++) {
385 if (up
[i
] != CRUSH_ITEM_NONE
)
386 new_peers
.insert(up
[i
]);
388 for (map
<pg_shard_t
,pg_info_t
>::iterator p
= peer_info
.begin();
389 p
!= peer_info
.end();
391 new_peers
.insert(p
->first
.osd
);
393 pl
->update_heartbeat_peers(std::move(new_peers
));
// Persist PG info/log into transaction t when dirty flags are set, and record
// the osdmap epoch / info snapshot that was written.
// NOTE(review): extraction gaps (orig lines 397-404, 406, 410, 412+) — the
// actual prepare/write calls are missing here; verify upstream.
396 void PeeringState::write_if_dirty(ObjectStore::Transaction
& t
)
405 last_persisted_osdmap
< get_osdmap_epoch(),
407 if (dirty_info
|| dirty_big_info
) {
408 last_persisted_osdmap
= get_osdmap_epoch();
409 last_written_info
= info
;
411 dirty_big_info
= false;
// Advance this PG to a new osdmap: update the cached map ref and pool info,
// feed an AdvMap event into the state machine, notify on pool changes, and
// refresh readable_interval / last_require_osd_release.
415 void PeeringState::advance_map(
416 OSDMapRef osdmap
, OSDMapRef lastmap
,
417 vector
<int>& newup
, int up_primary
,
418 vector
<int>& newacting
, int acting_primary
,
// Maps must be consumed in order: lastmap is the one we currently hold.
421 ceph_assert(lastmap
== osdmap_ref
);
422 psdout(10) << "handle_advance_map "
423 << newup
<< "/" << newacting
424 << " -- " << up_primary
<< "/" << acting_primary
427 update_osdmap_ref(osdmap
);
428 pool
.update(cct
, osdmap
);
// NOTE(review): the AdvMap event construction (orig lines 429-430) is elided.
431 osdmap
, lastmap
, newup
, up_primary
,
432 newacting
, acting_primary
);
433 handle_event(evt
, &rctx
);
434 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
435 pl
->on_pool_change();
437 readable_interval
= pool
.get_readable_interval();
438 last_require_osd_release
= osdmap
->require_osd_release
;
// Activate the new osdmap: deliver an ActMap event, dirty the info if the
// persisted osdmap epoch is too stale (osd_pg_epoch_persisted_max_stale),
// write dirty state into rctx.transaction, and recheck blacklisted watchers
// when the map introduced new blacklist entries.
441 void PeeringState::activate_map(PeeringCtx
&rctx
)
443 psdout(10) << __func__
<< dendl
;
445 handle_event(evt
, &rctx
);
446 if (osdmap_ref
->get_epoch() - last_persisted_osdmap
>
447 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
448 psdout(20) << __func__
<< ": Dirtying info: last_persisted is "
449 << last_persisted_osdmap
450 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
453 psdout(20) << __func__
<< ": Not dirtying info: last_persisted is "
454 << last_persisted_osdmap
455 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
457 write_if_dirty(rctx
.transaction
);
459 if (get_osdmap()->check_new_blacklist_entries()) {
460 pl
->check_blacklisted_watchers();
// Record a peering reset at the current osdmap epoch; on an actual epoch
// change, discard buffered outgoing messages and, unless an async flush is
// already scheduled, begin blocking outgoing recovery messages until flushed.
464 void PeeringState::set_last_peering_reset()
466 psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl
;
467 if (last_peering_reset
!= get_osdmap_epoch()) {
468 last_peering_reset
= get_osdmap_epoch();
469 psdout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
470 clear_blocked_outgoing();
471 if (!pl
->try_flush_or_schedule_async()) {
472 psdout(10) << "Beginning to block outgoing recovery messages" << dendl
;
473 begin_block_outgoing();
475 psdout(10) << "Not blocking outgoing recovery messages" << dendl
;
// One in-flight flush finished; body of the "all flushes done" branch (orig
// lines 484+) is elided in this extraction.
480 void PeeringState::complete_flush()
482 flushes_in_progress
--;
483 if (flushes_in_progress
== 0) {
// Detect the pool transitioning into the FULL state between two maps and
// record last_epoch_marked_full in the PG history.  Returns early if the
// pool no longer exists in the new map.
488 void PeeringState::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
490 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
492 return; // pool deleted
494 bool changed
= false;
495 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
496 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
// FULL is newly set if the pool was absent or not-FULL in the previous map.
497 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
498 psdout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
503 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
// Decide whether the map change starts a new interval (via
// PastIntervals::is_new_interval) or this OSD came back up — either forces
// peering to restart.
// NOTE(review): heavy extraction gaps (orig lines 509, 513-515, 517-527,
// 530-531, 534+) — the is_new_interval argument list and return statements
// are missing; verify upstream before editing.
508 bool PeeringState::should_restart_peering(
510 int newactingprimary
,
511 const vector
<int>& newup
,
512 const vector
<int>& newacting
,
516 if (PastIntervals::is_new_interval(
528 psdout(20) << "new interval newup " << newup
529 << " newacting " << newacting
<< dendl
;
532 if (!lastmap
->is_up(pg_whoami
.osd
) && osdmap
->is_up(pg_whoami
.osd
)) {
533 psdout(10) << __func__
<< " osd transitioned from down -> up"
// Begin a new peering interval: reset peering state, swap in the new
// up/acting sets, update pg stats and history epochs (same_interval_since,
// same_up_since, same_primary_since, last_epoch_split), record the interval
// in past_intervals, clear transient PG_STATE_* flags, and react to role /
// primary changes.  Interface unchanged from upstream.
// NOTE(review): this extraction is missing many original lines throughout
// (e.g. 546, 548, 550, 553-554, 556-558, 563, 565, 567-571, 576, 581-582,
// 584, 587, 589, 591, 593-594, 596, 598, 601, 606, 608-610, 613-615,
// 617-618, 621, 628, 630-631, 637, 639-642, 644, 646, 650-652, 654,
// 663-665, 673-674, 676, 682, 684-686, 688, 691, 697-698, 700-701,
// 707-708, 710, 713, 716-719) — do not edit logic here; restore from
// upstream PeeringState.cc first.
540 /* Called before initializing peering during advance_map */
541 void PeeringState::start_peering_interval(
542 const OSDMapRef lastmap
,
543 const vector
<int>& newup
, int new_up_primary
,
544 const vector
<int>& newacting
, int new_acting_primary
,
545 ObjectStore::Transaction
&t
)
547 const OSDMapRef osdmap
= get_osdmap();
549 set_last_peering_reset();
551 vector
<int> oldacting
, oldup
;
552 int oldrole
= get_role();
555 pl
->clear_ready_to_merge();
// Snapshot the outgoing interval's primaries/roles before swapping sets.
559 pg_shard_t old_acting_primary
= get_primary();
560 pg_shard_t old_up_primary
= up_primary
;
561 bool was_old_primary
= is_primary();
562 bool was_old_nonprimary
= is_nonprimary();
564 acting
.swap(oldacting
);
566 init_primary_up_acting(
// Refresh the stats copies of up/acting/primaries when any of them changed.
572 if (info
.stats
.up
!= up
||
573 info
.stats
.acting
!= acting
||
574 info
.stats
.up_primary
!= new_up_primary
||
575 info
.stats
.acting_primary
!= new_acting_primary
) {
577 info
.stats
.up_primary
= new_up_primary
;
578 info
.stats
.acting
= acting
;
579 info
.stats
.acting_primary
= new_acting_primary
;
580 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
583 pl
->clear_publish_stats();
585 // This will now be remapped during a backfill in cases
586 // that it would not have been before.
588 state_set(PG_STATE_REMAPPED
);
590 state_clear(PG_STATE_REMAPPED
);
592 int role
= osdmap
->calc_pg_role(pg_whoami
, acting
);
595 // did acting, up, primary|acker change?
597 psdout(10) << " no lastmap" << dendl
;
599 dirty_big_info
= true;
600 info
.history
.same_interval_since
= osdmap
->get_epoch();
602 std::stringstream debug
;
603 ceph_assert(info
.history
.same_interval_since
!= 0);
// NOTE(review): most check_new_interval() arguments are elided here.
604 bool new_interval
= PastIntervals::check_new_interval(
605 old_acting_primary
.osd
,
607 oldacting
, newacting
,
611 info
.history
.same_interval_since
,
612 info
.history
.last_epoch_clean
,
616 missing_loc
.get_recoverable_predicate(),
619 psdout(10) << __func__
<< ": check_new_interval output: "
620 << debug
.str() << dendl
;
622 if (osdmap
->get_epoch() == pl
->oldest_stored_osdmap() &&
623 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
624 psdout(10) << " map gap, clearing past_intervals and faking" << dendl
;
625 // our information is incomplete and useless; someone else was clean
626 // after everything we know if osdmaps were trimmed.
627 past_intervals
.clear();
629 psdout(10) << " noting past " << past_intervals
<< dendl
;
632 dirty_big_info
= true;
633 info
.history
.same_interval_since
= osdmap
->get_epoch();
// Record a split epoch when the pg_num change between maps splits this pgid.
634 if (osdmap
->have_pg_pool(info
.pgid
.pgid
.pool()) &&
635 info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
636 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
638 info
.history
.last_epoch_split
= osdmap
->get_epoch();
643 if (old_up_primary
!= up_primary
||
645 info
.history
.same_up_since
= osdmap
->get_epoch();
647 // this comparison includes primary rank via pg_shard_t
648 if (old_acting_primary
!= get_primary()) {
649 info
.history
.same_primary_since
= osdmap
->get_epoch();
653 pl
->on_info_history_change();
655 psdout(1) << __func__
<< " up " << oldup
<< " -> " << up
656 << ", acting " << oldacting
<< " -> " << acting
657 << ", acting_primary " << old_acting_primary
<< " -> "
658 << new_acting_primary
659 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
660 << ", role " << oldrole
<< " -> " << role
661 << ", features acting " << acting_features
662 << " upacting " << upacting_features
// Entering a new interval: drop all transient state flags.
666 state_clear(PG_STATE_ACTIVE
);
667 state_clear(PG_STATE_PEERED
);
668 state_clear(PG_STATE_PREMERGE
);
669 state_clear(PG_STATE_DOWN
);
670 state_clear(PG_STATE_RECOVERY_WAIT
);
671 state_clear(PG_STATE_RECOVERY_TOOFULL
);
672 state_clear(PG_STATE_RECOVERING
);
675 acting_recovery_backfill
.clear();
677 // reset primary/replica state?
678 if (was_old_primary
|| is_primary()) {
679 pl
->clear_want_pg_temp();
680 } else if (was_old_nonprimary
|| is_nonprimary()) {
681 pl
->clear_want_pg_temp();
683 clear_primary_state();
687 ceph_assert(!deleting
);
689 // should we tell the primary we are here?
690 send_notify
= !is_primary();
692 if (role
!= oldrole
||
693 was_old_primary
!= is_primary()) {
694 // did primary change?
695 if (was_old_primary
!= is_primary()) {
696 state_clear(PG_STATE_CLEAN
);
699 pl
->on_role_change();
702 // did primary change?
703 if (get_primary() != old_acting_primary
) {
704 psdout(10) << oldacting
<< " -> " << acting
705 << ", acting primary "
706 << old_acting_primary
<< " -> " << get_primary()
709 // primary is the same.
711 // i am (still) primary. but my replica set changed.
712 state_clear(PG_STATE_CLEAN
);
714 psdout(10) << oldacting
<< " -> " << acting
715 << ", replicas changed" << dendl
;
// If CRUSH left acting empty but we lead the up set, ask to clear pg_temp.
720 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
721 psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
722 pl
->queue_want_pg_temp(acting
);
// Per-interval (re)initialization: intersect peer feature bits across
// acting and up, rebuild the missing set if deletes semantics changed,
// and reset lease bookkeeping (prior_readable_until_ub, per-peer
// acting_readable_until_ub) before notifying the listener.
726 void PeeringState::on_new_interval()
728 dout(20) << __func__
<< dendl
;
729 const OSDMapRef osdmap
= get_osdmap();
731 // initialize features
732 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
733 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
// AND together the feature bits of every real member of acting.
734 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
735 if (*p
== CRUSH_ITEM_NONE
)
737 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
738 acting_features
&= f
;
739 upacting_features
&= f
;
// upacting additionally intersects the up set's features.
741 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
742 if (*p
== CRUSH_ITEM_NONE
)
744 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
746 psdout(20) << __func__
<< " upacting_features 0x" << std::hex
747 << upacting_features
<< std::dec
748 << " from " << acting
<< "+" << up
<< dendl
;
750 psdout(20) << __func__
<< " checking missing set deletes flag. missing = "
751 << get_pg_log().get_missing() << dendl
;
753 if (!pg_log
.get_missing().may_include_deletes
&&
754 !perform_deletes_during_peering()) {
755 pl
->rebuild_missing_set_with_deletes(pg_log
);
758 pg_log
.get_missing().may_include_deletes
==
759 !perform_deletes_during_peering());
763 // update lease bounds for a new interval
764 auto mnow
= pl
->get_mnow();
765 prior_readable_until_ub
= std::max(prior_readable_until_ub
,
767 prior_readable_until_ub
= info
.history
.refresh_prior_readable_until_ub(
768 mnow
, prior_readable_until_ub
);
769 psdout(10) << __func__
<< " prior_readable_until_ub "
770 << prior_readable_until_ub
<< " (mnow " << mnow
<< " + "
771 << info
.history
.prior_readable_until_ub
<< ")" << dendl
;
772 prior_readable_down_osds
.clear(); // we populate this when we build the priorset
776 readable_until_ub_sent
=
777 readable_until_ub_from_primary
= ceph::signedspan::zero();
779 acting_readable_until_ub
.clear();
781 acting_readable_until_ub
.resize(acting
.size(), ceph::signedspan::zero());
784 pl
->on_new_interval();
// Rebuild up/acting shard sets and the up/acting primaries from raw OSD id
// vectors.  Replicated pools use NO_SHARD; erasure-coded pools derive the
// shard id from the position in the vector and locate the primaries by
// scanning for the matching OSD id.
// NOTE(review): set-insert lines (orig 797-799, 806-808) and some braces are
// elided in this extraction.
787 void PeeringState::init_primary_up_acting(
788 const vector
<int> &newup
,
789 const vector
<int> &newacting
,
791 int new_acting_primary
)
795 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
796 if (acting
[i
] != CRUSH_ITEM_NONE
)
800 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
804 for (uint8_t i
= 0; i
< up
.size(); ++i
) {
805 if (up
[i
] != CRUSH_ITEM_NONE
)
809 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
// Replicated pool: primaries carry NO_SHARD.
811 if (!pool
.info
.is_erasure()) {
813 up_primary
= pg_shard_t(new_up_primary
, shard_id_t::NO_SHARD
);
814 primary
= pg_shard_t(new_acting_primary
, shard_id_t::NO_SHARD
);
// Erasure-coded pool: find the primary's shard by position.
817 up_primary
= pg_shard_t();
818 primary
= pg_shard_t();
819 for (uint8_t i
= 0; i
< up
.size(); ++i
) {
820 if (up
[i
] == new_up_primary
) {
821 up_primary
= pg_shard_t(up
[i
], shard_id_t(i
));
825 for (uint8_t i
= 0; i
< acting
.size(); ++i
) {
826 if (acting
[i
] == new_acting_primary
) {
827 primary
= pg_shard_t(acting
[i
], shard_id_t(i
));
831 ceph_assert(up_primary
.osd
== new_up_primary
);
832 ceph_assert(primary
.osd
== new_acting_primary
);
// Populate hb_stamps: as primary, one stamp per other acting member; as
// non-primary, just the primary's stamp.
// NOTE(review): the surrounding is_primary() branch, index init and
// resize-to-empty paths (orig lines 837-838, 841, 844-845, 847-848, 851,
// 853-855) are elided in this extraction — verify upstream.
836 void PeeringState::init_hb_stamps()
839 // we care about all other osds in the acting set
840 hb_stamps
.resize(acting
.size() - 1);
842 for (auto p
: acting
) {
843 if (p
== CRUSH_ITEM_NONE
|| p
== get_primary().osd
) {
846 hb_stamps
[i
++] = pl
->get_hb_stamps(p
);
849 } else if (is_nonprimary()) {
850 // we care about just the primary
852 hb_stamps
[0] = pl
->get_hb_stamps(get_primary().osd
);
856 dout(10) << __func__
<< " now " << hb_stamps
<< dendl
;
// Reset per-interval recovery targets (async recovery and backfill sets).
860 void PeeringState::clear_recovery_state()
862 async_recovery_targets
.clear();
863 backfill_targets
.clear();
// Drop all primary-only bookkeeping (per-peer request/missing/activation
// maps, trim/complete pointers, unfound candidates), reset pg_log recovery
// pointers and recovery state, then let the listener clear its own state.
866 void PeeringState::clear_primary_state()
868 psdout(10) << "clear_primary_state" << dendl
;
870 // clear peering state
872 peer_log_requested
.clear();
873 peer_missing_requested
.clear();
876 peer_missing
.clear();
877 peer_last_complete_ondisk
.clear();
878 peer_activated
.clear();
879 min_last_complete_ondisk
= eversion_t();
880 pg_trim_to
= eversion_t();
881 might_have_unfound
.clear();
882 need_up_thru
= false;
884 pg_log
.reset_recovery_pointers();
886 clear_recovery_state();
888 last_update_ondisk
= eversion_t();
890 pl
->clear_primary_state();
// File-local helper computing the [start, end) epoch range that
// past_intervals must cover for a PG, derived from its history
// (last_epoch_clean / epoch_pool_created / same_interval_since).
// NOTE(review): the second std::max operand using oldest_map (orig line 900)
// is elided in this extraction.
893 /// return [start,end) bounds for required past_intervals
894 static pair
<epoch_t
, epoch_t
> get_required_past_interval_bounds(
895 const pg_info_t
&info
,
896 epoch_t oldest_map
) {
897 epoch_t start
= std::max(
898 info
.history
.last_epoch_clean
? info
.history
.last_epoch_clean
:
899 info
.history
.epoch_pool_created
,
901 epoch_t end
= std::max(
902 info
.history
.same_interval_since
,
903 info
.history
.epoch_pool_created
);
904 return make_pair(start
, end
);
// Sanity-check the stored past_intervals against the required bounds; logs
// to the cluster log and derr on mismatch, asserting/aborting on fatal
// inconsistencies (empty-when-required, start not covered, end mismatch).
908 void PeeringState::check_past_interval_bounds() const
910 auto oldest_epoch
= pl
->oldest_stored_osdmap();
911 auto rpib
= get_required_past_interval_bounds(
914 if (rpib
.first
>= rpib
.second
) {
915 // do not warn if the start bound is dictated by oldest_map; the
916 // past intervals are presumably appropriate given the pg info.
917 if (!past_intervals
.empty() &&
918 rpib
.first
> oldest_epoch
) {
919 pl
->get_clog_error() << info
.pgid
<< " required past_interval bounds are"
920 << " empty [" << rpib
<< ") but past_intervals is not: "
922 derr
<< info
.pgid
<< " required past_interval bounds are"
923 << " empty [" << rpib
<< ") but past_intervals is not: "
924 << past_intervals
<< dendl
;
// Non-empty required range: past_intervals must exist and cover it.
927 if (past_intervals
.empty()) {
928 pl
->get_clog_error() << info
.pgid
<< " required past_interval bounds are"
929 << " not empty [" << rpib
<< ") but past_intervals "
930 << past_intervals
<< " is empty";
931 derr
<< info
.pgid
<< " required past_interval bounds are"
932 << " not empty [" << rpib
<< ") but past_intervals "
933 << past_intervals
<< " is empty" << dendl
;
934 ceph_assert(!past_intervals
.empty());
937 auto apib
= past_intervals
.get_bounds();
938 if (apib
.first
> rpib
.first
) {
939 pl
->get_clog_error() << info
.pgid
<< " past_intervals [" << apib
940 << ") start interval does not contain the required"
941 << " bound [" << rpib
<< ") start";
942 derr
<< info
.pgid
<< " past_intervals [" << apib
943 << ") start interval does not contain the required"
944 << " bound [" << rpib
<< ") start" << dendl
;
945 ceph_abort_msg("past_interval start interval mismatch");
947 if (apib
.second
!= rpib
.second
) {
948 pl
->get_clog_error() << info
.pgid
<< " past_interal bound [" << apib
949 << ") end does not match required [" << rpib
951 derr
<< info
.pgid
<< " past_interal bound [" << apib
952 << ") end does not match required [" << rpib
954 ceph_abort_msg("past_interval end mismatch");
// Combine a base priority with the pool's configured recovery priority
// (clamped to [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] and shifted to
// a non-negative offset), then clamp the sum into
// [OSD_RECOVERY_PRIORITY_MIN, max].  The "priority > max" return (orig lines
// 979, 983+) is elided in this extraction.
959 int PeeringState::clamp_recovery_priority(int priority
, int pool_recovery_priority
, int max
)
961 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
962 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
964 ceph_assert(max
<= OSD_RECOVERY_PRIORITY_MAX
);
966 // User can't set this too high anymore, but might be a legacy value
967 if (pool_recovery_priority
> OSD_POOL_PRIORITY_MAX
)
968 pool_recovery_priority
= OSD_POOL_PRIORITY_MAX
;
969 if (pool_recovery_priority
< OSD_POOL_PRIORITY_MIN
)
970 pool_recovery_priority
= OSD_POOL_PRIORITY_MIN
;
971 // Shift range from min to max to 0 to max - min
972 pool_recovery_priority
+= (0 - OSD_POOL_PRIORITY_MIN
);
973 ceph_assert(pool_recovery_priority
>= 0 && pool_recovery_priority
<= (OSD_POOL_PRIORITY_MAX
- OSD_POOL_PRIORITY_MIN
));
975 priority
+= pool_recovery_priority
;
977 // Clamp to valid range
978 if (priority
> max
) {
980 } else if (priority
< OSD_RECOVERY_PRIORITY_MIN
) {
981 return OSD_RECOVERY_PRIORITY_MIN
;
// Compute this PG's recovery scheduling priority: forced-recovery wins,
// below-min_size ("inactive", blocks IO) ranks above the base, and the
// result is adjusted by the pool's RECOVERY_PRIORITY option and clamped.
987 unsigned PeeringState::get_recovery_priority()
989 // a higher value -> a higher priority
990 int ret
= OSD_RECOVERY_PRIORITY_BASE
;
993 if (state
& PG_STATE_FORCED_RECOVERY
) {
994 ret
= OSD_RECOVERY_PRIORITY_FORCED
;
996 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
997 if (is_degraded() && info
.stats
.avail_no_missing
.size() < pool
.info
.min_size
) {
998 base
= OSD_RECOVERY_INACTIVE_PRIORITY_BASE
;
999 // inactive: no. of replicas < min_size, highest priority since it blocks IO
1000 ret
= base
+ (pool
.info
.min_size
- info
.stats
.avail_no_missing
.size());
1003 int64_t pool_recovery_priority
= 0;
1004 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
1006 ret
= clamp_recovery_priority(ret
, pool_recovery_priority
, max_prio_map
[base
]);
1008 psdout(20) << __func__
<< " recovery priority is " << ret
<< dendl
;
1009 return static_cast<unsigned>(ret
);
// Compute this PG's backfill scheduling priority: forced-backfill wins, then
// tiers for inactive (actingset below min_size), undersized, and degraded;
// adjusted by the pool's RECOVERY_PRIORITY option and clamped.
1012 unsigned PeeringState::get_backfill_priority()
1014 // a higher value -> a higher priority
1015 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
1018 if (state
& PG_STATE_FORCED_BACKFILL
) {
1019 ret
= OSD_BACKFILL_PRIORITY_FORCED
;
1021 if (actingset
.size() < pool
.info
.min_size
) {
1022 base
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
;
1023 // inactive: no. of replicas < min_size, highest priority since it blocks IO
1024 ret
= base
+ (pool
.info
.min_size
- actingset
.size());
1026 } else if (is_undersized()) {
1027 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
1028 ceph_assert(pool
.info
.size
> actingset
.size());
1029 base
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
1030 ret
= base
+ (pool
.info
.size
- actingset
.size());
1032 } else if (is_degraded()) {
1033 // degraded: baseline degraded
1034 base
= ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
1037 // Adjust with pool's recovery priority
1038 int64_t pool_recovery_priority
= 0;
1039 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
1041 ret
= clamp_recovery_priority(ret
, pool_recovery_priority
, max_prio_map
[base
]);
1044 psdout(20) << __func__
<< " backfill priority is " << ret
<< dendl
;
1045 return static_cast<unsigned>(ret
);
// PG-deletion priority based on this OSD's fullness state in the osdmap:
// backfillfull (or fuller — second flag elided in extraction) -> FULL,
// nearfull -> FULLISH, otherwise NORMAL.
1048 unsigned PeeringState::get_delete_priority()
1050 auto state
= get_osdmap()->get_state(pg_whoami
.osd
);
1051 if (state
& (CEPH_OSD_BACKFILLFULL
|
1053 return OSD_DELETE_PRIORITY_FULL
;
1054 } else if (state
& CEPH_OSD_NEARFULL
) {
1055 return OSD_DELETE_PRIORITY_FULLISH
;
1057 return OSD_DELETE_PRIORITY_NORMAL
;
// Set or clear PG_STATE_FORCED_RECOVERY (only set while degraded/recovering,
// only cleared when currently set), publish stats, and push the updated
// recovery priority to the listener.  Returns whether the flag changed
// (return statements elided in this extraction).
1061 bool PeeringState::set_force_recovery(bool b
)
1065 if (!(state
& PG_STATE_FORCED_RECOVERY
) &&
1066 (state
& (PG_STATE_DEGRADED
|
1067 PG_STATE_RECOVERY_WAIT
|
1068 PG_STATE_RECOVERING
))) {
1069 psdout(20) << __func__
<< " set" << dendl
;
1070 state_set(PG_STATE_FORCED_RECOVERY
);
1071 pl
->publish_stats_to_osd();
1074 } else if (state
& PG_STATE_FORCED_RECOVERY
) {
1075 psdout(20) << __func__
<< " clear" << dendl
;
1076 state_clear(PG_STATE_FORCED_RECOVERY
);
1077 pl
->publish_stats_to_osd();
1081 psdout(20) << __func__
<< " state " << get_current_state()
1083 pl
->update_local_background_io_priority(get_recovery_priority());
// Backfill analogue of set_force_recovery: toggle PG_STATE_FORCED_BACKFILL,
// publish stats, and push the updated backfill priority to the listener.
// Returns whether the flag changed (return statements elided here).
1088 bool PeeringState::set_force_backfill(bool b
)
1092 if (!(state
& PG_STATE_FORCED_BACKFILL
) &&
1093 (state
& (PG_STATE_DEGRADED
|
1094 PG_STATE_BACKFILL_WAIT
|
1095 PG_STATE_BACKFILLING
))) {
1096 psdout(10) << __func__
<< " set" << dendl
;
1097 state_set(PG_STATE_FORCED_BACKFILL
);
1098 pl
->publish_stats_to_osd();
1101 } else if (state
& PG_STATE_FORCED_BACKFILL
) {
1102 psdout(10) << __func__
<< " clear" << dendl
;
1103 state_clear(PG_STATE_FORCED_BACKFILL
);
1104 pl
->publish_stats_to_osd();
1108 psdout(20) << __func__
<< " state " << get_current_state()
1110 pl
->update_local_background_io_priority(get_backfill_priority());
// Ask the listener to schedule the next lease renewal at half the readable
// interval (first argument to the callback elided in this extraction).
1115 void PeeringState::schedule_renew_lease()
1117 pl
->schedule_renew_lease(
1119 readable_interval
/ 2);
// Send an MOSDPGLease to every acting-set member except ourselves.
// NOTE(review): lease payload / destination arguments and closing braces
// (orig lines 1123, 1127-1128, 1130, 1133-1137) are elided in this
// extraction — verify upstream before editing.
1122 void PeeringState::send_lease()
1124 epoch_t epoch
= pl
->get_osdmap_epoch();
1125 for (auto peer
: actingset
) {
1126 if (peer
== pg_whoami
) {
1129 pl
->send_cluster_message(
1131 new MOSDPGLease(epoch
,
1132 spg_t(spgid
.pgid
, peer
.shard
),
// Replica-side lease processing (octopus+ only, non-primary only): advance
// readable_until using the primary's lease translated through the heartbeat
// clock-delta bounds (upper bound delta for readable_until, lower bound —
// or mnow + interval as a fallback — for readable_until_ub).
1138 void PeeringState::proc_lease(const pg_lease_t
& l
)
1140 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1141 psdout(20) << __func__
<< " no-op, upacting_features 0x" << std::hex
1142 << upacting_features
<< std::dec
1143 << " does not include SERVER_OCTOPUS" << dendl
;
1146 if (!is_nonprimary()) {
1147 psdout(20) << __func__
<< " no-op, !nonprimary" << dendl
;
1150 psdout(10) << __func__
<< " " << l
<< dendl
;
1151 if (l
.readable_until_ub
> readable_until_ub_from_primary
) {
1152 readable_until_ub_from_primary
= l
.readable_until_ub
;
// Conservative lower bound on when we stop being readable.
1155 ceph::signedspan ru
= ceph::signedspan::zero();
1156 if (l
.readable_until
!= ceph::signedspan::zero() &&
1157 hb_stamps
[0]->peer_clock_delta_ub
) {
1158 ru
= l
.readable_until
- *hb_stamps
[0]->peer_clock_delta_ub
;
1159 psdout(20) << " peer_clock_delta_ub " << *hb_stamps
[0]->peer_clock_delta_ub
1160 << " -> ru " << ru
<< dendl
;
1162 if (ru
> readable_until
) {
1163 readable_until
= ru
;
1164 psdout(20) << __func__
<< " readable_until now " << readable_until
<< dendl
;
1165 // NOTE: if we ever decide to block/queue ops on the replica,
1166 // we'll need to wake them up here.
// Upper bound: prefer the clock-delta lower bound; otherwise fall back to
// local mnow plus the lease interval.
1169 ceph::signedspan ruub
;
1170 if (hb_stamps
[0]->peer_clock_delta_lb
) {
1171 ruub
= l
.readable_until_ub
- *hb_stamps
[0]->peer_clock_delta_lb
;
1172 psdout(20) << " peer_clock_delta_lb " << *hb_stamps
[0]->peer_clock_delta_lb
1173 << " -> ruub " << ruub
<< dendl
;
1175 ruub
= pl
->get_mnow() + l
.interval
;
1176 psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub
<< dendl
;
1178 if (ruub
> readable_until_ub
) {
1179 readable_until_ub
= ruub
;
1180 psdout(20) << __func__
<< " readable_until_ub now " << readable_until_ub
// Primary-side lease-ack processing (octopus+ only): record the acked
// readable_until_ub for the acking acting-set member, and if it previously
// bounded our readable_until, recalculate and recheck readability.
1185 void PeeringState::proc_lease_ack(int from
, const pg_lease_ack_t
& a
)
1187 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1190 auto now
= pl
->get_mnow();
1191 bool was_min
= false;
1192 for (unsigned i
= 0; i
< acting
.size(); ++i
) {
1193 if (from
== acting
[i
]) {
1194 // the lease_ack value is based on the primary's clock
1195 if (a
.readable_until_ub
> acting_readable_until_ub
[i
]) {
1196 if (acting_readable_until_ub
[i
] == readable_until
) {
1199 acting_readable_until_ub
[i
] = a
.readable_until_ub
;
1205 auto old_ru
= readable_until
;
1206 recalc_readable_until();
1208 pl
->recheck_readable();
// Renew our lease at the current monotonic time and schedule the next
// renewal (octopus+ only).
1213 void PeeringState::proc_renew_lease()
1215 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1218 renew_lease(pl
->get_mnow());
1220 schedule_renew_lease();
// Primary-only: readable_until[_ub] is the minimum of readable_until_ub_sent
// and every peer's acked acting_readable_until_ub (skipping ourselves and
// CRUSH_ITEM_NONE holes).
1223 void PeeringState::recalc_readable_until()
1225 assert(is_primary());
1226 ceph::signedspan min
= readable_until_ub_sent
;
1227 for (unsigned i
= 0; i
< acting
.size(); ++i
) {
1228 if (acting
[i
] == pg_whoami
.osd
|| acting
[i
] == CRUSH_ITEM_NONE
) {
1231 dout(20) << __func__
<< " peer osd." << acting
[i
]
1232 << " ruub " << acting_readable_until_ub
[i
] << dendl
;
1233 if (acting_readable_until_ub
[i
] < min
) {
1234 min
= acting_readable_until_ub
[i
];
1237 readable_until
= min
;
1238 readable_until_ub
= min
;
1239 dout(20) << __func__
<< " readable_until[_ub] " << readable_until
1240 << " (sent " << readable_until_ub_sent
<< ")" << dendl
;
1243 bool PeeringState::check_prior_readable_down_osds(const OSDMapRef
& map
)
1245 if (!HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
1248 bool changed
= false;
1249 auto p
= prior_readable_down_osds
.begin();
1250 while (p
!= prior_readable_down_osds
.end()) {
1251 if (map
->is_dead(*p
)) {
1252 dout(10) << __func__
<< " prior_readable_down_osds osd." << *p
1253 << " is dead as of epoch " << map
->get_epoch()
1255 p
= prior_readable_down_osds
.erase(p
);
1261 if (changed
&& prior_readable_down_osds
.empty()) {
1262 psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl
;
1263 clear_prior_readable_until_ub();
1269 bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap
)
1271 epoch_t up_thru
= osdmap
->get_up_thru(pg_whoami
.osd
);
1273 up_thru
>= info
.history
.same_interval_since
) {
1274 psdout(10) << "adjust_need_up_thru now "
1275 << up_thru
<< ", need_up_thru now false" << dendl
;
1276 need_up_thru
= false;
1282 PastIntervals::PriorSet
PeeringState::build_prior()
1286 for (map
<pg_shard_t
,pg_info_t
>::iterator it
= peer_info
.begin();
1287 it
!= peer_info
.end();
1289 ceph_assert(info
.history
.last_epoch_started
>=
1290 it
->second
.history
.last_epoch_started
);
1294 const OSDMap
&osdmap
= *get_osdmap();
1295 PastIntervals::PriorSet prior
= past_intervals
.get_prior_set(
1296 pool
.info
.is_erasure(),
1297 info
.history
.last_epoch_started
,
1298 &missing_loc
.get_recoverable_predicate(),
1299 [&](epoch_t start
, int osd
, epoch_t
*lost_at
) {
1300 const osd_info_t
*pinfo
= 0;
1301 if (osdmap
.exists(osd
)) {
1302 pinfo
= &osdmap
.get_info(osd
);
1304 *lost_at
= pinfo
->lost_at
;
1307 if (osdmap
.is_up(osd
)) {
1308 return PastIntervals::UP
;
1309 } else if (!pinfo
) {
1310 return PastIntervals::DNE
;
1311 } else if (pinfo
->lost_at
> start
) {
1312 return PastIntervals::LOST
;
1314 return PastIntervals::DOWN
;
1321 if (prior
.pg_down
) {
1322 state_set(PG_STATE_DOWN
);
1325 if (get_osdmap()->get_up_thru(pg_whoami
.osd
) <
1326 info
.history
.same_interval_since
) {
1327 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami
.osd
)
1328 << " < same_since " << info
.history
.same_interval_since
1329 << ", must notify monitor" << dendl
;
1330 need_up_thru
= true;
1332 psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami
.osd
)
1333 << " >= same_since " << info
.history
.same_interval_since
1334 << ", all is well" << dendl
;
1335 need_up_thru
= false;
1337 pl
->set_probe_targets(prior
.probe
);
1341 bool PeeringState::needs_recovery() const
1343 ceph_assert(is_primary());
1345 auto &missing
= pg_log
.get_missing();
1347 if (missing
.num_missing()) {
1348 psdout(10) << __func__
<< " primary has " << missing
.num_missing()
1349 << " missing" << dendl
;
1353 ceph_assert(!acting_recovery_backfill
.empty());
1354 set
<pg_shard_t
>::const_iterator end
= acting_recovery_backfill
.end();
1355 set
<pg_shard_t
>::const_iterator a
= acting_recovery_backfill
.begin();
1356 for (; a
!= end
; ++a
) {
1357 if (*a
== get_primary()) continue;
1358 pg_shard_t peer
= *a
;
1359 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
1360 if (pm
== peer_missing
.end()) {
1361 psdout(10) << __func__
<< " osd." << peer
<< " doesn't have missing set"
1365 if (pm
->second
.num_missing()) {
1366 psdout(10) << __func__
<< " osd." << peer
<< " has "
1367 << pm
->second
.num_missing() << " missing" << dendl
;
1372 psdout(10) << __func__
<< " is recovered" << dendl
;
1376 bool PeeringState::needs_backfill() const
1378 ceph_assert(is_primary());
1380 // We can assume that only possible osds that need backfill
1381 // are on the backfill_targets vector nodes.
1382 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
1383 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
1384 for (; a
!= end
; ++a
) {
1385 pg_shard_t peer
= *a
;
1386 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
1387 if (!pi
->second
.last_backfill
.is_max()) {
1388 psdout(10) << __func__
<< " osd." << peer
1389 << " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
1394 psdout(10) << __func__
<< " does not need backfill" << dendl
;
1399 * Returns true unless there is a non-lost OSD in might_have_unfound.
1401 bool PeeringState::all_unfound_are_queried_or_lost(
1402 const OSDMapRef osdmap
) const
1404 ceph_assert(is_primary());
1406 set
<pg_shard_t
>::const_iterator peer
= might_have_unfound
.begin();
1407 set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
1408 for (; peer
!= mend
; ++peer
) {
1409 if (peer_missing
.count(*peer
))
1411 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(*peer
);
1412 if (iter
!= peer_info
.end() &&
1413 (iter
->second
.is_empty() || iter
->second
.dne()))
1415 if (!osdmap
->exists(peer
->osd
))
1417 const osd_info_t
&osd_info(osdmap
->get_info(peer
->osd
));
1418 if (osd_info
.lost_at
<= osd_info
.up_from
) {
1419 // If there is even one OSD in might_have_unfound that isn't lost, we
1420 // still might retrieve our unfound.
1424 psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
1425 << might_have_unfound
1426 << " have been queried or are marked lost" << dendl
;
1431 void PeeringState::reject_reservation()
1433 pl
->unreserve_recovery_space();
1434 pl
->send_cluster_message(
1436 new MBackfillReserve(
1437 MBackfillReserve::REJECT_TOOFULL
,
1438 spg_t(info
.pgid
.pgid
, primary
.shard
),
1439 get_osdmap_epoch()),
1440 get_osdmap_epoch());
1446 * Returns an iterator to the best info in infos sorted by:
1447 * 1) Prefer newer last_update
1448 * 2) Prefer longer tail if it brings another info into contiguity
1449 * 3) Prefer current primary
1451 map
<pg_shard_t
, pg_info_t
>::const_iterator
PeeringState::find_best_info(
1452 const map
<pg_shard_t
, pg_info_t
> &infos
,
1453 bool restrict_to_up_acting
,
1454 bool *history_les_bound
) const
1456 ceph_assert(history_les_bound
);
1457 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1458 * to make changes to this process. Also, make sure to update it
1459 * when you find bugs! */
1460 eversion_t min_last_update_acceptable
= eversion_t::max();
1461 epoch_t max_last_epoch_started_found
= 0;
1462 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1465 if (!cct
->_conf
->osd_find_best_info_ignore_history_les
&&
1466 max_last_epoch_started_found
< i
->second
.history
.last_epoch_started
) {
1467 *history_les_bound
= true;
1468 max_last_epoch_started_found
= i
->second
.history
.last_epoch_started
;
1470 if (!i
->second
.is_incomplete() &&
1471 max_last_epoch_started_found
< i
->second
.last_epoch_started
) {
1472 *history_les_bound
= false;
1473 max_last_epoch_started_found
= i
->second
.last_epoch_started
;
1476 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1479 if (max_last_epoch_started_found
<= i
->second
.last_epoch_started
) {
1480 if (min_last_update_acceptable
> i
->second
.last_update
)
1481 min_last_update_acceptable
= i
->second
.last_update
;
1484 if (min_last_update_acceptable
== eversion_t::max())
1487 map
<pg_shard_t
, pg_info_t
>::const_iterator best
= infos
.end();
1488 // find osd with newest last_update (oldest for ec_pool).
1489 // if there are multiples, prefer
1490 // - a longer tail, if it brings another peer into log contiguity
1491 // - the current primary
1492 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= infos
.begin();
1495 if (restrict_to_up_acting
&& !is_up(p
->first
) &&
1496 !is_acting(p
->first
))
1498 // Only consider peers with last_update >= min_last_update_acceptable
1499 if (p
->second
.last_update
< min_last_update_acceptable
)
1501 // Disqualify anyone with a too old last_epoch_started
1502 if (p
->second
.last_epoch_started
< max_last_epoch_started_found
)
1504 // Disqualify anyone who is incomplete (not fully backfilled)
1505 if (p
->second
.is_incomplete())
1507 if (best
== infos
.end()) {
1511 // Prefer newer last_update
1512 if (pool
.info
.require_rollback()) {
1513 if (p
->second
.last_update
> best
->second
.last_update
)
1515 if (p
->second
.last_update
< best
->second
.last_update
) {
1520 if (p
->second
.last_update
< best
->second
.last_update
)
1522 if (p
->second
.last_update
> best
->second
.last_update
) {
1528 // Prefer longer tail
1529 if (p
->second
.log_tail
> best
->second
.log_tail
) {
1531 } else if (p
->second
.log_tail
< best
->second
.log_tail
) {
1536 if (!p
->second
.has_missing() && best
->second
.has_missing()) {
1537 psdout(10) << __func__
<< " prefer osd." << p
->first
1538 << " because it is complete while best has missing"
1542 } else if (p
->second
.has_missing() && !best
->second
.has_missing()) {
1543 psdout(10) << __func__
<< " skipping osd." << p
->first
1544 << " because it has missing while best is complete"
1548 // both are complete or have missing
1552 // prefer current primary (usually the caller), all things being equal
1553 if (p
->first
== pg_whoami
) {
1554 psdout(10) << "calc_acting prefer osd." << p
->first
1555 << " because it is current primary" << dendl
;
1563 void PeeringState::calc_ec_acting(
1564 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1566 const vector
<int> &acting
,
1567 const vector
<int> &up
,
1568 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1569 bool restrict_to_up_acting
,
1571 set
<pg_shard_t
> *backfill
,
1572 set
<pg_shard_t
> *acting_backfill
,
1575 vector
<int> want(size
, CRUSH_ITEM_NONE
);
1576 map
<shard_id_t
, set
<pg_shard_t
> > all_info_by_shard
;
1577 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= all_info
.begin();
1578 i
!= all_info
.end();
1580 all_info_by_shard
[i
->first
.shard
].insert(i
->first
);
1582 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1583 ss
<< "For position " << (unsigned)i
<< ": ";
1584 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
&&
1585 !all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1586 all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.last_update
>=
1587 auth_log_shard
->second
.log_tail
) {
1588 ss
<< " selecting up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
)) << std::endl
;
1592 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
) {
1593 ss
<< " backfilling up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
))
1595 backfill
->insert(pg_shard_t(up
[i
], shard_id_t(i
)));
1598 if (acting
.size() > (unsigned)i
&& acting
[i
] != CRUSH_ITEM_NONE
&&
1599 !all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1600 all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.last_update
>=
1601 auth_log_shard
->second
.log_tail
) {
1602 ss
<< " selecting acting[i]: " << pg_shard_t(acting
[i
], shard_id_t(i
)) << std::endl
;
1603 want
[i
] = acting
[i
];
1604 } else if (!restrict_to_up_acting
) {
1605 for (set
<pg_shard_t
>::iterator j
= all_info_by_shard
[shard_id_t(i
)].begin();
1606 j
!= all_info_by_shard
[shard_id_t(i
)].end();
1608 ceph_assert(j
->shard
== i
);
1609 if (!all_info
.find(*j
)->second
.is_incomplete() &&
1610 all_info
.find(*j
)->second
.last_update
>=
1611 auth_log_shard
->second
.log_tail
) {
1612 ss
<< " selecting stray: " << *j
<< std::endl
;
1617 if (want
[i
] == CRUSH_ITEM_NONE
)
1618 ss
<< " failed to fill position " << (int)i
<< std::endl
;
1622 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1623 if (want
[i
] != CRUSH_ITEM_NONE
) {
1624 acting_backfill
->insert(pg_shard_t(want
[i
], shard_id_t(i
)));
1627 acting_backfill
->insert(backfill
->begin(), backfill
->end());
1632 * calculate the desired acting set.
1634 * Choose an appropriate acting set. Prefer up[0], unless it is
1635 * incomplete, or another osd has a longer tail that allows us to
1636 * bring other up nodes up to date.
1638 void PeeringState::calc_replicated_acting(
1639 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1640 uint64_t force_auth_primary_missing_objects
,
1642 const vector
<int> &acting
,
1643 const vector
<int> &up
,
1644 pg_shard_t up_primary
,
1645 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1646 bool restrict_to_up_acting
,
1648 set
<pg_shard_t
> *backfill
,
1649 set
<pg_shard_t
> *acting_backfill
,
1650 const OSDMapRef osdmap
,
1653 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1655 ss
<< __func__
<< " newest update on osd." << auth_log_shard_id
1656 << " with " << auth_log_shard
->second
1657 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
1660 auto primary
= all_info
.find(up_primary
);
1662 !primary
->second
.is_incomplete() &&
1663 primary
->second
.last_update
>=
1664 auth_log_shard
->second
.log_tail
) {
1665 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1666 auto approx_missing_objects
=
1667 primary
->second
.stats
.stats
.sum
.num_objects_missing
;
1668 auto auth_version
= auth_log_shard
->second
.last_update
.version
;
1669 auto primary_version
= primary
->second
.last_update
.version
;
1670 if (auth_version
> primary_version
) {
1671 approx_missing_objects
+= auth_version
- primary_version
;
1673 approx_missing_objects
+= primary_version
- auth_version
;
1675 if ((uint64_t)approx_missing_objects
>
1676 force_auth_primary_missing_objects
) {
1677 primary
= auth_log_shard
;
1678 ss
<< "up_primary: " << up_primary
<< ") has approximate "
1679 << approx_missing_objects
1680 << "(>" << force_auth_primary_missing_objects
<<") "
1681 << "missing objects, osd." << auth_log_shard_id
1682 << " selected as primary instead"
1685 ss
<< "up_primary: " << up_primary
<< ") selected as primary"
1689 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
1692 ceph_assert(!auth_log_shard
->second
.is_incomplete());
1693 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1694 << " selected as primary instead" << std::endl
;
1695 primary
= auth_log_shard
;
1698 ss
<< __func__
<< " primary is osd." << primary
->first
1699 << " with " << primary
->second
<< std::endl
;
1700 want
->push_back(primary
->first
.osd
);
1701 acting_backfill
->insert(primary
->first
);
1703 /* We include auth_log_shard->second.log_tail because in GetLog,
1704 * we will request logs back to the min last_update over our
1705 * acting_backfill set, which will result in our log being extended
1706 * as far backwards as necessary to pick up any peers which can
1707 * be log recovered by auth_log_shard's log */
1708 eversion_t oldest_auth_log_entry
=
1709 std::min(primary
->second
.log_tail
, auth_log_shard
->second
.log_tail
);
1711 // select replicas that have log contiguity with primary.
1712 // prefer up, then acting, then any peer_info osds
1714 pg_shard_t up_cand
= pg_shard_t(i
, shard_id_t::NO_SHARD
);
1715 if (up_cand
== primary
->first
)
1717 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1718 if (cur_info
.is_incomplete() ||
1719 cur_info
.last_update
< oldest_auth_log_entry
) {
1720 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1721 backfill
->insert(up_cand
);
1722 acting_backfill
->insert(up_cand
);
1725 acting_backfill
->insert(up_cand
);
1726 ss
<< " osd." << i
<< " (up) accepted " << cur_info
<< std::endl
;
1730 if (want
->size() >= size
) {
1734 std::vector
<std::pair
<eversion_t
, int>> candidate_by_last_update
;
1735 candidate_by_last_update
.reserve(acting
.size());
1736 // This no longer has backfill OSDs, but they are covered above.
1737 for (auto i
: acting
) {
1738 pg_shard_t
acting_cand(i
, shard_id_t::NO_SHARD
);
1739 // skip up osds we already considered above
1740 if (acting_cand
== primary
->first
)
1742 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
);
1743 if (up_it
!= up
.end())
1746 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1747 if (cur_info
.is_incomplete() ||
1748 cur_info
.last_update
< oldest_auth_log_entry
) {
1749 ss
<< " shard " << acting_cand
<< " (acting) REJECTED "
1750 << cur_info
<< std::endl
;
1752 candidate_by_last_update
.emplace_back(cur_info
.last_update
, i
);
1756 auto sort_by_eversion
=[](const std::pair
<eversion_t
, int> &lhs
,
1757 const std::pair
<eversion_t
, int> &rhs
) {
1758 return lhs
.first
> rhs
.first
;
1760 // sort by last_update, in descending order.
1761 std::sort(candidate_by_last_update
.begin(),
1762 candidate_by_last_update
.end(), sort_by_eversion
);
1763 for (auto &p
: candidate_by_last_update
) {
1764 ceph_assert(want
->size() < size
);
1765 want
->push_back(p
.second
);
1766 pg_shard_t s
= pg_shard_t(p
.second
, shard_id_t::NO_SHARD
);
1767 acting_backfill
->insert(s
);
1768 ss
<< " shard " << s
<< " (acting) accepted "
1769 << all_info
.find(s
)->second
<< std::endl
;
1770 if (want
->size() >= size
) {
1775 if (restrict_to_up_acting
) {
1778 candidate_by_last_update
.clear();
1779 candidate_by_last_update
.reserve(all_info
.size()); // overestimate but fine
1780 // continue to search stray to find more suitable peers
1781 for (auto &i
: all_info
) {
1782 // skip up osds we already considered above
1783 if (i
.first
== primary
->first
)
1785 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
.first
.osd
);
1786 if (up_it
!= up
.end())
1788 vector
<int>::const_iterator acting_it
= find(
1789 acting
.begin(), acting
.end(), i
.first
.osd
);
1790 if (acting_it
!= acting
.end())
1793 if (i
.second
.is_incomplete() ||
1794 i
.second
.last_update
< oldest_auth_log_entry
) {
1795 ss
<< " shard " << i
.first
<< " (stray) REJECTED " << i
.second
1798 candidate_by_last_update
.emplace_back(
1799 i
.second
.last_update
, i
.first
.osd
);
1803 if (candidate_by_last_update
.empty()) {
1804 // save us some effort
1808 // sort by last_update, in descending order.
1809 std::sort(candidate_by_last_update
.begin(),
1810 candidate_by_last_update
.end(), sort_by_eversion
);
1812 for (auto &p
: candidate_by_last_update
) {
1813 ceph_assert(want
->size() < size
);
1814 want
->push_back(p
.second
);
1815 pg_shard_t s
= pg_shard_t(p
.second
, shard_id_t::NO_SHARD
);
1816 acting_backfill
->insert(s
);
1817 ss
<< " shard " << s
<< " (stray) accepted "
1818 << all_info
.find(s
)->second
<< std::endl
;
1819 if (want
->size() >= size
) {
1825 bool PeeringState::recoverable(const vector
<int> &want
) const
1827 unsigned num_want_acting
= 0;
1828 set
<pg_shard_t
> have
;
1829 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1830 if (want
[i
] != CRUSH_ITEM_NONE
) {
1835 pool
.info
.is_erasure() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1839 if (num_want_acting
< pool
.info
.min_size
) {
1840 const bool recovery_ec_pool_below_min_size
=
1841 HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS
);
1843 if (pool
.info
.is_erasure() && !recovery_ec_pool_below_min_size
) {
1844 psdout(10) << __func__
<< " failed, ec recovery below min size not supported by pre-octopus" << dendl
;
1846 } else if (!cct
->_conf
.get_val
<bool>("osd_allow_recovery_below_min_size")) {
1847 psdout(10) << __func__
<< " failed, recovery below min size not enabled" << dendl
;
1851 if (missing_loc
.get_recoverable_predicate()(have
)) {
1854 psdout(10) << __func__
<< " failed, not recoverable " << dendl
;
1859 void PeeringState::choose_async_recovery_ec(
1860 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1861 const pg_info_t
&auth_info
,
1863 set
<pg_shard_t
> *async_recovery
,
1864 const OSDMapRef osdmap
) const
1866 set
<pair
<int, pg_shard_t
> > candidates_by_cost
;
1867 for (uint8_t i
= 0; i
< want
->size(); ++i
) {
1868 if ((*want
)[i
] == CRUSH_ITEM_NONE
)
1871 // Considering log entries to recover is accurate enough for
1872 // now. We could use minimum_to_decode_with_cost() later if
1874 pg_shard_t
shard_i((*want
)[i
], shard_id_t(i
));
1875 // do not include strays
1876 if (stray_set
.find(shard_i
) != stray_set
.end())
1878 // Do not include an osd that is not up, since choosing it as
1879 // an async_recovery_target will move it out of the acting set.
1880 // This results in it being identified as a stray during peering,
1881 // because it is no longer in the up or acting set.
1882 if (!is_up(shard_i
))
1884 auto shard_info
= all_info
.find(shard_i
)->second
;
1885 // for ec pools we rollback all entries past the authoritative
1886 // last_update *before* activation. This is relatively inexpensive
1887 // compared to recovery, since it is purely local, so treat shards
1888 // past the authoritative last_update the same as those equal to it.
1889 version_t auth_version
= auth_info
.last_update
.version
;
1890 version_t candidate_version
= shard_info
.last_update
.version
;
1891 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1892 auto approx_missing_objects
=
1893 shard_info
.stats
.stats
.sum
.num_objects_missing
;
1894 if (auth_version
> candidate_version
) {
1895 approx_missing_objects
+= auth_version
- candidate_version
;
1897 if (static_cast<uint64_t>(approx_missing_objects
) >
1898 cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1899 candidates_by_cost
.emplace(approx_missing_objects
, shard_i
);
1902 if (auth_version
> candidate_version
&&
1903 (auth_version
- candidate_version
) > cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1904 candidates_by_cost
.insert(make_pair(auth_version
- candidate_version
, shard_i
));
1909 psdout(20) << __func__
<< " candidates by cost are: " << candidates_by_cost
1912 // take out as many osds as we can for async recovery, in order of cost
1913 for (auto rit
= candidates_by_cost
.rbegin();
1914 rit
!= candidates_by_cost
.rend(); ++rit
) {
1915 pg_shard_t cur_shard
= rit
->second
;
1916 vector
<int> candidate_want(*want
);
1917 candidate_want
[cur_shard
.shard
.id
] = CRUSH_ITEM_NONE
;
1918 if (recoverable(candidate_want
)) {
1919 want
->swap(candidate_want
);
1920 async_recovery
->insert(cur_shard
);
1923 psdout(20) << __func__
<< " result want=" << *want
1924 << " async_recovery=" << *async_recovery
<< dendl
;
1927 void PeeringState::choose_async_recovery_replicated(
1928 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1929 const pg_info_t
&auth_info
,
1931 set
<pg_shard_t
> *async_recovery
,
1932 const OSDMapRef osdmap
) const
1934 set
<pair
<int, pg_shard_t
> > candidates_by_cost
;
1935 for (auto osd_num
: *want
) {
1936 pg_shard_t
shard_i(osd_num
, shard_id_t::NO_SHARD
);
1937 // do not include strays
1938 if (stray_set
.find(shard_i
) != stray_set
.end())
1940 // Do not include an osd that is not up, since choosing it as
1941 // an async_recovery_target will move it out of the acting set.
1942 // This results in it being identified as a stray during peering,
1943 // because it is no longer in the up or acting set.
1944 if (!is_up(shard_i
))
1946 auto shard_info
= all_info
.find(shard_i
)->second
;
1947 // use the approximate magnitude of the difference in length of
1948 // logs plus historical missing objects as the cost of recovery
1949 version_t auth_version
= auth_info
.last_update
.version
;
1950 version_t candidate_version
= shard_info
.last_update
.version
;
1951 if (HAVE_FEATURE(osdmap
->get_up_osd_features(), SERVER_NAUTILUS
)) {
1952 auto approx_missing_objects
=
1953 shard_info
.stats
.stats
.sum
.num_objects_missing
;
1954 if (auth_version
> candidate_version
) {
1955 approx_missing_objects
+= auth_version
- candidate_version
;
1957 approx_missing_objects
+= candidate_version
- auth_version
;
1959 if (static_cast<uint64_t>(approx_missing_objects
) >
1960 cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1961 candidates_by_cost
.emplace(approx_missing_objects
, shard_i
);
1964 size_t approx_entries
;
1965 if (auth_version
> candidate_version
) {
1966 approx_entries
= auth_version
- candidate_version
;
1968 approx_entries
= candidate_version
- auth_version
;
1970 if (approx_entries
> cct
->_conf
.get_val
<uint64_t>("osd_async_recovery_min_cost")) {
1971 candidates_by_cost
.insert(make_pair(approx_entries
, shard_i
));
1976 psdout(20) << __func__
<< " candidates by cost are: " << candidates_by_cost
1978 // take out as many osds as we can for async recovery, in order of cost
1979 for (auto rit
= candidates_by_cost
.rbegin();
1980 rit
!= candidates_by_cost
.rend(); ++rit
) {
1981 if (want
->size() <= pool
.info
.min_size
) {
1984 pg_shard_t cur_shard
= rit
->second
;
1985 vector
<int> candidate_want(*want
);
1986 for (auto it
= candidate_want
.begin(); it
!= candidate_want
.end(); ++it
) {
1987 if (*it
== cur_shard
.osd
) {
1988 candidate_want
.erase(it
);
1989 want
->swap(candidate_want
);
1990 async_recovery
->insert(cur_shard
);
1995 psdout(20) << __func__
<< " result want=" << *want
1996 << " async_recovery=" << *async_recovery
<< dendl
;
2004 * calculate the desired acting, and request a change with the monitor
2005 * if it differs from the current acting.
2007 * if restrict_to_up_acting=true, we filter out anything that's not in
2008 * up/acting. in order to lift this restriction, we need to
2009 * 1) check whether it's worth switching the acting set any time we get
2010 * a new pg info (not just here, when recovery finishes)
2011 * 2) check whether anything in want_acting went down on each new map
2012 * (and, if so, calculate a new want_acting)
2013 * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2016 bool PeeringState::choose_acting(pg_shard_t
&auth_log_shard_id
,
2017 bool restrict_to_up_acting
,
2018 bool *history_les_bound
,
2019 bool request_pg_temp_change_only
)
2021 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
2022 all_info
[pg_whoami
] = info
;
2024 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 10>()) {
2025 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
2026 p
!= all_info
.end();
2028 psdout(10) << __func__
<< " all_info osd." << p
->first
<< " "
2029 << p
->second
<< dendl
;
2033 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
2034 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
2036 if (auth_log_shard
== all_info
.end()) {
2038 psdout(10) << __func__
<< " no suitable info found (incomplete backfills?),"
2039 << " reverting to up" << dendl
;
2042 pl
->queue_want_pg_temp(empty
);
2044 psdout(10) << __func__
<< " failed" << dendl
;
2045 ceph_assert(want_acting
.empty());
2050 ceph_assert(!auth_log_shard
->second
.is_incomplete());
2051 auth_log_shard_id
= auth_log_shard
->first
;
2053 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
2056 if (pool
.info
.is_replicated())
2057 calc_replicated_acting(
2059 cct
->_conf
.get_val
<uint64_t>(
2060 "osd_force_auth_primary_missing_objects"),
2061 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
2066 restrict_to_up_acting
,
2069 &want_acting_backfill
,
2075 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
2079 restrict_to_up_acting
,
2082 &want_acting_backfill
,
2084 psdout(10) << ss
.str() << dendl
;
2086 if (!recoverable(want
)) {
2087 want_acting
.clear();
2091 set
<pg_shard_t
> want_async_recovery
;
2092 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC
)) {
2093 if (pool
.info
.is_erasure()) {
2094 choose_async_recovery_ec(
2095 all_info
, auth_log_shard
->second
, &want
, &want_async_recovery
,
2098 choose_async_recovery_replicated(
2099 all_info
, auth_log_shard
->second
, &want
, &want_async_recovery
,
2103 while (want
.size() > pool
.info
.size
) {
2104 // async recovery should have taken out as many osds as it can.
2105 // if not, then always evict the last peer
2106 // (will get synchronously recovered later)
2107 psdout(10) << __func__
<< " evicting osd." << want
.back()
2108 << " from oversized want " << want
<< dendl
;
2111 if (want
!= acting
) {
2112 psdout(10) << __func__
<< " want " << want
<< " != acting " << acting
2113 << ", requesting pg_temp change" << dendl
;
2116 if (!cct
->_conf
->osd_debug_no_acting_change
) {
2117 if (want_acting
== up
) {
2118 // There can't be any pending backfill if
2119 // want is the same as crush map up OSDs.
2120 ceph_assert(want_backfill
.empty());
2122 pl
->queue_want_pg_temp(empty
);
2124 pl
->queue_want_pg_temp(want
);
2128 if (request_pg_temp_change_only
)
2130 want_acting
.clear();
2131 acting_recovery_backfill
= want_acting_backfill
;
2132 psdout(10) << "acting_recovery_backfill is "
2133 << acting_recovery_backfill
<< dendl
;
2135 backfill_targets
.empty() ||
2136 backfill_targets
== want_backfill
);
2137 if (backfill_targets
.empty()) {
2138 // Caller is GetInfo
2139 backfill_targets
= want_backfill
;
2141 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2143 async_recovery_targets
.empty() ||
2144 async_recovery_targets
== want_async_recovery
||
2146 if (async_recovery_targets
.empty() || !needs_recovery()) {
2147 async_recovery_targets
= want_async_recovery
;
2149 // Will not change if already set because up would have had to change
2150 // Verify that nothing in backfill is in stray_set
2151 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
2152 i
!= want_backfill
.end();
2154 ceph_assert(stray_set
.find(*i
) == stray_set
.end());
2156 psdout(10) << "choose_acting want=" << want
<< " backfill_targets="
2157 << want_backfill
<< " async_recovery_targets="
2158 << async_recovery_targets
<< dendl
;
2162 void PeeringState::log_weirdness()
2164 if (pg_log
.get_tail() != info
.log_tail
)
2165 pl
->get_clog_error() << info
.pgid
2166 << " info mismatch, log.tail " << pg_log
.get_tail()
2167 << " != info.log_tail " << info
.log_tail
;
2168 if (pg_log
.get_head() != info
.last_update
)
2169 pl
->get_clog_error() << info
.pgid
2170 << " info mismatch, log.head " << pg_log
.get_head()
2171 << " != info.last_update " << info
.last_update
;
2173 if (!pg_log
.get_log().empty()) {
2175 if ((pg_log
.get_log().log
.begin()->version
<= pg_log
.get_tail()))
2176 pl
->get_clog_error() << info
.pgid
2177 << " log bound mismatch, info (tail,head] ("
2178 << pg_log
.get_tail() << ","
2179 << pg_log
.get_head() << "]"
2181 << pg_log
.get_log().log
.begin()->version
<< ","
2182 << pg_log
.get_log().log
.rbegin()->version
<< "]";
2185 if (pg_log
.get_log().caller_ops
.size() > pg_log
.get_log().log
.size()) {
2186 pl
->get_clog_error() << info
.pgid
2187 << " caller_ops.size "
2188 << pg_log
.get_log().caller_ops
.size()
2189 << " > log size " << pg_log
.get_log().log
.size();
2194 * Process information from a replica to determine if it could have any
2195 * objects that i need.
2197 * TODO: if the missing set becomes very large, this could get expensive.
2198 * Instead, we probably want to just iterate over our unfound set.
2200 bool PeeringState::search_for_missing(
2201 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
2203 PeeringCtxWrapper
&ctx
)
2205 uint64_t num_unfound_before
= missing_loc
.num_unfound();
2206 bool found_missing
= missing_loc
.add_source_info(
2207 from
, oinfo
, omissing
, ctx
.handle
);
2208 if (found_missing
&& num_unfound_before
!= missing_loc
.num_unfound())
2209 pl
->publish_stats_to_osd();
2210 // avoid doing this if the peer is empty. This is abit of paranoia
2211 // to avoid doing something rash if add_source_info() above
2212 // incorrectly decided we found something new. (if the peer has
2213 // last_update=0'0 that's impossible.)
2214 if (found_missing
&&
2215 oinfo
.last_update
!= eversion_t()) {
2216 pg_info_t
tinfo(oinfo
);
2217 tinfo
.pgid
.shard
= pg_whoami
.shard
;
2220 spg_t(info
.pgid
.pgid
, from
.shard
),
2221 get_osdmap_epoch(), // fixme: use lower epoch?
2225 return found_missing
;
2228 bool PeeringState::discover_all_missing(
2229 BufferedRecoveryMessages
&rctx
)
2231 auto &missing
= pg_log
.get_missing();
2232 uint64_t unfound
= get_num_unfound();
2233 bool any
= false; // did we start any queries
2235 psdout(10) << __func__
<< " "
2236 << missing
.num_missing() << " missing, "
2237 << unfound
<< " unfound"
2240 std::set
<pg_shard_t
>::const_iterator m
= might_have_unfound
.begin();
2241 std::set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
2242 for (; m
!= mend
; ++m
) {
2243 pg_shard_t
peer(*m
);
2245 if (!get_osdmap()->is_up(peer
.osd
)) {
2246 psdout(20) << __func__
<< " skipping down osd." << peer
<< dendl
;
2250 if (peer_purged
.count(peer
)) {
2251 psdout(20) << __func__
<< " skipping purged osd." << peer
<< dendl
;
2255 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(peer
);
2256 if (iter
!= peer_info
.end() &&
2257 (iter
->second
.is_empty() || iter
->second
.dne())) {
2258 // ignore empty peers
2262 // If we've requested any of this stuff, the pg_missing_t information
2263 // should be on its way.
2264 // TODO: coalsce requested_* into a single data structure
2265 if (peer_missing
.find(peer
) != peer_missing
.end()) {
2266 psdout(20) << __func__
<< ": osd." << peer
2267 << ": we already have pg_missing_t" << dendl
;
2270 if (peer_log_requested
.find(peer
) != peer_log_requested
.end()) {
2271 psdout(20) << __func__
<< ": osd." << peer
2272 << ": in peer_log_requested" << dendl
;
2275 if (peer_missing_requested
.find(peer
) != peer_missing_requested
.end()) {
2276 psdout(20) << __func__
<< ": osd." << peer
2277 << ": in peer_missing_requested" << dendl
;
2282 psdout(10) << __func__
<< ": osd." << peer
<< ": requesting pg_missing_t"
2284 peer_missing_requested
.insert(peer
);
2287 spg_t(info
.pgid
.pgid
, peer
.shard
),
2289 pg_query_t::FULLLOG
,
2290 peer
.shard
, pg_whoami
.shard
,
2291 info
.history
, get_osdmap_epoch()));
/* Build the might_have_unfound set.
 *
 * This is used by the primary OSD during recovery.
 *
 * This set tracks the OSDs which might have unfound objects that the primary
 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound,
 * we will remove that OSD from the set.
 */
2305 void PeeringState::build_might_have_unfound()
2307 ceph_assert(might_have_unfound
.empty());
2308 ceph_assert(is_primary());
2310 psdout(10) << __func__
<< dendl
;
2312 check_past_interval_bounds();
2314 might_have_unfound
= past_intervals
.get_might_have_unfound(
2316 pool
.info
.is_erasure());
2318 // include any (stray) peers
2319 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
2320 p
!= peer_info
.end();
2322 might_have_unfound
.insert(p
->first
);
2324 psdout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
2327 void PeeringState::activate(
2328 ObjectStore::Transaction
& t
,
2329 epoch_t activation_epoch
,
2330 PeeringCtxWrapper
&ctx
)
2332 ceph_assert(!is_peered());
2335 state_clear(PG_STATE_DOWN
);
2337 send_notify
= false;
2340 // only update primary last_epoch_started if we will go active
2341 if (actingset
.size() >= pool
.info
.min_size
) {
2342 ceph_assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
2343 info
.last_epoch_started
<= activation_epoch
);
2344 info
.last_epoch_started
= activation_epoch
;
2345 info
.last_interval_started
= info
.history
.same_interval_since
;
2347 } else if (is_acting(pg_whoami
)) {
2348 /* update last_epoch_started on acting replica to whatever the primary sent
2349 * unless it's smaller (could happen if we are going peered rather than
2350 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
2351 if (info
.last_epoch_started
< activation_epoch
) {
2352 info
.last_epoch_started
= activation_epoch
;
2353 info
.last_interval_started
= info
.history
.same_interval_since
;
2357 auto &missing
= pg_log
.get_missing();
2359 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
2361 last_update_ondisk
= info
.last_update
;
2363 last_update_applied
= info
.last_update
;
2364 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
2366 need_up_thru
= false;
2368 // write pg info, log
2370 dirty_big_info
= true; // maybe
2372 pl
->schedule_event_on_commit(
2374 std::make_shared
<PGPeeringEvent
>(
2379 activation_epoch
)));
2381 // init complete pointer
2382 if (missing
.num_missing() == 0) {
2383 psdout(10) << "activate - no missing, moving last_complete " << info
.last_complete
2384 << " -> " << info
.last_update
<< dendl
;
2385 info
.last_complete
= info
.last_update
;
2386 info
.stats
.stats
.sum
.num_objects_missing
= 0;
2387 pg_log
.reset_recovery_pointers();
2389 psdout(10) << "activate - not complete, " << missing
<< dendl
;
2390 info
.stats
.stats
.sum
.num_objects_missing
= missing
.num_missing();
2391 pg_log
.activate_not_complete(info
);
2397 // initialize snap_trimq
2398 interval_set
<snapid_t
> to_trim
;
2399 auto& removed_snaps_queue
= get_osdmap()->get_removed_snaps_queue();
2400 auto p
= removed_snaps_queue
.find(info
.pgid
.pgid
.pool());
2401 if (p
!= removed_snaps_queue
.end()) {
2402 dout(20) << "activate - purged_snaps " << info
.purged_snaps
2403 << " removed_snaps " << p
->second
2405 for (auto q
: p
->second
) {
2406 to_trim
.insert(q
.first
, q
.second
);
2409 interval_set
<snapid_t
> purged
;
2410 purged
.intersection_of(to_trim
, info
.purged_snaps
);
2411 to_trim
.subtract(purged
);
2413 if (HAVE_FEATURE(upacting_features
, SERVER_OCTOPUS
)) {
2414 renew_lease(pl
->get_mnow());
2415 // do not schedule until we are actually activated
2418 // adjust purged_snaps: PG may have been inactive while snaps were pruned
2419 // from the removed_snaps_queue in the osdmap. update local purged_snaps
2420 // reflect only those snaps that we thought were pruned and were still in
2422 info
.purged_snaps
.swap(purged
);
2424 // start up replicas
2425 info
.history
.refresh_prior_readable_until_ub(pl
->get_mnow(),
2426 prior_readable_until_ub
);
2428 ceph_assert(!acting_recovery_backfill
.empty());
2429 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2430 i
!= acting_recovery_backfill
.end();
2432 if (*i
== pg_whoami
) continue;
2433 pg_shard_t peer
= *i
;
2434 ceph_assert(peer_info
.count(peer
));
2435 pg_info_t
& pi
= peer_info
[peer
];
2437 psdout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
2440 ceph_assert(peer_missing
.count(peer
));
2441 pg_missing_t
& pm
= peer_missing
[peer
];
2443 bool needs_past_intervals
= pi
.dne();
2445 if (pi
.last_update
== info
.last_update
) {
2447 if (!pi
.last_backfill
.is_max())
2448 pl
->get_clog_info() << info
.pgid
<< " continuing backfill to osd."
2450 << " from (" << pi
.log_tail
<< "," << pi
.last_update
2451 << "] " << pi
.last_backfill
2452 << " to " << info
.last_update
;
2453 if (!pi
.is_empty()) {
2454 psdout(10) << "activate peer osd." << peer
2455 << " is up to date, queueing in pending_activators" << dendl
;
2458 spg_t(info
.pgid
.pgid
, peer
.shard
),
2459 get_osdmap_epoch(), // fixme: use lower epoch?
2464 psdout(10) << "activate peer osd." << peer
2465 << " is up to date, but sending pg_log anyway" << dendl
;
2467 i
->shard
, pg_whoami
.shard
,
2468 get_osdmap_epoch(), info
,
2469 last_peering_reset
);
2472 pg_log
.get_tail() > pi
.last_update
||
2473 pi
.last_backfill
== hobject_t() ||
2474 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
2475 /* ^ This last case covers a situation where a replica is not contiguous
2476 * with the auth_log, but is contiguous with this replica. Reshuffling
2477 * the active set to handle this would be tricky, so instead we just go
2478 * ahead and backfill it anyway. This is probably preferrable in any
2479 * case since the replica in question would have to be significantly
2483 pl
->get_clog_debug() << info
.pgid
<< " starting backfill to osd." << peer
2484 << " from (" << pi
.log_tail
<< "," << pi
.last_update
2485 << "] " << pi
.last_backfill
2486 << " to " << info
.last_update
;
2488 pi
.last_update
= info
.last_update
;
2489 pi
.last_complete
= info
.last_update
;
2490 pi
.set_last_backfill(hobject_t());
2491 pi
.last_epoch_started
= info
.last_epoch_started
;
2492 pi
.last_interval_started
= info
.last_interval_started
;
2493 pi
.history
= info
.history
;
2494 pi
.hit_set
= info
.hit_set
;
2495 // Save num_bytes for reservation request, can't be negative
2496 peer_bytes
[peer
] = std::max
<int64_t>(0, pi
.stats
.stats
.sum
.num_bytes
);
2497 pi
.stats
.stats
.clear();
2498 pi
.stats
.stats
.sum
.num_bytes
= peer_bytes
[peer
];
2500 // initialize peer with our purged_snaps.
2501 pi
.purged_snaps
= info
.purged_snaps
;
2504 i
->shard
, pg_whoami
.shard
,
2505 get_osdmap_epoch(), pi
,
2506 last_peering_reset
/* epoch to create pg at */);
2508 // send some recent log, so that op dup detection works well.
2509 m
->log
.copy_up_to(cct
, pg_log
.get_log(),
2510 cct
->_conf
->osd_max_pg_log_entries
);
2511 m
->info
.log_tail
= m
->log
.tail
;
2512 pi
.log_tail
= m
->log
.tail
; // sigh...
2517 ceph_assert(pg_log
.get_tail() <= pi
.last_update
);
2519 i
->shard
, pg_whoami
.shard
,
2520 get_osdmap_epoch(), info
,
2521 last_peering_reset
/* epoch to create pg at */);
2522 // send new stuff to append to replicas log
2523 m
->log
.copy_after(cct
, pg_log
.get_log(), pi
.last_update
);
2526 // share past_intervals if we are creating the pg on the replica
2527 // based on whether our info for that peer was dne() *before*
2528 // updating pi.history in the backfill block above.
2529 if (m
&& needs_past_intervals
)
2530 m
->past_intervals
= past_intervals
;
2532 // update local version of peer's missing list!
2533 if (m
&& pi
.last_backfill
!= hobject_t()) {
2534 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
2535 p
!= m
->log
.log
.end();
2537 if (p
->soid
<= pi
.last_backfill
&&
2539 if (perform_deletes_during_peering() && p
->is_delete()) {
2540 pm
.rm(p
->soid
, p
->version
);
2542 pm
.add_next_event(*p
);
2549 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
2551 m
->lease
= get_lease();
2552 pl
->send_cluster_message(peer
.osd
, m
, get_osdmap_epoch());
2556 pi
.last_update
= info
.last_update
;
2558 // update our missing
2559 if (pm
.num_missing() == 0) {
2560 pi
.last_complete
= pi
.last_update
;
2561 psdout(10) << "activate peer osd." << peer
<< " " << pi
2562 << " uptodate" << dendl
;
2564 psdout(10) << "activate peer osd." << peer
<< " " << pi
2565 << " missing " << pm
<< dendl
;
2569 // Set up missing_loc
2570 set
<pg_shard_t
> complete_shards
;
2571 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2572 i
!= acting_recovery_backfill
.end();
2574 psdout(20) << __func__
<< " setting up missing_loc from shard " << *i
2576 if (*i
== get_primary()) {
2577 missing_loc
.add_active_missing(missing
);
2578 if (!missing
.have_missing())
2579 complete_shards
.insert(*i
);
2581 auto peer_missing_entry
= peer_missing
.find(*i
);
2582 ceph_assert(peer_missing_entry
!= peer_missing
.end());
2583 missing_loc
.add_active_missing(peer_missing_entry
->second
);
2584 if (!peer_missing_entry
->second
.have_missing() &&
2585 peer_info
[*i
].last_backfill
.is_max())
2586 complete_shards
.insert(*i
);
2590 // If necessary, create might_have_unfound to help us find our unfound objects.
2591 // NOTE: It's important that we build might_have_unfound before trimming the
2593 might_have_unfound
.clear();
2594 if (needs_recovery()) {
2595 // If only one shard has missing, we do a trick to add all others as recovery
2596 // source, this is considered safe since the PGLogs have been merged locally,
2597 // and covers vast majority of the use cases, like one OSD/host is down for
2598 // a while for hardware repairing
2599 if (complete_shards
.size() + 1 == acting_recovery_backfill
.size()) {
2600 missing_loc
.add_batch_sources_info(complete_shards
, ctx
.handle
);
2602 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
2604 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
2605 i
!= acting_recovery_backfill
.end();
2607 if (*i
== pg_whoami
) continue;
2608 psdout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
2609 ceph_assert(peer_missing
.count(*i
));
2610 ceph_assert(peer_info
.count(*i
));
2611 missing_loc
.add_source_info(
2618 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
2619 i
!= peer_missing
.end();
2621 if (is_acting_recovery_backfill(i
->first
))
2623 ceph_assert(peer_info
.count(i
->first
));
2625 peer_info
[i
->first
],
2631 build_might_have_unfound();
2633 // Always call now so update_calc_stats() will be accurate
2634 discover_all_missing(ctx
.msgs
);
2638 // num_objects_degraded if calculated should reflect this too, unless no
2639 // missing and we are about to go clean.
2640 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
2641 state_set(PG_STATE_UNDERSIZED
);
2644 state_set(PG_STATE_ACTIVATING
);
2645 pl
->on_activate(std::move(to_trim
));
2647 if (actingset
.size() >= pool
.info
.min_size
) {
2648 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
2649 pg_log
.roll_forward(rollbacker
.get());
2653 void PeeringState::share_pg_info()
2655 psdout(10) << "share_pg_info" << dendl
;
2657 info
.history
.refresh_prior_readable_until_ub(pl
->get_mnow(),
2658 prior_readable_until_ub
);
2660 // share new pg_info_t with replicas
2661 ceph_assert(!acting_recovery_backfill
.empty());
2662 for (auto pg_shard
: acting_recovery_backfill
) {
2663 if (pg_shard
== pg_whoami
) continue;
2664 if (auto peer
= peer_info
.find(pg_shard
); peer
!= peer_info
.end()) {
2665 peer
->second
.last_epoch_started
= info
.last_epoch_started
;
2666 peer
->second
.last_interval_started
= info
.last_interval_started
;
2667 peer
->second
.history
.merge(info
.history
);
2669 Message
* m
= nullptr;
2670 if (last_require_osd_release
>= ceph_release_t::octopus
) {
2671 m
= new MOSDPGInfo2
{spg_t
{info
.pgid
.pgid
, pg_shard
.shard
},
2677 m
= new MOSDPGInfo
{get_osdmap_epoch(),
2678 {pg_notify_t
{pg_shard
.shard
,
2685 pl
->send_cluster_message(pg_shard
.osd
, m
, get_osdmap_epoch());
2689 void PeeringState::merge_log(
2690 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
,
2693 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
2695 oinfo
, olog
, from
, info
, rollbacker
.get(), dirty_info
, dirty_big_info
);
2698 void PeeringState::rewind_divergent_log(
2699 ObjectStore::Transaction
& t
, eversion_t newhead
)
2701 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
2702 pg_log
.rewind_divergent_log(
2703 newhead
, info
, rollbacker
.get(), dirty_info
, dirty_big_info
);
2707 void PeeringState::proc_primary_info(
2708 ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
2710 ceph_assert(!is_primary());
2712 update_history(oinfo
.history
);
2713 if (!info
.stats
.stats_invalid
&& info
.stats
.stats
.sum
.num_scrub_errors
) {
2714 info
.stats
.stats
.sum
.num_scrub_errors
= 0;
2715 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= 0;
2716 info
.stats
.stats
.sum
.num_deep_scrub_errors
= 0;
2720 if (!(info
.purged_snaps
== oinfo
.purged_snaps
)) {
2721 psdout(10) << __func__
<< " updating purged_snaps to "
2722 << oinfo
.purged_snaps
2724 info
.purged_snaps
= oinfo
.purged_snaps
;
2726 dirty_big_info
= true;
2730 void PeeringState::proc_master_log(
2731 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
2732 pg_log_t
&olog
, pg_missing_t
& omissing
, pg_shard_t from
)
2734 psdout(10) << "proc_master_log for osd." << from
<< ": "
2735 << olog
<< " " << omissing
<< dendl
;
2736 ceph_assert(!is_peered() && is_primary());
2738 // merge log into our own log to build master log. no need to
2739 // make any adjustments to their missing map; we are taking their
2740 // log to be authoritative (i.e., their entries are by definitely
2742 merge_log(t
, oinfo
, olog
, from
);
2743 peer_info
[from
] = oinfo
;
2744 psdout(10) << " peer osd." << from
<< " now " << oinfo
2745 << " " << omissing
<< dendl
;
2746 might_have_unfound
.insert(from
);
2748 // See doc/dev/osd_internals/last_epoch_started
2749 if (oinfo
.last_epoch_started
> info
.last_epoch_started
) {
2750 info
.last_epoch_started
= oinfo
.last_epoch_started
;
2753 if (oinfo
.last_interval_started
> info
.last_interval_started
) {
2754 info
.last_interval_started
= oinfo
.last_interval_started
;
2757 update_history(oinfo
.history
);
2758 ceph_assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
2759 info
.last_epoch_started
>= info
.history
.last_epoch_started
);
2761 peer_missing
[from
].claim(omissing
);
2764 void PeeringState::proc_replica_log(
2766 const pg_log_t
&olog
,
2767 pg_missing_t
& omissing
,
2770 psdout(10) << "proc_replica_log for osd." << from
<< ": "
2771 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
2773 pg_log
.proc_replica_log(oinfo
, olog
, omissing
, from
);
2775 peer_info
[from
] = oinfo
;
2776 psdout(10) << " peer osd." << from
<< " now "
2777 << oinfo
<< " " << omissing
<< dendl
;
2778 might_have_unfound
.insert(from
);
2780 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
2781 omissing
.get_items().begin();
2782 i
!= omissing
.get_items().end();
2784 psdout(20) << " after missing " << i
->first
2785 << " need " << i
->second
.need
2786 << " have " << i
->second
.have
<< dendl
;
2788 peer_missing
[from
].claim(omissing
);
2791 void PeeringState::fulfill_info(
2792 pg_shard_t from
, const pg_query_t
&query
,
2793 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
2795 ceph_assert(from
== primary
);
2796 ceph_assert(query
.type
== pg_query_t::INFO
);
2799 psdout(10) << "sending info" << dendl
;
2800 notify_info
= make_pair(from
, info
);
2803 void PeeringState::fulfill_log(
2804 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
2806 psdout(10) << "log request from " << from
<< dendl
;
2807 ceph_assert(from
== primary
);
2808 ceph_assert(query
.type
!= pg_query_t::INFO
);
2810 MOSDPGLog
*mlog
= new MOSDPGLog(
2811 from
.shard
, pg_whoami
.shard
,
2814 mlog
->missing
= pg_log
.get_missing();
2816 // primary -> other, when building master log
2817 if (query
.type
== pg_query_t::LOG
) {
2818 psdout(10) << " sending info+missing+log since " << query
.since
2820 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
2821 pl
->get_clog_error() << info
.pgid
<< " got broken pg_query_t::LOG since "
2823 << " when my log.tail is " << pg_log
.get_tail()
2824 << ", sending full log instead";
2825 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
2827 mlog
->log
.copy_after(cct
, pg_log
.get_log(), query
.since
);
2829 else if (query
.type
== pg_query_t::FULLLOG
) {
2830 psdout(10) << " sending info+missing+full log" << dendl
;
2831 mlog
->log
= pg_log
.get_log();
2834 psdout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
2836 pl
->send_cluster_message(from
.osd
, mlog
, get_osdmap_epoch(), true);
2839 void PeeringState::fulfill_query(const MQuery
& query
, PeeringCtxWrapper
&rctx
)
2841 if (query
.query
.type
== pg_query_t::INFO
) {
2842 pair
<pg_shard_t
, pg_info_t
> notify_info
;
2843 // note this refreshes our prior_readable_until_ub value
2844 update_history(query
.query
.history
);
2845 fulfill_info(query
.from
, query
.query
, notify_info
);
2847 notify_info
.first
.osd
,
2849 notify_info
.first
.shard
, pg_whoami
.shard
,
2855 update_history(query
.query
.history
);
2856 fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
2860 void PeeringState::try_mark_clean()
2862 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2863 state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
2864 state_set(PG_STATE_CLEAN
);
2865 info
.history
.last_epoch_clean
= get_osdmap_epoch();
2866 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2867 past_intervals
.clear();
2868 dirty_big_info
= true;
2872 if (!is_active() && is_peered()) {
2875 if (pool
.info
.is_pending_merge(info
.pgid
.pgid
, &target
)) {
2877 psdout(10) << "ready to merge (target)" << dendl
;
2878 pl
->set_ready_to_merge_target(
2880 info
.history
.last_epoch_started
,
2881 info
.history
.last_epoch_clean
);
2883 psdout(10) << "ready to merge (source)" << dendl
;
2884 pl
->set_ready_to_merge_source(info
.last_update
);
2888 psdout(10) << "not clean, not ready to merge" << dendl
;
2889 // we should have notified OSD in Active state entry point
2893 state_clear(PG_STATE_FORCED_RECOVERY
| PG_STATE_FORCED_BACKFILL
);
2896 pl
->publish_stats_to_osd();
2897 clear_recovery_state();
2900 void PeeringState::split_into(
2901 pg_t child_pgid
, PeeringState
*child
, unsigned split_bits
)
2903 child
->update_osdmap_ref(get_osdmap());
2907 pg_log
.split_into(child_pgid
, split_bits
, &(child
->pg_log
));
2908 child
->info
.last_complete
= info
.last_complete
;
2910 info
.last_update
= pg_log
.get_head();
2911 child
->info
.last_update
= child
->pg_log
.get_head();
2913 child
->info
.last_user_version
= info
.last_user_version
;
2915 info
.log_tail
= pg_log
.get_tail();
2916 child
->info
.log_tail
= child
->pg_log
.get_tail();
2918 // reset last_complete, we might have modified pg_log & missing above
2919 pg_log
.reset_complete_to(&info
);
2920 child
->pg_log
.reset_complete_to(&child
->info
);
2923 child
->info
.history
= info
.history
;
2924 child
->info
.history
.epoch_created
= get_osdmap_epoch();
2925 child
->info
.purged_snaps
= info
.purged_snaps
;
2927 if (info
.last_backfill
.is_max()) {
2928 child
->info
.set_last_backfill(hobject_t::get_max());
2930 // restart backfill on parent and child to be safe. we could
2931 // probably do better in the bitwise sort case, but it's more
2932 // fragile (there may be special work to do on backfill completion
2934 info
.set_last_backfill(hobject_t());
2935 child
->info
.set_last_backfill(hobject_t());
2936 // restarting backfill implies that the missing set is empty,
2937 // since it is only used for objects prior to last_backfill
2938 pg_log
.reset_backfill();
2939 child
->pg_log
.reset_backfill();
2942 child
->info
.stats
= info
.stats
;
2943 child
->info
.stats
.parent_split_bits
= split_bits
;
2944 info
.stats
.stats_invalid
= true;
2945 child
->info
.stats
.stats_invalid
= true;
2946 child
->info
.last_epoch_started
= info
.last_epoch_started
;
2947 child
->info
.last_interval_started
= info
.last_interval_started
;
2949 // There can't be recovery/backfill going on now
2950 int primary
, up_primary
;
2951 vector
<int> newup
, newacting
;
2952 get_osdmap()->pg_to_up_acting_osds(
2953 child
->info
.pgid
.pgid
, &newup
, &up_primary
, &newacting
, &primary
);
2954 child
->init_primary_up_acting(
2959 child
->role
= OSDMap::calc_pg_role(pg_whoami
, child
->acting
);
2961 // this comparison includes primary rank via pg_shard_t
2962 if (get_primary() != child
->get_primary())
2963 child
->info
.history
.same_primary_since
= get_osdmap_epoch();
2965 child
->info
.stats
.up
= up
;
2966 child
->info
.stats
.up_primary
= up_primary
;
2967 child
->info
.stats
.acting
= acting
;
2968 child
->info
.stats
.acting_primary
= primary
;
2969 child
->info
.stats
.mapping_epoch
= get_osdmap_epoch();
2972 child
->past_intervals
= past_intervals
;
2974 child
->on_new_interval();
2976 child
->send_notify
= !child
->is_primary();
2978 child
->dirty_info
= true;
2979 child
->dirty_big_info
= true;
2981 dirty_big_info
= true;
2984 void PeeringState::merge_from(
2985 map
<spg_t
,PeeringState
*>& sources
,
2987 unsigned split_bits
,
2988 const pg_merge_meta_t
& last_pg_merge_meta
)
2990 bool incomplete
= false;
2991 if (info
.last_complete
!= info
.last_update
||
2992 info
.is_incomplete() ||
2994 psdout(10) << __func__
<< " target incomplete" << dendl
;
2997 if (last_pg_merge_meta
.source_pgid
!= pg_t()) {
2998 if (info
.pgid
.pgid
!= last_pg_merge_meta
.source_pgid
.get_parent()) {
2999 psdout(10) << __func__
<< " target doesn't match expected parent "
3000 << last_pg_merge_meta
.source_pgid
.get_parent()
3001 << " of source_pgid " << last_pg_merge_meta
.source_pgid
3005 if (info
.last_update
!= last_pg_merge_meta
.target_version
) {
3006 psdout(10) << __func__
<< " target version doesn't match expected "
3007 << last_pg_merge_meta
.target_version
<< dendl
;
3012 PGLog::LogEntryHandlerRef handler
{pl
->get_log_handler(rctx
.transaction
)};
3013 pg_log
.roll_forward(handler
.get());
3015 info
.last_complete
= info
.last_update
; // to fake out trim()
3016 pg_log
.reset_recovery_pointers();
3017 pg_log
.trim(info
.last_update
, info
);
3019 vector
<PGLog
*> log_from
;
3020 for (auto& i
: sources
) {
3021 auto& source
= i
.second
;
3023 psdout(10) << __func__
<< " source " << i
.first
<< " missing" << dendl
;
3027 if (source
->info
.last_complete
!= source
->info
.last_update
||
3028 source
->info
.is_incomplete() ||
3029 source
->info
.dne()) {
3030 psdout(10) << __func__
<< " source " << source
->pg_whoami
3035 if (last_pg_merge_meta
.source_pgid
!= pg_t()) {
3036 if (source
->info
.pgid
.pgid
!= last_pg_merge_meta
.source_pgid
) {
3037 dout(10) << __func__
<< " source " << source
->info
.pgid
.pgid
3038 << " doesn't match expected source pgid "
3039 << last_pg_merge_meta
.source_pgid
<< dendl
;
3042 if (source
->info
.last_update
!= last_pg_merge_meta
.source_version
) {
3043 dout(10) << __func__
<< " source version doesn't match expected "
3044 << last_pg_merge_meta
.target_version
<< dendl
;
3050 PGLog::LogEntryHandlerRef handler
{
3051 source
->pl
->get_log_handler(rctx
.transaction
)};
3052 source
->pg_log
.roll_forward(handler
.get());
3053 source
->info
.last_complete
= source
->info
.last_update
; // to fake out trim()
3054 source
->pg_log
.reset_recovery_pointers();
3055 source
->pg_log
.trim(source
->info
.last_update
, source
->info
);
3056 log_from
.push_back(&source
->pg_log
);
3059 info
.stats
.add(source
->info
.stats
);
3061 // pull up last_update
3062 info
.last_update
= std::max(info
.last_update
, source
->info
.last_update
);
3064 // adopt source's PastIntervals if target has none. we can do this since
3065 // pgp_num has been reduced prior to the merge, so the OSD mappings for
3066 // the PGs are identical.
3067 if (past_intervals
.empty() && !source
->past_intervals
.empty()) {
3068 psdout(10) << __func__
<< " taking source's past_intervals" << dendl
;
3069 past_intervals
= source
->past_intervals
;
3073 info
.last_complete
= info
.last_update
;
3074 info
.log_tail
= info
.last_update
;
3076 info
.last_backfill
= hobject_t();
3080 pg_log
.merge_from(log_from
, info
.last_update
);
3082 // make sure we have a meaningful last_epoch_started/clean (if we were a
3084 if (info
.history
.epoch_created
== 0) {
3085 // start with (a) source's history, since these PGs *should* have been
3086 // remapped in concert with each other...
3087 info
.history
= sources
.begin()->second
->info
.history
;
3089 // we use the last_epoch_{started,clean} we got from
3090 // the caller, which are the epochs that were reported by the PGs were
3091 // found to be ready for merge.
3092 info
.history
.last_epoch_clean
= last_pg_merge_meta
.last_epoch_clean
;
3093 info
.history
.last_epoch_started
= last_pg_merge_meta
.last_epoch_started
;
3094 info
.last_epoch_started
= last_pg_merge_meta
.last_epoch_started
;
3095 psdout(10) << __func__
3096 << " set les/c to " << last_pg_merge_meta
.last_epoch_started
<< "/"
3097 << last_pg_merge_meta
.last_epoch_clean
3098 << " from pool last_dec_*, source pg history was "
3099 << sources
.begin()->second
->info
.history
3102 // above we have pulled down source's history and we need to check
3103 // history.epoch_created again to confirm that source is not a placeholder
3104 // too. (peering requires a sane history.same_interval_since value for any
3105 // non-newly created pg and below here we know we are basically iterating
3106 // back a series of past maps to fake a merge process, hence we need to
3107 // fix history.same_interval_since first so that start_peering_interval()
3108 // will not complain)
3109 if (info
.history
.epoch_created
== 0) {
3110 dout(10) << __func__
<< " both merge target and source are placeholders,"
3111 << " set sis to lec " << info
.history
.last_epoch_clean
3113 info
.history
.same_interval_since
= info
.history
.last_epoch_clean
;
3116 // if the past_intervals start is later than last_epoch_clean, it
3117 // implies the source repeered again but the target didn't, or
3118 // that the source became clean in a later epoch than the target.
3119 // avoid the discrepancy but adjusting the interval start
3120 // backwards to match so that check_past_interval_bounds() will
3122 auto pib
= past_intervals
.get_bounds();
3123 if (info
.history
.last_epoch_clean
< pib
.first
) {
3124 psdout(10) << __func__
<< " last_epoch_clean "
3125 << info
.history
.last_epoch_clean
<< " < past_interval start "
3126 << pib
.first
<< ", adjusting start backwards" << dendl
;
3127 past_intervals
.adjust_start_backwards(info
.history
.last_epoch_clean
);
3130 // Similarly, if the same_interval_since value is later than
3131 // last_epoch_clean, the next interval change will result in a
3132 // past_interval start that is later than last_epoch_clean. This
3133 // can happen if we use the pg_history values from the merge
3134 // source. Adjust the same_interval_since value backwards if that
3135 // happens. (We trust the les and lec values more because they came from
3136 // the real target, whereas the history value we stole from the source.)
3137 if (info
.history
.last_epoch_started
< info
.history
.same_interval_since
) {
3138 psdout(10) << __func__
<< " last_epoch_started "
3139 << info
.history
.last_epoch_started
<< " < same_interval_since "
3140 << info
.history
.same_interval_since
3141 << ", adjusting pg_history backwards" << dendl
;
3142 info
.history
.same_interval_since
= info
.history
.last_epoch_clean
;
3143 // make sure same_{up,primary}_since are <= same_interval_since
3144 info
.history
.same_up_since
= std::min(
3145 info
.history
.same_up_since
, info
.history
.same_interval_since
);
3146 info
.history
.same_primary_since
= std::min(
3147 info
.history
.same_primary_since
, info
.history
.same_interval_since
);
3152 dirty_big_info
= true;
3155 void PeeringState::start_split_stats(
3156 const set
<spg_t
>& childpgs
, vector
<object_stat_sum_t
> *out
)
3158 out
->resize(childpgs
.size() + 1);
3159 info
.stats
.stats
.sum
.split(*out
);
3162 void PeeringState::finish_split_stats(
3163 const object_stat_sum_t
& stats
, ObjectStore::Transaction
&t
)
3165 info
.stats
.stats
.sum
= stats
;
3169 void PeeringState::update_blocked_by()
3171 // set a max on the number of blocking peers we report. if we go
3172 // over, report a random subset. keep the result sorted.
3173 unsigned keep
= std::min
<unsigned>(
3174 blocked_by
.size(), cct
->_conf
->osd_max_pg_blocked_by
);
3175 unsigned skip
= blocked_by
.size() - keep
;
3176 info
.stats
.blocked_by
.clear();
3177 info
.stats
.blocked_by
.resize(keep
);
3179 for (set
<int>::iterator p
= blocked_by
.begin();
3180 p
!= blocked_by
.end() && keep
> 0;
3182 if (skip
> 0 && (rand() % (skip
+ keep
) < skip
)) {
3185 info
.stats
.blocked_by
[pos
++] = *p
;
3191 static bool find_shard(const set
<pg_shard_t
> & pgs
, shard_id_t shard
)
3194 if (p
.shard
== shard
)
3199 static pg_shard_t
get_another_shard(const set
<pg_shard_t
> & pgs
, pg_shard_t skip
, shard_id_t shard
)
3201 for (auto&p
: pgs
) {
3204 if (p
.shard
== shard
)
3207 return pg_shard_t();
3210 void PeeringState::update_calc_stats()
3212 info
.stats
.version
= info
.last_update
;
3213 info
.stats
.created
= info
.history
.epoch_created
;
3214 info
.stats
.last_scrub
= info
.history
.last_scrub
;
3215 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
3216 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
3217 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
3218 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
3219 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
3221 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
3222 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
3223 info
.stats
.log_start
= pg_log
.get_tail();
3224 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
3225 info
.stats
.snaptrimq_len
= pl
->get_snap_trimq_size();
3227 unsigned num_shards
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
3229 // In rare case that upset is too large (usually transient), use as target
3230 // for calculations below.
3231 unsigned target
= std::max(num_shards
, (unsigned)upset
.size());
3232 // For undersized actingset may be larger with OSDs out
3233 unsigned nrep
= std::max(actingset
.size(), upset
.size());
3234 // calc num_object_copies
3235 info
.stats
.stats
.calc_copies(std::max(target
, nrep
));
3236 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
3237 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
3238 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
3239 info
.stats
.avail_no_missing
.clear();
3240 info
.stats
.object_location_counts
.clear();
3242 // We should never hit this condition, but if end up hitting it,
3243 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3244 if (info
.stats
.stats
.sum
.num_objects
< 0) {
3245 psdout(0) << __func__
<< " negative num_objects = "
3246 << info
.stats
.stats
.sum
.num_objects
<< " setting it to 0 "
3248 info
.stats
.stats
.sum
.num_objects
= 0;
3249 state_set(PG_STATE_INCONSISTENT
);
3252 if ((is_remapped() || is_undersized() || !is_clean()) &&
3253 (is_peered()|| is_activating())) {
3254 psdout(20) << __func__
<< " actingset " << actingset
<< " upset "
3255 << upset
<< " acting_recovery_backfill " << acting_recovery_backfill
<< dendl
;
3257 ceph_assert(!acting_recovery_backfill
.empty());
3259 bool estimate
= false;
3261 // NOTE: we only generate degraded, misplaced and unfound
3262 // values for the summation, not individual stat categories.
3263 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
3265 // Objects missing from up nodes, sorted by # objects.
3266 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> missing_target_objects
;
3267 // Objects missing from nodes not in up, sort by # objects
3268 boost::container::flat_set
<pair
<int64_t,pg_shard_t
>> acting_source_objects
;
3270 // Fill missing_target_objects/acting_source_objects
3276 missing
= pg_log
.get_missing().num_missing();
3277 ceph_assert(acting_recovery_backfill
.count(pg_whoami
));
3278 if (upset
.count(pg_whoami
)) {
3279 missing_target_objects
.emplace(missing
, pg_whoami
);
3281 acting_source_objects
.emplace(missing
, pg_whoami
);
3283 info
.stats
.stats
.sum
.num_objects_missing_on_primary
= missing
;
3285 info
.stats
.avail_no_missing
.push_back(pg_whoami
);
3286 psdout(20) << __func__
<< " shard " << pg_whoami
3287 << " primary objects " << num_objects
3288 << " missing " << missing
3293 for (auto& peer
: peer_info
) {
3294 // Primary should not be in the peer_info, skip if it is.
3295 if (peer
.first
== pg_whoami
) continue;
3296 int64_t missing
= 0;
3297 int64_t peer_num_objects
=
3298 std::max((int64_t)0, peer
.second
.stats
.stats
.sum
.num_objects
);
3299 // Backfill targets always track num_objects accurately
3300 // all other peers track missing accurately.
3301 if (is_backfill_target(peer
.first
)) {
3302 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3304 if (peer_missing
.count(peer
.first
)) {
3305 missing
= peer_missing
[peer
.first
].num_missing();
3307 psdout(20) << __func__
<< " no peer_missing found for "
3308 << peer
.first
<< dendl
;
3309 if (is_recovering()) {
3312 missing
= std::max((int64_t)0, num_objects
- peer_num_objects
);
3315 if (upset
.count(peer
.first
)) {
3316 missing_target_objects
.emplace(missing
, peer
.first
);
3317 } else if (actingset
.count(peer
.first
)) {
3318 acting_source_objects
.emplace(missing
, peer
.first
);
3320 peer
.second
.stats
.stats
.sum
.num_objects_missing
= missing
;
3322 info
.stats
.avail_no_missing
.push_back(peer
.first
);
3323 psdout(20) << __func__
<< " shard " << peer
.first
3324 << " objects " << peer_num_objects
3325 << " missing " << missing
3329 // Compute object_location_counts
3330 for (auto& ml
: missing_loc
.get_missing_locs()) {
3331 info
.stats
.object_location_counts
[ml
.second
]++;
3332 psdout(30) << __func__
<< " " << ml
.first
<< " object_location_counts["
3333 << ml
.second
<< "]=" << info
.stats
.object_location_counts
[ml
.second
]
3336 int64_t not_missing
= num_objects
- missing_loc
.get_missing_locs().size();
3338 // During recovery we know upset == actingset and is being populated
3339 // During backfill we know that all non-missing objects are in the actingset
3340 info
.stats
.object_location_counts
[actingset
] = not_missing
;
3342 psdout(30) << __func__
<< " object_location_counts["
3343 << upset
<< "]=" << info
.stats
.object_location_counts
[upset
]
3345 psdout(20) << __func__
<< " object_location_counts "
3346 << info
.stats
.object_location_counts
<< dendl
;
3348 // A misplaced object is not stored on the correct OSD
3349 int64_t misplaced
= 0;
3350 // a degraded objects has fewer replicas or EC shards than the pool specifies.
3351 int64_t degraded
= 0;
3353 if (is_recovering()) {
3354 for (auto& sml
: missing_loc
.get_missing_by_count()) {
3355 for (auto& ml
: sml
.second
) {
3357 if (sml
.first
== shard_id_t::NO_SHARD
) {
3358 psdout(20) << __func__
<< " ml " << ml
.second
3359 << " upset size " << upset
.size()
3360 << " up " << ml
.first
.up
<< dendl
;
3361 missing_shards
= (int)upset
.size() - ml
.first
.up
;
3363 // Handle shards not even in upset below
3364 if (!find_shard(upset
, sml
.first
))
3366 missing_shards
= std::max(0, 1 - ml
.first
.up
);
3367 psdout(20) << __func__
3368 << " shard " << sml
.first
3369 << " ml " << ml
.second
3370 << " missing shards " << missing_shards
<< dendl
;
3372 int odegraded
= ml
.second
* missing_shards
;
3373 // Copies on other osds but limited to the possible degraded
3374 int more_osds
= std::min(missing_shards
, ml
.first
.other
);
3375 int omisplaced
= ml
.second
* more_osds
;
3376 ceph_assert(omisplaced
<= odegraded
);
3377 odegraded
-= omisplaced
;
3379 misplaced
+= omisplaced
;
3380 degraded
+= odegraded
;
3384 psdout(20) << __func__
<< " missing based degraded "
3385 << degraded
<< dendl
;
3386 psdout(20) << __func__
<< " missing based misplaced "
3387 << misplaced
<< dendl
;
3389 // Handle undersized case
3390 if (pool
.info
.is_replicated()) {
3391 // Add degraded for missing targets (num_objects missing)
3392 ceph_assert(target
>= upset
.size());
3393 unsigned needed
= target
- upset
.size();
3394 degraded
+= num_objects
* needed
;
3396 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
3397 shard_id_t
shard(i
);
3399 if (!find_shard(upset
, shard
)) {
3400 pg_shard_t pgs
= get_another_shard(actingset
, pg_shard_t(), shard
);
3402 if (pgs
!= pg_shard_t()) {
3405 if (pgs
== pg_whoami
)
3406 missing
= info
.stats
.stats
.sum
.num_objects_missing_on_primary
;
3408 missing
= peer_info
[pgs
].stats
.stats
.sum
.num_objects_missing
;
3410 degraded
+= missing
;
3411 misplaced
+= std::max((int64_t)0, num_objects
- missing
);
3413 // No shard anywhere
3414 degraded
+= num_objects
;
3422 // Handle undersized case
3423 if (pool
.info
.is_replicated()) {
3424 // Add to missing_target_objects
3425 ceph_assert(target
>= missing_target_objects
.size());
3426 unsigned needed
= target
- missing_target_objects
.size();
3428 missing_target_objects
.emplace(num_objects
* needed
, pg_shard_t(pg_shard_t::NO_OSD
));
3430 for (unsigned i
= 0 ; i
< num_shards
; ++i
) {
3431 shard_id_t
shard(i
);
3433 for (const auto& t
: missing_target_objects
) {
3434 if (std::get
<1>(t
).shard
== shard
) {
3440 missing_target_objects
.emplace(num_objects
, pg_shard_t(pg_shard_t::NO_OSD
,shard
));
3444 for (const auto& item
: missing_target_objects
)
3445 psdout(20) << __func__
<< " missing shard " << std::get
<1>(item
)
3446 << " missing= " << std::get
<0>(item
) << dendl
;
3447 for (const auto& item
: acting_source_objects
)
3448 psdout(20) << __func__
<< " acting shard " << std::get
<1>(item
)
3449 << " missing= " << std::get
<0>(item
) << dendl
;
3451 // Handle all objects not in missing for remapped
3453 for (auto m
= missing_target_objects
.rbegin();
3454 m
!= missing_target_objects
.rend(); ++m
) {
3456 int64_t extra_missing
= -1;
3458 if (pool
.info
.is_replicated()) {
3459 if (!acting_source_objects
.empty()) {
3460 auto extra_copy
= acting_source_objects
.begin();
3461 extra_missing
= std::get
<0>(*extra_copy
);
3462 acting_source_objects
.erase(extra_copy
);
3464 } else { // Erasure coded
3465 // Use corresponding shard
3466 for (const auto& a
: acting_source_objects
) {
3467 if (std::get
<1>(a
).shard
== std::get
<1>(*m
).shard
) {
3468 extra_missing
= std::get
<0>(a
);
3469 acting_source_objects
.erase(a
);
3475 if (extra_missing
>= 0 && std::get
<0>(*m
) >= extra_missing
) {
3476 // We don't know which of the objects on the target
3477 // are part of extra_missing so assume are all degraded.
3478 misplaced
+= std::get
<0>(*m
) - extra_missing
;
3479 degraded
+= extra_missing
;
3481 // 1. extra_missing == -1, more targets than sources so degraded
3482 // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
3483 // previously degraded are now present on the target.
3484 degraded
+= std::get
<0>(*m
);
3487 // If there are still acting that haven't been accounted for
3488 // then they are misplaced
3489 for (const auto& a
: acting_source_objects
) {
3490 int64_t extra_misplaced
= std::max((int64_t)0, num_objects
- std::get
<0>(a
));
3491 psdout(20) << __func__
<< " extra acting misplaced " << extra_misplaced
3493 misplaced
+= extra_misplaced
;
3496 // NOTE: Tests use these messages to verify this code
3497 psdout(20) << __func__
<< " degraded " << degraded
3498 << (estimate
? " (est)": "") << dendl
;
3499 psdout(20) << __func__
<< " misplaced " << misplaced
3500 << (estimate
? " (est)": "")<< dendl
;
3502 info
.stats
.stats
.sum
.num_objects_degraded
= degraded
;
3503 info
.stats
.stats
.sum
.num_objects_unfound
= get_num_unfound();
3504 info
.stats
.stats
.sum
.num_objects_misplaced
= misplaced
;
3508 std::optional
<pg_stat_t
> PeeringState::prepare_stats_for_publish(
3509 bool pg_stats_publish_valid
,
3510 const pg_stat_t
&pg_stats_publish
,
3511 const object_stat_collection_t
&unstable_stats
)
3513 if (info
.stats
.stats
.sum
.num_scrub_errors
) {
3514 state_set(PG_STATE_INCONSISTENT
);
3516 state_clear(PG_STATE_INCONSISTENT
);
3517 state_clear(PG_STATE_FAILED_REPAIR
);
3520 utime_t now
= ceph_clock_now();
3521 if (info
.stats
.state
!= state
) {
3522 info
.stats
.last_change
= now
;
3523 // Optimistic estimation, if we just find out an inactive PG,
3524 // assumt it is active till now.
3525 if (!(state
& PG_STATE_ACTIVE
) &&
3526 (info
.stats
.state
& PG_STATE_ACTIVE
))
3527 info
.stats
.last_active
= now
;
3529 if ((state
& PG_STATE_ACTIVE
) &&
3530 !(info
.stats
.state
& PG_STATE_ACTIVE
))
3531 info
.stats
.last_became_active
= now
;
3532 if ((state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)) &&
3533 !(info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)))
3534 info
.stats
.last_became_peered
= now
;
3535 info
.stats
.state
= state
;
3538 update_calc_stats();
3539 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
3540 state_set(PG_STATE_DEGRADED
);
3542 state_clear(PG_STATE_DEGRADED
);
3544 update_blocked_by();
3546 pg_stat_t pre_publish
= info
.stats
;
3547 pre_publish
.stats
.add(unstable_stats
);
3548 utime_t cutoff
= now
;
3549 cutoff
-= cct
->_conf
->osd_pg_stat_report_interval_max
;
3551 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3552 // because we don't want to make the pg_stat_t structures too expensive.
3553 unsigned max
= cct
->_conf
->osd_max_snap_prune_intervals_per_epoch
;
3555 auto i
= info
.purged_snaps
.begin();
3556 while (num
< max
&& i
!= info
.purged_snaps
.end()) {
3557 pre_publish
.purged_snaps
.insert(i
.get_start(), i
.get_len());
3561 psdout(20) << __func__
<< " reporting purged_snaps "
3562 << pre_publish
.purged_snaps
<< dendl
;
3564 if (pg_stats_publish_valid
&& pre_publish
== pg_stats_publish
&&
3565 info
.stats
.last_fresh
> cutoff
) {
3566 psdout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3567 << ": no change since " << info
.stats
.last_fresh
<< dendl
;
3568 return std::nullopt
;
3570 // update our stat summary and timestamps
3571 info
.stats
.reported_epoch
= get_osdmap_epoch();
3572 ++info
.stats
.reported_seq
;
3574 info
.stats
.last_fresh
= now
;
3576 if (info
.stats
.state
& PG_STATE_CLEAN
)
3577 info
.stats
.last_clean
= now
;
3578 if (info
.stats
.state
& PG_STATE_ACTIVE
)
3579 info
.stats
.last_active
= now
;
3580 if (info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
))
3581 info
.stats
.last_peered
= now
;
3582 info
.stats
.last_unstale
= now
;
3583 if ((info
.stats
.state
& PG_STATE_DEGRADED
) == 0)
3584 info
.stats
.last_undegraded
= now
;
3585 if ((info
.stats
.state
& PG_STATE_UNDERSIZED
) == 0)
3586 info
.stats
.last_fullsized
= now
;
3588 psdout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
3589 << ":" << pg_stats_publish
.reported_seq
<< dendl
;
3590 return std::make_optional(std::move(pre_publish
));
3594 void PeeringState::init(
3596 const vector
<int>& newup
, int new_up_primary
,
3597 const vector
<int>& newacting
, int new_acting_primary
,
3598 const pg_history_t
& history
,
3599 const PastIntervals
& pi
,
3601 ObjectStore::Transaction
&t
)
3603 psdout(10) << "init role " << role
<< " up "
3604 << newup
<< " acting " << newacting
3605 << " history " << history
3606 << " past_intervals " << pi
3610 init_primary_up_acting(
3614 new_acting_primary
);
3616 info
.history
= history
;
3617 past_intervals
= pi
;
3620 info
.stats
.up_primary
= new_up_primary
;
3621 info
.stats
.acting
= acting
;
3622 info
.stats
.acting_primary
= new_acting_primary
;
3623 info
.stats
.mapping_epoch
= info
.history
.same_interval_since
;
3625 if (!perform_deletes_during_peering()) {
3626 pg_log
.set_missing_may_contain_deletes();
3630 psdout(10) << __func__
<< ": Setting backfill" << dendl
;
3631 info
.set_last_backfill(hobject_t());
3632 info
.last_complete
= info
.last_update
;
3633 pg_log
.mark_log_for_rewrite();
3639 dirty_big_info
= true;
3643 void PeeringState::dump_peering_state(Formatter
*f
)
3645 f
->dump_string("state", get_pg_state_string());
3646 f
->dump_unsigned("epoch", get_osdmap_epoch());
3647 f
->open_array_section("up");
3648 for (vector
<int>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
)
3649 f
->dump_unsigned("osd", *p
);
3651 f
->open_array_section("acting");
3652 for (vector
<int>::const_iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
3653 f
->dump_unsigned("osd", *p
);
3655 if (!backfill_targets
.empty()) {
3656 f
->open_array_section("backfill_targets");
3657 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
3658 p
!= backfill_targets
.end();
3660 f
->dump_stream("shard") << *p
;
3663 if (!async_recovery_targets
.empty()) {
3664 f
->open_array_section("async_recovery_targets");
3665 for (set
<pg_shard_t
>::iterator p
= async_recovery_targets
.begin();
3666 p
!= async_recovery_targets
.end();
3668 f
->dump_stream("shard") << *p
;
3671 if (!acting_recovery_backfill
.empty()) {
3672 f
->open_array_section("acting_recovery_backfill");
3673 for (set
<pg_shard_t
>::iterator p
= acting_recovery_backfill
.begin();
3674 p
!= acting_recovery_backfill
.end();
3676 f
->dump_stream("shard") << *p
;
3679 f
->open_object_section("info");
3680 update_calc_stats();
3684 f
->open_array_section("peer_info");
3685 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= peer_info
.begin();
3686 p
!= peer_info
.end();
3688 f
->open_object_section("info");
3689 f
->dump_stream("peer") << p
->first
;
3695 void PeeringState::update_stats(
3696 std::function
<bool(pg_history_t
&, pg_stat_t
&)> f
,
3697 ObjectStore::Transaction
*t
) {
3698 if (f(info
.history
, info
.stats
)) {
3699 pl
->publish_stats_to_osd();
3701 pl
->on_info_history_change();
3709 bool PeeringState::append_log_entries_update_missing(
3710 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
3711 ObjectStore::Transaction
&t
, std::optional
<eversion_t
> trim_to
,
3712 std::optional
<eversion_t
> roll_forward_to
)
3714 ceph_assert(!entries
.empty());
3715 ceph_assert(entries
.begin()->version
> info
.last_update
);
3717 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
3718 bool invalidate_stats
=
3719 pg_log
.append_new_log_entries(
3724 if (roll_forward_to
&& entries
.rbegin()->soid
> info
.last_backfill
) {
3725 pg_log
.roll_forward(rollbacker
.get());
3727 if (roll_forward_to
&& *roll_forward_to
> pg_log
.get_can_rollback_to()) {
3728 pg_log
.roll_forward_to(*roll_forward_to
, rollbacker
.get());
3729 last_rollback_info_trimmed_to_applied
= *roll_forward_to
;
3732 info
.last_update
= pg_log
.get_head();
3734 if (pg_log
.get_missing().num_missing() == 0) {
3735 // advance last_complete since nothing else is missing!
3736 info
.last_complete
= info
.last_update
;
3738 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
3740 psdout(20) << __func__
<< " trim_to bool = " << bool(trim_to
)
3741 << " trim_to = " << (trim_to
? *trim_to
: eversion_t()) << dendl
;
3743 pg_log
.trim(*trim_to
, info
);
3746 return invalidate_stats
;
3749 void PeeringState::merge_new_log_entries(
3750 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
3751 ObjectStore::Transaction
&t
,
3752 std::optional
<eversion_t
> trim_to
,
3753 std::optional
<eversion_t
> roll_forward_to
)
3755 psdout(10) << __func__
<< " " << entries
<< dendl
;
3756 ceph_assert(is_primary());
3758 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
, trim_to
, roll_forward_to
);
3759 for (set
<pg_shard_t
>::const_iterator i
= acting_recovery_backfill
.begin();
3760 i
!= acting_recovery_backfill
.end();
3762 pg_shard_t
peer(*i
);
3763 if (peer
== pg_whoami
) continue;
3764 ceph_assert(peer_missing
.count(peer
));
3765 ceph_assert(peer_info
.count(peer
));
3766 pg_missing_t
& pmissing(peer_missing
[peer
]);
3767 psdout(20) << __func__
<< " peer_missing for " << peer
3768 << " = " << pmissing
<< dendl
;
3769 pg_info_t
& pinfo(peer_info
[peer
]);
3770 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
3771 pinfo
.last_backfill
,
3778 pinfo
.last_update
= info
.last_update
;
3779 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
3780 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
3783 if (!rebuild_missing
) {
3787 for (auto &&i
: entries
) {
3788 missing_loc
.rebuild(
3791 acting_recovery_backfill
,
3793 pg_log
.get_missing(),
3799 void PeeringState::add_log_entry(const pg_log_entry_t
& e
, bool applied
)
3801 // raise last_complete only if we were previously up to date
3802 if (info
.last_complete
== info
.last_update
)
3803 info
.last_complete
= e
.version
;
3805 // raise last_update.
3806 ceph_assert(e
.version
> info
.last_update
);
3807 info
.last_update
= e
.version
;
3809 // raise user_version, if it increased (it may have not get bumped
3810 // by all logged updates)
3811 if (e
.user_version
> info
.last_user_version
)
3812 info
.last_user_version
= e
.user_version
;
3815 pg_log
.add(e
, applied
);
3816 psdout(10) << "add_log_entry " << e
<< dendl
;
3820 void PeeringState::append_log(
3821 const vector
<pg_log_entry_t
>& logv
,
3823 eversion_t roll_forward_to
,
3825 ObjectStore::Transaction
&t
,
3826 bool transaction_applied
,
3829 /* The primary has sent an info updating the history, but it may not
3830 * have arrived yet. We want to make sure that we cannot remember this
3831 * write without remembering that it happened in an interval which went
3832 * active in epoch history.last_epoch_started.
3834 if (info
.last_epoch_started
!= info
.history
.last_epoch_started
) {
3835 info
.history
.last_epoch_started
= info
.last_epoch_started
;
3837 if (info
.last_interval_started
!= info
.history
.last_interval_started
) {
3838 info
.history
.last_interval_started
= info
.last_interval_started
;
3840 psdout(10) << "append_log " << pg_log
.get_log() << " " << logv
<< dendl
;
3842 PGLog::LogEntryHandlerRef handler
{pl
->get_log_handler(t
)};
3843 if (!transaction_applied
) {
3844 /* We must be a backfill or async recovery peer, so it's ok if we apply
3845 * out-of-turn since we won't be considered when
3846 * determining a min possible last_update.
3848 * We skip_rollforward() here, which advances the crt, without
3849 * doing an actual rollforward. This avoids cleaning up entries
3850 * from the backend and we do not end up in a situation, where the
3851 * object is deleted before we can _merge_object_divergent_entries().
3853 pg_log
.skip_rollforward();
3856 for (vector
<pg_log_entry_t
>::const_iterator p
= logv
.begin();
3859 add_log_entry(*p
, transaction_applied
);
3861 /* We don't want to leave the rollforward artifacts around
3862 * here past last_backfill. It's ok for the same reason as
3864 if (transaction_applied
&&
3865 p
->soid
> info
.last_backfill
) {
3866 pg_log
.roll_forward(handler
.get());
3869 if (transaction_applied
&& roll_forward_to
> pg_log
.get_can_rollback_to()) {
3870 pg_log
.roll_forward_to(
3873 last_rollback_info_trimmed_to_applied
= roll_forward_to
;
3876 psdout(10) << __func__
<< " approx pg log length = "
3877 << pg_log
.get_log().approx_size() << dendl
;
3878 psdout(10) << __func__
<< " transaction_applied = "
3879 << transaction_applied
<< dendl
;
3880 if (!transaction_applied
|| async
)
3881 psdout(10) << __func__
<< " " << pg_whoami
3882 << " is async_recovery or backfill target" << dendl
;
3883 pg_log
.trim(trim_to
, info
, transaction_applied
, async
);
3885 // update the local pg, pg log
3890 min_last_complete_ondisk
= mlcod
;
3893 void PeeringState::recover_got(
3894 const hobject_t
&oid
, eversion_t v
,
3896 ObjectStore::Transaction
&t
)
3898 if (v
> pg_log
.get_can_rollback_to()) {
3899 /* This can only happen during a repair, and even then, it would
3900 * be one heck of a race. If we are repairing the object, the
3901 * write in question must be fully committed, so it's not valid
3902 * to roll it back anyway (and we'll be rolled forward shortly
3904 PGLog::LogEntryHandlerRef handler
{pl
->get_log_handler(t
)};
3905 pg_log
.roll_forward_to(v
, handler
.get());
3908 psdout(10) << "got missing " << oid
<< " v " << v
<< dendl
;
3909 pg_log
.recover_got(oid
, v
, info
);
3910 if (pg_log
.get_log().log
.empty()) {
3911 psdout(10) << "last_complete now " << info
.last_complete
3912 << " while log is empty" << dendl
;
3913 } else if (pg_log
.get_log().complete_to
!= pg_log
.get_log().log
.end()) {
3914 psdout(10) << "last_complete now " << info
.last_complete
3915 << " log.complete_to " << pg_log
.get_log().complete_to
->version
3918 psdout(10) << "last_complete now " << info
.last_complete
3919 << " log.complete_to at end" << dendl
;
3920 //below is not true in the repair case.
3921 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
3922 ceph_assert(info
.last_complete
== info
.last_update
);
3926 ceph_assert(missing_loc
.needs_recovery(oid
));
3928 missing_loc
.add_location(oid
, pg_whoami
);
3936 void PeeringState::update_backfill_progress(
3937 const hobject_t
&updated_backfill
,
3938 const pg_stat_t
&updated_stats
,
3939 bool preserve_local_num_bytes
,
3940 ObjectStore::Transaction
&t
) {
3941 info
.set_last_backfill(updated_backfill
);
3942 if (preserve_local_num_bytes
) {
3943 psdout(25) << __func__
<< " primary " << updated_stats
.stats
.sum
.num_bytes
3944 << " local " << info
.stats
.stats
.sum
.num_bytes
<< dendl
;
3945 int64_t bytes
= info
.stats
.stats
.sum
.num_bytes
;
3946 info
.stats
= updated_stats
;
3947 info
.stats
.stats
.sum
.num_bytes
= bytes
;
3949 psdout(20) << __func__
<< " final " << updated_stats
.stats
.sum
.num_bytes
3950 << " replaces local " << info
.stats
.stats
.sum
.num_bytes
<< dendl
;
3951 info
.stats
= updated_stats
;
3958 void PeeringState::adjust_purged_snaps(
3959 std::function
<void(interval_set
<snapid_t
> &snaps
)> f
) {
3960 f(info
.purged_snaps
);
3962 dirty_big_info
= true;
3965 void PeeringState::on_peer_recover(
3967 const hobject_t
&soid
,
3968 const eversion_t
&version
)
3970 pl
->publish_stats_to_osd();
3972 peer_missing
[peer
].got(soid
, version
);
3973 missing_loc
.add_location(soid
, peer
);
3976 void PeeringState::begin_peer_recover(
3978 const hobject_t soid
)
3980 peer_missing
[peer
].revise_have(soid
, eversion_t());
3983 void PeeringState::force_object_missing(
3984 const set
<pg_shard_t
> &peers
,
3985 const hobject_t
&soid
,
3988 for (auto &&peer
: peers
) {
3989 if (peer
!= primary
) {
3990 peer_missing
[peer
].add(soid
, version
, eversion_t(), false);
3992 pg_log
.missing_add(soid
, version
, eversion_t());
3993 pg_log
.reset_complete_to(&info
);
3994 pg_log
.set_last_requested(0);
3998 missing_loc
.rebuild(
4001 acting_recovery_backfill
,
4003 pg_log
.get_missing(),
4008 void PeeringState::pre_submit_op(
4009 const hobject_t
&hoid
,
4010 const vector
<pg_log_entry_t
>& logv
,
4011 eversion_t at_version
)
4013 if (at_version
> eversion_t()) {
4014 for (auto &&i
: get_acting_recovery_backfill()) {
4015 if (i
== primary
) continue;
4016 pg_info_t
&pinfo
= peer_info
[i
];
4017 // keep peer_info up to date
4018 if (pinfo
.last_complete
== pinfo
.last_update
)
4019 pinfo
.last_complete
= at_version
;
4020 pinfo
.last_update
= at_version
;
4024 bool requires_missing_loc
= false;
4025 for (auto &&i
: get_async_recovery_targets()) {
4026 if (i
== primary
|| !get_peer_missing(i
).is_missing(hoid
))
4028 requires_missing_loc
= true;
4029 for (auto &&entry
: logv
) {
4030 peer_missing
[i
].add_next_event(entry
);
4034 if (requires_missing_loc
) {
4035 for (auto &&entry
: logv
) {
4036 psdout(30) << __func__
<< " missing_loc before: "
4037 << missing_loc
.get_locations(entry
.soid
) << dendl
;
4038 missing_loc
.add_missing(entry
.soid
, entry
.version
,
4039 eversion_t(), entry
.is_delete());
4040 // clear out missing_loc
4041 missing_loc
.clear_location(entry
.soid
);
4042 for (auto &i
: get_actingset()) {
4043 if (!get_peer_missing(i
).is_missing(entry
.soid
))
4044 missing_loc
.add_location(entry
.soid
, i
);
4046 psdout(30) << __func__
<< " missing_loc after: "
4047 << missing_loc
.get_locations(entry
.soid
) << dendl
;
4052 void PeeringState::recovery_committed_to(eversion_t version
)
4054 psdout(10) << __func__
<< " version " << version
4055 << " now ondisk" << dendl
;
4056 last_complete_ondisk
= version
;
4058 if (last_complete_ondisk
== info
.last_update
) {
4059 if (!is_primary()) {
4060 // Either we are a replica or backfill target.
4061 // we are fully up to date. tell the primary!
4062 pl
->send_cluster_message(
4066 spg_t(info
.pgid
.pgid
, primary
.shard
),
4067 last_complete_ondisk
),
4068 get_osdmap_epoch());
4070 calc_min_last_complete_ondisk();
4075 void PeeringState::complete_write(eversion_t v
, eversion_t lc
)
4077 last_update_ondisk
= v
;
4078 last_complete_ondisk
= lc
;
4079 calc_min_last_complete_ondisk();
4082 void PeeringState::calc_trim_to()
4084 size_t target
= pl
->get_target_pg_log_entries();
4086 eversion_t limit
= std::min(
4087 min_last_complete_ondisk
,
4088 pg_log
.get_can_rollback_to());
4089 if (limit
!= eversion_t() &&
4090 limit
!= pg_trim_to
&&
4091 pg_log
.get_log().approx_size() > target
) {
4092 size_t num_to_trim
= std::min(pg_log
.get_log().approx_size() - target
,
4093 cct
->_conf
->osd_pg_log_trim_max
);
4094 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
&&
4095 cct
->_conf
->osd_pg_log_trim_max
>= cct
->_conf
->osd_pg_log_trim_min
) {
4098 list
<pg_log_entry_t
>::const_iterator it
= pg_log
.get_log().log
.begin();
4099 eversion_t new_trim_to
;
4100 for (size_t i
= 0; i
< num_to_trim
; ++i
) {
4101 new_trim_to
= it
->version
;
4103 if (new_trim_to
> limit
) {
4104 new_trim_to
= limit
;
4105 psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl
;
4109 psdout(10) << "calc_trim_to " << pg_trim_to
<< " -> " << new_trim_to
<< dendl
;
4110 pg_trim_to
= new_trim_to
;
4111 assert(pg_trim_to
<= pg_log
.get_head());
4112 assert(pg_trim_to
<= min_last_complete_ondisk
);
4116 void PeeringState::calc_trim_to_aggressive()
4118 size_t target
= pl
->get_target_pg_log_entries();
4120 // limit pg log trimming up to the can_rollback_to value
4121 eversion_t limit
= std::min({
4123 pg_log
.get_can_rollback_to(),
4124 last_update_ondisk
});
4125 psdout(10) << __func__
<< " limit = " << limit
<< dendl
;
4127 if (limit
!= eversion_t() &&
4128 limit
!= pg_trim_to
&&
4129 pg_log
.get_log().approx_size() > target
) {
4130 psdout(10) << __func__
<< " approx pg log length = "
4131 << pg_log
.get_log().approx_size() << dendl
;
4132 uint64_t num_to_trim
= std::min
<uint64_t>(pg_log
.get_log().approx_size() - target
,
4133 cct
->_conf
->osd_pg_log_trim_max
);
4134 psdout(10) << __func__
<< " num_to_trim = " << num_to_trim
<< dendl
;
4135 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
&&
4136 cct
->_conf
->osd_pg_log_trim_max
>= cct
->_conf
->osd_pg_log_trim_min
) {
4139 auto it
= pg_log
.get_log().log
.begin(); // oldest log entry
4140 auto rit
= pg_log
.get_log().log
.rbegin();
4141 eversion_t by_n_to_keep
; // start from tail
4142 eversion_t by_n_to_trim
= eversion_t::max(); // start from head
4143 for (size_t i
= 0; it
!= pg_log
.get_log().log
.end(); ++it
, ++rit
) {
4145 if (i
> target
&& by_n_to_keep
== eversion_t()) {
4146 by_n_to_keep
= rit
->version
;
4148 if (i
>= num_to_trim
&& by_n_to_trim
== eversion_t::max()) {
4149 by_n_to_trim
= it
->version
;
4151 if (by_n_to_keep
!= eversion_t() &&
4152 by_n_to_trim
!= eversion_t::max()) {
4157 if (by_n_to_keep
== eversion_t()) {
4161 pg_trim_to
= std::min({by_n_to_keep
, by_n_to_trim
, limit
});
4162 psdout(10) << __func__
<< " pg_trim_to now " << pg_trim_to
<< dendl
;
4163 ceph_assert(pg_trim_to
<= pg_log
.get_head());
4167 void PeeringState::apply_op_stats(
4168 const hobject_t
&soid
,
4169 const object_stat_sum_t
&delta_stats
)
4171 info
.stats
.stats
.add(delta_stats
);
4172 info
.stats
.stats
.floor(0);
4174 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
4175 i
!= get_backfill_targets().end();
4178 pg_info_t
& pinfo
= peer_info
[bt
];
4179 if (soid
<= pinfo
.last_backfill
)
4180 pinfo
.stats
.stats
.add(delta_stats
);
4184 void PeeringState::update_complete_backfill_object_stats(
4185 const hobject_t
&hoid
,
4186 const pg_stat_t
&stats
)
4188 for (auto &&bt
: get_backfill_targets()) {
4189 pg_info_t
& pinfo
= peer_info
[bt
];
4190 //Add stats to all peers that were missing object
4191 if (hoid
> pinfo
.last_backfill
)
4192 pinfo
.stats
.add(stats
);
4196 void PeeringState::update_peer_last_backfill(
4198 const hobject_t
&new_last_backfill
)
4200 pg_info_t
&pinfo
= peer_info
[peer
];
4201 pinfo
.last_backfill
= new_last_backfill
;
4202 if (new_last_backfill
.is_max()) {
4203 /* pinfo.stats might be wrong if we did log-based recovery on the
4204 * backfilled portion in addition to continuing backfill.
4206 pinfo
.stats
= info
.stats
;
4210 void PeeringState::set_revert_with_targets(
4211 const hobject_t
&soid
,
4212 const set
<pg_shard_t
> &good_peers
)
4214 for (auto &&peer
: good_peers
) {
4215 missing_loc
.add_location(soid
, peer
);
4219 void PeeringState::prepare_backfill_for_missing(
4220 const hobject_t
&soid
,
4221 const eversion_t
&version
,
4222 const vector
<pg_shard_t
> &targets
) {
4223 for (auto &&peer
: targets
) {
4224 peer_missing
[peer
].add(soid
, version
, eversion_t(), false);
4228 void PeeringState::update_hset(const pg_hit_set_history_t
&hset_history
)
4230 info
.hit_set
= hset_history
;
4233 /*------------ Peering State Machine----------------*/
4235 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
4236 << "state<" << get_state_name() << ">: ")
4238 #define psdout(x) ldout(context< PeeringMachine >().cct, x)
4240 #define DECLARE_LOCALS \
4241 PeeringState *ps = context< PeeringMachine >().state; \
4243 PeeringListener *pl = context< PeeringMachine >().pl; \
4247 /*------Crashed-------*/
4248 PeeringState::Crashed::Crashed(my_context ctx
)
4250 NamedState(context
< PeeringMachine
>().state_history
, "Crashed")
4252 context
< PeeringMachine
>().log_enter(state_name
);
4253 ceph_abort_msg("we got a bad state machine event");
4257 /*------Initial-------*/
4258 PeeringState::Initial::Initial(my_context ctx
)
4260 NamedState(context
< PeeringMachine
>().state_history
, "Initial")
4262 context
< PeeringMachine
>().log_enter(state_name
);
4265 boost::statechart::result
PeeringState::Initial::react(const MNotifyRec
& notify
)
4268 ps
->proc_replica_info(
4269 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
4270 ps
->set_last_peering_reset();
4271 return transit
< Primary
>();
4274 boost::statechart::result
PeeringState::Initial::react(const MInfoRec
& i
)
4277 ceph_assert(!ps
->is_primary());
4279 return transit
< Stray
>();
4282 boost::statechart::result
PeeringState::Initial::react(const MLogRec
& i
)
4285 ceph_assert(!ps
->is_primary());
4287 return transit
< Stray
>();
4290 void PeeringState::Initial::exit()
4292 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4294 utime_t dur
= ceph_clock_now() - enter_time
;
4295 pl
->get_peering_perf().tinc(rs_initial_latency
, dur
);
4298 /*------Started-------*/
4299 PeeringState::Started::Started(my_context ctx
)
4301 NamedState(context
< PeeringMachine
>().state_history
, "Started")
4303 context
< PeeringMachine
>().log_enter(state_name
);
4306 boost::statechart::result
4307 PeeringState::Started::react(const IntervalFlush
&)
4309 psdout(10) << "Ending blocked outgoing recovery messages" << dendl
;
4310 context
< PeeringMachine
>().state
->end_block_outgoing();
4311 return discard_event();
4314 boost::statechart::result
PeeringState::Started::react(const AdvMap
& advmap
)
4317 psdout(10) << "Started advmap" << dendl
;
4318 ps
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
4319 if (ps
->should_restart_peering(
4321 advmap
.acting_primary
,
4326 psdout(10) << "should_restart_peering, transitioning to Reset"
4329 return transit
< Reset
>();
4331 ps
->remove_down_peer_info(advmap
.osdmap
);
4332 return discard_event();
4335 boost::statechart::result
PeeringState::Started::react(const QueryState
& q
)
4337 q
.f
->open_object_section("state");
4338 q
.f
->dump_string("name", state_name
);
4339 q
.f
->dump_stream("enter_time") << enter_time
;
4340 q
.f
->close_section();
4341 return discard_event();
4344 void PeeringState::Started::exit()
4346 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4348 utime_t dur
= ceph_clock_now() - enter_time
;
4349 pl
->get_peering_perf().tinc(rs_started_latency
, dur
);
4350 ps
->state_clear(PG_STATE_WAIT
| PG_STATE_LAGGY
);
4353 /*--------Reset---------*/
4354 PeeringState::Reset::Reset(my_context ctx
)
4356 NamedState(context
< PeeringMachine
>().state_history
, "Reset")
4358 context
< PeeringMachine
>().log_enter(state_name
);
4361 ps
->flushes_in_progress
= 0;
4362 ps
->set_last_peering_reset();
4363 ps
->log_weirdness();
4366 boost::statechart::result
4367 PeeringState::Reset::react(const IntervalFlush
&)
4369 psdout(10) << "Ending blocked outgoing recovery messages" << dendl
;
4370 context
< PeeringMachine
>().state
->end_block_outgoing();
4371 return discard_event();
4374 boost::statechart::result
PeeringState::Reset::react(const AdvMap
& advmap
)
4377 psdout(10) << "Reset advmap" << dendl
;
4379 ps
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
4381 if (ps
->should_restart_peering(
4383 advmap
.acting_primary
,
4388 psdout(10) << "should restart peering, calling start_peering_interval again"
4390 ps
->start_peering_interval(
4392 advmap
.newup
, advmap
.up_primary
,
4393 advmap
.newacting
, advmap
.acting_primary
,
4394 context
< PeeringMachine
>().get_cur_transaction());
4396 ps
->remove_down_peer_info(advmap
.osdmap
);
4397 ps
->check_past_interval_bounds();
4398 return discard_event();
4401 boost::statechart::result
PeeringState::Reset::react(const ActMap
&)
4404 if (ps
->should_send_notify() && ps
->get_primary().osd
>= 0) {
4405 ps
->info
.history
.refresh_prior_readable_until_ub(
4407 ps
->prior_readable_until_ub
);
4408 context
< PeeringMachine
>().send_notify(
4409 ps
->get_primary().osd
,
4411 ps
->get_primary().shard
, ps
->pg_whoami
.shard
,
4412 ps
->get_osdmap_epoch(),
4413 ps
->get_osdmap_epoch(),
4415 ps
->past_intervals
));
4418 ps
->update_heartbeat_peers();
4420 return transit
< Started
>();
4423 boost::statechart::result
PeeringState::Reset::react(const QueryState
& q
)
4425 q
.f
->open_object_section("state");
4426 q
.f
->dump_string("name", state_name
);
4427 q
.f
->dump_stream("enter_time") << enter_time
;
4428 q
.f
->close_section();
4429 return discard_event();
4432 void PeeringState::Reset::exit()
4434 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4436 utime_t dur
= ceph_clock_now() - enter_time
;
4437 pl
->get_peering_perf().tinc(rs_reset_latency
, dur
);
4440 /*-------Start---------*/
4441 PeeringState::Start::Start(my_context ctx
)
4443 NamedState(context
< PeeringMachine
>().state_history
, "Start")
4445 context
< PeeringMachine
>().log_enter(state_name
);
4448 if (ps
->is_primary()) {
4449 psdout(1) << "transitioning to Primary" << dendl
;
4450 post_event(MakePrimary());
4452 psdout(1) << "transitioning to Stray" << dendl
;
4453 post_event(MakeStray());
4457 void PeeringState::Start::exit()
4459 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4461 utime_t dur
= ceph_clock_now() - enter_time
;
4462 pl
->get_peering_perf().tinc(rs_start_latency
, dur
);
4465 /*---------Primary--------*/
4466 PeeringState::Primary::Primary(my_context ctx
)
4468 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary")
4470 context
< PeeringMachine
>().log_enter(state_name
);
4472 ceph_assert(ps
->want_acting
.empty());
4474 // set CREATING bit until we have peered for the first time.
4475 if (ps
->info
.history
.last_epoch_started
== 0) {
4476 ps
->state_set(PG_STATE_CREATING
);
4477 // use the history timestamp, which ultimately comes from the
4478 // monitor in the create case.
4479 utime_t t
= ps
->info
.history
.last_scrub_stamp
;
4480 ps
->info
.stats
.last_fresh
= t
;
4481 ps
->info
.stats
.last_active
= t
;
4482 ps
->info
.stats
.last_change
= t
;
4483 ps
->info
.stats
.last_peered
= t
;
4484 ps
->info
.stats
.last_clean
= t
;
4485 ps
->info
.stats
.last_unstale
= t
;
4486 ps
->info
.stats
.last_undegraded
= t
;
4487 ps
->info
.stats
.last_fullsized
= t
;
4488 ps
->info
.stats
.last_scrub_stamp
= t
;
4489 ps
->info
.stats
.last_deep_scrub_stamp
= t
;
4490 ps
->info
.stats
.last_clean_scrub_stamp
= t
;
4494 boost::statechart::result
PeeringState::Primary::react(const MNotifyRec
& notevt
)
4497 psdout(7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
4498 ps
->proc_replica_info(
4499 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
4500 return discard_event();
4503 boost::statechart::result
PeeringState::Primary::react(const ActMap
&)
4506 psdout(7) << "handle ActMap primary" << dendl
;
4507 pl
->publish_stats_to_osd();
4508 return discard_event();
4511 boost::statechart::result
PeeringState::Primary::react(
4512 const SetForceRecovery
&)
4515 ps
->set_force_recovery(true);
4516 return discard_event();
4519 boost::statechart::result
PeeringState::Primary::react(
4520 const UnsetForceRecovery
&)
4523 ps
->set_force_recovery(false);
4524 return discard_event();
4527 boost::statechart::result
PeeringState::Primary::react(
4528 const RequestScrub
& evt
)
4531 if (ps
->is_primary()) {
4532 pl
->scrub_requested(evt
.deep
, evt
.repair
);
4533 psdout(10) << "marking for scrub" << dendl
;
4535 return discard_event();
4538 boost::statechart::result
PeeringState::Primary::react(
4539 const SetForceBackfill
&)
4542 ps
->set_force_backfill(true);
4543 return discard_event();
4546 boost::statechart::result
PeeringState::Primary::react(
4547 const UnsetForceBackfill
&)
4550 ps
->set_force_backfill(false);
4551 return discard_event();
4554 void PeeringState::Primary::exit()
4556 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4558 ps
->want_acting
.clear();
4559 utime_t dur
= ceph_clock_now() - enter_time
;
4560 pl
->get_peering_perf().tinc(rs_primary_latency
, dur
);
4561 pl
->clear_primary_state();
4562 ps
->state_clear(PG_STATE_CREATING
);
4565 /*---------Peering--------*/
4566 PeeringState::Peering::Peering(my_context ctx
)
4568 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering"),
4569 history_les_bound(false)
4571 context
< PeeringMachine
>().log_enter(state_name
);
4574 ceph_assert(!ps
->is_peered());
4575 ceph_assert(!ps
->is_peering());
4576 ceph_assert(ps
->is_primary());
4577 ps
->state_set(PG_STATE_PEERING
);
4580 boost::statechart::result
PeeringState::Peering::react(const AdvMap
& advmap
)
4583 psdout(10) << "Peering advmap" << dendl
;
4584 if (prior_set
.affected_by_map(*(advmap
.osdmap
), ps
->dpp
)) {
4585 psdout(1) << "Peering, affected_by_map, going to Reset" << dendl
;
4587 return transit
< Reset
>();
4590 ps
->adjust_need_up_thru(advmap
.osdmap
);
4591 ps
->check_prior_readable_down_osds(advmap
.osdmap
);
4593 return forward_event();
4596 boost::statechart::result
PeeringState::Peering::react(const QueryState
& q
)
4600 q
.f
->open_object_section("state");
4601 q
.f
->dump_string("name", state_name
);
4602 q
.f
->dump_stream("enter_time") << enter_time
;
4604 q
.f
->open_array_section("past_intervals");
4605 ps
->past_intervals
.dump(q
.f
);
4606 q
.f
->close_section();
4608 q
.f
->open_array_section("probing_osds");
4609 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
4610 p
!= prior_set
.probe
.end();
4612 q
.f
->dump_stream("osd") << *p
;
4613 q
.f
->close_section();
4615 if (prior_set
.pg_down
)
4616 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
4618 q
.f
->open_array_section("down_osds_we_would_probe");
4619 for (set
<int>::iterator p
= prior_set
.down
.begin();
4620 p
!= prior_set
.down
.end();
4622 q
.f
->dump_int("osd", *p
);
4623 q
.f
->close_section();
4625 q
.f
->open_array_section("peering_blocked_by");
4626 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
4627 p
!= prior_set
.blocked_by
.end();
4629 q
.f
->open_object_section("osd");
4630 q
.f
->dump_int("osd", p
->first
);
4631 q
.f
->dump_int("current_lost_at", p
->second
);
4632 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
4633 q
.f
->close_section();
4635 q
.f
->close_section();
4637 if (history_les_bound
) {
4638 q
.f
->open_array_section("peering_blocked_by_detail");
4639 q
.f
->open_object_section("item");
4640 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
4641 q
.f
->close_section();
4642 q
.f
->close_section();
4645 q
.f
->close_section();
4646 return forward_event();
4649 void PeeringState::Peering::exit()
4653 psdout(10) << "Leaving Peering" << dendl
;
4654 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4655 ps
->state_clear(PG_STATE_PEERING
);
4656 pl
->clear_probe_targets();
4658 utime_t dur
= ceph_clock_now() - enter_time
;
4659 pl
->get_peering_perf().tinc(rs_peering_latency
, dur
);
4663 /*------Backfilling-------*/
4664 PeeringState::Backfilling::Backfilling(my_context ctx
)
4666 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Backfilling")
4668 context
< PeeringMachine
>().log_enter(state_name
);
4672 ps
->backfill_reserved
= true;
4673 pl
->on_backfill_reserved();
4674 ps
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
4675 ps
->state_clear(PG_STATE_BACKFILL_WAIT
);
4676 ps
->state_set(PG_STATE_BACKFILLING
);
4677 pl
->publish_stats_to_osd();
4680 void PeeringState::Backfilling::backfill_release_reservations()
4683 pl
->cancel_local_background_io_reservation();
4684 for (set
<pg_shard_t
>::iterator it
= ps
->backfill_targets
.begin();
4685 it
!= ps
->backfill_targets
.end();
4687 ceph_assert(*it
!= ps
->pg_whoami
);
4688 pl
->send_cluster_message(
4690 new MBackfillReserve(
4691 MBackfillReserve::RELEASE
,
4692 spg_t(ps
->info
.pgid
.pgid
, it
->shard
),
4693 ps
->get_osdmap_epoch()),
4694 ps
->get_osdmap_epoch());
4698 void PeeringState::Backfilling::cancel_backfill()
4701 backfill_release_reservations();
4702 pl
->on_backfill_canceled();
4705 boost::statechart::result
4706 PeeringState::Backfilling::react(const Backfilled
&c
)
4708 backfill_release_reservations();
4709 return transit
<Recovered
>();
4712 boost::statechart::result
4713 PeeringState::Backfilling::react(const DeferBackfill
&c
)
4717 psdout(10) << "defer backfill, retry delay " << c
.delay
<< dendl
;
4718 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4719 ps
->state_clear(PG_STATE_BACKFILLING
);
4722 pl
->schedule_event_after(
4723 std::make_shared
<PGPeeringEvent
>(
4724 ps
->get_osdmap_epoch(),
4725 ps
->get_osdmap_epoch(),
4728 return transit
<NotBackfilling
>();
4731 boost::statechart::result
4732 PeeringState::Backfilling::react(const UnfoundBackfill
&c
)
4735 psdout(10) << "backfill has unfound, can't continue" << dendl
;
4736 ps
->state_set(PG_STATE_BACKFILL_UNFOUND
);
4737 ps
->state_clear(PG_STATE_BACKFILLING
);
4739 return transit
<NotBackfilling
>();
4742 boost::statechart::result
4743 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull
&)
4747 ps
->state_set(PG_STATE_BACKFILL_TOOFULL
);
4748 ps
->state_clear(PG_STATE_BACKFILLING
);
4751 pl
->schedule_event_after(
4752 std::make_shared
<PGPeeringEvent
>(
4753 ps
->get_osdmap_epoch(),
4754 ps
->get_osdmap_epoch(),
4756 ps
->cct
->_conf
->osd_backfill_retry_interval
);
4758 return transit
<NotBackfilling
>();
4761 boost::statechart::result
4762 PeeringState::Backfilling::react(const RemoteReservationRevoked
&)
4765 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4767 if (ps
->needs_backfill()) {
4768 return transit
<WaitLocalBackfillReserved
>();
4770 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
4771 return discard_event();
4775 void PeeringState::Backfilling::exit()
4777 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4779 ps
->backfill_reserved
= false;
4780 ps
->state_clear(PG_STATE_BACKFILLING
);
4781 ps
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
4782 utime_t dur
= ceph_clock_now() - enter_time
;
4783 pl
->get_peering_perf().tinc(rs_backfilling_latency
, dur
);
4786 /*--WaitRemoteBackfillReserved--*/
4788 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
4790 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
4791 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
4793 context
< PeeringMachine
>().log_enter(state_name
);
4796 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4797 pl
->publish_stats_to_osd();
4798 post_event(RemoteBackfillReserved());
4801 boost::statechart::result
4802 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
4806 int64_t num_bytes
= ps
->info
.stats
.stats
.sum
.num_bytes
;
4807 psdout(10) << __func__
<< " num_bytes " << num_bytes
<< dendl
;
4808 if (backfill_osd_it
!=
4809 context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
4810 // The primary never backfills itself
4811 ceph_assert(*backfill_osd_it
!= ps
->pg_whoami
);
4812 pl
->send_cluster_message(
4813 backfill_osd_it
->osd
,
4814 new MBackfillReserve(
4815 MBackfillReserve::REQUEST
,
4816 spg_t(context
< PeeringMachine
>().spgid
.pgid
, backfill_osd_it
->shard
),
4817 ps
->get_osdmap_epoch(),
4818 ps
->get_backfill_priority(),
4820 ps
->peer_bytes
[*backfill_osd_it
]),
4821 ps
->get_osdmap_epoch());
4824 ps
->peer_bytes
.clear();
4825 post_event(AllBackfillsReserved());
4827 return discard_event();
4830 void PeeringState::WaitRemoteBackfillReserved::exit()
4832 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4835 utime_t dur
= ceph_clock_now() - enter_time
;
4836 pl
->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency
, dur
);
4839 void PeeringState::WaitRemoteBackfillReserved::retry()
4842 pl
->cancel_local_background_io_reservation();
4844 // Send CANCEL to all previously acquired reservations
4845 set
<pg_shard_t
>::const_iterator it
, begin
, end
;
4846 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
4847 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
4848 ceph_assert(begin
!= end
);
4849 for (it
= begin
; it
!= backfill_osd_it
; ++it
) {
4850 // The primary never backfills itself
4851 ceph_assert(*it
!= ps
->pg_whoami
);
4852 pl
->send_cluster_message(
4854 new MBackfillReserve(
4855 MBackfillReserve::RELEASE
,
4856 spg_t(context
< PeeringMachine
>().spgid
.pgid
, it
->shard
),
4857 ps
->get_osdmap_epoch()),
4858 ps
->get_osdmap_epoch());
4861 ps
->state_clear(PG_STATE_BACKFILL_WAIT
);
4862 pl
->publish_stats_to_osd();
4864 pl
->schedule_event_after(
4865 std::make_shared
<PGPeeringEvent
>(
4866 ps
->get_osdmap_epoch(),
4867 ps
->get_osdmap_epoch(),
4869 ps
->cct
->_conf
->osd_backfill_retry_interval
);
4872 boost::statechart::result
4873 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull
&evt
)
4876 ps
->state_set(PG_STATE_BACKFILL_TOOFULL
);
4878 return transit
<NotBackfilling
>();
4881 boost::statechart::result
4882 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked
&evt
)
4885 return transit
<NotBackfilling
>();
4888 /*--WaitLocalBackfillReserved--*/
4889 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
4891 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitLocalBackfillReserved")
4893 context
< PeeringMachine
>().log_enter(state_name
);
4896 ps
->state_set(PG_STATE_BACKFILL_WAIT
);
4897 pl
->request_local_background_io_reservation(
4898 ps
->get_backfill_priority(),
4899 std::make_shared
<PGPeeringEvent
>(
4900 ps
->get_osdmap_epoch(),
4901 ps
->get_osdmap_epoch(),
4902 LocalBackfillReserved()),
4903 std::make_shared
<PGPeeringEvent
>(
4904 ps
->get_osdmap_epoch(),
4905 ps
->get_osdmap_epoch(),
4906 DeferBackfill(0.0)));
4907 pl
->publish_stats_to_osd();
4910 void PeeringState::WaitLocalBackfillReserved::exit()
4912 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4914 utime_t dur
= ceph_clock_now() - enter_time
;
4915 pl
->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency
, dur
);
4918 /*----NotBackfilling------*/
4919 PeeringState::NotBackfilling::NotBackfilling(my_context ctx
)
4921 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/NotBackfilling")
4923 context
< PeeringMachine
>().log_enter(state_name
);
4925 ps
->state_clear(PG_STATE_REPAIR
);
4926 pl
->publish_stats_to_osd();
4929 boost::statechart::result
4930 PeeringState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
4932 return discard_event();
4935 boost::statechart::result
4936 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull
&evt
)
4938 return discard_event();
4941 void PeeringState::NotBackfilling::exit()
4943 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4946 ps
->state_clear(PG_STATE_BACKFILL_UNFOUND
);
4947 utime_t dur
= ceph_clock_now() - enter_time
;
4948 pl
->get_peering_perf().tinc(rs_notbackfilling_latency
, dur
);
4951 /*----NotRecovering------*/
4952 PeeringState::NotRecovering::NotRecovering(my_context ctx
)
4954 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/NotRecovering")
4956 context
< PeeringMachine
>().log_enter(state_name
);
4958 ps
->state_clear(PG_STATE_REPAIR
);
4959 pl
->publish_stats_to_osd();
4962 void PeeringState::NotRecovering::exit()
4964 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4967 ps
->state_clear(PG_STATE_RECOVERY_UNFOUND
);
4968 utime_t dur
= ceph_clock_now() - enter_time
;
4969 pl
->get_peering_perf().tinc(rs_notrecovering_latency
, dur
);
4972 /*---RepNotRecovering----*/
4973 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx
)
4975 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepNotRecovering")
4977 context
< PeeringMachine
>().log_enter(state_name
);
4980 boost::statechart::result
4981 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation
&evt
)
4984 ps
->reject_reservation();
4985 post_event(RemoteReservationRejectedTooFull());
4986 return discard_event();
4989 void PeeringState::RepNotRecovering::exit()
4991 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
4993 utime_t dur
= ceph_clock_now() - enter_time
;
4994 pl
->get_peering_perf().tinc(rs_repnotrecovering_latency
, dur
);
4997 /*---RepWaitRecoveryReserved--*/
4998 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
5000 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepWaitRecoveryReserved")
5002 context
< PeeringMachine
>().log_enter(state_name
);
5005 boost::statechart::result
5006 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
5009 pl
->send_cluster_message(
5011 new MRecoveryReserve(
5012 MRecoveryReserve::GRANT
,
5013 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5014 ps
->get_osdmap_epoch()),
5015 ps
->get_osdmap_epoch());
5016 return transit
<RepRecovering
>();
5019 boost::statechart::result
5020 PeeringState::RepWaitRecoveryReserved::react(
5021 const RemoteReservationCanceled
&evt
)
5024 pl
->unreserve_recovery_space();
5026 pl
->cancel_remote_recovery_reservation();
5027 return transit
<RepNotRecovering
>();
5030 void PeeringState::RepWaitRecoveryReserved::exit()
5032 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5034 utime_t dur
= ceph_clock_now() - enter_time
;
5035 pl
->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency
, dur
);
5038 /*-RepWaitBackfillReserved*/
5039 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
5041 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepWaitBackfillReserved")
5043 context
< PeeringMachine
>().log_enter(state_name
);
5046 boost::statechart::result
5047 PeeringState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
5052 if (!pl
->try_reserve_recovery_space(
5053 evt
.primary_num_bytes
, evt
.local_num_bytes
)) {
5054 post_event(RejectTooFullRemoteReservation());
5056 PGPeeringEventRef preempt
;
5057 if (HAVE_FEATURE(ps
->upacting_features
, RECOVERY_RESERVATION_2
)) {
5058 // older peers will interpret preemption as TOOFULL
5059 preempt
= std::make_shared
<PGPeeringEvent
>(
5060 pl
->get_osdmap_epoch(),
5061 pl
->get_osdmap_epoch(),
5062 RemoteBackfillPreempted());
5064 pl
->request_remote_recovery_reservation(
5066 std::make_shared
<PGPeeringEvent
>(
5067 pl
->get_osdmap_epoch(),
5068 pl
->get_osdmap_epoch(),
5069 RemoteBackfillReserved()),
5072 return transit
<RepWaitBackfillReserved
>();
5075 boost::statechart::result
5076 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio
&evt
)
5080 // fall back to a local reckoning of priority of primary doesn't pass one
5081 // (pre-mimic compat)
5082 int prio
= evt
.priority
? evt
.priority
: ps
->get_recovery_priority();
5084 PGPeeringEventRef preempt
;
5085 if (HAVE_FEATURE(ps
->upacting_features
, RECOVERY_RESERVATION_2
)) {
5086 // older peers can't handle this
5087 preempt
= std::make_shared
<PGPeeringEvent
>(
5088 ps
->get_osdmap_epoch(),
5089 ps
->get_osdmap_epoch(),
5090 RemoteRecoveryPreempted());
5093 pl
->request_remote_recovery_reservation(
5095 std::make_shared
<PGPeeringEvent
>(
5096 ps
->get_osdmap_epoch(),
5097 ps
->get_osdmap_epoch(),
5098 RemoteRecoveryReserved()),
5100 return transit
<RepWaitRecoveryReserved
>();
5103 void PeeringState::RepWaitBackfillReserved::exit()
5105 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5107 utime_t dur
= ceph_clock_now() - enter_time
;
5108 pl
->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency
, dur
);
5111 boost::statechart::result
5112 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
5117 pl
->send_cluster_message(
5119 new MBackfillReserve(
5120 MBackfillReserve::GRANT
,
5121 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5122 ps
->get_osdmap_epoch()),
5123 ps
->get_osdmap_epoch());
5124 return transit
<RepRecovering
>();
5127 boost::statechart::result
5128 PeeringState::RepWaitBackfillReserved::react(
5129 const RejectTooFullRemoteReservation
&evt
)
5132 ps
->reject_reservation();
5133 post_event(RemoteReservationRejectedTooFull());
5134 return discard_event();
5137 boost::statechart::result
5138 PeeringState::RepWaitBackfillReserved::react(
5139 const RemoteReservationRejectedTooFull
&evt
)
5142 pl
->unreserve_recovery_space();
5144 pl
->cancel_remote_recovery_reservation();
5145 return transit
<RepNotRecovering
>();
5148 boost::statechart::result
5149 PeeringState::RepWaitBackfillReserved::react(
5150 const RemoteReservationCanceled
&evt
)
5153 pl
->unreserve_recovery_space();
5155 pl
->cancel_remote_recovery_reservation();
5156 return transit
<RepNotRecovering
>();
5159 /*---RepRecovering-------*/
5160 PeeringState::RepRecovering::RepRecovering(my_context ctx
)
5162 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive/RepRecovering")
5164 context
< PeeringMachine
>().log_enter(state_name
);
5167 boost::statechart::result
5168 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted
&)
5173 pl
->unreserve_recovery_space();
5174 pl
->send_cluster_message(
5176 new MRecoveryReserve(
5177 MRecoveryReserve::REVOKE
,
5178 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5179 ps
->get_osdmap_epoch()),
5180 ps
->get_osdmap_epoch());
5181 return discard_event();
5184 boost::statechart::result
5185 PeeringState::RepRecovering::react(const BackfillTooFull
&)
5190 pl
->unreserve_recovery_space();
5191 pl
->send_cluster_message(
5193 new MBackfillReserve(
5194 MBackfillReserve::REVOKE_TOOFULL
,
5195 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5196 ps
->get_osdmap_epoch()),
5197 ps
->get_osdmap_epoch());
5198 return discard_event();
5201 boost::statechart::result
5202 PeeringState::RepRecovering::react(const RemoteBackfillPreempted
&)
5207 pl
->unreserve_recovery_space();
5208 pl
->send_cluster_message(
5210 new MBackfillReserve(
5211 MBackfillReserve::REVOKE
,
5212 spg_t(ps
->info
.pgid
.pgid
, ps
->primary
.shard
),
5213 ps
->get_osdmap_epoch()),
5214 ps
->get_osdmap_epoch());
5215 return discard_event();
5218 void PeeringState::RepRecovering::exit()
5220 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5222 pl
->unreserve_recovery_space();
5224 pl
->cancel_remote_recovery_reservation();
5225 utime_t dur
= ceph_clock_now() - enter_time
;
5226 pl
->get_peering_perf().tinc(rs_reprecovering_latency
, dur
);
5229 /*------Activating--------*/
5230 PeeringState::Activating::Activating(my_context ctx
)
5232 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Activating")
5234 context
< PeeringMachine
>().log_enter(state_name
);
5237 void PeeringState::Activating::exit()
5239 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5241 utime_t dur
= ceph_clock_now() - enter_time
;
5242 pl
->get_peering_perf().tinc(rs_activating_latency
, dur
);
5245 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
5247 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitLocalRecoveryReserved")
5249 context
< PeeringMachine
>().log_enter(state_name
);
5252 // Make sure all nodes that part of the recovery aren't full
5253 if (!ps
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
5254 ps
->get_osdmap()->check_full(ps
->acting_recovery_backfill
)) {
5255 post_event(RecoveryTooFull());
5259 ps
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
5260 ps
->state_set(PG_STATE_RECOVERY_WAIT
);
5261 pl
->request_local_background_io_reservation(
5262 ps
->get_recovery_priority(),
5263 std::make_shared
<PGPeeringEvent
>(
5264 ps
->get_osdmap_epoch(),
5265 ps
->get_osdmap_epoch(),
5266 LocalRecoveryReserved()),
5267 std::make_shared
<PGPeeringEvent
>(
5268 ps
->get_osdmap_epoch(),
5269 ps
->get_osdmap_epoch(),
5270 DeferRecovery(0.0)));
5271 pl
->publish_stats_to_osd();
5274 boost::statechart::result
5275 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
5278 ps
->state_set(PG_STATE_RECOVERY_TOOFULL
);
5279 pl
->schedule_event_after(
5280 std::make_shared
<PGPeeringEvent
>(
5281 ps
->get_osdmap_epoch(),
5282 ps
->get_osdmap_epoch(),
5284 ps
->cct
->_conf
->osd_recovery_retry_interval
);
5285 return transit
<NotRecovering
>();
5288 void PeeringState::WaitLocalRecoveryReserved::exit()
5290 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5292 utime_t dur
= ceph_clock_now() - enter_time
;
5293 pl
->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
5296 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
5298 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5299 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
5301 context
< PeeringMachine
>().log_enter(state_name
);
5302 post_event(RemoteRecoveryReserved());
5305 boost::statechart::result
5306 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
5309 if (remote_recovery_reservation_it
!=
5310 context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
5311 ceph_assert(*remote_recovery_reservation_it
!= ps
->pg_whoami
);
5312 pl
->send_cluster_message(
5313 remote_recovery_reservation_it
->osd
,
5314 new MRecoveryReserve(
5315 MRecoveryReserve::REQUEST
,
5316 spg_t(context
< PeeringMachine
>().spgid
.pgid
,
5317 remote_recovery_reservation_it
->shard
),
5318 ps
->get_osdmap_epoch(),
5319 ps
->get_recovery_priority()),
5320 ps
->get_osdmap_epoch());
5321 ++remote_recovery_reservation_it
;
5323 post_event(AllRemotesReserved());
5325 return discard_event();
5328 void PeeringState::WaitRemoteRecoveryReserved::exit()
5330 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5332 utime_t dur
= ceph_clock_now() - enter_time
;
5333 pl
->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency
, dur
);
5336 PeeringState::Recovering::Recovering(my_context ctx
)
5338 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Recovering")
5340 context
< PeeringMachine
>().log_enter(state_name
);
5343 ps
->state_clear(PG_STATE_RECOVERY_WAIT
);
5344 ps
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
5345 ps
->state_set(PG_STATE_RECOVERING
);
5346 pl
->on_recovery_reserved();
5347 ceph_assert(!ps
->state_test(PG_STATE_ACTIVATING
));
5348 pl
->publish_stats_to_osd();
5351 void PeeringState::Recovering::release_reservations(bool cancel
)
5354 ceph_assert(cancel
|| !ps
->pg_log
.get_missing().have_missing());
5356 // release remote reservations
5357 for (set
<pg_shard_t
>::const_iterator i
=
5358 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
5359 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
5361 if (*i
== ps
->pg_whoami
) // skip myself
5363 pl
->send_cluster_message(
5365 new MRecoveryReserve(
5366 MRecoveryReserve::RELEASE
,
5367 spg_t(ps
->info
.pgid
.pgid
, i
->shard
),
5368 ps
->get_osdmap_epoch()),
5369 ps
->get_osdmap_epoch());
5373 boost::statechart::result
5374 PeeringState::Recovering::react(const AllReplicasRecovered
&evt
)
5377 ps
->state_clear(PG_STATE_FORCED_RECOVERY
);
5378 release_reservations();
5379 pl
->cancel_local_background_io_reservation();
5380 return transit
<Recovered
>();
5383 boost::statechart::result
5384 PeeringState::Recovering::react(const RequestBackfill
&evt
)
5388 release_reservations();
5390 ps
->state_clear(PG_STATE_FORCED_RECOVERY
);
5391 pl
->cancel_local_background_io_reservation();
5392 pl
->publish_stats_to_osd();
5393 // transit any async_recovery_targets back into acting
5394 // so pg won't have to stay undersized for long
5395 // as backfill might take a long time to complete..
5396 if (!ps
->async_recovery_targets
.empty()) {
5397 pg_shard_t auth_log_shard
;
5398 bool history_les_bound
= false;
5399 ps
->choose_acting(auth_log_shard
, true, &history_les_bound
);
5401 return transit
<WaitLocalBackfillReserved
>();
5404 boost::statechart::result
5405 PeeringState::Recovering::react(const DeferRecovery
&evt
)
5408 if (!ps
->state_test(PG_STATE_RECOVERING
)) {
5409 // we may have finished recovery and have an AllReplicasRecovered
5410 // event queued to move us to the next state.
5411 psdout(10) << "got defer recovery but not recovering" << dendl
;
5412 return discard_event();
5414 psdout(10) << "defer recovery, retry delay " << evt
.delay
<< dendl
;
5415 ps
->state_set(PG_STATE_RECOVERY_WAIT
);
5416 pl
->cancel_local_background_io_reservation();
5417 release_reservations(true);
5418 pl
->schedule_event_after(
5419 std::make_shared
<PGPeeringEvent
>(
5420 ps
->get_osdmap_epoch(),
5421 ps
->get_osdmap_epoch(),
5424 return transit
<NotRecovering
>();
5427 boost::statechart::result
5428 PeeringState::Recovering::react(const UnfoundRecovery
&evt
)
5431 psdout(10) << "recovery has unfound, can't continue" << dendl
;
5432 ps
->state_set(PG_STATE_RECOVERY_UNFOUND
);
5433 pl
->cancel_local_background_io_reservation();
5434 release_reservations(true);
5435 return transit
<NotRecovering
>();
5438 void PeeringState::Recovering::exit()
5440 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5443 utime_t dur
= ceph_clock_now() - enter_time
;
5444 ps
->state_clear(PG_STATE_RECOVERING
);
5445 pl
->get_peering_perf().tinc(rs_recovering_latency
, dur
);
5448 PeeringState::Recovered::Recovered(my_context ctx
)
5450 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Recovered")
5452 pg_shard_t auth_log_shard
;
5454 context
< PeeringMachine
>().log_enter(state_name
);
5458 ceph_assert(!ps
->needs_recovery());
5460 // if we finished backfill, all acting are active; recheck if
5461 // DEGRADED | UNDERSIZED is appropriate.
5462 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5463 if (ps
->get_osdmap()->get_pg_size(context
< PeeringMachine
>().spgid
.pgid
) <=
5464 ps
->acting_recovery_backfill
.size()) {
5465 ps
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
5466 pl
->publish_stats_to_osd();
5469 // adjust acting set? (e.g. because backfill completed...)
5470 bool history_les_bound
= false;
5471 if (ps
->acting
!= ps
->up
&& !ps
->choose_acting(auth_log_shard
,
5472 true, &history_les_bound
)) {
5473 ceph_assert(ps
->want_acting
.size());
5474 } else if (!ps
->async_recovery_targets
.empty()) {
5475 ps
->choose_acting(auth_log_shard
, true, &history_les_bound
);
5478 if (context
< Active
>().all_replicas_activated
&&
5479 ps
->async_recovery_targets
.empty())
5480 post_event(GoClean());
5483 void PeeringState::Recovered::exit()
5485 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5488 utime_t dur
= ceph_clock_now() - enter_time
;
5489 pl
->get_peering_perf().tinc(rs_recovered_latency
, dur
);
5492 PeeringState::Clean::Clean(my_context ctx
)
5494 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active/Clean")
5496 context
< PeeringMachine
>().log_enter(state_name
);
5500 if (ps
->info
.last_complete
!= ps
->info
.last_update
) {
5505 ps
->try_mark_clean();
5507 context
< PeeringMachine
>().get_cur_transaction().register_on_commit(
5511 void PeeringState::Clean::exit()
5513 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5516 ps
->state_clear(PG_STATE_CLEAN
);
5517 utime_t dur
= ceph_clock_now() - enter_time
;
5518 pl
->get_peering_perf().tinc(rs_clean_latency
, dur
);
5521 template <typename T
>
5522 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
5524 set
<int> osds_found
;
5525 set
<pg_shard_t
> out
;
5526 for (typename
T::const_iterator i
= in
.begin();
5529 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
5530 osds_found
.insert(i
->osd
);
5537 /*---------Active---------*/
5538 PeeringState::Active::Active(my_context ctx
)
5540 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Active"),
5541 remote_shards_to_reserve_recovery(
5542 unique_osd_shard_set(
5543 context
< PeeringMachine
>().state
->pg_whoami
,
5544 context
< PeeringMachine
>().state
->acting_recovery_backfill
)),
5545 remote_shards_to_reserve_backfill(
5546 unique_osd_shard_set(
5547 context
< PeeringMachine
>().state
->pg_whoami
,
5548 context
< PeeringMachine
>().state
->backfill_targets
)),
5549 all_replicas_activated(false)
5551 context
< PeeringMachine
>().log_enter(state_name
);
5556 ceph_assert(!ps
->backfill_reserved
);
5557 ceph_assert(ps
->is_primary());
5558 psdout(10) << "In Active, about to call activate" << dendl
;
5559 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
5560 ps
->activate(context
< PeeringMachine
>().get_cur_transaction(),
5561 ps
->get_osdmap_epoch(),
5562 context
< PeeringMachine
>().get_recovery_ctx());
5564 // everyone has to commit/ack before we are truly active
5565 ps
->blocked_by
.clear();
5566 for (set
<pg_shard_t
>::iterator p
= ps
->acting_recovery_backfill
.begin();
5567 p
!= ps
->acting_recovery_backfill
.end();
5569 if (p
->shard
!= ps
->pg_whoami
.shard
) {
5570 ps
->blocked_by
.insert(p
->shard
);
5573 pl
->publish_stats_to_osd();
5574 psdout(10) << "Activate Finished" << dendl
;
5577 boost::statechart::result
PeeringState::Active::react(const AdvMap
& advmap
)
5581 if (ps
->should_restart_peering(
5583 advmap
.acting_primary
,
5588 psdout(10) << "Active advmap interval change, fast return" << dendl
;
5589 return forward_event();
5591 psdout(10) << "Active advmap" << dendl
;
5592 bool need_publish
= false;
5594 pl
->on_active_advmap(advmap
.osdmap
);
5595 if (ps
->dirty_big_info
) {
5596 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5597 // purged snaps and (b) perhaps share more snaps that we have purged
5598 // but didn't fit in pg_stat_t.
5599 need_publish
= true;
5600 ps
->share_pg_info();
5603 bool need_acting_change
= false;
5604 for (size_t i
= 0; i
< ps
->want_acting
.size(); i
++) {
5605 int osd
= ps
->want_acting
[i
];
5606 if (!advmap
.osdmap
->is_up(osd
)) {
5607 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
5608 if (!ps
->is_acting(osd_with_shard
) && !ps
->is_up(osd_with_shard
)) {
5609 psdout(10) << "Active stray osd." << osd
<< " in want_acting is down"
5611 need_acting_change
= true;
5615 if (need_acting_change
) {
5616 psdout(10) << "Active need acting change, call choose_acting again"
5618 // possibly because we re-add some strays into the acting set and
5619 // some of them then go down in a subsequent map before we could see
5620 // the map changing the pg temp.
5621 // call choose_acting again to clear them out.
5622 // note that we leave restrict_to_up_acting to false in order to
5623 // not overkill any chosen stray that is still alive.
5624 pg_shard_t auth_log_shard
;
5625 bool history_les_bound
= false;
5626 ps
->remove_down_peer_info(advmap
.osdmap
);
5627 ps
->choose_acting(auth_log_shard
, false, &history_les_bound
, true);
5630 /* Check for changes in pool size (if the acting set changed as a result,
5631 * this does not matter) */
5632 if (advmap
.lastmap
->get_pg_size(ps
->info
.pgid
.pgid
) !=
5633 ps
->get_osdmap()->get_pg_size(ps
->info
.pgid
.pgid
)) {
5634 if (ps
->get_osdmap()->get_pg_size(ps
->info
.pgid
.pgid
) <=
5635 ps
->actingset
.size()) {
5636 ps
->state_clear(PG_STATE_UNDERSIZED
);
5638 ps
->state_set(PG_STATE_UNDERSIZED
);
5640 // degraded changes will be detected by call from publish_stats_to_osd()
5641 need_publish
= true;
5644 // if we haven't reported our PG stats in a long time, do so now.
5645 if (ps
->info
.stats
.reported_epoch
+ ps
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
5646 psdout(20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - ps
->info
.stats
.reported_epoch
)
5647 << " epochs" << dendl
;
5648 need_publish
= true;
5652 pl
->publish_stats_to_osd();
5654 if (ps
->check_prior_readable_down_osds(advmap
.osdmap
)) {
5655 pl
->recheck_readable();
5658 return forward_event();
5661 boost::statechart::result
PeeringState::Active::react(const ActMap
&)
5664 psdout(10) << "Active: handling ActMap" << dendl
;
5665 ceph_assert(ps
->is_primary());
5667 pl
->on_active_actmap();
5669 if (ps
->have_unfound()) {
5670 // object may have become unfound
5671 ps
->discover_all_missing(context
<PeeringMachine
>().get_recovery_ctx().msgs
);
5674 uint64_t unfound
= ps
->missing_loc
.num_unfound();
5676 ps
->all_unfound_are_queried_or_lost(ps
->get_osdmap())) {
5677 if (ps
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
5678 pl
->get_clog_error() << context
< PeeringMachine
>().spgid
.pgid
<< " has " << unfound
5679 << " objects unfound and apparently lost, would automatically "
5680 << "mark these objects lost but this feature is not yet implemented "
5681 << "(osd_auto_mark_unfound_lost)";
5683 pl
->get_clog_error() << context
< PeeringMachine
>().spgid
.pgid
<< " has "
5684 << unfound
<< " objects unfound and apparently lost";
5687 return forward_event();
5690 boost::statechart::result
PeeringState::Active::react(const MNotifyRec
& notevt
)
5694 ceph_assert(ps
->is_primary());
5695 if (ps
->peer_info
.count(notevt
.from
)) {
5696 psdout(10) << "Active: got notify from " << notevt
.from
5697 << ", already have info from that osd, ignoring"
5699 } else if (ps
->peer_purged
.count(notevt
.from
)) {
5700 psdout(10) << "Active: got notify from " << notevt
.from
5701 << ", already purged that peer, ignoring"
5704 psdout(10) << "Active: got notify from " << notevt
.from
5705 << ", calling proc_replica_info and discover_all_missing"
5707 ps
->proc_replica_info(
5708 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
5709 if (ps
->have_unfound() || (ps
->is_degraded() && ps
->might_have_unfound
.count(notevt
.from
))) {
5710 ps
->discover_all_missing(
5711 context
<PeeringMachine
>().get_recovery_ctx().msgs
);
5713 // check if it is a previous down acting member that's coming back.
5714 // if so, request pg_temp change to trigger a new interval transition
5715 pg_shard_t auth_log_shard
;
5716 bool history_les_bound
= false;
5717 ps
->choose_acting(auth_log_shard
, false, &history_les_bound
, true);
5718 if (!ps
->want_acting
.empty() && ps
->want_acting
!= ps
->acting
) {
5719 psdout(10) << "Active: got notify from previous acting member "
5720 << notevt
.from
<< ", requesting pg_temp change"
5724 return discard_event();
5727 boost::statechart::result
PeeringState::Active::react(const MTrim
& trim
)
5730 ceph_assert(ps
->is_primary());
5732 // peer is informing us of their last_complete_ondisk
5733 ldout(ps
->cct
,10) << " replica osd." << trim
.from
<< " lcod " << trim
.trim_to
<< dendl
;
5734 ps
->update_peer_last_complete_ondisk(pg_shard_t
{trim
.from
, trim
.shard
},
5736 // trim log when the pg is recovered
5737 ps
->calc_min_last_complete_ondisk();
5738 return discard_event();
5741 boost::statechart::result
PeeringState::Active::react(const MInfoRec
& infoevt
)
5744 ceph_assert(ps
->is_primary());
5746 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5747 if (infoevt
.lease_ack
) {
5748 ps
->proc_lease_ack(infoevt
.from
.osd
, *infoevt
.lease_ack
);
5750 // don't update history (yet) if we are active and primary; the replica
5751 // may be telling us they have activated (and committed) but we can't
5752 // share that until _everyone_ does the same.
5753 if (ps
->is_acting_recovery_backfill(infoevt
.from
) &&
5754 ps
->peer_activated
.count(infoevt
.from
) == 0) {
5755 psdout(10) << " peer osd." << infoevt
.from
5756 << " activated and committed" << dendl
;
5757 ps
->peer_activated
.insert(infoevt
.from
);
5758 ps
->blocked_by
.erase(infoevt
.from
.shard
);
5759 pl
->publish_stats_to_osd();
5760 if (ps
->peer_activated
.size() == ps
->acting_recovery_backfill
.size()) {
5761 all_activated_and_committed();
5764 return discard_event();
5767 boost::statechart::result
PeeringState::Active::react(const MLogRec
& logevt
)
5770 psdout(10) << "searching osd." << logevt
.from
5771 << " log for unfound items" << dendl
;
5772 ps
->proc_replica_log(
5773 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
5774 bool got_missing
= ps
->search_for_missing(
5775 ps
->peer_info
[logevt
.from
],
5776 ps
->peer_missing
[logevt
.from
],
5778 context
< PeeringMachine
>().get_recovery_ctx());
5779 // If there are missing AND we are "fully" active then start recovery now
5780 if (got_missing
&& ps
->state_test(PG_STATE_ACTIVE
)) {
5781 post_event(DoRecovery());
5783 return discard_event();
5786 boost::statechart::result
PeeringState::Active::react(const QueryState
& q
)
5790 q
.f
->open_object_section("state");
5791 q
.f
->dump_string("name", state_name
);
5792 q
.f
->dump_stream("enter_time") << enter_time
;
5795 q
.f
->open_array_section("might_have_unfound");
5796 for (set
<pg_shard_t
>::iterator p
= ps
->might_have_unfound
.begin();
5797 p
!= ps
->might_have_unfound
.end();
5799 q
.f
->open_object_section("osd");
5800 q
.f
->dump_stream("osd") << *p
;
5801 if (ps
->peer_missing
.count(*p
)) {
5802 q
.f
->dump_string("status", "already probed");
5803 } else if (ps
->peer_missing_requested
.count(*p
)) {
5804 q
.f
->dump_string("status", "querying");
5805 } else if (!ps
->get_osdmap()->is_up(p
->osd
)) {
5806 q
.f
->dump_string("status", "osd is down");
5808 q
.f
->dump_string("status", "not queried");
5810 q
.f
->close_section();
5812 q
.f
->close_section();
5815 q
.f
->open_object_section("recovery_progress");
5816 q
.f
->open_array_section("backfill_targets");
5817 for (set
<pg_shard_t
>::const_iterator p
= ps
->backfill_targets
.begin();
5818 p
!= ps
->backfill_targets
.end(); ++p
)
5819 q
.f
->dump_stream("replica") << *p
;
5820 q
.f
->close_section();
5821 pl
->dump_recovery_info(q
.f
);
5822 q
.f
->close_section();
5825 q
.f
->close_section();
5826 return forward_event();
5829 boost::statechart::result
PeeringState::Active::react(
5830 const ActivateCommitted
&evt
)
5833 ceph_assert(!ps
->peer_activated
.count(ps
->pg_whoami
));
5834 ps
->peer_activated
.insert(ps
->pg_whoami
);
5835 psdout(10) << "_activate_committed " << evt
.epoch
5836 << " peer_activated now " << ps
->peer_activated
5837 << " last_interval_started "
5838 << ps
->info
.history
.last_interval_started
5839 << " last_epoch_started "
5840 << ps
->info
.history
.last_epoch_started
5841 << " same_interval_since "
5842 << ps
->info
.history
.same_interval_since
5844 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5845 if (ps
->peer_activated
.size() == ps
->acting_recovery_backfill
.size())
5846 all_activated_and_committed();
5847 return discard_event();
5850 boost::statechart::result
PeeringState::Active::react(const AllReplicasActivated
&evt
)
5854 pg_t pgid
= context
< PeeringMachine
>().spgid
.pgid
;
5856 all_replicas_activated
= true;
5858 ps
->state_clear(PG_STATE_ACTIVATING
);
5859 ps
->state_clear(PG_STATE_CREATING
);
5860 ps
->state_clear(PG_STATE_PREMERGE
);
5863 if (ps
->pool
.info
.is_pending_merge(pgid
, &merge_target
)) {
5864 ps
->state_set(PG_STATE_PEERED
);
5865 ps
->state_set(PG_STATE_PREMERGE
);
5867 if (ps
->actingset
.size() != ps
->get_osdmap()->get_pg_size(pgid
)) {
5870 src
.set_ps(ps
->pool
.info
.get_pg_num_pending());
5871 assert(src
.get_parent() == pgid
);
5872 pl
->set_not_ready_to_merge_target(pgid
, src
);
5874 pl
->set_not_ready_to_merge_source(pgid
);
5877 } else if (ps
->actingset
.size() < ps
->pool
.info
.min_size
) {
5878 ps
->state_set(PG_STATE_PEERED
);
5880 ps
->state_set(PG_STATE_ACTIVE
);
5883 auto mnow
= pl
->get_mnow();
5884 if (ps
->prior_readable_until_ub
> mnow
) {
5885 psdout(10) << " waiting for prior_readable_until_ub "
5886 << ps
->prior_readable_until_ub
<< " > mnow " << mnow
<< dendl
;
5887 ps
->state_set(PG_STATE_WAIT
);
5888 pl
->queue_check_readable(
5889 ps
->last_peering_reset
,
5890 ps
->prior_readable_until_ub
- mnow
);
5892 psdout(10) << " mnow " << mnow
<< " >= prior_readable_until_ub "
5893 << ps
->prior_readable_until_ub
<< dendl
;
5896 if (ps
->pool
.info
.has_flag(pg_pool_t::FLAG_CREATING
)) {
5897 pl
->send_pg_created(pgid
);
5900 ps
->info
.history
.last_epoch_started
= ps
->info
.last_epoch_started
;
5901 ps
->info
.history
.last_interval_started
= ps
->info
.last_interval_started
;
5902 ps
->dirty_info
= true;
5904 ps
->share_pg_info();
5905 pl
->publish_stats_to_osd();
5907 pl
->on_activate_complete();
5909 return discard_event();
5912 boost::statechart::result
PeeringState::Active::react(const RenewLease
& rl
)
5915 ps
->proc_renew_lease();
5916 return discard_event();
5919 boost::statechart::result
PeeringState::Active::react(const MLeaseAck
& la
)
5922 ps
->proc_lease_ack(la
.from
, la
.lease_ack
);
5923 return discard_event();
5927 boost::statechart::result
PeeringState::Active::react(const CheckReadable
&evt
)
5930 pl
->recheck_readable();
5931 return discard_event();
5935 * update info.history.last_epoch_started ONLY after we and all
5936 * replicas have activated AND committed the activate transaction
5937 * (i.e. the peering results are stable on disk).
5939 void PeeringState::Active::all_activated_and_committed()
5942 psdout(10) << "all_activated_and_committed" << dendl
;
5943 ceph_assert(ps
->is_primary());
5944 ceph_assert(ps
->peer_activated
.size() == ps
->acting_recovery_backfill
.size());
5945 ceph_assert(!ps
->acting_recovery_backfill
.empty());
5946 ceph_assert(ps
->blocked_by
.empty());
5948 if (HAVE_FEATURE(ps
->upacting_features
, SERVER_OCTOPUS
)) {
5949 // this is overkill when the activation is quick, but when it is slow it
5950 // is important, because the lease was renewed by the activate itself but we
5951 // don't know how long ago that was, and simply scheduling now may leave
5952 // a gap in lease coverage. keep it simple and aggressively renew.
5953 ps
->renew_lease(pl
->get_mnow());
5955 ps
->schedule_renew_lease();
5959 ps
->update_calc_stats();
5960 if (ps
->info
.stats
.stats
.sum
.num_objects_degraded
) {
5961 ps
->state_set(PG_STATE_DEGRADED
);
5963 ps
->state_clear(PG_STATE_DEGRADED
);
5966 post_event(PeeringState::AllReplicasActivated());
5970 void PeeringState::Active::exit()
5972 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
5976 pl
->cancel_local_background_io_reservation();
5978 ps
->blocked_by
.clear();
5979 ps
->backfill_reserved
= false;
5980 ps
->state_clear(PG_STATE_ACTIVATING
);
5981 ps
->state_clear(PG_STATE_DEGRADED
);
5982 ps
->state_clear(PG_STATE_UNDERSIZED
);
5983 ps
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
5984 ps
->state_clear(PG_STATE_BACKFILL_WAIT
);
5985 ps
->state_clear(PG_STATE_RECOVERY_WAIT
);
5986 ps
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
5987 utime_t dur
= ceph_clock_now() - enter_time
;
5988 pl
->get_peering_perf().tinc(rs_active_latency
, dur
);
5989 pl
->on_active_exit();
5992 /*------ReplicaActive-----*/
5993 PeeringState::ReplicaActive::ReplicaActive(my_context ctx
)
5995 NamedState(context
< PeeringMachine
>().state_history
, "Started/ReplicaActive")
5997 context
< PeeringMachine
>().log_enter(state_name
);
6000 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
6004 boost::statechart::result
PeeringState::ReplicaActive::react(
6005 const Activate
& actevt
) {
6007 psdout(10) << "In ReplicaActive, about to call activate" << dendl
;
6009 context
< PeeringMachine
>().get_cur_transaction(),
6010 actevt
.activation_epoch
,
6011 context
< PeeringMachine
>().get_recovery_ctx());
6012 psdout(10) << "Activate Finished" << dendl
;
6013 return discard_event();
6016 boost::statechart::result
PeeringState::ReplicaActive::react(
6017 const ActivateCommitted
&evt
)
6020 psdout(10) << __func__
<< " " << evt
.epoch
<< " telling primary" << dendl
;
6022 auto &rctx
= context
<PeeringMachine
>().get_recovery_ctx();
6023 auto epoch
= ps
->get_osdmap_epoch();
6024 pg_info_t i
= ps
->info
;
6025 i
.history
.last_epoch_started
= evt
.activation_epoch
;
6026 i
.history
.last_interval_started
= i
.history
.same_interval_since
;
6028 ps
->get_primary().osd
,
6029 spg_t(ps
->info
.pgid
.pgid
, ps
->get_primary().shard
),
6034 ps
->get_lease_ack());
6036 if (ps
->actingset
.size() >= ps
->pool
.info
.min_size
) {
6037 ps
->state_set(PG_STATE_ACTIVE
);
6039 ps
->state_set(PG_STATE_PEERED
);
6041 pl
->on_activate_committed();
6043 return discard_event();
6046 boost::statechart::result
PeeringState::ReplicaActive::react(const MLease
& l
)
6049 spg_t spgid
= context
< PeeringMachine
>().spgid
;
6050 epoch_t epoch
= pl
->get_osdmap_epoch();
6052 ps
->proc_lease(l
.lease
);
6053 pl
->send_cluster_message(
6054 ps
->get_primary().osd
,
6055 new MOSDPGLeaseAck(epoch
,
6056 spg_t(spgid
.pgid
, ps
->get_primary().shard
),
6057 ps
->get_lease_ack()),
6059 return discard_event();
6062 boost::statechart::result
PeeringState::ReplicaActive::react(const MInfoRec
& infoevt
)
6065 ps
->proc_primary_info(context
<PeeringMachine
>().get_cur_transaction(),
6067 return discard_event();
6070 boost::statechart::result
PeeringState::ReplicaActive::react(const MLogRec
& logevt
)
6073 psdout(10) << "received log from " << logevt
.from
<< dendl
;
6074 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6075 ps
->merge_log(t
, logevt
.msg
->info
, logevt
.msg
->log
, logevt
.from
);
6076 ceph_assert(ps
->pg_log
.get_head() == ps
->info
.last_update
);
6077 if (logevt
.msg
->lease
) {
6078 ps
->proc_lease(*logevt
.msg
->lease
);
6081 return discard_event();
6084 boost::statechart::result
PeeringState::ReplicaActive::react(const MTrim
& trim
)
6087 // primary is instructing us to trim
6088 ps
->pg_log
.trim(trim
.trim_to
, ps
->info
);
6089 ps
->dirty_info
= true;
6090 return discard_event();
6093 boost::statechart::result
PeeringState::ReplicaActive::react(const ActMap
&)
6096 if (ps
->should_send_notify() && ps
->get_primary().osd
>= 0) {
6097 ps
->info
.history
.refresh_prior_readable_until_ub(
6098 pl
->get_mnow(), ps
->prior_readable_until_ub
);
6099 context
< PeeringMachine
>().send_notify(
6100 ps
->get_primary().osd
,
6102 ps
->get_primary().shard
, ps
->pg_whoami
.shard
,
6103 ps
->get_osdmap_epoch(),
6104 ps
->get_osdmap_epoch(),
6106 ps
->past_intervals
));
6108 return discard_event();
6111 boost::statechart::result
PeeringState::ReplicaActive::react(
6112 const MQuery
& query
)
6115 ps
->fulfill_query(query
, context
<PeeringMachine
>().get_recovery_ctx());
6116 return discard_event();
6119 boost::statechart::result
PeeringState::ReplicaActive::react(const QueryState
& q
)
6121 q
.f
->open_object_section("state");
6122 q
.f
->dump_string("name", state_name
);
6123 q
.f
->dump_stream("enter_time") << enter_time
;
6124 q
.f
->close_section();
6125 return forward_event();
6128 void PeeringState::ReplicaActive::exit()
6130 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6132 pl
->unreserve_recovery_space();
6134 pl
->cancel_remote_recovery_reservation();
6135 utime_t dur
= ceph_clock_now() - enter_time
;
6136 pl
->get_peering_perf().tinc(rs_replicaactive_latency
, dur
);
6138 ps
->min_last_complete_ondisk
= eversion_t();
6142 PeeringState::Stray::Stray(my_context ctx
)
6144 NamedState(context
< PeeringMachine
>().state_history
, "Started/Stray")
6146 context
< PeeringMachine
>().log_enter(state_name
);
6150 ceph_assert(!ps
->is_peered());
6151 ceph_assert(!ps
->is_peering());
6152 ceph_assert(!ps
->is_primary());
6154 if (!ps
->get_osdmap()->have_pg_pool(ps
->info
.pgid
.pgid
.pool())) {
6155 ldout(ps
->cct
,10) << __func__
<< " pool is deleted" << dendl
;
6156 post_event(DeleteStart());
6158 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
6162 boost::statechart::result
PeeringState::Stray::react(const MLogRec
& logevt
)
6165 MOSDPGLog
*msg
= logevt
.msg
.get();
6166 psdout(10) << "got info+log from osd." << logevt
.from
<< " " << msg
->info
<< " " << msg
->log
<< dendl
;
6168 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6169 if (msg
->info
.last_backfill
== hobject_t()) {
6171 ps
->info
= msg
->info
;
6172 pl
->on_info_history_change();
6173 ps
->dirty_info
= true;
6174 ps
->dirty_big_info
= true; // maybe.
6176 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
6177 ps
->pg_log
.reset_backfill_claim_log(msg
->log
, rollbacker
.get());
6179 ps
->pg_log
.reset_backfill();
6181 ps
->merge_log(t
, msg
->info
, msg
->log
, logevt
.from
);
6183 if (logevt
.msg
->lease
) {
6184 ps
->proc_lease(*logevt
.msg
->lease
);
6187 ceph_assert(ps
->pg_log
.get_head() == ps
->info
.last_update
);
6189 post_event(Activate(logevt
.msg
->info
.last_epoch_started
));
6190 return transit
<ReplicaActive
>();
6193 boost::statechart::result
PeeringState::Stray::react(const MInfoRec
& infoevt
)
6196 psdout(10) << "got info from osd." << infoevt
.from
<< " " << infoevt
.info
<< dendl
;
6198 if (ps
->info
.last_update
> infoevt
.info
.last_update
) {
6199 // rewind divergent log entries
6200 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6201 ps
->rewind_divergent_log(t
, infoevt
.info
.last_update
);
6202 ps
->info
.stats
= infoevt
.info
.stats
;
6203 ps
->info
.hit_set
= infoevt
.info
.hit_set
;
6206 if (infoevt
.lease
) {
6207 ps
->proc_lease(*infoevt
.lease
);
6210 ceph_assert(infoevt
.info
.last_update
== ps
->info
.last_update
);
6211 ceph_assert(ps
->pg_log
.get_head() == ps
->info
.last_update
);
6213 post_event(Activate(infoevt
.info
.last_epoch_started
));
6214 return transit
<ReplicaActive
>();
6217 boost::statechart::result
PeeringState::Stray::react(const MQuery
& query
)
6220 ps
->fulfill_query(query
, context
<PeeringMachine
>().get_recovery_ctx());
6221 return discard_event();
6224 boost::statechart::result
PeeringState::Stray::react(const ActMap
&)
6227 if (ps
->should_send_notify() && ps
->get_primary().osd
>= 0) {
6228 ps
->info
.history
.refresh_prior_readable_until_ub(
6229 pl
->get_mnow(), ps
->prior_readable_until_ub
);
6230 context
< PeeringMachine
>().send_notify(
6231 ps
->get_primary().osd
,
6233 ps
->get_primary().shard
, ps
->pg_whoami
.shard
,
6234 ps
->get_osdmap_epoch(),
6235 ps
->get_osdmap_epoch(),
6237 ps
->past_intervals
));
6239 return discard_event();
6242 void PeeringState::Stray::exit()
6244 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6246 utime_t dur
= ceph_clock_now() - enter_time
;
6247 pl
->get_peering_perf().tinc(rs_stray_latency
, dur
);
6251 /*--------ToDelete----------*/
6252 PeeringState::ToDelete::ToDelete(my_context ctx
)
6254 NamedState(context
< PeeringMachine
>().state_history
, "Started/ToDelete")
6256 context
< PeeringMachine
>().log_enter(state_name
);
6258 pl
->get_perf_logger().inc(l_osd_pg_removing
);
6261 void PeeringState::ToDelete::exit()
6263 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6265 // note: on a successful removal, this path doesn't execute. see
6267 pl
->get_perf_logger().dec(l_osd_pg_removing
);
6269 pl
->cancel_local_background_io_reservation();
6272 /*----WaitDeleteReserved----*/
6273 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx
)
6275 NamedState(context
< PeeringMachine
>().state_history
,
6276 "Started/ToDelete/WaitDeleteReseved")
6278 context
< PeeringMachine
>().log_enter(state_name
);
6280 context
< ToDelete
>().priority
= ps
->get_delete_priority();
6282 pl
->cancel_local_background_io_reservation();
6283 pl
->request_local_background_io_reservation(
6284 context
<ToDelete
>().priority
,
6285 std::make_shared
<PGPeeringEvent
>(
6286 ps
->get_osdmap_epoch(),
6287 ps
->get_osdmap_epoch(),
6289 std::make_shared
<PGPeeringEvent
>(
6290 ps
->get_osdmap_epoch(),
6291 ps
->get_osdmap_epoch(),
6292 DeleteInterrupted()));
6295 boost::statechart::result
PeeringState::ToDelete::react(
6299 if (ps
->get_delete_priority() != priority
) {
6300 psdout(10) << __func__
<< " delete priority changed, resetting"
6302 return transit
<ToDelete
>();
6304 return discard_event();
6307 void PeeringState::WaitDeleteReserved::exit()
6309 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6312 /*----Deleting-----*/
6313 PeeringState::Deleting::Deleting(my_context ctx
)
6315 NamedState(context
< PeeringMachine
>().state_history
, "Started/ToDelete/Deleting")
6317 start
= ceph::mono_clock::now();
6319 context
< PeeringMachine
>().log_enter(state_name
);
6322 ps
->deleting
= true;
6323 ObjectStore::Transaction
&t
= context
<PeeringMachine
>().get_cur_transaction();
6326 PGLog::LogEntryHandlerRef rollbacker
{pl
->get_log_handler(t
)};
6327 ps
->pg_log
.roll_forward(rollbacker
.get());
6329 // adjust info to backfill
6330 ps
->info
.set_last_backfill(hobject_t());
6331 ps
->pg_log
.reset_backfill();
6332 ps
->dirty_info
= true;
6337 boost::statechart::result
PeeringState::Deleting::react(
6338 const DeleteSome
& evt
)
6341 next
= pl
->do_delete_work(context
<PeeringMachine
>().get_cur_transaction(),
6343 return discard_event();
6346 void PeeringState::Deleting::exit()
6348 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6350 ps
->deleting
= false;
6351 pl
->cancel_local_background_io_reservation();
6352 psdout(20) << "Deleting::" << __func__
<< this <<" finished in "
6353 << ceph::mono_clock::now() - start
6357 /*--------GetInfo---------*/
6358 PeeringState::GetInfo::GetInfo(my_context ctx
)
6360 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/GetInfo")
6362 context
< PeeringMachine
>().log_enter(state_name
);
6366 ps
->check_past_interval_bounds();
6367 ps
->log_weirdness();
6368 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6370 ceph_assert(ps
->blocked_by
.empty());
6372 prior_set
= ps
->build_prior();
6373 ps
->prior_readable_down_osds
= prior_set
.down
;
6374 if (ps
->prior_readable_down_osds
.empty()) {
6375 psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6377 ps
->clear_prior_readable_until_ub();
6380 ps
->reset_min_peer_features();
6382 if (prior_set
.pg_down
) {
6383 post_event(IsDown());
6384 } else if (peer_info_requested
.empty()) {
6385 post_event(GotInfo());
6389 void PeeringState::GetInfo::get_infos()
6392 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6394 ps
->blocked_by
.clear();
6395 for (set
<pg_shard_t
>::const_iterator it
= prior_set
.probe
.begin();
6396 it
!= prior_set
.probe
.end();
6398 pg_shard_t peer
= *it
;
6399 if (peer
== ps
->pg_whoami
) {
6402 if (ps
->peer_info
.count(peer
)) {
6403 psdout(10) << " have osd." << peer
<< " info " << ps
->peer_info
[peer
] << dendl
;
6406 if (peer_info_requested
.count(peer
)) {
6407 psdout(10) << " already requested info from osd." << peer
<< dendl
;
6408 ps
->blocked_by
.insert(peer
.osd
);
6409 } else if (!ps
->get_osdmap()->is_up(peer
.osd
)) {
6410 psdout(10) << " not querying info from down osd." << peer
<< dendl
;
6412 psdout(10) << " querying info from osd." << peer
<< dendl
;
6413 context
< PeeringMachine
>().send_query(
6415 pg_query_t(pg_query_t::INFO
,
6416 it
->shard
, ps
->pg_whoami
.shard
,
6418 ps
->get_osdmap_epoch()));
6419 peer_info_requested
.insert(peer
);
6420 ps
->blocked_by
.insert(peer
.osd
);
6424 ps
->check_prior_readable_down_osds(ps
->get_osdmap());
6426 pl
->publish_stats_to_osd();
6429 boost::statechart::result
PeeringState::GetInfo::react(const MNotifyRec
& infoevt
)
6434 set
<pg_shard_t
>::iterator p
= peer_info_requested
.find(infoevt
.from
);
6435 if (p
!= peer_info_requested
.end()) {
6436 peer_info_requested
.erase(p
);
6437 ps
->blocked_by
.erase(infoevt
.from
.osd
);
6440 epoch_t old_start
= ps
->info
.history
.last_epoch_started
;
6441 if (ps
->proc_replica_info(
6442 infoevt
.from
, infoevt
.notify
.info
, infoevt
.notify
.epoch_sent
)) {
6443 // we got something new ...
6444 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6445 if (old_start
< ps
->info
.history
.last_epoch_started
) {
6446 psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl
;
6447 prior_set
= ps
->build_prior();
6448 ps
->prior_readable_down_osds
= prior_set
.down
;
6450 // filter out any osds that got dropped from the probe set from
6451 // peer_info_requested. this is less expensive than restarting
6452 // peering (which would re-probe everyone).
6453 set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
6454 while (p
!= peer_info_requested
.end()) {
6455 if (prior_set
.probe
.count(*p
) == 0) {
6456 psdout(20) << " dropping osd." << *p
<< " from info_requested, no longer in probe set" << dendl
;
6457 peer_info_requested
.erase(p
++);
6464 psdout(20) << "Adding osd: " << infoevt
.from
.osd
<< " peer features: "
6465 << hex
<< infoevt
.features
<< dec
<< dendl
;
6466 ps
->apply_peer_features(infoevt
.features
);
6468 // are we done getting everything?
6469 if (peer_info_requested
.empty() && !prior_set
.pg_down
) {
6470 psdout(20) << "Common peer features: " << hex
<< ps
->get_min_peer_features() << dec
<< dendl
;
6471 psdout(20) << "Common acting features: " << hex
<< ps
->get_min_acting_features() << dec
<< dendl
;
6472 psdout(20) << "Common upacting features: " << hex
<< ps
->get_min_upacting_features() << dec
<< dendl
;
6473 post_event(GotInfo());
6476 return discard_event();
6479 boost::statechart::result
PeeringState::GetInfo::react(const QueryState
& q
)
6482 q
.f
->open_object_section("state");
6483 q
.f
->dump_string("name", state_name
);
6484 q
.f
->dump_stream("enter_time") << enter_time
;
6486 q
.f
->open_array_section("requested_info_from");
6487 for (set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
6488 p
!= peer_info_requested
.end();
6490 q
.f
->open_object_section("osd");
6491 q
.f
->dump_stream("osd") << *p
;
6492 if (ps
->peer_info
.count(*p
)) {
6493 q
.f
->open_object_section("got_info");
6494 ps
->peer_info
[*p
].dump(q
.f
);
6495 q
.f
->close_section();
6497 q
.f
->close_section();
6499 q
.f
->close_section();
6501 q
.f
->close_section();
6502 return forward_event();
6505 void PeeringState::GetInfo::exit()
6507 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6510 utime_t dur
= ceph_clock_now() - enter_time
;
6511 pl
->get_peering_perf().tinc(rs_getinfo_latency
, dur
);
6512 ps
->blocked_by
.clear();
6515 /*------GetLog------------*/
6516 PeeringState::GetLog::GetLog(my_context ctx
)
6519 context
< PeeringMachine
>().state_history
,
6520 "Started/Primary/Peering/GetLog"),
6523 context
< PeeringMachine
>().log_enter(state_name
);
6527 ps
->log_weirdness();
6530 if (!ps
->choose_acting(auth_log_shard
, false,
6531 &context
< Peering
>().history_les_bound
)) {
6532 if (!ps
->want_acting
.empty()) {
6533 post_event(NeedActingChange());
6535 post_event(IsIncomplete());
6541 if (auth_log_shard
== ps
->pg_whoami
) {
6542 post_event(GotLog());
6546 const pg_info_t
& best
= ps
->peer_info
[auth_log_shard
];
6549 if (ps
->info
.last_update
< best
.log_tail
) {
6550 psdout(10) << " not contiguous with osd." << auth_log_shard
<< ", down" << dendl
;
6551 post_event(IsIncomplete());
6555 // how much log to request?
6556 eversion_t request_log_from
= ps
->info
.last_update
;
6557 ceph_assert(!ps
->acting_recovery_backfill
.empty());
6558 for (set
<pg_shard_t
>::iterator p
= ps
->acting_recovery_backfill
.begin();
6559 p
!= ps
->acting_recovery_backfill
.end();
6561 if (*p
== ps
->pg_whoami
) continue;
6562 pg_info_t
& ri
= ps
->peer_info
[*p
];
6563 if (ri
.last_update
< ps
->info
.log_tail
&& ri
.last_update
>= best
.log_tail
&&
6564 ri
.last_update
< request_log_from
)
6565 request_log_from
= ri
.last_update
;
6569 psdout(10) << " requesting log from osd." << auth_log_shard
<< dendl
;
6570 context
<PeeringMachine
>().send_query(
6574 auth_log_shard
.shard
, ps
->pg_whoami
.shard
,
6575 request_log_from
, ps
->info
.history
,
6576 ps
->get_osdmap_epoch()));
6578 ceph_assert(ps
->blocked_by
.empty());
6579 ps
->blocked_by
.insert(auth_log_shard
.osd
);
6580 pl
->publish_stats_to_osd();
6583 boost::statechart::result
PeeringState::GetLog::react(const AdvMap
& advmap
)
6585 // make sure our log source didn't go down. we need to check
6586 // explicitly because it may not be part of the prior set, which
6587 // means the Peering state check won't catch it going down.
6588 if (!advmap
.osdmap
->is_up(auth_log_shard
.osd
)) {
6589 psdout(10) << "GetLog: auth_log_shard osd."
6590 << auth_log_shard
.osd
<< " went down" << dendl
;
6592 return transit
< Reset
>();
6595 // let the Peering state do its checks.
6596 return forward_event();
6599 boost::statechart::result
PeeringState::GetLog::react(const MLogRec
& logevt
)
6602 if (logevt
.from
!= auth_log_shard
) {
6603 psdout(10) << "GetLog: discarding log from "
6604 << "non-auth_log_shard osd." << logevt
.from
<< dendl
;
6605 return discard_event();
6607 psdout(10) << "GetLog: received master log from osd."
6608 << logevt
.from
<< dendl
;
6610 post_event(GotLog());
6611 return discard_event();
6614 boost::statechart::result
PeeringState::GetLog::react(const GotLog
&)
6618 psdout(10) << "leaving GetLog" << dendl
;
6620 psdout(10) << "processing master log" << dendl
;
6621 ps
->proc_master_log(context
<PeeringMachine
>().get_cur_transaction(),
6622 msg
->info
, msg
->log
, msg
->missing
,
6625 ps
->start_flush(context
< PeeringMachine
>().get_cur_transaction());
6626 return transit
< GetMissing
>();
6629 boost::statechart::result
PeeringState::GetLog::react(const QueryState
& q
)
6631 q
.f
->open_object_section("state");
6632 q
.f
->dump_string("name", state_name
);
6633 q
.f
->dump_stream("enter_time") << enter_time
;
6634 q
.f
->dump_stream("auth_log_shard") << auth_log_shard
;
6635 q
.f
->close_section();
6636 return forward_event();
6639 void PeeringState::GetLog::exit()
6641 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6644 utime_t dur
= ceph_clock_now() - enter_time
;
6645 pl
->get_peering_perf().tinc(rs_getlog_latency
, dur
);
6646 ps
->blocked_by
.clear();
6649 /*------WaitActingChange--------*/
6650 PeeringState::WaitActingChange::WaitActingChange(my_context ctx
)
6652 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/WaitActingChange")
6654 context
< PeeringMachine
>().log_enter(state_name
);
6657 boost::statechart::result
PeeringState::WaitActingChange::react(const AdvMap
& advmap
)
6660 OSDMapRef osdmap
= advmap
.osdmap
;
6662 psdout(10) << "verifying no want_acting " << ps
->want_acting
<< " targets didn't go down" << dendl
;
6663 for (vector
<int>::iterator p
= ps
->want_acting
.begin(); p
!= ps
->want_acting
.end(); ++p
) {
6664 if (!osdmap
->is_up(*p
)) {
6665 psdout(10) << " want_acting target osd." << *p
<< " went down, resetting" << dendl
;
6667 return transit
< Reset
>();
6670 return forward_event();
6673 boost::statechart::result
PeeringState::WaitActingChange::react(const MLogRec
& logevt
)
6675 psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl
;
6676 return discard_event();
6679 boost::statechart::result
PeeringState::WaitActingChange::react(const MInfoRec
& evt
)
6681 psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl
;
6682 return discard_event();
6685 boost::statechart::result
PeeringState::WaitActingChange::react(const MNotifyRec
& evt
)
6687 psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl
;
6688 return discard_event();
6691 boost::statechart::result
PeeringState::WaitActingChange::react(const QueryState
& q
)
6693 q
.f
->open_object_section("state");
6694 q
.f
->dump_string("name", state_name
);
6695 q
.f
->dump_stream("enter_time") << enter_time
;
6696 q
.f
->dump_string("comment", "waiting for pg acting set to change");
6697 q
.f
->close_section();
6698 return forward_event();
6701 void PeeringState::WaitActingChange::exit()
6703 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6705 utime_t dur
= ceph_clock_now() - enter_time
;
6706 pl
->get_peering_perf().tinc(rs_waitactingchange_latency
, dur
);
6709 /*------Down--------*/
6710 PeeringState::Down::Down(my_context ctx
)
6712 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/Down")
6714 context
< PeeringMachine
>().log_enter(state_name
);
6717 ps
->state_clear(PG_STATE_PEERING
);
6718 ps
->state_set(PG_STATE_DOWN
);
6720 auto &prior_set
= context
< Peering
>().prior_set
;
6721 ceph_assert(ps
->blocked_by
.empty());
6722 ps
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
6723 pl
->publish_stats_to_osd();
6726 void PeeringState::Down::exit()
6728 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6732 ps
->state_clear(PG_STATE_DOWN
);
6733 utime_t dur
= ceph_clock_now() - enter_time
;
6734 pl
->get_peering_perf().tinc(rs_down_latency
, dur
);
6736 ps
->blocked_by
.clear();
6739 boost::statechart::result
PeeringState::Down::react(const QueryState
& q
)
6741 q
.f
->open_object_section("state");
6742 q
.f
->dump_string("name", state_name
);
6743 q
.f
->dump_stream("enter_time") << enter_time
;
6744 q
.f
->dump_string("comment",
6745 "not enough up instances of this PG to go active");
6746 q
.f
->close_section();
6747 return forward_event();
6750 boost::statechart::result
PeeringState::Down::react(const MNotifyRec
& infoevt
)
6754 ceph_assert(ps
->is_primary());
6755 epoch_t old_start
= ps
->info
.history
.last_epoch_started
;
6756 if (!ps
->peer_info
.count(infoevt
.from
) &&
6757 ps
->get_osdmap()->has_been_up_since(infoevt
.from
.osd
, infoevt
.notify
.epoch_sent
)) {
6758 ps
->update_history(infoevt
.notify
.info
.history
);
6760 // if we got something new to make pg escape down state
6761 if (ps
->info
.history
.last_epoch_started
> old_start
) {
6762 psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl
;
6763 ps
->state_clear(PG_STATE_DOWN
);
6764 ps
->state_set(PG_STATE_PEERING
);
6765 return transit
< GetInfo
>();
6768 return discard_event();
6772 /*------Incomplete--------*/
6773 PeeringState::Incomplete::Incomplete(my_context ctx
)
6775 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/Incomplete")
6777 context
< PeeringMachine
>().log_enter(state_name
);
6780 ps
->state_clear(PG_STATE_PEERING
);
6781 ps
->state_set(PG_STATE_INCOMPLETE
);
6783 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
6784 ceph_assert(ps
->blocked_by
.empty());
6785 ps
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
6786 pl
->publish_stats_to_osd();
6789 boost::statechart::result
PeeringState::Incomplete::react(const AdvMap
&advmap
) {
6791 int64_t poolnum
= ps
->info
.pgid
.pool();
6793 // Reset if min_size turn smaller than previous value, pg might now be able to go active
6794 if (!advmap
.osdmap
->have_pg_pool(poolnum
) ||
6795 advmap
.lastmap
->get_pools().find(poolnum
)->second
.min_size
>
6796 advmap
.osdmap
->get_pools().find(poolnum
)->second
.min_size
) {
6798 return transit
< Reset
>();
6801 return forward_event();
6804 boost::statechart::result
PeeringState::Incomplete::react(const MNotifyRec
& notevt
) {
6806 psdout(7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
6807 if (ps
->proc_replica_info(
6808 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
)) {
6809 // We got something new, try again!
6810 return transit
< GetLog
>();
6812 return discard_event();
6816 boost::statechart::result
PeeringState::Incomplete::react(
6817 const QueryState
& q
)
6819 q
.f
->open_object_section("state");
6820 q
.f
->dump_string("name", state_name
);
6821 q
.f
->dump_stream("enter_time") << enter_time
;
6822 q
.f
->dump_string("comment", "not enough complete instances of this PG");
6823 q
.f
->close_section();
6824 return forward_event();
6827 void PeeringState::Incomplete::exit()
6829 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6833 ps
->state_clear(PG_STATE_INCOMPLETE
);
6834 utime_t dur
= ceph_clock_now() - enter_time
;
6835 pl
->get_peering_perf().tinc(rs_incomplete_latency
, dur
);
6837 ps
->blocked_by
.clear();
6840 /*------GetMissing--------*/
6841 PeeringState::GetMissing::GetMissing(my_context ctx
)
6843 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/GetMissing")
6845 context
< PeeringMachine
>().log_enter(state_name
);
6848 ps
->log_weirdness();
6849 ceph_assert(!ps
->acting_recovery_backfill
.empty());
6851 for (set
<pg_shard_t
>::iterator i
= ps
->acting_recovery_backfill
.begin();
6852 i
!= ps
->acting_recovery_backfill
.end();
6854 if (*i
== ps
->get_primary()) continue;
6855 const pg_info_t
& pi
= ps
->peer_info
[*i
];
6856 // reset this so to make sure the pg_missing_t is initialized and
6857 // has the correct semantics even if we don't need to get a
6858 // missing set from a shard. This way later additions due to
6859 // lost+unfound delete work properly.
6860 ps
->peer_missing
[*i
].may_include_deletes
= !ps
->perform_deletes_during_peering();
6863 continue; // no pg data, nothing divergent
6865 if (pi
.last_update
< ps
->pg_log
.get_tail()) {
6866 psdout(10) << " osd." << *i
<< " is not contiguous, will restart backfill" << dendl
;
6867 ps
->peer_missing
[*i
].clear();
6870 if (pi
.last_backfill
== hobject_t()) {
6871 psdout(10) << " osd." << *i
<< " will fully backfill; can infer empty missing set" << dendl
;
6872 ps
->peer_missing
[*i
].clear();
6876 if (pi
.last_update
== pi
.last_complete
&& // peer has no missing
6877 pi
.last_update
== ps
->info
.last_update
) { // peer is up to date
6878 // replica has no missing and identical log as us. no need to
6880 // FIXME: we can do better here. if last_update==last_complete we
6881 // can infer the rest!
6882 psdout(10) << " osd." << *i
<< " has no missing, identical log" << dendl
;
6883 ps
->peer_missing
[*i
].clear();
6887 // We pull the log from the peer's last_epoch_started to ensure we
6888 // get enough log to detect divergent updates.
6889 since
.epoch
= pi
.last_epoch_started
;
6890 ceph_assert(pi
.last_update
>= ps
->info
.log_tail
); // or else choose_acting() did a bad thing
6891 if (pi
.log_tail
<= since
) {
6892 psdout(10) << " requesting log+missing since " << since
<< " from osd." << *i
<< dendl
;
6893 context
< PeeringMachine
>().send_query(
6897 i
->shard
, ps
->pg_whoami
.shard
,
6898 since
, ps
->info
.history
,
6899 ps
->get_osdmap_epoch()));
6901 psdout(10) << " requesting fulllog+missing from osd." << *i
6902 << " (want since " << since
<< " < log.tail "
6903 << pi
.log_tail
<< ")" << dendl
;
6904 context
< PeeringMachine
>().send_query(
6906 pg_query_t::FULLLOG
,
6907 i
->shard
, ps
->pg_whoami
.shard
,
6908 ps
->info
.history
, ps
->get_osdmap_epoch()));
6910 peer_missing_requested
.insert(*i
);
6911 ps
->blocked_by
.insert(i
->osd
);
6914 if (peer_missing_requested
.empty()) {
6915 if (ps
->need_up_thru
) {
6916 psdout(10) << " still need up_thru update before going active"
6918 post_event(NeedUpThru());
6923 post_event(Activate(ps
->get_osdmap_epoch()));
6925 pl
->publish_stats_to_osd();
6929 boost::statechart::result
PeeringState::GetMissing::react(const MLogRec
& logevt
)
6933 peer_missing_requested
.erase(logevt
.from
);
6934 ps
->proc_replica_log(logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
6936 if (peer_missing_requested
.empty()) {
6937 if (ps
->need_up_thru
) {
6938 psdout(10) << " still need up_thru update before going active"
6940 post_event(NeedUpThru());
6942 psdout(10) << "Got last missing, don't need missing "
6943 << "posting Activate" << dendl
;
6944 post_event(Activate(ps
->get_osdmap_epoch()));
6947 return discard_event();
6950 boost::statechart::result
PeeringState::GetMissing::react(const QueryState
& q
)
6953 q
.f
->open_object_section("state");
6954 q
.f
->dump_string("name", state_name
);
6955 q
.f
->dump_stream("enter_time") << enter_time
;
6957 q
.f
->open_array_section("peer_missing_requested");
6958 for (set
<pg_shard_t
>::iterator p
= peer_missing_requested
.begin();
6959 p
!= peer_missing_requested
.end();
6961 q
.f
->open_object_section("osd");
6962 q
.f
->dump_stream("osd") << *p
;
6963 if (ps
->peer_missing
.count(*p
)) {
6964 q
.f
->open_object_section("got_missing");
6965 ps
->peer_missing
[*p
].dump(q
.f
);
6966 q
.f
->close_section();
6968 q
.f
->close_section();
6970 q
.f
->close_section();
6972 q
.f
->close_section();
6973 return forward_event();
6976 void PeeringState::GetMissing::exit()
6978 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
6981 utime_t dur
= ceph_clock_now() - enter_time
;
6982 pl
->get_peering_perf().tinc(rs_getmissing_latency
, dur
);
6983 ps
->blocked_by
.clear();
6986 /*------WaitUpThru--------*/
6987 PeeringState::WaitUpThru::WaitUpThru(my_context ctx
)
6989 NamedState(context
< PeeringMachine
>().state_history
, "Started/Primary/Peering/WaitUpThru")
6991 context
< PeeringMachine
>().log_enter(state_name
);
6994 boost::statechart::result
PeeringState::WaitUpThru::react(const ActMap
& am
)
6997 if (!ps
->need_up_thru
) {
6998 post_event(Activate(ps
->get_osdmap_epoch()));
7000 return forward_event();
7003 boost::statechart::result
PeeringState::WaitUpThru::react(const MLogRec
& logevt
)
7006 psdout(10) << "Noting missing from osd." << logevt
.from
<< dendl
;
7007 ps
->peer_missing
[logevt
.from
].claim(logevt
.msg
->missing
);
7008 ps
->peer_info
[logevt
.from
] = logevt
.msg
->info
;
7009 return discard_event();
7012 boost::statechart::result
PeeringState::WaitUpThru::react(const QueryState
& q
)
7014 q
.f
->open_object_section("state");
7015 q
.f
->dump_string("name", state_name
);
7016 q
.f
->dump_stream("enter_time") << enter_time
;
7017 q
.f
->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7018 q
.f
->close_section();
7019 return forward_event();
7022 void PeeringState::WaitUpThru::exit()
7024 context
< PeeringMachine
>().log_exit(state_name
, enter_time
);
7026 utime_t dur
= ceph_clock_now() - enter_time
;
7027 pl
->get_peering_perf().tinc(rs_waitupthru_latency
, dur
);
7030 /*----PeeringState::PeeringMachine Methods-----*/
7032 #define dout_prefix dpp->gen_prefix(*_dout)
7034 void PeeringState::PeeringMachine::log_enter(const char *state_name
)
7037 psdout(5) << "enter " << state_name
<< dendl
;
7038 pl
->log_state_enter(state_name
);
7041 void PeeringState::PeeringMachine::log_exit(const char *state_name
, utime_t enter_time
)
7044 utime_t dur
= ceph_clock_now() - enter_time
;
7045 psdout(5) << "exit " << state_name
<< " " << dur
<< " " << event_count
<< " " << event_time
<< dendl
;
7046 pl
->log_state_exit(state_name
, enter_time
, event_count
, event_time
);
7048 event_time
= utime_t();
7051 ostream
&operator<<(ostream
&out
, const PeeringState
&ps
) {
7052 out
<< "pg[" << ps
.info
7053 << " " << pg_vector_string(ps
.up
);
7054 if (ps
.acting
!= ps
.up
)
7055 out
<< "/" << pg_vector_string(ps
.acting
);
7057 out
<< "p" << ps
.get_primary();
7058 if (!ps
.async_recovery_targets
.empty())
7059 out
<< " async=[" << ps
.async_recovery_targets
<< "]";
7060 if (!ps
.backfill_targets
.empty())
7061 out
<< " backfill=[" << ps
.backfill_targets
<< "]";
7062 out
<< " r=" << ps
.get_role();
7063 out
<< " lpr=" << ps
.get_last_peering_reset();
7068 if (!ps
.past_intervals
.empty()) {
7069 out
<< " pi=[" << ps
.past_intervals
.get_bounds()
7070 << ")/" << ps
.past_intervals
.size();
7073 if (ps
.is_peered()) {
7074 if (ps
.last_update_ondisk
!= ps
.info
.last_update
)
7075 out
<< " luod=" << ps
.last_update_ondisk
;
7076 if (ps
.last_update_applied
!= ps
.info
.last_update
)
7077 out
<< " lua=" << ps
.last_update_applied
;
7080 if (ps
.pg_log
.get_tail() != ps
.info
.log_tail
||
7081 ps
.pg_log
.get_head() != ps
.info
.last_update
)
7082 out
<< " (info mismatch, " << ps
.pg_log
.get_log() << ")";
7084 if (!ps
.pg_log
.get_log().empty()) {
7085 if ((ps
.pg_log
.get_log().log
.begin()->version
<= ps
.pg_log
.get_tail())) {
7086 out
<< " (log bound mismatch, actual=["
7087 << ps
.pg_log
.get_log().log
.begin()->version
<< ","
7088 << ps
.pg_log
.get_log().log
.rbegin()->version
<< "]";
7093 out
<< " crt=" << ps
.pg_log
.get_can_rollback_to();
7095 if (ps
.last_complete_ondisk
!= ps
.info
.last_complete
)
7096 out
<< " lcod " << ps
.last_complete_ondisk
;
7098 out
<< " mlcod " << ps
.min_last_complete_ondisk
;
7100 out
<< " " << pg_state_string(ps
.get_state());
7101 if (ps
.should_send_notify())
7104 if (ps
.prior_readable_until_ub
!= ceph::signedspan::zero()) {
7105 out
<< " pruub " << ps
.prior_readable_until_ub
7106 << "@" << ps
.get_prior_readable_down_osds();