ceph/src/osd/PeeringState.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3
   4 #include "PGPeeringEvent.h"
   5 #include "common/ceph_releases.h"
   6 #include "common/dout.h"
   7 #include "PeeringState.h"
   8
   9 #include "messages/MOSDPGRemove.h"
  10 #include "messages/MBackfillReserve.h"
  11 #include "messages/MRecoveryReserve.h"
  12 #include "messages/MOSDScrubReserve.h"
  13 #include "messages/MOSDPGInfo.h"
  14 #include "messages/MOSDPGInfo2.h"
  15 #include "messages/MOSDPGTrim.h"
  16 #include "messages/MOSDPGLog.h"
  17 #include "messages/MOSDPGNotify.h"
  18 #include "messages/MOSDPGNotify2.h"
  19 #include "messages/MOSDPGQuery.h"
  20 #include "messages/MOSDPGQuery2.h"
  21 #include "messages/MOSDPGLease.h"
  22 #include "messages/MOSDPGLeaseAck.h"
  23
  24 #define dout_context cct
  25 #define dout_subsys ceph_subsys_osd
  26
  27 using std::dec;
  28 using std::hex;
  29 using std::make_pair;
  30 using std::map;
  31 using std::ostream;
  32 using std::pair;
  33 using std::set;
  34 using std::stringstream;
  35 using std::vector;
  36
  37 using ceph::Formatter;
  38 using ceph::make_message;
  39
  40 BufferedRecoveryMessages::BufferedRecoveryMessages(
  41   ceph_release_t r,
  42   PeeringCtx &ctx)
  43   : require_osd_release(r) {
  44   // steal messages from ctx
  45   message_map.swap(ctx.message_map);
  46 }
  47
  48 void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
  49 {
  50   if (require_osd_release >= ceph_release_t::octopus) {
  51     spg_t pgid(n.info.pgid.pgid, n.to);
  52     send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
  53   } else {
  54     send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
  55   }
  56 }
  57
  58 void BufferedRecoveryMessages::send_query(
  59   int to,
  60   spg_t to_spgid,
  61   const pg_query_t &q)
  62 {
  63   if (require_osd_release >= ceph_release_t::octopus) {
  64     send_osd_message(to,
  65                      make_message<MOSDPGQuery2>(to_spgid, q));
  66   } else {
  67     auto m = make_message<MOSDPGQuery>(
  68       q.epoch_sent,
  69       MOSDPGQuery::pg_list_t{{to_spgid, q}});
  70     send_osd_message(to, m);
  71   }
  72 }
  73
  74 void BufferedRecoveryMessages::send_info(
  75   int to,
  76   spg_t to_spgid,
  77   epoch_t min_epoch,
  78   epoch_t cur_epoch,
  79   const pg_info_t &info,
  80   std::optional<pg_lease_t> lease,
  81   std::optional<pg_lease_ack_t> lease_ack)
  82 {
  83   if (require_osd_release >= ceph_release_t::octopus) {
  84     send_osd_message(
  85       to,
  86       make_message<MOSDPGInfo2>(
  87         to_spgid,
  88         info,
  89         cur_epoch,
  90         min_epoch,
  91         lease,
  92         lease_ack)
  93       );
  94   } else {
  95     send_osd_message(
  96       to,
  97       make_message<MOSDPGInfo>(
  98         cur_epoch,
  99         vector{pg_notify_t{to_spgid.shard,
 100                            info.pgid.shard,
 101                            min_epoch, cur_epoch,
 102                            info, PastIntervals{}}})
 103       );
 104   }
 105 }
 106
 107 void PGPool::update(OSDMapRef map)
 108 {
 109   const pg_pool_t *pi = map->get_pg_pool(id);
 110   if (!pi) {
 111     return; // pool has been deleted
 112   }
 113   info = *pi;
 114   name = map->get_pool_name(id);
 115
 116   bool updated = false;
 117   if ((map->get_epoch() != cached_epoch + 1) ||
 118       (pi->get_snap_epoch() == map->get_epoch())) {
 119     updated = true;
 120   }
 121
 122   if (info.is_pool_snaps_mode() && updated) {
 123     snapc = pi->get_snap_context();
 124   }
 125   cached_epoch = map->get_epoch();
 126 }
 127
 128 /*-------------Peering State Helpers----------------*/
 129 #undef dout_prefix
 130 #define dout_prefix (dpp->gen_prefix(*_dout))
 131 #undef psdout
 132 #define psdout(x) ldout(cct, x)
 133
 134 PeeringState::PeeringState(
 135   CephContext *cct,
 136   pg_shard_t pg_whoami,
 137   spg_t spgid,
 138   const PGPool &_pool,
 139   OSDMapRef curmap,
 140   DoutPrefixProvider *dpp,
 141   PeeringListener *pl)
 142   : state_history(*pl),
 143     cct(cct),
 144     spgid(spgid),
 145     dpp(dpp),
 146     pl(pl),
 147     orig_ctx(0),
 148     osdmap_ref(curmap),
 149     pool(_pool),
 150     pg_whoami(pg_whoami),
 151     info(spgid),
 152     pg_log(cct),
 153     missing_loc(spgid, this, dpp, cct),
 154     machine(this, cct, spgid, dpp, pl, &state_history)
 155 {
 156   machine.initiate();
 157 }
 158
 159 void PeeringState::start_handle(PeeringCtx *new_ctx) {
 160   ceph_assert(!rctx);
 161   ceph_assert(!orig_ctx);
 162   orig_ctx = new_ctx;
 163   if (new_ctx) {
 164     if (messages_pending_flush) {
 165       rctx.emplace(*messages_pending_flush, *new_ctx);
 166     } else {
 167       rctx.emplace(*new_ctx);
 168     }
 169     rctx->start_time = ceph_clock_now();
 170   }
 171 }
 172
 173 void PeeringState::begin_block_outgoing() {
 174   ceph_assert(!messages_pending_flush);
 175   ceph_assert(orig_ctx);
 176   ceph_assert(rctx);
 177   messages_pending_flush = BufferedRecoveryMessages(
 178     orig_ctx->require_osd_release);
 179   rctx.emplace(*messages_pending_flush, *orig_ctx);
 180 }
 181
 182 void PeeringState::clear_blocked_outgoing() {
 183   ceph_assert(orig_ctx);
 184   ceph_assert(rctx);
 185   messages_pending_flush = std::optional<BufferedRecoveryMessages>();
 186 }
 187
 188 void PeeringState::end_block_outgoing() {
 189   ceph_assert(messages_pending_flush);
 190   ceph_assert(orig_ctx);
 191   ceph_assert(rctx);
 192
 193   orig_ctx->accept_buffered_messages(*messages_pending_flush);
 194   rctx.emplace(*orig_ctx);
 195   messages_pending_flush = std::optional<BufferedRecoveryMessages>();
 196 }
 197
 198 void PeeringState::end_handle() {
 199   if (rctx) {
 200     utime_t dur = ceph_clock_now() - rctx->start_time;
 201     machine.event_time += dur;
 202   }
 203
 204   machine.event_count++;
 205   rctx = std::nullopt;
 206   orig_ctx = NULL;
 207 }
 208
 209 void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
 210 {
 211   /*
 212    * check that any peers we are planning to (or currently) pulling
 213    * objects from are dealt with.
 214    */
 215   missing_loc.check_recovery_sources(osdmap);
 216   pl->check_recovery_sources(osdmap);
 217
 218   for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
 219     if (!osdmap->is_up(i->osd)) {
 220       psdout(10) << "peer_log_requested removing " << *i << dendl;
 221       peer_log_requested.erase(i++);
 222     } else {
 223       ++i;
 224     }
 225   }
 226
 227   for (auto i = peer_missing_requested.begin();
 228        i != peer_missing_requested.end();) {
 229     if (!osdmap->is_up(i->osd)) {
 230       psdout(10) << "peer_missing_requested removing " << *i << dendl;
 231       peer_missing_requested.erase(i++);
 232     } else {
 233       ++i;
 234     }
 235   }
 236 }
 237
 238 void PeeringState::update_history(const pg_history_t& new_history)
 239 {
 240   auto mnow = pl->get_mnow();
 241   info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
 242   if (info.history.merge(new_history)) {
 243     psdout(20) << __func__ << " advanced history from " << new_history << dendl;
 244     dirty_info = true;
 245     if (info.history.last_epoch_clean >= info.history.same_interval_since) {
 246       psdout(20) << __func__ << " clearing past_intervals" << dendl;
 247       past_intervals.clear();
 248       dirty_big_info = true;
 249     }
 250     prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
 251     if (prior_readable_until_ub != ceph::signedspan::zero()) {
 252       dout(20) << __func__
 253                << " prior_readable_until_ub " << prior_readable_until_ub
 254                << " (mnow " << mnow << " + "
 255                << info.history.prior_readable_until_ub << ")" << dendl;
 256     }
 257   }
 258   pl->on_info_history_change();
 259 }
 260
 261 hobject_t PeeringState::earliest_backfill() const
 262 {
 263   hobject_t e = hobject_t::get_max();
 264   for (const pg_shard_t& bt : get_backfill_targets()) {
 265     const pg_info_t &pi = get_peer_info(bt);
 266     e = std::min(pi.last_backfill, e);
 267   }
 268   return e;
 269 }
 270
 271 void PeeringState::purge_strays()
 272 {
 273   if (is_premerge()) {
 274     psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
 275                << dendl;
 276     return;
 277   }
 278   if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
 279     return;
 280   }
 281   psdout(10) << "purge_strays " << stray_set << dendl;
 282
 283   bool removed = false;
 284   for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
 285     ceph_assert(!is_acting_recovery_backfill(*p));
 286     if (get_osdmap()->is_up(p->osd)) {
 287       psdout(10) << "sending PGRemove to osd." << *p << dendl;
 288       vector<spg_t> to_remove;
 289       to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
 290       auto m = make_message<MOSDPGRemove>(
 291         get_osdmap_epoch(),
 292         to_remove);
 293       pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
 294     } else {
 295       psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
 296     }
 297     peer_missing.erase(*p);
 298     peer_info.erase(*p);
 299     missing_loc.remove_stray_recovery_sources(*p);
 300     peer_purged.insert(*p);
 301     removed = true;
 302   }
 303
 304   // if we removed anyone, update peers (which include peer_info)
 305   if (removed)
 306     update_heartbeat_peers();
 307
 308   stray_set.clear();
 309
 310   // clear _requested maps; we may have to peer() again if we discover
 311   // (more) stray content
 312   peer_log_requested.clear();
 313   peer_missing_requested.clear();
 314 }
 315
 316 void PeeringState::query_unfound(Formatter *f, string state)
 317 {
 318   psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
 319   {
 320     f->dump_string("state", state);
 321     f->dump_bool("available_might_have_unfound", true);
 322     f->open_array_section("might_have_unfound");
 323     for (auto p = might_have_unfound.begin();
 324          p != might_have_unfound.end();
 325          ++p) {
 326       if (peer_missing.count(*p)) {
 327         ; // Ignore already probed OSDs
 328       } else {
 329         f->open_object_section("osd");
 330         f->dump_stream("osd") << *p;
 331         if (peer_missing_requested.count(*p)) {
 332           f->dump_string("status", "querying");
 333         } else if (!get_osdmap()->is_up(p->osd)) {
 334           f->dump_string("status", "osd is down");
 335         } else {
 336           f->dump_string("status", "not queried");
 337         }
 338         f->close_section();
 339       }
 340     }
 341     f->close_section();
 342   }
 343   psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
 344   return;
 345 }
 346
 347 bool PeeringState::proc_replica_info(
 348   pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
 349 {
 350   auto p = peer_info.find(from);
 351   if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
 352     psdout(10) << " got dup osd." << from << " info "
 353                << oinfo << ", identical to ours" << dendl;
 354     return false;
 355   }
 356
 357   if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
 358     psdout(10) << " got info " << oinfo << " from down osd." << from
 359              << " discarding" << dendl;
 360     return false;
 361   }
 362
 363   psdout(10) << " got osd." << from << " " << oinfo << dendl;
 364   ceph_assert(is_primary());
 365   peer_info[from] = oinfo;
 366   might_have_unfound.insert(from);
 367
 368   update_history(oinfo.history);
 369
 370   // stray?
 371   if (!is_up(from) && !is_acting(from)) {
 372     psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
 373     stray_set.insert(from);
 374     if (is_clean()) {
 375       purge_strays();
 376     }
 377   }
 378
 379   // was this a new info?  if so, update peers!
 380   if (p == peer_info.end())
 381     update_heartbeat_peers();
 382
 383   return true;
 384 }
 385
 386
 387 void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
 388 {
 389   // Remove any downed osds from peer_info
 390   bool removed = false;
 391   auto p = peer_info.begin();
 392   while (p != peer_info.end()) {
 393     if (!osdmap->is_up(p->first.osd)) {
 394       psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
 395       peer_missing.erase(p->first);
 396       peer_log_requested.erase(p->first);
 397       peer_missing_requested.erase(p->first);
 398       peer_purged.erase(p->first);
 399       peer_info.erase(p++);
 400       removed = true;
 401     } else
 402       ++p;
 403   }
 404
 405   // if we removed anyone, update peers (which include peer_info)
 406   if (removed)
 407     update_heartbeat_peers();
 408
 409   check_recovery_sources(osdmap);
 410 }
 411
 412 void PeeringState::update_heartbeat_peers()
 413 {
 414   if (!is_primary())
 415     return;
 416
 417   set<int> new_peers;
 418   for (unsigned i=0; i<acting.size(); i++) {
 419     if (acting[i] != CRUSH_ITEM_NONE)
 420       new_peers.insert(acting[i]);
 421   }
 422   for (unsigned i=0; i<up.size(); i++) {
 423     if (up[i] != CRUSH_ITEM_NONE)
 424       new_peers.insert(up[i]);
 425   }
 426   for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
 427     new_peers.insert(p->first.osd);
 428   }
 429   pl->update_heartbeat_peers(std::move(new_peers));
 430 }
 431
 432 void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
 433 {
 434   pl->prepare_write(
 435     info,
 436     last_written_info,
 437     past_intervals,
 438     pg_log,
 439     dirty_info,
 440     dirty_big_info,
 441     last_persisted_osdmap < get_osdmap_epoch(),
 442     t);
 443   if (dirty_info || dirty_big_info) {
 444     last_persisted_osdmap = get_osdmap_epoch();
 445     last_written_info = info;
 446     dirty_info = false;
 447     dirty_big_info = false;
 448   }
 449 }
 450
 451 void PeeringState::advance_map(
 452   OSDMapRef osdmap, OSDMapRef lastmap,
 453   vector<int>& newup, int up_primary,
 454   vector<int>& newacting, int acting_primary,
 455   PeeringCtx &rctx)
 456 {
 457   ceph_assert(lastmap == osdmap_ref);
 458   psdout(10) << "handle_advance_map "
 459             << newup << "/" << newacting
 460             << " -- " << up_primary << "/" << acting_primary
 461             << dendl;
 462
 463   update_osdmap_ref(osdmap);
 464   pool.update(osdmap);
 465
 466   AdvMap evt(
 467     osdmap, lastmap, newup, up_primary,
 468     newacting, acting_primary);
 469   handle_event(evt, &rctx);
 470   if (pool.info.last_change == osdmap_ref->get_epoch()) {
 471     pl->on_pool_change();
 472   }
 473   readable_interval = pool.get_readable_interval(cct->_conf);
 474   last_require_osd_release = osdmap->require_osd_release;
 475 }
 476
 477 void PeeringState::activate_map(PeeringCtx &rctx)
 478 {
 479   psdout(10) << __func__ << dendl;
 480   ActMap evt;
 481   handle_event(evt, &rctx);
 482   if (osdmap_ref->get_epoch() - last_persisted_osdmap >
 483     cct->_conf->osd_pg_epoch_persisted_max_stale) {
 484     psdout(20) << __func__ << ": Dirtying info: last_persisted is "
 485               << last_persisted_osdmap
 486               << " while current is " << osdmap_ref->get_epoch() << dendl;
 487     dirty_info = true;
 488   } else {
 489     psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
 490               << last_persisted_osdmap
 491               << " while current is " << osdmap_ref->get_epoch() << dendl;
 492   }
 493   write_if_dirty(rctx.transaction);
 494
 495   if (get_osdmap()->check_new_blocklist_entries()) {
 496     pl->check_blocklisted_watchers();
 497   }
 498 }
 499
 500 void PeeringState::set_last_peering_reset()
 501 {
 502   psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
 503   if (last_peering_reset != get_osdmap_epoch()) {
 504     last_peering_reset = get_osdmap_epoch();
 505     psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
 506     clear_blocked_outgoing();
 507     if (!pl->try_flush_or_schedule_async()) {
 508       psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
 509       begin_block_outgoing();
 510     } else {
 511       psdout(10) << "Not blocking outgoing recovery messages" << dendl;
 512     }
 513   }
 514 }
 515
 516 void PeeringState::complete_flush()
 517 {
 518   flushes_in_progress--;
 519   if (flushes_in_progress == 0) {
 520     pl->on_flushed();
 521   }
 522 }
 523
 524 void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
 525 {
 526   const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
 527   if (!pi) {
 528     return; // pool deleted
 529   }
 530   bool changed = false;
 531   if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
 532     const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
 533     if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
 534       psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
 535       changed = true;
 536     }
 537   }
 538   if (changed) {
 539     info.history.last_epoch_marked_full = osdmap->get_epoch();
 540     dirty_info = true;
 541   }
 542 }
 543
 544 bool PeeringState::should_restart_peering(
 545   int newupprimary,
 546   int newactingprimary,
 547   const vector<int>& newup,
 548   const vector<int>& newacting,
 549   OSDMapRef lastmap,
 550   OSDMapRef osdmap)
 551 {
 552   if (PastIntervals::is_new_interval(
 553         primary.osd,
 554         newactingprimary,
 555         acting,
 556         newacting,
 557         up_primary.osd,
 558         newupprimary,
 559         up,
 560         newup,
 561         osdmap.get(),
 562         lastmap.get(),
 563         info.pgid.pgid)) {
 564     psdout(20) << "new interval newup " << newup
 565                << " newacting " << newacting << dendl;
 566     return true;
 567   }
 568   if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
 569     psdout(10) << __func__ << " osd transitioned from down -> up"
 570                << dendl;
 571     return true;
 572   }
 573   return false;
 574 }
 575
 576 /* Called before initializing peering during advance_map */
 577 void PeeringState::start_peering_interval(
 578   const OSDMapRef lastmap,
 579   const vector<int>& newup, int new_up_primary,
 580   const vector<int>& newacting, int new_acting_primary,
 581   ObjectStore::Transaction &t)
 582 {
 583   const OSDMapRef osdmap = get_osdmap();
 584
 585   set_last_peering_reset();
 586
 587   vector<int> oldacting, oldup;
 588   int oldrole = get_role();
 589
 590   if (is_primary()) {
 591     pl->clear_ready_to_merge();
 592   }
 593
 594
 595   pg_shard_t old_acting_primary = get_primary();
 596   pg_shard_t old_up_primary = up_primary;
 597   bool was_old_primary = is_primary();
 598   bool was_old_nonprimary = is_nonprimary();
 599
 600   acting.swap(oldacting);
 601   up.swap(oldup);
 602   init_primary_up_acting(
 603     newup,
 604     newacting,
 605     new_up_primary,
 606     new_acting_primary);
 607
 608   if (info.stats.up != up ||
 609       info.stats.acting != acting ||
 610       info.stats.up_primary != new_up_primary ||
 611       info.stats.acting_primary != new_acting_primary) {
 612     info.stats.up = up;
 613     info.stats.up_primary = new_up_primary;
 614     info.stats.acting = acting;
 615     info.stats.acting_primary = new_acting_primary;
 616     info.stats.mapping_epoch = osdmap->get_epoch();
 617   }
 618
 619   pl->clear_publish_stats();
 620
 621   // This will now be remapped during a backfill in cases
 622   // that it would not have been before.
 623   if (up != acting)
 624     state_set(PG_STATE_REMAPPED);
 625   else
 626     state_clear(PG_STATE_REMAPPED);
 627
 628   int role = osdmap->calc_pg_role(pg_whoami, acting);
 629   set_role(role);
 630
 631   // did acting, up, primary|acker change?
 632   if (!lastmap) {
 633     psdout(10) << " no lastmap" << dendl;
 634     dirty_info = true;
 635     dirty_big_info = true;
 636     info.history.same_interval_since = osdmap->get_epoch();
 637   } else {
 638     std::stringstream debug;
 639     ceph_assert(info.history.same_interval_since != 0);
 640     bool new_interval = PastIntervals::check_new_interval(
 641       old_acting_primary.osd,
 642       new_acting_primary,
 643       oldacting, newacting,
 644       old_up_primary.osd,
 645       new_up_primary,
 646       oldup, newup,
 647       info.history.same_interval_since,
 648       info.history.last_epoch_clean,
 649       osdmap.get(),
 650       lastmap.get(),
 651       info.pgid.pgid,
 652       missing_loc.get_recoverable_predicate(),
 653       &past_intervals,
 654       &debug);
 655     psdout(10) << __func__ << ": check_new_interval output: "
 656                << debug.str() << dendl;
 657     if (new_interval) {
 658       if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
 659           info.history.last_epoch_clean < osdmap->get_epoch()) {
 660         psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
 661         // our information is incomplete and useless; someone else was clean
 662         // after everything we know if osdmaps were trimmed.
 663         past_intervals.clear();
 664       } else {
 665         psdout(10) << " noting past " << past_intervals << dendl;
 666       }
 667       dirty_info = true;
 668       dirty_big_info = true;
 669       info.history.same_interval_since = osdmap->get_epoch();
 670       if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
 671           info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
 672                                   osdmap->get_pg_num(info.pgid.pgid.pool()),
 673                                   nullptr)) {
 674         info.history.last_epoch_split = osdmap->get_epoch();
 675       }
 676     }
 677   }
 678
 679   if (old_up_primary != up_primary ||
 680       oldup != up) {
 681     info.history.same_up_since = osdmap->get_epoch();
 682   }
 683   // this comparison includes primary rank via pg_shard_t
 684   if (old_acting_primary != get_primary()) {
 685     info.history.same_primary_since = osdmap->get_epoch();
 686   }
 687
 688   on_new_interval();
 689   pl->on_info_history_change();
 690
 691   psdout(1) << __func__ << " up " << oldup << " -> " << up
 692             << ", acting " << oldacting << " -> " << acting
 693             << ", acting_primary " << old_acting_primary << " -> "
 694             << new_acting_primary
 695             << ", up_primary " << old_up_primary << " -> " << new_up_primary
 696             << ", role " << oldrole << " -> " << role
 697             << ", features acting " << acting_features
 698             << " upacting " << upacting_features
 699             << dendl;
 700
 701   // deactivate.
 702   state_clear(PG_STATE_ACTIVE);
 703   state_clear(PG_STATE_PEERED);
 704   state_clear(PG_STATE_PREMERGE);
 705   state_clear(PG_STATE_DOWN);
 706   state_clear(PG_STATE_RECOVERY_WAIT);
 707   state_clear(PG_STATE_RECOVERY_TOOFULL);
 708   state_clear(PG_STATE_RECOVERING);
 709
 710   peer_purged.clear();
 711   acting_recovery_backfill.clear();
 712
 713   // reset primary/replica state?
 714   if (was_old_primary || is_primary()) {
 715     pl->clear_want_pg_temp();
 716   } else if (was_old_nonprimary || is_nonprimary()) {
 717     pl->clear_want_pg_temp();
 718   }
 719   clear_primary_state();
 720
 721   pl->on_change(t);
 722
 723   ceph_assert(!deleting);
 724
 725   // should we tell the primary we are here?
 726   send_notify = !is_primary();
 727
 728   if (role != oldrole ||
 729       was_old_primary != is_primary()) {
 730     // did primary change?
 731     if (was_old_primary != is_primary()) {
 732       state_clear(PG_STATE_CLEAN);
 733     }
 734
 735     pl->on_role_change();
 736   } else {
 737     // no role change.
 738     // did primary change?
 739     if (get_primary() != old_acting_primary) {
 740       psdout(10) << oldacting << " -> " << acting
 741                << ", acting primary "
 742                << old_acting_primary << " -> " << get_primary()
 743                << dendl;
 744     } else {
 745       // primary is the same.
 746       if (is_primary()) {
 747         // i am (still) primary. but my replica set changed.
 748         state_clear(PG_STATE_CLEAN);
 749
 750         psdout(10) << oldacting << " -> " << acting
 751                  << ", replicas changed" << dendl;
 752       }
 753     }
 754   }
 755
 756   if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
 757     psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
 758     pl->queue_want_pg_temp(acting);
 759   }
 760 }
 761
 762 void PeeringState::on_new_interval()
 763 {
 764   dout(20) << __func__ << dendl;
 765   const OSDMapRef osdmap = get_osdmap();
 766
 767   // initialize features
 768   acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
 769   upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
 770   for (auto p = acting.begin(); p != acting.end(); ++p) {
 771     if (*p == CRUSH_ITEM_NONE)
 772       continue;
 773     uint64_t f = osdmap->get_xinfo(*p).features;
 774     acting_features &= f;
 775     upacting_features &= f;
 776   }
 777   for (auto p = up.begin(); p != up.end(); ++p) {
 778     if (*p == CRUSH_ITEM_NONE)
 779       continue;
 780     upacting_features &= osdmap->get_xinfo(*p).features;
 781   }
 782   psdout(20) << __func__ << " upacting_features 0x" << std::hex
 783              << upacting_features << std::dec
 784              << " from " << acting << "+" << up << dendl;
 785
 786   psdout(20) << __func__ << " checking missing set deletes flag. missing = "
 787              << get_pg_log().get_missing() << dendl;
 788
 789   if (!pg_log.get_missing().may_include_deletes &&
 790       !perform_deletes_during_peering()) {
 791     pl->rebuild_missing_set_with_deletes(pg_log);
 792   }
 793   ceph_assert(
 794     pg_log.get_missing().may_include_deletes ==
 795     !perform_deletes_during_peering());
 796
 797   init_hb_stamps();
 798
 799   // update lease bounds for a new interval
 800   auto mnow = pl->get_mnow();
 801   prior_readable_until_ub = std::max(prior_readable_until_ub,
 802                                      readable_until_ub);
 803   prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
 804     mnow, prior_readable_until_ub);
 805   psdout(10) << __func__ << " prior_readable_until_ub "
 806              << prior_readable_until_ub << " (mnow " << mnow << " + "
 807              << info.history.prior_readable_until_ub << ")" << dendl;
 808   prior_readable_down_osds.clear(); // we populate this when we build the priorset
 809
 810   readable_until =
 811     readable_until_ub =
 812     readable_until_ub_sent =
 813     readable_until_ub_from_primary = ceph::signedspan::zero();
 814
 815   acting_readable_until_ub.clear();
 816   if (is_primary()) {
 817     acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
 818   }
 819
 820   pl->on_new_interval();
 821 }
 822
 823 void PeeringState::init_primary_up_acting(
 824   const vector<int> &newup,
 825   const vector<int> &newacting,
 826   int new_up_primary,
 827   int new_acting_primary)
 828 {
 829   actingset.clear();
 830   acting = newacting;
 831   for (uint8_t i = 0; i < acting.size(); ++i) {
 832     if (acting[i] != CRUSH_ITEM_NONE)
 833       actingset.insert(
 834         pg_shard_t(
 835           acting[i],
 836           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
 837   }
 838   upset.clear();
 839   up = newup;
 840   for (uint8_t i = 0; i < up.size(); ++i) {
 841     if (up[i] != CRUSH_ITEM_NONE)
 842       upset.insert(
 843         pg_shard_t(
 844           up[i],
 845           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
 846   }
 847   if (!pool.info.is_erasure()) {
 848     // replicated
 849     up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
 850     primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
 851   } else {
 852     // erasure
 853     up_primary = pg_shard_t();
 854     primary = pg_shard_t();
 855     for (uint8_t i = 0; i < up.size(); ++i) {
 856       if (up[i] == new_up_primary) {
 857         up_primary = pg_shard_t(up[i], shard_id_t(i));
 858         break;
 859       }
 860     }
 861     for (uint8_t i = 0; i < acting.size(); ++i) {
 862       if (acting[i] == new_acting_primary) {
 863         primary = pg_shard_t(acting[i], shard_id_t(i));
 864         break;
 865       }
 866     }
 867     ceph_assert(up_primary.osd == new_up_primary);
 868     ceph_assert(primary.osd == new_acting_primary);
 869   }
 870 }
 871
 872 void PeeringState::init_hb_stamps()
 873 {
 874   if (is_primary()) {
 875     // we care about all other osds in the acting set
 876     hb_stamps.resize(acting.size() - 1);
 877     unsigned i = 0;
 878     for (auto p : acting) {
 879       if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
 880         continue;
 881       }
 882       hb_stamps[i++] = pl->get_hb_stamps(p);
 883     }
 884     hb_stamps.resize(i);
 885   } else if (is_nonprimary()) {
 886     // we care about just the primary
 887     hb_stamps.resize(1);
 888     hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
 889   } else {
 890     hb_stamps.clear();
 891   }
 892   dout(10) << __func__ << " now " << hb_stamps << dendl;
 893 }
 894
 895
 896 void PeeringState::clear_recovery_state()
 897 {
 898   async_recovery_targets.clear();
 899   backfill_targets.clear();
 900 }
 901
 902 void PeeringState::clear_primary_state()
 903 {
 904   psdout(10) << "clear_primary_state" << dendl;
 905
 906   // clear peering state
 907   stray_set.clear();
 908   peer_log_requested.clear();
 909   peer_missing_requested.clear();
 910   peer_info.clear();
 911   peer_bytes.clear();
 912   peer_missing.clear();
 913   peer_last_complete_ondisk.clear();
 914   peer_activated.clear();
 915   min_last_complete_ondisk = eversion_t();
 916   pg_trim_to = eversion_t();
 917   might_have_unfound.clear();
 918   need_up_thru = false;
 919   missing_loc.clear();
 920   pg_log.reset_recovery_pointers();
 921
 922   clear_recovery_state();
 923
 924   last_update_ondisk = eversion_t();
 925   missing_loc.clear();
 926   pl->clear_primary_state();
 927 }
 928
 929 /// return [start,end) bounds for required past_intervals
 930 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
 931   const pg_info_t &info,
 932   epoch_t oldest_map) {
 933   epoch_t start = std::max(
 934     info.history.last_epoch_clean ? info.history.last_epoch_clean :
 935     info.history.epoch_pool_created,
 936     oldest_map);
 937   epoch_t end = std::max(
 938     info.history.same_interval_since,
 939     info.history.epoch_pool_created);
 940   return make_pair(start, end);
 941 }
 942
 943
 944 void PeeringState::check_past_interval_bounds() const
 945 {
 946   auto oldest_epoch = pl->oldest_stored_osdmap();
 947   auto rpib = get_required_past_interval_bounds(
 948     info,
 949     oldest_epoch);
 950   if (rpib.first >= rpib.second) {
 951     // do not warn if the start bound is dictated by oldest_map; the
 952     // past intervals are presumably appropriate given the pg info.
 953     if (!past_intervals.empty() &&
 954         rpib.first > oldest_epoch) {
 955       pl->get_clog_error() << info.pgid << " required past_interval bounds are"
 956                              << " empty [" << rpib << ") but past_intervals is not: "
 957                              << past_intervals;
 958       derr << info.pgid << " required past_interval bounds are"
 959            << " empty [" << rpib << ") but past_intervals is not: "
 960            << past_intervals << dendl;
 961     }
 962   } else {
 963     if (past_intervals.empty()) {
 964       pl->get_clog_error() << info.pgid << " required past_interval bounds are"
 965                              << " not empty [" << rpib << ") but past_intervals "
 966                              << past_intervals << " is empty";
 967       derr << info.pgid << " required past_interval bounds are"
 968            << " not empty [" << rpib << ") but past_intervals "
 969            << past_intervals << " is empty" << dendl;
 970       ceph_assert(!past_intervals.empty());
 971     }
 972
 973     auto apib = past_intervals.get_bounds();
 974     if (apib.first > rpib.first) {
 975       pl->get_clog_error() << info.pgid << " past_intervals [" << apib
 976                              << ") start interval does not contain the required"
 977                              << " bound [" << rpib << ") start";
 978       derr << info.pgid << " past_intervals [" << apib
 979            << ") start interval does not contain the required"
 980            << " bound [" << rpib << ") start" << dendl;
 981       ceph_abort_msg("past_interval start interval mismatch");
 982     }
 983     if (apib.second != rpib.second) {
 984       pl->get_clog_error() << info.pgid << " past_interal bound [" << apib
 985                              << ") end does not match required [" << rpib
 986                              << ") end";
 987       derr << info.pgid << " past_interal bound [" << apib
 988            << ") end does not match required [" << rpib
 989            << ") end" << dendl;
 990       ceph_abort_msg("past_interval end mismatch");
 991     }
 992   }
 993 }
 994
 995 int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
 996 {
 997   static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
 998   static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
 999
1000   ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
1001
1002   // User can't set this too high anymore, but might be a legacy value
1003   if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
1004     pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
1005   if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
1006     pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
1007   // Shift range from min to max to 0 to max - min
1008   pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
1009   ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
1010
1011   priority += pool_recovery_priority;
1012
1013   // Clamp to valid range
1014   if (priority > max) {
1015     return max;
1016   } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
1017     return OSD_RECOVERY_PRIORITY_MIN;
1018   } else {
1019     return priority;
1020   }
1021 }
1022
1023 unsigned PeeringState::get_recovery_priority()
1024 {
1025   // a higher value -> a higher priority
1026   int ret = OSD_RECOVERY_PRIORITY_BASE;
1027   int base = ret;
1028
1029   if (state & PG_STATE_FORCED_RECOVERY) {
1030     ret = OSD_RECOVERY_PRIORITY_FORCED;
1031   } else {
1032     // XXX: This priority boost isn't so much about inactive, but about data-at-risk
1033     if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
1034       base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
1035       // inactive: no. of replicas < min_size, highest priority since it blocks IO
1036       ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
1037     }
1038
1039     int64_t pool_recovery_priority = 0;
1040     pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
1041
1042     ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
1043   }
1044   psdout(20) << __func__ << " recovery priority is " << ret << dendl;
1045   return static_cast<unsigned>(ret);
1046 }
1047
1048 unsigned PeeringState::get_backfill_priority()
1049 {
1050   // a higher value -> a higher priority
1051   int ret = OSD_BACKFILL_PRIORITY_BASE;
1052   int base = ret;
1053
1054   if (state & PG_STATE_FORCED_BACKFILL) {
1055     ret = OSD_BACKFILL_PRIORITY_FORCED;
1056   } else {
1057     if (actingset.size() < pool.info.min_size) {
1058       base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
1059       // inactive: no. of replicas < min_size, highest priority since it blocks IO
1060       ret = base + (pool.info.min_size - actingset.size());
1061
1062     } else if (is_undersized()) {
1063       // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
1064       ceph_assert(pool.info.size > actingset.size());
1065       base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1066       ret = base + (pool.info.size - actingset.size());
1067
1068     } else if (is_degraded()) {
1069       // degraded: baseline degraded
1070       base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
1071     }
1072
1073     // Adjust with pool's recovery priority
1074     int64_t pool_recovery_priority = 0;
1075     pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
1076
1077     ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
1078   }
1079
1080   psdout(20) << __func__ << " backfill priority is " << ret << dendl;
1081   return static_cast<unsigned>(ret);
1082 }
1083
1084 unsigned PeeringState::get_delete_priority()
1085 {
1086   auto state = get_osdmap()->get_state(pg_whoami.osd);
1087   if (state & (CEPH_OSD_BACKFILLFULL |
1088                CEPH_OSD_FULL)) {
1089     return OSD_DELETE_PRIORITY_FULL;
1090   } else if (state & CEPH_OSD_NEARFULL) {
1091     return OSD_DELETE_PRIORITY_FULLISH;
1092   } else {
1093     return OSD_DELETE_PRIORITY_NORMAL;
1094   }
1095 }
1096
1097 bool PeeringState::set_force_recovery(bool b)
1098 {
1099   bool did = false;
1100   if (b) {
1101     if (!(state & PG_STATE_FORCED_RECOVERY) &&
1102         (state & (PG_STATE_DEGRADED |
1103                   PG_STATE_RECOVERY_WAIT |
1104                   PG_STATE_RECOVERING))) {
1105       psdout(20) << __func__ << " set" << dendl;
1106       state_set(PG_STATE_FORCED_RECOVERY);
1107       pl->publish_stats_to_osd();
1108       did = true;
1109     }
1110   } else if (state & PG_STATE_FORCED_RECOVERY) {
1111     psdout(20) << __func__ << " clear" << dendl;
1112     state_clear(PG_STATE_FORCED_RECOVERY);
1113     pl->publish_stats_to_osd();
1114     did = true;
1115   }
1116   if (did) {
1117     psdout(20) << __func__ << " state " << get_current_state()
1118              << dendl;
1119     pl->update_local_background_io_priority(get_recovery_priority());
1120   }
1121   return did;
1122 }
1123
1124 bool PeeringState::set_force_backfill(bool b)
1125 {
1126   bool did = false;
1127   if (b) {
1128     if (!(state & PG_STATE_FORCED_BACKFILL) &&
1129         (state & (PG_STATE_DEGRADED |
1130                   PG_STATE_BACKFILL_WAIT |
1131                   PG_STATE_BACKFILLING))) {
1132       psdout(10) << __func__ << " set" << dendl;
1133       state_set(PG_STATE_FORCED_BACKFILL);
1134       pl->publish_stats_to_osd();
1135       did = true;
1136     }
1137   } else if (state & PG_STATE_FORCED_BACKFILL) {
1138     psdout(10) << __func__ << " clear" << dendl;
1139     state_clear(PG_STATE_FORCED_BACKFILL);
1140     pl->publish_stats_to_osd();
1141     did = true;
1142   }
1143   if (did) {
1144     psdout(20) << __func__ << " state " << get_current_state()
1145              << dendl;
1146     pl->update_local_background_io_priority(get_backfill_priority());
1147   }
1148   return did;
1149 }
1150
1151 void PeeringState::schedule_renew_lease()
1152 {
1153   pl->schedule_renew_lease(
1154     last_peering_reset,
1155     readable_interval / 2);
1156 }
1157
1158 void PeeringState::send_lease()
1159 {
1160   epoch_t epoch = pl->get_osdmap_epoch();
1161   for (auto peer : actingset) {
1162     if (peer == pg_whoami) {
1163       continue;
1164     }
1165     pl->send_cluster_message(
1166       peer.osd,
1167       make_message<MOSDPGLease>(epoch,
1168                       spg_t(spgid.pgid, peer.shard),
1169                       get_lease()),
1170       epoch);
1171   }
1172 }
1173
1174 void PeeringState::proc_lease(const pg_lease_t& l)
1175 {
1176   if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1177     psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
1178                << upacting_features << std::dec
1179                << " does not include SERVER_OCTOPUS" << dendl;
1180     return;
1181   }
1182   if (!is_nonprimary()) {
1183     psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
1184     return;
1185   }
1186   psdout(10) << __func__ << " " << l << dendl;
1187   if (l.readable_until_ub > readable_until_ub_from_primary) {
1188     readable_until_ub_from_primary = l.readable_until_ub;
1189   }
1190
1191   ceph::signedspan ru = ceph::signedspan::zero();
1192   if (l.readable_until != ceph::signedspan::zero() &&
1193       hb_stamps[0]->peer_clock_delta_ub) {
1194     ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
1195     psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
1196                << " -> ru " << ru << dendl;
1197   }
1198   if (ru > readable_until) {
1199     readable_until = ru;
1200     psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
1201     // NOTE: if we ever decide to block/queue ops on the replica,
1202     // we'll need to wake them up here.
1203   }
1204
1205   ceph::signedspan ruub;
1206   if (hb_stamps[0]->peer_clock_delta_lb) {
1207     ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
1208     psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
1209                << " -> ruub " << ruub << dendl;
1210   } else {
1211     ruub = pl->get_mnow() + l.interval;
1212     psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
1213   }
1214   if (ruub > readable_until_ub) {
1215     readable_until_ub = ruub;
1216     psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
1217                << dendl;
1218   }
1219 }
1220
1221 void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
1222 {
1223   if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1224     return;
1225   }
1226   auto now = pl->get_mnow();
1227   bool was_min = false;
1228   for (unsigned i = 0; i < acting.size(); ++i) {
1229     if (from == acting[i]) {
1230       // the lease_ack value is based on the primary's clock
1231       if (a.readable_until_ub > acting_readable_until_ub[i]) {
1232         if (acting_readable_until_ub[i] == readable_until) {
1233           was_min = true;
1234         }
1235         acting_readable_until_ub[i] = a.readable_until_ub;
1236         break;
1237       }
1238     }
1239   }
1240   if (was_min) {
1241     auto old_ru = readable_until;
1242     recalc_readable_until();
1243     if (now < old_ru) {
1244       pl->recheck_readable();
1245     }
1246   }
1247 }
1248
1249 void PeeringState::proc_renew_lease()
1250 {
1251   if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1252     return;
1253   }
1254   renew_lease(pl->get_mnow());
1255   send_lease();
1256   schedule_renew_lease();
1257 }
1258
1259 void PeeringState::recalc_readable_until()
1260 {
1261   assert(is_primary());
1262   ceph::signedspan min = readable_until_ub_sent;
1263   for (unsigned i = 0; i < acting.size(); ++i) {
1264     if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
1265       continue;
1266     }
1267     dout(20) << __func__ << " peer osd." << acting[i]
1268              << " ruub " << acting_readable_until_ub[i] << dendl;
1269     if (acting_readable_until_ub[i] < min) {
1270       min = acting_readable_until_ub[i];
1271     }
1272   }
1273   readable_until = min;
1274   readable_until_ub = min;
1275   dout(20) << __func__ << " readable_until[_ub] " << readable_until
1276            << " (sent " << readable_until_ub_sent << ")" << dendl;
1277 }
1278
1279 bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
1280 {
1281   if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
1282     return false;
1283   }
1284   bool changed = false;
1285   auto p = prior_readable_down_osds.begin();
1286   while (p != prior_readable_down_osds.end()) {
1287     if (map->is_dead(*p)) {
1288       dout(10) << __func__ << " prior_readable_down_osds osd." << *p
1289                << " is dead as of epoch " << map->get_epoch()
1290                << dendl;
1291       p = prior_readable_down_osds.erase(p);
1292       changed = true;
1293     } else {
1294       ++p;
1295     }
1296   }
1297   if (changed && prior_readable_down_osds.empty()) {
1298     psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
1299     clear_prior_readable_until_ub();
1300     return true;
1301   }
1302   return false;
1303 }
1304
1305 bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
1306 {
1307   epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
1308   if (need_up_thru &&
1309       up_thru >= info.history.same_interval_since) {
1310     psdout(10) << "adjust_need_up_thru now "
1311                << up_thru << ", need_up_thru now false" << dendl;
1312     need_up_thru = false;
1313     return true;
1314   }
1315   return false;
1316 }
1317
1318 PastIntervals::PriorSet PeeringState::build_prior()
1319 {
1320   if (1) {
1321     // sanity check
1322     for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
1323       ceph_assert(info.history.last_epoch_started >=
1324                   it->second.history.last_epoch_started);
1325     }
1326   }
1327
1328   const OSDMap &osdmap = *get_osdmap();
1329   PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1330     pool.info.is_erasure(),
1331     info.history.last_epoch_started,
1332     &missing_loc.get_recoverable_predicate(),
1333     [&](epoch_t start, int osd, epoch_t *lost_at) {
1334       const osd_info_t *pinfo = 0;
1335       if (osdmap.exists(osd)) {
1336         pinfo = &osdmap.get_info(osd);
1337         if (lost_at)
1338           *lost_at = pinfo->lost_at;
1339       }
1340
1341       if (osdmap.is_up(osd)) {
1342         return PastIntervals::UP;
1343       } else if (!pinfo) {
1344         return PastIntervals::DNE;
1345       } else if (pinfo->lost_at > start) {
1346         return PastIntervals::LOST;
1347       } else {
1348         return PastIntervals::DOWN;
1349       }
1350     },
1351     up,
1352     acting,
1353     dpp);
1354
1355   if (prior.pg_down) {
1356     state_set(PG_STATE_DOWN);
1357   }
1358
1359   if (get_osdmap()->get_up_thru(pg_whoami.osd) <
1360       info.history.same_interval_since) {
1361     psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1362                << " < same_since " << info.history.same_interval_since
1363                << ", must notify monitor" << dendl;
1364     need_up_thru = true;
1365   } else {
1366     psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
1367                << " >= same_since " << info.history.same_interval_since
1368                << ", all is well" << dendl;
1369     need_up_thru = false;
1370   }
1371   pl->set_probe_targets(prior.probe);
1372   return prior;
1373 }
1374
1375 bool PeeringState::needs_recovery() const
1376 {
1377   ceph_assert(is_primary());
1378
1379   auto &missing = pg_log.get_missing();
1380
1381   if (missing.num_missing()) {
1382     psdout(10) << __func__ << " primary has " << missing.num_missing()
1383                << " missing" << dendl;
1384     return true;
1385   }
1386
1387   ceph_assert(!acting_recovery_backfill.empty());
1388   for (const pg_shard_t& peer : acting_recovery_backfill) {
1389     if (peer == get_primary()) {
1390       continue;
1391     }
1392     auto pm = peer_missing.find(peer);
1393     if (pm == peer_missing.end()) {
1394       psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
1395                  << dendl;
1396       continue;
1397     }
1398     if (pm->second.num_missing()) {
1399       psdout(10) << __func__ << " osd." << peer << " has "
1400                  << pm->second.num_missing() << " missing" << dendl;
1401       return true;
1402     }
1403   }
1404
1405   psdout(10) << __func__ << " is recovered" << dendl;
1406   return false;
1407 }
1408
1409 bool PeeringState::needs_backfill() const
1410 {
1411   ceph_assert(is_primary());
1412
1413   // We can assume that only possible osds that need backfill
1414   // are on the backfill_targets vector nodes.
1415   for (const pg_shard_t& peer : backfill_targets) {
1416     auto pi = peer_info.find(peer);
1417     ceph_assert(pi != peer_info.end());
1418     if (!pi->second.last_backfill.is_max()) {
1419       psdout(10) << __func__ << " osd." << peer
1420                  << " has last_backfill " << pi->second.last_backfill << dendl;
1421       return true;
1422     }
1423   }
1424
1425   psdout(10) << __func__ << " does not need backfill" << dendl;
1426   return false;
1427 }
1428
1429 /*
1430  * Returns true unless there is a non-lost OSD in might_have_unfound.
1431  */
1432 bool PeeringState::all_unfound_are_queried_or_lost(
1433   const OSDMapRef osdmap) const
1434 {
1435   ceph_assert(is_primary());
1436
1437   auto peer = might_have_unfound.begin();
1438   auto mend = might_have_unfound.end();
1439   for (; peer != mend; ++peer) {
1440     if (peer_missing.count(*peer))
1441       continue;
1442     auto iter = peer_info.find(*peer);
1443     if (iter != peer_info.end() &&
1444         (iter->second.is_empty() || iter->second.dne()))
1445       continue;
1446     if (!osdmap->exists(peer->osd))
1447       continue;
1448     const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1449     if (osd_info.lost_at <= osd_info.up_from) {
1450       // If there is even one OSD in might_have_unfound that isn't lost, we
1451       // still might retrieve our unfound.
1452       return false;
1453     }
1454   }
1455   psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
1456              << might_have_unfound
1457              << " have been queried or are marked lost" << dendl;
1458   return true;
1459 }
1460
1461
1462 void PeeringState::reject_reservation()
1463 {
1464   pl->unreserve_recovery_space();
1465   pl->send_cluster_message(
1466     primary.osd,
1467     make_message<MBackfillReserve>(
1468       MBackfillReserve::REJECT_TOOFULL,
1469       spg_t(info.pgid.pgid, primary.shard),
1470       get_osdmap_epoch()),
1471     get_osdmap_epoch());
1472 }
1473
/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos, or infos.end() if no
 * entry qualifies.
 *
 * An entry is only considered if it is up or acting (when
 * restrict_to_up_acting is set), is not incomplete, has a
 * last_epoch_started at least as new as the maximum found, and a
 * last_update no older than the minimum acceptable one.
 *
 * Among the qualifying entries, ties are broken in this order:
 *  1) Prefer newer last_update (older for pools that require rollback)
 *  2) Prefer longer tail (smaller log_tail)
 *  3) Prefer an info without missing objects over one with missing
 *  4) Prefer current primary (pg_whoami)
 *
 * @param infos                  candidate infos keyed by shard
 * @param restrict_to_up_acting  skip shards that are neither up nor acting
 * @param history_les_bound      [out, required] set to true when the maximum
 *                               last_epoch_started was last raised from an
 *                               info's history, false when it was last raised
 *                               from a complete info's own value; left
 *                               untouched if it is never raised
 */
map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  ceph_assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  // Pass 1: find the newest last_epoch_started, looking at each info's
  // history (unless disabled by config) and at each complete info itself.
  epoch_t max_last_epoch_started_found = 0;
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      *history_les_bound = false;
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  // Pass 2: the smallest last_update among infos whose last_epoch_started
  // reaches that maximum is the oldest last_update we will accept.
  eversion_t min_last_update_acceptable = eversion_t::max();
  for (auto i = infos.begin(); i != infos.end(); ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  // still max(): no info reached the maximum last_epoch_started
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  auto best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (auto p = infos.begin(); p != infos.end(); ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    // first qualifying candidate becomes the initial best
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      // pools that require rollback: the smaller last_update wins instead
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    // Same last_update and log_tail: prefer the one without missing objects
    if (!p->second.has_missing() && best->second.has_missing()) {
      psdout(10) << __func__ << " prefer osd." << p->first
               << " because it is complete while best has missing"
               << dendl;
      best = p;
      continue;
    } else if (p->second.has_missing() && !best->second.has_missing()) {
      psdout(10) << __func__ << " skipping osd." << p->first
               << " because it has missing while best is complete"
               << dendl;
      continue;
    } else {
      // both are complete or have missing
      // fall through
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      psdout(10) << "calc_acting prefer osd." << p->first
                 << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}
1587
1588 void PeeringState::calc_ec_acting(
1589   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1590   unsigned size,
1591   const vector<int> &acting,
1592   const vector<int> &up,
1593   const map<pg_shard_t, pg_info_t> &all_info,
1594   bool restrict_to_up_acting,
1595   vector<int> *_want,
1596   set<pg_shard_t> *backfill,
1597   set<pg_shard_t> *acting_backfill,
1598   ostream &ss)
1599 {
1600   vector<int> want(size, CRUSH_ITEM_NONE);
1601   map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1602   for (auto i = all_info.begin();
1603        i != all_info.end();
1604        ++i) {
1605     all_info_by_shard[i->first.shard].insert(i->first);
1606   }
1607   for (uint8_t i = 0; i < want.size(); ++i) {
1608     ss << "For position " << (unsigned)i << ": ";
1609     if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1610         !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1611         all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1612         auth_log_shard->second.log_tail) {
1613       ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1614       want[i] = up[i];
1615       continue;
1616     }
1617     if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1618       ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1619          << " and ";
1620       backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1621     }
1622
1623     if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1624         !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1625         all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1626         auth_log_shard->second.log_tail) {
1627       ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1628       want[i] = acting[i];
1629     } else if (!restrict_to_up_acting) {
1630       for (auto j = all_info_by_shard[shard_id_t(i)].begin();
1631            j != all_info_by_shard[shard_id_t(i)].end();
1632            ++j) {
1633         ceph_assert(j->shard == i);
1634         if (!all_info.find(*j)->second.is_incomplete() &&
1635             all_info.find(*j)->second.last_update >=
1636             auth_log_shard->second.log_tail) {
1637           ss << " selecting stray: " << *j << std::endl;
1638           want[i] = j->osd;
1639           break;
1640         }
1641       }
1642       if (want[i] == CRUSH_ITEM_NONE)
1643         ss << " failed to fill position " << (int)i << std::endl;
1644     }
1645   }
1646
1647   for (uint8_t i = 0; i < want.size(); ++i) {
1648     if (want[i] != CRUSH_ITEM_NONE) {
1649       acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1650     }
1651   }
1652   acting_backfill->insert(backfill->begin(), backfill->end());
1653   _want->swap(want);
1654 }
1655
1656 std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
1657 PeeringState::select_replicated_primary(
1658   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1659   uint64_t force_auth_primary_missing_objects,
1660   const std::vector<int> &up,
1661   pg_shard_t up_primary,
1662   const map<pg_shard_t, pg_info_t> &all_info,
1663   const OSDMapRef osdmap,
1664   ostream &ss)
1665 {
1666   pg_shard_t auth_log_shard_id = auth_log_shard->first;
1667
1668   ss << __func__ << " newest update on osd." << auth_log_shard_id
1669      << " with " << auth_log_shard->second << std::endl;
1670
1671   // select primary
1672   auto primary = all_info.find(up_primary);
1673   if (up.size() &&
1674       !primary->second.is_incomplete() &&
1675       primary->second.last_update >=
1676         auth_log_shard->second.log_tail) {
1677     if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1678       auto approx_missing_objects =
1679         primary->second.stats.stats.sum.num_objects_missing;
1680       auto auth_version = auth_log_shard->second.last_update.version;
1681       auto primary_version = primary->second.last_update.version;
1682       if (auth_version > primary_version) {
1683         approx_missing_objects += auth_version - primary_version;
1684       } else {
1685         approx_missing_objects += primary_version - auth_version;
1686       }
1687       if ((uint64_t)approx_missing_objects >
1688           force_auth_primary_missing_objects) {
1689         primary = auth_log_shard;
1690         ss << "up_primary: " << up_primary << ") has approximate "
1691            << approx_missing_objects
1692            << "(>" << force_auth_primary_missing_objects <<") "
1693            << "missing objects, osd." << auth_log_shard_id
1694            << " selected as primary instead"
1695            << std::endl;
1696       } else {
1697         ss << "up_primary: " << up_primary << ") selected as primary"
1698            << std::endl;
1699       }
1700     } else {
1701       ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1702     }
1703   } else {
1704     ceph_assert(!auth_log_shard->second.is_incomplete());
1705     ss << "up[0] needs backfill, osd." << auth_log_shard_id
1706        << " selected as primary instead" << std::endl;
1707     primary = auth_log_shard;
1708   }
1709
1710   ss << __func__ << " primary is osd." << primary->first
1711      << " with " << primary->second << std::endl;
1712
1713   /* We include auth_log_shard->second.log_tail because in GetLog,
1714    * we will request logs back to the min last_update over our
1715    * acting_backfill set, which will result in our log being extended
1716    * as far backwards as necessary to pick up any peers which can
1717    * be log recovered by auth_log_shard's log */
1718   eversion_t oldest_auth_log_entry =
1719     std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1720
1721   return std::make_pair(primary, oldest_auth_log_entry);
1722 }
1723
1724
1725 /**
1726  * calculate the desired acting set.
1727  *
1728  * Choose an appropriate acting set.  Prefer up[0], unless it is
1729  * incomplete, or another osd has a longer tail that allows us to
1730  * bring other up nodes up to date.
1731  */
1732 void PeeringState::calc_replicated_acting(
1733   map<pg_shard_t, pg_info_t>::const_iterator primary,
1734   eversion_t oldest_auth_log_entry,
1735   unsigned size,
1736   const vector<int> &acting,
1737   const vector<int> &up,
1738   pg_shard_t up_primary,
1739   const map<pg_shard_t, pg_info_t> &all_info,
1740   bool restrict_to_up_acting,
1741   vector<int> *want,
1742   set<pg_shard_t> *backfill,
1743   set<pg_shard_t> *acting_backfill,
1744   const OSDMapRef osdmap,
1745   const PGPool& pool,
1746   ostream &ss)
1747 {
1748   ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1749      << std::endl;
1750
1751   want->push_back(primary->first.osd);
1752   acting_backfill->insert(primary->first);
1753
1754   // select replicas that have log contiguity with primary.
1755   // prefer up, then acting, then any peer_info osds
1756   for (auto i : up) {
1757     pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1758     if (up_cand == primary->first)
1759       continue;
1760     const pg_info_t &cur_info = all_info.find(up_cand)->second;
1761     if (cur_info.is_incomplete() ||
1762         cur_info.last_update < oldest_auth_log_entry) {
1763       ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1764       backfill->insert(up_cand);
1765       acting_backfill->insert(up_cand);
1766     } else {
1767       want->push_back(i);
1768       acting_backfill->insert(up_cand);
1769       ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1770     }
1771   }
1772
1773   if (want->size() >= size) {
1774     return;
1775   }
1776
1777   std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1778   candidate_by_last_update.reserve(acting.size());
1779   // This no longer has backfill OSDs, but they are covered above.
1780   for (auto i : acting) {
1781     pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1782     // skip up osds we already considered above
1783     if (acting_cand == primary->first)
1784       continue;
1785     auto up_it = find(up.begin(), up.end(), i);
1786     if (up_it != up.end())
1787       continue;
1788
1789     const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1790     if (cur_info.is_incomplete() ||
1791         cur_info.last_update < oldest_auth_log_entry) {
1792       ss << " shard " << acting_cand << " (acting) REJECTED "
1793          << cur_info << std::endl;
1794     } else {
1795       candidate_by_last_update.emplace_back(cur_info.last_update, i);
1796     }
1797   }
1798
1799   auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1800                             const std::pair<eversion_t, int> &rhs) {
1801     return lhs.first > rhs.first;
1802   };
1803   // sort by last_update, in descending order.
1804   std::sort(candidate_by_last_update.begin(),
1805             candidate_by_last_update.end(), sort_by_eversion);
1806   for (auto &p: candidate_by_last_update) {
1807     ceph_assert(want->size() < size);
1808     want->push_back(p.second);
1809     pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1810     acting_backfill->insert(s);
1811     ss << " shard " << s << " (acting) accepted "
1812        << all_info.find(s)->second << std::endl;
1813     if (want->size() >= size) {
1814       return;
1815     }
1816   }
1817
1818   if (restrict_to_up_acting) {
1819     return;
1820   }
1821   candidate_by_last_update.clear();
1822   candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1823   // continue to search stray to find more suitable peers
1824   for (auto &i : all_info) {
1825     // skip up osds we already considered above
1826     if (i.first == primary->first)
1827       continue;
1828     auto up_it = find(up.begin(), up.end(), i.first.osd);
1829     if (up_it != up.end())
1830       continue;
1831     auto acting_it = find(
1832       acting.begin(), acting.end(), i.first.osd);
1833     if (acting_it != acting.end())
1834       continue;
1835
1836     if (i.second.is_incomplete() ||
1837         i.second.last_update < oldest_auth_log_entry) {
1838       ss << " shard " << i.first << " (stray) REJECTED " << i.second
1839          << std::endl;
1840     } else {
1841       candidate_by_last_update.emplace_back(
1842         i.second.last_update, i.first.osd);
1843     }
1844   }
1845
1846   if (candidate_by_last_update.empty()) {
1847     // save us some effort
1848     return;
1849   }
1850
1851   // sort by last_update, in descending order.
1852   std::sort(candidate_by_last_update.begin(),
1853             candidate_by_last_update.end(), sort_by_eversion);
1854
1855   for (auto &p: candidate_by_last_update) {
1856     ceph_assert(want->size() < size);
1857     want->push_back(p.second);
1858     pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1859     acting_backfill->insert(s);
1860     ss << " shard " << s << " (stray) accepted "
1861        << all_info.find(s)->second << std::endl;
1862     if (want->size() >= size) {
1863       return;
1864     }
1865   }
1866 }
1867
1868 // Defines osd preference order: acting set, then larger last_update
1869 using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
1870 using osd_id_t = int;
1871
1872 class bucket_candidates_t {
1873   std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
1874   int selected = 0;
1875
1876 public:
1877   void add_osd(osd_ord_t ord, osd_id_t osd) {
1878     // osds will be added in smallest to largest order
1879     assert(osds.empty() || osds.back().first <= ord);
1880     osds.push_back(std::make_pair(ord, osd));
1881   }
1882   osd_id_t pop_osd() {
1883     ceph_assert(!is_empty());
1884     auto ret = osds.back();
1885     osds.pop_back();
1886     return ret.second;
1887   }
1888
1889   void inc_selected() { selected++; }
1890   unsigned get_num_selected() const { return selected; }
1891
1892   osd_ord_t get_ord() const {
1893     return osds.empty() ? std::make_tuple(false, eversion_t())
1894       : osds.back().first;
1895   }
1896
1897   bool is_empty() const { return osds.empty(); }
1898
1899   bool operator<(const bucket_candidates_t &rhs) const {
1900     return std::make_tuple(-selected, get_ord()) <
1901       std::make_tuple(-rhs.selected, rhs.get_ord());
1902   }
1903
1904   friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
1905 };
1906
1907 std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
1908 {
1909   return lhs << "candidates[" << cand.osds << "]";
1910 }
1911
1912 class bucket_heap_t {
1913   using elem_t = std::reference_wrapper<bucket_candidates_t>;
1914   std::vector<elem_t> heap;
1915
1916   // Max heap -- should emit buckets in order of preference
1917   struct comp {
1918     bool operator()(const elem_t &lhs, const elem_t &rhs) {
1919       return lhs.get() < rhs.get();
1920     }
1921   };
1922 public:
1923   void push_if_nonempty(elem_t e) {
1924     if (!e.get().is_empty()) {
1925       heap.push_back(e);
1926       std::push_heap(heap.begin(), heap.end(), comp());
1927     }
1928   }
1929   elem_t pop() {
1930     std::pop_heap(heap.begin(), heap.end(), comp());
1931     auto ret = heap.back();
1932     heap.pop_back();
1933     return ret;
1934   }
1935
1936   bool is_empty() const { return heap.empty(); }
1937 };
1938
1939 /**
1940  * calc_replicated_acting_stretch
1941  *
1942  * Choose an acting set using as much of the up set as possible; filling
1943  * in the remaining slots so as to maximize the number of crush buckets at
1944  * level pool.info.peering_crush_bucket_barrier represented.
1945  *
1946  * Stretch clusters are a bit special: while they have a "size" the
1947  * same way as normal pools, if we happen to lose a data center
1948  * (we call it a "stretch bucket", but really it'll be a data center or
1949  * a cloud availability zone), we don't actually want to shove
1950  * 2 DC's worth of replication into a single site -- it won't fit!
1951  * So we locally calculate a bucket_max, based
1952  * on the targeted number of stretch buckets for the pool and
1953  * its size. Then we won't pull more than bucket_max from any
1954  * given ancestor even if it leaves us undersized.
1955
1956  * There are two distinct phases: (commented below)
1957  */
1958 void PeeringState::calc_replicated_acting_stretch(
1959   map<pg_shard_t, pg_info_t>::const_iterator primary,
1960   eversion_t oldest_auth_log_entry,
1961   unsigned size,
1962   const vector<int> &acting,
1963   const vector<int> &up,
1964   pg_shard_t up_primary,
1965   const map<pg_shard_t, pg_info_t> &all_info,
1966   bool restrict_to_up_acting,
1967   vector<int> *want,
1968   set<pg_shard_t> *backfill,
1969   set<pg_shard_t> *acting_backfill,
1970   const OSDMapRef osdmap,
1971   const PGPool& pool,
1972   ostream &ss)
1973 {
1974   ceph_assert(want);
1975   ceph_assert(acting_backfill);
1976   ceph_assert(backfill);
1977   ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
1978      << std::endl;
1979
1980   auto used = [want](int osd) {
1981     return std::find(want->begin(), want->end(), osd) != want->end();
1982   };
1983
1984   auto usable_info = [&](const auto &cur_info) mutable {
1985     return !(cur_info.is_incomplete() ||
1986              cur_info.last_update < oldest_auth_log_entry);
1987   };
1988
1989   auto osd_info = [&](int osd) mutable -> const pg_info_t & {
1990     pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
1991     const pg_info_t &cur_info = all_info.find(cand)->second;
1992     return cur_info;
1993   };
1994
1995   auto usable_osd = [&](int osd) mutable {
1996     return usable_info(osd_info(osd));
1997   };
1998
1999   std::map<int, bucket_candidates_t> ancestors;
2000   auto get_ancestor = [&](int osd) mutable {
2001     int ancestor = osdmap->crush->get_parent_of_type(
2002       osd,
2003       pool.info.peering_crush_bucket_barrier,
2004       pool.info.crush_rule);
2005     return &ancestors[ancestor];
2006   };
2007
2008   unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
2009   if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
2010     ++bucket_max;
2011   }
2012
2013   /* 1) Select all usable osds from the up set as well as the primary
2014    *
2015    * We also stash any unusable osds from up into backfill.
2016    */
2017   auto add_required = [&](int osd) {
2018     if (!used(osd)) {
2019       want->push_back(osd);
2020       acting_backfill->insert(
2021         pg_shard_t(osd, shard_id_t::NO_SHARD));
2022       get_ancestor(osd)->inc_selected();
2023     }
2024   };
2025   add_required(primary->first.osd);
2026   ss << " osd " << primary->first.osd << " primary accepted "
2027      << osd_info(primary->first.osd) << std::endl;
2028   for (auto upcand: up) {
2029     auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
2030     auto &curinfo = osd_info(upcand);
2031     if (usable_osd(upcand)) {
2032       ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
2033       add_required(upcand);
2034     } else {
2035       ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
2036       backfill->insert(upshard);
2037       acting_backfill->insert(upshard);
2038     }
2039   }
2040
2041   if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
2042     ss << " up set sufficient" << std::endl;
2043     return;
2044   }
2045   ss << " up set insufficient, considering remaining osds" << std::endl;
2046
2047   /* 2) Fill out remaining slots from usable osds in all_info
2048    *    while maximizing the number of ancestor nodes at the
2049    *    barrier_id crush level.
2050    */
2051   {
2052     std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
2053     /* To do this, we first filter the set of usable osd into an ordered
2054      * list of usable osds
2055      */
2056     auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
2057       return std::make_tuple(
2058         !is_acting /* acting should sort first */,
2059         info.last_update);
2060     };
2061     for (auto &cand : acting) {
2062       auto &cand_info = osd_info(cand);
2063       if (!used(cand) && usable_info(cand_info)) {
2064         ss << " acting candidate " << cand << " " << cand_info << std::endl;
2065         candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
2066       }
2067     }
2068     if (!restrict_to_up_acting) {
2069       for (auto &[cand, info] : all_info) {
2070         if (!used(cand.osd) && usable_info(info) &&
2071             (std::find(acting.begin(), acting.end(), cand.osd)
2072              == acting.end())) {
2073           ss << " other candidate " << cand << " " << info << std::endl;
2074           candidates.push_back(
2075             std::make_pair(get_osd_ord(false, info), cand.osd));
2076         }
2077       }
2078     }
2079     std::sort(candidates.begin(), candidates.end());
2080
2081     // We then filter these candidates by ancestor
2082     std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
2083       get_ancestor(cand.second)->add_osd(cand.first, cand.second);
2084     });
2085   }
2086
2087   auto pop_ancestor = [&](auto &ancestor) {
2088     ceph_assert(!ancestor.is_empty());
2089     auto osd = ancestor.pop_osd();
2090
2091     ss << " accepting candidate " << osd << std::endl;
2092
2093     ceph_assert(!used(osd));
2094     ceph_assert(usable_osd(osd));
2095
2096     want->push_back(osd);
2097     acting_backfill->insert(
2098       pg_shard_t(osd, shard_id_t::NO_SHARD));
2099     ancestor.inc_selected();
2100   };
2101
2102   /* Next, we use the ancestors map to grab a descendant of the
2103    * peering_crush_mandatory_member if not already represented.
2104    *
2105    * TODO: using 0 here to match other users.  Prior to merge, I
2106    * expect that this and other users should instead check against
2107    * CRUSH_ITEM_NONE.
2108    */
2109   if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
2110     auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
2111     if (aiter != ancestors.end() &&
2112         !aiter->second.get_num_selected()) {
2113       ss << " adding required ancestor " << aiter->first << std::endl;
2114       ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
2115       pop_ancestor(aiter->second);
2116     }
2117   }
2118
2119   /* We then place the ancestors in a heap ordered by fewest selected
2120    * and then by the ordering token of the next osd */
2121   bucket_heap_t aheap;
2122   std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
2123     aheap.push_if_nonempty(anc.second);
2124   });
2125
2126   /* and pull from this heap until it's empty or we have enough.
2127    * "We have enough" is a sufficient check here for
2128    * stretch_set_can_peer() because our heap sorting always
2129    * pulls from ancestors with the least number of included OSDs,
2130    * so if it is possible to satisfy the bucket_count constraints we
2131    * will do so.
2132    */
2133   while (!aheap.is_empty() && want->size() < pool.info.size) {
2134     auto next = aheap.pop();
2135     pop_ancestor(next.get());
2136     if (next.get().get_num_selected() < bucket_max) {
2137       aheap.push_if_nonempty(next);
2138     }
2139   }
2140
2141   /* The end result is that we should have as many buckets covered as
2142    * possible while respecting up, the primary selection,
2143    * the pool size (given bucket count constraints),
2144    * and the mandatory member.
2145    */
2146 }
2147
2148
2149 bool PeeringState::recoverable(const vector<int> &want) const
2150 {
2151   unsigned num_want_acting = 0;
2152   set<pg_shard_t> have;
2153   for (int i = 0; i < (int)want.size(); ++i) {
2154     if (want[i] != CRUSH_ITEM_NONE) {
2155       ++num_want_acting;
2156       have.insert(
2157         pg_shard_t(
2158           want[i],
2159           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2160     }
2161   }
2162
2163   if (num_want_acting < pool.info.min_size) {
2164     const bool recovery_ec_pool_below_min_size=
2165       HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
2166
2167     if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
2168       psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
2169       return false;
2170     } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
2171       psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
2172       return false;
2173     }
2174   }
2175   if (missing_loc.get_recoverable_predicate()(have)) {
2176     return true;
2177   } else {
2178     psdout(10) << __func__ << " failed, not recoverable " << dendl;
2179     return false;
2180   }
2181 }
2182
2183 void PeeringState::choose_async_recovery_ec(
2184   const map<pg_shard_t, pg_info_t> &all_info,
2185   const pg_info_t &auth_info,
2186   vector<int> *want,
2187   set<pg_shard_t> *async_recovery,
2188   const OSDMapRef osdmap) const
2189 {
2190   set<pair<int, pg_shard_t> > candidates_by_cost;
2191   for (uint8_t i = 0; i < want->size(); ++i) {
2192     if ((*want)[i] == CRUSH_ITEM_NONE)
2193       continue;
2194
2195     // Considering log entries to recover is accurate enough for
2196     // now. We could use minimum_to_decode_with_cost() later if
2197     // necessary.
2198     pg_shard_t shard_i((*want)[i], shard_id_t(i));
2199     // do not include strays
2200     if (stray_set.find(shard_i) != stray_set.end())
2201       continue;
2202     // Do not include an osd that is not up, since choosing it as
2203     // an async_recovery_target will move it out of the acting set.
2204     // This results in it being identified as a stray during peering,
2205     // because it is no longer in the up or acting set.
2206     if (!is_up(shard_i))
2207       continue;
2208     auto shard_info = all_info.find(shard_i)->second;
2209     // for ec pools we rollback all entries past the authoritative
2210     // last_update *before* activation. This is relatively inexpensive
2211     // compared to recovery, since it is purely local, so treat shards
2212     // past the authoritative last_update the same as those equal to it.
2213     version_t auth_version = auth_info.last_update.version;
2214     version_t candidate_version = shard_info.last_update.version;
2215     if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
2216       auto approx_missing_objects =
2217         shard_info.stats.stats.sum.num_objects_missing;
2218       if (auth_version > candidate_version) {
2219         approx_missing_objects += auth_version - candidate_version;
2220       }
2221       if (static_cast<uint64_t>(approx_missing_objects) >
2222          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2223         candidates_by_cost.emplace(approx_missing_objects, shard_i);
2224       }
2225     } else {
2226       if (auth_version > candidate_version &&
2227           (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2228         candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
2229       }
2230     }
2231   }
2232
2233   psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2234              << dendl;
2235
2236   // take out as many osds as we can for async recovery, in order of cost
2237   for (auto rit = candidates_by_cost.rbegin();
2238        rit != candidates_by_cost.rend(); ++rit) {
2239     pg_shard_t cur_shard = rit->second;
2240     vector<int> candidate_want(*want);
2241     candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
2242     if (recoverable(candidate_want)) {
2243       want->swap(candidate_want);
2244       async_recovery->insert(cur_shard);
2245     }
2246   }
2247   psdout(20) << __func__ << " result want=" << *want
2248              << " async_recovery=" << *async_recovery << dendl;
2249 }
2250
2251 void PeeringState::choose_async_recovery_replicated(
2252   const map<pg_shard_t, pg_info_t> &all_info,
2253   const pg_info_t &auth_info,
2254   vector<int> *want,
2255   set<pg_shard_t> *async_recovery,
2256   const OSDMapRef osdmap) const
2257 {
2258   set<pair<int, pg_shard_t> > candidates_by_cost;
2259   for (auto osd_num : *want) {
2260     pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
2261     // do not include strays
2262     if (stray_set.find(shard_i) != stray_set.end())
2263       continue;
2264     // Do not include an osd that is not up, since choosing it as
2265     // an async_recovery_target will move it out of the acting set.
2266     // This results in it being identified as a stray during peering,
2267     // because it is no longer in the up or acting set.
2268     if (!is_up(shard_i))
2269       continue;
2270     auto shard_info = all_info.find(shard_i)->second;
2271     // use the approximate magnitude of the difference in length of
2272     // logs plus historical missing objects as the cost of recovery
2273     version_t auth_version = auth_info.last_update.version;
2274     version_t candidate_version = shard_info.last_update.version;
2275     if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
2276       auto approx_missing_objects =
2277         shard_info.stats.stats.sum.num_objects_missing;
2278       if (auth_version > candidate_version) {
2279         approx_missing_objects += auth_version - candidate_version;
2280       } else {
2281         approx_missing_objects += candidate_version - auth_version;
2282       }
2283       if (static_cast<uint64_t>(approx_missing_objects)  >
2284          cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2285         candidates_by_cost.emplace(approx_missing_objects, shard_i);
2286       }
2287     } else {
2288       size_t approx_entries;
2289       if (auth_version > candidate_version) {
2290         approx_entries = auth_version - candidate_version;
2291       } else {
2292         approx_entries = candidate_version - auth_version;
2293       }
2294       if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
2295         candidates_by_cost.insert(make_pair(approx_entries, shard_i));
2296       }
2297     }
2298   }
2299
2300   psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
2301              << dendl;
2302   // take out as many osds as we can for async recovery, in order of cost
2303   for (auto rit = candidates_by_cost.rbegin();
2304        rit != candidates_by_cost.rend(); ++rit) {
2305     if (want->size() <= pool.info.min_size) {
2306       break;
2307     }
2308     pg_shard_t cur_shard = rit->second;
2309     vector<int> candidate_want(*want);
2310     for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
2311       if (*it == cur_shard.osd) {
2312         candidate_want.erase(it);
2313         if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
2314           // if we're in stretch mode, we can only remove the osd if it doesn't
2315           // break peering limits.
2316           want->swap(candidate_want);
2317           async_recovery->insert(cur_shard);
2318         }
2319         break;
2320       }
2321     }
2322   }
2323
2324   psdout(20) << __func__ << " result want=" << *want
2325              << " async_recovery=" << *async_recovery << dendl;
2326 }
2327
2328 /**
2329  * choose acting
2330  *
2331  * calculate the desired acting, and request a change with the monitor
2332  * if it differs from the current acting.
2333  *
2334  * if restrict_to_up_acting=true, we filter out anything that's not in
2335  * up/acting.  in order to lift this restriction, we need to
2336  *  1) check whether it's worth switching the acting set any time we get
2337  *     a new pg info (not just here, when recovery finishes)
2338  *  2) check whether anything in want_acting went down on each new map
2339  *     (and, if so, calculate a new want_acting)
2340  *  3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
2341  * TODO!
2342  */
2343 bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
2344                                  bool restrict_to_up_acting,
2345                                  bool *history_les_bound,
2346                                  bool request_pg_temp_change_only)
2347 {
2348   map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
2349   all_info[pg_whoami] = info;
2350
2351   if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
2352     for (auto p = all_info.begin(); p != all_info.end(); ++p) {
2353       psdout(10) << __func__ << " all_info osd." << p->first << " "
2354                  << p->second << dendl;
2355     }
2356   }
2357
2358   auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
2359                                        history_les_bound);
2360
2361   if (auth_log_shard == all_info.end()) {
2362     if (up != acting) {
2363       psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
2364                  << " reverting to up" << dendl;
2365       want_acting = up;
2366       vector<int> empty;
2367       pl->queue_want_pg_temp(empty);
2368     } else {
2369       psdout(10) << __func__ << " failed" << dendl;
2370       ceph_assert(want_acting.empty());
2371     }
2372     return false;
2373   }
2374
2375   ceph_assert(!auth_log_shard->second.is_incomplete());
2376   auth_log_shard_id = auth_log_shard->first;
2377
2378   set<pg_shard_t> want_backfill, want_acting_backfill;
2379   vector<int> want;
2380   stringstream ss;
2381   if (pool.info.is_replicated()) {
2382     auto [primary_shard, oldest_log] = select_replicated_primary(
2383       auth_log_shard,
2384       cct->_conf.get_val<uint64_t>(
2385         "osd_force_auth_primary_missing_objects"),
2386       up,
2387       up_primary,
2388       all_info,
2389       get_osdmap(),
2390       ss);
2391     if (pool.info.is_stretch_pool()) {
2392       calc_replicated_acting_stretch(
2393         primary_shard,
2394         oldest_log,
2395         get_osdmap()->get_pg_size(info.pgid.pgid),
2396         acting,
2397         up,
2398         up_primary,
2399         all_info,
2400         restrict_to_up_acting,
2401         &want,
2402         &want_backfill,
2403         &want_acting_backfill,
2404         get_osdmap(),
2405         pool,
2406         ss);
2407     } else {
2408       calc_replicated_acting(
2409         primary_shard,
2410         oldest_log,
2411         get_osdmap()->get_pg_size(info.pgid.pgid),
2412         acting,
2413         up,
2414         up_primary,
2415         all_info,
2416         restrict_to_up_acting,
2417         &want,
2418         &want_backfill,
2419         &want_acting_backfill,
2420         get_osdmap(),
2421         pool,
2422         ss);
2423     }
2424   } else {
2425     calc_ec_acting(
2426       auth_log_shard,
2427       get_osdmap()->get_pg_size(info.pgid.pgid),
2428       acting,
2429       up,
2430       all_info,
2431       restrict_to_up_acting,
2432       &want,
2433       &want_backfill,
2434       &want_acting_backfill,
2435       ss);
2436   }
2437   psdout(10) << ss.str() << dendl;
2438
2439   if (!recoverable(want)) {
2440     want_acting.clear();
2441     return false;
2442   }
2443
2444   set<pg_shard_t> want_async_recovery;
2445   if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
2446     if (pool.info.is_erasure()) {
2447       choose_async_recovery_ec(
2448         all_info, auth_log_shard->second, &want, &want_async_recovery,
2449         get_osdmap());
2450     } else {
2451       choose_async_recovery_replicated(
2452         all_info, auth_log_shard->second, &want, &want_async_recovery,
2453         get_osdmap());
2454     }
2455   }
2456   while (want.size() > pool.info.size) {
2457     // async recovery should have taken out as many osds as it can.
2458     // if not, then always evict the last peer
2459     // (will get synchronously recovered later)
2460     psdout(10) << __func__ << " evicting osd." << want.back()
2461                << " from oversized want " << want << dendl;
2462     want.pop_back();
2463   }
2464   if (want != acting) {
2465     psdout(10) << __func__ << " want " << want << " != acting " << acting
2466                << ", requesting pg_temp change" << dendl;
2467     want_acting = want;
2468
2469     if (!cct->_conf->osd_debug_no_acting_change) {
2470       if (want_acting == up) {
2471         // There can't be any pending backfill if
2472         // want is the same as crush map up OSDs.
2473         ceph_assert(want_backfill.empty());
2474         vector<int> empty;
2475         pl->queue_want_pg_temp(empty);
2476       } else
2477         pl->queue_want_pg_temp(want);
2478     }
2479     return false;
2480   }
2481
2482   if (request_pg_temp_change_only)
2483     return true;
2484   want_acting.clear();
2485   acting_recovery_backfill = want_acting_backfill;
2486   psdout(10) << "acting_recovery_backfill is "
2487              << acting_recovery_backfill << dendl;
2488   ceph_assert(
2489     backfill_targets.empty() ||
2490     backfill_targets == want_backfill);
2491   if (backfill_targets.empty()) {
2492     // Caller is GetInfo
2493     backfill_targets = want_backfill;
2494   }
2495   // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
2496   ceph_assert(
2497     async_recovery_targets.empty() ||
2498     async_recovery_targets == want_async_recovery ||
2499     !needs_recovery());
2500   if (async_recovery_targets.empty() || !needs_recovery()) {
2501     async_recovery_targets = want_async_recovery;
2502   }
2503   // Will not change if already set because up would have had to change
2504   // Verify that nothing in backfill is in stray_set
2505   for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
2506     ceph_assert(stray_set.find(*i) == stray_set.end());
2507   }
2508   psdout(10) << "choose_acting want=" << want << " backfill_targets="
2509            << want_backfill << " async_recovery_targets="
2510            << async_recovery_targets << dendl;
2511   return true;
2512 }
2513
2514 void PeeringState::log_weirdness()
2515 {
2516   if (pg_log.get_tail() != info.log_tail)
2517     pl->get_clog_error() << info.pgid
2518                            << " info mismatch, log.tail " << pg_log.get_tail()
2519                            << " != info.log_tail " << info.log_tail;
2520   if (pg_log.get_head() != info.last_update)
2521     pl->get_clog_error() << info.pgid
2522                            << " info mismatch, log.head " << pg_log.get_head()
2523                            << " != info.last_update " << info.last_update;
2524
2525   if (!pg_log.get_log().empty()) {
2526     // sloppy check
2527     if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
2528       pl->get_clog_error() << info.pgid
2529                              << " log bound mismatch, info (tail,head] ("
2530                              << pg_log.get_tail() << ","
2531                              << pg_log.get_head() << "]"
2532                              << " actual ["
2533                              << pg_log.get_log().log.begin()->version << ","
2534                              << pg_log.get_log().log.rbegin()->version << "]";
2535   }
2536
2537   if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
2538     pl->get_clog_error() << info.pgid
2539                            << " caller_ops.size "
2540                            << pg_log.get_log().caller_ops.size()
2541                            << " > log size " << pg_log.get_log().log.size();
2542   }
2543 }
2544
2545 /*
2546  * Process information from a replica to determine if it could have any
2547  * objects that i need.
2548  *
2549  * TODO: if the missing set becomes very large, this could get expensive.
2550  * Instead, we probably want to just iterate over our unfound set.
2551  */
2552 bool PeeringState::search_for_missing(
2553   const pg_info_t &oinfo, const pg_missing_t &omissing,
2554   pg_shard_t from,
2555   PeeringCtxWrapper &ctx)
2556 {
2557   uint64_t num_unfound_before = missing_loc.num_unfound();
2558   bool found_missing = missing_loc.add_source_info(
2559     from, oinfo, omissing, ctx.handle);
2560   if (found_missing && num_unfound_before != missing_loc.num_unfound())
2561     pl->publish_stats_to_osd();
2562   // avoid doing this if the peer is empty.  This is abit of paranoia
2563   // to avoid doing something rash if add_source_info() above
2564   // incorrectly decided we found something new. (if the peer has
2565   // last_update=0'0 that's impossible.)
2566   if (found_missing &&
2567       oinfo.last_update != eversion_t()) {
2568     pg_info_t tinfo(oinfo);
2569     tinfo.pgid.shard = pg_whoami.shard;
2570     ctx.send_info(
2571       from.osd,
2572       spg_t(info.pgid.pgid, from.shard),
2573       get_osdmap_epoch(),  // fixme: use lower epoch?
2574       get_osdmap_epoch(),
2575       tinfo);
2576   }
2577   return found_missing;
2578 }
2579
2580 bool PeeringState::discover_all_missing(
2581   BufferedRecoveryMessages &rctx)
2582 {
2583   auto &missing = pg_log.get_missing();
2584   uint64_t unfound = get_num_unfound();
2585   bool any = false;  // did we start any queries
2586
2587   psdout(10) << __func__ << " "
2588              << missing.num_missing() << " missing, "
2589              << unfound << " unfound"
2590              << dendl;
2591
2592   auto m = might_have_unfound.begin();
2593   auto mend = might_have_unfound.end();
2594   for (; m != mend; ++m) {
2595     pg_shard_t peer(*m);
2596
2597     if (!get_osdmap()->is_up(peer.osd)) {
2598       psdout(20) << __func__ << " skipping down osd." << peer << dendl;
2599       continue;
2600     }
2601
2602     if (peer_purged.count(peer)) {
2603       psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
2604       continue;
2605     }
2606
2607     auto iter = peer_info.find(peer);
2608     if (iter != peer_info.end() &&
2609         (iter->second.is_empty() || iter->second.dne())) {
2610       // ignore empty peers
2611       continue;
2612     }
2613
2614     // If we've requested any of this stuff, the pg_missing_t information
2615     // should be on its way.
2616     // TODO: coalsce requested_* into a single data structure
2617     if (peer_missing.find(peer) != peer_missing.end()) {
2618       psdout(20) << __func__ << ": osd." << peer
2619                  << ": we already have pg_missing_t" << dendl;
2620       continue;
2621     }
2622     if (peer_log_requested.find(peer) != peer_log_requested.end()) {
2623       psdout(20) << __func__ << ": osd." << peer
2624                  << ": in peer_log_requested" << dendl;
2625       continue;
2626     }
2627     if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
2628       psdout(20) << __func__ << ": osd." << peer
2629                  << ": in peer_missing_requested" << dendl;
2630       continue;
2631     }
2632
2633     // Request missing
2634     psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
2635                << dendl;
2636     peer_missing_requested.insert(peer);
2637     rctx.send_query(
2638       peer.osd,
2639       spg_t(info.pgid.pgid, peer.shard),
2640       pg_query_t(
2641         pg_query_t::FULLLOG,
2642         peer.shard, pg_whoami.shard,
2643         info.history, get_osdmap_epoch()));
2644     any = true;
2645   }
2646   return any;
2647 }
2648
2649 /* Build the might_have_unfound set.
2650  *
2651  * This is used by the primary OSD during recovery.
2652  *
2653  * This set tracks the OSDs which might have unfound objects that the primary
2654  * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
2655  * will remove the OSD from the set.
2656  */
2657 void PeeringState::build_might_have_unfound()
2658 {
2659   ceph_assert(might_have_unfound.empty());
2660   ceph_assert(is_primary());
2661
2662   psdout(10) << __func__ << dendl;
2663
2664   check_past_interval_bounds();
2665
2666   might_have_unfound = past_intervals.get_might_have_unfound(
2667     pg_whoami,
2668     pool.info.is_erasure());
2669
2670   // include any (stray) peers
2671   for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
2672     might_have_unfound.insert(p->first);
2673
2674   psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
2675 }
2676
/*
 * Activate this PG for the current interval.
 *
 * Clears PG_STATE_DOWN, updates last_epoch_started/last_interval_started,
 * resets the local recovery pointers, and schedules an ActivateCommitted
 * event against transaction t.  On the primary it additionally computes
 * the snap trim set, sends each acting/recovery/backfill peer either a
 * pg_info or an MOSDPGLog, updates the local view of each peer's info and
 * missing set, populates missing_loc, and (when recovery is needed) builds
 * might_have_unfound and queries for missing sets, before setting
 * PG_STATE_ACTIVATING and calling pl->on_activate().
 *
 * t                 transaction used for the commit event and for the
 *                   final log roll-forward
 * activation_epoch  epoch to record as last_epoch_started
 * ctx               peering context used for info messages, queries, and
 *                   the handle passed to missing_loc
 *
 * Must not be called on an already-peered PG (asserted on entry).
 */
void PeeringState::activate(
  ObjectStore::Transaction& t,
  epoch_t activation_epoch,
  PeeringCtxWrapper &ctx)
{
  ceph_assert(!is_peered());

  // twiddle pg state
  state_clear(PG_STATE_DOWN);

  send_notify = false;

  if (is_primary()) {
    // only update primary last_epoch_started if we will go active
    if (acting_set_writeable()) {
      ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
             info.last_epoch_started <= activation_epoch);
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  } else if (is_acting(pg_whoami)) {
    /* update last_epoch_started on acting replica to whatever the primary sent
     * unless it's smaller (could happen if we are going peered rather than
     * active, see doc/dev/osd_internals/last_epoch_started.rst) */
    if (info.last_epoch_started < activation_epoch) {
      info.last_epoch_started = activation_epoch;
      info.last_interval_started = info.history.same_interval_since;
    }
  }

  auto &missing = pg_log.get_missing();

  min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
  if (is_primary()) {
    last_update_ondisk = info.last_update;
  }
  last_update_applied = info.last_update;
  last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();

  need_up_thru = false;

  // write pg info, log
  dirty_info = true;
  dirty_big_info = true; // maybe

  // queue an ActivateCommitted event, tied to transaction t, stamped with
  // the current map epoch and carrying activation_epoch
  pl->schedule_event_on_commit(
    t,
    std::make_shared<PGPeeringEvent>(
      get_osdmap_epoch(),
      get_osdmap_epoch(),
      ActivateCommitted(
        get_osdmap_epoch(),
        activation_epoch)));

  // init complete pointer
  if (missing.num_missing() == 0) {
    psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
             << " -> " << info.last_update << dendl;
    info.last_complete = info.last_update;
    info.stats.stats.sum.num_objects_missing = 0;
    pg_log.reset_recovery_pointers();
  } else {
    psdout(10) << "activate - not complete, " << missing << dendl;
    info.stats.stats.sum.num_objects_missing = missing.num_missing();
    pg_log.activate_not_complete(info);
  }

  log_weirdness();

  if (is_primary()) {
    // initialize snap_trimq
    // to_trim := this pool's entries in the osdmap removed_snaps_queue,
    // minus the ones we have already purged (subtracted below)
    interval_set<snapid_t> to_trim;
    auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
    auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
    if (p != removed_snaps_queue.end()) {
      dout(20) << "activate - purged_snaps " << info.purged_snaps
               << " removed_snaps " << p->second
               << dendl;
      for (auto q : p->second) {
        to_trim.insert(q.first, q.second);
      }
    }
    interval_set<snapid_t> purged;
    purged.intersection_of(to_trim, info.purged_snaps);
    to_trim.subtract(purged);

    if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
      renew_lease(pl->get_mnow());
      // do not schedule until we are actually activated
    }

    // adjust purged_snaps: PG may have been inactive while snaps were pruned
    // from the removed_snaps_queue in the osdmap.  update local purged_snaps
    // reflect only those snaps that we thought were pruned and were still in
    // the queue.
    info.purged_snaps.swap(purged);

    // start up replicas
    info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
                                                 prior_readable_until_ub);

    ceph_assert(!acting_recovery_backfill.empty());
    for (auto i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      if (*i == pg_whoami) continue;
      pg_shard_t peer = *i;
      ceph_assert(peer_info.count(peer));
      pg_info_t& pi = peer_info[peer];

      psdout(10) << "activate peer osd." << peer << " " << pi << dendl;

      // m stays null when the peer only needs a pg_info (sent through ctx)
      MRef<MOSDPGLog> m;
      ceph_assert(peer_missing.count(peer));
      pg_missing_t& pm = peer_missing[peer];

      // captured now because the backfill branch below overwrites pi.history
      bool needs_past_intervals = pi.dne();

      if (pi.last_update == info.last_update) {
        // empty log
        if (!pi.last_backfill.is_max())
          pl->get_clog_info() << info.pgid << " continuing backfill to osd."
                                << peer
                                << " from (" << pi.log_tail << "," << pi.last_update
                                << "] " << pi.last_backfill
                                << " to " << info.last_update;
        if (!pi.is_empty()) {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, queueing in pending_activators" << dendl;
          ctx.send_info(
            peer.osd,
            spg_t(info.pgid.pgid, peer.shard),
            get_osdmap_epoch(), // fixme: use lower epoch?
            get_osdmap_epoch(),
            info,
            get_lease());
        } else {
          psdout(10) << "activate peer osd." << peer
                     << " is up to date, but sending pg_log anyway" << dendl;
          m = make_message<MOSDPGLog>(
            i->shard, pg_whoami.shard,
            get_osdmap_epoch(), info,
            last_peering_reset);
        }
      } else if (
        pg_log.get_tail() > pi.last_update ||
        pi.last_backfill == hobject_t() ||
        (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
        /* ^ This last case covers a situation where a replica is not contiguous
         * with the auth_log, but is contiguous with this replica.  Reshuffling
         * the active set to handle this would be tricky, so instead we just go
         * ahead and backfill it anyway.  This is probably preferable in any
         * case since the replica in question would have to be significantly
         * behind.
         */
        // backfill
        pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
                               << " from (" << pi.log_tail << "," << pi.last_update
                               << "] " << pi.last_backfill
                               << " to " << info.last_update;

        // reset our view of the peer so that it restarts backfill from the
        // beginning (last_backfill == hobject_t()) with our info as baseline
        pi.last_update = info.last_update;
        pi.last_complete = info.last_update;
        pi.set_last_backfill(hobject_t());
        pi.last_epoch_started = info.last_epoch_started;
        pi.last_interval_started = info.last_interval_started;
        pi.history = info.history;
        pi.hit_set = info.hit_set;
        // Save num_bytes for reservation request, can't be negative
        peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
        pi.stats.stats.clear();
        pi.stats.stats.sum.num_bytes = peer_bytes[peer];

        // initialize peer with our purged_snaps.
        pi.purged_snaps = info.purged_snaps;

        m = make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), pi,
          last_peering_reset /* epoch to create pg at */);

        // send some recent log, so that op dup detection works well.
        m->log.copy_up_to(cct, pg_log.get_log(),
                          cct->_conf->osd_max_pg_log_entries);
        m->info.log_tail = m->log.tail;
        pi.log_tail = m->log.tail;  // sigh...

        pm.clear();
      } else {
        // catch up
        ceph_assert(pg_log.get_tail() <= pi.last_update);
        m = make_message<MOSDPGLog>(
          i->shard, pg_whoami.shard,
          get_osdmap_epoch(), info,
          last_peering_reset /* epoch to create pg at */);
        // send new stuff to append to replicas log
        m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
      }

      // share past_intervals if we are creating the pg on the replica
      // based on whether our info for that peer was dne() *before*
      // updating pi.history in the backfill block above.
      if (m && needs_past_intervals)
        m->past_intervals = past_intervals;

      // update local version of peer's missing list!
      // (only entries at or before the peer's last_backfill are considered,
      // and error entries are skipped)
      if (m && pi.last_backfill != hobject_t()) {
        for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
          if (p->soid <= pi.last_backfill &&
              !p->is_error()) {
            if (perform_deletes_during_peering() && p->is_delete()) {
              pm.rm(p->soid, p->version);
            } else {
              pm.add_next_event(*p);
            }
          }
        }
      }

      if (m) {
        dout(10) << "activate peer osd." << peer << " sending " << m->log
                 << dendl;
        m->lease = get_lease();
        pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
      }

      // peer now has
      pi.last_update = info.last_update;

      // update our missing
      if (pm.num_missing() == 0) {
        pi.last_complete = pi.last_update;
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " uptodate" << dendl;
      } else {
        psdout(10) << "activate peer osd." << peer << " " << pi
                   << " missing " << pm << dendl;
      }
    }

    // Set up missing_loc
    // complete_shards collects shards with nothing missing (and, for peers,
    // a finished backfill); it feeds the batch-source shortcut below
    set<pg_shard_t> complete_shards;
    for (auto i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      psdout(20) << __func__ << " setting up missing_loc from shard " << *i
                 << " " << dendl;
      if (*i == get_primary()) {
        missing_loc.add_active_missing(missing);
        if (!missing.have_missing())
          complete_shards.insert(*i);
      } else {
        auto peer_missing_entry = peer_missing.find(*i);
        ceph_assert(peer_missing_entry != peer_missing.end());
        missing_loc.add_active_missing(peer_missing_entry->second);
        if (!peer_missing_entry->second.have_missing() &&
            peer_info[*i].last_backfill.is_max())
          complete_shards.insert(*i);
      }
    }

    // If necessary, create might_have_unfound to help us find our unfound objects.
    // NOTE: It's important that we build might_have_unfound before trimming the
    // past intervals.
    might_have_unfound.clear();
    if (needs_recovery()) {
      // If only one shard has missing, we do a trick to add all others as recovery
      // source, this is considered safe since the PGLogs have been merged locally,
      // and covers vast majority of the use cases, like one OSD/host is down for
      // a while for hardware repairing
      if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
        missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
      } else {
        missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
                                    ctx.handle);
        for (auto i = acting_recovery_backfill.begin();
             i != acting_recovery_backfill.end();
             ++i) {
          if (*i == pg_whoami) continue;
          psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
          ceph_assert(peer_missing.count(*i));
          ceph_assert(peer_info.count(*i));
          missing_loc.add_source_info(
            *i,
            peer_info[*i],
            peer_missing[*i],
            ctx.handle);
        }
      }
      // peers outside acting_recovery_backfill whose missing set we hold
      // are also searched as potential sources
      for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
        if (is_acting_recovery_backfill(i->first))
          continue;
        ceph_assert(peer_info.count(i->first));
        search_for_missing(
          peer_info[i->first],
          i->second,
          i->first,
          ctx);
      }

      build_might_have_unfound();

      // Always call now so update_calc_stats() will be accurate
      discover_all_missing(ctx.msgs);

    }

    // num_objects_degraded if calculated should reflect this too, unless no
    // missing and we are about to go clean.
    if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
      state_set(PG_STATE_UNDERSIZED);
    }

    state_set(PG_STATE_ACTIVATING);
    pl->on_activate(std::move(to_trim));
  }
  // applies to primary and non-primary alike: roll the log forward only
  // when the acting set is writeable
  if (acting_set_writeable()) {
    PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
    pg_log.roll_forward(rollbacker.get());
  }
}
2998
2999 void PeeringState::share_pg_info()
3000 {
3001   psdout(10) << "share_pg_info" << dendl;
3002
3003   info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
3004                                                prior_readable_until_ub);
3005
3006   // share new pg_info_t with replicas
3007   ceph_assert(!acting_recovery_backfill.empty());
3008   for (auto pg_shard : acting_recovery_backfill) {
3009     if (pg_shard == pg_whoami) continue;
3010     if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
3011       peer->second.last_epoch_started = info.last_epoch_started;
3012       peer->second.last_interval_started = info.last_interval_started;
3013       peer->second.history.merge(info.history);
3014     }
3015     MessageRef m;
3016     if (last_require_osd_release >= ceph_release_t::octopus) {
3017       m = make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
3018                           info,
3019                           get_osdmap_epoch(),
3020                           get_osdmap_epoch(),
3021                           std::optional<pg_lease_t>{get_lease()},
3022                           std::nullopt);
3023     } else {
3024       m = make_message<MOSDPGInfo>(get_osdmap_epoch(),
3025               MOSDPGInfo::pg_list_t{
3026                 pg_notify_t{pg_shard.shard,
3027                             pg_whoami.shard,
3028                             get_osdmap_epoch(),
3029                             get_osdmap_epoch(),
3030                             info,
3031                             past_intervals}});
3032     }
3033     pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
3034   }
3035 }
3036
3037 void PeeringState::merge_log(
3038   ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog,
3039   pg_shard_t from)
3040 {
3041   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3042   pg_log.merge_log(
3043     oinfo, std::move(olog), from, info, rollbacker.get(),
3044     dirty_info, dirty_big_info);
3045 }
3046
3047 void PeeringState::rewind_divergent_log(
3048   ObjectStore::Transaction& t, eversion_t newhead)
3049 {
3050   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
3051   pg_log.rewind_divergent_log(
3052     newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
3053 }
3054
3055
3056 void PeeringState::proc_primary_info(
3057   ObjectStore::Transaction &t, const pg_info_t &oinfo)
3058 {
3059   ceph_assert(!is_primary());
3060
3061   update_history(oinfo.history);
3062   if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
3063     info.stats.stats.sum.num_scrub_errors = 0;
3064     info.stats.stats.sum.num_shallow_scrub_errors = 0;
3065     info.stats.stats.sum.num_deep_scrub_errors = 0;
3066     dirty_info = true;
3067   }
3068
3069   if (!(info.purged_snaps == oinfo.purged_snaps)) {
3070     psdout(10) << __func__ << " updating purged_snaps to "
3071                << oinfo.purged_snaps
3072                << dendl;
3073     info.purged_snaps = oinfo.purged_snaps;
3074     dirty_info = true;
3075     dirty_big_info = true;
3076   }
3077 }
3078
3079 void PeeringState::proc_master_log(
3080   ObjectStore::Transaction& t, pg_info_t &oinfo,
3081   pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
3082 {
3083   psdout(10) << "proc_master_log for osd." << from << ": "
3084              << olog << " " << omissing << dendl;
3085   ceph_assert(!is_peered() && is_primary());
3086
3087   // merge log into our own log to build master log.  no need to
3088   // make any adjustments to their missing map; we are taking their
3089   // log to be authoritative (i.e., their entries are by definitely
3090   // non-divergent).
3091   merge_log(t, oinfo, std::move(olog), from);
3092   peer_info[from] = oinfo;
3093   psdout(10) << " peer osd." << from << " now " << oinfo
3094              << " " << omissing << dendl;
3095   might_have_unfound.insert(from);
3096
3097   // See doc/dev/osd_internals/last_epoch_started
3098   if (oinfo.last_epoch_started > info.last_epoch_started) {
3099     info.last_epoch_started = oinfo.last_epoch_started;
3100     dirty_info = true;
3101   }
3102   if (oinfo.last_interval_started > info.last_interval_started) {
3103     info.last_interval_started = oinfo.last_interval_started;
3104     dirty_info = true;
3105   }
3106   update_history(oinfo.history);
3107   ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
3108          info.last_epoch_started >= info.history.last_epoch_started);
3109
3110   peer_missing[from].claim(std::move(omissing));
3111 }
3112
3113 void PeeringState::proc_replica_log(
3114   pg_info_t &oinfo,
3115   const pg_log_t &olog,
3116   pg_missing_t&& omissing,
3117   pg_shard_t from)
3118 {
3119   psdout(10) << "proc_replica_log for osd." << from << ": "
3120              << oinfo << " " << olog << " " << omissing << dendl;
3121
3122   pg_log.proc_replica_log(oinfo, olog, omissing, from);
3123
3124   peer_info[from] = oinfo;
3125   psdout(10) << " peer osd." << from << " now "
3126              << oinfo << " " << omissing << dendl;
3127   might_have_unfound.insert(from);
3128
3129   for (auto i = omissing.get_items().begin();
3130        i != omissing.get_items().end();
3131        ++i) {
3132     psdout(20) << " after missing " << i->first
3133                << " need " << i->second.need
3134                << " have " << i->second.have << dendl;
3135   }
3136   peer_missing[from].claim(std::move(omissing));
3137 }
3138
3139 void PeeringState::fulfill_info(
3140   pg_shard_t from, const pg_query_t &query,
3141   pair<pg_shard_t, pg_info_t> &notify_info)
3142 {
3143   ceph_assert(from == primary);
3144   ceph_assert(query.type == pg_query_t::INFO);
3145
3146   // info
3147   psdout(10) << "sending info" << dendl;
3148   notify_info = make_pair(from, info);
3149 }
3150
3151 void PeeringState::fulfill_log(
3152   pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
3153 {
3154   psdout(10) << "log request from " << from << dendl;
3155   ceph_assert(from == primary);
3156   ceph_assert(query.type != pg_query_t::INFO);
3157
3158   auto mlog = make_message<MOSDPGLog>(
3159     from.shard, pg_whoami.shard,
3160     get_osdmap_epoch(),
3161     info, query_epoch);
3162   mlog->missing = pg_log.get_missing();
3163
3164   // primary -> other, when building master log
3165   if (query.type == pg_query_t::LOG) {
3166     psdout(10) << " sending info+missing+log since " << query.since
3167                << dendl;
3168     if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
3169       pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
3170                              << query.since
3171                              << " when my log.tail is " << pg_log.get_tail()
3172                              << ", sending full log instead";
3173       mlog->log = pg_log.get_log();           // primary should not have requested this!!
3174     } else
3175       mlog->log.copy_after(cct, pg_log.get_log(), query.since);
3176   }
3177   else if (query.type == pg_query_t::FULLLOG) {
3178     psdout(10) << " sending info+missing+full log" << dendl;
3179     mlog->log = pg_log.get_log();
3180   }
3181
3182   psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
3183
3184   pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
3185 }
3186
3187 void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
3188 {
3189   if (query.query.type == pg_query_t::INFO) {
3190     pair<pg_shard_t, pg_info_t> notify_info;
3191     // note this refreshes our prior_readable_until_ub value
3192     update_history(query.query.history);
3193     fulfill_info(query.from, query.query, notify_info);
3194     rctx.send_notify(
3195       notify_info.first.osd,
3196       pg_notify_t(
3197         notify_info.first.shard, pg_whoami.shard,
3198         query.query_epoch,
3199         get_osdmap_epoch(),
3200         notify_info.second,
3201         past_intervals));
3202   } else {
3203     update_history(query.query.history);
3204     fulfill_log(query.from, query.query, query.query_epoch);
3205   }
3206 }
3207
3208 void PeeringState::try_mark_clean()
3209 {
3210   if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
3211     state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
3212     state_set(PG_STATE_CLEAN);
3213     info.history.last_epoch_clean = get_osdmap_epoch();
3214     info.history.last_interval_clean = info.history.same_interval_since;
3215     past_intervals.clear();
3216     dirty_big_info = true;
3217     dirty_info = true;
3218   }
3219
3220   if (!is_active() && is_peered()) {
3221     if (is_clean()) {
3222       bool target;
3223       if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
3224         if (target) {
3225           psdout(10) << "ready to merge (target)" << dendl;
3226           pl->set_ready_to_merge_target(
3227             info.last_update,
3228             info.history.last_epoch_started,
3229             info.history.last_epoch_clean);
3230         } else {
3231           psdout(10) << "ready to merge (source)" << dendl;
3232           pl->set_ready_to_merge_source(info.last_update);
3233         }
3234       }
3235     } else {
3236       psdout(10) << "not clean, not ready to merge" << dendl;
3237       // we should have notified OSD in Active state entry point
3238     }
3239   }
3240
3241   state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
3242
3243   share_pg_info();
3244   pl->publish_stats_to_osd();
3245   clear_recovery_state();
3246 }
3247
3248 void PeeringState::split_into(
3249   pg_t child_pgid, PeeringState *child, unsigned split_bits)
3250 {
3251   child->update_osdmap_ref(get_osdmap());
3252   child->pool = pool;
3253
3254   // Log
3255   pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
3256   child->info.last_complete = info.last_complete;
3257
3258   info.last_update = pg_log.get_head();
3259   child->info.last_update = child->pg_log.get_head();
3260
3261   child->info.last_user_version = info.last_user_version;
3262
3263   info.log_tail = pg_log.get_tail();
3264   child->info.log_tail = child->pg_log.get_tail();
3265
3266   // reset last_complete, we might have modified pg_log & missing above
3267   pg_log.reset_complete_to(&info);
3268   child->pg_log.reset_complete_to(&child->info);
3269
3270   // Info
3271   child->info.history = info.history;
3272   child->info.history.epoch_created = get_osdmap_epoch();
3273   child->info.purged_snaps = info.purged_snaps;
3274
3275   if (info.last_backfill.is_max()) {
3276     child->info.set_last_backfill(hobject_t::get_max());
3277   } else {
3278     // restart backfill on parent and child to be safe.  we could
3279     // probably do better in the bitwise sort case, but it's more
3280     // fragile (there may be special work to do on backfill completion
3281     // in the future).
3282     info.set_last_backfill(hobject_t());
3283     child->info.set_last_backfill(hobject_t());
3284     // restarting backfill implies that the missing set is empty,
3285     // since it is only used for objects prior to last_backfill
3286     pg_log.reset_backfill();
3287     child->pg_log.reset_backfill();
3288   }
3289
3290   child->info.stats = info.stats;
3291   child->info.stats.parent_split_bits = split_bits;
3292   info.stats.stats_invalid = true;
3293   child->info.stats.stats_invalid = true;
3294   child->info.last_epoch_started = info.last_epoch_started;
3295   child->info.last_interval_started = info.last_interval_started;
3296
3297   // There can't be recovery/backfill going on now
3298   int primary, up_primary;
3299   vector<int> newup, newacting;
3300   get_osdmap()->pg_to_up_acting_osds(
3301     child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
3302   child->init_primary_up_acting(
3303     newup,
3304     newacting,
3305     up_primary,
3306     primary);
3307   child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
3308
3309   // this comparison includes primary rank via pg_shard_t
3310   if (get_primary() != child->get_primary())
3311     child->info.history.same_primary_since = get_osdmap_epoch();
3312
3313   child->info.stats.up = newup;
3314   child->info.stats.up_primary = up_primary;
3315   child->info.stats.acting = newacting;
3316   child->info.stats.acting_primary = primary;
3317   child->info.stats.mapping_epoch = get_osdmap_epoch();
3318
3319   // History
3320   child->past_intervals = past_intervals;
3321
3322   child->on_new_interval();
3323
3324   child->send_notify = !child->is_primary();
3325
3326   child->dirty_info = true;
3327   child->dirty_big_info = true;
3328   dirty_info = true;
3329   dirty_big_info = true;
3330 }
3331
3332 void PeeringState::merge_from(
3333   map<spg_t,PeeringState *>& sources,
3334   PeeringCtx &rctx,
3335   unsigned split_bits,
3336   const pg_merge_meta_t& last_pg_merge_meta)
3337 {
3338   bool incomplete = false;
3339   if (info.last_complete != info.last_update ||
3340       info.is_incomplete() ||
3341       info.dne()) {
3342     psdout(10) << __func__ << " target incomplete" << dendl;
3343     incomplete = true;
3344   }
3345   if (last_pg_merge_meta.source_pgid != pg_t()) {
3346     if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
3347       psdout(10) << __func__ << " target doesn't match expected parent "
3348                  << last_pg_merge_meta.source_pgid.get_parent()
3349                  << " of source_pgid " << last_pg_merge_meta.source_pgid
3350                  << dendl;
3351       incomplete = true;
3352     }
3353     if (info.last_update != last_pg_merge_meta.target_version) {
3354       psdout(10) << __func__ << " target version doesn't match expected "
3355                << last_pg_merge_meta.target_version << dendl;
3356       incomplete = true;
3357     }
3358   }
3359
3360   PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
3361   pg_log.roll_forward(handler.get());
3362
3363   info.last_complete = info.last_update;  // to fake out trim()
3364   pg_log.reset_recovery_pointers();
3365   pg_log.trim(info.last_update, info);
3366
3367   vector<PGLog*> log_from;
3368   for (auto& i : sources) {
3369     auto& source = i.second;
3370     if (!source) {
3371       psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
3372       incomplete = true;
3373       continue;
3374     }
3375     if (source->info.last_complete != source->info.last_update ||
3376         source->info.is_incomplete() ||
3377         source->info.dne()) {
3378       psdout(10) << __func__ << " source " << source->pg_whoami
3379                  << " incomplete"
3380                  << dendl;
3381       incomplete = true;
3382     }
3383     if (last_pg_merge_meta.source_pgid != pg_t()) {
3384       if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
3385         dout(10) << __func__ << " source " << source->info.pgid.pgid
3386                  << " doesn't match expected source pgid "
3387                  << last_pg_merge_meta.source_pgid << dendl;
3388         incomplete = true;
3389       }
3390       if (source->info.last_update != last_pg_merge_meta.source_version) {
3391         dout(10) << __func__ << " source version doesn't match expected "
3392                  << last_pg_merge_meta.target_version << dendl;
3393         incomplete = true;
3394       }
3395     }
3396
3397     // prepare log
3398     PGLog::LogEntryHandlerRef handler{
3399       source->pl->get_log_handler(rctx.transaction)};
3400     source->pg_log.roll_forward(handler.get());
3401     source->info.last_complete = source->info.last_update;  // to fake out trim()
3402     source->pg_log.reset_recovery_pointers();
3403     source->pg_log.trim(source->info.last_update, source->info);
3404     log_from.push_back(&source->pg_log);
3405
3406     // combine stats
3407     info.stats.add(source->info.stats);
3408
3409     // pull up last_update
3410     info.last_update = std::max(info.last_update, source->info.last_update);
3411
3412     // adopt source's PastIntervals if target has none.  we can do this since
3413     // pgp_num has been reduced prior to the merge, so the OSD mappings for
3414     // the PGs are identical.
3415     if (past_intervals.empty() && !source->past_intervals.empty()) {
3416       psdout(10) << __func__ << " taking source's past_intervals" << dendl;
3417       past_intervals = source->past_intervals;
3418     }
3419   }
3420
3421   info.last_complete = info.last_update;
3422   info.log_tail = info.last_update;
3423   if (incomplete) {
3424     info.last_backfill = hobject_t();
3425   }
3426
3427   // merge logs
3428   pg_log.merge_from(log_from, info.last_update);
3429
3430   // make sure we have a meaningful last_epoch_started/clean (if we were a
3431   // placeholder)
3432   if (info.history.epoch_created == 0) {
3433     // start with (a) source's history, since these PGs *should* have been
3434     // remapped in concert with each other...
3435     info.history = sources.begin()->second->info.history;
3436
3437     // we use the last_epoch_{started,clean} we got from
3438     // the caller, which are the epochs that were reported when the PGs were
3439     // found to be ready for merge.
3440     info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
3441     info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3442     info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
3443     psdout(10) << __func__
3444                << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
3445                << last_pg_merge_meta.last_epoch_clean
3446                << " from pool last_dec_*, source pg history was "
3447                << sources.begin()->second->info.history
3448                << dendl;
3449
3450     // above we have pulled down source's history and we need to check
3451     // history.epoch_created again to confirm that source is not a placeholder
3452     // too. (peering requires a sane history.same_interval_since value for any
3453     // non-newly created pg and below here we know we are basically iterating
3454     // back a series of past maps to fake a merge process, hence we need to
3455     // fix history.same_interval_since first so that start_peering_interval()
3456     // will not complain)
3457     if (info.history.epoch_created == 0) {
3458       dout(10) << __func__ << " both merge target and source are placeholders,"
3459                << " set sis to lec " << info.history.last_epoch_clean
3460                << dendl;
3461       info.history.same_interval_since = info.history.last_epoch_clean;
3462     }
3463
3464     // if the past_intervals start is later than last_epoch_clean, it
3465     // implies the source repeered again but the target didn't, or
3466     // that the source became clean in a later epoch than the target.
3467     // avoid the discrepancy by adjusting the interval start
3468     // backwards to match so that check_past_interval_bounds() will
3469     // not complain.
3470     auto pib = past_intervals.get_bounds();
3471     if (info.history.last_epoch_clean < pib.first) {
3472       psdout(10) << __func__ << " last_epoch_clean "
3473                  << info.history.last_epoch_clean << " < past_interval start "
3474                  << pib.first << ", adjusting start backwards" << dendl;
3475       past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
3476     }
3477
3478     // Similarly, if the same_interval_since value is later than
3479     // last_epoch_clean, the next interval change will result in a
3480     // past_interval start that is later than last_epoch_clean.  This
3481     // can happen if we use the pg_history values from the merge
3482     // source.  Adjust the same_interval_since value backwards if that
3483     // happens.  (We trust the les and lec values more because they came from
3484     // the real target, whereas the history value we stole from the source.)
3485     if (info.history.last_epoch_started < info.history.same_interval_since) {
3486       psdout(10) << __func__ << " last_epoch_started "
3487                  << info.history.last_epoch_started << " < same_interval_since "
3488                  << info.history.same_interval_since
3489                  << ", adjusting pg_history backwards" << dendl;
3490       info.history.same_interval_since = info.history.last_epoch_clean;
3491       // make sure same_{up,primary}_since are <= same_interval_since
3492       info.history.same_up_since = std::min(
3493         info.history.same_up_since, info.history.same_interval_since);
3494       info.history.same_primary_since = std::min(
3495         info.history.same_primary_since, info.history.same_interval_since);
3496     }
3497   }
3498
3499   dirty_info = true;
3500   dirty_big_info = true;
3501 }
3502
3503 void PeeringState::start_split_stats(
3504   const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
3505 {
3506   out->resize(childpgs.size() + 1);
3507   info.stats.stats.sum.split(*out);
3508 }
3509
3510 void PeeringState::finish_split_stats(
3511   const object_stat_sum_t& stats, ObjectStore::Transaction &t)
3512 {
3513   info.stats.stats.sum = stats;
3514   write_if_dirty(t);
3515 }
3516
3517 void PeeringState::update_blocked_by()
3518 {
3519   // set a max on the number of blocking peers we report. if we go
3520   // over, report a random subset.  keep the result sorted.
3521   unsigned keep = std::min<unsigned>(
3522     blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3523   unsigned skip = blocked_by.size() - keep;
3524   info.stats.blocked_by.clear();
3525   info.stats.blocked_by.resize(keep);
3526   unsigned pos = 0;
3527   for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
3528     if (skip > 0 && (rand() % (skip + keep) < skip)) {
3529       --skip;
3530     } else {
3531       info.stats.blocked_by[pos++] = *p;
3532       --keep;
3533     }
3534   }
3535 }
3536
3537 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3538 {
3539     for (auto&p : pgs)
3540       if (p.shard == shard)
3541         return true;
3542     return false;
3543 }
3544
3545 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3546 {
3547     for (auto&p : pgs) {
3548       if (p == skip)
3549         continue;
3550       if (p.shard == shard)
3551         return p;
3552     }
3553     return pg_shard_t();
3554 }
3555
3556 void PeeringState::update_calc_stats()
3557 {
3558   info.stats.version = info.last_update;
3559   info.stats.created = info.history.epoch_created;
3560   info.stats.last_scrub = info.history.last_scrub;
3561   info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3562   info.stats.last_deep_scrub = info.history.last_deep_scrub;
3563   info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3564   info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3565   info.stats.last_epoch_clean = info.history.last_epoch_clean;
3566
3567   info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3568   info.stats.ondisk_log_size = info.stats.log_size;
3569   info.stats.log_start = pg_log.get_tail();
3570   info.stats.ondisk_log_start = pg_log.get_tail();
3571   info.stats.snaptrimq_len = pl->get_snap_trimq_size();
3572
3573   unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3574
3575   // In rare case that upset is too large (usually transient), use as target
3576   // for calculations below.
3577   unsigned target = std::max(num_shards, (unsigned)upset.size());
3578   // For undersized actingset may be larger with OSDs out
3579   unsigned nrep = std::max(actingset.size(), upset.size());
3580   // calc num_object_copies
3581   info.stats.stats.calc_copies(std::max(target, nrep));
3582   info.stats.stats.sum.num_objects_degraded = 0;
3583   info.stats.stats.sum.num_objects_unfound = 0;
3584   info.stats.stats.sum.num_objects_misplaced = 0;
3585   info.stats.avail_no_missing.clear();
3586   info.stats.object_location_counts.clear();
3587
3588   // We should never hit this condition, but if end up hitting it,
3589   // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3590   if (info.stats.stats.sum.num_objects < 0) {
3591     psdout(0) << __func__ << " negative num_objects = "
3592               << info.stats.stats.sum.num_objects << " setting it to 0 "
3593               << dendl;
3594     info.stats.stats.sum.num_objects = 0;
3595     state_set(PG_STATE_INCONSISTENT);
3596   }
3597
3598   if ((is_remapped() || is_undersized() || !is_clean()) &&
3599       (is_peered()|| is_activating())) {
3600     psdout(20) << __func__ << " actingset " << actingset << " upset "
3601                << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3602
3603     ceph_assert(!acting_recovery_backfill.empty());
3604
3605     bool estimate = false;
3606
3607     // NOTE: we only generate degraded, misplaced and unfound
3608     // values for the summation, not individual stat categories.
3609     int64_t num_objects = info.stats.stats.sum.num_objects;
3610
3611     // Objects missing from up nodes, sorted by # objects.
3612     boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3613     // Objects missing from nodes not in up, sort by # objects
3614     boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3615
3616     // Fill missing_target_objects/acting_source_objects
3617
3618     {
3619       int64_t missing;
3620
3621       // Primary first
3622       missing = pg_log.get_missing().num_missing();
3623       ceph_assert(acting_recovery_backfill.count(pg_whoami));
3624       if (upset.count(pg_whoami)) {
3625         missing_target_objects.emplace(missing, pg_whoami);
3626       } else {
3627         acting_source_objects.emplace(missing, pg_whoami);
3628       }
3629       info.stats.stats.sum.num_objects_missing_on_primary = missing;
3630       if (missing == 0)
3631         info.stats.avail_no_missing.push_back(pg_whoami);
3632       psdout(20) << __func__ << " shard " << pg_whoami
3633                  << " primary objects " << num_objects
3634                  << " missing " << missing
3635                  << dendl;
3636     }
3637
3638     // All other peers
3639     for (auto& peer : peer_info) {
3640       // Primary should not be in the peer_info, skip if it is.
3641       if (peer.first == pg_whoami) continue;
3642       int64_t missing = 0;
3643       int64_t peer_num_objects =
3644         std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
3645       // Backfill targets always track num_objects accurately
3646       // all other peers track missing accurately.
3647       if (is_backfill_target(peer.first)) {
3648         missing = std::max((int64_t)0, num_objects - peer_num_objects);
3649       } else {
3650         if (peer_missing.count(peer.first)) {
3651           missing = peer_missing[peer.first].num_missing();
3652         } else {
3653           psdout(20) << __func__ << " no peer_missing found for "
3654                      << peer.first << dendl;
3655           if (is_recovering()) {
3656             estimate = true;
3657           }
3658           missing = std::max((int64_t)0, num_objects - peer_num_objects);
3659         }
3660       }
3661       if (upset.count(peer.first)) {
3662         missing_target_objects.emplace(missing, peer.first);
3663       } else if (actingset.count(peer.first)) {
3664         acting_source_objects.emplace(missing, peer.first);
3665       }
3666       peer.second.stats.stats.sum.num_objects_missing = missing;
3667       if (missing == 0)
3668         info.stats.avail_no_missing.push_back(peer.first);
3669       psdout(20) << __func__ << " shard " << peer.first
3670                  << " objects " << peer_num_objects
3671                  << " missing " << missing
3672                  << dendl;
3673     }
3674
3675     // Compute object_location_counts
3676     for (auto& ml: missing_loc.get_missing_locs()) {
3677       info.stats.object_location_counts[ml.second]++;
3678       psdout(30) << __func__ << " " << ml.first << " object_location_counts["
3679                  << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3680                  << dendl;
3681     }
3682     int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3683     if (not_missing) {
3684         // During recovery we know upset == actingset and is being populated
3685         // During backfill we know that all non-missing objects are in the actingset
3686         info.stats.object_location_counts[actingset] = not_missing;
3687     }
3688     psdout(30) << __func__ << " object_location_counts["
3689                << upset << "]=" << info.stats.object_location_counts[upset]
3690                << dendl;
3691     psdout(20) << __func__ << " object_location_counts "
3692                << info.stats.object_location_counts << dendl;
3693
3694     // A misplaced object is not stored on the correct OSD
3695     int64_t misplaced = 0;
3696     // a degraded objects has fewer replicas or EC shards than the pool specifies.
3697     int64_t degraded = 0;
3698
3699     if (is_recovering()) {
3700       for (auto& sml: missing_loc.get_missing_by_count()) {
3701         for (auto& ml: sml.second) {
3702           int missing_shards;
3703           if (sml.first == shard_id_t::NO_SHARD) {
3704             psdout(20) << __func__ << " ml " << ml.second
3705                        << " upset size " << upset.size()
3706                        << " up " << ml.first.up << dendl;
3707             missing_shards = (int)upset.size() - ml.first.up;
3708           } else {
3709             // Handle shards not even in upset below
3710             if (!find_shard(upset, sml.first))
3711               continue;
3712             missing_shards = std::max(0, 1 - ml.first.up);
3713             psdout(20) << __func__
3714                        << " shard " << sml.first
3715                        << " ml " << ml.second
3716                        << " missing shards " << missing_shards << dendl;
3717           }
3718           int odegraded = ml.second * missing_shards;
3719           // Copies on other osds but limited to the possible degraded
3720           int more_osds = std::min(missing_shards, ml.first.other);
3721           int omisplaced = ml.second * more_osds;
3722           ceph_assert(omisplaced <= odegraded);
3723           odegraded -= omisplaced;
3724
3725           misplaced += omisplaced;
3726           degraded += odegraded;
3727         }
3728       }
3729
3730       psdout(20) << __func__ << " missing based degraded "
3731                  << degraded << dendl;
3732       psdout(20) << __func__ << " missing based misplaced "
3733                  << misplaced << dendl;
3734
3735       // Handle undersized case
3736       if (pool.info.is_replicated()) {
3737         // Add degraded for missing targets (num_objects missing)
3738         ceph_assert(target >= upset.size());
3739         unsigned needed = target - upset.size();
3740         degraded += num_objects * needed;
3741       } else {
3742         for (unsigned i = 0 ; i < num_shards; ++i) {
3743           shard_id_t shard(i);
3744
3745           if (!find_shard(upset, shard)) {
3746             pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3747
3748             if (pgs != pg_shard_t()) {
3749               int64_t missing;
3750
3751               if (pgs == pg_whoami)
3752                 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3753               else
3754                 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3755
3756               degraded += missing;
3757               misplaced += std::max((int64_t)0, num_objects - missing);
3758             } else {
3759               // No shard anywhere
3760               degraded += num_objects;
3761             }
3762           }
3763         }
3764       }
3765       goto out;
3766     }
3767
3768     // Handle undersized case
3769     if (pool.info.is_replicated()) {
3770       // Add to missing_target_objects
3771       ceph_assert(target >= missing_target_objects.size());
3772       unsigned needed = target - missing_target_objects.size();
3773       if (needed)
3774         missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
3775     } else {
3776       for (unsigned i = 0 ; i < num_shards; ++i) {
3777         shard_id_t shard(i);
3778         bool found = false;
3779         for (const auto& t : missing_target_objects) {
3780           if (std::get<1>(t).shard == shard) {
3781             found = true;
3782             break;
3783           }
3784         }
3785         if (!found)
3786           missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
3787       }
3788     }
3789
3790     for (const auto& item : missing_target_objects)
3791       psdout(20) << __func__ << " missing shard " << std::get<1>(item)
3792                  << " missing= " << std::get<0>(item) << dendl;
3793     for (const auto& item : acting_source_objects)
3794       psdout(20) << __func__ << " acting shard " << std::get<1>(item)
3795                  << " missing= " << std::get<0>(item) << dendl;
3796
3797     // Handle all objects not in missing for remapped
3798     // or backfill
3799     for (auto m = missing_target_objects.rbegin();
3800         m != missing_target_objects.rend(); ++m) {
3801
3802       int64_t extra_missing = -1;
3803
3804       if (pool.info.is_replicated()) {
3805         if (!acting_source_objects.empty()) {
3806           auto extra_copy = acting_source_objects.begin();
3807           extra_missing = std::get<0>(*extra_copy);
3808           acting_source_objects.erase(extra_copy);
3809         }
3810       } else {  // Erasure coded
3811         // Use corresponding shard
3812         for (const auto& a : acting_source_objects) {
3813           if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3814             extra_missing = std::get<0>(a);
3815             acting_source_objects.erase(a);
3816             break;
3817           }
3818         }
3819       }
3820
3821       if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3822         // We don't know which of the objects on the target
3823         // are part of extra_missing so assume are all degraded.
3824         misplaced += std::get<0>(*m) - extra_missing;
3825         degraded += extra_missing;
3826       } else {
3827         // 1. extra_missing == -1, more targets than sources so degraded
3828         // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
3829         //    previously degraded are now present on the target.
3830         degraded += std::get<0>(*m);
3831       }
3832     }
3833     // If there are still acting that haven't been accounted for
3834     // then they are misplaced
3835     for (const auto& a : acting_source_objects) {
3836       int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3837       psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
3838                  << dendl;
3839       misplaced += extra_misplaced;
3840     }
3841 out:
3842     // NOTE: Tests use these messages to verify this code
3843     psdout(20) << __func__ << " degraded " << degraded
3844                << (estimate ? " (est)": "") << dendl;
3845     psdout(20) << __func__ << " misplaced " << misplaced
3846                << (estimate ? " (est)": "")<< dendl;
3847
3848     info.stats.stats.sum.num_objects_degraded = degraded;
3849     info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3850     info.stats.stats.sum.num_objects_misplaced = misplaced;
3851   }
3852 }
3853
3854 std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
3855   bool pg_stats_publish_valid,
3856   const pg_stat_t &pg_stats_publish,
3857   const object_stat_collection_t &unstable_stats)
3858 {
3859   if (info.stats.stats.sum.num_scrub_errors) {
3860     state_set(PG_STATE_INCONSISTENT);
3861   } else {
3862     state_clear(PG_STATE_INCONSISTENT);
3863     state_clear(PG_STATE_FAILED_REPAIR);
3864   }
3865
3866   utime_t now = ceph_clock_now();
3867   if (info.stats.state != state) {
3868     info.stats.last_change = now;
3869     // Optimistic estimation, if we just find out an inactive PG,
3870     // assumt it is active till now.
3871     if (!(state & PG_STATE_ACTIVE) &&
3872         (info.stats.state & PG_STATE_ACTIVE))
3873       info.stats.last_active = now;
3874
3875     if ((state & PG_STATE_ACTIVE) &&
3876         !(info.stats.state & PG_STATE_ACTIVE))
3877       info.stats.last_became_active = now;
3878     if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3879         !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3880       info.stats.last_became_peered = now;
3881     info.stats.state = state;
3882   }
3883
3884   update_calc_stats();
3885   if (info.stats.stats.sum.num_objects_degraded) {
3886     state_set(PG_STATE_DEGRADED);
3887   } else {
3888     state_clear(PG_STATE_DEGRADED);
3889   }
3890   update_blocked_by();
3891
3892   pg_stat_t pre_publish = info.stats;
3893   pre_publish.stats.add(unstable_stats);
3894   utime_t cutoff = now;
3895   cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3896
3897   // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3898   // because we don't want to make the pg_stat_t structures too expensive.
3899   unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3900   unsigned num = 0;
3901   auto i = info.purged_snaps.begin();
3902   while (num < max && i != info.purged_snaps.end()) {
3903     pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3904     ++num;
3905     ++i;
3906   }
3907   psdout(20) << __func__ << " reporting purged_snaps "
3908              << pre_publish.purged_snaps << dendl;
3909
3910   if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3911       info.stats.last_fresh > cutoff) {
3912     psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3913                << ": no change since " << info.stats.last_fresh << dendl;
3914     return std::nullopt;
3915   } else {
3916     // update our stat summary and timestamps
3917     info.stats.reported_epoch = get_osdmap_epoch();
3918     ++info.stats.reported_seq;
3919
3920     info.stats.last_fresh = now;
3921
3922     if (info.stats.state & PG_STATE_CLEAN)
3923       info.stats.last_clean = now;
3924     if (info.stats.state & PG_STATE_ACTIVE)
3925       info.stats.last_active = now;
3926     if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3927       info.stats.last_peered = now;
3928     info.stats.last_unstale = now;
3929     if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3930       info.stats.last_undegraded = now;
3931     if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3932       info.stats.last_fullsized = now;
3933
3934     psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3935                << ":" << pg_stats_publish.reported_seq << dendl;
3936     return std::make_optional(std::move(pre_publish));
3937   }
3938 }
3939
3940 void PeeringState::init(
3941   int role,
3942   const vector<int>& newup, int new_up_primary,
3943   const vector<int>& newacting, int new_acting_primary,
3944   const pg_history_t& history,
3945   const PastIntervals& pi,
3946   bool backfill,
3947   ObjectStore::Transaction &t)
3948 {
3949   psdout(10) << "init role " << role << " up "
3950              << newup << " acting " << newacting
3951              << " history " << history
3952              << " past_intervals " << pi
3953              << dendl;
3954
3955   set_role(role);
3956   init_primary_up_acting(
3957     newup,
3958     newacting,
3959     new_up_primary,
3960     new_acting_primary);
3961
3962   info.history = history;
3963   past_intervals = pi;
3964
3965   info.stats.up = up;
3966   info.stats.up_primary = new_up_primary;
3967   info.stats.acting = acting;
3968   info.stats.acting_primary = new_acting_primary;
3969   info.stats.mapping_epoch = info.history.same_interval_since;
3970
3971   if (!perform_deletes_during_peering()) {
3972     pg_log.set_missing_may_contain_deletes();
3973   }
3974
3975   if (backfill) {
3976     psdout(10) << __func__ << ": Setting backfill" << dendl;
3977     info.set_last_backfill(hobject_t());
3978     info.last_complete = info.last_update;
3979     pg_log.mark_log_for_rewrite();
3980   }
3981
3982   on_new_interval();
3983
3984   dirty_info = true;
3985   dirty_big_info = true;
3986   write_if_dirty(t);
3987 }
3988
3989 void PeeringState::dump_peering_state(Formatter *f)
3990 {
3991   f->dump_string("state", get_pg_state_string());
3992   f->dump_unsigned("epoch", get_osdmap_epoch());
3993   f->open_array_section("up");
3994   for (auto p = up.begin(); p != up.end(); ++p)
3995     f->dump_unsigned("osd", *p);
3996   f->close_section();
3997   f->open_array_section("acting");
3998   for (auto p = acting.begin(); p != acting.end(); ++p)
3999     f->dump_unsigned("osd", *p);
4000   f->close_section();
4001   if (!backfill_targets.empty()) {
4002     f->open_array_section("backfill_targets");
4003     for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
4004       f->dump_stream("shard") << *p;
4005     f->close_section();
4006   }
4007   if (!async_recovery_targets.empty()) {
4008     f->open_array_section("async_recovery_targets");
4009     for (auto p = async_recovery_targets.begin();
4010          p != async_recovery_targets.end();
4011          ++p)
4012       f->dump_stream("shard") << *p;
4013     f->close_section();
4014   }
4015   if (!acting_recovery_backfill.empty()) {
4016     f->open_array_section("acting_recovery_backfill");
4017     for (auto p = acting_recovery_backfill.begin();
4018          p != acting_recovery_backfill.end();
4019          ++p)
4020       f->dump_stream("shard") << *p;
4021     f->close_section();
4022   }
4023   f->open_object_section("info");
4024   update_calc_stats();
4025   info.dump(f);
4026   f->close_section();
4027
4028   f->open_array_section("peer_info");
4029   for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
4030     f->open_object_section("info");
4031     f->dump_stream("peer") << p->first;
4032     p->second.dump(f);
4033     f->close_section();
4034   }
4035   f->close_section();
4036 }
4037
4038 void PeeringState::update_stats(
4039   std::function<bool(pg_history_t &, pg_stat_t &)> f,
4040   ObjectStore::Transaction *t) {
4041   if (f(info.history, info.stats)) {
4042     pl->publish_stats_to_osd();
4043   }
4044   pl->on_info_history_change();
4045
4046   if (t) {
4047     dirty_info = true;
4048     write_if_dirty(*t);
4049   }
4050 }
4051
4052 bool PeeringState::append_log_entries_update_missing(
4053   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4054   ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
4055   std::optional<eversion_t> roll_forward_to)
4056 {
4057   ceph_assert(!entries.empty());
4058   ceph_assert(entries.begin()->version > info.last_update);
4059
4060   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
4061   bool invalidate_stats =
4062     pg_log.append_new_log_entries(
4063       info.last_backfill,
4064       entries,
4065       rollbacker.get());
4066
4067   if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
4068     pg_log.roll_forward(rollbacker.get());
4069   }
4070   if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
4071     pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
4072     last_rollback_info_trimmed_to_applied = *roll_forward_to;
4073   }
4074
4075   info.last_update = pg_log.get_head();
4076
4077   if (pg_log.get_missing().num_missing() == 0) {
4078     // advance last_complete since nothing else is missing!
4079     info.last_complete = info.last_update;
4080   }
4081   info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
4082
4083   psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
4084              << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
4085   if (trim_to)
4086     pg_log.trim(*trim_to, info);
4087   dirty_info = true;
4088   write_if_dirty(t);
4089   return invalidate_stats;
4090 }
4091
4092 void PeeringState::merge_new_log_entries(
4093   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4094   ObjectStore::Transaction &t,
4095   std::optional<eversion_t> trim_to,
4096   std::optional<eversion_t> roll_forward_to)
4097 {
4098   psdout(10) << __func__ << " " << entries << dendl;
4099   ceph_assert(is_primary());
4100
4101   bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
4102   for (auto i = acting_recovery_backfill.begin();
4103        i != acting_recovery_backfill.end();
4104        ++i) {
4105     pg_shard_t peer(*i);
4106     if (peer == pg_whoami) continue;
4107     ceph_assert(peer_missing.count(peer));
4108     ceph_assert(peer_info.count(peer));
4109     pg_missing_t& pmissing(peer_missing[peer]);
4110     psdout(20) << __func__ << " peer_missing for " << peer
4111                << " = " << pmissing << dendl;
4112     pg_info_t& pinfo(peer_info[peer]);
4113     bool invalidate_stats = PGLog::append_log_entries_update_missing(
4114       pinfo.last_backfill,
4115       entries,
4116       true,
4117       NULL,
4118       pmissing,
4119       NULL,
4120       dpp);
4121     pinfo.last_update = info.last_update;
4122     pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
4123     rebuild_missing = rebuild_missing || invalidate_stats;
4124   }
4125
4126   if (!rebuild_missing) {
4127     return;
4128   }
4129
4130   for (auto &&i: entries) {
4131     missing_loc.rebuild(
4132       i.soid,
4133       pg_whoami,
4134       acting_recovery_backfill,
4135       info,
4136       pg_log.get_missing(),
4137       peer_missing,
4138       peer_info);
4139   }
4140 }
4141
4142 void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
4143 {
4144   // raise last_complete only if we were previously up to date
4145   if (info.last_complete == info.last_update)
4146     info.last_complete = e.version;
4147
4148   // raise last_update.
4149   ceph_assert(e.version > info.last_update);
4150   info.last_update = e.version;
4151
4152   // raise user_version, if it increased (it may have not get bumped
4153   // by all logged updates)
4154   if (e.user_version > info.last_user_version)
4155     info.last_user_version = e.user_version;
4156
4157   // log mutation
4158   pg_log.add(e, applied);
4159   psdout(10) << "add_log_entry " << e << dendl;
4160 }
4161
4162
4163 void PeeringState::append_log(
4164   vector<pg_log_entry_t>&& logv,
4165   eversion_t trim_to,
4166   eversion_t roll_forward_to,
4167   eversion_t mlcod,
4168   ObjectStore::Transaction &t,
4169   bool transaction_applied,
4170   bool async)
4171 {
4172   /* The primary has sent an info updating the history, but it may not
4173    * have arrived yet.  We want to make sure that we cannot remember this
4174    * write without remembering that it happened in an interval which went
4175    * active in epoch history.last_epoch_started.
4176    */
4177   if (info.last_epoch_started != info.history.last_epoch_started) {
4178     info.history.last_epoch_started = info.last_epoch_started;
4179   }
4180   if (info.last_interval_started != info.history.last_interval_started) {
4181     info.history.last_interval_started = info.last_interval_started;
4182   }
4183   psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
4184
4185   PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4186   if (!transaction_applied) {
4187      /* We must be a backfill or async recovery peer, so it's ok if we apply
4188       * out-of-turn since we won't be considered when
4189       * determining a min possible last_update.
4190       *
4191       * We skip_rollforward() here, which advances the crt, without
4192       * doing an actual rollforward. This avoids cleaning up entries
4193       * from the backend and we do not end up in a situation, where the
4194       * object is deleted before we can _merge_object_divergent_entries().
4195       */
4196     pg_log.skip_rollforward();
4197   }
4198
4199   for (auto p = logv.begin(); p != logv.end(); ++p) {
4200     add_log_entry(*p, transaction_applied);
4201
4202     /* We don't want to leave the rollforward artifacts around
4203      * here past last_backfill.  It's ok for the same reason as
4204      * above */
4205     if (transaction_applied &&
4206         p->soid > info.last_backfill) {
4207       pg_log.roll_forward(handler.get());
4208     }
4209   }
4210   if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4211     pg_log.roll_forward_to(
4212       roll_forward_to,
4213       handler.get());
4214     last_rollback_info_trimmed_to_applied = roll_forward_to;
4215   }
4216
4217   psdout(10) << __func__ << " approx pg log length =  "
4218              << pg_log.get_log().approx_size() << dendl;
4219   psdout(10) << __func__ << " transaction_applied = "
4220              << transaction_applied << dendl;
4221   if (!transaction_applied || async)
4222     psdout(10) << __func__ << " " << pg_whoami
4223                << " is async_recovery or backfill target" << dendl;
4224   pg_log.trim(trim_to, info, transaction_applied, async);
4225
4226   // update the local pg, pg log
4227   dirty_info = true;
4228   write_if_dirty(t);
4229
4230   if (!is_primary())
4231     min_last_complete_ondisk = mlcod;
4232 }
4233
4234 void PeeringState::recover_got(
4235   const hobject_t &oid, eversion_t v,
4236   bool is_delete,
4237   ObjectStore::Transaction &t)
4238 {
4239   if (v > pg_log.get_can_rollback_to()) {
4240     /* This can only happen during a repair, and even then, it would
4241      * be one heck of a race.  If we are repairing the object, the
4242      * write in question must be fully committed, so it's not valid
4243      * to roll it back anyway (and we'll be rolled forward shortly
4244      * anyway) */
4245     PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
4246     pg_log.roll_forward_to(v, handler.get());
4247   }
4248
4249   psdout(10) << "got missing " << oid << " v " << v << dendl;
4250   pg_log.recover_got(oid, v, info);
4251   if (pg_log.get_log().log.empty()) {
4252     psdout(10) << "last_complete now " << info.last_complete
4253                << " while log is empty" << dendl;
4254   } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
4255     psdout(10) << "last_complete now " << info.last_complete
4256                << " log.complete_to " << pg_log.get_log().complete_to->version
4257                << dendl;
4258   } else {
4259     psdout(10) << "last_complete now " << info.last_complete
4260                << " log.complete_to at end" << dendl;
4261     //below is not true in the repair case.
4262     //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
4263     ceph_assert(info.last_complete == info.last_update);
4264   }
4265
4266   if (is_primary()) {
4267     ceph_assert(missing_loc.needs_recovery(oid));
4268     if (!is_delete)
4269       missing_loc.add_location(oid, pg_whoami);
4270   }
4271
4272   // update pg
4273   dirty_info = true;
4274   write_if_dirty(t);
4275 }
4276
4277 void PeeringState::update_backfill_progress(
4278   const hobject_t &updated_backfill,
4279   const pg_stat_t &updated_stats,
4280   bool preserve_local_num_bytes,
4281   ObjectStore::Transaction &t) {
4282   info.set_last_backfill(updated_backfill);
4283   if (preserve_local_num_bytes) {
4284     psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
4285                << " local " << info.stats.stats.sum.num_bytes << dendl;
4286     int64_t bytes = info.stats.stats.sum.num_bytes;
4287     info.stats = updated_stats;
4288     info.stats.stats.sum.num_bytes = bytes;
4289   } else {
4290     psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
4291                << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
4292     info.stats = updated_stats;
4293   }
4294
4295   dirty_info = true;
4296   write_if_dirty(t);
4297 }
4298
4299 void PeeringState::adjust_purged_snaps(
4300   std::function<void(interval_set<snapid_t> &snaps)> f) {
4301   f(info.purged_snaps);
4302   dirty_info = true;
4303   dirty_big_info = true;
4304 }
4305
4306 void PeeringState::on_peer_recover(
4307   pg_shard_t peer,
4308   const hobject_t &soid,
4309   const eversion_t &version)
4310 {
4311   pl->publish_stats_to_osd();
4312   // done!
4313   peer_missing[peer].got(soid, version);
4314   missing_loc.add_location(soid, peer);
4315 }
4316
4317 void PeeringState::begin_peer_recover(
4318   pg_shard_t peer,
4319   const hobject_t soid)
4320 {
4321   peer_missing[peer].revise_have(soid, eversion_t());
4322 }
4323
4324 void PeeringState::force_object_missing(
4325   const set<pg_shard_t> &peers,
4326   const hobject_t &soid,
4327   eversion_t version)
4328 {
4329   for (auto &&peer : peers) {
4330     if (peer != primary) {
4331       peer_missing[peer].add(soid, version, eversion_t(), false);
4332     } else {
4333       pg_log.missing_add(soid, version, eversion_t());
4334       pg_log.reset_complete_to(&info);
4335       pg_log.set_last_requested(0);
4336     }
4337   }
4338
4339   missing_loc.rebuild(
4340     soid,
4341     pg_whoami,
4342     acting_recovery_backfill,
4343     info,
4344     pg_log.get_missing(),
4345     peer_missing,
4346     peer_info);
4347 }
4348
4349 void PeeringState::pre_submit_op(
4350   const hobject_t &hoid,
4351   const vector<pg_log_entry_t>& logv,
4352   eversion_t at_version)
4353 {
4354   if (at_version > eversion_t()) {
4355     for (auto &&i : get_acting_recovery_backfill()) {
4356       if (i == primary) continue;
4357       pg_info_t &pinfo = peer_info[i];
4358       // keep peer_info up to date
4359       if (pinfo.last_complete == pinfo.last_update)
4360         pinfo.last_complete = at_version;
4361       pinfo.last_update = at_version;
4362     }
4363   }
4364
4365   bool requires_missing_loc = false;
4366   for (auto &&i : get_async_recovery_targets()) {
4367     if (i == primary || !get_peer_missing(i).is_missing(hoid))
4368       continue;
4369     requires_missing_loc = true;
4370     for (auto &&entry: logv) {
4371       peer_missing[i].add_next_event(entry);
4372     }
4373   }
4374
4375   if (requires_missing_loc) {
4376     for (auto &&entry: logv) {
4377       psdout(30) << __func__ << " missing_loc before: "
4378                  << missing_loc.get_locations(entry.soid) << dendl;
4379       missing_loc.add_missing(entry.soid, entry.version,
4380                               eversion_t(), entry.is_delete());
4381       // clear out missing_loc
4382       missing_loc.clear_location(entry.soid);
4383       for (auto &i: get_actingset()) {
4384         if (!get_peer_missing(i).is_missing(entry.soid))
4385           missing_loc.add_location(entry.soid, i);
4386       }
4387       psdout(30) << __func__ << " missing_loc after: "
4388                  << missing_loc.get_locations(entry.soid) << dendl;
4389     }
4390   }
4391 }
4392
4393 void PeeringState::recovery_committed_to(eversion_t version)
4394 {
4395   psdout(10) << __func__ << " version " << version
4396              << " now ondisk" << dendl;
4397   last_complete_ondisk = version;
4398
4399   if (last_complete_ondisk == info.last_update) {
4400     if (!is_primary()) {
4401       // Either we are a replica or backfill target.
4402       // we are fully up to date.  tell the primary!
4403       pl->send_cluster_message(
4404         get_primary().osd,
4405         make_message<MOSDPGTrim>(
4406           get_osdmap_epoch(),
4407           spg_t(info.pgid.pgid, primary.shard),
4408           last_complete_ondisk),
4409         get_osdmap_epoch());
4410     } else {
4411       calc_min_last_complete_ondisk();
4412     }
4413   }
4414 }
4415
4416 void PeeringState::complete_write(eversion_t v, eversion_t lc)
4417 {
4418   last_update_ondisk = v;
4419   last_complete_ondisk = lc;
4420   calc_min_last_complete_ondisk();
4421 }
4422
4423 void PeeringState::calc_trim_to()
4424 {
4425   size_t target = pl->get_target_pg_log_entries();
4426
4427   eversion_t limit = std::min(
4428     min_last_complete_ondisk,
4429     pg_log.get_can_rollback_to());
4430   if (limit != eversion_t() &&
4431       limit != pg_trim_to &&
4432       pg_log.get_log().approx_size() > target) {
4433     size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
4434                              cct->_conf->osd_pg_log_trim_max);
4435     if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4436         cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4437       return;
4438     }
4439     auto it = pg_log.get_log().log.begin();
4440     eversion_t new_trim_to;
4441     for (size_t i = 0; i < num_to_trim; ++i) {
4442       new_trim_to = it->version;
4443       ++it;
4444       if (new_trim_to > limit) {
4445         new_trim_to = limit;
4446         psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
4447         break;
4448       }
4449     }
4450     psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
4451     pg_trim_to = new_trim_to;
4452     assert(pg_trim_to <= pg_log.get_head());
4453     assert(pg_trim_to <= min_last_complete_ondisk);
4454   }
4455 }
4456
4457 void PeeringState::calc_trim_to_aggressive()
4458 {
4459   size_t target = pl->get_target_pg_log_entries();
4460
4461   // limit pg log trimming up to the can_rollback_to value
4462   eversion_t limit = std::min({
4463     pg_log.get_head(),
4464     pg_log.get_can_rollback_to(),
4465     last_update_ondisk});
4466   psdout(10) << __func__ << " limit = " << limit << dendl;
4467
4468   if (limit != eversion_t() &&
4469       limit != pg_trim_to &&
4470       pg_log.get_log().approx_size() > target) {
4471     psdout(10) << __func__ << " approx pg log length =  "
4472              << pg_log.get_log().approx_size() << dendl;
4473     uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
4474                                               cct->_conf->osd_pg_log_trim_max);
4475     psdout(10) << __func__ << " num_to_trim =  " << num_to_trim << dendl;
4476     if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
4477         cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
4478       return;
4479     }
4480     auto it = pg_log.get_log().log.begin(); // oldest log entry
4481     auto rit = pg_log.get_log().log.rbegin();
4482     eversion_t by_n_to_keep; // start from tail
4483     eversion_t by_n_to_trim = eversion_t::max(); // start from head
4484     for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
4485       i++;
4486       if (i > target && by_n_to_keep == eversion_t()) {
4487         by_n_to_keep = rit->version;
4488       }
4489       if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
4490         by_n_to_trim = it->version;
4491       }
4492       if (by_n_to_keep != eversion_t() &&
4493           by_n_to_trim != eversion_t::max()) {
4494         break;
4495       }
4496     }
4497
4498     if (by_n_to_keep == eversion_t()) {
4499       return;
4500     }
4501
4502     pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
4503     psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
4504     ceph_assert(pg_trim_to <= pg_log.get_head());
4505   }
4506 }
4507
4508 void PeeringState::apply_op_stats(
4509   const hobject_t &soid,
4510   const object_stat_sum_t &delta_stats)
4511 {
4512   info.stats.stats.add(delta_stats);
4513   info.stats.stats.floor(0);
4514
4515   for (auto i = get_backfill_targets().begin();
4516        i != get_backfill_targets().end();
4517        ++i) {
4518     pg_shard_t bt = *i;
4519     pg_info_t& pinfo = peer_info[bt];
4520     if (soid <= pinfo.last_backfill)
4521       pinfo.stats.stats.add(delta_stats);
4522   }
4523 }
4524
4525 void PeeringState::update_complete_backfill_object_stats(
4526   const hobject_t &hoid,
4527   const pg_stat_t &stats)
4528 {
4529   for (auto &&bt: get_backfill_targets()) {
4530     pg_info_t& pinfo = peer_info[bt];
4531     //Add stats to all peers that were missing object
4532     if (hoid > pinfo.last_backfill)
4533       pinfo.stats.add(stats);
4534   }
4535 }
4536
4537 void PeeringState::update_peer_last_backfill(
4538   pg_shard_t peer,
4539   const hobject_t &new_last_backfill)
4540 {
4541   pg_info_t &pinfo = peer_info[peer];
4542   pinfo.last_backfill = new_last_backfill;
4543   if (new_last_backfill.is_max()) {
4544     /* pinfo.stats might be wrong if we did log-based recovery on the
4545      * backfilled portion in addition to continuing backfill.
4546      */
4547     pinfo.stats = info.stats;
4548   }
4549 }
4550
4551 void PeeringState::set_revert_with_targets(
4552   const hobject_t &soid,
4553   const set<pg_shard_t> &good_peers)
4554 {
4555   for (auto &&peer: good_peers) {
4556     missing_loc.add_location(soid, peer);
4557   }
4558 }
4559
4560 void PeeringState::prepare_backfill_for_missing(
4561   const hobject_t &soid,
4562   const eversion_t &version,
4563   const vector<pg_shard_t> &targets) {
4564   for (auto &&peer: targets) {
4565     peer_missing[peer].add(soid, version, eversion_t(), false);
4566   }
4567 }
4568
/// Replace the hit set history stored in the pg info with @p hset_history.
void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
{
  info.hit_set = hset_history;
}
4573
4574 /*------------ Peering State Machine----------------*/
// From here on the logging macros are re-pointed at the state machine:
// dout_prefix emits the PeeringMachine's dpp prefix followed by
// "state<NAME>: " for the current state, and psdout() logs through the
// PeeringMachine's cct.
#undef dout_prefix
#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
                    << "state<" << get_state_name() << ">: ")
#undef psdout
#define psdout(x) ldout(context< PeeringMachine >().cct, x)

// Declares `ps` (the PeeringMachine's state pointer) and `pl` (its
// PeeringListener pointer) as locals.  Each is also assigned to
// std::ignore, presumably so that a function using only one of them does
// not trigger an unused-variable warning.
#define DECLARE_LOCALS                                  \
  PeeringState *ps = context< PeeringMachine >().state; \
  std::ignore = ps;                                     \
  PeeringListener *pl = context< PeeringMachine >().pl; \
  std::ignore = pl
4586
4587
4588 /*------Crashed-------*/
4589 PeeringState::Crashed::Crashed(my_context ctx)
4590   : my_base(ctx),
4591     NamedState(context< PeeringMachine >().state_history, "Crashed")
4592 {
4593   context< PeeringMachine >().log_enter(state_name);
4594   ceph_abort_msg("we got a bad state machine event");
4595 }
4596
4597
4598 /*------Initial-------*/
4599 PeeringState::Initial::Initial(my_context ctx)
4600   : my_base(ctx),
4601     NamedState(context< PeeringMachine >().state_history, "Initial")
4602 {
4603   context< PeeringMachine >().log_enter(state_name);
4604 }
4605
4606 boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
4607 {
4608   DECLARE_LOCALS;
4609   ps->proc_replica_info(
4610     notify.from, notify.notify.info, notify.notify.epoch_sent);
4611   ps->set_last_peering_reset();
4612   return transit< Primary >();
4613 }
4614
// An info message received while Initial: we must not be the primary.
// The event is re-posted so it is delivered again after the transition
// to Stray.
boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  post_event(i);
  return transit< Stray >();
}
4622
// A log message received while Initial: we must not be the primary.
// The event is re-posted so it is delivered again after the transition
// to Stray.
boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
{
  DECLARE_LOCALS;
  ceph_assert(!ps->is_primary());
  post_event(i);
  return transit< Stray >();
}
4630
4631 void PeeringState::Initial::exit()
4632 {
4633   context< PeeringMachine >().log_exit(state_name, enter_time);
4634   DECLARE_LOCALS;
4635   utime_t dur = ceph_clock_now() - enter_time;
4636   pl->get_peering_perf().tinc(rs_initial_latency, dur);
4637 }
4638
4639 /*------Started-------*/
4640 PeeringState::Started::Started(my_context ctx)
4641   : my_base(ctx),
4642     NamedState(context< PeeringMachine >().state_history, "Started")
4643 {
4644   context< PeeringMachine >().log_enter(state_name);
4645 }
4646
4647 boost::statechart::result
4648 PeeringState::Started::react(const IntervalFlush&)
4649 {
4650   psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4651   context< PeeringMachine >().state->end_block_outgoing();
4652   return discard_event();
4653 }
4654
4655 boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
4656 {
4657   DECLARE_LOCALS;
4658   psdout(10) << "Started advmap" << dendl;
4659   ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4660   if (ps->should_restart_peering(
4661         advmap.up_primary,
4662         advmap.acting_primary,
4663         advmap.newup,
4664         advmap.newacting,
4665         advmap.lastmap,
4666         advmap.osdmap)) {
4667     psdout(10) << "should_restart_peering, transitioning to Reset"
4668                        << dendl;
4669     post_event(advmap);
4670     return transit< Reset >();
4671   }
4672   ps->remove_down_peer_info(advmap.osdmap);
4673   return discard_event();
4674 }
4675
4676 boost::statechart::result PeeringState::Started::react(const QueryState& q)
4677 {
4678   q.f->open_object_section("state");
4679   q.f->dump_string("name", state_name);
4680   q.f->dump_stream("enter_time") << enter_time;
4681   q.f->close_section();
4682   return discard_event();
4683 }
4684
4685 boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
4686 {
4687   q.f->dump_string("state", "Started");
4688   q.f->dump_bool("available_might_have_unfound", false);
4689   return discard_event();
4690 }
4691
4692 void PeeringState::Started::exit()
4693 {
4694   context< PeeringMachine >().log_exit(state_name, enter_time);
4695   DECLARE_LOCALS;
4696   utime_t dur = ceph_clock_now() - enter_time;
4697   pl->get_peering_perf().tinc(rs_started_latency, dur);
4698   ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
4699 }
4700
4701 /*--------Reset---------*/
4702 PeeringState::Reset::Reset(my_context ctx)
4703   : my_base(ctx),
4704     NamedState(context< PeeringMachine >().state_history, "Reset")
4705 {
4706   context< PeeringMachine >().log_enter(state_name);
4707   DECLARE_LOCALS;
4708
4709   ps->flushes_in_progress = 0;
4710   ps->set_last_peering_reset();
4711   ps->log_weirdness();
4712 }
4713
4714 boost::statechart::result
4715 PeeringState::Reset::react(const IntervalFlush&)
4716 {
4717   psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
4718   context< PeeringMachine >().state->end_block_outgoing();
4719   return discard_event();
4720 }
4721
4722 boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
4723 {
4724   DECLARE_LOCALS;
4725   psdout(10) << "Reset advmap" << dendl;
4726
4727   ps->check_full_transition(advmap.lastmap, advmap.osdmap);
4728
4729   if (ps->should_restart_peering(
4730         advmap.up_primary,
4731         advmap.acting_primary,
4732         advmap.newup,
4733         advmap.newacting,
4734         advmap.lastmap,
4735         advmap.osdmap)) {
4736     psdout(10) << "should restart peering, calling start_peering_interval again"
4737                        << dendl;
4738     ps->start_peering_interval(
4739       advmap.lastmap,
4740       advmap.newup, advmap.up_primary,
4741       advmap.newacting, advmap.acting_primary,
4742       context< PeeringMachine >().get_cur_transaction());
4743   }
4744   ps->remove_down_peer_info(advmap.osdmap);
4745   ps->check_past_interval_bounds();
4746   return discard_event();
4747 }
4748
4749 boost::statechart::result PeeringState::Reset::react(const ActMap&)
4750 {
4751   DECLARE_LOCALS;
4752   if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
4753     ps->info.history.refresh_prior_readable_until_ub(
4754       pl->get_mnow(),
4755       ps->prior_readable_until_ub);
4756     context< PeeringMachine >().send_notify(
4757       ps->get_primary().osd,
4758       pg_notify_t(
4759         ps->get_primary().shard, ps->pg_whoami.shard,
4760         ps->get_osdmap_epoch(),
4761         ps->get_osdmap_epoch(),
4762         ps->info,
4763         ps->past_intervals));
4764   }
4765
4766   ps->update_heartbeat_peers();
4767
4768   return transit< Started >();
4769 }
4770
4771 boost::statechart::result PeeringState::Reset::react(const QueryState& q)
4772 {
4773   q.f->open_object_section("state");
4774   q.f->dump_string("name", state_name);
4775   q.f->dump_stream("enter_time") << enter_time;
4776   q.f->close_section();
4777   return discard_event();
4778 }
4779
4780 boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
4781 {
4782   q.f->dump_string("state", "Reset");
4783   q.f->dump_bool("available_might_have_unfound", false);
4784   return discard_event();
4785 }
4786
4787 void PeeringState::Reset::exit()
4788 {
4789   context< PeeringMachine >().log_exit(state_name, enter_time);
4790   DECLARE_LOCALS;
4791   utime_t dur = ceph_clock_now() - enter_time;
4792   pl->get_peering_perf().tinc(rs_reset_latency, dur);
4793 }
4794
4795 /*-------Start---------*/
4796 PeeringState::Start::Start(my_context ctx)
4797   : my_base(ctx),
4798     NamedState(context< PeeringMachine >().state_history, "Start")
4799 {
4800   context< PeeringMachine >().log_enter(state_name);
4801
4802   DECLARE_LOCALS;
4803   if (ps->is_primary()) {
4804     psdout(1) << "transitioning to Primary" << dendl;
4805     post_event(MakePrimary());
4806   } else { //is_stray
4807     psdout(1) << "transitioning to Stray" << dendl;
4808     post_event(MakeStray());
4809   }
4810 }
4811
4812 void PeeringState::Start::exit()
4813 {
4814   context< PeeringMachine >().log_exit(state_name, enter_time);
4815   DECLARE_LOCALS;
4816   utime_t dur = ceph_clock_now() - enter_time;
4817   pl->get_peering_perf().tinc(rs_start_latency, dur);
4818 }
4819
4820 /*---------Primary--------*/
4821 PeeringState::Primary::Primary(my_context ctx)
4822   : my_base(ctx),
4823     NamedState(context< PeeringMachine >().state_history, "Started/Primary")
4824 {
4825   context< PeeringMachine >().log_enter(state_name);
4826   DECLARE_LOCALS;
4827   ceph_assert(ps->want_acting.empty());
4828
4829   // set CREATING bit until we have peered for the first time.
4830   if (ps->info.history.last_epoch_started == 0) {
4831     ps->state_set(PG_STATE_CREATING);
4832     // use the history timestamp, which ultimately comes from the
4833     // monitor in the create case.
4834     utime_t t = ps->info.history.last_scrub_stamp;
4835     ps->info.stats.last_fresh = t;
4836     ps->info.stats.last_active = t;
4837     ps->info.stats.last_change = t;
4838     ps->info.stats.last_peered = t;
4839     ps->info.stats.last_clean = t;
4840     ps->info.stats.last_unstale = t;
4841     ps->info.stats.last_undegraded = t;
4842     ps->info.stats.last_fullsized = t;
4843     ps->info.stats.last_scrub_stamp = t;
4844     ps->info.stats.last_deep_scrub_stamp = t;
4845     ps->info.stats.last_clean_scrub_stamp = t;
4846   }
4847 }
4848
4849 boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
4850 {
4851   DECLARE_LOCALS;
4852   psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
4853   ps->proc_replica_info(
4854     notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
4855   return discard_event();
4856 }
4857
4858 boost::statechart::result PeeringState::Primary::react(const ActMap&)
4859 {
4860   DECLARE_LOCALS;
4861   psdout(7) << "handle ActMap primary" << dendl;
4862   pl->publish_stats_to_osd();
4863   return discard_event();
4864 }
4865
4866 boost::statechart::result PeeringState::Primary::react(
4867   const SetForceRecovery&)
4868 {
4869   DECLARE_LOCALS;
4870   ps->set_force_recovery(true);
4871   return discard_event();
4872 }
4873
4874 boost::statechart::result PeeringState::Primary::react(
4875   const UnsetForceRecovery&)
4876 {
4877   DECLARE_LOCALS;
4878   ps->set_force_recovery(false);
4879   return discard_event();
4880 }
4881
4882 boost::statechart::result PeeringState::Primary::react(
4883   const RequestScrub& evt)
4884 {
4885   DECLARE_LOCALS;
4886   if (ps->is_primary()) {
4887     pl->scrub_requested(evt.deep, evt.repair);
4888     psdout(10) << "marking for scrub" << dendl;
4889   }
4890   return discard_event();
4891 }
4892
4893 boost::statechart::result PeeringState::Primary::react(
4894   const SetForceBackfill&)
4895 {
4896   DECLARE_LOCALS;
4897   ps->set_force_backfill(true);
4898   return discard_event();
4899 }
4900
4901 boost::statechart::result PeeringState::Primary::react(
4902   const UnsetForceBackfill&)
4903 {
4904   DECLARE_LOCALS;
4905   ps->set_force_backfill(false);
4906   return discard_event();
4907 }
4908
4909 void PeeringState::Primary::exit()
4910 {
4911   context< PeeringMachine >().log_exit(state_name, enter_time);
4912   DECLARE_LOCALS;
4913   ps->want_acting.clear();
4914   utime_t dur = ceph_clock_now() - enter_time;
4915   pl->get_peering_perf().tinc(rs_primary_latency, dur);
4916   pl->clear_primary_state();
4917   ps->state_clear(PG_STATE_CREATING);
4918 }
4919
4920 /*---------Peering--------*/
4921 PeeringState::Peering::Peering(my_context ctx)
4922   : my_base(ctx),
4923     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
4924     history_les_bound(false)
4925 {
4926   context< PeeringMachine >().log_enter(state_name);
4927   DECLARE_LOCALS;
4928
4929   ceph_assert(!ps->is_peered());
4930   ceph_assert(!ps->is_peering());
4931   ceph_assert(ps->is_primary());
4932   ps->state_set(PG_STATE_PEERING);
4933 }
4934
4935 boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
4936 {
4937   DECLARE_LOCALS;
4938   psdout(10) << "Peering advmap" << dendl;
4939   if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
4940     psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
4941     post_event(advmap);
4942     return transit< Reset >();
4943   }
4944
4945   ps->adjust_need_up_thru(advmap.osdmap);
4946   ps->check_prior_readable_down_osds(advmap.osdmap);
4947
4948   return forward_event();
4949 }
4950
4951 boost::statechart::result PeeringState::Peering::react(const QueryState& q)
4952 {
4953   DECLARE_LOCALS;
4954
4955   q.f->open_object_section("state");
4956   q.f->dump_string("name", state_name);
4957   q.f->dump_stream("enter_time") << enter_time;
4958
4959   q.f->open_array_section("past_intervals");
4960   ps->past_intervals.dump(q.f);
4961   q.f->close_section();
4962
4963   q.f->open_array_section("probing_osds");
4964   for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
4965     q.f->dump_stream("osd") << *p;
4966   q.f->close_section();
4967
4968   if (prior_set.pg_down)
4969     q.f->dump_string("blocked", "peering is blocked due to down osds");
4970
4971   q.f->open_array_section("down_osds_we_would_probe");
4972   for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
4973     q.f->dump_int("osd", *p);
4974   q.f->close_section();
4975
4976   q.f->open_array_section("peering_blocked_by");
4977   for (auto p = prior_set.blocked_by.begin();
4978        p != prior_set.blocked_by.end();
4979        ++p) {
4980     q.f->open_object_section("osd");
4981     q.f->dump_int("osd", p->first);
4982     q.f->dump_int("current_lost_at", p->second);
4983     q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
4984     q.f->close_section();
4985   }
4986   q.f->close_section();
4987
4988   if (history_les_bound) {
4989     q.f->open_array_section("peering_blocked_by_detail");
4990     q.f->open_object_section("item");
4991     q.f->dump_string("detail","peering_blocked_by_history_les_bound");
4992     q.f->close_section();
4993     q.f->close_section();
4994   }
4995
4996   q.f->close_section();
4997   return forward_event();
4998 }
4999
5000 boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
5001 {
5002   q.f->dump_string("state", "Peering");
5003   q.f->dump_bool("available_might_have_unfound", false);
5004   return discard_event();
5005 }
5006
5007 void PeeringState::Peering::exit()
5008 {
5009
5010   DECLARE_LOCALS;
5011   psdout(10) << "Leaving Peering" << dendl;
5012   context< PeeringMachine >().log_exit(state_name, enter_time);
5013   ps->state_clear(PG_STATE_PEERING);
5014   pl->clear_probe_targets();
5015
5016   utime_t dur = ceph_clock_now() - enter_time;
5017   pl->get_peering_perf().tinc(rs_peering_latency, dur);
5018 }
5019
5020
5021 /*------Backfilling-------*/
5022 PeeringState::Backfilling::Backfilling(my_context ctx)
5023   : my_base(ctx),
5024     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
5025 {
5026   context< PeeringMachine >().log_enter(state_name);
5027
5028
5029   DECLARE_LOCALS;
5030   ps->backfill_reserved = true;
5031   pl->on_backfill_reserved();
5032   ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
5033   ps->state_clear(PG_STATE_BACKFILL_WAIT);
5034   ps->state_set(PG_STATE_BACKFILLING);
5035   pl->publish_stats_to_osd();
5036 }
5037
5038 void PeeringState::Backfilling::backfill_release_reservations()
5039 {
5040   DECLARE_LOCALS;
5041   pl->cancel_local_background_io_reservation();
5042   for (auto it = ps->backfill_targets.begin();
5043        it != ps->backfill_targets.end();
5044        ++it) {
5045     ceph_assert(*it != ps->pg_whoami);
5046     pl->send_cluster_message(
5047       it->osd,
5048       make_message<MBackfillReserve>(
5049         MBackfillReserve::RELEASE,
5050         spg_t(ps->info.pgid.pgid, it->shard),
5051         ps->get_osdmap_epoch()),
5052       ps->get_osdmap_epoch());
5053   }
5054 }
5055
5056 void PeeringState::Backfilling::cancel_backfill()
5057 {
5058   DECLARE_LOCALS;
5059   backfill_release_reservations();
5060   pl->on_backfill_canceled();
5061 }
5062
5063 boost::statechart::result
5064 PeeringState::Backfilling::react(const Backfilled &c)
5065 {
5066   backfill_release_reservations();
5067   return transit<Recovered>();
5068 }
5069
5070 boost::statechart::result
5071 PeeringState::Backfilling::react(const DeferBackfill &c)
5072 {
5073   DECLARE_LOCALS;
5074
5075   psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
5076   ps->state_set(PG_STATE_BACKFILL_WAIT);
5077   ps->state_clear(PG_STATE_BACKFILLING);
5078   cancel_backfill();
5079
5080   pl->schedule_event_after(
5081     std::make_shared<PGPeeringEvent>(
5082       ps->get_osdmap_epoch(),
5083       ps->get_osdmap_epoch(),
5084       RequestBackfill()),
5085     c.delay);
5086   return transit<NotBackfilling>();
5087 }
5088
5089 boost::statechart::result
5090 PeeringState::Backfilling::react(const UnfoundBackfill &c)
5091 {
5092   DECLARE_LOCALS;
5093   psdout(10) << "backfill has unfound, can't continue" << dendl;
5094   ps->state_set(PG_STATE_BACKFILL_UNFOUND);
5095   ps->state_clear(PG_STATE_BACKFILLING);
5096   cancel_backfill();
5097   return transit<NotBackfilling>();
5098 }
5099
5100 boost::statechart::result
5101 PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
5102 {
5103   DECLARE_LOCALS;
5104
5105   ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5106   ps->state_clear(PG_STATE_BACKFILLING);
5107   cancel_backfill();
5108
5109   pl->schedule_event_after(
5110     std::make_shared<PGPeeringEvent>(
5111       ps->get_osdmap_epoch(),
5112       ps->get_osdmap_epoch(),
5113       RequestBackfill()),
5114     ps->cct->_conf->osd_backfill_retry_interval);
5115
5116   return transit<NotBackfilling>();
5117 }
5118
5119 boost::statechart::result
5120 PeeringState::Backfilling::react(const RemoteReservationRevoked &)
5121 {
5122   DECLARE_LOCALS;
5123   ps->state_set(PG_STATE_BACKFILL_WAIT);
5124   cancel_backfill();
5125   if (ps->needs_backfill()) {
5126     return transit<WaitLocalBackfillReserved>();
5127   } else {
5128     // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
5129     return discard_event();
5130   }
5131 }
5132
5133 void PeeringState::Backfilling::exit()
5134 {
5135   context< PeeringMachine >().log_exit(state_name, enter_time);
5136   DECLARE_LOCALS;
5137   ps->backfill_reserved = false;
5138   ps->state_clear(PG_STATE_BACKFILLING);
5139   ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5140   utime_t dur = ceph_clock_now() - enter_time;
5141   pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
5142 }
5143
5144 /*--WaitRemoteBackfillReserved--*/
5145
5146 PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
5147   : my_base(ctx),
5148     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
5149     backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
5150 {
5151   context< PeeringMachine >().log_enter(state_name);
5152   DECLARE_LOCALS;
5153
5154   ps->state_set(PG_STATE_BACKFILL_WAIT);
5155   pl->publish_stats_to_osd();
5156   post_event(RemoteBackfillReserved());
5157 }
5158
5159 boost::statechart::result
5160 PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
5161 {
5162   DECLARE_LOCALS;
5163
5164   int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
5165   psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
5166   if (backfill_osd_it !=
5167       context< Active >().remote_shards_to_reserve_backfill.end()) {
5168     // The primary never backfills itself
5169     ceph_assert(*backfill_osd_it != ps->pg_whoami);
5170     pl->send_cluster_message(
5171       backfill_osd_it->osd,
5172       make_message<MBackfillReserve>(
5173         MBackfillReserve::REQUEST,
5174         spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
5175         ps->get_osdmap_epoch(),
5176         ps->get_backfill_priority(),
5177         num_bytes,
5178         ps->peer_bytes[*backfill_osd_it]),
5179       ps->get_osdmap_epoch());
5180     ++backfill_osd_it;
5181   } else {
5182     ps->peer_bytes.clear();
5183     post_event(AllBackfillsReserved());
5184   }
5185   return discard_event();
5186 }
5187
5188 void PeeringState::WaitRemoteBackfillReserved::exit()
5189 {
5190   context< PeeringMachine >().log_exit(state_name, enter_time);
5191   DECLARE_LOCALS;
5192
5193   utime_t dur = ceph_clock_now() - enter_time;
5194   pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
5195 }
5196
5197 void PeeringState::WaitRemoteBackfillReserved::retry()
5198 {
5199   DECLARE_LOCALS;
5200   pl->cancel_local_background_io_reservation();
5201
5202   // Send CANCEL to all previously acquired reservations
5203   set<pg_shard_t>::const_iterator it, begin, end;
5204   begin = context< Active >().remote_shards_to_reserve_backfill.begin();
5205   end = context< Active >().remote_shards_to_reserve_backfill.end();
5206   ceph_assert(begin != end);
5207   for (it = begin; it != backfill_osd_it; ++it) {
5208     // The primary never backfills itself
5209     ceph_assert(*it != ps->pg_whoami);
5210     pl->send_cluster_message(
5211       it->osd,
5212       make_message<MBackfillReserve>(
5213         MBackfillReserve::RELEASE,
5214         spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
5215         ps->get_osdmap_epoch()),
5216       ps->get_osdmap_epoch());
5217   }
5218
5219   ps->state_clear(PG_STATE_BACKFILL_WAIT);
5220   pl->publish_stats_to_osd();
5221
5222   pl->schedule_event_after(
5223     std::make_shared<PGPeeringEvent>(
5224       ps->get_osdmap_epoch(),
5225       ps->get_osdmap_epoch(),
5226       RequestBackfill()),
5227     ps->cct->_conf->osd_backfill_retry_interval);
5228 }
5229
5230 boost::statechart::result
5231 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
5232 {
5233   DECLARE_LOCALS;
5234   ps->state_set(PG_STATE_BACKFILL_TOOFULL);
5235   retry();
5236   return transit<NotBackfilling>();
5237 }
5238
5239 boost::statechart::result
5240 PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
5241 {
5242   retry();
5243   return transit<NotBackfilling>();
5244 }
5245
5246 /*--WaitLocalBackfillReserved--*/
5247 PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
5248   : my_base(ctx),
5249     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
5250 {
5251   context< PeeringMachine >().log_enter(state_name);
5252   DECLARE_LOCALS;
5253
5254   ps->state_set(PG_STATE_BACKFILL_WAIT);
5255   pl->request_local_background_io_reservation(
5256     ps->get_backfill_priority(),
5257     std::make_unique<PGPeeringEvent>(
5258       ps->get_osdmap_epoch(),
5259       ps->get_osdmap_epoch(),
5260       LocalBackfillReserved()),
5261     std::make_unique<PGPeeringEvent>(
5262       ps->get_osdmap_epoch(),
5263       ps->get_osdmap_epoch(),
5264       DeferBackfill(0.0)));
5265   pl->publish_stats_to_osd();
5266 }
5267
5268 void PeeringState::WaitLocalBackfillReserved::exit()
5269 {
5270   context< PeeringMachine >().log_exit(state_name, enter_time);
5271   DECLARE_LOCALS;
5272   utime_t dur = ceph_clock_now() - enter_time;
5273   pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
5274 }
5275
5276 /*----NotBackfilling------*/
5277 PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
5278   : my_base(ctx),
5279     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
5280 {
5281   context< PeeringMachine >().log_enter(state_name);
5282   DECLARE_LOCALS;
5283   ps->state_clear(PG_STATE_REPAIR);
5284   pl->publish_stats_to_osd();
5285 }
5286
5287 boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
5288 {
5289   DECLARE_LOCALS;
5290
5291   ps->query_unfound(q.f, "NotBackfilling");
5292   return discard_event();
5293 }
5294
5295 boost::statechart::result
5296 PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
5297 {
5298   return discard_event();
5299 }
5300
5301 boost::statechart::result
5302 PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
5303 {
5304   return discard_event();
5305 }
5306
5307 void PeeringState::NotBackfilling::exit()
5308 {
5309   context< PeeringMachine >().log_exit(state_name, enter_time);
5310
5311   DECLARE_LOCALS;
5312   ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
5313   utime_t dur = ceph_clock_now() - enter_time;
5314   pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
5315 }
5316
5317 /*----NotRecovering------*/
5318 PeeringState::NotRecovering::NotRecovering(my_context ctx)
5319   : my_base(ctx),
5320     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
5321 {
5322   context< PeeringMachine >().log_enter(state_name);
5323   DECLARE_LOCALS;
5324   ps->state_clear(PG_STATE_REPAIR);
5325   pl->publish_stats_to_osd();
5326 }
5327
5328 boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
5329 {
5330   DECLARE_LOCALS;
5331
5332   ps->query_unfound(q.f, "NotRecovering");
5333   return discard_event();
5334 }
5335
5336 void PeeringState::NotRecovering::exit()
5337 {
5338   context< PeeringMachine >().log_exit(state_name, enter_time);
5339
5340   DECLARE_LOCALS;
5341   ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
5342   utime_t dur = ceph_clock_now() - enter_time;
5343   pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
5344 }
5345
5346 /*---RepNotRecovering----*/
5347 PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
5348   : my_base(ctx),
5349     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
5350 {
5351   context< PeeringMachine >().log_enter(state_name);
5352 }
5353
5354 boost::statechart::result
5355 PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
5356 {
5357   DECLARE_LOCALS;
5358   ps->reject_reservation();
5359   post_event(RemoteReservationRejectedTooFull());
5360   return discard_event();
5361 }
5362
5363 void PeeringState::RepNotRecovering::exit()
5364 {
5365   context< PeeringMachine >().log_exit(state_name, enter_time);
5366   DECLARE_LOCALS;
5367   utime_t dur = ceph_clock_now() - enter_time;
5368   pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
5369 }
5370
5371 /*---RepWaitRecoveryReserved--*/
5372 PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
5373   : my_base(ctx),
5374     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
5375 {
5376   context< PeeringMachine >().log_enter(state_name);
5377 }
5378
5379 boost::statechart::result
5380 PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
5381 {
5382   DECLARE_LOCALS;
5383   pl->send_cluster_message(
5384     ps->primary.osd,
5385     make_message<MRecoveryReserve>(
5386       MRecoveryReserve::GRANT,
5387       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5388       ps->get_osdmap_epoch()),
5389     ps->get_osdmap_epoch());
5390   return transit<RepRecovering>();
5391 }
5392
5393 boost::statechart::result
5394 PeeringState::RepWaitRecoveryReserved::react(
5395   const RemoteReservationCanceled &evt)
5396 {
5397   DECLARE_LOCALS;
5398   pl->unreserve_recovery_space();
5399
5400   pl->cancel_remote_recovery_reservation();
5401   return transit<RepNotRecovering>();
5402 }
5403
5404 void PeeringState::RepWaitRecoveryReserved::exit()
5405 {
5406   context< PeeringMachine >().log_exit(state_name, enter_time);
5407   DECLARE_LOCALS;
5408   utime_t dur = ceph_clock_now() - enter_time;
5409   pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
5410 }
5411
5412 /*-RepWaitBackfillReserved*/
5413 PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
5414   : my_base(ctx),
5415     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
5416 {
5417   context< PeeringMachine >().log_enter(state_name);
5418 }
5419
5420 boost::statechart::result
5421 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
5422 {
5423
5424   DECLARE_LOCALS;
5425
5426   if (!pl->try_reserve_recovery_space(
5427         evt.primary_num_bytes, evt.local_num_bytes)) {
5428     post_event(RejectTooFullRemoteReservation());
5429   } else {
5430     PGPeeringEventURef preempt;
5431     if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5432       // older peers will interpret preemption as TOOFULL
5433       preempt = std::make_unique<PGPeeringEvent>(
5434         pl->get_osdmap_epoch(),
5435         pl->get_osdmap_epoch(),
5436         RemoteBackfillPreempted());
5437     }
5438     pl->request_remote_recovery_reservation(
5439       evt.priority,
5440       std::make_unique<PGPeeringEvent>(
5441         pl->get_osdmap_epoch(),
5442         pl->get_osdmap_epoch(),
5443         RemoteBackfillReserved()),
5444       std::move(preempt));
5445   }
5446   return transit<RepWaitBackfillReserved>();
5447 }
5448
5449 boost::statechart::result
5450 PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
5451 {
5452   DECLARE_LOCALS;
5453
5454   // fall back to a local reckoning of priority of primary doesn't pass one
5455   // (pre-mimic compat)
5456   int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
5457
5458   PGPeeringEventURef preempt;
5459   if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
5460     // older peers can't handle this
5461     preempt = std::make_unique<PGPeeringEvent>(
5462       ps->get_osdmap_epoch(),
5463       ps->get_osdmap_epoch(),
5464       RemoteRecoveryPreempted());
5465   }
5466
5467   pl->request_remote_recovery_reservation(
5468     prio,
5469     std::make_unique<PGPeeringEvent>(
5470       ps->get_osdmap_epoch(),
5471       ps->get_osdmap_epoch(),
5472       RemoteRecoveryReserved()),
5473     std::move(preempt));
5474   return transit<RepWaitRecoveryReserved>();
5475 }
5476
5477 void PeeringState::RepWaitBackfillReserved::exit()
5478 {
5479   context< PeeringMachine >().log_exit(state_name, enter_time);
5480   DECLARE_LOCALS;
5481   utime_t dur = ceph_clock_now() - enter_time;
5482   pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
5483 }
5484
5485 boost::statechart::result
5486 PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
5487 {
5488   DECLARE_LOCALS;
5489
5490
5491   pl->send_cluster_message(
5492       ps->primary.osd,
5493       make_message<MBackfillReserve>(
5494         MBackfillReserve::GRANT,
5495         spg_t(ps->info.pgid.pgid, ps->primary.shard),
5496         ps->get_osdmap_epoch()),
5497       ps->get_osdmap_epoch());
5498   return transit<RepRecovering>();
5499 }
5500
5501 boost::statechart::result
5502 PeeringState::RepWaitBackfillReserved::react(
5503   const RejectTooFullRemoteReservation &evt)
5504 {
5505   DECLARE_LOCALS;
5506   ps->reject_reservation();
5507   post_event(RemoteReservationRejectedTooFull());
5508   return discard_event();
5509 }
5510
5511 boost::statechart::result
5512 PeeringState::RepWaitBackfillReserved::react(
5513   const RemoteReservationRejectedTooFull &evt)
5514 {
5515   DECLARE_LOCALS;
5516   pl->unreserve_recovery_space();
5517
5518   pl->cancel_remote_recovery_reservation();
5519   return transit<RepNotRecovering>();
5520 }
5521
5522 boost::statechart::result
5523 PeeringState::RepWaitBackfillReserved::react(
5524   const RemoteReservationCanceled &evt)
5525 {
5526   DECLARE_LOCALS;
5527   pl->unreserve_recovery_space();
5528
5529   pl->cancel_remote_recovery_reservation();
5530   return transit<RepNotRecovering>();
5531 }
5532
5533 /*---RepRecovering-------*/
5534 PeeringState::RepRecovering::RepRecovering(my_context ctx)
5535   : my_base(ctx),
5536     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
5537 {
5538   context< PeeringMachine >().log_enter(state_name);
5539 }
5540
5541 boost::statechart::result
5542 PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
5543 {
5544   DECLARE_LOCALS;
5545
5546
5547   pl->unreserve_recovery_space();
5548   pl->send_cluster_message(
5549     ps->primary.osd,
5550     make_message<MRecoveryReserve>(
5551       MRecoveryReserve::REVOKE,
5552       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5553       ps->get_osdmap_epoch()),
5554     ps->get_osdmap_epoch());
5555   return discard_event();
5556 }
5557
5558 boost::statechart::result
5559 PeeringState::RepRecovering::react(const BackfillTooFull &)
5560 {
5561   DECLARE_LOCALS;
5562
5563
5564   pl->unreserve_recovery_space();
5565   pl->send_cluster_message(
5566     ps->primary.osd,
5567     make_message<MBackfillReserve>(
5568       MBackfillReserve::REVOKE_TOOFULL,
5569       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5570       ps->get_osdmap_epoch()),
5571     ps->get_osdmap_epoch());
5572   return discard_event();
5573 }
5574
5575 boost::statechart::result
5576 PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
5577 {
5578   DECLARE_LOCALS;
5579
5580
5581   pl->unreserve_recovery_space();
5582   pl->send_cluster_message(
5583     ps->primary.osd,
5584     make_message<MBackfillReserve>(
5585       MBackfillReserve::REVOKE,
5586       spg_t(ps->info.pgid.pgid, ps->primary.shard),
5587       ps->get_osdmap_epoch()),
5588     ps->get_osdmap_epoch());
5589   return discard_event();
5590 }
5591
5592 void PeeringState::RepRecovering::exit()
5593 {
5594   context< PeeringMachine >().log_exit(state_name, enter_time);
5595   DECLARE_LOCALS;
5596   pl->unreserve_recovery_space();
5597
5598   pl->cancel_remote_recovery_reservation();
5599   utime_t dur = ceph_clock_now() - enter_time;
5600   pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
5601 }
5602
5603 /*------Activating--------*/
5604 PeeringState::Activating::Activating(my_context ctx)
5605   : my_base(ctx),
5606     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
5607 {
5608   context< PeeringMachine >().log_enter(state_name);
5609 }
5610
5611 void PeeringState::Activating::exit()
5612 {
5613   context< PeeringMachine >().log_exit(state_name, enter_time);
5614   DECLARE_LOCALS;
5615   utime_t dur = ceph_clock_now() - enter_time;
5616   pl->get_peering_perf().tinc(rs_activating_latency, dur);
5617 }
5618
5619 PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
5620   : my_base(ctx),
5621     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
5622 {
5623   context< PeeringMachine >().log_enter(state_name);
5624   DECLARE_LOCALS;
5625
5626   // Make sure all nodes that part of the recovery aren't full
5627   if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
5628       ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
5629     post_event(RecoveryTooFull());
5630     return;
5631   }
5632
5633   ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5634   ps->state_set(PG_STATE_RECOVERY_WAIT);
5635   pl->request_local_background_io_reservation(
5636     ps->get_recovery_priority(),
5637     std::make_unique<PGPeeringEvent>(
5638       ps->get_osdmap_epoch(),
5639       ps->get_osdmap_epoch(),
5640       LocalRecoveryReserved()),
5641     std::make_unique<PGPeeringEvent>(
5642       ps->get_osdmap_epoch(),
5643       ps->get_osdmap_epoch(),
5644       DeferRecovery(0.0)));
5645   pl->publish_stats_to_osd();
5646 }
5647
5648 boost::statechart::result
5649 PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
5650 {
5651   DECLARE_LOCALS;
5652   ps->state_set(PG_STATE_RECOVERY_TOOFULL);
5653   pl->schedule_event_after(
5654     std::make_shared<PGPeeringEvent>(
5655       ps->get_osdmap_epoch(),
5656       ps->get_osdmap_epoch(),
5657       DoRecovery()),
5658     ps->cct->_conf->osd_recovery_retry_interval);
5659   return transit<NotRecovering>();
5660 }
5661
5662 void PeeringState::WaitLocalRecoveryReserved::exit()
5663 {
5664   context< PeeringMachine >().log_exit(state_name, enter_time);
5665   DECLARE_LOCALS;
5666   utime_t dur = ceph_clock_now() - enter_time;
5667   pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
5668 }
5669
5670 PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
5671   : my_base(ctx),
5672     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
5673     remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
5674 {
5675   context< PeeringMachine >().log_enter(state_name);
5676   post_event(RemoteRecoveryReserved());
5677 }
5678
5679 boost::statechart::result
5680 PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
5681   DECLARE_LOCALS;
5682
5683   if (remote_recovery_reservation_it !=
5684       context< Active >().remote_shards_to_reserve_recovery.end()) {
5685     ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
5686     pl->send_cluster_message(
5687       remote_recovery_reservation_it->osd,
5688       make_message<MRecoveryReserve>(
5689         MRecoveryReserve::REQUEST,
5690         spg_t(context< PeeringMachine >().spgid.pgid,
5691               remote_recovery_reservation_it->shard),
5692         ps->get_osdmap_epoch(),
5693         ps->get_recovery_priority()),
5694       ps->get_osdmap_epoch());
5695     ++remote_recovery_reservation_it;
5696   } else {
5697     post_event(AllRemotesReserved());
5698   }
5699   return discard_event();
5700 }
5701
5702 void PeeringState::WaitRemoteRecoveryReserved::exit()
5703 {
5704   context< PeeringMachine >().log_exit(state_name, enter_time);
5705   DECLARE_LOCALS;
5706   utime_t dur = ceph_clock_now() - enter_time;
5707   pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
5708 }
5709
5710 PeeringState::Recovering::Recovering(my_context ctx)
5711   : my_base(ctx),
5712     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
5713 {
5714   context< PeeringMachine >().log_enter(state_name);
5715
5716   DECLARE_LOCALS;
5717   ps->state_clear(PG_STATE_RECOVERY_WAIT);
5718   ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
5719   ps->state_set(PG_STATE_RECOVERING);
5720   pl->on_recovery_reserved();
5721   ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
5722   pl->publish_stats_to_osd();
5723 }
5724
5725 void PeeringState::Recovering::release_reservations(bool cancel)
5726 {
5727   DECLARE_LOCALS;
5728   ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
5729
5730   // release remote reservations
5731   for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
5732        i != context< Active >().remote_shards_to_reserve_recovery.end();
5733        ++i) {
5734     if (*i == ps->pg_whoami) // skip myself
5735       continue;
5736     pl->send_cluster_message(
5737       i->osd,
5738       make_message<MRecoveryReserve>(
5739         MRecoveryReserve::RELEASE,
5740         spg_t(ps->info.pgid.pgid, i->shard),
5741         ps->get_osdmap_epoch()),
5742       ps->get_osdmap_epoch());
5743   }
5744 }
5745
5746 boost::statechart::result
5747 PeeringState::Recovering::react(const AllReplicasRecovered &evt)
5748 {
5749   DECLARE_LOCALS;
5750   ps->state_clear(PG_STATE_FORCED_RECOVERY);
5751   release_reservations();
5752   pl->cancel_local_background_io_reservation();
5753   return transit<Recovered>();
5754 }
5755
5756 boost::statechart::result
5757 PeeringState::Recovering::react(const RequestBackfill &evt)
5758 {
5759   DECLARE_LOCALS;
5760
5761   release_reservations();
5762
5763   ps->state_clear(PG_STATE_FORCED_RECOVERY);
5764   pl->cancel_local_background_io_reservation();
5765   pl->publish_stats_to_osd();
5766   // transit any async_recovery_targets back into acting
5767   // so pg won't have to stay undersized for long
5768   // as backfill might take a long time to complete..
5769   if (!ps->async_recovery_targets.empty()) {
5770     pg_shard_t auth_log_shard;
5771     bool history_les_bound = false;
5772     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5773     ps->choose_acting(auth_log_shard, true, &history_les_bound);
5774   }
5775   return transit<WaitLocalBackfillReserved>();
5776 }
5777
5778 boost::statechart::result
5779 PeeringState::Recovering::react(const DeferRecovery &evt)
5780 {
5781   DECLARE_LOCALS;
5782   if (!ps->state_test(PG_STATE_RECOVERING)) {
5783     // we may have finished recovery and have an AllReplicasRecovered
5784     // event queued to move us to the next state.
5785     psdout(10) << "got defer recovery but not recovering" << dendl;
5786     return discard_event();
5787   }
5788   psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
5789   ps->state_set(PG_STATE_RECOVERY_WAIT);
5790   pl->cancel_local_background_io_reservation();
5791   release_reservations(true);
5792   pl->schedule_event_after(
5793     std::make_shared<PGPeeringEvent>(
5794       ps->get_osdmap_epoch(),
5795       ps->get_osdmap_epoch(),
5796       DoRecovery()),
5797     evt.delay);
5798   return transit<NotRecovering>();
5799 }
5800
5801 boost::statechart::result
5802 PeeringState::Recovering::react(const UnfoundRecovery &evt)
5803 {
5804   DECLARE_LOCALS;
5805   psdout(10) << "recovery has unfound, can't continue" << dendl;
5806   ps->state_set(PG_STATE_RECOVERY_UNFOUND);
5807   pl->cancel_local_background_io_reservation();
5808   release_reservations(true);
5809   return transit<NotRecovering>();
5810 }
5811
5812 void PeeringState::Recovering::exit()
5813 {
5814   context< PeeringMachine >().log_exit(state_name, enter_time);
5815
5816   DECLARE_LOCALS;
5817   utime_t dur = ceph_clock_now() - enter_time;
5818   ps->state_clear(PG_STATE_RECOVERING);
5819   pl->get_peering_perf().tinc(rs_recovering_latency, dur);
5820 }
5821
5822 PeeringState::Recovered::Recovered(my_context ctx)
5823   : my_base(ctx),
5824     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
5825 {
5826   pg_shard_t auth_log_shard;
5827
5828   context< PeeringMachine >().log_enter(state_name);
5829
5830   DECLARE_LOCALS;
5831
5832   ceph_assert(!ps->needs_recovery());
5833
5834   // if we finished backfill, all acting are active; recheck if
5835   // DEGRADED | UNDERSIZED is appropriate.
5836   ceph_assert(!ps->acting_recovery_backfill.empty());
5837   if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
5838       ps->acting_recovery_backfill.size()) {
5839     ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
5840     pl->publish_stats_to_osd();
5841   }
5842
5843   // adjust acting set?  (e.g. because backfill completed...)
5844   bool history_les_bound = false;
5845   if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
5846                                                  true, &history_les_bound)) {
5847     ceph_assert(ps->want_acting.size());
5848   } else if (!ps->async_recovery_targets.empty()) {
5849     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
5850     ps->choose_acting(auth_log_shard, true, &history_les_bound);
5851   }
5852
5853   if (context< Active >().all_replicas_activated  &&
5854       ps->async_recovery_targets.empty())
5855     post_event(GoClean());
5856 }
5857
5858 void PeeringState::Recovered::exit()
5859 {
5860   context< PeeringMachine >().log_exit(state_name, enter_time);
5861   DECLARE_LOCALS;
5862
5863   utime_t dur = ceph_clock_now() - enter_time;
5864   pl->get_peering_perf().tinc(rs_recovered_latency, dur);
5865 }
5866
5867 PeeringState::Clean::Clean(my_context ctx)
5868   : my_base(ctx),
5869     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
5870 {
5871   context< PeeringMachine >().log_enter(state_name);
5872
5873   DECLARE_LOCALS;
5874
5875   if (ps->info.last_complete != ps->info.last_update) {
5876     ceph_abort();
5877   }
5878
5879
5880   ps->try_mark_clean();
5881
5882   context< PeeringMachine >().get_cur_transaction().register_on_commit(
5883     pl->on_clean());
5884 }
5885
5886 void PeeringState::Clean::exit()
5887 {
5888   context< PeeringMachine >().log_exit(state_name, enter_time);
5889
5890   DECLARE_LOCALS;
5891   ps->state_clear(PG_STATE_CLEAN);
5892   utime_t dur = ceph_clock_now() - enter_time;
5893   pl->get_peering_perf().tinc(rs_clean_latency, dur);
5894 }
5895
5896 template <typename T>
5897 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
5898 {
5899   set<int> osds_found;
5900   set<pg_shard_t> out;
5901   for (auto i = in.begin(); i != in.end(); ++i) {
5902     if (*i != skip && !osds_found.count(i->osd)) {
5903       osds_found.insert(i->osd);
5904       out.insert(*i);
5905     }
5906   }
5907   return out;
5908 }
5909
5910 /*---------Active---------*/
5911 PeeringState::Active::Active(my_context ctx)
5912   : my_base(ctx),
5913     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
5914     remote_shards_to_reserve_recovery(
5915       unique_osd_shard_set(
5916         context< PeeringMachine >().state->pg_whoami,
5917         context< PeeringMachine >().state->acting_recovery_backfill)),
5918     remote_shards_to_reserve_backfill(
5919       unique_osd_shard_set(
5920         context< PeeringMachine >().state->pg_whoami,
5921         context< PeeringMachine >().state->backfill_targets)),
5922     all_replicas_activated(false)
5923 {
5924   context< PeeringMachine >().log_enter(state_name);
5925
5926
5927   DECLARE_LOCALS;
5928
5929   ceph_assert(!ps->backfill_reserved);
5930   ceph_assert(ps->is_primary());
5931   psdout(10) << "In Active, about to call activate" << dendl;
5932   ps->start_flush(context< PeeringMachine >().get_cur_transaction());
5933   ps->activate(context< PeeringMachine >().get_cur_transaction(),
5934                ps->get_osdmap_epoch(),
5935                context< PeeringMachine >().get_recovery_ctx());
5936
5937   // everyone has to commit/ack before we are truly active
5938   ps->blocked_by.clear();
5939   for (auto p = ps->acting_recovery_backfill.begin();
5940        p != ps->acting_recovery_backfill.end();
5941        ++p) {
5942     if (p->shard != ps->pg_whoami.shard) {
5943       ps->blocked_by.insert(p->shard);
5944     }
5945   }
5946   pl->publish_stats_to_osd();
5947   psdout(10) << "Activate Finished" << dendl;
5948 }
5949
5950 boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
5951 {
5952   DECLARE_LOCALS;
5953
5954   if (ps->should_restart_peering(
5955         advmap.up_primary,
5956         advmap.acting_primary,
5957         advmap.newup,
5958         advmap.newacting,
5959         advmap.lastmap,
5960         advmap.osdmap)) {
5961     psdout(10) << "Active advmap interval change, fast return" << dendl;
5962     return forward_event();
5963   }
5964   psdout(10) << "Active advmap" << dendl;
5965   bool need_publish = false;
5966
5967   pl->on_active_advmap(advmap.osdmap);
5968   if (ps->dirty_big_info) {
5969     // share updated purged_snaps to mgr/mon so that we (a) stop reporting
5970     // purged snaps and (b) perhaps share more snaps that we have purged
5971     // but didn't fit in pg_stat_t.
5972     need_publish = true;
5973     ps->share_pg_info();
5974   }
5975
5976   bool need_acting_change = false;
5977   for (size_t i = 0; i < ps->want_acting.size(); i++) {
5978     int osd = ps->want_acting[i];
5979     if (!advmap.osdmap->is_up(osd)) {
5980       pg_shard_t osd_with_shard(osd, shard_id_t(i));
5981       if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
5982         psdout(10) << "Active stray osd." << osd << " in want_acting is down"
5983                    << dendl;
5984         need_acting_change = true;
5985       }
5986     }
5987   }
5988   if (need_acting_change) {
5989      psdout(10) << "Active need acting change, call choose_acting again"
5990                 << dendl;
5991     // possibly because we re-add some strays into the acting set and
5992     // some of them then go down in a subsequent map before we could see
5993     // the map changing the pg temp.
5994     // call choose_acting again to clear them out.
5995     // note that we leave restrict_to_up_acting to false in order to
5996     // not overkill any chosen stray that is still alive.
5997     pg_shard_t auth_log_shard;
5998     bool history_les_bound = false;
5999     ps->remove_down_peer_info(advmap.osdmap);
6000     ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
6001   }
6002
6003   /* Check for changes in pool size (if the acting set changed as a result,
6004    * this does not matter) */
6005   if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
6006       ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
6007     if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
6008         ps->actingset.size()) {
6009       ps->state_clear(PG_STATE_UNDERSIZED);
6010     } else {
6011       ps->state_set(PG_STATE_UNDERSIZED);
6012     }
6013     // degraded changes will be detected by call from publish_stats_to_osd()
6014     need_publish = true;
6015   }
6016
6017   // if we haven't reported our PG stats in a long time, do so now.
6018   if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
6019     psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
6020                        << " epochs" << dendl;
6021     need_publish = true;
6022   }
6023
6024   if (need_publish)
6025     pl->publish_stats_to_osd();
6026
6027   if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
6028     pl->recheck_readable();
6029   }
6030
6031   return forward_event();
6032 }
6033
6034 boost::statechart::result PeeringState::Active::react(const ActMap&)
6035 {
6036   DECLARE_LOCALS;
6037   psdout(10) << "Active: handling ActMap" << dendl;
6038   ceph_assert(ps->is_primary());
6039
6040   pl->on_active_actmap();
6041
6042   if (ps->have_unfound()) {
6043     // object may have become unfound
6044     ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
6045   }
6046
6047   uint64_t unfound = ps->missing_loc.num_unfound();
6048   if (unfound > 0 &&
6049       ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
6050     if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
6051       pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
6052                             << " objects unfound and apparently lost, would automatically "
6053                             << "mark these objects lost but this feature is not yet implemented "
6054                             << "(osd_auto_mark_unfound_lost)";
6055     } else
6056       pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
6057                              << unfound << " objects unfound and apparently lost";
6058   }
6059
6060   return forward_event();
6061 }
6062
6063 boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
6064 {
6065
6066   DECLARE_LOCALS;
6067   ceph_assert(ps->is_primary());
6068   if (ps->peer_info.count(notevt.from)) {
6069     psdout(10) << "Active: got notify from " << notevt.from
6070                        << ", already have info from that osd, ignoring"
6071                        << dendl;
6072   } else if (ps->peer_purged.count(notevt.from)) {
6073     psdout(10) << "Active: got notify from " << notevt.from
6074                        << ", already purged that peer, ignoring"
6075                        << dendl;
6076   } else {
6077     psdout(10) << "Active: got notify from " << notevt.from
6078                        << ", calling proc_replica_info and discover_all_missing"
6079                        << dendl;
6080     ps->proc_replica_info(
6081       notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6082     if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
6083       ps->discover_all_missing(
6084         context<PeeringMachine>().get_recovery_ctx().msgs);
6085     }
6086     // check if it is a previous down acting member that's coming back.
6087     // if so, request pg_temp change to trigger a new interval transition
6088     pg_shard_t auth_log_shard;
6089     bool history_les_bound = false;
6090     // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
6091     ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
6092     if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
6093       psdout(10) << "Active: got notify from previous acting member "
6094                  << notevt.from << ", requesting pg_temp change"
6095                  << dendl;
6096     }
6097   }
6098   return discard_event();
6099 }
6100
6101 boost::statechart::result PeeringState::Active::react(const MTrim& trim)
6102 {
6103   DECLARE_LOCALS;
6104   ceph_assert(ps->is_primary());
6105
6106   // peer is informing us of their last_complete_ondisk
6107   ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
6108   ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
6109                                        trim.trim_to);
6110   // trim log when the pg is recovered
6111   ps->calc_min_last_complete_ondisk();
6112   return discard_event();
6113 }
6114
6115 boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
6116 {
6117   DECLARE_LOCALS;
6118   ceph_assert(ps->is_primary());
6119
6120   ceph_assert(!ps->acting_recovery_backfill.empty());
6121   if (infoevt.lease_ack) {
6122     ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
6123   }
6124   // don't update history (yet) if we are active and primary; the replica
6125   // may be telling us they have activated (and committed) but we can't
6126   // share that until _everyone_ does the same.
6127   if (ps->is_acting_recovery_backfill(infoevt.from) &&
6128       ps->peer_activated.count(infoevt.from) == 0) {
6129     psdout(10) << " peer osd." << infoevt.from
6130                << " activated and committed" << dendl;
6131     ps->peer_activated.insert(infoevt.from);
6132     ps->blocked_by.erase(infoevt.from.shard);
6133     pl->publish_stats_to_osd();
6134     if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
6135       all_activated_and_committed();
6136     }
6137   }
6138   return discard_event();
6139 }
6140
6141 boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
6142 {
6143   DECLARE_LOCALS;
6144   psdout(10) << "searching osd." << logevt.from
6145                      << " log for unfound items" << dendl;
6146   ps->proc_replica_log(
6147     logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
6148   bool got_missing = ps->search_for_missing(
6149     ps->peer_info[logevt.from],
6150     ps->peer_missing[logevt.from],
6151     logevt.from,
6152     context< PeeringMachine >().get_recovery_ctx());
6153   // If there are missing AND we are "fully" active then start recovery now
6154   if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
6155     post_event(DoRecovery());
6156   }
6157   return discard_event();
6158 }
6159
6160 boost::statechart::result PeeringState::Active::react(const QueryState& q)
6161 {
6162   DECLARE_LOCALS;
6163
6164   q.f->open_object_section("state");
6165   q.f->dump_string("name", state_name);
6166   q.f->dump_stream("enter_time") << enter_time;
6167
6168   {
6169     q.f->open_array_section("might_have_unfound");
6170     for (auto p = ps->might_have_unfound.begin();
6171          p != ps->might_have_unfound.end();
6172          ++p) {
6173       q.f->open_object_section("osd");
6174       q.f->dump_stream("osd") << *p;
6175       if (ps->peer_missing.count(*p)) {
6176         q.f->dump_string("status", "already probed");
6177       } else if (ps->peer_missing_requested.count(*p)) {
6178         q.f->dump_string("status", "querying");
6179       } else if (!ps->get_osdmap()->is_up(p->osd)) {
6180         q.f->dump_string("status", "osd is down");
6181       } else {
6182         q.f->dump_string("status", "not queried");
6183       }
6184       q.f->close_section();
6185     }
6186     q.f->close_section();
6187   }
6188   {
6189     q.f->open_object_section("recovery_progress");
6190     q.f->open_array_section("backfill_targets");
6191     for (auto p = ps->backfill_targets.begin();
6192          p != ps->backfill_targets.end(); ++p)
6193       q.f->dump_stream("replica") << *p;
6194     q.f->close_section();
6195     pl->dump_recovery_info(q.f);
6196     q.f->close_section();
6197   }
6198
6199   q.f->close_section();
6200   return forward_event();
6201 }
6202
6203 boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
6204 {
6205   DECLARE_LOCALS;
6206
6207   ps->query_unfound(q.f, "Active");
6208   return discard_event();
6209 }
6210
6211 boost::statechart::result PeeringState::Active::react(
6212   const ActivateCommitted &evt)
6213 {
6214   DECLARE_LOCALS;
6215   ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
6216   ps->peer_activated.insert(ps->pg_whoami);
6217   psdout(10) << "_activate_committed " << evt.epoch
6218              << " peer_activated now " << ps->peer_activated
6219              << " last_interval_started "
6220              << ps->info.history.last_interval_started
6221              << " last_epoch_started "
6222              << ps->info.history.last_epoch_started
6223              << " same_interval_since "
6224              << ps->info.history.same_interval_since
6225              << dendl;
6226   ceph_assert(!ps->acting_recovery_backfill.empty());
6227   if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
6228     all_activated_and_committed();
6229   return discard_event();
6230 }
6231
6232 boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
6233 {
6234
6235   DECLARE_LOCALS;
6236   pg_t pgid = context< PeeringMachine >().spgid.pgid;
6237
6238   all_replicas_activated = true;
6239
6240   ps->state_clear(PG_STATE_ACTIVATING);
6241   ps->state_clear(PG_STATE_CREATING);
6242   ps->state_clear(PG_STATE_PREMERGE);
6243
6244   bool merge_target;
6245   if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
6246     ps->state_set(PG_STATE_PEERED);
6247     ps->state_set(PG_STATE_PREMERGE);
6248
6249     if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
6250       if (merge_target) {
6251         pg_t src = pgid;
6252         src.set_ps(ps->pool.info.get_pg_num_pending());
6253         assert(src.get_parent() == pgid);
6254         pl->set_not_ready_to_merge_target(pgid, src);
6255       } else {
6256         pl->set_not_ready_to_merge_source(pgid);
6257       }
6258     }
6259   } else if (!ps->acting_set_writeable()) {
6260     ps->state_set(PG_STATE_PEERED);
6261   } else {
6262     ps->state_set(PG_STATE_ACTIVE);
6263   }
6264
6265   auto mnow = pl->get_mnow();
6266   if (ps->prior_readable_until_ub > mnow) {
6267     psdout(10) << " waiting for prior_readable_until_ub "
6268                << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
6269     ps->state_set(PG_STATE_WAIT);
6270     pl->queue_check_readable(
6271       ps->last_peering_reset,
6272       ps->prior_readable_until_ub - mnow);
6273   } else {
6274     psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
6275                << ps->prior_readable_until_ub << dendl;
6276   }
6277
6278   if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
6279     pl->send_pg_created(pgid);
6280   }
6281
6282   ps->info.history.last_epoch_started = ps->info.last_epoch_started;
6283   ps->info.history.last_interval_started = ps->info.last_interval_started;
6284   ps->dirty_info = true;
6285
6286   ps->share_pg_info();
6287   pl->publish_stats_to_osd();
6288
6289   pl->on_activate_complete();
6290
6291   return discard_event();
6292 }
6293
6294 boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
6295 {
6296   DECLARE_LOCALS;
6297   ps->proc_renew_lease();
6298   return discard_event();
6299 }
6300
6301 boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
6302 {
6303   DECLARE_LOCALS;
6304   ps->proc_lease_ack(la.from, la.lease_ack);
6305   return discard_event();
6306 }
6307
6308
6309 boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
6310 {
6311   DECLARE_LOCALS;
6312   pl->recheck_readable();
6313   return discard_event();
6314 }
6315
6316 /*
6317  * update info.history.last_epoch_started ONLY after we and all
6318  * replicas have activated AND committed the activate transaction
6319  * (i.e. the peering results are stable on disk).
6320  */
6321 void PeeringState::Active::all_activated_and_committed()
6322 {
6323   DECLARE_LOCALS;
6324   psdout(10) << "all_activated_and_committed" << dendl;
6325   ceph_assert(ps->is_primary());
6326   ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
6327   ceph_assert(!ps->acting_recovery_backfill.empty());
6328   ceph_assert(ps->blocked_by.empty());
6329
6330   if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
6331     // this is overkill when the activation is quick, but when it is slow it
6332     // is important, because the lease was renewed by the activate itself but we
6333     // don't know how long ago that was, and simply scheduling now may leave
6334     // a gap in lease coverage.  keep it simple and aggressively renew.
6335     ps->renew_lease(pl->get_mnow());
6336     ps->send_lease();
6337     ps->schedule_renew_lease();
6338   }
6339
6340   // Degraded?
6341   ps->update_calc_stats();
6342   if (ps->info.stats.stats.sum.num_objects_degraded) {
6343     ps->state_set(PG_STATE_DEGRADED);
6344   } else {
6345     ps->state_clear(PG_STATE_DEGRADED);
6346   }
6347
6348   post_event(PeeringState::AllReplicasActivated());
6349 }
6350
6351
6352 void PeeringState::Active::exit()
6353 {
6354   context< PeeringMachine >().log_exit(state_name, enter_time);
6355
6356
6357   DECLARE_LOCALS;
6358   pl->cancel_local_background_io_reservation();
6359
6360   ps->blocked_by.clear();
6361   ps->backfill_reserved = false;
6362   ps->state_clear(PG_STATE_ACTIVATING);
6363   ps->state_clear(PG_STATE_DEGRADED);
6364   ps->state_clear(PG_STATE_UNDERSIZED);
6365   ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
6366   ps->state_clear(PG_STATE_BACKFILL_WAIT);
6367   ps->state_clear(PG_STATE_RECOVERY_WAIT);
6368   ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
6369   utime_t dur = ceph_clock_now() - enter_time;
6370   pl->get_peering_perf().tinc(rs_active_latency, dur);
6371   pl->on_active_exit();
6372 }
6373
6374 /*------ReplicaActive-----*/
6375 PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
6376   : my_base(ctx),
6377     NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
6378 {
6379   context< PeeringMachine >().log_enter(state_name);
6380
6381   DECLARE_LOCALS;
6382   ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6383 }
6384
6385
6386 boost::statechart::result PeeringState::ReplicaActive::react(
6387   const Activate& actevt) {
6388   DECLARE_LOCALS;
6389   psdout(10) << "In ReplicaActive, about to call activate" << dendl;
6390   ps->activate(
6391     context< PeeringMachine >().get_cur_transaction(),
6392     actevt.activation_epoch,
6393     context< PeeringMachine >().get_recovery_ctx());
6394   psdout(10) << "Activate Finished" << dendl;
6395   return discard_event();
6396 }
6397
6398 boost::statechart::result PeeringState::ReplicaActive::react(
6399   const ActivateCommitted &evt)
6400 {
6401   DECLARE_LOCALS;
6402   psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
6403
6404   auto &rctx = context<PeeringMachine>().get_recovery_ctx();
6405   auto epoch = ps->get_osdmap_epoch();
6406   pg_info_t i = ps->info;
6407   i.history.last_epoch_started = evt.activation_epoch;
6408   i.history.last_interval_started = i.history.same_interval_since;
6409   rctx.send_info(
6410     ps->get_primary().osd,
6411     spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
6412     epoch,
6413     epoch,
6414     i,
6415     {}, /* lease */
6416     ps->get_lease_ack());
6417
6418   if (ps->acting_set_writeable()) {
6419     ps->state_set(PG_STATE_ACTIVE);
6420   } else {
6421     ps->state_set(PG_STATE_PEERED);
6422   }
6423   pl->on_activate_committed();
6424
6425   return discard_event();
6426 }
6427
6428 boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
6429 {
6430   DECLARE_LOCALS;
6431   spg_t spgid = context< PeeringMachine >().spgid;
6432   epoch_t epoch = pl->get_osdmap_epoch();
6433
6434   ps->proc_lease(l.lease);
6435   pl->send_cluster_message(
6436     ps->get_primary().osd,
6437     make_message<MOSDPGLeaseAck>(epoch,
6438                        spg_t(spgid.pgid, ps->get_primary().shard),
6439                        ps->get_lease_ack()),
6440     epoch);
6441   return discard_event();
6442 }
6443
6444 boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
6445 {
6446   DECLARE_LOCALS;
6447   ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
6448                         infoevt.info);
6449   return discard_event();
6450 }
6451
6452 boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
6453 {
6454   DECLARE_LOCALS;
6455   psdout(10) << "received log from " << logevt.from << dendl;
6456   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6457   ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
6458   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6459   if (logevt.msg->lease) {
6460     ps->proc_lease(*logevt.msg->lease);
6461   }
6462
6463   return discard_event();
6464 }
6465
6466 boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
6467 {
6468   DECLARE_LOCALS;
6469   // primary is instructing us to trim
6470   ps->pg_log.trim(trim.trim_to, ps->info);
6471   ps->dirty_info = true;
6472   return discard_event();
6473 }
6474
6475 boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
6476 {
6477   DECLARE_LOCALS;
6478   if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6479     ps->info.history.refresh_prior_readable_until_ub(
6480       pl->get_mnow(), ps->prior_readable_until_ub);
6481     context< PeeringMachine >().send_notify(
6482       ps->get_primary().osd,
6483       pg_notify_t(
6484         ps->get_primary().shard, ps->pg_whoami.shard,
6485         ps->get_osdmap_epoch(),
6486         ps->get_osdmap_epoch(),
6487         ps->info,
6488         ps->past_intervals));
6489   }
6490   return discard_event();
6491 }
6492
6493 boost::statechart::result PeeringState::ReplicaActive::react(
6494   const MQuery& query)
6495 {
6496   DECLARE_LOCALS;
6497   ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6498   return discard_event();
6499 }
6500
6501 boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
6502 {
6503   q.f->open_object_section("state");
6504   q.f->dump_string("name", state_name);
6505   q.f->dump_stream("enter_time") << enter_time;
6506   q.f->close_section();
6507   return forward_event();
6508 }
6509
6510 boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
6511 {
6512   q.f->dump_string("state", "ReplicaActive");
6513   q.f->dump_bool("available_might_have_unfound", false);
6514   return discard_event();
6515 }
6516
6517 void PeeringState::ReplicaActive::exit()
6518 {
6519   context< PeeringMachine >().log_exit(state_name, enter_time);
6520   DECLARE_LOCALS;
6521   pl->unreserve_recovery_space();
6522
6523   pl->cancel_remote_recovery_reservation();
6524   utime_t dur = ceph_clock_now() - enter_time;
6525   pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
6526
6527   ps->min_last_complete_ondisk = eversion_t();
6528 }
6529
6530 /*-------Stray---*/
6531 PeeringState::Stray::Stray(my_context ctx)
6532   : my_base(ctx),
6533     NamedState(context< PeeringMachine >().state_history, "Started/Stray")
6534 {
6535   context< PeeringMachine >().log_enter(state_name);
6536
6537
6538   DECLARE_LOCALS;
6539   ceph_assert(!ps->is_peered());
6540   ceph_assert(!ps->is_peering());
6541   ceph_assert(!ps->is_primary());
6542
6543   if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
6544     ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
6545     post_event(DeleteStart());
6546   } else {
6547     ps->start_flush(context< PeeringMachine >().get_cur_transaction());
6548   }
6549 }
6550
6551 boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
6552 {
6553   DECLARE_LOCALS;
6554   MOSDPGLog *msg = logevt.msg.get();
6555   psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
6556
6557   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6558   if (msg->info.last_backfill == hobject_t()) {
6559     // restart backfill
6560     ps->info = msg->info;
6561     pl->on_info_history_change();
6562     ps->dirty_info = true;
6563     ps->dirty_big_info = true;  // maybe.
6564
6565     PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6566     ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
6567
6568     ps->pg_log.reset_backfill();
6569   } else {
6570     ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
6571   }
6572   if (logevt.msg->lease) {
6573     ps->proc_lease(*logevt.msg->lease);
6574   }
6575
6576   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6577
6578   post_event(Activate(logevt.msg->info.last_epoch_started));
6579   return transit<ReplicaActive>();
6580 }
6581
6582 boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
6583 {
6584   DECLARE_LOCALS;
6585   psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
6586
6587   if (ps->info.last_update > infoevt.info.last_update) {
6588     // rewind divergent log entries
6589     ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6590     ps->rewind_divergent_log(t, infoevt.info.last_update);
6591     ps->info.stats = infoevt.info.stats;
6592     ps->info.hit_set = infoevt.info.hit_set;
6593   }
6594
6595   if (infoevt.lease) {
6596     ps->proc_lease(*infoevt.lease);
6597   }
6598
6599   ceph_assert(infoevt.info.last_update == ps->info.last_update);
6600   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
6601
6602   post_event(Activate(infoevt.info.last_epoch_started));
6603   return transit<ReplicaActive>();
6604 }
6605
6606 boost::statechart::result PeeringState::Stray::react(const MQuery& query)
6607 {
6608   DECLARE_LOCALS;
6609   ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
6610   return discard_event();
6611 }
6612
6613 boost::statechart::result PeeringState::Stray::react(const ActMap&)
6614 {
6615   DECLARE_LOCALS;
6616   if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
6617     ps->info.history.refresh_prior_readable_until_ub(
6618       pl->get_mnow(), ps->prior_readable_until_ub);
6619     context< PeeringMachine >().send_notify(
6620       ps->get_primary().osd,
6621       pg_notify_t(
6622         ps->get_primary().shard, ps->pg_whoami.shard,
6623         ps->get_osdmap_epoch(),
6624         ps->get_osdmap_epoch(),
6625         ps->info,
6626         ps->past_intervals));
6627   }
6628   return discard_event();
6629 }
6630
6631 void PeeringState::Stray::exit()
6632 {
6633   context< PeeringMachine >().log_exit(state_name, enter_time);
6634   DECLARE_LOCALS;
6635   utime_t dur = ceph_clock_now() - enter_time;
6636   pl->get_peering_perf().tinc(rs_stray_latency, dur);
6637 }
6638
6639
6640 /*--------ToDelete----------*/
6641 PeeringState::ToDelete::ToDelete(my_context ctx)
6642   : my_base(ctx),
6643     NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
6644 {
6645   context< PeeringMachine >().log_enter(state_name);
6646   DECLARE_LOCALS;
6647   pl->get_perf_logger().inc(l_osd_pg_removing);
6648 }
6649
6650 void PeeringState::ToDelete::exit()
6651 {
6652   context< PeeringMachine >().log_exit(state_name, enter_time);
6653   DECLARE_LOCALS;
6654   // note: on a successful removal, this path doesn't execute. see
6655   // _delete_some().
6656   pl->get_perf_logger().dec(l_osd_pg_removing);
6657
6658   pl->cancel_local_background_io_reservation();
6659 }
6660
6661 /*----WaitDeleteReserved----*/
6662 PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
6663   : my_base(ctx),
6664     NamedState(context< PeeringMachine >().state_history,
6665                "Started/ToDelete/WaitDeleteReseved")
6666 {
6667   context< PeeringMachine >().log_enter(state_name);
6668   DECLARE_LOCALS;
6669   context< ToDelete >().priority = ps->get_delete_priority();
6670
6671   pl->cancel_local_background_io_reservation();
6672   pl->request_local_background_io_reservation(
6673     context<ToDelete>().priority,
6674     std::make_unique<PGPeeringEvent>(
6675       ps->get_osdmap_epoch(),
6676       ps->get_osdmap_epoch(),
6677       DeleteReserved()),
6678     std::make_unique<PGPeeringEvent>(
6679       ps->get_osdmap_epoch(),
6680       ps->get_osdmap_epoch(),
6681       DeleteInterrupted()));
6682 }
6683
6684 boost::statechart::result PeeringState::ToDelete::react(
6685   const ActMap& evt)
6686 {
6687   DECLARE_LOCALS;
6688   if (ps->get_delete_priority() != priority) {
6689     psdout(10) << __func__ << " delete priority changed, resetting"
6690                    << dendl;
6691     return transit<ToDelete>();
6692   }
6693   return discard_event();
6694 }
6695
6696 void PeeringState::WaitDeleteReserved::exit()
6697 {
6698   context< PeeringMachine >().log_exit(state_name, enter_time);
6699 }
6700
6701 /*----Deleting-----*/
6702 PeeringState::Deleting::Deleting(my_context ctx)
6703   : my_base(ctx),
6704     NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
6705 {
6706   context< PeeringMachine >().log_enter(state_name);
6707
6708   DECLARE_LOCALS;
6709   ps->deleting = true;
6710   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
6711
6712   // clear log
6713   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
6714   ps->pg_log.roll_forward(rollbacker.get());
6715
6716   // adjust info to backfill
6717   ps->info.set_last_backfill(hobject_t());
6718   ps->pg_log.reset_backfill();
6719   ps->dirty_info = true;
6720
6721   pl->on_removal(t);
6722 }
6723
6724 boost::statechart::result PeeringState::Deleting::react(
6725   const DeleteSome& evt)
6726 {
6727   DECLARE_LOCALS;
6728   std::pair<ghobject_t, bool> p;
6729   p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
6730     next);
6731   next = p.first;
6732   return p.second ? discard_event() : terminate();
6733 }
6734
6735 void PeeringState::Deleting::exit()
6736 {
6737   context< PeeringMachine >().log_exit(state_name, enter_time);
6738   DECLARE_LOCALS;
6739   ps->deleting = false;
6740   pl->cancel_local_background_io_reservation();
6741 }
6742
6743 /*--------GetInfo---------*/
6744 PeeringState::GetInfo::GetInfo(my_context ctx)
6745   : my_base(ctx),
6746     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
6747 {
6748   context< PeeringMachine >().log_enter(state_name);
6749
6750
6751   DECLARE_LOCALS;
6752   ps->check_past_interval_bounds();
6753   ps->log_weirdness();
6754   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6755
6756   ceph_assert(ps->blocked_by.empty());
6757
6758   prior_set = ps->build_prior();
6759   ps->prior_readable_down_osds = prior_set.down;
6760   if (ps->prior_readable_down_osds.empty()) {
6761     psdout(10) << " no prior_set down osds, clearing prior_readable_until_ub"
6762                << dendl;
6763     ps->clear_prior_readable_until_ub();
6764   }
6765
6766   ps->reset_min_peer_features();
6767   get_infos();
6768   if (prior_set.pg_down) {
6769     post_event(IsDown());
6770   } else if (peer_info_requested.empty()) {
6771     post_event(GotInfo());
6772   }
6773 }
6774
6775 void PeeringState::GetInfo::get_infos()
6776 {
6777   DECLARE_LOCALS;
6778   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6779
6780   ps->blocked_by.clear();
6781   for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
6782     pg_shard_t peer = *it;
6783     if (peer == ps->pg_whoami) {
6784       continue;
6785     }
6786     if (ps->peer_info.count(peer)) {
6787       psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
6788       continue;
6789     }
6790     if (peer_info_requested.count(peer)) {
6791       psdout(10) << " already requested info from osd." << peer << dendl;
6792       ps->blocked_by.insert(peer.osd);
6793     } else if (!ps->get_osdmap()->is_up(peer.osd)) {
6794       psdout(10) << " not querying info from down osd." << peer << dendl;
6795     } else {
6796       psdout(10) << " querying info from osd." << peer << dendl;
6797       context< PeeringMachine >().send_query(
6798         peer.osd,
6799         pg_query_t(pg_query_t::INFO,
6800                    it->shard, ps->pg_whoami.shard,
6801                    ps->info.history,
6802                    ps->get_osdmap_epoch()));
6803       peer_info_requested.insert(peer);
6804       ps->blocked_by.insert(peer.osd);
6805     }
6806   }
6807
6808   ps->check_prior_readable_down_osds(ps->get_osdmap());
6809
6810   pl->publish_stats_to_osd();
6811 }
6812
6813 boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
6814 {
6815
6816   DECLARE_LOCALS;
6817
6818   auto p = peer_info_requested.find(infoevt.from);
6819   if (p != peer_info_requested.end()) {
6820     peer_info_requested.erase(p);
6821     ps->blocked_by.erase(infoevt.from.osd);
6822   }
6823
6824   epoch_t old_start = ps->info.history.last_epoch_started;
6825   if (ps->proc_replica_info(
6826         infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
6827     // we got something new ...
6828     PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
6829     if (old_start < ps->info.history.last_epoch_started) {
6830       psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
6831       prior_set = ps->build_prior();
6832       ps->prior_readable_down_osds = prior_set.down;
6833
6834       // filter out any osds that got dropped from the probe set from
6835       // peer_info_requested.  this is less expensive than restarting
6836       // peering (which would re-probe everyone).
6837       auto p = peer_info_requested.begin();
6838       while (p != peer_info_requested.end()) {
6839         if (prior_set.probe.count(*p) == 0) {
6840           psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
6841           peer_info_requested.erase(p++);
6842         } else {
6843           ++p;
6844         }
6845       }
6846       get_infos();
6847     }
6848     psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
6849                        << hex << infoevt.features << dec << dendl;
6850     ps->apply_peer_features(infoevt.features);
6851
6852     // are we done getting everything?
6853     if (peer_info_requested.empty() && !prior_set.pg_down) {
6854       psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
6855       psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
6856       psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
6857       post_event(GotInfo());
6858     }
6859   }
6860   return discard_event();
6861 }
6862
6863 boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
6864 {
6865   DECLARE_LOCALS;
6866   q.f->open_object_section("state");
6867   q.f->dump_string("name", state_name);
6868   q.f->dump_stream("enter_time") << enter_time;
6869
6870   q.f->open_array_section("requested_info_from");
6871   for (auto p = peer_info_requested.begin();
6872        p != peer_info_requested.end();
6873        ++p) {
6874     q.f->open_object_section("osd");
6875     q.f->dump_stream("osd") << *p;
6876     if (ps->peer_info.count(*p)) {
6877       q.f->open_object_section("got_info");
6878       ps->peer_info[*p].dump(q.f);
6879       q.f->close_section();
6880     }
6881     q.f->close_section();
6882   }
6883   q.f->close_section();
6884
6885   q.f->close_section();
6886   return forward_event();
6887 }
6888
6889 boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
6890 {
6891   q.f->dump_string("state", "GetInfo");
6892   q.f->dump_bool("available_might_have_unfound", false);
6893   return discard_event();
6894 }
6895
6896 void PeeringState::GetInfo::exit()
6897 {
6898   context< PeeringMachine >().log_exit(state_name, enter_time);
6899
6900   DECLARE_LOCALS;
6901   utime_t dur = ceph_clock_now() - enter_time;
6902   pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
6903   ps->blocked_by.clear();
6904 }
6905
6906 /*------GetLog------------*/
6907 PeeringState::GetLog::GetLog(my_context ctx)
6908   : my_base(ctx),
6909     NamedState(
6910       context< PeeringMachine >().state_history,
6911       "Started/Primary/Peering/GetLog"),
6912     msg(0)
6913 {
6914   context< PeeringMachine >().log_enter(state_name);
6915
6916   DECLARE_LOCALS;
6917
6918   ps->log_weirdness();
6919
6920   // adjust acting?
6921   if (!ps->choose_acting(auth_log_shard, false,
6922                          &context< Peering >().history_les_bound)) {
6923     if (!ps->want_acting.empty()) {
6924       post_event(NeedActingChange());
6925     } else {
6926       post_event(IsIncomplete());
6927     }
6928     return;
6929   }
6930
6931   // am i the best?
6932   if (auth_log_shard == ps->pg_whoami) {
6933     post_event(GotLog());
6934     return;
6935   }
6936
6937   const pg_info_t& best = ps->peer_info[auth_log_shard];
6938
6939   // am i broken?
6940   if (ps->info.last_update < best.log_tail) {
6941     psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
6942     post_event(IsIncomplete());
6943     return;
6944   }
6945
6946   // how much log to request?
6947   eversion_t request_log_from = ps->info.last_update;
6948   ceph_assert(!ps->acting_recovery_backfill.empty());
6949   for (auto p = ps->acting_recovery_backfill.begin();
6950        p != ps->acting_recovery_backfill.end();
6951        ++p) {
6952     if (*p == ps->pg_whoami) continue;
6953     pg_info_t& ri = ps->peer_info[*p];
6954     if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
6955         ri.last_update < request_log_from)
6956       request_log_from = ri.last_update;
6957   }
6958
6959   // how much?
6960   psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
6961   context<PeeringMachine>().send_query(
6962     auth_log_shard.osd,
6963     pg_query_t(
6964       pg_query_t::LOG,
6965       auth_log_shard.shard, ps->pg_whoami.shard,
6966       request_log_from, ps->info.history,
6967       ps->get_osdmap_epoch()));
6968
6969   ceph_assert(ps->blocked_by.empty());
6970   ps->blocked_by.insert(auth_log_shard.osd);
6971   pl->publish_stats_to_osd();
6972 }
6973
6974 boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
6975 {
6976   // make sure our log source didn't go down.  we need to check
6977   // explicitly because it may not be part of the prior set, which
6978   // means the Peering state check won't catch it going down.
6979   if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
6980     psdout(10) << "GetLog: auth_log_shard osd."
6981                        << auth_log_shard.osd << " went down" << dendl;
6982     post_event(advmap);
6983     return transit< Reset >();
6984   }
6985
6986   // let the Peering state do its checks.
6987   return forward_event();
6988 }
6989
6990 boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
6991 {
6992   ceph_assert(!msg);
6993   if (logevt.from != auth_log_shard) {
6994     psdout(10) << "GetLog: discarding log from "
6995                        << "non-auth_log_shard osd." << logevt.from << dendl;
6996     return discard_event();
6997   }
6998   psdout(10) << "GetLog: received master log from osd."
6999                      << logevt.from << dendl;
7000   msg = logevt.msg;
7001   post_event(GotLog());
7002   return discard_event();
7003 }
7004
7005 boost::statechart::result PeeringState::GetLog::react(const GotLog&)
7006 {
7007
7008   DECLARE_LOCALS;
7009   psdout(10) << "leaving GetLog" << dendl;
7010   if (msg) {
7011     psdout(10) << "processing master log" << dendl;
7012     ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
7013                         msg->info, std::move(msg->log), std::move(msg->missing),
7014                         auth_log_shard);
7015   }
7016   ps->start_flush(context< PeeringMachine >().get_cur_transaction());
7017   return transit< GetMissing >();
7018 }
7019
7020 boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
7021 {
7022   q.f->open_object_section("state");
7023   q.f->dump_string("name", state_name);
7024   q.f->dump_stream("enter_time") << enter_time;
7025   q.f->dump_stream("auth_log_shard") << auth_log_shard;
7026   q.f->close_section();
7027   return forward_event();
7028 }
7029
7030 boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
7031 {
7032   q.f->dump_string("state", "GetLog");
7033   q.f->dump_bool("available_might_have_unfound", false);
7034   return discard_event();
7035 }
7036
7037 void PeeringState::GetLog::exit()
7038 {
7039   context< PeeringMachine >().log_exit(state_name, enter_time);
7040
7041   DECLARE_LOCALS;
7042   utime_t dur = ceph_clock_now() - enter_time;
7043   pl->get_peering_perf().tinc(rs_getlog_latency, dur);
7044   ps->blocked_by.clear();
7045 }
7046
7047 /*------WaitActingChange--------*/
7048 PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
7049   : my_base(ctx),
7050     NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
7051 {
7052   context< PeeringMachine >().log_enter(state_name);
7053 }
7054
7055 boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
7056 {
7057   DECLARE_LOCALS;
7058   OSDMapRef osdmap = advmap.osdmap;
7059
7060   psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
7061   for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
7062     if (!osdmap->is_up(*p)) {
7063       psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7064       post_event(advmap);
7065       return transit< Reset >();
7066     }
7067   }
7068   return forward_event();
7069 }
7070
7071 boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
7072 {
7073   psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
7074   return discard_event();
7075 }
7076
7077 boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
7078 {
7079   psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7080   return discard_event();
7081 }
7082
7083 boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
7084 {
7085   psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7086   return discard_event();
7087 }
7088
7089 boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
7090 {
7091   q.f->open_object_section("state");
7092   q.f->dump_string("name", state_name);
7093   q.f->dump_stream("enter_time") << enter_time;
7094   q.f->dump_string("comment", "waiting for pg acting set to change");
7095   q.f->close_section();
7096   return forward_event();
7097 }
7098
7099 boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
7100 {
7101   q.f->dump_string("state", "WaitActingChange");
7102   q.f->dump_bool("available_might_have_unfound", false);
7103   return discard_event();
7104 }
7105
7106 void PeeringState::WaitActingChange::exit()
7107 {
7108   context< PeeringMachine >().log_exit(state_name, enter_time);
7109   DECLARE_LOCALS;
7110   utime_t dur = ceph_clock_now() - enter_time;
7111   pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
7112 }
7113
7114 /*------Down--------*/
7115 PeeringState::Down::Down(my_context ctx)
7116   : my_base(ctx),
7117     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
7118 {
7119   context< PeeringMachine >().log_enter(state_name);
7120   DECLARE_LOCALS;
7121
7122   ps->state_clear(PG_STATE_PEERING);
7123   ps->state_set(PG_STATE_DOWN);
7124
7125   auto &prior_set = context< Peering >().prior_set;
7126   ceph_assert(ps->blocked_by.empty());
7127   ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7128   pl->publish_stats_to_osd();
7129 }
7130
7131 void PeeringState::Down::exit()
7132 {
7133   context< PeeringMachine >().log_exit(state_name, enter_time);
7134
7135   DECLARE_LOCALS;
7136
7137   ps->state_clear(PG_STATE_DOWN);
7138   utime_t dur = ceph_clock_now() - enter_time;
7139   pl->get_peering_perf().tinc(rs_down_latency, dur);
7140
7141   ps->blocked_by.clear();
7142 }
7143
7144 boost::statechart::result PeeringState::Down::react(const QueryState& q)
7145 {
7146   q.f->open_object_section("state");
7147   q.f->dump_string("name", state_name);
7148   q.f->dump_stream("enter_time") << enter_time;
7149   q.f->dump_string("comment",
7150                    "not enough up instances of this PG to go active");
7151   q.f->close_section();
7152   return forward_event();
7153 }
7154
7155 boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
7156 {
7157   q.f->dump_string("state", "Down");
7158   q.f->dump_bool("available_might_have_unfound", false);
7159   return discard_event();
7160 }
7161
7162 boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
7163 {
7164   DECLARE_LOCALS;
7165
7166   ceph_assert(ps->is_primary());
7167   epoch_t old_start = ps->info.history.last_epoch_started;
7168   if (!ps->peer_info.count(infoevt.from) &&
7169       ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
7170     ps->update_history(infoevt.notify.info.history);
7171   }
7172   // if we got something new to make pg escape down state
7173   if (ps->info.history.last_epoch_started > old_start) {
7174       psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
7175     ps->state_clear(PG_STATE_DOWN);
7176     ps->state_set(PG_STATE_PEERING);
7177     return transit< GetInfo >();
7178   }
7179
7180   return discard_event();
7181 }
7182
7183
7184 /*------Incomplete--------*/
7185 PeeringState::Incomplete::Incomplete(my_context ctx)
7186   : my_base(ctx),
7187     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
7188 {
7189   context< PeeringMachine >().log_enter(state_name);
7190   DECLARE_LOCALS;
7191
7192   ps->state_clear(PG_STATE_PEERING);
7193   ps->state_set(PG_STATE_INCOMPLETE);
7194
7195   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7196   ceph_assert(ps->blocked_by.empty());
7197   ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7198   pl->publish_stats_to_osd();
7199 }
7200
7201 boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
7202   DECLARE_LOCALS;
7203   int64_t poolnum = ps->info.pgid.pool();
7204
7205   // Reset if min_size turn smaller than previous value, pg might now be able to go active
7206   if (!advmap.osdmap->have_pg_pool(poolnum) ||
7207       advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7208       advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7209     post_event(advmap);
7210     return transit< Reset >();
7211   }
7212
7213   return forward_event();
7214 }
7215
7216 boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
7217   DECLARE_LOCALS;
7218   psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
7219   if (ps->proc_replica_info(
7220     notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7221     // We got something new, try again!
7222     return transit< GetLog >();
7223   } else {
7224     return discard_event();
7225   }
7226 }
7227
7228 boost::statechart::result PeeringState::Incomplete::react(
7229   const QueryState& q)
7230 {
7231   q.f->open_object_section("state");
7232   q.f->dump_string("name", state_name);
7233   q.f->dump_stream("enter_time") << enter_time;
7234   q.f->dump_string("comment", "not enough complete instances of this PG");
7235   q.f->close_section();
7236   return forward_event();
7237 }
7238
7239 boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
7240 {
7241   q.f->dump_string("state", "Incomplete");
7242   q.f->dump_bool("available_might_have_unfound", false);
7243   return discard_event();
7244 }
7245
7246 void PeeringState::Incomplete::exit()
7247 {
7248   context< PeeringMachine >().log_exit(state_name, enter_time);
7249
7250   DECLARE_LOCALS;
7251
7252   ps->state_clear(PG_STATE_INCOMPLETE);
7253   utime_t dur = ceph_clock_now() - enter_time;
7254   pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
7255
7256   ps->blocked_by.clear();
7257 }
7258
7259 /*------GetMissing--------*/
7260 PeeringState::GetMissing::GetMissing(my_context ctx)
7261   : my_base(ctx),
7262     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
7263 {
7264   context< PeeringMachine >().log_enter(state_name);
7265
7266   DECLARE_LOCALS;
7267   ps->log_weirdness();
7268   ceph_assert(!ps->acting_recovery_backfill.empty());
7269   eversion_t since;
7270   for (auto i = ps->acting_recovery_backfill.begin();
7271        i != ps->acting_recovery_backfill.end();
7272        ++i) {
7273     if (*i == ps->get_primary()) continue;
7274     const pg_info_t& pi = ps->peer_info[*i];
7275     // reset this so to make sure the pg_missing_t is initialized and
7276     // has the correct semantics even if we don't need to get a
7277     // missing set from a shard. This way later additions due to
7278     // lost+unfound delete work properly.
7279     ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
7280
7281     if (pi.is_empty())
7282       continue;                                // no pg data, nothing divergent
7283
7284     if (pi.last_update < ps->pg_log.get_tail()) {
7285       psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
7286       ps->peer_missing[*i].clear();
7287       continue;
7288     }
7289     if (pi.last_backfill == hobject_t()) {
7290       psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
7291       ps->peer_missing[*i].clear();
7292       continue;
7293     }
7294
7295     if (pi.last_update == pi.last_complete &&  // peer has no missing
7296         pi.last_update == ps->info.last_update) {  // peer is up to date
7297       // replica has no missing and identical log as us.  no need to
7298       // pull anything.
7299       // FIXME: we can do better here.  if last_update==last_complete we
7300       //        can infer the rest!
7301       psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
7302       ps->peer_missing[*i].clear();
7303       continue;
7304     }
7305
7306     // We pull the log from the peer's last_epoch_started to ensure we
7307     // get enough log to detect divergent updates.
7308     since.epoch = pi.last_epoch_started;
7309     ceph_assert(pi.last_update >= ps->info.log_tail);  // or else choose_acting() did a bad thing
7310     if (pi.log_tail <= since) {
7311       psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
7312       context< PeeringMachine >().send_query(
7313         i->osd,
7314         pg_query_t(
7315           pg_query_t::LOG,
7316           i->shard, ps->pg_whoami.shard,
7317           since, ps->info.history,
7318           ps->get_osdmap_epoch()));
7319     } else {
7320       psdout(10) << " requesting fulllog+missing from osd." << *i
7321                          << " (want since " << since << " < log.tail "
7322                          << pi.log_tail << ")" << dendl;
7323       context< PeeringMachine >().send_query(
7324         i->osd, pg_query_t(
7325           pg_query_t::FULLLOG,
7326           i->shard, ps->pg_whoami.shard,
7327           ps->info.history, ps->get_osdmap_epoch()));
7328     }
7329     peer_missing_requested.insert(*i);
7330     ps->blocked_by.insert(i->osd);
7331   }
7332
7333   if (peer_missing_requested.empty()) {
7334     if (ps->need_up_thru) {
7335       psdout(10) << " still need up_thru update before going active"
7336                          << dendl;
7337       post_event(NeedUpThru());
7338       return;
7339     }
7340
7341     // all good!
7342     post_event(Activate(ps->get_osdmap_epoch()));
7343   } else {
7344     pl->publish_stats_to_osd();
7345   }
7346 }
7347
7348 boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
7349 {
7350   DECLARE_LOCALS;
7351
7352   peer_missing_requested.erase(logevt.from);
7353   ps->proc_replica_log(logevt.msg->info,
7354                        logevt.msg->log,
7355                        std::move(logevt.msg->missing),
7356                        logevt.from);
7357
7358   if (peer_missing_requested.empty()) {
7359     if (ps->need_up_thru) {
7360       psdout(10) << " still need up_thru update before going active"
7361                          << dendl;
7362       post_event(NeedUpThru());
7363     } else {
7364       psdout(10) << "Got last missing, don't need missing "
7365                          << "posting Activate" << dendl;
7366       post_event(Activate(ps->get_osdmap_epoch()));
7367     }
7368   }
7369   return discard_event();
7370 }
7371
7372 boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
7373 {
7374   DECLARE_LOCALS;
7375   q.f->open_object_section("state");
7376   q.f->dump_string("name", state_name);
7377   q.f->dump_stream("enter_time") << enter_time;
7378
7379   q.f->open_array_section("peer_missing_requested");
7380   for (auto p = peer_missing_requested.begin();
7381        p != peer_missing_requested.end();
7382        ++p) {
7383     q.f->open_object_section("osd");
7384     q.f->dump_stream("osd") << *p;
7385     if (ps->peer_missing.count(*p)) {
7386       q.f->open_object_section("got_missing");
7387       ps->peer_missing[*p].dump(q.f);
7388       q.f->close_section();
7389     }
7390     q.f->close_section();
7391   }
7392   q.f->close_section();
7393
7394   q.f->close_section();
7395   return forward_event();
7396 }
7397
7398 boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
7399 {
7400   q.f->dump_string("state", "GetMising");
7401   q.f->dump_bool("available_might_have_unfound", false);
7402   return discard_event();
7403 }
7404
7405 void PeeringState::GetMissing::exit()
7406 {
7407   context< PeeringMachine >().log_exit(state_name, enter_time);
7408
7409   DECLARE_LOCALS;
7410   utime_t dur = ceph_clock_now() - enter_time;
7411   pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
7412   ps->blocked_by.clear();
7413 }
7414
7415 /*------WaitUpThru--------*/
7416 PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
7417   : my_base(ctx),
7418     NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
7419 {
7420   context< PeeringMachine >().log_enter(state_name);
7421 }
7422
7423 boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
7424 {
7425   DECLARE_LOCALS;
7426   if (!ps->need_up_thru) {
7427     post_event(Activate(ps->get_osdmap_epoch()));
7428   }
7429   return forward_event();
7430 }
7431
7432 boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
7433 {
7434   DECLARE_LOCALS;
7435   psdout(10) << "Noting missing from osd." << logevt.from << dendl;
7436   ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
7437   ps->peer_info[logevt.from] = logevt.msg->info;
7438   return discard_event();
7439 }
7440
7441 boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
7442 {
7443   q.f->open_object_section("state");
7444   q.f->dump_string("name", state_name);
7445   q.f->dump_stream("enter_time") << enter_time;
7446   q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
7447   q.f->close_section();
7448   return forward_event();
7449 }
7450
7451 boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
7452 {
7453   q.f->dump_string("state", "WaitUpThru");
7454   q.f->dump_bool("available_might_have_unfound", false);
7455   return discard_event();
7456 }
7457
7458 void PeeringState::WaitUpThru::exit()
7459 {
7460   context< PeeringMachine >().log_exit(state_name, enter_time);
7461   DECLARE_LOCALS;
7462   utime_t dur = ceph_clock_now() - enter_time;
7463   pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
7464 }
7465
7466 /*----PeeringState::PeeringMachine Methods-----*/
7467 #undef dout_prefix
7468 #define dout_prefix dpp->gen_prefix(*_dout)
7469
7470 void PeeringState::PeeringMachine::log_enter(const char *state_name)
7471 {
7472   DECLARE_LOCALS;
7473   psdout(5) << "enter " << state_name << dendl;
7474   pl->log_state_enter(state_name);
7475 }
7476
7477 void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
7478 {
7479   DECLARE_LOCALS;
7480   utime_t dur = ceph_clock_now() - enter_time;
7481   psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
7482   pl->log_state_exit(state_name, enter_time, event_count, event_time);
7483   event_count = 0;
7484   event_time = utime_t();
7485 }
7486
7487 ostream &operator<<(ostream &out, const PeeringState &ps) {
7488   out << "pg[" << ps.info
7489       << " " << pg_vector_string(ps.up);
7490   if (ps.acting != ps.up)
7491     out << "/" << pg_vector_string(ps.acting);
7492   if (ps.is_ec_pg())
7493     out << "p" << ps.get_primary();
7494   if (!ps.async_recovery_targets.empty())
7495     out << " async=[" << ps.async_recovery_targets << "]";
7496   if (!ps.backfill_targets.empty())
7497     out << " backfill=[" << ps.backfill_targets << "]";
7498   out << " r=" << ps.get_role();
7499   out << " lpr=" << ps.get_last_peering_reset();
7500
7501   if (ps.deleting)
7502     out << " DELETING";
7503
7504   if (!ps.past_intervals.empty()) {
7505     out << " pi=[" << ps.past_intervals.get_bounds()
7506         << ")/" << ps.past_intervals.size();
7507   }
7508
7509   if (ps.is_peered()) {
7510     if (ps.last_update_ondisk != ps.info.last_update)
7511       out << " luod=" << ps.last_update_ondisk;
7512     if (ps.last_update_applied != ps.info.last_update)
7513       out << " lua=" << ps.last_update_applied;
7514   }
7515
7516   if (ps.pg_log.get_tail() != ps.info.log_tail ||
7517       ps.pg_log.get_head() != ps.info.last_update)
7518     out << " (info mismatch, " << ps.pg_log.get_log() << ")";
7519
7520   if (!ps.pg_log.get_log().empty()) {
7521     if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
7522       out << " (log bound mismatch, actual=["
7523           << ps.pg_log.get_log().log.begin()->version << ","
7524           << ps.pg_log.get_log().log.rbegin()->version << "]";
7525       out << ")";
7526     }
7527   }
7528
7529   out << " crt=" << ps.pg_log.get_can_rollback_to();
7530
7531   if (ps.last_complete_ondisk != ps.info.last_complete)
7532     out << " lcod " << ps.last_complete_ondisk;
7533
7534   out << " mlcod " << ps.min_last_complete_ondisk;
7535
7536   out << " " << pg_state_string(ps.get_state());
7537   if (ps.should_send_notify())
7538     out << " NOTIFY";
7539
7540   if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
7541     out << " pruub " << ps.prior_readable_until_ub
7542         << "@" << ps.get_prior_readable_down_osds();
7543   }
7544   return out;
7545 }
7546
7547 std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
7548 {
7549   std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
7550     async_by_num_missing;
7551   replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
7552   for (auto &p : get_acting_recovery_backfill()) {
7553     if (p == get_primary()) {
7554       continue;
7555     }
7556     auto pm = get_peer_missing().find(p);
7557     assert(pm != get_peer_missing().end());
7558     auto nm = pm->second.num_missing();
7559     if (nm != 0) {
7560       if (is_async_recovery_target(p)) {
7561         async_by_num_missing.push_back(make_pair(nm, p));
7562       } else {
7563         replicas_by_num_missing.push_back(make_pair(nm, p));
7564       }
7565     }
7566   }
7567   // sort by number of missing objects, in ascending order.
7568   auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
7569                  const std::pair<unsigned int, pg_shard_t> &rhs) {
7570     return lhs.first < rhs.first;
7571   };
7572   // acting goes first
7573   std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
7574   // then async_recovery_targets
7575   std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
7576   replicas_by_num_missing.insert(replicas_by_num_missing.end(),
7577     async_by_num_missing.begin(), async_by_num_missing.end());
7578
7579   std::vector<pg_shard_t> ret;
7580   ret.reserve(replicas_by_num_missing.size());
7581   for (auto p : replicas_by_num_missing) {
7582     ret.push_back(p.second);
7583   }
7584   return ret;
7585 }
7586
7587