1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include "messages/MOSDRepScrub.h"
18 #include "common/errno.h"
19 #include "common/ceph_releases.h"
20 #include "common/config.h"
22 #include "OpRequest.h"
23 #include "ScrubStore.h"
24 #include "pg_scrubber.h"
26 #include "osd/scheduler/OpSchedulerItem.h"
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 #include "messages/MOSDPGInfo.h"
34 #include "messages/MOSDPGScan.h"
35 #include "messages/MOSDPGBackfill.h"
36 #include "messages/MOSDPGBackfillRemove.h"
37 #include "messages/MBackfillReserve.h"
38 #include "messages/MRecoveryReserve.h"
39 #include "messages/MOSDPGPush.h"
40 #include "messages/MOSDPGPushReply.h"
41 #include "messages/MOSDPGPull.h"
42 #include "messages/MOSDECSubOpWrite.h"
43 #include "messages/MOSDECSubOpWriteReply.h"
44 #include "messages/MOSDECSubOpRead.h"
45 #include "messages/MOSDECSubOpReadReply.h"
46 #include "messages/MOSDPGUpdateLogMissing.h"
47 #include "messages/MOSDPGUpdateLogMissingReply.h"
48 #include "messages/MOSDBackoff.h"
49 #include "messages/MOSDScrubReserve.h"
50 #include "messages/MOSDRepOp.h"
51 #include "messages/MOSDRepOpReply.h"
52 #include "messages/MOSDRepScrubMap.h"
53 #include "messages/MOSDPGRecoveryDelete.h"
54 #include "messages/MOSDPGRecoveryDeleteReply.h"
56 #include "common/BackTrace.h"
57 #include "common/EventTrace.h"
60 #define TRACEPOINT_DEFINE
61 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
62 #include "tracing/pg.h"
63 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
64 #undef TRACEPOINT_DEFINE
66 #define tracepoint(...)
71 #define dout_context cct
72 #define dout_subsys ceph_subsys_osd
74 #define dout_prefix _prefix(_dout, this)
78 using std::ostringstream
;
82 using std::stringstream
;
83 using std::unique_ptr
;
86 using ceph::bufferlist
;
87 using ceph::bufferptr
;
90 using ceph::Formatter
;
92 using namespace ceph::osd::scheduler
;
95 static ostream
& _prefix(std::ostream
*_dout
, T
*t
)
97 return t
->gen_prefix(*_dout
);
// Take a labelled reference on this PG.  The before/after refcount is
// logged at "refs" debug level 5 and the tag (logged as "(none" when null)
// is recorded per-tag under _ref_id_lock so leaked refs can be attributed.
// NOTE(review): several original lines (101-102, 106, 108-111) — including
// the actual refcount increment and the _tag_counts update — are elided in
// this extract; confirm against the full source.
100 void PG::get(const char* tag
)
103 lgeneric_subdout(cct
, refs
, 5) << "PG::get " << this << " "
104 << "tag " << (tag
? tag
: "(none") << " "
105 << (after
- 1) << " -> " << after
<< dendl
;
// Serialize tag bookkeeping against concurrent get()/put().
107 std::lock_guard
l(_ref_id_lock
);
// Drop a labelled reference previously taken via PG::get(tag): decrement
// the per-tag count under _ref_id_lock (erasing the entry at zero), then
// log the refcount transition at "refs" debug level 5.
// NOTE(review): interior lines (113-115, 122-124, 126, after 129) are
// elided in this extract, including the refcount decrement itself.
112 void PG::put(const char* tag
)
116 std::lock_guard
l(_ref_id_lock
);
// Every put() must pair with a prior get() for the same tag.
117 auto tag_counts_entry
= _tag_counts
.find(tag
);
118 ceph_assert(tag_counts_entry
!= _tag_counts
.end());
119 --tag_counts_entry
->second
;
120 if (tag_counts_entry
->second
== 0) {
121 _tag_counts
.erase(tag_counts_entry
);
// Cache cct locally: dropping the final ref may destroy *this, after
// which member access would be a use-after-free.
125 auto local_cct
= cct
;
127 lgeneric_subdout(local_cct
, refs
, 5) << "PG::put " << this << " "
128 << "tag " << (tag
? tag
: "(none") << " "
129 << (after
+ 1) << " -> " << after
// Debug variant of get(): hands out a unique reference id and records it
// (keyed to a description string built in the elided lines, presumably a
// backtrace — confirm upstream) in _live_ids so dump_live_ids() can report
// outstanding references.
136 uint64_t PG::get_with_id()
139 std::lock_guard
l(_ref_id_lock
);
140 uint64_t id
= ++_ref_id
;
144 lgeneric_subdout(cct
, refs
, 5) << "PG::get " << this << " " << info
.pgid
145 << " got id " << id
<< " "
146 << (ref
- 1) << " -> " << ref
// Each id is handed out exactly once.
148 ceph_assert(!_live_ids
.count(id
));
149 _live_ids
.insert(make_pair(id
, ss
.str()));
// Release a reference taken with get_with_id().  The id must still be
// live; the erase and actual ref drop are on lines elided in this extract.
153 void PG::put_with_id(uint64_t id
)
156 lgeneric_subdout(cct
, refs
, 5) << "PG::put " << this << " " << info
.pgid
157 << " put id " << id
<< " "
158 << (newref
+ 1) << " -> " << newref
161 std::lock_guard
l(_ref_id_lock
);
// The id must have been handed out by get_with_id() and not yet returned.
162 ceph_assert(_live_ids
.count(id
));
// Debug helper for ref-leak diagnosis: dump (at log level 0) every
// outstanding reference id from get_with_id() and every per-tag refcount
// from get(tag), all under _ref_id_lock.
169 void PG::dump_live_ids()
171 std::lock_guard
l(_ref_id_lock
);
172 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live ids:" << dendl
;
173 for (map
<uint64_t, string
>::iterator i
= _live_ids
.begin();
174 i
!= _live_ids
.end();
176 dout(0) << "\t\tid: " << *i
<< dendl
;
178 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live tags:" << dendl
;
179 for (map
<string
, uint64_t>::iterator i
= _tag_counts
.begin();
180 i
!= _tag_counts
.end();
182 dout(0) << "\t\tid: " << *i
<< dendl
;
187 PG::PG(OSDService
*o
, OSDMapRef curmap
,
188 const PGPool
&_pool
, spg_t p
) :
189 pg_whoami(o
->whoami
, p
.shard
),
194 osdriver(osd
->store
, coll_t(), OSD::make_snapmapper_oid()),
199 p
.get_split_bits(_pool
.info
.get_pg_num()),
202 trace_endpoint("0.0.0.0", 0, "PG"),
204 pgmeta_oid(p
.make_pgmeta_oid()),
205 stat_queue_item(this),
207 recovery_queued(false),
208 recovery_ops_active(0),
209 backfill_reserving(false),
210 pg_stats_publish_valid(false),
211 finish_sync_event(NULL
),
212 scrub_after_recovery(false),
222 pool(recovery_state
.get_pool()),
223 info(recovery_state
.get_info())
226 osd
->add_pgid(p
, this);
229 std::stringstream ss
;
230 ss
<< "PG " << info
.pgid
;
231 trace_endpoint
.copy_name(ss
.str());
238 osd
->remove_pgid(info
.pgid
, this);
// Acquire the PG's big lock.  With CEPH_DEBUG_MUTEX the no_lockdep flag is
// forwarded to suppress lockdep checking; the owning thread id is recorded
// (locked_by) for gen_prefix()'s "does this thread hold the lock" test.
// NOTE(review): the #else/#endif lines of the ifdef are elided here.
242 void PG::lock(bool no_lockdep
) const
244 #ifdef CEPH_DEBUG_MUTEX
245 _lock
.lock(no_lockdep
);
248 locked_by
= std::this_thread::get_id();
250 // if we have unrecorded dirty state with the lock dropped, there is a bug
251 ceph_assert(!recovery_state
.debug_has_dirty_state());
253 dout(30) << "lock" << dendl
;
// True when the PG's big mutex (_lock) is currently held (by any thread).
256 bool PG::is_locked() const
258 return ceph_mutex_is_locked(_lock
);
// Release the PG's big lock.  Any dirty recovery state must have been
// recorded before dropping the lock (same invariant as in lock()).
// NOTE(review): the locked_by reset and the _lock.unlock() call are on
// lines elided in this extract.
261 void PG::unlock() const
263 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
264 ceph_assert(!recovery_state
.debug_has_dirty_state());
265 #ifndef CEPH_DEBUG_MUTEX
// Build the per-line debug-log prefix.  If the calling thread holds the PG
// lock, the full PG state is streamed via operator<<; otherwise only the
// pg id plus "(unlocked)" is printed, since reading live state without the
// lock would race.  NOTE(review): the #else/#endif between the two
// lock-ownership tests (CEPH_DEBUG_MUTEX vs. recorded locked_by) and the
// else branch glue are elided in this extract.
271 std::ostream
& PG::gen_prefix(std::ostream
& out
) const
273 OSDMapRef mapref
= recovery_state
.get_osdmap();
274 #ifdef CEPH_DEBUG_MUTEX
275 if (_lock
.is_locked_by_me()) {
// Fallback ownership test using the thread id recorded in PG::lock().
277 if (locked_by
== std::this_thread::get_id()) {
279 out
<< "osd." << osd
->whoami
280 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
281 << " " << *this << " ";
283 out
<< "osd." << osd
->whoami
284 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
285 << " pg[" << pg_id
.pgid
<< "(unlocked)] ";
// Perf counters for the peering/recovery state machine (owned by the OSD).
290 PerfCounters
&PG::get_peering_perf() {
291 return *(osd
->recoverystate_perf
);
// The OSD's general perf-counter logger.
294 PerfCounters
&PG::get_perf_logger() {
295 return *(osd
->logger
);
// Record entry into a peering state in the OSD-wide recovery statistics.
298 void PG::log_state_enter(const char *state
) {
299 osd
->pg_recovery_stats
.log_enter(state
);
// Record exit from a peering state: accumulates the dwell time
// (now - enter_time), event count and event duration into the OSD-wide
// recovery statistics.
302 void PG::log_state_exit(
303 const char *state_name
, utime_t enter_time
,
304 uint64_t events
, utime_t event_dur
) {
305 osd
->pg_recovery_stats
.log_exit(
306 state_name
, ceph_clock_now() - enter_time
, events
, event_dur
);
309 /********* PG **********/
// Queue removal of an object together with its snap-mapper records in
// transaction t.  NOTE(review): the call that consumes the ghobject_t
// (presumably t.remove on our collection — confirm upstream) is on lines
// elided in this extract; the visible code builds the per-shard ghobject
// and then clears its snap mapping.
311 void PG::remove_snap_mapped_object(
312 ObjectStore::Transaction
&t
, const hobject_t
&soid
)
316 ghobject_t(soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
));
317 clear_object_snap_mapping(&t
, soid
);
// Remove soid's snap-mapper entry inside transaction t.  Only snapshot
// clones (snap < CEPH_MAXSNAP) have mappings.  -ENOENT is tolerated (the
// mapping may legitimately be absent); any other error is logged to derr.
320 void PG::clear_object_snap_mapping(
321 ObjectStore::Transaction
*t
, const hobject_t
&soid
)
323 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
324 if (soid
.snap
< CEPH_MAXSNAP
) {
325 int r
= snap_mapper
.remove_oid(
328 if (!(r
== 0 || r
== -ENOENT
)) {
329 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
// Replace soid's snap-mapper entry with the given snap set inside
// transaction t.  Caller must pass a snapshot clone (asserted:
// snap < CEPH_MAXSNAP).  The old mapping is removed first (-ENOENT
// tolerated, other errors logged); the re-add via snap_mapper is on lines
// elided in this extract.
335 void PG::update_object_snap_mapping(
336 ObjectStore::Transaction
*t
, const hobject_t
&soid
, const set
<snapid_t
> &snaps
)
338 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
339 ceph_assert(soid
.snap
< CEPH_MAXSNAP
);
340 int r
= snap_mapper
.remove_oid(
343 if (!(r
== 0 || r
== -ENOENT
)) {
344 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
353 /******* PG ***********/
// Drop primary-role soft state when this OSD stops acting as primary for
// the PG: reset the projected log, clear the snap-trim repeat queue,
// cancel the pending recovery-finish event, release backoffs, discard any
// replica scrub reservations, and clear the deferred-scrub flag.
354 void PG::clear_primary_state()
356 dout(20) << __func__
<< dendl
;
358 projected_log
= PGLog::IndexedLog();
361 snap_trimq_repeat
.clear();
362 finish_sync_event
= 0; // so that _finish_recovery doesn't go off in another thread
363 release_pg_backoffs();
366 m_scrubber
->discard_replica_reservations();
368 scrub_after_recovery
= false;
// Check the client session's OSD capabilities against this op.  Only
// CEPH_MSG_OSD_OP requests are checked (the early-return body for other
// types is elided here); an op whose connection has no Session is logged.
// The object's locator key falls back to the object name when empty.
374 bool PG::op_has_sufficient_caps(OpRequestRef
& op
)
377 if (op
->get_req()->get_type() != CEPH_MSG_OSD_OP
)
380 auto req
= op
->get_req
<MOSDOp
>();
381 auto priv
= req
->get_connection()->get_priv();
382 auto session
= static_cast<Session
*>(priv
.get());
384 dout(0) << "op_has_sufficient_caps: no session for op " << *req
<< dendl
;
387 OSDCap
& caps
= session
->caps
;
// Locator key: explicit hobject key if set, else the object name.
390 const string
&key
= req
->get_hobj().get_key().empty() ?
391 req
->get_oid().name
:
392 req
->get_hobj().get_key();
// The cap check considers pool name/namespace, pool application metadata,
// required read/write caps and the peer socket address (some argument
// lines, e.g. need_read_cap and classes, are elided in this extract).
394 bool cap
= caps
.is_capable(pool
.name
, req
->get_hobj().nspace
,
395 pool
.info
.application_metadata
,
398 op
->need_write_cap(),
400 session
->get_peer_socket_addr());
402 dout(20) << "op_has_sufficient_caps "
403 << "session=" << session
404 << " pool=" << pool
.id
<< " (" << pool
.name
405 << " " << req
->get_hobj().nspace
407 << " pool_app_metadata=" << pool
.info
.application_metadata
408 << " need_read_cap=" << op
->need_read_cap()
409 << " need_write_cap=" << op
->need_write_cap()
410 << " classes=" << op
->classes()
411 << " -> " << (cap
? "yes" : "NO")
// Ask the OSD to queue this PG for recovery.  Only a primary, peered PG
// may be queued, and only once (recovery_queued guards double-queueing).
416 void PG::queue_recovery()
418 if (!is_primary() || !is_peered()) {
419 dout(10) << "queue_recovery -- not primary or not peered " << dendl
;
// A non-primary / non-peered PG must never have been left queued.
420 ceph_assert(!recovery_queued
);
421 } else if (recovery_queued
) {
422 dout(10) << "queue_recovery -- already queued" << dendl
;
424 dout(10) << "queue_recovery -- queuing" << dendl
;
425 recovery_queued
= true;
426 osd
->queue_for_recovery(this);
// After a repair-triggered recovery completes, queue a high-priority deep
// scrub (with check_repair set) to verify the repair.  Requires the PG
// lock; bails out if a scrub is already running or already queued (the
// early-return bodies are on elided lines).
430 void PG::queue_scrub_after_repair()
432 dout(10) << __func__
<< dendl
;
433 ceph_assert(ceph_mutex_is_locked(_lock
));
435 m_planned_scrub
.must_deep_scrub
= true;
436 m_planned_scrub
.check_repair
= true;
437 m_planned_scrub
.must_scrub
= true;
439 if (is_scrubbing()) {
440 dout(10) << __func__
<< ": scrubbing already" << dendl
;
444 dout(10) << __func__
<< ": already queued" << dendl
;
// Freeze the planned-scrub flags into the scrubber, then hand the PG to
// the OSD scrub queue at high priority.
448 m_scrubber
->set_op_parameters(m_planned_scrub
);
449 dout(15) << __func__
<< ": queueing" << dendl
;
452 osd
->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority
);
// Scrub priority for this PG: the pool's SCRUB_PRIORITY option when set
// (> 0), otherwise the global osd_scrub_priority config value.
455 unsigned PG::get_scrub_priority()
457 // a higher value -> a higher priority
458 int64_t pool_scrub_priority
=
459 pool
.info
.opts
.value_or(pool_opts_t::SCRUB_PRIORITY
, (int64_t)0);
460 return pool_scrub_priority
> 0 ? pool_scrub_priority
: cct
->_conf
->osd_scrub_priority
;
// Called when recovery completes.  Requires last_complete == last_update
// (nothing left to recover), clears recovery bookkeeping, and returns a
// C_PG_FinishRecovery completion — also stored in finish_sync_event so a
// stale callback can be detected in _finish_recovery().
463 Context
*PG::finish_recovery()
465 dout(10) << "finish_recovery" << dendl
;
466 ceph_assert(info
.last_complete
== info
.last_update
);
468 clear_recovery_state();
471 * sync all this before purging strays. but don't block!
473 finish_sync_event
= new C_PG_FinishRecovery(this);
474 return finish_sync_event
;
// Completion callback from finish_recovery().  Under the PG lock, and only
// when c is still the current finish_sync_event (i.e. not stale: recovery
// was not cancelled/restarted in the meantime), clear the lingering REPAIR
// flag, purge strays, publish stats, and requeue a scrub if one was
// deferred until after recovery.
477 void PG::_finish_recovery(Context
* c
)
479 dout(15) << __func__
<< " finish_sync_event? " << finish_sync_event
<< " clean? "
480 << is_clean() << dendl
;
482 std::scoped_lock locker
{*this};
483 if (recovery_state
.is_deleting() || !is_clean()) {
484 dout(10) << __func__
<< " raced with delete or repair" << dendl
;
487 // When recovery is initiated by a repair, that flag is left on
488 state_clear(PG_STATE_REPAIR
);
489 if (c
== finish_sync_event
) {
490 dout(15) << __func__
<< " scrub_after_recovery? " << scrub_after_recovery
<< dendl
;
491 finish_sync_event
= 0;
492 recovery_state
.purge_strays();
494 publish_stats_to_osd();
496 if (scrub_after_recovery
) {
497 dout(10) << "_finish_recovery requeueing for scrub" << dendl
;
498 scrub_after_recovery
= false;
499 queue_scrub_after_repair();
502 dout(10) << "_finish_recovery -- stale" << dendl
;
// Account the start of a recovery op on soid: bump the active-op counter
// (and, when built with DEBUG_RECOVERY_OIDS, track the oid itself), then
// notify the OSD-wide accounting.
506 void PG::start_recovery_op(const hobject_t
& soid
)
508 dout(10) << "start_recovery_op " << soid
509 #ifdef DEBUG_RECOVERY_OIDS
510 << " (" << recovering_oids
<< ")"
513 ceph_assert(recovery_ops_active
>= 0);
514 recovery_ops_active
++;
515 #ifdef DEBUG_RECOVERY_OIDS
516 recovering_oids
.insert(soid
);
518 osd
->start_recovery_op(this, soid
);
// Account the completion of a recovery op on soid: decrement the active
// counter (must be > 0), drop the DEBUG_RECOVERY_OIDS tracking entry, and
// notify the OSD — optionally dequeueing the PG from the recovery queue.
521 void PG::finish_recovery_op(const hobject_t
& soid
, bool dequeue
)
523 dout(10) << "finish_recovery_op " << soid
524 #ifdef DEBUG_RECOVERY_OIDS
525 << " (" << recovering_oids
<< ")"
528 ceph_assert(recovery_ops_active
> 0);
529 recovery_ops_active
--;
530 #ifdef DEBUG_RECOVERY_OIDS
531 ceph_assert(recovering_oids
.count(soid
));
532 recovering_oids
.erase(recovering_oids
.find(soid
));
534 osd
->finish_recovery_op(this, soid
, dequeue
);
// Populate a newly created child PG during a PG split: split the peering
// state, update the child's snap-mapper hash bits, copy the snap-trim
// queues, run the backend-specific _split_into hook, then release all of
// our backoffs (clients will simply resend).
541 void PG::split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
543 recovery_state
.split_into(child_pgid
, &child
->recovery_state
, split_bits
);
545 child
->update_snap_mapper_bits(split_bits
);
547 child
->snap_trimq
= snap_trimq
;
548 child
->snap_trimq_repeat
= snap_trimq_repeat
;
550 _split_into(child_pgid
, child
, split_bits
);
552 // release all backoffs for simplicity
553 release_backoffs(hobject_t(), hobject_t::get_max());
// Delegate to PeeringState: compute per-child stat sums for a pending split.
556 void PG::start_split_stats(const set
<spg_t
>& childpgs
, vector
<object_stat_sum_t
> *out
)
558 recovery_state
.start_split_stats(childpgs
, out
);
// Delegate to PeeringState: fold the given stat delta back in after a
// split and persist it via transaction t.
561 void PG::finish_split_stats(const object_stat_sum_t
& stats
, ObjectStore::Transaction
&t
)
563 recovery_state
.finish_split_stats(stats
, t
);
// Absorb the given source PGs into this PG (PG merge): merge the peering
// state, then for each source delete its pgmeta object and merge (and
// destroy) its collection into ours.  Afterwards re-assert the collection
// hash bits — merge_collection normally does this, but all sources may
// have been missing — and update the snap mapper's bits to match.
566 void PG::merge_from(map
<spg_t
,PGRef
>& sources
, PeeringCtx
&rctx
,
568 const pg_merge_meta_t
& last_pg_merge_meta
)
570 dout(10) << __func__
<< " from " << sources
<< " split_bits " << split_bits
572 map
<spg_t
, PeeringState
*> source_ps
;
573 for (auto &&source
: sources
) {
574 source_ps
.emplace(source
.first
, &source
.second
->recovery_state
);
576 recovery_state
.merge_from(source_ps
, rctx
, split_bits
, last_pg_merge_meta
);
578 for (auto& i
: sources
) {
579 auto& source
= i
.second
;
580 // wipe out source's pgmeta
581 rctx
.transaction
.remove(source
->coll
, source
->pgmeta_oid
);
583 // merge (and destroy source collection)
584 rctx
.transaction
.merge_collection(source
->coll
, coll
, split_bits
);
587 // merge_collection does this, but maybe all of our sources were missing.
588 rctx
.transaction
.collection_set_bits(coll
, split_bits
);
590 snap_mapper
.update_bits(split_bits
);
593 void PG::add_backoff(const ceph::ref_t
<Session
>& s
, const hobject_t
& begin
, const hobject_t
& end
)
596 if (!con
) // OSD::ms_handle_reset clears s->con without a lock
598 auto b
= s
->have_backoff(info
.pgid
, begin
);
600 derr
<< __func__
<< " already have backoff for " << s
<< " begin " << begin
601 << " " << *b
<< dendl
;
604 std::lock_guard
l(backoff_lock
);
605 b
= ceph::make_ref
<Backoff
>(info
.pgid
, this, s
, ++s
->backoff_seq
, begin
, end
);
606 backoffs
[begin
].insert(b
);
608 dout(10) << __func__
<< " session " << s
<< " added " << *b
<< dendl
;
613 CEPH_OSD_BACKOFF_OP_BLOCK
,
619 void PG::release_backoffs(const hobject_t
& begin
, const hobject_t
& end
)
621 dout(10) << __func__
<< " [" << begin
<< "," << end
<< ")" << dendl
;
622 vector
<ceph::ref_t
<Backoff
>> bv
;
624 std::lock_guard
l(backoff_lock
);
625 auto p
= backoffs
.lower_bound(begin
);
626 while (p
!= backoffs
.end()) {
627 int r
= cmp(p
->first
, end
);
628 dout(20) << __func__
<< " ? " << r
<< " " << p
->first
629 << " " << p
->second
<< dendl
;
630 // note: must still examine begin=end=p->first case
631 if (r
> 0 || (r
== 0 && begin
< end
)) {
634 dout(20) << __func__
<< " checking " << p
->first
635 << " " << p
->second
<< dendl
;
636 auto q
= p
->second
.begin();
637 while (q
!= p
->second
.end()) {
638 dout(20) << __func__
<< " checking " << *q
<< dendl
;
639 int r
= cmp((*q
)->begin
, begin
);
640 if (r
== 0 || (r
> 0 && (*q
)->end
< end
)) {
642 q
= p
->second
.erase(q
);
647 if (p
->second
.empty()) {
648 p
= backoffs
.erase(p
);
655 std::lock_guard
l(b
->lock
);
656 dout(10) << __func__
<< " " << *b
<< dendl
;
658 ceph_assert(b
->pg
== this);
659 ConnectionRef con
= b
->session
->con
;
660 if (con
) { // OSD::ms_handle_reset clears s->con without a lock
665 CEPH_OSD_BACKOFF_OP_UNBLOCK
,
671 b
->state
= Backoff::STATE_DELETING
;
673 b
->session
->rm_backoff(b
);
681 void PG::clear_backoffs()
683 dout(10) << __func__
<< " " << dendl
;
684 map
<hobject_t
,set
<ceph::ref_t
<Backoff
>>> ls
;
686 std::lock_guard
l(backoff_lock
);
690 for (auto& b
: p
.second
) {
691 std::lock_guard
l(b
->lock
);
692 dout(10) << __func__
<< " " << *b
<< dendl
;
694 ceph_assert(b
->pg
== this);
696 b
->state
= Backoff::STATE_DELETING
;
698 b
->session
->rm_backoff(b
);
// Remove backoff b from this PG's index.  Called from
// Session::clear_backoffs() with b->lock already held (asserted); b must
// belong to this PG.  Absence from the index is tolerated because this may
// race with release_backoffs().
707 // called by Session::clear_backoffs()
708 void PG::rm_backoff(const ceph::ref_t
<Backoff
>& b
)
710 dout(10) << __func__
<< " " << *b
<< dendl
;
711 std::lock_guard
l(backoff_lock
);
712 ceph_assert(ceph_mutex_is_locked_by_me(b
->lock
));
713 ceph_assert(b
->pg
== this);
714 auto p
= backoffs
.find(b
->begin
);
715 // may race with release_backoffs()
716 if (p
!= backoffs
.end()) {
717 auto q
= p
->second
.find(b
);
718 if (q
!= p
->second
.end()) {
// Drop the now-empty per-hobject bucket (erase itself is elided here).
720 if (p
->second
.empty()) {
// Reset all recovery bookkeeping: cancel the pending sync event, finish
// (and dequeue) every still-active recovery op, clear backfill tracking,
// and invoke the backend-specific hook.
727 void PG::clear_recovery_state()
729 dout(10) << "clear_recovery_state" << dendl
;
731 finish_sync_event
= 0;
// Drain active ops one at a time; finish_recovery_op decrements
// recovery_ops_active.  (The non-debug soid selection is on elided lines.)
734 while (recovery_ops_active
> 0) {
735 #ifdef DEBUG_RECOVERY_OIDS
736 soid
= *recovering_oids
.begin();
738 finish_recovery_op(soid
, true);
741 backfill_info
.clear();
742 peer_backfill_info
.clear();
743 waiting_on_backfill
.clear();
744 _clear_recovery_state(); // pg impl specific hook
// Abort any in-progress recovery by clearing all recovery state.
747 void PG::cancel_recovery()
749 dout(10) << "cancel_recovery" << dendl
;
750 clear_recovery_state();
// Replace the set of OSDs we heartbeat because peering is probing them.
// Guarded by heartbeat_peer_lock, which the OSD heartbeat path also takes.
753 void PG::set_probe_targets(const set
<pg_shard_t
> &probe_set
)
755 std::lock_guard
l(heartbeat_peer_lock
);
756 probe_targets
.clear();
757 for (set
<pg_shard_t
>::iterator i
= probe_set
.begin();
758 i
!= probe_set
.end();
760 probe_targets
.insert(i
->osd
);
// Send m to the given OSD over the cluster messenger, using a connection
// for our current map epoch; optionally piggy-back an osdmap share first.
// NOTE(review): the null-connection early-return body is on elided lines.
764 void PG::send_cluster_message(
765 int target
, MessageRef m
,
766 epoch_t epoch
, bool share_map_update
=false)
768 ConnectionRef con
= osd
->get_con_osd_cluster(
769 target
, get_osdmap_epoch());
774 if (share_map_update
) {
775 osd
->maybe_share_map(con
.get(), get_osdmap());
777 osd
->send_message_osd_cluster(m
, con
.get());
// Drop all probe-driven heartbeat targets (probe round is finished).
780 void PG::clear_probe_targets()
782 std::lock_guard
l(heartbeat_peer_lock
);
783 probe_targets
.clear();
// Swap in a new heartbeat-peer set under heartbeat_peer_lock; if it
// changed, notify the OSD afterwards so it can refresh its heartbeat
// connections.  (The need_update assignment in the changed branch and the
// conditional around the notification are on elided lines.)
786 void PG::update_heartbeat_peers(set
<int> new_peers
)
788 bool need_update
= false;
789 heartbeat_peer_lock
.lock();
790 if (new_peers
== heartbeat_peers
) {
791 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " unchanged" << dendl
;
793 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " -> " << new_peers
<< dendl
;
794 heartbeat_peers
.swap(new_peers
);
// Notify only after dropping the lock.
797 heartbeat_peer_lock
.unlock();
800 osd
->need_heartbeat_peer_update();
// Look up request id r among ops already applied (or being applied):
// first in the in-memory projected log, then in the pg log.  On a hit the
// version / user_version / return code / per-op returns are filled in.
// NOTE(review): the version* parameter line and the surrounding
// return-expression glue are elided in this extract.
804 bool PG::check_in_progress_op(
805 const osd_reqid_t
&r
,
807 version_t
*user_version
,
809 vector
<pg_log_op_return_item_t
> *op_returns
813 projected_log
.get_request(r
, version
, user_version
, return_code
,
815 recovery_state
.get_pg_log().get_log().get_request(
816 r
, version
, user_version
, return_code
, op_returns
));
// Snapshot this PG's stats for the OSD to report upstream.  The published
// copy and its valid flag are guarded by pg_stats_publish_lock.
// NOTE(review): the primary-only early return and the check that stats has
// a value appear to be on elided lines — confirm upstream.
819 void PG::publish_stats_to_osd()
824 std::lock_guard l
{pg_stats_publish_lock
};
825 auto stats
= recovery_state
.prepare_stats_for_publish(
826 pg_stats_publish_valid
,
830 pg_stats_publish
= stats
.value();
831 pg_stats_publish_valid
= true;
// Target pg-log length, computed OSD-wide (shared across all PGs).
835 unsigned PG::get_target_pg_log_entries() const
837 return osd
->get_target_pg_log_entries();
// Invalidate the published stats snapshot (it will be rebuilt on the next
// publish_stats_to_osd()).
840 void PG::clear_publish_stats()
842 dout(15) << "clear_stats" << dendl
;
843 std::lock_guard l
{pg_stats_publish_lock
};
844 pg_stats_publish_valid
= false;
848 * initialize a newly instantiated pg
850 * Initialize PG state, as when a PG is initially created, or when it
851 * is first instantiated on the current node.
853 * @param role our role/rank
854 * @param newup up set
855 * @param newacting acting set
856 * @param history pg history
857 * @param pi past_intervals
858 * @param backfill true if info should be marked as backfill
859 * @param t transaction to write out our new state in
863 const vector
<int>& newup
, int new_up_primary
,
864 const vector
<int>& newacting
, int new_acting_primary
,
865 const pg_history_t
& history
,
866 const PastIntervals
& pi
,
868 ObjectStore::Transaction
&t
)
871 role
, newup
, new_up_primary
, newacting
,
872 new_acting_primary
, history
, pi
, backfill
, t
);
878 std::scoped_lock l
{*this};
879 recovery_state
.shutdown();
883 #pragma GCC diagnostic ignored "-Wpragmas"
884 #pragma GCC diagnostic push
885 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
887 void PG::upgrade(ObjectStore
*store
)
889 dout(0) << __func__
<< " " << info_struct_v
<< " -> " << pg_latest_struct_v
891 ceph_assert(info_struct_v
<= 10);
892 ObjectStore::Transaction t
;
894 // <do upgrade steps here>
897 ceph_assert(info_struct_v
== 10);
899 // update infover_key
900 if (info_struct_v
< pg_latest_struct_v
) {
901 map
<string
,bufferlist
> v
;
902 __u8 ver
= pg_latest_struct_v
;
903 encode(ver
, v
[string(infover_key
)]);
904 t
.omap_setkeys(coll
, pgmeta_oid
, v
);
907 recovery_state
.force_write_state(t
);
909 ObjectStore::CollectionHandle ch
= store
->open_collection(coll
);
910 int r
= store
->queue_transaction(ch
, std::move(t
));
912 derr
<< __func__
<< ": queue_transaction returned "
913 << cpp_strerror(r
) << dendl
;
919 if (!ch
->flush_commit(&waiter
)) {
924 #pragma GCC diagnostic pop
925 #pragma GCC diagnostic warning "-Wpragmas"
927 void PG::prepare_write(
929 pg_info_t
&last_written_info
,
930 PastIntervals
&past_intervals
,
934 bool need_write_epoch
,
935 ObjectStore::Transaction
&t
)
937 info
.stats
.stats
.add(unstable_stats
);
938 unstable_stats
.clear();
939 map
<string
,bufferlist
> km
;
940 string key_to_remove
;
941 if (dirty_big_info
|| dirty_info
) {
942 int ret
= prepare_info_keymap(
952 cct
->_conf
->osd_fast_info
,
955 ceph_assert(ret
== 0);
957 pglog
.write_log_and_missing(
958 t
, &km
, coll
, pgmeta_oid
, pool
.info
.require_rollback());
960 t
.omap_setkeys(coll
, pgmeta_oid
, km
);
961 if (!key_to_remove
.empty())
962 t
.omap_rmkey(coll
, pgmeta_oid
, key_to_remove
);
965 #pragma GCC diagnostic ignored "-Wpragmas"
966 #pragma GCC diagnostic push
967 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
969 bool PG::_has_removal_flag(ObjectStore
*store
,
973 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
977 keys
.insert("_remove");
978 map
<string
,bufferlist
> values
;
979 auto ch
= store
->open_collection(coll
);
981 if (store
->omap_get_values(ch
, pgmeta_oid
, keys
, &values
) == 0 &&
988 int PG::peek_map_epoch(ObjectStore
*store
,
993 ghobject_t
legacy_infos_oid(OSD::make_infos_oid());
994 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
995 epoch_t cur_epoch
= 0;
997 // validate collection name
998 ceph_assert(coll
.is_pg());
1002 keys
.insert(string(infover_key
));
1003 keys
.insert(string(epoch_key
));
1004 map
<string
,bufferlist
> values
;
1005 auto ch
= store
->open_collection(coll
);
1007 int r
= store
->omap_get_values(ch
, pgmeta_oid
, keys
, &values
);
1009 ceph_assert(values
.size() == 2);
1011 // sanity check version
1012 auto bp
= values
[string(infover_key
)].cbegin();
1014 decode(struct_v
, bp
);
1015 ceph_assert(struct_v
>= 8);
1018 bp
= values
[string(epoch_key
)].begin();
1019 decode(cur_epoch
, bp
);
1021 // probably bug 10617; see OSD::load_pgs()
1025 *pepoch
= cur_epoch
;
1029 #pragma GCC diagnostic pop
1030 #pragma GCC diagnostic warning "-Wpragmas"
1032 bool PG::check_log_for_corruption(ObjectStore
*store
)
1034 /// TODO: this method needs to work with the omap log
1038 //! Get the name we're going to save our corrupt page log as
1039 std::string
PG::get_corrupt_pg_log_name() const
1041 const int MAX_BUF
= 512;
1044 time_t my_time(time(NULL
));
1045 const struct tm
*t
= localtime_r(&my_time
, &tm_buf
);
1046 int ret
= strftime(buf
, sizeof(buf
), "corrupt_log_%Y-%m-%d_%k:%M_", t
);
1048 dout(0) << "strftime failed" << dendl
;
1049 return "corrupt_log_unknown_time";
1052 out
+= stringify(info
.pgid
);
1057 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
,
1058 pg_info_t
&info
, PastIntervals
&past_intervals
,
1062 keys
.insert(string(infover_key
));
1063 keys
.insert(string(info_key
));
1064 keys
.insert(string(biginfo_key
));
1065 keys
.insert(string(fastinfo_key
));
1066 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
1067 map
<string
,bufferlist
> values
;
1068 auto ch
= store
->open_collection(coll
);
1070 int r
= store
->omap_get_values(ch
, pgmeta_oid
, keys
, &values
);
1071 ceph_assert(r
== 0);
1072 ceph_assert(values
.size() == 3 ||
1073 values
.size() == 4);
1075 auto p
= values
[string(infover_key
)].cbegin();
1076 decode(struct_v
, p
);
1077 ceph_assert(struct_v
>= 10);
1079 p
= values
[string(info_key
)].begin();
1082 p
= values
[string(biginfo_key
)].begin();
1083 decode(past_intervals
, p
);
1084 decode(info
.purged_snaps
, p
);
1086 p
= values
[string(fastinfo_key
)].begin();
1088 pg_fast_info_t fast
;
1090 fast
.try_apply_to(&info
);
1095 void PG::read_state(ObjectStore
*store
)
1097 PastIntervals past_intervals_from_disk
;
1098 pg_info_t info_from_disk
;
1104 past_intervals_from_disk
,
1106 ceph_assert(r
>= 0);
1108 if (info_struct_v
< pg_compat_struct_v
) {
1109 derr
<< "PG needs upgrade, but on-disk data is too old; upgrade to"
1110 << " an older version first." << dendl
;
1111 ceph_abort_msg("PG too old to upgrade");
1114 recovery_state
.init_from_disk_state(
1115 std::move(info_from_disk
),
1116 std::move(past_intervals_from_disk
),
1117 [this, store
] (PGLog
&pglog
) {
1119 pglog
.read_log_and_missing(
1125 cct
->_conf
->osd_ignore_stale_divergent_priors
,
1126 cct
->_conf
->osd_debug_verify_missing_on_start
);
1129 osd
->clog
->error() << oss
.str();
1133 if (info_struct_v
< pg_latest_struct_v
) {
1137 // initialize current mapping
1139 int primary
, up_primary
;
1140 vector
<int> acting
, up
;
1141 get_osdmap()->pg_to_up_acting_osds(
1142 pg_id
.pgid
, &up
, &up_primary
, &acting
, &primary
);
1143 recovery_state
.init_primary_up_acting(
1148 recovery_state
.set_role(OSDMap::calc_pg_role(pg_whoami
, acting
));
1151 // init pool options
1152 store
->set_collection_opts(ch
, pool
.info
.opts
);
1154 PeeringCtx
rctx(ceph_release_t::unknown
);
1155 handle_initialize(rctx
);
1156 // note: we don't activate here because we know the OSD will advance maps
1158 write_if_dirty(rctx
.transaction
);
1159 store
->queue_transaction(ch
, std::move(rctx
.transaction
));
1162 void PG::update_snap_map(
1163 const vector
<pg_log_entry_t
> &log_entries
,
1164 ObjectStore::Transaction
&t
)
1166 for (auto i
= log_entries
.cbegin(); i
!= log_entries
.cend(); ++i
) {
1167 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
1168 if (i
->soid
.snap
< CEPH_MAXSNAP
) {
1169 if (i
->is_delete()) {
1170 int r
= snap_mapper
.remove_oid(
1174 derr
<< __func__
<< " remove_oid " << i
->soid
<< " failed with " << r
<< dendl
;
1175 // On removal tolerate missing key corruption
1176 ceph_assert(r
== 0 || r
== -ENOENT
);
1177 } else if (i
->is_update()) {
1178 ceph_assert(i
->snaps
.length() > 0);
1179 vector
<snapid_t
> snaps
;
1180 bufferlist snapbl
= i
->snaps
;
1181 auto p
= snapbl
.cbegin();
1185 derr
<< __func__
<< " decode snaps failure on " << *i
<< dendl
;
1188 set
<snapid_t
> _snaps(snaps
.begin(), snaps
.end());
1190 if (i
->is_clone() || i
->is_promote()) {
1191 snap_mapper
.add_oid(
1195 } else if (i
->is_modify()) {
1196 int r
= snap_mapper
.update_snaps(
1201 ceph_assert(r
== 0);
1203 ceph_assert(i
->is_clean());
// Filter snap ids that are being trimmed (snap_trimq) or already purged
// (info.purged_snaps) out of a client-supplied snap context.  The copy
// into newsnaps is deferred until the first filtered entry, so the common
// nothing-to-filter case does no copying at all.
1211 * filter trimming|trimmed snaps out of snapcontext
1213 void PG::filter_snapc(vector
<snapid_t
> &snaps
)
1215 // nothing needs to trim, we can return immediately
1216 if (snap_trimq
.empty() && info
.purged_snaps
.empty())
1219 bool filtering
= false;
1220 vector
<snapid_t
> newsnaps
;
1221 for (vector
<snapid_t
>::iterator p
= snaps
.begin();
1224 if (snap_trimq
.contains(*p
) || info
.purged_snaps
.contains(*p
)) {
1226 // start building a new vector with what we've seen so far
1227 dout(10) << "filter_snapc filtering " << snaps
<< dendl
;
1228 newsnaps
.insert(newsnaps
.begin(), snaps
.begin(), p
);
1231 dout(20) << "filter_snapc removing trimq|purged snap " << *p
<< dendl
;
1234 newsnaps
.push_back(*p
); // continue building new vector
// Only swap in the filtered vector when filtering actually happened
// (the if(filtering) guard is on elided lines).
1238 snaps
.swap(newsnaps
);
1239 dout(10) << "filter_snapc result " << snaps
<< dendl
;
// Requeue every per-object list of waiting ops in m back into the op queue.
1243 void PG::requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
)
1245 for (auto it
= m
.begin(); it
!= m
.end(); ++it
)
1246 requeue_ops(it
->second
);
1250 void PG::requeue_op(OpRequestRef op
)
1252 auto p
= waiting_for_map
.find(op
->get_source());
1253 if (p
!= waiting_for_map
.end()) {
1254 dout(20) << __func__
<< " " << op
<< " (waiting_for_map " << p
->first
<< ")"
1256 p
->second
.push_front(op
);
1258 dout(20) << __func__
<< " " << op
<< dendl
;
1261 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(info
.pgid
, op
)),
1262 op
->get_req()->get_cost(),
1263 op
->get_req()->get_priority(),
1264 op
->get_req()->get_recv_stamp(),
1265 op
->get_req()->get_source().num(),
1266 get_osdmap_epoch()));
1270 void PG::requeue_ops(list
<OpRequestRef
> &ls
)
1272 for (list
<OpRequestRef
>::reverse_iterator i
= ls
.rbegin();
1280 void PG::requeue_map_waiters()
1282 epoch_t epoch
= get_osdmap_epoch();
1283 auto p
= waiting_for_map
.begin();
1284 while (p
!= waiting_for_map
.end()) {
1285 if (epoch
< p
->second
.front()->min_epoch
) {
1286 dout(20) << __func__
<< " " << p
->first
<< " front op "
1287 << p
->second
.front() << " must still wait, doing nothing"
1291 dout(20) << __func__
<< " " << p
->first
<< " " << p
->second
<< dendl
;
1292 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
1294 osd
->enqueue_front(OpSchedulerItem(
1295 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(info
.pgid
, req
)),
1296 req
->get_req()->get_cost(),
1297 req
->get_req()->get_priority(),
1298 req
->get_req()->get_recv_stamp(),
1299 req
->get_req()->get_source().num(),
1302 p
= waiting_for_map
.erase(p
);
// Whether an operator-requested ("must") scrub is pending for this PG.
1307 bool PG::get_must_scrub() const
1309 dout(20) << __func__
<< " must_scrub? " << (m_planned_scrub
.must_scrub
? "true" : "false") << dendl
;
1310 return m_planned_scrub
.must_scrub
;
// Delegate to the scrubber: priority to use when requeueing a scrub event.
1313 unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority
) const
1315 return m_scrubber
->scrub_requeue_priority(with_priority
);
// Overload taking a suggested priority, forwarded to the scrubber.
1318 unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority
, unsigned int suggested_priority
) const
1320 return m_scrubber
->scrub_requeue_priority(with_priority
, suggested_priority
);
1323 // ==========================================================================================
1327 * implementation note:
1328 * PG::sched_scrub() is called only once per a specific scrub session.
1329 * That call commits us to the whatever choices are made (deep/shallow, etc').
1330 * Unless failing to start scrubbing, the 'planned scrub' flag-set is 'frozen' into
1331 * PgScrubber's m_flags, then cleared.
1333 bool PG::sched_scrub()
1335 dout(15) << __func__
<< " pg(" << info
.pgid
1336 << (is_active() ? ") <active>" : ") <not-active>")
1337 << (is_clean() ? " <clean>" : " <not-clean>") << dendl
;
1338 ceph_assert(ceph_mutex_is_locked(_lock
));
1339 ceph_assert(!is_scrubbing());
1341 if (!is_primary() || !is_active() || !is_clean()) {
1346 // only applicable to the very first time a scrub event is queued
1347 // (until handled and posted to the scrub FSM)
1348 dout(10) << __func__
<< ": already queued" << dendl
;
1352 // analyse the combination of the requested scrub flags, the osd/pool configuration
1353 // and the PG status to determine whether we should scrub now, and what type of scrub
1355 auto updated_flags
= verify_scrub_mode();
1356 if (!updated_flags
) {
1357 // the stars do not align for starting a scrub for this PG at this time
1358 // (due to configuration or priority issues)
1359 // The reason was already reported by the callee.
1360 dout(10) << __func__
<< ": failed to initiate a scrub" << dendl
;
1364 // try to reserve the local OSD resources. If failing: no harm. We will
1365 // be retried by the OSD later on.
1366 if (!m_scrubber
->reserve_local()) {
1367 dout(10) << __func__
<< ": failed to reserve locally" << dendl
;
1371 // can commit to the updated flags now, as nothing will stop the scrub
1372 m_planned_scrub
= *updated_flags
;
1374 // An interrupted recovery repair could leave this set.
1375 state_clear(PG_STATE_REPAIR
);
1377 // Pass control to the scrubber. It is the scrubber that handles the replicas'
1378 // resources reservations.
1379 m_scrubber
->set_op_parameters(m_planned_scrub
);
1381 dout(10) << __func__
<< ": queueing" << dendl
;
1383 scrub_queued
= true;
1384 osd
->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority
);
1388 double PG::next_deepscrub_interval() const
1390 double deep_scrub_interval
=
1391 pool
.info
.opts
.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL
, 0.0);
1392 if (deep_scrub_interval
<= 0.0)
1393 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
1394 return info
.history
.last_deep_scrub_stamp
+ deep_scrub_interval
;
1397 bool PG::is_time_for_deep(bool allow_deep_scrub
,
1399 bool has_deep_errors
,
1400 const requested_scrub_t
& planned
) const
1402 dout(10) << __func__
<< ": need_auto?" << planned
.need_auto
<< " allow_deep_scrub? " << allow_deep_scrub
<< dendl
;
1404 if (!allow_deep_scrub
)
1407 if (planned
.need_auto
) {
1408 dout(10) << __func__
<< ": need repair after scrub errors" << dendl
;
1412 if (ceph_clock_now() >= next_deepscrub_interval())
1415 if (has_deep_errors
) {
1416 osd
->clog
->info() << "osd." << osd
->whoami
<< " pg " << info
.pgid
1417 << " Deep scrub errors, upgrading scrub to deep-scrub";
1421 // we only flip coins if 'allow_scrub' is asserted. Otherwise - as this function is
1422 // called often, we will probably be deep-scrubbing most of the time.
1424 bool deep_coin_flip
=
1425 (rand() % 100) < cct
->_conf
->osd_deep_scrub_randomize_ratio
* 100;
1427 dout(15) << __func__
<< ": time_for_deep=" << planned
.time_for_deep
1428 << " deep_coin_flip=" << deep_coin_flip
<< dendl
;
1437 bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub
,
1438 bool try_to_auto_repair
,
1439 bool allow_regular_scrub
,
1440 bool has_deep_errors
,
1441 requested_scrub_t
& planned
) const
1444 ceph_assert(!planned
.must_deep_scrub
&& !planned
.must_repair
);
1446 if (!allow_deep_scrub
&& has_deep_errors
) {
1448 << "osd." << osd
->whoami
<< " pg " << info
.pgid
1449 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
1453 if (allow_deep_scrub
) {
1454 // Initial entry and scheduled scrubs without nodeep_scrub set get here
1456 planned
.time_for_deep
=
1457 is_time_for_deep(allow_deep_scrub
, allow_regular_scrub
, has_deep_errors
, planned
);
1459 if (try_to_auto_repair
) {
1460 if (planned
.time_for_deep
) {
1461 dout(20) << __func__
<< ": auto repair with deep scrubbing" << dendl
;
1462 planned
.auto_repair
= true;
1463 } else if (allow_regular_scrub
) {
1464 dout(20) << __func__
<< ": auto repair with scrubbing, rescrub if errors found"
1466 planned
.deep_scrub_on_error
= true;
1471 dout(20) << __func__
<< " updated flags: " << planned
1472 << " allow_regular_scrub: " << allow_regular_scrub
<< dendl
;
1474 // NOSCRUB so skip regular scrubs
1475 if (!allow_regular_scrub
&& !planned
.time_for_deep
) {
1482 std::optional
<requested_scrub_t
> PG::verify_scrub_mode() const
1484 dout(10) << __func__
<< " processing pg " << info
.pgid
<< dendl
;
1486 bool allow_deep_scrub
= !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
1487 pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
));
1488 bool allow_regular_scrub
= !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
1489 pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
));
1490 bool has_deep_errors
= (info
.stats
.stats
.sum
.num_deep_scrub_errors
> 0);
1491 bool try_to_auto_repair
=
1492 (cct
->_conf
->osd_scrub_auto_repair
&& get_pgbackend()->auto_repair_supported());
1494 auto upd_flags
= m_planned_scrub
;
1496 upd_flags
.time_for_deep
= false;
1497 // Clear these in case user issues the scrub/repair command during
1498 // the scheduling of the scrub/repair (e.g. request reservation)
1499 upd_flags
.deep_scrub_on_error
= false;
1500 upd_flags
.auto_repair
= false;
1502 if (upd_flags
.must_scrub
&& !upd_flags
.must_deep_scrub
&& has_deep_errors
) {
1503 osd
->clog
->error() << "osd." << osd
->whoami
<< " pg " << info
.pgid
1504 << " Regular scrub request, deep-scrub details will be lost";
1507 if (!upd_flags
.must_scrub
) {
1508 // All periodic scrub handling goes here because must_scrub is
1509 // always set for must_deep_scrub and must_repair.
1511 bool can_start_periodic
=
1512 verify_periodic_scrub_mode(allow_deep_scrub
, try_to_auto_repair
,
1513 allow_regular_scrub
, has_deep_errors
, upd_flags
);
1514 if (!can_start_periodic
) {
1515 return std::nullopt
;
1519 // scrubbing while recovering?
1521 bool prevented_by_recovery
=
1522 osd
->is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
&&
1523 (!cct
->_conf
->osd_repair_during_recovery
|| !upd_flags
.must_repair
);
1525 if (prevented_by_recovery
) {
1526 dout(20) << __func__
<< ": scrubbing prevented during recovery" << dendl
;
1527 return std::nullopt
;
1530 upd_flags
.need_auto
= false;
1534 void PG::reg_next_scrub()
1536 m_scrubber
->reg_next_scrub(m_planned_scrub
);
1539 void PG::on_info_history_change()
1542 m_scrubber
->unreg_next_scrub();
1543 m_scrubber
->reg_next_scrub(m_planned_scrub
);
1547 void PG::scrub_requested(scrub_level_t scrub_level
, scrub_type_t scrub_type
)
1549 m_scrubber
->scrub_requested(scrub_level
, scrub_type
, m_planned_scrub
);
1552 void PG::clear_ready_to_merge() {
1553 osd
->clear_ready_to_merge(this);
1556 void PG::queue_want_pg_temp(const vector
<int> &wanted
) {
1557 osd
->queue_want_pg_temp(get_pgid().pgid
, wanted
);
1560 void PG::clear_want_pg_temp() {
1561 osd
->remove_want_pg_temp(get_pgid().pgid
);
1564 void PG::on_role_change() {
1565 requeue_ops(waiting_for_peered
);
1566 plpg_on_role_change();
1569 void PG::on_new_interval() {
1570 dout(20) << __func__
<< " scrub_queued was " << scrub_queued
<< " flags: " << m_planned_scrub
<< dendl
;
1571 scrub_queued
= false;
1572 projected_last_update
= eversion_t();
1576 epoch_t
PG::oldest_stored_osdmap() {
1577 return osd
->get_superblock().oldest_map
;
1580 OstreamTemp
PG::get_clog_info() {
1581 return osd
->clog
->info();
1584 OstreamTemp
PG::get_clog_debug() {
1585 return osd
->clog
->debug();
1588 OstreamTemp
PG::get_clog_error() {
1589 return osd
->clog
->error();
1592 void PG::schedule_event_after(
1593 PGPeeringEventRef event
,
1595 std::lock_guard
lock(osd
->recovery_request_lock
);
1596 osd
->recovery_request_timer
.add_event_after(
1598 new QueuePeeringEvt(
1603 void PG::request_local_background_io_reservation(
1605 PGPeeringEventURef on_grant
,
1606 PGPeeringEventURef on_preempt
) {
1607 osd
->local_reserver
.request_reservation(
1609 on_grant
? new QueuePeeringEvt(
1610 this, std::move(on_grant
)) : nullptr,
1612 on_preempt
? new QueuePeeringEvt(
1613 this, std::move(on_preempt
)) : nullptr);
1616 void PG::update_local_background_io_priority(
1617 unsigned priority
) {
1618 osd
->local_reserver
.update_priority(
1623 void PG::cancel_local_background_io_reservation() {
1624 osd
->local_reserver
.cancel_reservation(
1628 void PG::request_remote_recovery_reservation(
1630 PGPeeringEventURef on_grant
,
1631 PGPeeringEventURef on_preempt
) {
1632 osd
->remote_reserver
.request_reservation(
1634 on_grant
? new QueuePeeringEvt(
1635 this, std::move(on_grant
)) : nullptr,
1637 on_preempt
? new QueuePeeringEvt(
1638 this, std::move(on_preempt
)) : nullptr);
1641 void PG::cancel_remote_recovery_reservation() {
1642 osd
->remote_reserver
.cancel_reservation(
1646 void PG::schedule_event_on_commit(
1647 ObjectStore::Transaction
&t
,
1648 PGPeeringEventRef on_commit
)
1650 t
.register_on_commit(new QueuePeeringEvt(this, on_commit
));
1653 void PG::on_activate(interval_set
<snapid_t
> snaps
)
1655 ceph_assert(!m_scrubber
->are_callbacks_pending());
1656 ceph_assert(callbacks_for_degraded_object
.empty());
1658 release_pg_backoffs();
1659 projected_last_update
= info
.last_update
;
1662 void PG::on_active_exit()
1664 backfill_reserving
= false;
1668 void PG::on_active_advmap(const OSDMapRef
&osdmap
)
1670 const auto& new_removed_snaps
= osdmap
->get_new_removed_snaps();
1671 auto i
= new_removed_snaps
.find(get_pgid().pool());
1672 if (i
!= new_removed_snaps
.end()) {
1674 for (auto j
: i
->second
) {
1675 if (snap_trimq
.intersects(j
.first
, j
.second
)) {
1676 decltype(snap_trimq
) added
, overlap
;
1677 added
.insert(j
.first
, j
.second
);
1678 overlap
.intersection_of(snap_trimq
, added
);
1679 derr
<< __func__
<< " removed_snaps already contains "
1680 << overlap
<< dendl
;
1682 snap_trimq
.union_of(added
);
1684 snap_trimq
.insert(j
.first
, j
.second
);
1687 dout(10) << __func__
<< " new removed_snaps " << i
->second
1688 << ", snap_trimq now " << snap_trimq
<< dendl
;
1689 ceph_assert(!bad
|| !cct
->_conf
->osd_debug_verify_cached_snaps
);
1692 const auto& new_purged_snaps
= osdmap
->get_new_purged_snaps();
1693 auto j
= new_purged_snaps
.find(get_pgid().pgid
.pool());
1694 if (j
!= new_purged_snaps
.end()) {
1696 for (auto k
: j
->second
) {
1697 if (!recovery_state
.get_info().purged_snaps
.contains(k
.first
, k
.second
)) {
1698 interval_set
<snapid_t
> rm
, overlap
;
1699 rm
.insert(k
.first
, k
.second
);
1700 overlap
.intersection_of(recovery_state
.get_info().purged_snaps
, rm
);
1701 derr
<< __func__
<< " purged_snaps does not contain "
1702 << rm
<< ", only " << overlap
<< dendl
;
1703 recovery_state
.adjust_purged_snaps(
1704 [&overlap
](auto &purged_snaps
) {
1705 purged_snaps
.subtract(overlap
);
1707 // This can currently happen in the normal (if unlikely) course of
1708 // events. Because adding snaps to purged_snaps does not increase
1709 // the pg version or add a pg log entry, we don't reliably propagate
1710 // purged_snaps additions to other OSDs.
1713 // - primary and replicas update purged_snaps
1714 // - no object updates
1715 // - pg mapping changes, new primary on different node
1716 // - new primary pg version == eversion_t(), so info is not
1720 recovery_state
.adjust_purged_snaps(
1721 [&k
](auto &purged_snaps
) {
1722 purged_snaps
.erase(k
.first
, k
.second
);
1726 dout(10) << __func__
<< " new purged_snaps " << j
->second
1727 << ", now " << recovery_state
.get_info().purged_snaps
<< dendl
;
1728 ceph_assert(!bad
|| !cct
->_conf
->osd_debug_verify_cached_snaps
);
1732 void PG::queue_snap_retrim(snapid_t snap
)
1736 dout(10) << __func__
<< " snap " << snap
<< " - not active and primary"
1740 if (!snap_trimq
.contains(snap
)) {
1741 snap_trimq
.insert(snap
);
1742 snap_trimq_repeat
.insert(snap
);
1743 dout(20) << __func__
<< " snap " << snap
1744 << ", trimq now " << snap_trimq
1745 << ", repeat " << snap_trimq_repeat
<< dendl
;
1748 dout(20) << __func__
<< " snap " << snap
1749 << " already in trimq " << snap_trimq
<< dendl
;
1753 void PG::on_active_actmap()
1755 if (cct
->_conf
->osd_check_for_log_corruption
)
1756 check_log_for_corruption(osd
->store
);
1759 if (recovery_state
.is_active()) {
1760 dout(10) << "Active: kicking snap trim" << dendl
;
1764 if (recovery_state
.is_peered() &&
1765 !recovery_state
.is_clean() &&
1766 !recovery_state
.get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
) &&
1767 (!recovery_state
.get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) ||
1768 recovery_state
.is_degraded())) {
1773 void PG::on_backfill_reserved()
1775 backfill_reserving
= false;
1779 void PG::on_backfill_canceled()
1781 if (!waiting_on_backfill
.empty()) {
1782 waiting_on_backfill
.clear();
1783 finish_recovery_op(hobject_t::get_max());
1787 void PG::on_recovery_reserved()
1792 void PG::set_not_ready_to_merge_target(pg_t pgid
, pg_t src
)
1794 osd
->set_not_ready_to_merge_target(pgid
, src
);
1797 void PG::set_not_ready_to_merge_source(pg_t pgid
)
1799 osd
->set_not_ready_to_merge_source(pgid
);
1802 void PG::set_ready_to_merge_target(eversion_t lu
, epoch_t les
, epoch_t lec
)
1804 osd
->set_ready_to_merge_target(this, lu
, les
, lec
);
1807 void PG::set_ready_to_merge_source(eversion_t lu
)
1809 osd
->set_ready_to_merge_source(this, lu
);
1812 void PG::send_pg_created(pg_t pgid
)
1814 osd
->send_pg_created(pgid
);
1817 ceph::signedspan
PG::get_mnow()
1819 return osd
->get_mnow();
1822 HeartbeatStampsRef
PG::get_hb_stamps(int peer
)
1824 return osd
->get_hb_stamps(peer
);
1827 void PG::schedule_renew_lease(epoch_t lpr
, ceph::timespan delay
)
1829 auto spgid
= info
.pgid
;
1831 osd
->mono_timer
.add_event(
1834 o
->queue_renew_lease(lpr
, spgid
);
1838 void PG::queue_check_readable(epoch_t lpr
, ceph::timespan delay
)
1840 osd
->queue_check_readable(info
.pgid
, lpr
, delay
);
1843 void PG::rebuild_missing_set_with_deletes(PGLog
&pglog
)
1845 pglog
.rebuild_missing_set_with_deletes(
1848 recovery_state
.get_info());
1851 void PG::on_activate_committed()
1853 if (!is_primary()) {
1855 if (recovery_state
.needs_flush() == 0) {
1856 requeue_ops(waiting_for_peered
);
1857 } else if (!waiting_for_peered
.empty()) {
1858 dout(10) << __func__
<< " flushes in progress, moving "
1859 << waiting_for_peered
.size() << " items to waiting_for_flush"
1861 ceph_assert(waiting_for_flush
.empty());
1862 waiting_for_flush
.swap(waiting_for_peered
);
1867 // Compute pending backfill data
1868 static int64_t pending_backfill(CephContext
*cct
, int64_t bf_bytes
, int64_t local_bytes
)
1870 lgeneric_dout(cct
, 20) << __func__
<< " Adjust local usage "
1871 << (local_bytes
>> 10) << "KiB"
1872 << " primary usage " << (bf_bytes
>> 10)
1875 return std::max((int64_t)0, bf_bytes
- local_bytes
);
1879 // We can zero the value of primary num_bytes as just an atomic.
1880 // However, setting above zero reserves space for backfill and requires
1881 // the OSDService::stat_lock which protects all OSD usage
1882 bool PG::try_reserve_recovery_space(
1883 int64_t primary_bytes
, int64_t local_bytes
) {
1884 // Use tentative_bacfill_full() to make sure enough
1885 // space is available to handle target bytes from primary.
1887 // TODO: If we passed num_objects from primary we could account for
1888 // an estimate of the metadata overhead.
1890 // TODO: If we had compressed_allocated and compressed_original from primary
1891 // we could compute compression ratio and adjust accordingly.
1893 // XXX: There is no way to get omap overhead and this would only apply
1894 // to whatever possibly different partition that is storing the database.
1896 // update_osd_stat() from heartbeat will do this on a new
1897 // statfs using ps->primary_bytes.
1898 uint64_t pending_adjustment
= 0;
1899 if (primary_bytes
) {
1900 // For erasure coded pool overestimate by a full stripe per object
1901 // because we don't know how each objected rounded to the nearest stripe
1902 if (pool
.info
.is_erasure()) {
1903 primary_bytes
/= (int)get_pgbackend()->get_ec_data_chunk_count();
1904 primary_bytes
+= get_pgbackend()->get_ec_stripe_chunk_size() *
1905 info
.stats
.stats
.sum
.num_objects
;
1906 local_bytes
/= (int)get_pgbackend()->get_ec_data_chunk_count();
1907 local_bytes
+= get_pgbackend()->get_ec_stripe_chunk_size() *
1908 info
.stats
.stats
.sum
.num_objects
;
1910 pending_adjustment
= pending_backfill(
1914 dout(10) << __func__
<< " primary_bytes " << (primary_bytes
>> 10)
1916 << " local " << (local_bytes
>> 10) << "KiB"
1917 << " pending_adjustments " << (pending_adjustment
>> 10) << "KiB"
1921 // This lock protects not only the stats OSDService but also setting the
1922 // pg primary_bytes. That's why we don't immediately unlock
1923 std::lock_guard l
{osd
->stat_lock
};
1924 osd_stat_t cur_stat
= osd
->osd_stat
;
1925 if (cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
1926 (rand()%1000 < (cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
1927 dout(10) << "backfill reservation rejected: failure injection"
1930 } else if (!cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
1931 osd
->tentative_backfill_full(this, pending_adjustment
, cur_stat
)) {
1932 dout(10) << "backfill reservation rejected: backfill full"
1936 // Don't reserve space if skipped reservation check, this is used
1937 // to test the other backfill full check AND in case a corruption
1938 // of num_bytes requires ignoring that value and trying the
1940 if (primary_bytes
&&
1941 !cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
) {
1942 primary_num_bytes
.store(primary_bytes
);
1943 local_num_bytes
.store(local_bytes
);
1945 unreserve_recovery_space();
1951 void PG::unreserve_recovery_space() {
1952 primary_num_bytes
.store(0);
1953 local_num_bytes
.store(0);
1956 void PG::_scan_rollback_obs(const vector
<ghobject_t
> &rollback_obs
)
1958 ObjectStore::Transaction t
;
1959 eversion_t trimmed_to
= recovery_state
.get_last_rollback_info_trimmed_to_applied();
1960 for (vector
<ghobject_t
>::const_iterator i
= rollback_obs
.begin();
1961 i
!= rollback_obs
.end();
1963 if (i
->generation
< trimmed_to
.version
) {
1964 dout(10) << __func__
<< "osd." << osd
->whoami
1965 << " pg " << info
.pgid
1966 << " found obsolete rollback obj "
1967 << *i
<< " generation < trimmed_to "
1969 << "...repaired" << dendl
;
1974 derr
<< __func__
<< ": queueing trans to clean up obsolete rollback objs"
1976 osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
1981 void PG::_repair_oinfo_oid(ScrubMap
&smap
)
1983 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
1984 i
!= smap
.objects
.rend();
1986 const hobject_t
&hoid
= i
->first
;
1987 ScrubMap::object
&o
= i
->second
;
1990 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
1993 bl
.push_back(o
.attrs
[OI_ATTR
]);
2000 if (oi
.soid
!= hoid
) {
2001 ObjectStore::Transaction t
;
2002 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
2003 osd
->clog
->error() << "osd." << osd
->whoami
2004 << " found object info error on pg "
2006 << " oid " << hoid
<< " oid in object info: "
2012 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
2014 bufferptr
bp(bl
.c_str(), bl
.length());
2015 o
.attrs
[OI_ATTR
] = bp
;
2017 t
.setattr(coll
, ghobject_t(hoid
), OI_ATTR
, bl
);
2018 int r
= osd
->store
->queue_transaction(ch
, std::move(t
));
2020 derr
<< __func__
<< ": queue_transaction got " << cpp_strerror(r
)
2027 void PG::repair_object(
2028 const hobject_t
&soid
,
2029 const list
<pair
<ScrubMap::object
, pg_shard_t
> > &ok_peers
,
2030 const set
<pg_shard_t
> &bad_peers
)
2032 set
<pg_shard_t
> ok_shards
;
2033 for (auto &&peer
: ok_peers
) ok_shards
.insert(peer
.second
);
2035 dout(10) << "repair_object " << soid
2036 << " bad_peers osd.{" << bad_peers
<< "},"
2037 << " ok_peers osd.{" << ok_shards
<< "}" << dendl
;
2039 const ScrubMap::object
&po
= ok_peers
.back().first
;
2044 if (po
.attrs
.count(OI_ATTR
)) {
2045 bv
.push_back(po
.attrs
.find(OI_ATTR
)->second
);
2047 auto bliter
= bv
.cbegin();
2050 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: "
2055 if (bad_peers
.count(get_primary())) {
2056 // We should only be scrubbing if the PG is clean.
2057 ceph_assert(waiting_for_unreadable_object
.empty());
2058 dout(10) << __func__
<< ": primary = " << get_primary() << dendl
;
2061 /* No need to pass ok_peers, they must not be missing the object, so
2062 * force_object_missing will add them to missing_loc anyway */
2063 recovery_state
.force_object_missing(bad_peers
, soid
, oi
.version
);
2066 void PG::forward_scrub_event(ScrubAPI fn
, epoch_t epoch_queued
)
2068 dout(20) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2069 if (is_active() && m_scrubber
) {
2070 ((*m_scrubber
).*fn
)(epoch_queued
);
2072 // pg might be in the process of being deleted
2073 dout(5) << __func__
<< " refusing to forward. " << (is_clean() ? "(clean) " : "(not clean) ") <<
2074 (is_active() ? "(active) " : "(not active) ") << dendl
;
2078 void PG::replica_scrub(OpRequestRef op
, ThreadPool::TPHandle
& handle
)
2080 dout(10) << __func__
<< " (op)" << dendl
;
2082 m_scrubber
->replica_scrub_op(op
);
2085 void PG::scrub(epoch_t epoch_queued
, ThreadPool::TPHandle
& handle
)
2087 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2089 scrub_queued
= false;
2090 forward_scrub_event(&ScrubPgIF::initiate_regular_scrub
, epoch_queued
);
2093 // note: no need to secure OSD resources for a recovery scrub
2094 void PG::recovery_scrub(epoch_t epoch_queued
,
2095 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2097 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2099 scrub_queued
= false;
2100 forward_scrub_event(&ScrubPgIF::initiate_scrub_after_repair
, epoch_queued
);
2103 void PG::replica_scrub(epoch_t epoch_queued
,
2104 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2106 dout(10) << __func__
<< " queued at: " << epoch_queued
2107 << (is_primary() ? " (primary)" : " (replica)") << dendl
;
2108 scrub_queued
= false;
2109 forward_scrub_event(&ScrubPgIF::send_start_replica
, epoch_queued
);
2112 void PG::scrub_send_scrub_resched(epoch_t epoch_queued
,
2113 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2115 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2116 scrub_queued
= false;
2117 forward_scrub_event(&ScrubPgIF::send_scrub_resched
, epoch_queued
);
2120 void PG::scrub_send_resources_granted(epoch_t epoch_queued
,
2121 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2123 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2124 forward_scrub_event(&ScrubPgIF::send_remotes_reserved
, epoch_queued
);
2127 void PG::scrub_send_resources_denied(epoch_t epoch_queued
,
2128 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2130 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2131 forward_scrub_event(&ScrubPgIF::send_reservation_failure
, epoch_queued
);
2134 void PG::replica_scrub_resched(epoch_t epoch_queued
,
2135 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2137 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2138 scrub_queued
= false;
2139 forward_scrub_event(&ScrubPgIF::send_sched_replica
, epoch_queued
);
2142 void PG::scrub_send_pushes_update(epoch_t epoch_queued
,
2143 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2145 dout(10) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2146 forward_scrub_event(&ScrubPgIF::active_pushes_notification
, epoch_queued
);
2149 void PG::scrub_send_replica_pushes(epoch_t epoch_queued
,
2150 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2152 dout(15) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2153 forward_scrub_event(&ScrubPgIF::send_replica_pushes_upd
, epoch_queued
);
2156 void PG::scrub_send_applied_update(epoch_t epoch_queued
,
2157 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2159 dout(15) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2160 forward_scrub_event(&ScrubPgIF::update_applied_notification
, epoch_queued
);
2163 void PG::scrub_send_unblocking(epoch_t epoch_queued
,
2164 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2166 dout(15) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2167 forward_scrub_event(&ScrubPgIF::send_scrub_unblock
, epoch_queued
);
2170 void PG::scrub_send_digest_update(epoch_t epoch_queued
,
2171 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2173 dout(15) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2174 forward_scrub_event(&ScrubPgIF::digest_update_notification
, epoch_queued
);
2177 void PG::scrub_send_replmaps_ready(epoch_t epoch_queued
,
2178 [[maybe_unused
]] ThreadPool::TPHandle
& handle
)
2180 dout(15) << __func__
<< " queued at: " << epoch_queued
<< dendl
;
2181 forward_scrub_event(&ScrubPgIF::send_replica_maps_ready
, epoch_queued
);
2184 bool PG::ops_blocked_by_scrub() const
2186 return !waiting_for_scrub
.empty();
2189 Scrub::scrub_prio_t
PG::is_scrub_blocking_ops() const
2191 return waiting_for_scrub
.empty() ? Scrub::scrub_prio_t::low_priority
2192 : Scrub::scrub_prio_t::high_priority
;
2195 bool PG::old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
)
2197 if (auto last_reset
= get_last_peering_reset();
2198 last_reset
> reply_epoch
|| last_reset
> query_epoch
) {
2199 dout(10) << "old_peering_msg reply_epoch " << reply_epoch
<< " query_epoch "
2200 << query_epoch
<< " last_peering_reset " << last_reset
<< dendl
;
2209 FlushState(PG
*pg
, epoch_t epoch
) : pg(pg
), epoch(epoch
) {}
2211 std::scoped_lock l
{*pg
};
2212 if (!pg
->pg_has_reset_since(epoch
)) {
2213 pg
->recovery_state
.complete_flush();
2217 typedef std::shared_ptr
<FlushState
> FlushStateRef
;
2219 void PG::start_flush_on_transaction(ObjectStore::Transaction
&t
)
2221 // flush in progress ops
2222 FlushStateRef
flush_trigger (std::make_shared
<FlushState
>(
2223 this, get_osdmap_epoch()));
2224 t
.register_on_applied(new ContainerContext
<FlushStateRef
>(flush_trigger
));
2225 t
.register_on_commit(new ContainerContext
<FlushStateRef
>(flush_trigger
));
2228 bool PG::try_flush_or_schedule_async()
2230 Context
*c
= new QueuePeeringEvt(
2231 this, get_osdmap_epoch(), PeeringState::IntervalFlush());
2232 if (!ch
->flush_commit(c
)) {
2240 ostream
& operator<<(ostream
& out
, const PG
& pg
)
2242 out
<< pg
.recovery_state
;
2244 // listing all scrub-related flags - both current and "planned next scrub"
2245 if (pg
.is_scrubbing()) {
2246 out
<< *pg
.m_scrubber
;
2248 out
<< pg
.m_planned_scrub
;
2250 if (pg
.recovery_ops_active
)
2251 out
<< " rops=" << pg
.recovery_ops_active
;
2253 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
2254 if (pg
.recovery_state
.have_missing()) {
2255 out
<< " m=" << pg
.recovery_state
.get_num_missing();
2256 if (pg
.is_primary()) {
2257 uint64_t unfound
= pg
.recovery_state
.get_num_unfound();
2259 out
<< " u=" << unfound
;
2262 if (!pg
.is_clean()) {
2263 out
<< " mbc=" << pg
.recovery_state
.get_missing_by_count();
2265 if (!pg
.snap_trimq
.empty()) {
2267 // only show a count if the set is large
2268 if (pg
.snap_trimq
.num_intervals() > 16) {
2269 out
<< pg
.snap_trimq
.size();
2270 if (!pg
.snap_trimq_repeat
.empty()) {
2271 out
<< "(" << pg
.snap_trimq_repeat
.size() << ")";
2274 out
<< pg
.snap_trimq
;
2275 if (!pg
.snap_trimq_repeat
.empty()) {
2276 out
<< "(" << pg
.snap_trimq_repeat
<< ")";
2280 if (!pg
.recovery_state
.get_info().purged_snaps
.empty()) {
2281 out
<< " ps="; // snap trim queue / purged snaps
2282 if (pg
.recovery_state
.get_info().purged_snaps
.num_intervals() > 16) {
2283 out
<< pg
.recovery_state
.get_info().purged_snaps
.size();
2285 out
<< pg
.recovery_state
.get_info().purged_snaps
;
2293 bool PG::can_discard_op(OpRequestRef
& op
)
2295 auto m
= op
->get_req
<MOSDOp
>();
2296 if (cct
->_conf
->osd_discard_disconnected_ops
&& OSD::op_is_discardable(m
)) {
2297 dout(20) << " discard " << *m
<< dendl
;
2301 if (m
->get_map_epoch() < info
.history
.same_primary_since
) {
2302 dout(7) << " changed after " << m
->get_map_epoch()
2303 << ", dropping " << *m
<< dendl
;
2307 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
2308 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
2310 m
->get_map_epoch() < info
.history
.same_interval_since
) {
2311 // Note: the Objecter will resend on interval change without the primary
2312 // changing if it actually sent to a replica. If the primary hasn't
2313 // changed since the send epoch, we got it, and we're primary, it won't
2314 // have resent even if the interval did change as it sent it to the primary
2320 if (m
->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT
)) {
2321 // >= luminous client
2322 if (m
->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS
)) {
2323 // >= nautilus client
2324 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend()) {
2325 dout(7) << __func__
<< " sent before last_force_op_resend "
2326 << pool
.info
.last_force_op_resend
2327 << ", dropping" << *m
<< dendl
;
2331 // == < nautilus client (luminous or mimic)
2332 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_prenautilus()) {
2333 dout(7) << __func__
<< " sent before last_force_op_resend_prenautilus "
2334 << pool
.info
.last_force_op_resend_prenautilus
2335 << ", dropping" << *m
<< dendl
;
2339 if (m
->get_map_epoch() < info
.history
.last_epoch_split
) {
2340 dout(7) << __func__
<< " pg split in "
2341 << info
.history
.last_epoch_split
<< ", dropping" << dendl
;
2344 } else if (m
->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND
)) {
2345 // < luminous client
2346 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_preluminous()) {
2347 dout(7) << __func__
<< " sent before last_force_op_resend_preluminous "
2348 << pool
.info
.last_force_op_resend_preluminous
2349 << ", dropping" << *m
<< dendl
;
2357 template<typename T
, int MSGTYPE
>
2358 bool PG::can_discard_replica_op(OpRequestRef
& op
)
2360 auto m
= op
->get_req
<T
>();
2361 ceph_assert(m
->get_type() == MSGTYPE
);
2363 int from
= m
->get_source().num();
2365 // if a repop is replied after a replica goes down in a new osdmap, and
2366 // before the pg advances to this new osdmap, the repop replies before this
2367 // repop can be discarded by that replica OSD, because the primary resets the
2368 // connection to it when handling the new osdmap marking it down, and also
2369 // resets the messenger sesssion when the replica reconnects. to avoid the
2370 // out-of-order replies, the messages from that replica should be discarded.
2371 OSDMapRef next_map
= osd
->get_next_osdmap();
2372 if (next_map
->is_down(from
)) {
2373 dout(20) << " " << __func__
<< " dead for nextmap is down " << from
<< dendl
;
2376 /* Mostly, this overlaps with the old_peering_msg
2377 * condition. An important exception is pushes
2378 * sent by replicas not in the acting set, since
2379 * if such a replica goes down it does not cause
2380 * a new interval. */
2381 if (next_map
->get_down_at(from
) >= m
->map_epoch
) {
2382 dout(20) << " " << __func__
<< " dead for 'get_down_at' " << from
<< dendl
;
2387 // if pg changes _at all_, we reset and repeer!
2388 if (old_peering_msg(m
->map_epoch
, m
->map_epoch
)) {
2389 dout(10) << "can_discard_replica_op pg changed " << info
.history
2390 << " after " << m
->map_epoch
2391 << ", dropping" << dendl
;
2397 bool PG::can_discard_scan(OpRequestRef op
)
2399 auto m
= op
->get_req
<MOSDPGScan
>();
2400 ceph_assert(m
->get_type() == MSG_OSD_PG_SCAN
);
2402 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
2403 dout(10) << " got old scan, ignoring" << dendl
;
2409 bool PG::can_discard_backfill(OpRequestRef op
)
2411 auto m
= op
->get_req
<MOSDPGBackfill
>();
2412 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
2414 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
2415 dout(10) << " got old backfill, ignoring" << dendl
;
2423 bool PG::can_discard_request(OpRequestRef
& op
)
2425 switch (op
->get_req()->get_type()) {
2426 case CEPH_MSG_OSD_OP
:
2427 return can_discard_op(op
);
2428 case CEPH_MSG_OSD_BACKOFF
:
2429 return false; // never discard
2431 return can_discard_replica_op
<MOSDRepOp
, MSG_OSD_REPOP
>(op
);
2432 case MSG_OSD_PG_PUSH
:
2433 return can_discard_replica_op
<MOSDPGPush
, MSG_OSD_PG_PUSH
>(op
);
2434 case MSG_OSD_PG_PULL
:
2435 return can_discard_replica_op
<MOSDPGPull
, MSG_OSD_PG_PULL
>(op
);
2436 case MSG_OSD_PG_PUSH_REPLY
:
2437 return can_discard_replica_op
<MOSDPGPushReply
, MSG_OSD_PG_PUSH_REPLY
>(op
);
2438 case MSG_OSD_REPOPREPLY
:
2439 return can_discard_replica_op
<MOSDRepOpReply
, MSG_OSD_REPOPREPLY
>(op
);
2440 case MSG_OSD_PG_RECOVERY_DELETE
:
2441 return can_discard_replica_op
<MOSDPGRecoveryDelete
, MSG_OSD_PG_RECOVERY_DELETE
>(op
);
2443 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
2444 return can_discard_replica_op
<MOSDPGRecoveryDeleteReply
, MSG_OSD_PG_RECOVERY_DELETE_REPLY
>(op
);
2446 case MSG_OSD_EC_WRITE
:
2447 return can_discard_replica_op
<MOSDECSubOpWrite
, MSG_OSD_EC_WRITE
>(op
);
2448 case MSG_OSD_EC_WRITE_REPLY
:
2449 return can_discard_replica_op
<MOSDECSubOpWriteReply
, MSG_OSD_EC_WRITE_REPLY
>(op
);
2450 case MSG_OSD_EC_READ
:
2451 return can_discard_replica_op
<MOSDECSubOpRead
, MSG_OSD_EC_READ
>(op
);
2452 case MSG_OSD_EC_READ_REPLY
:
2453 return can_discard_replica_op
<MOSDECSubOpReadReply
, MSG_OSD_EC_READ_REPLY
>(op
);
2454 case MSG_OSD_REP_SCRUB
:
2455 return can_discard_replica_op
<MOSDRepScrub
, MSG_OSD_REP_SCRUB
>(op
);
2456 case MSG_OSD_SCRUB_RESERVE
:
2457 return can_discard_replica_op
<MOSDScrubReserve
, MSG_OSD_SCRUB_RESERVE
>(op
);
2458 case MSG_OSD_REP_SCRUBMAP
:
2459 return can_discard_replica_op
<MOSDRepScrubMap
, MSG_OSD_REP_SCRUBMAP
>(op
);
2460 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
2461 return can_discard_replica_op
<
2462 MOSDPGUpdateLogMissing
, MSG_OSD_PG_UPDATE_LOG_MISSING
>(op
);
2463 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
2464 return can_discard_replica_op
<
2465 MOSDPGUpdateLogMissingReply
, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
>(op
);
2467 case MSG_OSD_PG_SCAN
:
2468 return can_discard_scan(op
);
2469 case MSG_OSD_PG_BACKFILL
:
2470 return can_discard_backfill(op
);
2471 case MSG_OSD_PG_BACKFILL_REMOVE
:
2472 return can_discard_replica_op
<MOSDPGBackfillRemove
,
2473 MSG_OSD_PG_BACKFILL_REMOVE
>(op
);
2478 void PG::do_peering_event(PGPeeringEventRef evt
, PeeringCtx
&rctx
)
2480 dout(10) << __func__
<< ": " << evt
->get_desc() << dendl
;
2481 ceph_assert(have_same_or_newer_map(evt
->get_epoch_sent()));
2482 if (old_peering_evt(evt
)) {
2483 dout(10) << "discard old " << evt
->get_desc() << dendl
;
2485 recovery_state
.handle_event(evt
, &rctx
);
2487 // write_if_dirty regardless of path above to ensure we capture any work
2488 // done by OSD::advance_pg().
2489 write_if_dirty(rctx
.transaction
);
2492 void PG::queue_peering_event(PGPeeringEventRef evt
)
2494 if (old_peering_evt(evt
))
2496 osd
->osd
->enqueue_peering_evt(info
.pgid
, evt
);
2499 void PG::queue_null(epoch_t msg_epoch
,
2500 epoch_t query_epoch
)
2502 dout(10) << "null" << dendl
;
2503 queue_peering_event(
2504 PGPeeringEventRef(std::make_shared
<PGPeeringEvent
>(msg_epoch
, query_epoch
,
2508 void PG::find_unfound(epoch_t queued
, PeeringCtx
&rctx
)
2511 * if we couldn't start any recovery ops and things are still
2512 * unfound, see if we can discover more missing object locations.
2513 * It may be that our initial locations were bad and we errored
2514 * out while trying to pull.
2516 if (!recovery_state
.discover_all_missing(rctx
)) {
2518 if (state_test(PG_STATE_BACKFILLING
)) {
2519 auto evt
= PGPeeringEventRef(
2523 PeeringState::UnfoundBackfill()));
2524 queue_peering_event(evt
);
2525 action
= "in backfill";
2526 } else if (state_test(PG_STATE_RECOVERING
)) {
2527 auto evt
= PGPeeringEventRef(
2531 PeeringState::UnfoundRecovery()));
2532 queue_peering_event(evt
);
2533 action
= "in recovery";
2535 action
= "already out of recovery/backfill";
2537 dout(10) << __func__
<< ": no luck, giving up on this pg for now (" << action
<< ")" << dendl
;
2539 dout(10) << __func__
<< ": no luck, giving up on this pg for now (queue_recovery)" << dendl
;
2544 void PG::handle_advance_map(
2545 OSDMapRef osdmap
, OSDMapRef lastmap
,
2546 vector
<int>& newup
, int up_primary
,
2547 vector
<int>& newacting
, int acting_primary
,
2550 dout(10) << __func__
<< ": " << osdmap
->get_epoch() << dendl
;
2551 osd_shard
->update_pg_epoch(pg_slot
, osdmap
->get_epoch());
2552 recovery_state
.advance_map(
2562 void PG::handle_activate_map(PeeringCtx
&rctx
)
2564 dout(10) << __func__
<< ": " << get_osdmap()->get_epoch()
2566 recovery_state
.activate_map(rctx
);
2568 requeue_map_waiters();
2571 void PG::handle_initialize(PeeringCtx
&rctx
)
2573 dout(10) << __func__
<< dendl
;
2574 PeeringState::Initialize evt
;
2575 recovery_state
.handle_event(evt
, &rctx
);
2579 void PG::handle_query_state(Formatter
*f
)
2581 dout(10) << "handle_query_state" << dendl
;
2582 PeeringState::QueryState
q(f
);
2583 recovery_state
.handle_event(q
, 0);
2585 // This code has moved to after the close of recovery_state array.
2586 // I don't think that scrub is a recovery state
2587 if (is_primary() && is_active() && m_scrubber
&& m_scrubber
->is_scrub_active()) {
2588 m_scrubber
->handle_query_state(f
);
2592 void PG::init_collection_pool_opts()
2594 auto r
= osd
->store
->set_collection_opts(ch
, pool
.info
.opts
);
2595 if (r
< 0 && r
!= -EOPNOTSUPP
) {
2596 derr
<< __func__
<< " set_collection_opts returns error:" << r
<< dendl
;
2600 void PG::on_pool_change()
2602 init_collection_pool_opts();
2603 plpg_on_pool_change();
2606 void PG::C_DeleteMore::complete(int r
) {
2607 ceph_assert(r
== 0);
2609 if (!pg
->pg_has_reset_since(epoch
)) {
2610 pg
->osd
->queue_for_pg_delete(pg
->get_pgid(), epoch
);
2616 std::pair
<ghobject_t
, bool> PG::do_delete_work(
2617 ObjectStore::Transaction
&t
,
2620 dout(10) << __func__
<< dendl
;
2623 float osd_delete_sleep
= osd
->osd
->get_osd_delete_sleep();
2624 if (osd_delete_sleep
> 0 && delete_needs_sleep
) {
2625 epoch_t e
= get_osdmap()->get_epoch();
2627 auto delete_requeue_callback
= new LambdaContext([this, pgref
, e
](int r
) {
2628 dout(20) << __func__
<< " wake up at "
2630 << ", re-queuing delete" << dendl
;
2631 std::scoped_lock locker
{*this};
2632 delete_needs_sleep
= false;
2633 if (!pg_has_reset_since(e
)) {
2634 osd
->queue_for_pg_delete(get_pgid(), e
);
2638 auto delete_schedule_time
= ceph::real_clock::now();
2639 delete_schedule_time
+= ceph::make_timespan(osd_delete_sleep
);
2640 std::lock_guard l
{osd
->sleep_lock
};
2641 osd
->sleep_timer
.add_event_at(delete_schedule_time
,
2642 delete_requeue_callback
);
2643 dout(20) << __func__
<< " Delete scheduled at " << delete_schedule_time
<< dendl
;
2644 return std::make_pair(_next
, true);
2648 delete_needs_sleep
= true;
2652 vector
<ghobject_t
> olist
;
2653 int max
= std::min(osd
->store
->get_ideal_list_max(),
2654 (int)cct
->_conf
->osd_target_transaction_size
);
2656 osd
->store
->collection_list(
2659 ghobject_t::get_max(),
2663 dout(20) << __func__
<< " " << olist
<< dendl
;
2665 // make sure we've removed everything
2666 // by one more listing from the beginning
2667 if (_next
!= ghobject_t() && olist
.empty()) {
2668 next
= ghobject_t();
2669 osd
->store
->collection_list(
2672 ghobject_t::get_max(),
2676 if (!olist
.empty()) {
2677 dout(0) << __func__
<< " additional unexpected onode list"
2678 <<" (new onodes has appeared since PG removal started"
2683 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
2685 for (auto& oid
: olist
) {
2686 if (oid
== pgmeta_oid
) {
2689 if (oid
.is_pgmeta()) {
2690 osd
->clog
->warn() << info
.pgid
<< " found stray pgmeta-like " << oid
2691 << " during PG removal";
2693 int r
= snap_mapper
.remove_oid(oid
.hobj
, &_t
);
2694 if (r
!= 0 && r
!= -ENOENT
) {
2697 t
.remove(coll
, oid
);
2700 bool running
= true;
2702 dout(20) << __func__
<< " deleting " << num
<< " objects" << dendl
;
2703 Context
*fin
= new C_DeleteMore(this, get_osdmap_epoch());
2704 t
.register_on_commit(fin
);
2706 if (cct
->_conf
->osd_inject_failure_on_pg_removal
) {
2710 // final flush here to ensure completions drop refs. Of particular concern
2711 // are the SnapMapper ContainerContexts.
2714 PGLog::clear_info_log(info
.pgid
, &t
);
2715 t
.remove_collection(coll
);
2716 t
.register_on_commit(new ContainerContext
<PGRef
>(pgref
));
2717 t
.register_on_applied(new ContainerContext
<PGRef
>(pgref
));
2718 osd
->store
->queue_transaction(ch
, std::move(t
));
2722 if (!osd
->try_finish_pg_delete(this, pool
.info
.get_pg_num())) {
2723 dout(1) << __func__
<< " raced with merge, reinstantiating" << dendl
;
2724 ch
= osd
->store
->create_new_collection(coll
);
2725 create_pg_collection(t
,
2727 info
.pgid
.get_split_bits(pool
.info
.get_pg_num()));
2728 init_pg_ondisk(t
, info
.pgid
, &pool
.info
);
2729 recovery_state
.reset_last_persisted();
2731 recovery_state
.set_delete_complete();
2733 // cancel reserver here, since the PG is about to get deleted and the
2734 // exit() methods don't run when that happens.
2735 osd
->local_reserver
.cancel_reservation(info
.pgid
);
2740 return {next
, running
};
2743 int PG::pg_stat_adjust(osd_stat_t
*ns
)
2745 osd_stat_t
&new_stat
= *ns
;
2749 // Adjust the kb_used by adding pending backfill data
2750 uint64_t reserved_num_bytes
= get_reserved_num_bytes();
2752 // For now we don't consider projected space gains here
2753 // I suggest we have an optional 2 pass backfill that frees up
2754 // space in a first pass. This could be triggered when at nearfull
2755 // or near to backfillfull.
2756 if (reserved_num_bytes
> 0) {
2757 // TODO: Handle compression by adjusting by the PGs average
2758 // compression precentage.
2759 dout(20) << __func__
<< " reserved_num_bytes " << (reserved_num_bytes
>> 10) << "KiB"
2760 << " Before kb_used " << new_stat
.statfs
.kb_used() << "KiB" << dendl
;
2761 if (new_stat
.statfs
.available
> reserved_num_bytes
)
2762 new_stat
.statfs
.available
-= reserved_num_bytes
;
2764 new_stat
.statfs
.available
= 0;
2765 dout(20) << __func__
<< " After kb_used " << new_stat
.statfs
.kb_used() << "KiB" << dendl
;
2771 void PG::dump_pgstate_history(Formatter
*f
)
2773 std::scoped_lock l
{*this};
2774 recovery_state
.dump_history(f
);
2777 void PG::dump_missing(Formatter
*f
)
2779 for (auto& i
: recovery_state
.get_pg_log().get_missing().get_items()) {
2780 f
->open_object_section("object");
2781 f
->dump_object("oid", i
.first
);
2782 f
->dump_object("missing_info", i
.second
);
2783 if (recovery_state
.get_missing_loc().needs_recovery(i
.first
)) {
2786 recovery_state
.get_missing_loc().is_unfound(i
.first
));
2787 f
->open_array_section("locations");
2788 for (auto l
: recovery_state
.get_missing_loc().get_locations(i
.first
)) {
2789 f
->dump_object("shard", l
);
2797 void PG::get_pg_stats(std::function
<void(const pg_stat_t
&, epoch_t lec
)> f
)
2799 std::lock_guard l
{pg_stats_publish_lock
};
2800 if (pg_stats_publish_valid
) {
2801 f(pg_stats_publish
, pg_stats_publish
.get_effective_last_epoch_clean());
2805 void PG::with_heartbeat_peers(std::function
<void(int)> f
)
2807 std::lock_guard l
{heartbeat_peer_lock
};
2808 for (auto p
: heartbeat_peers
) {
2811 for (auto p
: probe_targets
) {
2816 uint64_t PG::get_min_alloc_size() const {
2817 return osd
->store
->get_min_alloc_size();