]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PG.cc
d77cef0d7b5c552b1156a2124b56dcea82c65a7d
[ceph.git] / ceph / src / osd / PG.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 #include "messages/MOSDRepScrub.h"
17
18 #include "common/errno.h"
19 #include "common/ceph_releases.h"
20 #include "common/config.h"
21 #include "OSD.h"
22 #include "OpRequest.h"
23 #include "osd/scrubber/ScrubStore.h"
24 #include "osd/scrubber/pg_scrubber.h"
25 #include "osd/scheduler/OpSchedulerItem.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGScan.h"
33 #include "messages/MOSDPGBackfill.h"
34 #include "messages/MOSDPGBackfillRemove.h"
35 #include "messages/MBackfillReserve.h"
36 #include "messages/MRecoveryReserve.h"
37 #include "messages/MOSDPGPush.h"
38 #include "messages/MOSDPGPushReply.h"
39 #include "messages/MOSDPGPull.h"
40 #include "messages/MOSDECSubOpWrite.h"
41 #include "messages/MOSDECSubOpWriteReply.h"
42 #include "messages/MOSDECSubOpRead.h"
43 #include "messages/MOSDECSubOpReadReply.h"
44 #include "messages/MOSDPGUpdateLogMissing.h"
45 #include "messages/MOSDPGUpdateLogMissingReply.h"
46 #include "messages/MOSDBackoff.h"
47 #include "messages/MOSDScrubReserve.h"
48 #include "messages/MOSDRepOp.h"
49 #include "messages/MOSDRepOpReply.h"
50 #include "messages/MOSDRepScrubMap.h"
51 #include "messages/MOSDPGRecoveryDelete.h"
52 #include "messages/MOSDPGRecoveryDeleteReply.h"
53
54 #include "common/BackTrace.h"
55 #include "common/EventTrace.h"
56
57 #ifdef WITH_LTTNG
58 #define TRACEPOINT_DEFINE
59 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
60 #include "tracing/pg.h"
61 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
62 #undef TRACEPOINT_DEFINE
63 #else
64 #define tracepoint(...)
65 #endif
66
67 #include <sstream>
68
69 #define dout_context cct
70 #define dout_subsys ceph_subsys_osd
71 #undef dout_prefix
72 #define dout_prefix _prefix(_dout, this)
73
74 using std::list;
75 using std::map;
76 using std::ostringstream;
77 using std::pair;
78 using std::set;
79 using std::string;
80 using std::stringstream;
81 using std::unique_ptr;
82 using std::vector;
83
84 using ceph::bufferlist;
85 using ceph::bufferptr;
86 using ceph::decode;
87 using ceph::encode;
88 using ceph::Formatter;
89
90 using namespace ceph::osd::scheduler;
91
92 template <class T>
93 static ostream& _prefix(std::ostream *_dout, T *t)
94 {
95 return t->gen_prefix(*_dout);
96 }
97
98 void PG::get(const char* tag)
99 {
100 int after = ++ref;
101 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
102 << "tag " << (tag ? tag : "(none") << " "
103 << (after - 1) << " -> " << after << dendl;
104 #ifdef PG_DEBUG_REFS
105 std::lock_guard l(_ref_id_lock);
106 _tag_counts[tag]++;
107 #endif
108 }
109
110 void PG::put(const char* tag)
111 {
112 #ifdef PG_DEBUG_REFS
113 {
114 std::lock_guard l(_ref_id_lock);
115 auto tag_counts_entry = _tag_counts.find(tag);
116 ceph_assert(tag_counts_entry != _tag_counts.end());
117 --tag_counts_entry->second;
118 if (tag_counts_entry->second == 0) {
119 _tag_counts.erase(tag_counts_entry);
120 }
121 }
122 #endif
123 auto local_cct = cct;
124 int after = --ref;
125 lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
126 << "tag " << (tag ? tag : "(none") << " "
127 << (after + 1) << " -> " << after
128 << dendl;
129 if (after == 0)
130 delete this;
131 }
132
133 #ifdef PG_DEBUG_REFS
134 uint64_t PG::get_with_id()
135 {
136 ref++;
137 std::lock_guard l(_ref_id_lock);
138 uint64_t id = ++_ref_id;
139 ClibBackTrace bt(0);
140 stringstream ss;
141 bt.print(ss);
142 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
143 << " got id " << id << " "
144 << (ref - 1) << " -> " << ref
145 << dendl;
146 ceph_assert(!_live_ids.count(id));
147 _live_ids.insert(make_pair(id, ss.str()));
148 return id;
149 }
150
151 void PG::put_with_id(uint64_t id)
152 {
153 int newref = --ref;
154 lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
155 << " put id " << id << " "
156 << (newref + 1) << " -> " << newref
157 << dendl;
158 {
159 std::lock_guard l(_ref_id_lock);
160 ceph_assert(_live_ids.count(id));
161 _live_ids.erase(id);
162 }
163 if (newref)
164 delete this;
165 }
166
167 void PG::dump_live_ids()
168 {
169 std::lock_guard l(_ref_id_lock);
170 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
171 for (map<uint64_t, string>::iterator i = _live_ids.begin();
172 i != _live_ids.end();
173 ++i) {
174 dout(0) << "\t\tid: " << *i << dendl;
175 }
176 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
177 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
178 i != _tag_counts.end();
179 ++i) {
180 dout(0) << "\t\tid: " << *i << dendl;
181 }
182 }
183 #endif
184
185 PG::PG(OSDService *o, OSDMapRef curmap,
186 const PGPool &_pool, spg_t p) :
187 pg_whoami(o->whoami, p.shard),
188 pg_id(p),
189 coll(p),
190 osd(o),
191 cct(o->cct),
192 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
193 snap_mapper(
194 cct,
195 &osdriver,
196 p.ps(),
197 p.get_split_bits(_pool.info.get_pg_num()),
198 _pool.id,
199 p.shard),
200 trace_endpoint("0.0.0.0", 0, "PG"),
201 info_struct_v(0),
202 pgmeta_oid(p.make_pgmeta_oid()),
203 stat_queue_item(this),
204 recovery_queued(false),
205 recovery_ops_active(0),
206 backfill_reserving(false),
207 finish_sync_event(NULL),
208 scrub_after_recovery(false),
209 active_pushes(0),
210 recovery_state(
211 o->cct,
212 pg_whoami,
213 p,
214 _pool,
215 curmap,
216 this,
217 this),
218 pool(recovery_state.get_pool()),
219 info(recovery_state.get_info())
220 {
221 #ifdef PG_DEBUG_REFS
222 osd->add_pgid(p, this);
223 #endif
224 #ifdef WITH_BLKIN
225 std::stringstream ss;
226 ss << "PG " << info.pgid;
227 trace_endpoint.copy_name(ss.str());
228 #endif
229 }
230
231 PG::~PG()
232 {
233 #ifdef PG_DEBUG_REFS
234 osd->remove_pgid(info.pgid, this);
235 #endif
236 }
237
238 void PG::lock(bool no_lockdep) const
239 {
240 #ifdef CEPH_DEBUG_MUTEX
241 _lock.lock(no_lockdep);
242 #else
243 _lock.lock();
244 locked_by = std::this_thread::get_id();
245 #endif
246 // if we have unrecorded dirty state with the lock dropped, there is a bug
247 ceph_assert(!recovery_state.debug_has_dirty_state());
248
249 dout(30) << "lock" << dendl;
250 }
251
252 bool PG::is_locked() const
253 {
254 return ceph_mutex_is_locked(_lock);
255 }
256
257 void PG::unlock() const
258 {
259 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
260 ceph_assert(!recovery_state.debug_has_dirty_state());
261 #ifndef CEPH_DEBUG_MUTEX
262 locked_by = {};
263 #endif
264 _lock.unlock();
265 }
266
267 std::ostream& PG::gen_prefix(std::ostream& out) const
268 {
269 OSDMapRef mapref = recovery_state.get_osdmap();
270 #ifdef CEPH_DEBUG_MUTEX
271 if (_lock.is_locked_by_me()) {
272 #else
273 if (locked_by == std::this_thread::get_id()) {
274 #endif
275 out << "osd." << osd->whoami
276 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
277 << " " << *this << " ";
278 } else {
279 out << "osd." << osd->whoami
280 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
281 << " pg[" << pg_id.pgid << "(unlocked)] ";
282 }
283 return out;
284 }
285
286 PerfCounters &PG::get_peering_perf() {
287 return *(osd->recoverystate_perf);
288 }
289
290 PerfCounters &PG::get_perf_logger() {
291 return *(osd->logger);
292 }
293
294 void PG::log_state_enter(const char *state) {
295 osd->pg_recovery_stats.log_enter(state);
296 }
297
298 void PG::log_state_exit(
299 const char *state_name, utime_t enter_time,
300 uint64_t events, utime_t event_dur) {
301 osd->pg_recovery_stats.log_exit(
302 state_name, ceph_clock_now() - enter_time, events, event_dur);
303 }
304
305 /********* PG **********/
306
307 void PG::remove_snap_mapped_object(
308 ObjectStore::Transaction &t, const hobject_t &soid)
309 {
310 t.remove(
311 coll,
312 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
313 clear_object_snap_mapping(&t, soid);
314 }
315
316 void PG::clear_object_snap_mapping(
317 ObjectStore::Transaction *t, const hobject_t &soid)
318 {
319 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
320 if (soid.snap < CEPH_MAXSNAP) {
321 int r = snap_mapper.remove_oid(
322 soid,
323 &_t);
324 if (!(r == 0 || r == -ENOENT)) {
325 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
326 ceph_abort();
327 }
328 }
329 }
330
331 void PG::update_object_snap_mapping(
332 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
333 {
334 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
335 ceph_assert(soid.snap < CEPH_MAXSNAP);
336 int r = snap_mapper.remove_oid(
337 soid,
338 &_t);
339 if (!(r == 0 || r == -ENOENT)) {
340 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
341 ceph_abort();
342 }
343 snap_mapper.add_oid(
344 soid,
345 snaps,
346 &_t);
347 }
348
349 /******* PG ***********/
350 void PG::clear_primary_state()
351 {
352 dout(20) << __func__ << dendl;
353
354 projected_log = PGLog::IndexedLog();
355
356 snap_trimq.clear();
357 snap_trimq_repeat.clear();
358 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
359 release_pg_backoffs();
360
361 if (m_scrubber) {
362 m_scrubber->discard_replica_reservations();
363 }
364 scrub_after_recovery = false;
365
366 agent_clear();
367 }
368
369
370 bool PG::op_has_sufficient_caps(OpRequestRef& op)
371 {
372 // only check MOSDOp
373 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
374 return true;
375
376 auto req = op->get_req<MOSDOp>();
377 auto priv = req->get_connection()->get_priv();
378 auto session = static_cast<Session*>(priv.get());
379 if (!session) {
380 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
381 return false;
382 }
383 OSDCap& caps = session->caps;
384 priv.reset();
385
386 const string &key = req->get_hobj().get_key().empty() ?
387 req->get_oid().name :
388 req->get_hobj().get_key();
389
390 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
391 pool.info.application_metadata,
392 key,
393 op->need_read_cap(),
394 op->need_write_cap(),
395 op->classes(),
396 session->get_peer_socket_addr());
397
398 dout(20) << "op_has_sufficient_caps "
399 << "session=" << session
400 << " pool=" << pool.id << " (" << pool.name
401 << " " << req->get_hobj().nspace
402 << ")"
403 << " pool_app_metadata=" << pool.info.application_metadata
404 << " need_read_cap=" << op->need_read_cap()
405 << " need_write_cap=" << op->need_write_cap()
406 << " classes=" << op->classes()
407 << " -> " << (cap ? "yes" : "NO")
408 << dendl;
409 return cap;
410 }
411
412 void PG::queue_recovery()
413 {
414 if (!is_primary() || !is_peered()) {
415 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
416 ceph_assert(!recovery_queued);
417 } else if (recovery_queued) {
418 dout(10) << "queue_recovery -- already queued" << dendl;
419 } else {
420 dout(10) << "queue_recovery -- queuing" << dendl;
421 recovery_queued = true;
422 osd->queue_for_recovery(this);
423 }
424 }
425
426 void PG::queue_scrub_after_repair()
427 {
428 dout(10) << __func__ << dendl;
429 ceph_assert(ceph_mutex_is_locked(_lock));
430
431 m_planned_scrub.must_deep_scrub = true;
432 m_planned_scrub.check_repair = true;
433 m_planned_scrub.must_scrub = true;
434
435 if (is_scrub_queued_or_active()) {
436 dout(10) << __func__ << ": scrubbing already ("
437 << (is_scrubbing() ? "active)" : "queued)") << dendl;
438 return;
439 }
440
441 m_scrubber->set_op_parameters(m_planned_scrub);
442 dout(15) << __func__ << ": queueing" << dendl;
443
444 osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority);
445 }
446
447 unsigned PG::get_scrub_priority()
448 {
449 // a higher value -> a higher priority
450 int64_t pool_scrub_priority =
451 pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0);
452 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
453 }
454
455 Context *PG::finish_recovery()
456 {
457 dout(10) << "finish_recovery" << dendl;
458 ceph_assert(info.last_complete == info.last_update);
459
460 clear_recovery_state();
461
462 /*
463 * sync all this before purging strays. but don't block!
464 */
465 finish_sync_event = new C_PG_FinishRecovery(this);
466 return finish_sync_event;
467 }
468
469 void PG::_finish_recovery(Context* c)
470 {
471 dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? "
472 << is_clean() << dendl;
473
474 std::scoped_lock locker{*this};
475 if (recovery_state.is_deleting() || !is_clean()) {
476 dout(10) << __func__ << " raced with delete or repair" << dendl;
477 return;
478 }
479 // When recovery is initiated by a repair, that flag is left on
480 state_clear(PG_STATE_REPAIR);
481 if (c == finish_sync_event) {
482 dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl;
483 finish_sync_event = 0;
484 recovery_state.purge_strays();
485
486 publish_stats_to_osd();
487
488 if (scrub_after_recovery) {
489 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
490 scrub_after_recovery = false;
491 queue_scrub_after_repair();
492 }
493 } else {
494 dout(10) << "_finish_recovery -- stale" << dendl;
495 }
496 }
497
498 void PG::start_recovery_op(const hobject_t& soid)
499 {
500 dout(10) << "start_recovery_op " << soid
501 #ifdef DEBUG_RECOVERY_OIDS
502 << " (" << recovering_oids << ")"
503 #endif
504 << dendl;
505 ceph_assert(recovery_ops_active >= 0);
506 recovery_ops_active++;
507 #ifdef DEBUG_RECOVERY_OIDS
508 recovering_oids.insert(soid);
509 #endif
510 osd->start_recovery_op(this, soid);
511 }
512
513 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
514 {
515 dout(10) << "finish_recovery_op " << soid
516 #ifdef DEBUG_RECOVERY_OIDS
517 << " (" << recovering_oids << ")"
518 #endif
519 << dendl;
520 ceph_assert(recovery_ops_active > 0);
521 recovery_ops_active--;
522 #ifdef DEBUG_RECOVERY_OIDS
523 ceph_assert(recovering_oids.count(soid));
524 recovering_oids.erase(recovering_oids.find(soid));
525 #endif
526 osd->finish_recovery_op(this, soid, dequeue);
527
528 if (!dequeue) {
529 queue_recovery();
530 }
531 }
532
533 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
534 {
535 recovery_state.split_into(child_pgid, &child->recovery_state, split_bits);
536
537 child->update_snap_mapper_bits(split_bits);
538
539 child->snap_trimq = snap_trimq;
540 child->snap_trimq_repeat = snap_trimq_repeat;
541
542 _split_into(child_pgid, child, split_bits);
543
544 // release all backoffs for simplicity
545 release_backoffs(hobject_t(), hobject_t::get_max());
546 }
547
548 void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
549 {
550 recovery_state.start_split_stats(childpgs, out);
551 }
552
553 void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction &t)
554 {
555 recovery_state.finish_split_stats(stats, t);
556 }
557
558 void PG::merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx,
559 unsigned split_bits,
560 const pg_merge_meta_t& last_pg_merge_meta)
561 {
562 dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
563 << dendl;
564 map<spg_t, PeeringState*> source_ps;
565 for (auto &&source : sources) {
566 source_ps.emplace(source.first, &source.second->recovery_state);
567 }
568 recovery_state.merge_from(source_ps, rctx, split_bits, last_pg_merge_meta);
569
570 for (auto& i : sources) {
571 auto& source = i.second;
572 // wipe out source's pgmeta
573 rctx.transaction.remove(source->coll, source->pgmeta_oid);
574
575 // merge (and destroy source collection)
576 rctx.transaction.merge_collection(source->coll, coll, split_bits);
577 }
578
579 // merge_collection does this, but maybe all of our sources were missing.
580 rctx.transaction.collection_set_bits(coll, split_bits);
581
582 snap_mapper.update_bits(split_bits);
583 }
584
585 void PG::add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end)
586 {
587 auto con = s->con;
588 if (!con) // OSD::ms_handle_reset clears s->con without a lock
589 return;
590 auto b = s->have_backoff(info.pgid, begin);
591 if (b) {
592 derr << __func__ << " already have backoff for " << s << " begin " << begin
593 << " " << *b << dendl;
594 ceph_abort();
595 }
596 std::lock_guard l(backoff_lock);
597 b = ceph::make_ref<Backoff>(info.pgid, this, s, ++s->backoff_seq, begin, end);
598 backoffs[begin].insert(b);
599 s->add_backoff(b);
600 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
601 con->send_message(
602 new MOSDBackoff(
603 info.pgid,
604 get_osdmap_epoch(),
605 CEPH_OSD_BACKOFF_OP_BLOCK,
606 b->id,
607 begin,
608 end));
609 }
610
611 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
612 {
613 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
614 vector<ceph::ref_t<Backoff>> bv;
615 {
616 std::lock_guard l(backoff_lock);
617 auto p = backoffs.lower_bound(begin);
618 while (p != backoffs.end()) {
619 int r = cmp(p->first, end);
620 dout(20) << __func__ << " ? " << r << " " << p->first
621 << " " << p->second << dendl;
622 // note: must still examine begin=end=p->first case
623 if (r > 0 || (r == 0 && begin < end)) {
624 break;
625 }
626 dout(20) << __func__ << " checking " << p->first
627 << " " << p->second << dendl;
628 auto q = p->second.begin();
629 while (q != p->second.end()) {
630 dout(20) << __func__ << " checking " << *q << dendl;
631 int rr = cmp((*q)->begin, begin);
632 if (rr == 0 || (rr > 0 && (*q)->end < end)) {
633 bv.push_back(*q);
634 q = p->second.erase(q);
635 } else {
636 ++q;
637 }
638 }
639 if (p->second.empty()) {
640 p = backoffs.erase(p);
641 } else {
642 ++p;
643 }
644 }
645 }
646 for (auto b : bv) {
647 std::lock_guard l(b->lock);
648 dout(10) << __func__ << " " << *b << dendl;
649 if (b->session) {
650 ceph_assert(b->pg == this);
651 ConnectionRef con = b->session->con;
652 if (con) { // OSD::ms_handle_reset clears s->con without a lock
653 con->send_message(
654 new MOSDBackoff(
655 info.pgid,
656 get_osdmap_epoch(),
657 CEPH_OSD_BACKOFF_OP_UNBLOCK,
658 b->id,
659 b->begin,
660 b->end));
661 }
662 if (b->is_new()) {
663 b->state = Backoff::STATE_DELETING;
664 } else {
665 b->session->rm_backoff(b);
666 b->session.reset();
667 }
668 b->pg.reset();
669 }
670 }
671 }
672
673 void PG::clear_backoffs()
674 {
675 dout(10) << __func__ << " " << dendl;
676 map<hobject_t,set<ceph::ref_t<Backoff>>> ls;
677 {
678 std::lock_guard l(backoff_lock);
679 ls.swap(backoffs);
680 }
681 for (auto& p : ls) {
682 for (auto& b : p.second) {
683 std::lock_guard l(b->lock);
684 dout(10) << __func__ << " " << *b << dendl;
685 if (b->session) {
686 ceph_assert(b->pg == this);
687 if (b->is_new()) {
688 b->state = Backoff::STATE_DELETING;
689 } else {
690 b->session->rm_backoff(b);
691 b->session.reset();
692 }
693 b->pg.reset();
694 }
695 }
696 }
697 }
698
699 // called by Session::clear_backoffs()
700 void PG::rm_backoff(const ceph::ref_t<Backoff>& b)
701 {
702 dout(10) << __func__ << " " << *b << dendl;
703 std::lock_guard l(backoff_lock);
704 ceph_assert(ceph_mutex_is_locked_by_me(b->lock));
705 ceph_assert(b->pg == this);
706 auto p = backoffs.find(b->begin);
707 // may race with release_backoffs()
708 if (p != backoffs.end()) {
709 auto q = p->second.find(b);
710 if (q != p->second.end()) {
711 p->second.erase(q);
712 if (p->second.empty()) {
713 backoffs.erase(p);
714 }
715 }
716 }
717 }
718
719 void PG::clear_recovery_state()
720 {
721 dout(10) << "clear_recovery_state" << dendl;
722
723 finish_sync_event = 0;
724
725 hobject_t soid;
726 while (recovery_ops_active > 0) {
727 #ifdef DEBUG_RECOVERY_OIDS
728 soid = *recovering_oids.begin();
729 #endif
730 finish_recovery_op(soid, true);
731 }
732
733 backfill_info.clear();
734 peer_backfill_info.clear();
735 waiting_on_backfill.clear();
736 _clear_recovery_state(); // pg impl specific hook
737 }
738
739 void PG::cancel_recovery()
740 {
741 dout(10) << "cancel_recovery" << dendl;
742 clear_recovery_state();
743 }
744
745 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
746 {
747 std::lock_guard l(heartbeat_peer_lock);
748 probe_targets.clear();
749 for (set<pg_shard_t>::iterator i = probe_set.begin();
750 i != probe_set.end();
751 ++i) {
752 probe_targets.insert(i->osd);
753 }
754 }
755
756 void PG::send_cluster_message(
757 int target, MessageRef m,
758 epoch_t epoch, bool share_map_update)
759 {
760 ConnectionRef con = osd->get_con_osd_cluster(
761 target, get_osdmap_epoch());
762 if (!con) {
763 return;
764 }
765
766 if (share_map_update) {
767 osd->maybe_share_map(con.get(), get_osdmap());
768 }
769 osd->send_message_osd_cluster(m, con.get());
770 }
771
772 void PG::clear_probe_targets()
773 {
774 std::lock_guard l(heartbeat_peer_lock);
775 probe_targets.clear();
776 }
777
778 void PG::update_heartbeat_peers(set<int> new_peers)
779 {
780 bool need_update = false;
781 heartbeat_peer_lock.lock();
782 if (new_peers == heartbeat_peers) {
783 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
784 } else {
785 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
786 heartbeat_peers.swap(new_peers);
787 need_update = true;
788 }
789 heartbeat_peer_lock.unlock();
790
791 if (need_update)
792 osd->need_heartbeat_peer_update();
793 }
794
795
796 bool PG::check_in_progress_op(
797 const osd_reqid_t &r,
798 eversion_t *version,
799 version_t *user_version,
800 int *return_code,
801 vector<pg_log_op_return_item_t> *op_returns
802 ) const
803 {
804 return (
805 projected_log.get_request(r, version, user_version, return_code,
806 op_returns) ||
807 recovery_state.get_pg_log().get_log().get_request(
808 r, version, user_version, return_code, op_returns));
809 }
810
811 void PG::publish_stats_to_osd()
812 {
813 if (!is_primary())
814 return;
815
816 ceph_assert(m_scrubber);
817 recovery_state.update_stats_wo_resched(
818 [scrubber = m_scrubber.get()](pg_history_t& hist,
819 pg_stat_t& info) mutable -> void {
820 info.scrub_sched_status = scrubber->get_schedule();
821 });
822
823 std::lock_guard l{pg_stats_publish_lock};
824 auto stats =
825 recovery_state.prepare_stats_for_publish(pg_stats_publish, unstable_stats);
826 if (stats) {
827 pg_stats_publish = std::move(stats);
828 }
829 }
830
831 unsigned PG::get_target_pg_log_entries() const
832 {
833 return osd->get_target_pg_log_entries();
834 }
835
836 void PG::clear_publish_stats()
837 {
838 dout(15) << "clear_stats" << dendl;
839 std::lock_guard l{pg_stats_publish_lock};
840 pg_stats_publish.reset();
841 }
842
843 /**
844 * initialize a newly instantiated pg
845 *
846 * Initialize PG state, as when a PG is initially created, or when it
847 * is first instantiated on the current node.
848 *
849 * @param role our role/rank
850 * @param newup up set
851 * @param newacting acting set
852 * @param history pg history
853 * @param pi past_intervals
854 * @param backfill true if info should be marked as backfill
855 * @param t transaction to write out our new state in
856 */
857 void PG::init(
858 int role,
859 const vector<int>& newup, int new_up_primary,
860 const vector<int>& newacting, int new_acting_primary,
861 const pg_history_t& history,
862 const PastIntervals& pi,
863 ObjectStore::Transaction &t)
864 {
865 recovery_state.init(
866 role, newup, new_up_primary, newacting,
867 new_acting_primary, history, pi, t);
868 }
869
870 void PG::shutdown()
871 {
872 ch->flush();
873 std::scoped_lock l{*this};
874 recovery_state.shutdown();
875 on_shutdown();
876 }
877
878 #pragma GCC diagnostic ignored "-Wpragmas"
879 #pragma GCC diagnostic push
880 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
881
882 void PG::upgrade(ObjectStore *store)
883 {
884 dout(0) << __func__ << " " << info_struct_v << " -> " << pg_latest_struct_v
885 << dendl;
886 ceph_assert(info_struct_v <= 10);
887 ObjectStore::Transaction t;
888
889 // <do upgrade steps here>
890
891 // finished upgrade!
892 ceph_assert(info_struct_v == 10);
893
894 // update infover_key
895 if (info_struct_v < pg_latest_struct_v) {
896 map<string,bufferlist> v;
897 __u8 ver = pg_latest_struct_v;
898 encode(ver, v[string(infover_key)]);
899 t.omap_setkeys(coll, pgmeta_oid, v);
900 }
901
902 recovery_state.force_write_state(t);
903
904 ObjectStore::CollectionHandle ch = store->open_collection(coll);
905 int r = store->queue_transaction(ch, std::move(t));
906 if (r != 0) {
907 derr << __func__ << ": queue_transaction returned "
908 << cpp_strerror(r) << dendl;
909 ceph_abort();
910 }
911 ceph_assert(r == 0);
912
913 C_SaferCond waiter;
914 if (!ch->flush_commit(&waiter)) {
915 waiter.wait();
916 }
917 }
918
919 #pragma GCC diagnostic pop
920 #pragma GCC diagnostic warning "-Wpragmas"
921
922 void PG::prepare_write(
923 pg_info_t &info,
924 pg_info_t &last_written_info,
925 PastIntervals &past_intervals,
926 PGLog &pglog,
927 bool dirty_info,
928 bool dirty_big_info,
929 bool need_write_epoch,
930 ObjectStore::Transaction &t)
931 {
932 info.stats.stats.add(unstable_stats);
933 unstable_stats.clear();
934 map<string,bufferlist> km;
935 string key_to_remove;
936 if (dirty_big_info || dirty_info) {
937 int ret = prepare_info_keymap(
938 cct,
939 &km,
940 &key_to_remove,
941 get_osdmap_epoch(),
942 info,
943 last_written_info,
944 past_intervals,
945 dirty_big_info,
946 need_write_epoch,
947 cct->_conf->osd_fast_info,
948 osd->logger,
949 this);
950 ceph_assert(ret == 0);
951 }
952 pglog.write_log_and_missing(
953 t, &km, coll, pgmeta_oid, pool.info.require_rollback());
954 if (!km.empty())
955 t.omap_setkeys(coll, pgmeta_oid, km);
956 if (!key_to_remove.empty())
957 t.omap_rmkey(coll, pgmeta_oid, key_to_remove);
958 }
959
960 #pragma GCC diagnostic ignored "-Wpragmas"
961 #pragma GCC diagnostic push
962 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
963
964 bool PG::_has_removal_flag(ObjectStore *store,
965 spg_t pgid)
966 {
967 coll_t coll(pgid);
968 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
969
970 // first try new way
971 set<string> keys;
972 keys.insert("_remove");
973 map<string,bufferlist> values;
974 auto ch = store->open_collection(coll);
975 ceph_assert(ch);
976 if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
977 values.size() == 1)
978 return true;
979
980 return false;
981 }
982
983 int PG::peek_map_epoch(ObjectStore *store,
984 spg_t pgid,
985 epoch_t *pepoch)
986 {
987 coll_t coll(pgid);
988 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
989 epoch_t cur_epoch = 0;
990
991 // validate collection name
992 ceph_assert(coll.is_pg());
993
994 // try for v8
995 set<string> keys;
996 keys.insert(string(infover_key));
997 keys.insert(string(epoch_key));
998 map<string,bufferlist> values;
999 auto ch = store->open_collection(coll);
1000 ceph_assert(ch);
1001 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
1002 if (r == 0) {
1003 ceph_assert(values.size() == 2);
1004
1005 // sanity check version
1006 auto bp = values[string(infover_key)].cbegin();
1007 __u8 struct_v = 0;
1008 decode(struct_v, bp);
1009 ceph_assert(struct_v >= 8);
1010
1011 // get epoch
1012 bp = values[string(epoch_key)].begin();
1013 decode(cur_epoch, bp);
1014 } else {
1015 // probably bug 10617; see OSD::load_pgs()
1016 return -1;
1017 }
1018
1019 *pepoch = cur_epoch;
1020 return 0;
1021 }
1022
1023 #pragma GCC diagnostic pop
1024 #pragma GCC diagnostic warning "-Wpragmas"
1025
1026 bool PG::check_log_for_corruption(ObjectStore *store)
1027 {
1028 /// TODO: this method needs to work with the omap log
1029 return true;
1030 }
1031
1032 //! Get the name we're going to save our corrupt page log as
1033 std::string PG::get_corrupt_pg_log_name() const
1034 {
1035 const int MAX_BUF = 512;
1036 char buf[MAX_BUF];
1037 struct tm tm_buf;
1038 time_t my_time(time(NULL));
1039 const struct tm *t = localtime_r(&my_time, &tm_buf);
1040 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
1041 if (ret == 0) {
1042 dout(0) << "strftime failed" << dendl;
1043 return "corrupt_log_unknown_time";
1044 }
1045 string out(buf);
1046 out += stringify(info.pgid);
1047 return out;
1048 }
1049
1050 int PG::read_info(
1051 ObjectStore *store, spg_t pgid, const coll_t &coll,
1052 pg_info_t &info, PastIntervals &past_intervals,
1053 __u8 &struct_v)
1054 {
1055 set<string> keys;
1056 keys.insert(string(infover_key));
1057 keys.insert(string(info_key));
1058 keys.insert(string(biginfo_key));
1059 keys.insert(string(fastinfo_key));
1060 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
1061 map<string,bufferlist> values;
1062 auto ch = store->open_collection(coll);
1063 ceph_assert(ch);
1064 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
1065 ceph_assert(r == 0);
1066 ceph_assert(values.size() == 3 ||
1067 values.size() == 4);
1068
1069 auto p = values[string(infover_key)].cbegin();
1070 decode(struct_v, p);
1071 ceph_assert(struct_v >= 10);
1072
1073 p = values[string(info_key)].begin();
1074 decode(info, p);
1075
1076 p = values[string(biginfo_key)].begin();
1077 decode(past_intervals, p);
1078 decode(info.purged_snaps, p);
1079
1080 p = values[string(fastinfo_key)].begin();
1081 if (!p.end()) {
1082 pg_fast_info_t fast;
1083 decode(fast, p);
1084 fast.try_apply_to(&info);
1085 }
1086 return 0;
1087 }
1088
1089 void PG::read_state(ObjectStore *store)
1090 {
1091 PastIntervals past_intervals_from_disk;
1092 pg_info_t info_from_disk;
1093 int r = read_info(
1094 store,
1095 pg_id,
1096 coll,
1097 info_from_disk,
1098 past_intervals_from_disk,
1099 info_struct_v);
1100 ceph_assert(r >= 0);
1101
1102 if (info_struct_v < pg_compat_struct_v) {
1103 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
1104 << " an older version first." << dendl;
1105 ceph_abort_msg("PG too old to upgrade");
1106 }
1107
1108 recovery_state.init_from_disk_state(
1109 std::move(info_from_disk),
1110 std::move(past_intervals_from_disk),
1111 [this, store] (PGLog &pglog) {
1112 ostringstream oss;
1113 pglog.read_log_and_missing(
1114 store,
1115 ch,
1116 pgmeta_oid,
1117 info,
1118 oss,
1119 cct->_conf->osd_ignore_stale_divergent_priors,
1120 cct->_conf->osd_debug_verify_missing_on_start);
1121
1122 if (oss.tellp())
1123 osd->clog->error() << oss.str();
1124 return 0;
1125 });
1126
1127 if (info_struct_v < pg_latest_struct_v) {
1128 upgrade(store);
1129 }
1130
1131 // initialize current mapping
1132 {
1133 int primary, up_primary;
1134 vector<int> acting, up;
1135 get_osdmap()->pg_to_up_acting_osds(
1136 pg_id.pgid, &up, &up_primary, &acting, &primary);
1137 recovery_state.init_primary_up_acting(
1138 up,
1139 acting,
1140 up_primary,
1141 primary);
1142 recovery_state.set_role(OSDMap::calc_pg_role(pg_whoami, acting));
1143 }
1144
1145 // init pool options
1146 store->set_collection_opts(ch, pool.info.opts);
1147
1148 PeeringCtx rctx;
1149 handle_initialize(rctx);
1150 // note: we don't activate here because we know the OSD will advance maps
1151 // during boot.
1152 write_if_dirty(rctx.transaction);
1153 store->queue_transaction(ch, std::move(rctx.transaction));
1154 }
1155
1156 void PG::update_snap_map(
1157 const vector<pg_log_entry_t> &log_entries,
1158 ObjectStore::Transaction &t)
1159 {
1160 for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) {
1161 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
1162 if (i->soid.snap < CEPH_MAXSNAP) {
1163 if (i->is_delete()) {
1164 int r = snap_mapper.remove_oid(
1165 i->soid,
1166 &_t);
1167 if (r)
1168 derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
1169 // On removal tolerate missing key corruption
1170 ceph_assert(r == 0 || r == -ENOENT);
1171 } else if (i->is_update()) {
1172 ceph_assert(i->snaps.length() > 0);
1173 vector<snapid_t> snaps;
1174 bufferlist snapbl = i->snaps;
1175 auto p = snapbl.cbegin();
1176 try {
1177 decode(snaps, p);
1178 } catch (...) {
1179 derr << __func__ << " decode snaps failure on " << *i << dendl;
1180 snaps.clear();
1181 }
1182 set<snapid_t> _snaps(snaps.begin(), snaps.end());
1183
1184 if (i->is_clone() || i->is_promote()) {
1185 snap_mapper.add_oid(
1186 i->soid,
1187 _snaps,
1188 &_t);
1189 } else if (i->is_modify()) {
1190 int r = snap_mapper.update_snaps(
1191 i->soid,
1192 _snaps,
1193 0,
1194 &_t);
1195 ceph_assert(r == 0);
1196 } else {
1197 ceph_assert(i->is_clean());
1198 }
1199 }
1200 }
1201 }
1202 }
1203
1204 /**
1205 * filter trimming|trimmed snaps out of snapcontext
1206 */
1207 void PG::filter_snapc(vector<snapid_t> &snaps)
1208 {
1209 // nothing needs to trim, we can return immediately
1210 if (snap_trimq.empty() && info.purged_snaps.empty())
1211 return;
1212
1213 bool filtering = false;
1214 vector<snapid_t> newsnaps;
1215 for (vector<snapid_t>::iterator p = snaps.begin();
1216 p != snaps.end();
1217 ++p) {
1218 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
1219 if (!filtering) {
1220 // start building a new vector with what we've seen so far
1221 dout(10) << "filter_snapc filtering " << snaps << dendl;
1222 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
1223 filtering = true;
1224 }
1225 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
1226 } else {
1227 if (filtering)
1228 newsnaps.push_back(*p); // continue building new vector
1229 }
1230 }
1231 if (filtering) {
1232 snaps.swap(newsnaps);
1233 dout(10) << "filter_snapc result " << snaps << dendl;
1234 }
1235 }
1236
1237 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
1238 {
1239 for (auto it = m.begin(); it != m.end(); ++it)
1240 requeue_ops(it->second);
1241 m.clear();
1242 }
1243
1244 void PG::requeue_op(OpRequestRef op)
1245 {
1246 auto p = waiting_for_map.find(op->get_source());
1247 if (p != waiting_for_map.end()) {
1248 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
1249 << dendl;
1250 p->second.push_front(op);
1251 } else {
1252 dout(20) << __func__ << " " << op << dendl;
1253 osd->enqueue_front(
1254 OpSchedulerItem(
1255 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
1256 op->get_req()->get_cost(),
1257 op->get_req()->get_priority(),
1258 op->get_req()->get_recv_stamp(),
1259 op->get_req()->get_source().num(),
1260 get_osdmap_epoch()));
1261 }
1262 }
1263
1264 void PG::requeue_ops(list<OpRequestRef> &ls)
1265 {
1266 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
1267 i != ls.rend();
1268 ++i) {
1269 requeue_op(*i);
1270 }
1271 ls.clear();
1272 }
1273
1274 void PG::requeue_map_waiters()
1275 {
1276 epoch_t epoch = get_osdmap_epoch();
1277 auto p = waiting_for_map.begin();
1278 while (p != waiting_for_map.end()) {
1279 if (epoch < p->second.front()->min_epoch) {
1280 dout(20) << __func__ << " " << p->first << " front op "
1281 << p->second.front() << " must still wait, doing nothing"
1282 << dendl;
1283 ++p;
1284 } else {
1285 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
1286 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
1287 auto req = *q;
1288 osd->enqueue_front(OpSchedulerItem(
1289 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, req)),
1290 req->get_req()->get_cost(),
1291 req->get_req()->get_priority(),
1292 req->get_req()->get_recv_stamp(),
1293 req->get_req()->get_source().num(),
1294 epoch));
1295 }
1296 p = waiting_for_map.erase(p);
1297 }
1298 }
1299 }
1300
1301 bool PG::get_must_scrub() const
1302 {
1303 dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? "true" : "false") << dendl;
1304 return m_planned_scrub.must_scrub;
1305 }
1306
1307 unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
1308 {
1309 return m_scrubber->scrub_requeue_priority(with_priority);
1310 }
1311
1312 unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const
1313 {
1314 return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
1315 }
1316
1317 // ==========================================================================================
1318 // SCRUB
1319
1320 /*
1321 * implementation note:
1322 * PG::sched_scrub() is called only once per a specific scrub session.
1323 * That call commits us to the whatever choices are made (deep/shallow, etc').
1324 * Unless failing to start scrubbing, the 'planned scrub' flag-set is 'frozen' into
1325 * PgScrubber's m_flags, then cleared.
1326 */
1327 Scrub::schedule_result_t PG::sched_scrub()
1328 {
1329 dout(15) << __func__ << " pg(" << info.pgid
1330 << (is_active() ? ") <active>" : ") <not-active>")
1331 << (is_clean() ? " <clean>" : " <not-clean>") << dendl;
1332 ceph_assert(ceph_mutex_is_locked(_lock));
1333 ceph_assert(m_scrubber);
1334
1335 if (is_scrub_queued_or_active()) {
1336 return Scrub::schedule_result_t::already_started;
1337 }
1338
1339 if (!is_primary() || !is_active() || !is_clean()) {
1340 return Scrub::schedule_result_t::bad_pg_state;
1341 }
1342
1343 // analyse the combination of the requested scrub flags, the osd/pool configuration
1344 // and the PG status to determine whether we should scrub now, and what type of scrub
1345 // should that be.
1346 auto updated_flags = verify_scrub_mode();
1347 if (!updated_flags) {
1348 // the stars do not align for starting a scrub for this PG at this time
1349 // (due to configuration or priority issues)
1350 // The reason was already reported by the callee.
1351 dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
1352 return Scrub::schedule_result_t::preconditions;
1353 }
1354
1355 // try to reserve the local OSD resources. If failing: no harm. We will
1356 // be retried by the OSD later on.
1357 if (!m_scrubber->reserve_local()) {
1358 dout(10) << __func__ << ": failed to reserve locally" << dendl;
1359 return Scrub::schedule_result_t::no_local_resources;
1360 }
1361
1362 // can commit to the updated flags now, as nothing will stop the scrub
1363 m_planned_scrub = *updated_flags;
1364
1365 // An interrupted recovery repair could leave this set.
1366 state_clear(PG_STATE_REPAIR);
1367
1368 // Pass control to the scrubber. It is the scrubber that handles the replicas'
1369 // resources reservations.
1370 m_scrubber->set_op_parameters(m_planned_scrub);
1371
1372 dout(10) << __func__ << ": queueing" << dendl;
1373 osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority);
1374 return Scrub::schedule_result_t::scrub_initiated;
1375 }
1376
1377 double PG::next_deepscrub_interval() const
1378 {
1379 double deep_scrub_interval =
1380 pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
1381 if (deep_scrub_interval <= 0.0)
1382 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
1383 return info.history.last_deep_scrub_stamp + deep_scrub_interval;
1384 }
1385
1386 bool PG::is_time_for_deep(bool allow_deep_scrub,
1387 bool allow_scrub,
1388 bool has_deep_errors,
1389 const requested_scrub_t& planned) const
1390 {
1391 dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? "
1392 << allow_deep_scrub << dendl;
1393
1394 if (!allow_deep_scrub)
1395 return false;
1396
1397 if (planned.need_auto) {
1398 dout(10) << __func__ << ": need repair after scrub errors" << dendl;
1399 return true;
1400 }
1401
1402 if (ceph_clock_now() >= next_deepscrub_interval()) {
1403 dout(20) << __func__ << ": now (" << ceph_clock_now() << ") >= time for deep ("
1404 << next_deepscrub_interval() << ")" << dendl;
1405 return true;
1406 }
1407
1408 if (has_deep_errors) {
1409 osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid
1410 << " Deep scrub errors, upgrading scrub to deep-scrub";
1411 return true;
1412 }
1413
1414 // we only flip coins if 'allow_scrub' is asserted. Otherwise - as this function is
1415 // called often, we will probably be deep-scrubbing most of the time.
1416 if (allow_scrub) {
1417 bool deep_coin_flip =
1418 (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
1419
1420 dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep
1421 << " deep_coin_flip=" << deep_coin_flip << dendl;
1422
1423 if (deep_coin_flip)
1424 return true;
1425 }
1426
1427 return false;
1428 }
1429
1430 bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub,
1431 bool try_to_auto_repair,
1432 bool allow_regular_scrub,
1433 bool has_deep_errors,
1434 requested_scrub_t& planned) const
1435
1436 {
1437 ceph_assert(!planned.must_deep_scrub && !planned.must_repair);
1438
1439 if (!allow_deep_scrub && has_deep_errors) {
1440 osd->clog->error()
1441 << "osd." << osd->whoami << " pg " << info.pgid
1442 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
1443 return false;
1444 }
1445
1446 if (allow_deep_scrub) {
1447 // Initial entry and scheduled scrubs without nodeep_scrub set get here
1448
1449 planned.time_for_deep =
1450 is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned);
1451
1452 if (try_to_auto_repair) {
1453 if (planned.time_for_deep) {
1454 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
1455 planned.auto_repair = true;
1456 } else if (allow_regular_scrub) {
1457 dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found"
1458 << dendl;
1459 planned.deep_scrub_on_error = true;
1460 }
1461 }
1462 }
1463
1464 dout(20) << __func__ << " updated flags: " << planned
1465 << " allow_regular_scrub: " << allow_regular_scrub << dendl;
1466
1467 // NOSCRUB so skip regular scrubs
1468 if (!allow_regular_scrub && !planned.time_for_deep) {
1469 return false;
1470 }
1471
1472 return true;
1473 }
1474
1475 std::optional<requested_scrub_t> PG::verify_scrub_mode() const
1476 {
1477 const bool allow_regular_scrub =
1478 !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
1479 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
1480 const bool allow_deep_scrub =
1481 allow_regular_scrub &&
1482 !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
1483 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
1484 const bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
1485 const bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair &&
1486 get_pgbackend()->auto_repair_supported());
1487
1488 dout(10) << __func__ << " pg: " << info.pgid
1489 << " allow: " << allow_regular_scrub << "/" << allow_deep_scrub
1490 << " deep errs: " << has_deep_errors
1491 << " auto-repair: " << try_to_auto_repair << " ("
1492 << cct->_conf->osd_scrub_auto_repair << ")" << dendl;
1493
1494 auto upd_flags = m_planned_scrub;
1495
1496 upd_flags.time_for_deep = false;
1497 // Clear these in case user issues the scrub/repair command during
1498 // the scheduling of the scrub/repair (e.g. request reservation)
1499 upd_flags.deep_scrub_on_error = false;
1500 upd_flags.auto_repair = false;
1501
1502 if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) {
1503 osd->clog->error()
1504 << "osd." << osd->whoami << " pg " << info.pgid
1505 << " Regular scrub request, deep-scrub details will be lost";
1506 }
1507
1508 if (!upd_flags.must_scrub) {
1509 // All periodic scrub handling goes here because must_scrub is
1510 // always set for must_deep_scrub and must_repair.
1511
1512 const bool can_start_periodic = verify_periodic_scrub_mode(
1513 allow_deep_scrub, try_to_auto_repair, allow_regular_scrub,
1514 has_deep_errors, upd_flags);
1515 if (!can_start_periodic) {
1516 // "I don't want no scrub"
1517 dout(20) << __func__ << ": no periodic scrubs allowed" << dendl;
1518 return std::nullopt;
1519 }
1520 }
1521
1522 // scrubbing while recovering?
1523
1524 bool prevented_by_recovery =
1525 osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery &&
1526 (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair);
1527
1528 if (prevented_by_recovery) {
1529 dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl;
1530 return std::nullopt;
1531 }
1532
1533 upd_flags.need_auto = false;
1534 return upd_flags;
1535 }
1536
1537 /*
1538 * Note: on_info_history_change() is used in those two cases where we're not sure
1539 * whether the role of the PG was changed, and if so - was this change relayed to the
1540 * scrub-queue.
1541 */
1542 void PG::on_info_history_change()
1543 {
1544 dout(20) << __func__ << " for a " << (is_primary() ? "Primary" : "non-primary") <<dendl;
1545
1546 ceph_assert(m_scrubber);
1547 m_scrubber->on_maybe_registration_change(m_planned_scrub);
1548 }
1549
1550 void PG::reschedule_scrub()
1551 {
1552 dout(20) << __func__ << " for a " << (is_primary() ? "Primary" : "non-primary") <<dendl;
1553
1554 // we are assuming no change in primary status
1555 if (is_primary()) {
1556 ceph_assert(m_scrubber);
1557 m_scrubber->update_scrub_job(m_planned_scrub);
1558 }
1559 }
1560
1561 void PG::on_primary_status_change(bool was_primary, bool now_primary)
1562 {
1563 // make sure we have a working scrubber when becoming a primary
1564
1565 if (was_primary != now_primary) {
1566 ceph_assert(m_scrubber);
1567 m_scrubber->on_primary_change(m_planned_scrub);
1568 }
1569 }
1570
1571 void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
1572 {
1573 ceph_assert(m_scrubber);
1574 m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub);
1575 }
1576
1577 void PG::clear_ready_to_merge() {
1578 osd->clear_ready_to_merge(this);
1579 }
1580
1581 void PG::queue_want_pg_temp(const vector<int> &wanted) {
1582 osd->queue_want_pg_temp(get_pgid().pgid, wanted);
1583 }
1584
1585 void PG::clear_want_pg_temp() {
1586 osd->remove_want_pg_temp(get_pgid().pgid);
1587 }
1588
1589 void PG::on_role_change() {
1590 requeue_ops(waiting_for_peered);
1591 plpg_on_role_change();
1592 }
1593
1594 void PG::on_new_interval()
1595 {
1596 projected_last_update = eversion_t();
1597 cancel_recovery();
1598
1599 assert(m_scrubber);
1600 // log some scrub data before we react to the interval
1601 dout(20) << __func__ << (is_scrub_queued_or_active() ? " scrubbing " : " ")
1602 << "flags: " << m_planned_scrub << dendl;
1603
1604 m_scrubber->on_maybe_registration_change(m_planned_scrub);
1605 }
1606
1607 epoch_t PG::oldest_stored_osdmap() {
1608 return osd->get_superblock().oldest_map;
1609 }
1610
1611 OstreamTemp PG::get_clog_info() {
1612 return osd->clog->info();
1613 }
1614
1615 OstreamTemp PG::get_clog_debug() {
1616 return osd->clog->debug();
1617 }
1618
1619 OstreamTemp PG::get_clog_error() {
1620 return osd->clog->error();
1621 }
1622
1623 void PG::schedule_event_after(
1624 PGPeeringEventRef event,
1625 float delay) {
1626 std::lock_guard lock(osd->recovery_request_lock);
1627 osd->recovery_request_timer.add_event_after(
1628 delay,
1629 new QueuePeeringEvt(
1630 this,
1631 std::move(event)));
1632 }
1633
1634 void PG::request_local_background_io_reservation(
1635 unsigned priority,
1636 PGPeeringEventURef on_grant,
1637 PGPeeringEventURef on_preempt) {
1638 osd->local_reserver.request_reservation(
1639 pg_id,
1640 on_grant ? new QueuePeeringEvt(
1641 this, std::move(on_grant)) : nullptr,
1642 priority,
1643 on_preempt ? new QueuePeeringEvt(
1644 this, std::move(on_preempt)) : nullptr);
1645 }
1646
1647 void PG::update_local_background_io_priority(
1648 unsigned priority) {
1649 osd->local_reserver.update_priority(
1650 pg_id,
1651 priority);
1652 }
1653
1654 void PG::cancel_local_background_io_reservation() {
1655 osd->local_reserver.cancel_reservation(
1656 pg_id);
1657 }
1658
1659 void PG::request_remote_recovery_reservation(
1660 unsigned priority,
1661 PGPeeringEventURef on_grant,
1662 PGPeeringEventURef on_preempt) {
1663 osd->remote_reserver.request_reservation(
1664 pg_id,
1665 on_grant ? new QueuePeeringEvt(
1666 this, std::move(on_grant)) : nullptr,
1667 priority,
1668 on_preempt ? new QueuePeeringEvt(
1669 this, std::move(on_preempt)) : nullptr);
1670 }
1671
1672 void PG::cancel_remote_recovery_reservation() {
1673 osd->remote_reserver.cancel_reservation(
1674 pg_id);
1675 }
1676
1677 void PG::schedule_event_on_commit(
1678 ObjectStore::Transaction &t,
1679 PGPeeringEventRef on_commit)
1680 {
1681 t.register_on_commit(new QueuePeeringEvt(this, on_commit));
1682 }
1683
1684 void PG::on_activate(interval_set<snapid_t> snaps)
1685 {
1686 ceph_assert(!m_scrubber->are_callbacks_pending());
1687 ceph_assert(callbacks_for_degraded_object.empty());
1688 snap_trimq = snaps;
1689 release_pg_backoffs();
1690 projected_last_update = info.last_update;
1691 }
1692
1693 void PG::on_active_exit()
1694 {
1695 backfill_reserving = false;
1696 agent_stop();
1697 }
1698
1699 void PG::on_active_advmap(const OSDMapRef &osdmap)
1700 {
1701 const auto& new_removed_snaps = osdmap->get_new_removed_snaps();
1702 auto i = new_removed_snaps.find(get_pgid().pool());
1703 if (i != new_removed_snaps.end()) {
1704 bool bad = false;
1705 for (auto j : i->second) {
1706 if (snap_trimq.intersects(j.first, j.second)) {
1707 decltype(snap_trimq) added, overlap;
1708 added.insert(j.first, j.second);
1709 overlap.intersection_of(snap_trimq, added);
1710 derr << __func__ << " removed_snaps already contains "
1711 << overlap << dendl;
1712 bad = true;
1713 snap_trimq.union_of(added);
1714 } else {
1715 snap_trimq.insert(j.first, j.second);
1716 }
1717 }
1718 dout(10) << __func__ << " new removed_snaps " << i->second
1719 << ", snap_trimq now " << snap_trimq << dendl;
1720 ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps);
1721 }
1722
1723 const auto& new_purged_snaps = osdmap->get_new_purged_snaps();
1724 auto j = new_purged_snaps.find(get_pgid().pgid.pool());
1725 if (j != new_purged_snaps.end()) {
1726 bool bad = false;
1727 for (auto k : j->second) {
1728 if (!recovery_state.get_info().purged_snaps.contains(k.first, k.second)) {
1729 interval_set<snapid_t> rm, overlap;
1730 rm.insert(k.first, k.second);
1731 overlap.intersection_of(recovery_state.get_info().purged_snaps, rm);
1732 derr << __func__ << " purged_snaps does not contain "
1733 << rm << ", only " << overlap << dendl;
1734 recovery_state.adjust_purged_snaps(
1735 [&overlap](auto &purged_snaps) {
1736 purged_snaps.subtract(overlap);
1737 });
1738 // This can currently happen in the normal (if unlikely) course of
1739 // events. Because adding snaps to purged_snaps does not increase
1740 // the pg version or add a pg log entry, we don't reliably propagate
1741 // purged_snaps additions to other OSDs.
1742 // One example:
1743 // - purge S
1744 // - primary and replicas update purged_snaps
1745 // - no object updates
1746 // - pg mapping changes, new primary on different node
1747 // - new primary pg version == eversion_t(), so info is not
1748 // propagated.
1749 //bad = true;
1750 } else {
1751 recovery_state.adjust_purged_snaps(
1752 [&k](auto &purged_snaps) {
1753 purged_snaps.erase(k.first, k.second);
1754 });
1755 }
1756 }
1757 dout(10) << __func__ << " new purged_snaps " << j->second
1758 << ", now " << recovery_state.get_info().purged_snaps << dendl;
1759 ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps);
1760 }
1761 }
1762
1763 void PG::queue_snap_retrim(snapid_t snap)
1764 {
1765 if (!is_active() ||
1766 !is_primary()) {
1767 dout(10) << __func__ << " snap " << snap << " - not active and primary"
1768 << dendl;
1769 return;
1770 }
1771 if (!snap_trimq.contains(snap)) {
1772 snap_trimq.insert(snap);
1773 snap_trimq_repeat.insert(snap);
1774 dout(20) << __func__ << " snap " << snap
1775 << ", trimq now " << snap_trimq
1776 << ", repeat " << snap_trimq_repeat << dendl;
1777 kick_snap_trim();
1778 } else {
1779 dout(20) << __func__ << " snap " << snap
1780 << " already in trimq " << snap_trimq << dendl;
1781 }
1782 }
1783
1784 void PG::on_active_actmap()
1785 {
1786 if (cct->_conf->osd_check_for_log_corruption)
1787 check_log_for_corruption(osd->store);
1788
1789
1790 if (recovery_state.is_active()) {
1791 dout(10) << "Active: kicking snap trim" << dendl;
1792 kick_snap_trim();
1793 }
1794
1795 if (recovery_state.is_peered() &&
1796 !recovery_state.is_clean() &&
1797 !recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
1798 (!recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) ||
1799 recovery_state.is_degraded())) {
1800 queue_recovery();
1801 }
1802 }
1803
1804 void PG::on_backfill_reserved()
1805 {
1806 backfill_reserving = false;
1807 queue_recovery();
1808 }
1809
1810 void PG::on_backfill_canceled()
1811 {
1812 if (!waiting_on_backfill.empty()) {
1813 waiting_on_backfill.clear();
1814 finish_recovery_op(hobject_t::get_max());
1815 }
1816 }
1817
1818 void PG::on_recovery_reserved()
1819 {
1820 queue_recovery();
1821 }
1822
1823 void PG::set_not_ready_to_merge_target(pg_t pgid, pg_t src)
1824 {
1825 osd->set_not_ready_to_merge_target(pgid, src);
1826 }
1827
1828 void PG::set_not_ready_to_merge_source(pg_t pgid)
1829 {
1830 osd->set_not_ready_to_merge_source(pgid);
1831 }
1832
1833 void PG::set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec)
1834 {
1835 osd->set_ready_to_merge_target(this, lu, les, lec);
1836 }
1837
1838 void PG::set_ready_to_merge_source(eversion_t lu)
1839 {
1840 osd->set_ready_to_merge_source(this, lu);
1841 }
1842
1843 void PG::send_pg_created(pg_t pgid)
1844 {
1845 osd->send_pg_created(pgid);
1846 }
1847
1848 ceph::signedspan PG::get_mnow()
1849 {
1850 return osd->get_mnow();
1851 }
1852
1853 HeartbeatStampsRef PG::get_hb_stamps(int peer)
1854 {
1855 return osd->get_hb_stamps(peer);
1856 }
1857
1858 void PG::schedule_renew_lease(epoch_t lpr, ceph::timespan delay)
1859 {
1860 auto spgid = info.pgid;
1861 auto o = osd;
1862 osd->mono_timer.add_event(
1863 delay,
1864 [o, lpr, spgid]() {
1865 o->queue_renew_lease(lpr, spgid);
1866 });
1867 }
1868
1869 void PG::queue_check_readable(epoch_t lpr, ceph::timespan delay)
1870 {
1871 osd->queue_check_readable(info.pgid, lpr, delay);
1872 }
1873
1874 void PG::rebuild_missing_set_with_deletes(PGLog &pglog)
1875 {
1876 pglog.rebuild_missing_set_with_deletes(
1877 osd->store,
1878 ch,
1879 recovery_state.get_info());
1880 }
1881
1882 void PG::on_activate_committed()
1883 {
1884 if (!is_primary()) {
1885 // waiters
1886 if (recovery_state.needs_flush() == 0) {
1887 requeue_ops(waiting_for_peered);
1888 } else if (!waiting_for_peered.empty()) {
1889 dout(10) << __func__ << " flushes in progress, moving "
1890 << waiting_for_peered.size() << " items to waiting_for_flush"
1891 << dendl;
1892 ceph_assert(waiting_for_flush.empty());
1893 waiting_for_flush.swap(waiting_for_peered);
1894 }
1895 }
1896 }
1897
1898 // Compute pending backfill data
1899 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
1900 {
1901 lgeneric_dout(cct, 20) << __func__ << " Adjust local usage "
1902 << (local_bytes >> 10) << "KiB"
1903 << " primary usage " << (bf_bytes >> 10)
1904 << "KiB" << dendl;
1905
1906 return std::max((int64_t)0, bf_bytes - local_bytes);
1907 }
1908
1909
1910 // We can zero the value of primary num_bytes as just an atomic.
1911 // However, setting above zero reserves space for backfill and requires
1912 // the OSDService::stat_lock which protects all OSD usage
1913 bool PG::try_reserve_recovery_space(
1914 int64_t primary_bytes, int64_t local_bytes) {
1915 // Use tentative_bacfill_full() to make sure enough
1916 // space is available to handle target bytes from primary.
1917
1918 // TODO: If we passed num_objects from primary we could account for
1919 // an estimate of the metadata overhead.
1920
1921 // TODO: If we had compressed_allocated and compressed_original from primary
1922 // we could compute compression ratio and adjust accordingly.
1923
1924 // XXX: There is no way to get omap overhead and this would only apply
1925 // to whatever possibly different partition that is storing the database.
1926
1927 // update_osd_stat() from heartbeat will do this on a new
1928 // statfs using ps->primary_bytes.
1929 uint64_t pending_adjustment = 0;
1930 if (primary_bytes) {
1931 // For erasure coded pool overestimate by a full stripe per object
1932 // because we don't know how each objected rounded to the nearest stripe
1933 if (pool.info.is_erasure()) {
1934 primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
1935 primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
1936 info.stats.stats.sum.num_objects;
1937 local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
1938 local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
1939 info.stats.stats.sum.num_objects;
1940 }
1941 pending_adjustment = pending_backfill(
1942 cct,
1943 primary_bytes,
1944 local_bytes);
1945 dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10)
1946 << "KiB"
1947 << " local " << (local_bytes >> 10) << "KiB"
1948 << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
1949 << dendl;
1950 }
1951
1952 // This lock protects not only the stats OSDService but also setting the
1953 // pg primary_bytes. That's why we don't immediately unlock
1954 std::lock_guard l{osd->stat_lock};
1955 osd_stat_t cur_stat = osd->osd_stat;
1956 if (cct->_conf->osd_debug_reject_backfill_probability > 0 &&
1957 (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
1958 dout(10) << "backfill reservation rejected: failure injection"
1959 << dendl;
1960 return false;
1961 } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
1962 osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) {
1963 dout(10) << "backfill reservation rejected: backfill full"
1964 << dendl;
1965 return false;
1966 } else {
1967 // Don't reserve space if skipped reservation check, this is used
1968 // to test the other backfill full check AND in case a corruption
1969 // of num_bytes requires ignoring that value and trying the
1970 // backfill anyway.
1971 if (primary_bytes &&
1972 !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
1973 primary_num_bytes.store(primary_bytes);
1974 local_num_bytes.store(local_bytes);
1975 } else {
1976 unreserve_recovery_space();
1977 }
1978 return true;
1979 }
1980 }
1981
1982 void PG::unreserve_recovery_space() {
1983 primary_num_bytes.store(0);
1984 local_num_bytes.store(0);
1985 }
1986
1987 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
1988 {
1989 ObjectStore::Transaction t;
1990 eversion_t trimmed_to = recovery_state.get_last_rollback_info_trimmed_to_applied();
1991 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
1992 i != rollback_obs.end();
1993 ++i) {
1994 if (i->generation < trimmed_to.version) {
1995 dout(10) << __func__ << "osd." << osd->whoami
1996 << " pg " << info.pgid
1997 << " found obsolete rollback obj "
1998 << *i << " generation < trimmed_to "
1999 << trimmed_to
2000 << "...repaired" << dendl;
2001 t.remove(coll, *i);
2002 }
2003 }
2004 if (!t.empty()) {
2005 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
2006 << dendl;
2007 osd->store->queue_transaction(ch, std::move(t), NULL);
2008 }
2009 }
2010
2011
2012 void PG::_repair_oinfo_oid(ScrubMap &smap)
2013 {
2014 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
2015 i != smap.objects.rend();
2016 ++i) {
2017 const hobject_t &hoid = i->first;
2018 ScrubMap::object &o = i->second;
2019
2020 bufferlist bl;
2021 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
2022 continue;
2023 }
2024 bl.push_back(o.attrs[OI_ATTR]);
2025 object_info_t oi;
2026 try {
2027 oi.decode(bl);
2028 } catch(...) {
2029 continue;
2030 }
2031 if (oi.soid != hoid) {
2032 ObjectStore::Transaction t;
2033 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
2034 osd->clog->error() << "osd." << osd->whoami
2035 << " found object info error on pg "
2036 << info.pgid
2037 << " oid " << hoid << " oid in object info: "
2038 << oi.soid
2039 << "...repaired";
2040 // Fix object info
2041 oi.soid = hoid;
2042 bl.clear();
2043 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
2044
2045 bufferptr bp(bl.c_str(), bl.length());
2046 o.attrs[OI_ATTR] = bp;
2047
2048 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
2049 int r = osd->store->queue_transaction(ch, std::move(t));
2050 if (r != 0) {
2051 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
2052 << dendl;
2053 }
2054 }
2055 }
2056 }
2057
2058 void PG::repair_object(
2059 const hobject_t &soid,
2060 const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers,
2061 const set<pg_shard_t> &bad_peers)
2062 {
2063 set<pg_shard_t> ok_shards;
2064 for (auto &&peer: ok_peers) ok_shards.insert(peer.second);
2065
2066 dout(10) << "repair_object " << soid
2067 << " bad_peers osd.{" << bad_peers << "},"
2068 << " ok_peers osd.{" << ok_shards << "}" << dendl;
2069
2070 const ScrubMap::object &po = ok_peers.back().first;
2071 eversion_t v;
2072 object_info_t oi;
2073 try {
2074 bufferlist bv;
2075 if (po.attrs.count(OI_ATTR)) {
2076 bv.push_back(po.attrs.find(OI_ATTR)->second);
2077 }
2078 auto bliter = bv.cbegin();
2079 decode(oi, bliter);
2080 } catch (...) {
2081 dout(0) << __func__ << ": Need version of replica, bad object_info_t: "
2082 << soid << dendl;
2083 ceph_abort();
2084 }
2085
2086 if (bad_peers.count(get_primary())) {
2087 // We should only be scrubbing if the PG is clean.
2088 ceph_assert(waiting_for_unreadable_object.empty());
2089 dout(10) << __func__ << ": primary = " << get_primary() << dendl;
2090 }
2091
2092 /* No need to pass ok_peers, they must not be missing the object, so
2093 * force_object_missing will add them to missing_loc anyway */
2094 recovery_state.force_object_missing(bad_peers, soid, oi.version);
2095 }
2096
2097 void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc)
2098 {
2099 dout(20) << __func__ << ": " << desc << " queued at: " << epoch_queued << dendl;
2100 ceph_assert(m_scrubber);
2101 if (is_active()) {
2102 ((*m_scrubber).*fn)(epoch_queued);
2103 } else {
2104 // pg might be in the process of being deleted
2105 dout(5) << __func__ << " refusing to forward. " << (is_clean() ? "(clean) " : "(not clean) ") <<
2106 (is_active() ? "(active) " : "(not active) ") << dendl;
2107 }
2108 }
2109
2110 void PG::forward_scrub_event(ScrubSafeAPI fn,
2111 epoch_t epoch_queued,
2112 Scrub::act_token_t act_token,
2113 std::string_view desc)
2114 {
2115 dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued
2116 << " token: " << act_token << dendl;
2117 ceph_assert(m_scrubber);
2118 if (is_active()) {
2119 ((*m_scrubber).*fn)(epoch_queued, act_token);
2120 } else {
2121 // pg might be in the process of being deleted
2122 dout(5) << __func__ << " refusing to forward. "
2123 << (is_clean() ? "(clean) " : "(not clean) ")
2124 << (is_active() ? "(active) " : "(not active) ") << dendl;
2125 }
2126 }
2127
2128 void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
2129 {
2130 dout(10) << __func__ << " (op)" << dendl;
2131 ceph_assert(m_scrubber);
2132 m_scrubber->replica_scrub_op(op);
2133 }
2134
2135 void PG::replica_scrub(epoch_t epoch_queued,
2136 Scrub::act_token_t act_token,
2137 [[maybe_unused]] ThreadPool::TPHandle& handle)
2138 {
2139 dout(10) << __func__ << " queued at: " << epoch_queued
2140 << (is_primary() ? " (primary)" : " (replica)") << dendl;
2141 forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token,
2142 "StartReplica/nw");
2143 }
2144
2145 bool PG::ops_blocked_by_scrub() const
2146 {
2147 return !waiting_for_scrub.empty();
2148 }
2149
2150 Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const
2151 {
2152 return waiting_for_scrub.empty() ? Scrub::scrub_prio_t::low_priority
2153 : Scrub::scrub_prio_t::high_priority;
2154 }
2155
2156 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
2157 {
2158 if (auto last_reset = get_last_peering_reset();
2159 last_reset > reply_epoch || last_reset > query_epoch) {
2160 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch "
2161 << query_epoch << " last_peering_reset " << last_reset << dendl;
2162 return true;
2163 }
2164 return false;
2165 }
2166
2167 struct FlushState {
2168 PGRef pg;
2169 epoch_t epoch;
2170 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
2171 ~FlushState() {
2172 std::scoped_lock l{*pg};
2173 if (!pg->pg_has_reset_since(epoch)) {
2174 pg->recovery_state.complete_flush();
2175 }
2176 }
2177 };
2178 typedef std::shared_ptr<FlushState> FlushStateRef;
2179
2180 void PG::start_flush_on_transaction(ObjectStore::Transaction &t)
2181 {
2182 // flush in progress ops
2183 FlushStateRef flush_trigger (std::make_shared<FlushState>(
2184 this, get_osdmap_epoch()));
2185 t.register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
2186 t.register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
2187 }
2188
2189 bool PG::try_flush_or_schedule_async()
2190 {
2191 Context *c = new QueuePeeringEvt(
2192 this, get_osdmap_epoch(), PeeringState::IntervalFlush());
2193 if (!ch->flush_commit(c)) {
2194 return false;
2195 } else {
2196 delete c;
2197 return true;
2198 }
2199 }
2200
2201 ostream& operator<<(ostream& out, const PG& pg)
2202 {
2203 out << pg.recovery_state;
2204
2205 // listing all scrub-related flags - both current and "planned next scrub"
2206 if (pg.is_scrubbing()) {
2207 out << *pg.m_scrubber;
2208 }
2209 out << pg.m_planned_scrub;
2210
2211 if (pg.recovery_ops_active)
2212 out << " rops=" << pg.recovery_ops_active;
2213
2214 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
2215 if (pg.recovery_state.have_missing()) {
2216 out << " m=" << pg.recovery_state.get_num_missing();
2217 if (pg.is_primary()) {
2218 uint64_t unfound = pg.recovery_state.get_num_unfound();
2219 if (unfound)
2220 out << " u=" << unfound;
2221 }
2222 }
2223 if (!pg.is_clean()) {
2224 out << " mbc=" << pg.recovery_state.get_missing_by_count();
2225 }
2226 if (!pg.snap_trimq.empty()) {
2227 out << " trimq=";
2228 // only show a count if the set is large
2229 if (pg.snap_trimq.num_intervals() > 16) {
2230 out << pg.snap_trimq.size();
2231 if (!pg.snap_trimq_repeat.empty()) {
2232 out << "(" << pg.snap_trimq_repeat.size() << ")";
2233 }
2234 } else {
2235 out << pg.snap_trimq;
2236 if (!pg.snap_trimq_repeat.empty()) {
2237 out << "(" << pg.snap_trimq_repeat << ")";
2238 }
2239 }
2240 }
2241 if (!pg.recovery_state.get_info().purged_snaps.empty()) {
2242 out << " ps="; // snap trim queue / purged snaps
2243 if (pg.recovery_state.get_info().purged_snaps.num_intervals() > 16) {
2244 out << pg.recovery_state.get_info().purged_snaps.size();
2245 } else {
2246 out << pg.recovery_state.get_info().purged_snaps;
2247 }
2248 }
2249
2250 out << "]";
2251 return out;
2252 }
2253
2254 bool PG::can_discard_op(OpRequestRef& op)
2255 {
2256 auto m = op->get_req<MOSDOp>();
2257 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
2258 dout(20) << " discard " << *m << dendl;
2259 return true;
2260 }
2261
2262 if (m->get_map_epoch() < info.history.same_primary_since) {
2263 dout(7) << " changed after " << m->get_map_epoch()
2264 << ", dropping " << *m << dendl;
2265 return true;
2266 }
2267
2268 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
2269 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
2270 !is_primary() &&
2271 m->get_map_epoch() < info.history.same_interval_since) {
2272 // Note: the Objecter will resend on interval change without the primary
2273 // changing if it actually sent to a replica. If the primary hasn't
2274 // changed since the send epoch, we got it, and we're primary, it won't
2275 // have resent even if the interval did change as it sent it to the primary
2276 // (us).
2277 return true;
2278 }
2279
2280
2281 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
2282 // >= luminous client
2283 if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
2284 // >= nautilus client
2285 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
2286 dout(7) << __func__ << " sent before last_force_op_resend "
2287 << pool.info.last_force_op_resend
2288 << ", dropping" << *m << dendl;
2289 return true;
2290 }
2291 } else {
2292 // == < nautilus client (luminous or mimic)
2293 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
2294 dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
2295 << pool.info.last_force_op_resend_prenautilus
2296 << ", dropping" << *m << dendl;
2297 return true;
2298 }
2299 }
2300 if (m->get_map_epoch() < info.history.last_epoch_split) {
2301 dout(7) << __func__ << " pg split in "
2302 << info.history.last_epoch_split << ", dropping" << dendl;
2303 return true;
2304 }
2305 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
2306 // < luminous client
2307 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
2308 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
2309 << pool.info.last_force_op_resend_preluminous
2310 << ", dropping" << *m << dendl;
2311 return true;
2312 }
2313 }
2314
2315 return false;
2316 }
2317
2318 template<typename T, int MSGTYPE>
2319 bool PG::can_discard_replica_op(OpRequestRef& op)
2320 {
2321 auto m = op->get_req<T>();
2322 ceph_assert(m->get_type() == MSGTYPE);
2323
2324 int from = m->get_source().num();
2325
2326 // if a repop is replied after a replica goes down in a new osdmap, and
2327 // before the pg advances to this new osdmap, the repop replies before this
2328 // repop can be discarded by that replica OSD, because the primary resets the
2329 // connection to it when handling the new osdmap marking it down, and also
2330 // resets the messenger sesssion when the replica reconnects. to avoid the
2331 // out-of-order replies, the messages from that replica should be discarded.
2332 OSDMapRef next_map = osd->get_next_osdmap();
2333 if (next_map->is_down(from)) {
2334 dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl;
2335 return true;
2336 }
2337 /* Mostly, this overlaps with the old_peering_msg
2338 * condition. An important exception is pushes
2339 * sent by replicas not in the acting set, since
2340 * if such a replica goes down it does not cause
2341 * a new interval. */
2342 if (next_map->get_down_at(from) >= m->map_epoch) {
2343 dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl;
2344 return true;
2345 }
2346
2347 // same pg?
2348 // if pg changes _at all_, we reset and repeer!
2349 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
2350 dout(10) << "can_discard_replica_op pg changed " << info.history
2351 << " after " << m->map_epoch
2352 << ", dropping" << dendl;
2353 return true;
2354 }
2355 return false;
2356 }
2357
2358 bool PG::can_discard_scan(OpRequestRef op)
2359 {
2360 auto m = op->get_req<MOSDPGScan>();
2361 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
2362
2363 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
2364 dout(10) << " got old scan, ignoring" << dendl;
2365 return true;
2366 }
2367 return false;
2368 }
2369
2370 bool PG::can_discard_backfill(OpRequestRef op)
2371 {
2372 auto m = op->get_req<MOSDPGBackfill>();
2373 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
2374
2375 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
2376 dout(10) << " got old backfill, ignoring" << dendl;
2377 return true;
2378 }
2379
2380 return false;
2381
2382 }
2383
2384 bool PG::can_discard_request(OpRequestRef& op)
2385 {
2386 switch (op->get_req()->get_type()) {
2387 case CEPH_MSG_OSD_OP:
2388 return can_discard_op(op);
2389 case CEPH_MSG_OSD_BACKOFF:
2390 return false; // never discard
2391 case MSG_OSD_REPOP:
2392 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
2393 case MSG_OSD_PG_PUSH:
2394 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
2395 case MSG_OSD_PG_PULL:
2396 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
2397 case MSG_OSD_PG_PUSH_REPLY:
2398 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
2399 case MSG_OSD_REPOPREPLY:
2400 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
2401 case MSG_OSD_PG_RECOVERY_DELETE:
2402 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
2403
2404 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
2405 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
2406
2407 case MSG_OSD_EC_WRITE:
2408 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
2409 case MSG_OSD_EC_WRITE_REPLY:
2410 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
2411 case MSG_OSD_EC_READ:
2412 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
2413 case MSG_OSD_EC_READ_REPLY:
2414 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
2415 case MSG_OSD_REP_SCRUB:
2416 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
2417 case MSG_OSD_SCRUB_RESERVE:
2418 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
2419 case MSG_OSD_REP_SCRUBMAP:
2420 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
2421 case MSG_OSD_PG_UPDATE_LOG_MISSING:
2422 return can_discard_replica_op<
2423 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
2424 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
2425 return can_discard_replica_op<
2426 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
2427
2428 case MSG_OSD_PG_SCAN:
2429 return can_discard_scan(op);
2430 case MSG_OSD_PG_BACKFILL:
2431 return can_discard_backfill(op);
2432 case MSG_OSD_PG_BACKFILL_REMOVE:
2433 return can_discard_replica_op<MOSDPGBackfillRemove,
2434 MSG_OSD_PG_BACKFILL_REMOVE>(op);
2435 }
2436 return true;
2437 }
2438
2439 void PG::do_peering_event(PGPeeringEventRef evt, PeeringCtx &rctx)
2440 {
2441 dout(10) << __func__ << ": " << evt->get_desc() << dendl;
2442 ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
2443 if (old_peering_evt(evt)) {
2444 dout(10) << "discard old " << evt->get_desc() << dendl;
2445 } else {
2446 recovery_state.handle_event(evt, &rctx);
2447 }
2448 // write_if_dirty regardless of path above to ensure we capture any work
2449 // done by OSD::advance_pg().
2450 write_if_dirty(rctx.transaction);
2451 }
2452
2453 void PG::queue_peering_event(PGPeeringEventRef evt)
2454 {
2455 if (old_peering_evt(evt))
2456 return;
2457 osd->osd->enqueue_peering_evt(info.pgid, evt);
2458 }
2459
2460 void PG::queue_null(epoch_t msg_epoch,
2461 epoch_t query_epoch)
2462 {
2463 dout(10) << "null" << dendl;
2464 queue_peering_event(
2465 PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
2466 NullEvt())));
2467 }
2468
2469 void PG::find_unfound(epoch_t queued, PeeringCtx &rctx)
2470 {
2471 /*
2472 * if we couldn't start any recovery ops and things are still
2473 * unfound, see if we can discover more missing object locations.
2474 * It may be that our initial locations were bad and we errored
2475 * out while trying to pull.
2476 */
2477 if (!recovery_state.discover_all_missing(rctx)) {
2478 string action;
2479 if (state_test(PG_STATE_BACKFILLING)) {
2480 auto evt = PGPeeringEventRef(
2481 new PGPeeringEvent(
2482 queued,
2483 queued,
2484 PeeringState::UnfoundBackfill()));
2485 queue_peering_event(evt);
2486 action = "in backfill";
2487 } else if (state_test(PG_STATE_RECOVERING)) {
2488 auto evt = PGPeeringEventRef(
2489 new PGPeeringEvent(
2490 queued,
2491 queued,
2492 PeeringState::UnfoundRecovery()));
2493 queue_peering_event(evt);
2494 action = "in recovery";
2495 } else {
2496 action = "already out of recovery/backfill";
2497 }
2498 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
2499 } else {
2500 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
2501 queue_recovery();
2502 }
2503 }
2504
2505 void PG::handle_advance_map(
2506 OSDMapRef osdmap, OSDMapRef lastmap,
2507 vector<int>& newup, int up_primary,
2508 vector<int>& newacting, int acting_primary,
2509 PeeringCtx &rctx)
2510 {
2511 dout(10) << __func__ << ": " << osdmap->get_epoch() << dendl;
2512 osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
2513 recovery_state.advance_map(
2514 osdmap,
2515 lastmap,
2516 newup,
2517 up_primary,
2518 newacting,
2519 acting_primary,
2520 rctx);
2521 }
2522
2523 void PG::handle_activate_map(PeeringCtx &rctx)
2524 {
2525 dout(10) << __func__ << ": " << get_osdmap()->get_epoch()
2526 << dendl;
2527 recovery_state.activate_map(rctx);
2528
2529 requeue_map_waiters();
2530 }
2531
2532 void PG::handle_initialize(PeeringCtx &rctx)
2533 {
2534 dout(10) << __func__ << dendl;
2535 PeeringState::Initialize evt;
2536 recovery_state.handle_event(evt, &rctx);
2537 }
2538
2539
2540 void PG::handle_query_state(Formatter *f)
2541 {
2542 dout(10) << "handle_query_state" << dendl;
2543 PeeringState::QueryState q(f);
2544 recovery_state.handle_event(q, 0);
2545 }
2546
2547 void PG::init_collection_pool_opts()
2548 {
2549 auto r = osd->store->set_collection_opts(ch, pool.info.opts);
2550 if (r < 0 && r != -EOPNOTSUPP) {
2551 derr << __func__ << " set_collection_opts returns error:" << r << dendl;
2552 }
2553 }
2554
2555 void PG::on_pool_change()
2556 {
2557 init_collection_pool_opts();
2558 plpg_on_pool_change();
2559 }
2560
2561 void PG::C_DeleteMore::complete(int r) {
2562 ceph_assert(r == 0);
2563 pg->lock();
2564 if (!pg->pg_has_reset_since(epoch)) {
2565 pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
2566 }
2567 pg->unlock();
2568 delete this;
2569 }
2570
2571 std::pair<ghobject_t, bool> PG::do_delete_work(
2572 ObjectStore::Transaction &t,
2573 ghobject_t _next)
2574 {
2575 dout(10) << __func__ << dendl;
2576
2577 {
2578 float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
2579 if (osd_delete_sleep > 0 && delete_needs_sleep) {
2580 epoch_t e = get_osdmap()->get_epoch();
2581 PGRef pgref(this);
2582 auto delete_requeue_callback = new LambdaContext([this, pgref, e](int r) {
2583 dout(20) << "do_delete_work() [cb] wake up at "
2584 << ceph_clock_now()
2585 << ", re-queuing delete" << dendl;
2586 std::scoped_lock locker{*this};
2587 delete_needs_sleep = false;
2588 if (!pg_has_reset_since(e)) {
2589 osd->queue_for_pg_delete(get_pgid(), e);
2590 }
2591 });
2592
2593 auto delete_schedule_time = ceph::real_clock::now();
2594 delete_schedule_time += ceph::make_timespan(osd_delete_sleep);
2595 std::lock_guard l{osd->sleep_lock};
2596 osd->sleep_timer.add_event_at(delete_schedule_time,
2597 delete_requeue_callback);
2598 dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
2599 return std::make_pair(_next, true);
2600 }
2601 }
2602
2603 delete_needs_sleep = true;
2604
2605 ghobject_t next;
2606
2607 vector<ghobject_t> olist;
2608 int max = std::min(osd->store->get_ideal_list_max(),
2609 (int)cct->_conf->osd_target_transaction_size);
2610
2611 osd->store->collection_list(
2612 ch,
2613 _next,
2614 ghobject_t::get_max(),
2615 max,
2616 &olist,
2617 &next);
2618 dout(20) << __func__ << " " << olist << dendl;
2619
2620 // make sure we've removed everything
2621 // by one more listing from the beginning
2622 if (_next != ghobject_t() && olist.empty()) {
2623 next = ghobject_t();
2624 osd->store->collection_list(
2625 ch,
2626 next,
2627 ghobject_t::get_max(),
2628 max,
2629 &olist,
2630 &next);
2631 for (auto& oid : olist) {
2632 if (oid == pgmeta_oid) {
2633 dout(20) << __func__ << " removing pgmeta object " << oid << dendl;
2634 } else {
2635 dout(0) << __func__ << " additional unexpected onode"
2636 <<" new onode has appeared since PG removal started"
2637 << oid << dendl;
2638 }
2639 }
2640 }
2641
2642 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
2643 int64_t num = 0;
2644 for (auto& oid : olist) {
2645 if (oid == pgmeta_oid) {
2646 continue;
2647 }
2648 if (oid.is_pgmeta()) {
2649 osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
2650 << " during PG removal";
2651 }
2652 int r = snap_mapper.remove_oid(oid.hobj, &_t);
2653 if (r != 0 && r != -ENOENT) {
2654 ceph_abort();
2655 }
2656 t.remove(coll, oid);
2657 ++num;
2658 }
2659 bool running = true;
2660 if (num) {
2661 dout(20) << __func__ << " deleting " << num << " objects" << dendl;
2662 Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
2663 t.register_on_commit(fin);
2664 } else {
2665 if (cct->_conf->osd_inject_failure_on_pg_removal) {
2666 _exit(1);
2667 }
2668
2669 // final flush here to ensure completions drop refs. Of particular concern
2670 // are the SnapMapper ContainerContexts.
2671 {
2672 PGRef pgref(this);
2673 PGLog::clear_info_log(info.pgid, &t);
2674 t.remove_collection(coll);
2675 t.register_on_commit(new ContainerContext<PGRef>(pgref));
2676 t.register_on_applied(new ContainerContext<PGRef>(pgref));
2677 osd->store->queue_transaction(ch, std::move(t));
2678 }
2679 ch->flush();
2680
2681 if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
2682 dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
2683 ch = osd->store->create_new_collection(coll);
2684 create_pg_collection(t,
2685 info.pgid,
2686 info.pgid.get_split_bits(pool.info.get_pg_num()));
2687 init_pg_ondisk(t, info.pgid, &pool.info);
2688 recovery_state.reset_last_persisted();
2689 } else {
2690 recovery_state.set_delete_complete();
2691
2692 // cancel reserver here, since the PG is about to get deleted and the
2693 // exit() methods don't run when that happens.
2694 osd->local_reserver.cancel_reservation(info.pgid);
2695
2696 running = false;
2697 }
2698 }
2699 return {next, running};
2700 }
2701
2702 int PG::pg_stat_adjust(osd_stat_t *ns)
2703 {
2704 osd_stat_t &new_stat = *ns;
2705 if (is_primary()) {
2706 return 0;
2707 }
2708 // Adjust the kb_used by adding pending backfill data
2709 uint64_t reserved_num_bytes = get_reserved_num_bytes();
2710
2711 // For now we don't consider projected space gains here
2712 // I suggest we have an optional 2 pass backfill that frees up
2713 // space in a first pass. This could be triggered when at nearfull
2714 // or near to backfillfull.
2715 if (reserved_num_bytes > 0) {
2716 // TODO: Handle compression by adjusting by the PGs average
2717 // compression precentage.
2718 dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
2719 << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
2720 if (new_stat.statfs.available > reserved_num_bytes)
2721 new_stat.statfs.available -= reserved_num_bytes;
2722 else
2723 new_stat.statfs.available = 0;
2724 dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
2725 return 1;
2726 }
2727 return 0;
2728 }
2729
2730 void PG::dump_pgstate_history(Formatter *f)
2731 {
2732 std::scoped_lock l{*this};
2733 recovery_state.dump_history(f);
2734 }
2735
2736 void PG::dump_missing(Formatter *f)
2737 {
2738 for (auto& i : recovery_state.get_pg_log().get_missing().get_items()) {
2739 f->open_object_section("object");
2740 f->dump_object("oid", i.first);
2741 f->dump_object("missing_info", i.second);
2742 if (recovery_state.get_missing_loc().needs_recovery(i.first)) {
2743 f->dump_bool(
2744 "unfound",
2745 recovery_state.get_missing_loc().is_unfound(i.first));
2746 f->open_array_section("locations");
2747 for (auto l : recovery_state.get_missing_loc().get_locations(i.first)) {
2748 f->dump_object("shard", l);
2749 }
2750 f->close_section();
2751 }
2752 f->close_section();
2753 }
2754 }
2755
2756 void PG::with_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)>&& f)
2757 {
2758 std::lock_guard l{pg_stats_publish_lock};
2759 if (pg_stats_publish) {
2760 f(*pg_stats_publish, pg_stats_publish->get_effective_last_epoch_clean());
2761 }
2762 }
2763
2764 void PG::with_heartbeat_peers(std::function<void(int)>&& f)
2765 {
2766 std::lock_guard l{heartbeat_peer_lock};
2767 for (auto p : heartbeat_peers) {
2768 f(p);
2769 }
2770 for (auto p : probe_targets) {
2771 f(p);
2772 }
2773 }
2774
2775 uint64_t PG::get_min_alloc_size() const {
2776 return osd->store->get_min_alloc_size();
2777 }