git.proxmox.com Git: ceph.git / ceph/src/osd/PG.cc (update sources to ceph Nautilus 14.2.1)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDRepOp.h"
54 #include "messages/MOSDRepOpReply.h"
55 #include "messages/MOSDRepScrubMap.h"
56 #include "messages/MOSDPGRecoveryDelete.h"
57 #include "messages/MOSDPGRecoveryDeleteReply.h"
58
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
61
62 #ifdef WITH_LTTNG
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
68 #else
69 #define tracepoint(...)
70 #endif
71
72 #include <sstream>
73
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
76 #undef dout_prefix
77 #define dout_prefix _prefix(_dout, this)
78
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80 // easily skip them
81 const string infover_key("_infover");
82 const string info_key("_info");
83 const string biginfo_key("_biginfo");
84 const string epoch_key("_epoch");
85 const string fastinfo_key("_fastinfo");
86
87 template <class T>
88 static ostream& _prefix(std::ostream *_dout, T *t)
89 {
90 return t->gen_prefix(*_dout);
91 }
92
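// PGStateHistory records enter/exit timestamps for the peering state
// machine states so they can be dumped later (see dump() below).  enter()
// may run without the PG lock held, so the sample is parked in tmppi and
// only moved into the history buffer by exit(), which takes the lock if
// necessary.  Trimming states are deliberately ignored.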
93 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
94 {
95 // Ignore trimming state machine for now
96 if (::strstr(state, "Trimming") != NULL) {
97 return;
98 } else if (pi != nullptr) {
99 pi->enter_state(entime, state);
100 } else {
101 // Store current state since we can't reliably take the PG lock here
102 if ( tmppi == nullptr) {
103 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
104 }
105
106 thispg = pg;
107 tmppi->enter_state(entime, state);
108 }
109 }
110
111 void PGStateHistory::exit(const char* state) {
112 // Ignore trimming state machine for now
113 // Do nothing if PG is being destroyed!
114 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
115 return;
116 } else {
117 bool ilocked = false;
118 if(!thispg->is_locked()) {
119 thispg->lock();
120 ilocked = true;
121 }
122 if (pi == nullptr) {
123 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
124 pi = buffer.back().get();
125 pi->setepoch(thispg->get_osdmap_epoch());
126 }
127
128 pi->exit_state(ceph_clock_now());
129 if (::strcmp(state, "Reset") == 0) {
130 this->reset();
131 }
132 if(ilocked) {
133 thispg->unlock();
134 }
135 }
136 }
137
138 void PGStateHistory::dump(Formatter* f) const {
139 f->open_array_section("history");
140 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
141 f->open_object_section("states");
142 f->dump_stream("epoch") << (*pi)->this_epoch;
143 for (auto she : (*pi)->state_history) {
144 f->dump_string("state", std::get<2>(she));
145 f->dump_stream("enter") << std::get<0>(she);
146 f->dump_stream("exit") << std::get<1>(she);
147 }
148 f->close_section();
149 }
150 f->close_section();
151 }
152
153 void PG::get(const char* tag)
154 {
155 int after = ++ref;
156 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
157 << "tag " << (tag ? tag : "(none") << " "
158 << (after - 1) << " -> " << after << dendl;
159 #ifdef PG_DEBUG_REFS
160 std::lock_guard l(_ref_id_lock);
161 _tag_counts[tag]++;
162 #endif
163 }
164
165 void PG::put(const char* tag)
166 {
167 #ifdef PG_DEBUG_REFS
168 {
169 std::lock_guard l(_ref_id_lock);
170 auto tag_counts_entry = _tag_counts.find(tag);
171 ceph_assert(tag_counts_entry != _tag_counts.end());
172 --tag_counts_entry->second;
173 if (tag_counts_entry->second == 0) {
174 _tag_counts.erase(tag_counts_entry);
175 }
176 }
177 #endif
178 auto local_cct = cct;
179 int after = --ref;
180 lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
181 << "tag " << (tag ? tag : "(none") << " "
182 << (after + 1) << " -> " << after
183 << dendl;
184 if (after == 0)
185 delete this;
186 }
187
188 #ifdef PG_DEBUG_REFS
189 uint64_t PG::get_with_id()
190 {
191 ref++;
192 std::lock_guard l(_ref_id_lock);
193 uint64_t id = ++_ref_id;
194 BackTrace bt(0);
195 stringstream ss;
196 bt.print(ss);
197 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
198 << " got id " << id << " "
199 << (ref - 1) << " -> " << ref
200 << dendl;
201 ceph_assert(!_live_ids.count(id));
202 _live_ids.insert(make_pair(id, ss.str()));
203 return id;
204 }
205
206 void PG::put_with_id(uint64_t id)
207 {
208 int newref = --ref;
209 lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
210 << " put id " << id << " "
211 << (newref + 1) << " -> " << newref
212 << dendl;
213 {
214 std::lock_guard l(_ref_id_lock);
215 ceph_assert(_live_ids.count(id));
216 _live_ids.erase(id);
217 }
218   if (newref == 0)
219     delete this;
220 }
221
222 void PG::dump_live_ids()
223 {
224 std::lock_guard l(_ref_id_lock);
225 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
226 for (map<uint64_t, string>::iterator i = _live_ids.begin();
227 i != _live_ids.end();
228 ++i) {
229 dout(0) << "\t\tid: " << *i << dendl;
230 }
231 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
232 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
233 i != _tag_counts.end();
234 ++i) {
235     dout(0) << "\t\ttag: " << *i << dendl;
236 }
237 }
238 #endif
239
240
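// PGPool::update() refreshes the cached pool metadata from a newer OSDMap.
// On pre-mimic maps it also recomputes newly_removed_snaps (the snaps
// removed since the last map we processed) for legacy snap trimming; from
// mimic on that delta is carried in the OSDMap itself, so the cached sets
// are simply cleared.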
241 void PGPool::update(CephContext *cct, OSDMapRef map)
242 {
243 const pg_pool_t *pi = map->get_pg_pool(id);
244 if (!pi) {
245 return; // pool has been deleted
246 }
247 info = *pi;
248 name = map->get_pool_name(id);
249
250 bool updated = false;
251 if ((map->get_epoch() != cached_epoch + 1) ||
252 (pi->get_snap_epoch() == map->get_epoch())) {
253 updated = true;
254 }
255
256 if (map->require_osd_release >= CEPH_RELEASE_MIMIC) {
257 // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps
258 // in the pg_info_t, with deltas for both in each OSDMap. we don't
259 // need to (and can't) track it here.
260 cached_removed_snaps.clear();
261 newly_removed_snaps.clear();
262 } else {
263 // legacy (<= luminous) removed_snaps tracking
264 if (updated) {
265 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
266 pi->build_removed_snaps(newly_removed_snaps);
267 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
268 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
269 newly_removed_snaps.subtract(cached_removed_snaps);
270 cached_removed_snaps.swap(removed_snaps);
271 } else {
272 lgeneric_subdout(cct, osd, 0) << __func__
273 << " cached_removed_snaps shrank from " << cached_removed_snaps
274 << " to " << newly_removed_snaps << dendl;
275 cached_removed_snaps.swap(newly_removed_snaps);
276 newly_removed_snaps.clear();
277 }
278 } else {
279 newly_removed_snaps.clear();
280 }
281 } else {
282 /* 1) map->get_epoch() == cached_epoch + 1 &&
283 * 2) pi->get_snap_epoch() != map->get_epoch()
284 *
285 * From the if branch, 1 && 2 must be true. From 2, we know that
286 * this map didn't change the set of removed snaps. From 1, we
287 * know that our cached_removed_snaps matches the previous map.
288 * Thus, from 1 && 2, cached_removed snaps matches the current
289 * set of removed snaps and all we have to do is clear
290 * newly_removed_snaps.
291 */
292 newly_removed_snaps.clear();
293 }
294 lgeneric_subdout(cct, osd, 20)
295 << "PGPool::update cached_removed_snaps "
296 << cached_removed_snaps
297 << " newly_removed_snaps "
298 << newly_removed_snaps
299 << " snapc " << snapc
300 << (updated ? " (updated)":" (no change)")
301 << dendl;
302 if (cct->_conf->osd_debug_verify_cached_snaps) {
303 interval_set<snapid_t> actual_removed_snaps;
304 pi->build_removed_snaps(actual_removed_snaps);
305 if (!(actual_removed_snaps == cached_removed_snaps)) {
306 lgeneric_derr(cct) << __func__
307 << ": mismatch between the actual removed snaps "
308 << actual_removed_snaps
309 << " and pool.cached_removed_snaps "
310 << " pool.cached_removed_snaps " << cached_removed_snaps
311 << dendl;
312 }
313 ceph_assert(actual_removed_snaps == cached_removed_snaps);
314 }
315 }
316 if (info.is_pool_snaps_mode() && updated) {
317 snapc = pi->get_snap_context();
318 }
319 cached_epoch = map->get_epoch();
320 }
321
322 PG::PG(OSDService *o, OSDMapRef curmap,
323 const PGPool &_pool, spg_t p) :
324 pg_id(p),
325 coll(p),
326 osd(o),
327 cct(o->cct),
328 osdmap_ref(curmap),
329 pool(_pool),
330 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
331 snap_mapper(
332 cct,
333 &osdriver,
334 p.ps(),
335 p.get_split_bits(_pool.info.get_pg_num()),
336 _pool.id,
337 p.shard),
338 last_persisted_osdmap(curmap->get_epoch()),
339 deleting(false),
340 trace_endpoint("0.0.0.0", 0, "PG"),
341 dirty_info(false), dirty_big_info(false),
342 info(p),
343 info_struct_v(0),
344 pg_log(cct),
345 pgmeta_oid(p.make_pgmeta_oid()),
346 missing_loc(this),
347 stat_queue_item(this),
348 scrub_queued(false),
349 recovery_queued(false),
350 recovery_ops_active(0),
351 role(-1),
352 state(0),
353 send_notify(false),
354 pg_whoami(osd->whoami, p.shard),
355 need_up_thru(false),
356 last_peering_reset(0),
357 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
358 backfill_reserved(false),
359 backfill_reserving(false),
360 flushes_in_progress(0),
361 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
362 pg_stats_publish_valid(false),
363 finish_sync_event(NULL),
364 backoff_lock("PG::backoff_lock"),
365 scrub_after_recovery(false),
366 active_pushes(0),
367 recovery_state(this),
368 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
369 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
370 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
371 last_epoch(0),
372 last_require_osd_release(curmap->require_osd_release)
373 {
374 #ifdef PG_DEBUG_REFS
375 osd->add_pgid(p, this);
376 #endif
377 #ifdef WITH_BLKIN
378 std::stringstream ss;
379 ss << "PG " << info.pgid;
380 trace_endpoint.copy_name(ss.str());
381 #endif
382 }
383
384 PG::~PG()
385 {
386 pgstate_history.set_pg_in_destructor();
387 #ifdef PG_DEBUG_REFS
388 osd->remove_pgid(info.pgid, this);
389 #endif
390 }
391
392 void PG::lock(bool no_lockdep) const
393 {
394 _lock.Lock(no_lockdep);
395 // if we have unrecorded dirty state with the lock dropped, there is a bug
396 ceph_assert(!dirty_info);
397 ceph_assert(!dirty_big_info);
398
399 dout(30) << "lock" << dendl;
400 }
401
402 std::ostream& PG::gen_prefix(std::ostream& out) const
403 {
404 OSDMapRef mapref = osdmap_ref;
405 if (_lock.is_locked_by_me()) {
406 out << "osd." << osd->whoami
407 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
408 << " " << *this << " ";
409 } else {
410 out << "osd." << osd->whoami
411 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
412 << " pg[" << info.pgid << "(unlocked)] ";
413 }
414 return out;
415 }
416
417 /********* PG **********/
418
419 void PG::proc_master_log(
420 ObjectStore::Transaction& t, pg_info_t &oinfo,
421 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
422 {
423 dout(10) << "proc_master_log for osd." << from << ": "
424 << olog << " " << omissing << dendl;
425 ceph_assert(!is_peered() && is_primary());
426
427 // merge log into our own log to build master log. no need to
428 // make any adjustments to their missing map; we are taking their
429   // log to be authoritative (i.e., their entries are by definition
430 // non-divergent).
431 merge_log(t, oinfo, olog, from);
432 peer_info[from] = oinfo;
433 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
434 might_have_unfound.insert(from);
435
436 // See doc/dev/osd_internals/last_epoch_started
437 if (oinfo.last_epoch_started > info.last_epoch_started) {
438 info.last_epoch_started = oinfo.last_epoch_started;
439 dirty_info = true;
440 }
441 if (oinfo.last_interval_started > info.last_interval_started) {
442 info.last_interval_started = oinfo.last_interval_started;
443 dirty_info = true;
444 }
445 update_history(oinfo.history);
446 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
447 info.last_epoch_started >= info.history.last_epoch_started);
448
449 peer_missing[from].claim(omissing);
450 }
451
452 void PG::proc_replica_log(
453 pg_info_t &oinfo,
454 const pg_log_t &olog,
455 pg_missing_t& omissing,
456 pg_shard_t from)
457 {
458 dout(10) << "proc_replica_log for osd." << from << ": "
459 << oinfo << " " << olog << " " << omissing << dendl;
460
461 pg_log.proc_replica_log(oinfo, olog, omissing, from);
462
463 peer_info[from] = oinfo;
464 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
465 might_have_unfound.insert(from);
466
467 for (map<hobject_t, pg_missing_item>::const_iterator i =
468 omissing.get_items().begin();
469 i != omissing.get_items().end();
470 ++i) {
471 dout(20) << " after missing " << i->first << " need " << i->second.need
472 << " have " << i->second.have << dendl;
473 }
474 peer_missing[from].claim(omissing);
475 }
476
477 bool PG::proc_replica_info(
478 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
479 {
480 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
481 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
482 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
483 return false;
484 }
485
486 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
487 dout(10) << " got info " << oinfo << " from down osd." << from
488 << " discarding" << dendl;
489 return false;
490 }
491
492 dout(10) << " got osd." << from << " " << oinfo << dendl;
493 ceph_assert(is_primary());
494 peer_info[from] = oinfo;
495 might_have_unfound.insert(from);
496
497 update_history(oinfo.history);
498
499 // stray?
500 if (!is_up(from) && !is_acting(from)) {
501 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
502 stray_set.insert(from);
503 if (is_clean()) {
504 purge_strays();
505 }
506 }
507
508 // was this a new info? if so, update peers!
509 if (p == peer_info.end())
510 update_heartbeat_peers();
511
512 return true;
513 }
514
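// The helpers below keep the SnapMapper's object -> snaps index (stored
// via osdriver in the snapmapper meta object; see the PG constructor)
// consistent with object removals and snap-set updates, within the same
// transaction as the object change itself.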
515 void PG::remove_snap_mapped_object(
516 ObjectStore::Transaction &t, const hobject_t &soid)
517 {
518 t.remove(
519 coll,
520 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
521 clear_object_snap_mapping(&t, soid);
522 }
523
524 void PG::clear_object_snap_mapping(
525 ObjectStore::Transaction *t, const hobject_t &soid)
526 {
527 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
528 if (soid.snap < CEPH_MAXSNAP) {
529 int r = snap_mapper.remove_oid(
530 soid,
531 &_t);
532 if (!(r == 0 || r == -ENOENT)) {
533 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
534 ceph_abort();
535 }
536 }
537 }
538
539 void PG::update_object_snap_mapping(
540 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
541 {
542 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
543 ceph_assert(soid.snap < CEPH_MAXSNAP);
544 int r = snap_mapper.remove_oid(
545 soid,
546 &_t);
547 if (!(r == 0 || r == -ENOENT)) {
548 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
549 ceph_abort();
550 }
551 snap_mapper.add_oid(
552 soid,
553 snaps,
554 &_t);
555 }
556
557 void PG::merge_log(
558 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
559 {
560 PGLogEntryHandler rollbacker{this, &t};
561 pg_log.merge_log(
562 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
563 }
564
565 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
566 {
567 PGLogEntryHandler rollbacker{this, &t};
568 pg_log.rewind_divergent_log(
569 newhead, info, &rollbacker, dirty_info, dirty_big_info);
570 }
571
572 /*
573 * Process information from a replica to determine if it could have any
574 * objects that i need.
575 *
576 * TODO: if the missing set becomes very large, this could get expensive.
577 * Instead, we probably want to just iterate over our unfound set.
578 */
579 bool PG::search_for_missing(
580 const pg_info_t &oinfo, const pg_missing_t &omissing,
581 pg_shard_t from,
582 RecoveryCtx *ctx)
583 {
584 uint64_t num_unfound_before = missing_loc.num_unfound();
585 bool found_missing = missing_loc.add_source_info(
586 from, oinfo, omissing, ctx->handle);
587 if (found_missing && num_unfound_before != missing_loc.num_unfound())
588 publish_stats_to_osd();
589   // avoid doing this if the peer is empty.  This is a bit of paranoia
590 // to avoid doing something rash if add_source_info() above
591 // incorrectly decided we found something new. (if the peer has
592 // last_update=0'0 that's impossible.)
593 if (found_missing &&
594 oinfo.last_update != eversion_t()) {
595 pg_info_t tinfo(oinfo);
596 tinfo.pgid.shard = pg_whoami.shard;
597 (*(ctx->info_map))[from.osd].push_back(
598 make_pair(
599 pg_notify_t(
600 from.shard, pg_whoami.shard,
601 get_osdmap_epoch(),
602 get_osdmap_epoch(),
603 tinfo),
604 past_intervals));
605 }
606 return found_missing;
607 }
608
609
610 // MissingLoc
611
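// MissingLoc tracks, for each object the PG still needs to recover, the
// set of shards known to hold a usable copy (missing_loc), plus the union
// of peers those locations came from (missing_loc_sources).  Every change
// to a location set is bracketed by _dec_count()/_inc_count() so the
// summary counts they maintain stay consistent.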
612 bool PG::MissingLoc::readable_with_acting(
613 const hobject_t &hoid,
614 const set<pg_shard_t> &acting) const {
615 if (!needs_recovery(hoid))
616 return true;
617 if (is_deleted(hoid))
618 return false;
619 auto missing_loc_entry = missing_loc.find(hoid);
620 if (missing_loc_entry == missing_loc.end())
621 return false;
622 const set<pg_shard_t> &locs = missing_loc_entry->second;
623 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
624 set<pg_shard_t> have_acting;
625 for (set<pg_shard_t>::const_iterator i = locs.begin();
626 i != locs.end();
627 ++i) {
628 if (acting.count(*i))
629 have_acting.insert(*i);
630 }
631 return (*is_readable)(have_acting);
632 }
633
634 void PG::MissingLoc::add_batch_sources_info(
635 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
636 {
637 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
638 << sources.size() << dendl;
639 unsigned loop = 0;
640 bool sources_updated = false;
641 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
642 i != needs_recovery_map.end();
643 ++i) {
644 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
645 handle->reset_tp_timeout();
646 loop = 0;
647 }
648 if (i->second.is_delete())
649 continue;
650
651 auto p = missing_loc.find(i->first);
652 if (p == missing_loc.end()) {
653 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
654 } else {
655 _dec_count(p->second);
656 }
657 missing_loc[i->first].insert(sources.begin(), sources.end());
658 _inc_count(p->second);
659
660 if (!sources_updated) {
661 missing_loc_sources.insert(sources.begin(), sources.end());
662 sources_updated = true;
663 }
664 }
665 }
666
667 bool PG::MissingLoc::add_source_info(
668 pg_shard_t fromosd,
669 const pg_info_t &oinfo,
670 const pg_missing_t &omissing,
671 ThreadPool::TPHandle* handle)
672 {
673 bool found_missing = false;
674 unsigned loop = 0;
675 bool sources_updated = false;
676 // found items?
677 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
678 p != needs_recovery_map.end();
679 ++p) {
680 const hobject_t &soid(p->first);
681 eversion_t need = p->second.need;
682 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
683 handle->reset_tp_timeout();
684 loop = 0;
685 }
686 if (p->second.is_delete()) {
687 ldout(pg->cct, 10) << __func__ << " " << soid
688 << " delete, ignoring source" << dendl;
689 continue;
690 }
691 if (oinfo.last_update < need) {
692 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
693 << " also missing on osd." << fromosd
694 << " (last_update " << oinfo.last_update
695 << " < needed " << need << ")" << dendl;
696 continue;
697 }
698 if (!oinfo.last_backfill.is_max() &&
699 !oinfo.last_backfill_bitwise) {
700 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
701 << " also missing on osd." << fromosd
702 << " (last_backfill " << oinfo.last_backfill
703 << " but with wrong sort order)"
704 << dendl;
705 continue;
706 }
707 if (p->first >= oinfo.last_backfill) {
708 // FIXME: this is _probably_ true, although it could conceivably
709 // be in the undefined region! Hmm!
710 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
711 << " also missing on osd." << fromosd
712 << " (past last_backfill " << oinfo.last_backfill
713 << ")" << dendl;
714 continue;
715 }
716 if (omissing.is_missing(soid)) {
717 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
718 << " also missing on osd." << fromosd << dendl;
719 continue;
720 }
721
722 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
723 << " is on osd." << fromosd << dendl;
724
725 {
726 auto p = missing_loc.find(soid);
727 if (p == missing_loc.end()) {
728 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
729 } else {
730 _dec_count(p->second);
731 }
732 p->second.insert(fromosd);
733 _inc_count(p->second);
734 }
735
736 if (!sources_updated) {
737 missing_loc_sources.insert(fromosd);
738 sources_updated = true;
739 }
740 found_missing = true;
741 }
742
743 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
744 << dendl;
745 return found_missing;
746 }
747
748 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
749 {
750 set<pg_shard_t> now_down;
751 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
752 p != missing_loc_sources.end();
753 ) {
754 if (osdmap->is_up(p->osd)) {
755 ++p;
756 continue;
757 }
758 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
759 now_down.insert(*p);
760 missing_loc_sources.erase(p++);
761 }
762
763 if (now_down.empty()) {
764 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
765 } else {
766     ldout(pg->cct, 10) << __func__ << " source osds " << now_down << " now down, remaining sources are "
767 << missing_loc_sources << dendl;
768
769 // filter missing_loc
770 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
771 while (p != missing_loc.end()) {
772 set<pg_shard_t>::iterator q = p->second.begin();
773 bool changed = false;
774 while (q != p->second.end()) {
775 if (now_down.count(*q)) {
776 if (!changed) {
777 changed = true;
778 _dec_count(p->second);
779 }
780 p->second.erase(q++);
781 } else {
782 ++q;
783 }
784 }
785 if (p->second.empty()) {
786 missing_loc.erase(p++);
787 } else {
788 if (changed) {
789 _inc_count(p->second);
790 }
791 ++p;
792 }
793 }
794 }
795 }
796
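// discover_all_missing(): for each peer that might hold an unfound object
// (might_have_unfound) and that we have not queried yet, queue a FULLLOG
// pg_query so its missing set can be folded into our view of where the
// unfound objects can be recovered from.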
797 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
798 {
799 auto &missing = pg_log.get_missing();
800 uint64_t unfound = get_num_unfound();
801
802 dout(10) << __func__ << " "
803 << missing.num_missing() << " missing, "
804 << unfound << " unfound"
805 << dendl;
806
807 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
808 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
809 for (; m != mend; ++m) {
810 pg_shard_t peer(*m);
811
812 if (!get_osdmap()->is_up(peer.osd)) {
813 dout(20) << __func__ << " skipping down osd." << peer << dendl;
814 continue;
815 }
816
817 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
818 if (iter != peer_info.end() &&
819 (iter->second.is_empty() || iter->second.dne())) {
820 // ignore empty peers
821 continue;
822 }
823
824 // If we've requested any of this stuff, the pg_missing_t information
825 // should be on its way.
826     // TODO: coalesce requested_* into a single data structure
827 if (peer_missing.find(peer) != peer_missing.end()) {
828 dout(20) << __func__ << ": osd." << peer
829 << ": we already have pg_missing_t" << dendl;
830 continue;
831 }
832 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
833 dout(20) << __func__ << ": osd." << peer
834 << ": in peer_log_requested" << dendl;
835 continue;
836 }
837 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
838 dout(20) << __func__ << ": osd." << peer
839 << ": in peer_missing_requested" << dendl;
840 continue;
841 }
842
843 // Request missing
844 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
845 << dendl;
846 peer_missing_requested.insert(peer);
847 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
848 pg_query_t(
849 pg_query_t::FULLLOG,
850 peer.shard, pg_whoami.shard,
851 info.history, get_osdmap_epoch());
852 }
853 }
854
855 /******* PG ***********/
856 bool PG::needs_recovery() const
857 {
858 ceph_assert(is_primary());
859
860 auto &missing = pg_log.get_missing();
861
862 if (missing.num_missing()) {
863 dout(10) << __func__ << " primary has " << missing.num_missing()
864 << " missing" << dendl;
865 return true;
866 }
867
868 ceph_assert(!acting_recovery_backfill.empty());
869 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
870 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
871 for (; a != end; ++a) {
872 if (*a == get_primary()) continue;
873 pg_shard_t peer = *a;
874 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
875 if (pm == peer_missing.end()) {
876 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
877 << dendl;
878 continue;
879 }
880 if (pm->second.num_missing()) {
881 dout(10) << __func__ << " osd." << peer << " has "
882 << pm->second.num_missing() << " missing" << dendl;
883 return true;
884 }
885 }
886
887 dout(10) << __func__ << " is recovered" << dendl;
888 return false;
889 }
890
891 bool PG::needs_backfill() const
892 {
893 ceph_assert(is_primary());
894
895   // We can assume that the only OSDs that could need backfill
896   // are the ones listed in backfill_targets.
897 set<pg_shard_t>::const_iterator end = backfill_targets.end();
898 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
899 for (; a != end; ++a) {
900 pg_shard_t peer = *a;
901 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
902 if (!pi->second.last_backfill.is_max()) {
903 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
904 return true;
905 }
906 }
907
908 dout(10) << __func__ << " does not need backfill" << dendl;
909 return false;
910 }
911
912
913 void PG::check_past_interval_bounds() const
914 {
915 auto rpib = get_required_past_interval_bounds(
916 info,
917 osd->get_superblock().oldest_map);
918 if (rpib.first >= rpib.second) {
919 if (!past_intervals.empty()) {
920 osd->clog->error() << info.pgid << " required past_interval bounds are"
921 << " empty [" << rpib << ") but past_intervals is not: "
922 << past_intervals;
923 derr << info.pgid << " required past_interval bounds are"
924 << " empty [" << rpib << ") but past_intervals is not: "
925 << past_intervals << dendl;
926 }
927 } else {
928 if (past_intervals.empty()) {
929 osd->clog->error() << info.pgid << " required past_interval bounds are"
930 << " not empty [" << rpib << ") but past_intervals "
931 << past_intervals << " is empty";
932 derr << info.pgid << " required past_interval bounds are"
933 << " not empty [" << rpib << ") but past_intervals "
934 << past_intervals << " is empty" << dendl;
935 ceph_assert(!past_intervals.empty());
936 }
937
938 auto apib = past_intervals.get_bounds();
939 if (apib.first > rpib.first) {
940 osd->clog->error() << info.pgid << " past_intervals [" << apib
941 << ") start interval does not contain the required"
942 << " bound [" << rpib << ") start";
943 derr << info.pgid << " past_intervals [" << apib
944 << ") start interval does not contain the required"
945 << " bound [" << rpib << ") start" << dendl;
946 ceph_abort_msg("past_interval start interval mismatch");
947 }
948 if (apib.second != rpib.second) {
949       osd->clog->error() << info.pgid << " past_interval bound [" << apib
950 << ") end does not match required [" << rpib
951 << ") end";
952       derr << info.pgid << " past_interval bound [" << apib
953 << ") end does not match required [" << rpib
954 << ") end" << dendl;
955 ceph_abort_msg("past_interval end mismatch");
956 }
957 }
958 }
959
960 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
961 {
962 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
963 if (need_up_thru &&
964 up_thru >= info.history.same_interval_since) {
965 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
966 need_up_thru = false;
967 return true;
968 }
969 return false;
970 }
971
972 void PG::remove_down_peer_info(const OSDMapRef osdmap)
973 {
974 // Remove any downed osds from peer_info
975 bool removed = false;
976 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
977 while (p != peer_info.end()) {
978 if (!osdmap->is_up(p->first.osd)) {
979 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
980 peer_missing.erase(p->first);
981 peer_log_requested.erase(p->first);
982 peer_missing_requested.erase(p->first);
983 peer_info.erase(p++);
984 removed = true;
985 } else
986 ++p;
987 }
988
989 // if we removed anyone, update peers (which include peer_info)
990 if (removed)
991 update_heartbeat_peers();
992 check_recovery_sources(osdmap);
993 }
994
995 /*
996  * Returns true once every OSD in might_have_unfound has been queried, is empty/gone, or is marked lost.
997 */
998 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
999 {
1000 ceph_assert(is_primary());
1001
1002 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1003 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1004 for (; peer != mend; ++peer) {
1005 if (peer_missing.count(*peer))
1006 continue;
1007 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1008 if (iter != peer_info.end() &&
1009 (iter->second.is_empty() || iter->second.dne()))
1010 continue;
1011 if (!osdmap->exists(peer->osd))
1012 continue;
1013 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1014 if (osd_info.lost_at <= osd_info.up_from) {
1015 // If there is even one OSD in might_have_unfound that isn't lost, we
1016 // still might retrieve our unfound.
1017 return false;
1018 }
1019 }
1020 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
1021 << " have been queried or are marked lost" << dendl;
1022 return true;
1023 }
1024
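// build_prior(): derive from past_intervals the set of OSDs we must probe
// (or wait for) before peering can settle on an authoritative log, set
// PG_STATE_DOWN if the prior set reports pg_down, and note whether the
// monitor must record up_thru for us before we proceed.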
1025 PastIntervals::PriorSet PG::build_prior()
1026 {
1027 if (1) {
1028 // sanity check
1029 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1030 it != peer_info.end();
1031 ++it) {
1032 ceph_assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
1033 }
1034 }
1035
1036 const OSDMap &osdmap = *get_osdmap();
1037 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1038 pool.info.is_erasure(),
1039 info.history.last_epoch_started,
1040 get_pgbackend()->get_is_recoverable_predicate(),
1041 [&](epoch_t start, int osd, epoch_t *lost_at) {
1042 const osd_info_t *pinfo = 0;
1043 if (osdmap.exists(osd)) {
1044 pinfo = &osdmap.get_info(osd);
1045 if (lost_at)
1046 *lost_at = pinfo->lost_at;
1047 }
1048
1049 if (osdmap.is_up(osd)) {
1050 return PastIntervals::UP;
1051 } else if (!pinfo) {
1052 return PastIntervals::DNE;
1053 } else if (pinfo->lost_at > start) {
1054 return PastIntervals::LOST;
1055 } else {
1056 return PastIntervals::DOWN;
1057 }
1058 },
1059 up,
1060 acting,
1061 this);
1062
1063 if (prior.pg_down) {
1064 state_set(PG_STATE_DOWN);
1065 }
1066
1067 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1068 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1069 << " < same_since " << info.history.same_interval_since
1070 << ", must notify monitor" << dendl;
1071 need_up_thru = true;
1072 } else {
1073 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1074 << " >= same_since " << info.history.same_interval_since
1075 << ", all is well" << dendl;
1076 need_up_thru = false;
1077 }
1078 set_probe_targets(prior.probe);
1079 return prior;
1080 }
1081
1082 void PG::clear_primary_state()
1083 {
1084 dout(10) << "clear_primary_state" << dendl;
1085
1086 // clear peering state
1087 stray_set.clear();
1088 peer_log_requested.clear();
1089 peer_missing_requested.clear();
1090 peer_info.clear();
1091 peer_bytes.clear();
1092 peer_missing.clear();
1093 need_up_thru = false;
1094 peer_last_complete_ondisk.clear();
1095 peer_activated.clear();
1096 min_last_complete_ondisk = eversion_t();
1097 pg_trim_to = eversion_t();
1098 might_have_unfound.clear();
1099 projected_log = PGLog::IndexedLog();
1100
1101 last_update_ondisk = eversion_t();
1102
1103 snap_trimq.clear();
1104
1105 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1106
1107 missing_loc.clear();
1108
1109 release_pg_backoffs();
1110
1111 pg_log.reset_recovery_pointers();
1112
1113 scrubber.reserved_peers.clear();
1114 scrub_after_recovery = false;
1115
1116 agent_clear();
1117 }
1118
1119 PG::Scrubber::Scrubber()
1120 : reserved(false), reserve_failed(false),
1121 epoch_start(0),
1122 active(false),
1123 shallow_errors(0), deep_errors(0), fixed(0),
1124 must_scrub(false), must_deep_scrub(false), must_repair(false),
1125 auto_repair(false),
1126 check_repair(false),
1127 deep_scrub_on_error(false),
1128 num_digest_updates_pending(0),
1129 state(INACTIVE),
1130 deep(false)
1131 {}
1132
1133 PG::Scrubber::~Scrubber() {}
1134
1135 /**
1136 * find_best_info
1137 *
1138 * Returns an iterator to the best info in infos sorted by:
1139  *  1) Prefer newer last_update (older last_update for ec/rollback pools)
1140 * 2) Prefer longer tail if it brings another info into contiguity
1141 * 3) Prefer current primary
1142 */
1143 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1144 const map<pg_shard_t, pg_info_t> &infos,
1145 bool restrict_to_up_acting,
1146 bool *history_les_bound) const
1147 {
1148 ceph_assert(history_les_bound);
1149 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1150 * to make changes to this process. Also, make sure to update it
1151 * when you find bugs! */
1152 eversion_t min_last_update_acceptable = eversion_t::max();
1153 epoch_t max_last_epoch_started_found = 0;
1154 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1155 i != infos.end();
1156 ++i) {
1157 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1158 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1159 *history_les_bound = true;
1160 max_last_epoch_started_found = i->second.history.last_epoch_started;
1161 }
1162 if (!i->second.is_incomplete() &&
1163 max_last_epoch_started_found < i->second.last_epoch_started) {
1164 *history_les_bound = false;
1165 max_last_epoch_started_found = i->second.last_epoch_started;
1166 }
1167 }
1168 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1169 i != infos.end();
1170 ++i) {
1171 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1172 if (min_last_update_acceptable > i->second.last_update)
1173 min_last_update_acceptable = i->second.last_update;
1174 }
1175 }
1176 if (min_last_update_acceptable == eversion_t::max())
1177 return infos.end();
1178
1179 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1180 // find osd with newest last_update (oldest for ec_pool).
1181 // if there are multiples, prefer
1182 // - a longer tail, if it brings another peer into log contiguity
1183 // - the current primary
1184 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1185 p != infos.end();
1186 ++p) {
1187 if (restrict_to_up_acting && !is_up(p->first) &&
1188 !is_acting(p->first))
1189 continue;
1190 // Only consider peers with last_update >= min_last_update_acceptable
1191 if (p->second.last_update < min_last_update_acceptable)
1192 continue;
1193 // Disqualify anyone with a too old last_epoch_started
1194 if (p->second.last_epoch_started < max_last_epoch_started_found)
1195 continue;
1196 // Disqualify anyone who is incomplete (not fully backfilled)
1197 if (p->second.is_incomplete())
1198 continue;
1199 if (best == infos.end()) {
1200 best = p;
1201 continue;
1202 }
1203 // Prefer newer last_update
1204 if (pool.info.require_rollback()) {
1205 if (p->second.last_update > best->second.last_update)
1206 continue;
1207 if (p->second.last_update < best->second.last_update) {
1208 best = p;
1209 continue;
1210 }
1211 } else {
1212 if (p->second.last_update < best->second.last_update)
1213 continue;
1214 if (p->second.last_update > best->second.last_update) {
1215 best = p;
1216 continue;
1217 }
1218 }
1219
1220 // Prefer longer tail
1221 if (p->second.log_tail > best->second.log_tail) {
1222 continue;
1223 } else if (p->second.log_tail < best->second.log_tail) {
1224 best = p;
1225 continue;
1226 }
1227
1228 if (!p->second.has_missing() && best->second.has_missing()) {
1229 dout(10) << __func__ << " prefer osd." << p->first
1230 << " because it is complete while best has missing"
1231 << dendl;
1232 best = p;
1233 continue;
1234 } else if (p->second.has_missing() && !best->second.has_missing()) {
1235 dout(10) << __func__ << " skipping osd." << p->first
1236 << " because it has missing while best is complete"
1237 << dendl;
1238 continue;
1239 } else {
1240 // both are complete or have missing
1241 // fall through
1242 }
1243
1244 // prefer current primary (usually the caller), all things being equal
1245 if (p->first == pg_whoami) {
1246 dout(10) << "calc_acting prefer osd." << p->first
1247 << " because it is current primary" << dendl;
1248 best = p;
1249 continue;
1250 }
1251 }
1252 return best;
1253 }
1254
1255 void PG::calc_ec_acting(
1256 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1257 unsigned size,
1258 const vector<int> &acting,
1259 const vector<int> &up,
1260 const map<pg_shard_t, pg_info_t> &all_info,
1261 bool restrict_to_up_acting,
1262 vector<int> *_want,
1263 set<pg_shard_t> *backfill,
1264 set<pg_shard_t> *acting_backfill,
1265 ostream &ss)
1266 {
1267 vector<int> want(size, CRUSH_ITEM_NONE);
1268 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1269 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1270 i != all_info.end();
1271 ++i) {
1272 all_info_by_shard[i->first.shard].insert(i->first);
1273 }
1274 for (uint8_t i = 0; i < want.size(); ++i) {
1275 ss << "For position " << (unsigned)i << ": ";
1276 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1277 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1278 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1279 auth_log_shard->second.log_tail) {
1280 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1281 want[i] = up[i];
1282 continue;
1283 }
1284 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1285 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1286 << " and ";
1287 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1288 }
1289
1290 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1291 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1292 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1293 auth_log_shard->second.log_tail) {
1294 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1295 want[i] = acting[i];
1296 } else if (!restrict_to_up_acting) {
1297 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1298 j != all_info_by_shard[shard_id_t(i)].end();
1299 ++j) {
1300 ceph_assert(j->shard == i);
1301 if (!all_info.find(*j)->second.is_incomplete() &&
1302 all_info.find(*j)->second.last_update >=
1303 auth_log_shard->second.log_tail) {
1304 ss << " selecting stray: " << *j << std::endl;
1305 want[i] = j->osd;
1306 break;
1307 }
1308 }
1309 if (want[i] == CRUSH_ITEM_NONE)
1310 ss << " failed to fill position " << (int)i << std::endl;
1311 }
1312 }
1313
1314 for (uint8_t i = 0; i < want.size(); ++i) {
1315 if (want[i] != CRUSH_ITEM_NONE) {
1316 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1317 }
1318 }
1319 acting_backfill->insert(backfill->begin(), backfill->end());
1320 _want->swap(want);
1321 }
1322
1323 /**
1324 * calculate the desired acting set.
1325 *
1326 * Choose an appropriate acting set. Prefer up[0], unless it is
1327 * incomplete, or another osd has a longer tail that allows us to
1328 * bring other up nodes up to date.
1329 */
1330 void PG::calc_replicated_acting(
1331 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1332 uint64_t force_auth_primary_missing_objects,
1333 unsigned size,
1334 const vector<int> &acting,
1335 const vector<int> &up,
1336 pg_shard_t up_primary,
1337 const map<pg_shard_t, pg_info_t> &all_info,
1338 bool restrict_to_up_acting,
1339 vector<int> *want,
1340 set<pg_shard_t> *backfill,
1341 set<pg_shard_t> *acting_backfill,
1342 const OSDMapRef osdmap,
1343 ostream &ss)
1344 {
1345 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1346
1347 ss << __func__ << " newest update on osd." << auth_log_shard_id
1348 << " with " << auth_log_shard->second
1349 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1350
1351 // select primary
1352 auto primary = all_info.find(up_primary);
1353 if (up.size() &&
1354 !primary->second.is_incomplete() &&
1355 primary->second.last_update >=
1356 auth_log_shard->second.log_tail) {
1357 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1358 auto approx_missing_objects =
1359 primary->second.stats.stats.sum.num_objects_missing;
1360 auto auth_version = auth_log_shard->second.last_update.version;
1361 auto primary_version = primary->second.last_update.version;
1362 if (auth_version > primary_version) {
1363 approx_missing_objects += auth_version - primary_version;
1364 } else {
1365 approx_missing_objects += primary_version - auth_version;
1366 }
1367 if ((uint64_t)approx_missing_objects >
1368 force_auth_primary_missing_objects) {
1369 primary = auth_log_shard;
1370 ss << "up_primary: " << up_primary << ") has approximate "
1371 << approx_missing_objects
1372 << "(>" << force_auth_primary_missing_objects <<") "
1373 << "missing objects, osd." << auth_log_shard_id
1374 << " selected as primary instead"
1375 << std::endl;
1376 } else {
1377         ss << "up_primary: " << up_primary << " selected as primary"
1378 << std::endl;
1379 }
1380 } else {
1381       ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
1382 }
1383 } else {
1384 ceph_assert(!auth_log_shard->second.is_incomplete());
1385 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1386 << " selected as primary instead" << std::endl;
1387 primary = auth_log_shard;
1388 }
1389
1390 ss << __func__ << " primary is osd." << primary->first
1391 << " with " << primary->second << std::endl;
1392 want->push_back(primary->first.osd);
1393 acting_backfill->insert(primary->first);
1394
1395 /* We include auth_log_shard->second.log_tail because in GetLog,
1396 * we will request logs back to the min last_update over our
1397 * acting_backfill set, which will result in our log being extended
1398 * as far backwards as necessary to pick up any peers which can
1399 * be log recovered by auth_log_shard's log */
1400 eversion_t oldest_auth_log_entry =
1401 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1402
1403 // select replicas that have log contiguity with primary.
1404 // prefer up, then acting, then any peer_info osds
1405 for (auto i : up) {
1406 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1407 if (up_cand == primary->first)
1408 continue;
1409 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1410 if (cur_info.is_incomplete() ||
1411 cur_info.last_update < oldest_auth_log_entry) {
1412 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1413 backfill->insert(up_cand);
1414 acting_backfill->insert(up_cand);
1415 } else {
1416 want->push_back(i);
1417 acting_backfill->insert(up_cand);
1418 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1419 }
1420 if (want->size() >= size) {
1421 break;
1422 }
1423 }
1424
1425 if (want->size() >= size) {
1426 return;
1427 }
1428
1429 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1430 candidate_by_last_update.reserve(acting.size());
1431 // This no longer has backfill OSDs, but they are covered above.
1432 for (auto i : acting) {
1433 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1434 // skip up osds we already considered above
1435 if (acting_cand == primary->first)
1436 continue;
1437 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1438 if (up_it != up.end())
1439 continue;
1440
1441 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1442 if (cur_info.is_incomplete() ||
1443 cur_info.last_update < oldest_auth_log_entry) {
1444 ss << " shard " << acting_cand << " (acting) REJECTED "
1445 << cur_info << std::endl;
1446 } else {
1447 candidate_by_last_update.push_back(make_pair(cur_info.last_update, i));
1448 }
1449 }
1450
1451 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1452 const std::pair<eversion_t, int> &rhs) {
1453 return lhs.first > rhs.first;
1454 };
1455 // sort by last_update, in descending order.
1456 std::sort(candidate_by_last_update.begin(),
1457 candidate_by_last_update.end(), sort_by_eversion);
1458 for (auto &p: candidate_by_last_update) {
1459 ceph_assert(want->size() < size);
1460 want->push_back(p.second);
1461 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1462 acting_backfill->insert(s);
1463 ss << " shard " << s << " (acting) accepted "
1464 << all_info.find(s)->second << std::endl;
1465 if (want->size() >= size) {
1466 return;
1467 }
1468 }
1469
1470 if (restrict_to_up_acting) {
1471 return;
1472 }
1473 candidate_by_last_update.clear();
1474 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1475 // continue to search stray to find more suitable peers
1476 for (auto &i : all_info) {
1477 // skip up osds we already considered above
1478 if (i.first == primary->first)
1479 continue;
1480 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1481 if (up_it != up.end())
1482 continue;
1483 vector<int>::const_iterator acting_it = find(
1484 acting.begin(), acting.end(), i.first.osd);
1485 if (acting_it != acting.end())
1486 continue;
1487
1488 if (i.second.is_incomplete() ||
1489 i.second.last_update < oldest_auth_log_entry) {
1490 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1491 << std::endl;
1492 } else {
1493 candidate_by_last_update.push_back(
1494 make_pair(i.second.last_update, i.first.osd));
1495 }
1496 }
1497
1498 if (candidate_by_last_update.empty()) {
1499 // save us some effort
1500 return;
1501 }
1502
1503 // sort by last_update, in descending order.
1504 std::sort(candidate_by_last_update.begin(),
1505 candidate_by_last_update.end(), sort_by_eversion);
1506
1507 for (auto &p: candidate_by_last_update) {
1508 ceph_assert(want->size() < size);
1509 want->push_back(p.second);
1510 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1511 acting_backfill->insert(s);
1512 ss << " shard " << s << " (stray) accepted "
1513 << all_info.find(s)->second << std::endl;
1514 if (want->size() >= size) {
1515 return;
1516 }
1517 }
1518 }
1519
1520 bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
1521 {
1522 unsigned num_want_acting = 0;
1523 set<pg_shard_t> have;
1524 for (int i = 0; i < (int)want.size(); ++i) {
1525 if (want[i] != CRUSH_ITEM_NONE) {
1526 ++num_want_acting;
1527 have.insert(
1528 pg_shard_t(
1529 want[i],
1530 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1531 }
1532 }
1533 // We go incomplete if below min_size for ec_pools since backfill
1534 // does not currently maintain rollbackability
1535 // Otherwise, we will go "peered", but not "active"
1536 if (num_want_acting < pool.info.min_size &&
1537 (pool.info.is_erasure() ||
1538 !cct->_conf->osd_allow_recovery_below_min_size)) {
1539 dout(10) << __func__ << " failed, below min size" << dendl;
1540 return false;
1541 }
1542
1543 /* Check whether we have enough acting shards to later perform recovery */
1544 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1545 get_pgbackend()->get_is_recoverable_predicate());
1546 if (!(*recoverable_predicate)(have)) {
1547 dout(10) << __func__ << " failed, not recoverable" << dendl;
1548 return false;
1549 }
1550
1551 return true;
1552 }
1553
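// Async recovery: shards whose estimated recovery cost (log-length delta
// plus reported missing objects) exceeds osd_async_recovery_min_cost are
// pulled out of the acting set and recovered in the background, provided
// the remaining set is still recoverable and does not drop below min_size.
// The two variants below differ only in how positions are addressed
// (ec shards vs replicated osds).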
1554 void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1555 const pg_info_t &auth_info,
1556 vector<int> *want,
1557 set<pg_shard_t> *async_recovery) const
1558 {
1559 set<pair<int, pg_shard_t> > candidates_by_cost;
1560 for (uint8_t i = 0; i < want->size(); ++i) {
1561 if ((*want)[i] == CRUSH_ITEM_NONE)
1562 continue;
1563
1564 // Considering log entries to recover is accurate enough for
1565 // now. We could use minimum_to_decode_with_cost() later if
1566 // necessary.
1567 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1568 // do not include strays
1569 if (stray_set.find(shard_i) != stray_set.end())
1570 continue;
1571 // Do not include an osd that is not up, since choosing it as
1572 // an async_recovery_target will move it out of the acting set.
1573 // This results in it being identified as a stray during peering,
1574 // because it is no longer in the up or acting set.
1575 if (!is_up(shard_i))
1576 continue;
1577 auto shard_info = all_info.find(shard_i)->second;
1578 // for ec pools we rollback all entries past the authoritative
1579 // last_update *before* activation. This is relatively inexpensive
1580 // compared to recovery, since it is purely local, so treat shards
1581 // past the authoritative last_update the same as those equal to it.
1582 version_t auth_version = auth_info.last_update.version;
1583 version_t candidate_version = shard_info.last_update.version;
1584 auto approx_missing_objects =
1585 shard_info.stats.stats.sum.num_objects_missing;
1586 if (auth_version > candidate_version) {
1587 approx_missing_objects += auth_version - candidate_version;
1588 }
1589 if (static_cast<uint64_t>(approx_missing_objects) >
1590 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1591 candidates_by_cost.insert(make_pair(approx_missing_objects, shard_i));
1592 }
1593 }
1594
1595 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1596 << dendl;
1597
1598 // take out as many osds as we can for async recovery, in order of cost
1599 for (auto rit = candidates_by_cost.rbegin();
1600 rit != candidates_by_cost.rend(); ++rit) {
1601 pg_shard_t cur_shard = rit->second;
1602 vector<int> candidate_want(*want);
1603 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1604 if (recoverable_and_ge_min_size(candidate_want)) {
1605 want->swap(candidate_want);
1606 async_recovery->insert(cur_shard);
1607 }
1608 }
1609 dout(20) << __func__ << " result want=" << *want
1610 << " async_recovery=" << *async_recovery << dendl;
1611 }
1612
1613 void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1614 const pg_info_t &auth_info,
1615 vector<int> *want,
1616 set<pg_shard_t> *async_recovery) const
1617 {
1618 set<pair<int, pg_shard_t> > candidates_by_cost;
1619 for (auto osd_num : *want) {
1620 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1621 // do not include strays
1622 if (stray_set.find(shard_i) != stray_set.end())
1623 continue;
1624 // Do not include an osd that is not up, since choosing it as
1625 // an async_recovery_target will move it out of the acting set.
1626 // This results in it being identified as a stray during peering,
1627 // because it is no longer in the up or acting set.
1628 if (!is_up(shard_i))
1629 continue;
1630 auto shard_info = all_info.find(shard_i)->second;
1631 // use the approximate magnitude of the difference in length of
1632 // logs plus historical missing objects as the cost of recovery
1633 version_t auth_version = auth_info.last_update.version;
1634 version_t candidate_version = shard_info.last_update.version;
1635 auto approx_missing_objects =
1636 shard_info.stats.stats.sum.num_objects_missing;
1637 if (auth_version > candidate_version) {
1638 approx_missing_objects += auth_version - candidate_version;
1639 } else {
1640 approx_missing_objects += candidate_version - auth_version;
1641 }
1642 if (static_cast<uint64_t>(approx_missing_objects) >
1643 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1644 candidates_by_cost.insert(make_pair(approx_missing_objects, shard_i));
1645 }
1646 }
1647
1648 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1649 << dendl;
1650 // take out as many osds as we can for async recovery, in order of cost
1651 for (auto rit = candidates_by_cost.rbegin();
1652 rit != candidates_by_cost.rend(); ++rit) {
1653 if (want->size() <= pool.info.min_size) {
1654 break;
1655 }
1656 pg_shard_t cur_shard = rit->second;
1657 vector<int> candidate_want(*want);
1658 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1659 if (*it == cur_shard.osd) {
1660 candidate_want.erase(it);
1661 want->swap(candidate_want);
1662 async_recovery->insert(cur_shard);
1663 break;
1664 }
1665 }
1666 }
1667 dout(20) << __func__ << " result want=" << *want
1668 << " async_recovery=" << *async_recovery << dendl;
1669 }
1670
1671 /**
1672 * choose acting
1673 *
1674 * calculate the desired acting, and request a change with the monitor
1675 * if it differs from the current acting.
1676 *
1677 * if restrict_to_up_acting=true, we filter out anything that's not in
1678 * up/acting. in order to lift this restriction, we need to
1679 * 1) check whether it's worth switching the acting set any time we get
1680 * a new pg info (not just here, when recovery finishes)
1681 * 2) check whether anything in want_acting went down on each new map
1682 * (and, if so, calculate a new want_acting)
1683 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1684 * TODO!
1685 */
1686 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1687 bool restrict_to_up_acting,
1688 bool *history_les_bound)
1689 {
1690 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1691 all_info[pg_whoami] = info;
1692
1693 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
1694 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1695 p != all_info.end();
1696 ++p) {
1697 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1698 }
1699 }
1700
1701 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1702 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1703
1704 if (auth_log_shard == all_info.end()) {
1705 if (up != acting) {
1706 dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
1707 << " reverting to up" << dendl;
1708 want_acting = up;
1709 vector<int> empty;
1710 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1711 } else {
1712 dout(10) << __func__ << " failed" << dendl;
1713 ceph_assert(want_acting.empty());
1714 }
1715 return false;
1716 }
1717
1718 ceph_assert(!auth_log_shard->second.is_incomplete());
1719 auth_log_shard_id = auth_log_shard->first;
1720
1721 set<pg_shard_t> want_backfill, want_acting_backfill;
1722 vector<int> want;
1723 stringstream ss;
1724 if (!pool.info.is_erasure())
1725 calc_replicated_acting(
1726 auth_log_shard,
1727 cct->_conf.get_val<uint64_t>(
1728 "osd_force_auth_primary_missing_objects"),
1729 get_osdmap()->get_pg_size(info.pgid.pgid),
1730 acting,
1731 up,
1732 up_primary,
1733 all_info,
1734 restrict_to_up_acting,
1735 &want,
1736 &want_backfill,
1737 &want_acting_backfill,
1738 get_osdmap(),
1739 ss);
1740 else
1741 calc_ec_acting(
1742 auth_log_shard,
1743 get_osdmap()->get_pg_size(info.pgid.pgid),
1744 acting,
1745 up,
1746 all_info,
1747 restrict_to_up_acting,
1748 &want,
1749 &want_backfill,
1750 &want_acting_backfill,
1751 ss);
1752 dout(10) << ss.str() << dendl;
1753
1754 if (!recoverable_and_ge_min_size(want)) {
1755 want_acting.clear();
1756 return false;
1757 }
1758
1759 set<pg_shard_t> want_async_recovery;
1760 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
1761 if (pool.info.is_erasure()) {
1762 choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery);
1763 } else {
1764 choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery);
1765 }
1766 }
1767 if (want != acting) {
1768 dout(10) << __func__ << " want " << want << " != acting " << acting
1769 << ", requesting pg_temp change" << dendl;
1770 want_acting = want;
1771
1772 if (!cct->_conf->osd_debug_no_acting_change) {
1773 if (want_acting == up) {
1774 // There can't be any pending backfill if
1775 // want is the same as crush map up OSDs.
1776 ceph_assert(want_backfill.empty());
1777 vector<int> empty;
1778 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1779 } else
1780 osd->queue_want_pg_temp(info.pgid.pgid, want);
1781 }
1782 return false;
1783 }
1784 want_acting.clear();
1785 acting_recovery_backfill = want_acting_backfill;
1786 dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
1787 ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
1788 if (backfill_targets.empty()) {
1789 // Caller is GetInfo
1790 backfill_targets = want_backfill;
1791 }
1792 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
1793 ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
1794 if (async_recovery_targets.empty() || !needs_recovery()) {
1795 async_recovery_targets = want_async_recovery;
1796 }
1797 // Will not change if already set because up would have had to change
1798 // Verify that nothing in backfill is in stray_set
1799 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1800 i != want_backfill.end();
1801 ++i) {
1802 ceph_assert(stray_set.find(*i) == stray_set.end());
1803 }
1804 dout(10) << "choose_acting want=" << want << " backfill_targets="
1805 << want_backfill << " async_recovery_targets="
1806 << async_recovery_targets << dendl;
1807 return true;
1808 }
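// Sketch of how a caller typically consumes choose_acting()'s return value
// (illustrative only, not a verbatim caller): false means either no usable
// info was found or a pg_temp change was queued, so peering must wait for a
// new osdmap; true means want == acting and the backfill/async-recovery sets
// computed above are final for this interval.
//
//   pg_shard_t auth_log_shard;
//   bool history_les_bound = false;
//   if (!choose_acting(auth_log_shard, false, &history_les_bound)) {
//     // stay in peering; a monitor-driven pg_temp change (if any) will
//     // produce a new map and restart the state machine
//   } else {
//     // proceed, pulling the authoritative log from auth_log_shard
//   }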
1809
1810 /* Build the might_have_unfound set.
1811 *
1812 * This is used by the primary OSD during recovery.
1813 *
1814 * This set tracks the OSDs which might have unfound objects that the primary
1815 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1816 * will remove the OSD from the set.
1817 */
1818 void PG::build_might_have_unfound()
1819 {
1820 ceph_assert(might_have_unfound.empty());
1821 ceph_assert(is_primary());
1822
1823 dout(10) << __func__ << dendl;
1824
1825 check_past_interval_bounds();
1826
1827 might_have_unfound = past_intervals.get_might_have_unfound(
1828 pg_whoami,
1829 pool.info.is_erasure());
1830
1831 // include any (stray) peers
1832 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1833 p != peer_info.end();
1834 ++p)
1835 might_have_unfound.insert(p->first);
1836
1837 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1838 }
1839
1840 void PG::activate(ObjectStore::Transaction& t,
1841 epoch_t activation_epoch,
1842 map<int, map<spg_t,pg_query_t> >& query_map,
1843 map<int,
1844 vector<
1845 pair<pg_notify_t,
1846 PastIntervals> > > *activator_map,
1847 RecoveryCtx *ctx)
1848 {
1849 ceph_assert(!is_peered());
1850 ceph_assert(scrubber.callbacks.empty());
1851 ceph_assert(callbacks_for_degraded_object.empty());
1852
1853 // twiddle pg state
1854 state_clear(PG_STATE_DOWN);
1855
1856 send_notify = false;
1857
1858 if (is_primary()) {
1859 // only update primary last_epoch_started if we will go active
1860 if (acting.size() >= pool.info.min_size) {
1861 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1862 info.last_epoch_started <= activation_epoch);
1863 info.last_epoch_started = activation_epoch;
1864 info.last_interval_started = info.history.same_interval_since;
1865 }
1866 } else if (is_acting(pg_whoami)) {
1867 /* update last_epoch_started on acting replica to whatever the primary sent
1868 * unless it's smaller (could happen if we are going peered rather than
1869 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1870 if (info.last_epoch_started < activation_epoch) {
1871 info.last_epoch_started = activation_epoch;
1872 info.last_interval_started = info.history.same_interval_since;
1873 }
1874 }
1875
1876 auto &missing = pg_log.get_missing();
1877
1878 if (is_primary()) {
1879 last_update_ondisk = info.last_update;
1880 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1881 }
1882 last_update_applied = info.last_update;
1883 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1884
1885 need_up_thru = false;
1886
1887 // write pg info, log
1888 dirty_info = true;
1889 dirty_big_info = true; // maybe
1890
1891 // find out when we commit
1892 t.register_on_complete(
1893 new C_PG_ActivateCommitted(
1894 this,
1895 get_osdmap_epoch(),
1896 activation_epoch));
1897
1898 if (is_primary()) {
1899 // initialize snap_trimq
1900 if (get_osdmap()->require_osd_release < CEPH_RELEASE_MIMIC) {
1901 dout(20) << "activate - purged_snaps " << info.purged_snaps
1902 << " cached_removed_snaps " << pool.cached_removed_snaps
1903 << dendl;
1904 snap_trimq = pool.cached_removed_snaps;
1905 } else {
1906 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
1907 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
1908 snap_trimq.clear();
1909 if (p != removed_snaps_queue.end()) {
1910 dout(20) << "activate - purged_snaps " << info.purged_snaps
1911 << " removed_snaps " << p->second
1912 << dendl;
1913 for (auto q : p->second) {
1914 snap_trimq.insert(q.first, q.second);
1915 }
1916 }
1917 }
1918 interval_set<snapid_t> purged;
1919 purged.intersection_of(snap_trimq, info.purged_snaps);
1920 snap_trimq.subtract(purged);
1921
1922 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
1923 // adjust purged_snaps: PG may have been inactive while snaps were pruned
1924 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
1925 // reflect only those snaps that we thought were pruned and are still in
1926 // the queue.
1927 info.purged_snaps.swap(purged);
1928 }
1929 }
1930
1931 // init complete pointer
1932 if (missing.num_missing() == 0) {
1933 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1934 << " -> " << info.last_update << dendl;
1935 info.last_complete = info.last_update;
1936 info.stats.stats.sum.num_objects_missing = 0;
1937 pg_log.reset_recovery_pointers();
1938 } else {
1939 dout(10) << "activate - not complete, " << missing << dendl;
1940 info.stats.stats.sum.num_objects_missing = missing.num_missing();
1941 pg_log.activate_not_complete(info);
1942 }
1943
1944 log_weirdness();
1945
1946 // if primary..
1947 if (is_primary()) {
1948 ceph_assert(ctx);
1949 // start up replicas
1950
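// The loop below chooses, per peer, one of three ways to bring it up to
// date (a summary; the branches themselves are authoritative):
//  1. the peer already has our last_update: just queue a notify (or send a
//     trivial MOSDPGLog) so it can activate;
//  2. the peer is behind our log tail, has never completed any backfill, or
//     must restart backfill: reset its info and backfill it from scratch;
//  3. otherwise: send it the log entries newer than its last_update so it
//     can catch up through normal recovery.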
1951 ceph_assert(!acting_recovery_backfill.empty());
1952 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1953 i != acting_recovery_backfill.end();
1954 ++i) {
1955 if (*i == pg_whoami) continue;
1956 pg_shard_t peer = *i;
1957 ceph_assert(peer_info.count(peer));
1958 pg_info_t& pi = peer_info[peer];
1959
1960 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1961
1962 MOSDPGLog *m = 0;
1963 ceph_assert(peer_missing.count(peer));
1964 pg_missing_t& pm = peer_missing[peer];
1965
1966 bool needs_past_intervals = pi.dne();
1967
1968 /*
1969 * cover case where peer sort order was different and
1970 * last_backfill cannot be interpreted
1971 */
1972 bool force_restart_backfill =
1973 !pi.last_backfill.is_max() &&
1974 !pi.last_backfill_bitwise;
1975
1976 if (pi.last_update == info.last_update && !force_restart_backfill) {
1977 // empty log
1978 if (!pi.last_backfill.is_max())
1979 osd->clog->info() << info.pgid << " continuing backfill to osd."
1980 << peer
1981 << " from (" << pi.log_tail << "," << pi.last_update
1982 << "] " << pi.last_backfill
1983 << " to " << info.last_update;
1984 if (!pi.is_empty() && activator_map) {
1985 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1986 (*activator_map)[peer.osd].push_back(
1987 make_pair(
1988 pg_notify_t(
1989 peer.shard, pg_whoami.shard,
1990 get_osdmap_epoch(),
1991 get_osdmap_epoch(),
1992 info),
1993 past_intervals));
1994 } else {
1995 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1996 m = new MOSDPGLog(
1997 i->shard, pg_whoami.shard,
1998 get_osdmap_epoch(), info,
1999 last_peering_reset);
2000 }
2001 } else if (
2002 pg_log.get_tail() > pi.last_update ||
2003 pi.last_backfill == hobject_t() ||
2004 force_restart_backfill ||
2005 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2006 /* ^ This last case covers a situation where a replica is not contiguous
2007 * with the auth_log, but is contiguous with this replica. Reshuffling
2008 * the active set to handle this would be tricky, so instead we just go
2009 * ahead and backfill it anyway. This is probably preferable in any
2010 * case since the replica in question would have to be significantly
2011 * behind.
2012 */
2013 // backfill
2014 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
2015 << " from (" << pi.log_tail << "," << pi.last_update
2016 << "] " << pi.last_backfill
2017 << " to " << info.last_update;
2018
2019 pi.last_update = info.last_update;
2020 pi.last_complete = info.last_update;
2021 pi.set_last_backfill(hobject_t());
2022 pi.last_epoch_started = info.last_epoch_started;
2023 pi.last_interval_started = info.last_interval_started;
2024 pi.history = info.history;
2025 pi.hit_set = info.hit_set;
2026 // Save num_bytes for reservation request, can't be negative
2027 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2028 pi.stats.stats.clear();
2029
2030 // initialize peer with our purged_snaps.
2031 pi.purged_snaps = info.purged_snaps;
2032
2033 m = new MOSDPGLog(
2034 i->shard, pg_whoami.shard,
2035 get_osdmap_epoch(), pi,
2036 last_peering_reset /* epoch to create pg at */);
2037
2038 // send some recent log, so that op dup detection works well.
2039 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
2040 m->info.log_tail = m->log.tail;
2041 pi.log_tail = m->log.tail; // sigh...
2042
2043 pm.clear();
2044 } else {
2045 // catch up
2046 ceph_assert(pg_log.get_tail() <= pi.last_update);
2047 m = new MOSDPGLog(
2048 i->shard, pg_whoami.shard,
2049 get_osdmap_epoch(), info,
2050 last_peering_reset /* epoch to create pg at */);
2051 // send new stuff to append to replicas log
2052 m->log.copy_after(pg_log.get_log(), pi.last_update);
2053 }
2054
2055 // share past_intervals if we are creating the pg on the replica
2056 // based on whether our info for that peer was dne() *before*
2057 // updating pi.history in the backfill block above.
2058 if (m && needs_past_intervals)
2059 m->past_intervals = past_intervals;
2060
2061 // update local version of peer's missing list!
2062 if (m && pi.last_backfill != hobject_t()) {
2063 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2064 p != m->log.log.end();
2065 ++p) {
2066 if (p->soid <= pi.last_backfill &&
2067 !p->is_error()) {
2068 if (perform_deletes_during_peering() && p->is_delete()) {
2069 pm.rm(p->soid, p->version);
2070 } else {
2071 pm.add_next_event(*p);
2072 }
2073 }
2074 }
2075 }
2076
2077 if (m) {
2078 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
2079 //m->log.print(cout);
2080 osd->send_message_osd_cluster(peer.osd, m, get_osdmap_epoch());
2081 }
2082
2083 // peer now has
2084 pi.last_update = info.last_update;
2085
2086 // update our missing
2087 if (pm.num_missing() == 0) {
2088 pi.last_complete = pi.last_update;
2089 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
2090 } else {
2091 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
2092 }
2093 }
2094
2095 // Set up missing_loc
2096 set<pg_shard_t> complete_shards;
2097 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2098 i != acting_recovery_backfill.end();
2099 ++i) {
2100 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
2101 if (*i == get_primary()) {
2102 missing_loc.add_active_missing(missing);
2103 if (!missing.have_missing())
2104 complete_shards.insert(*i);
2105 } else {
2106 auto peer_missing_entry = peer_missing.find(*i);
2107 ceph_assert(peer_missing_entry != peer_missing.end());
2108 missing_loc.add_active_missing(peer_missing_entry->second);
2109 if (!peer_missing_entry->second.have_missing() &&
2110 peer_info[*i].last_backfill.is_max())
2111 complete_shards.insert(*i);
2112 }
2113 }
2114
2115 // If necessary, create might_have_unfound to help us find our unfound objects.
2116 // NOTE: It's important that we build might_have_unfound before trimming the
2117 // past intervals.
2118 might_have_unfound.clear();
2119 if (needs_recovery()) {
2120 // If only one shard has missing objects, we simply add all the others as
2121 // recovery sources. This is considered safe since the PGLogs have been merged
2122 // locally, and it covers the vast majority of use cases, such as one OSD/host
2123 // being down for a while for hardware repair.
2124 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2125 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
2126 } else {
2127 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2128 ctx->handle);
2129 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2130 i != acting_recovery_backfill.end();
2131 ++i) {
2132 if (*i == pg_whoami) continue;
2133 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2134 ceph_assert(peer_missing.count(*i));
2135 ceph_assert(peer_info.count(*i));
2136 missing_loc.add_source_info(
2137 *i,
2138 peer_info[*i],
2139 peer_missing[*i],
2140 ctx->handle);
2141 }
2142 }
2143 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2144 i != peer_missing.end();
2145 ++i) {
2146 if (is_acting_recovery_backfill(i->first))
2147 continue;
2148 ceph_assert(peer_info.count(i->first));
2149 search_for_missing(
2150 peer_info[i->first],
2151 i->second,
2152 i->first,
2153 ctx);
2154 }
2155
2156 build_might_have_unfound();
2157
2158 // Always call now so _update_calc_stats() will be accurate
2159 discover_all_missing(query_map);
2160 }
2161
2162 // num_objects_degraded, if calculated, should reflect this too, unless
2163 // nothing is missing and we are about to go clean.
2164 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2165 state_set(PG_STATE_UNDERSIZED);
2166 }
2167
2168 state_set(PG_STATE_ACTIVATING);
2169 release_pg_backoffs();
2170 projected_last_update = info.last_update;
2171 }
2172 if (acting.size() >= pool.info.min_size) {
2173 PGLogEntryHandler handler{this, &t};
2174 pg_log.roll_forward(&handler);
2175 }
2176 }
2177
2178 bool PG::op_has_sufficient_caps(OpRequestRef& op)
2179 {
2180 // only check MOSDOp
2181 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
2182 return true;
2183
2184 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
2185
2186 auto priv = req->get_connection()->get_priv();
2187 auto session = static_cast<Session*>(priv.get());
2188 if (!session) {
2189 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
2190 return false;
2191 }
2192 OSDCap& caps = session->caps;
2193 priv.reset();
2194
2195 const string &key = req->get_hobj().get_key().empty() ?
2196 req->get_oid().name :
2197 req->get_hobj().get_key();
2198
2199 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
2200 pool.info.application_metadata,
2201 key,
2202 op->need_read_cap(),
2203 op->need_write_cap(),
2204 op->classes(),
2205 session->get_peer_socket_addr());
2206
2207 dout(20) << "op_has_sufficient_caps "
2208 << "session=" << session
2209 << " pool=" << pool.id << " (" << pool.name
2210 << " " << req->get_hobj().nspace
2211 << ")"
2212 << " pool_app_metadata=" << pool.info.application_metadata
2213 << " need_read_cap=" << op->need_read_cap()
2214 << " need_write_cap=" << op->need_write_cap()
2215 << " classes=" << op->classes()
2216 << " -> " << (cap ? "yes" : "NO")
2217 << dendl;
2218 return cap;
2219 }
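// Illustrative example of what the check above enforces (standard OSD cap
// syntax; the concrete names are made up): a client authenticated with
//
//   osd 'allow rw pool=rbd namespace=ns1'
//
// passes for read/write ops on objects in pool "rbd", namespace "ns1", and
// gets cap == false for ops touching any other pool/namespace, or for class
// method calls its caps do not also grant.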
2220
2221 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
2222 {
2223 lock();
2224 if (pg_has_reset_since(epoch)) {
2225 dout(10) << "_activate_committed " << epoch
2226 << ", that was an old interval" << dendl;
2227 } else if (is_primary()) {
2228 ceph_assert(!peer_activated.count(pg_whoami));
2229 peer_activated.insert(pg_whoami);
2230 dout(10) << "_activate_committed " << epoch
2231 << " peer_activated now " << peer_activated
2232 << " last_interval_started " << info.history.last_interval_started
2233 << " last_epoch_started " << info.history.last_epoch_started
2234 << " same_interval_since " << info.history.same_interval_since << dendl;
2235 ceph_assert(!acting_recovery_backfill.empty());
2236 if (peer_activated.size() == acting_recovery_backfill.size())
2237 all_activated_and_committed();
2238 } else {
2239 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
2240 MOSDPGInfo *m = new MOSDPGInfo(epoch);
2241 pg_notify_t i = pg_notify_t(
2242 get_primary().shard, pg_whoami.shard,
2243 get_osdmap_epoch(),
2244 get_osdmap_epoch(),
2245 info);
2246
2247 i.info.history.last_epoch_started = activation_epoch;
2248 i.info.history.last_interval_started = i.info.history.same_interval_since;
2249 if (acting.size() >= pool.info.min_size) {
2250 state_set(PG_STATE_ACTIVE);
2251 } else {
2252 state_set(PG_STATE_PEERED);
2253 }
2254
2255 m->pg_list.push_back(make_pair(i, PastIntervals()));
2256 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap_epoch());
2257
2258 // waiters
2259 if (flushes_in_progress == 0) {
2260 requeue_ops(waiting_for_peered);
2261 } else if (!waiting_for_peered.empty()) {
2262 dout(10) << __func__ << " flushes in progress, moving "
2263 << waiting_for_peered.size() << " items to waiting_for_flush"
2264 << dendl;
2265 ceph_assert(waiting_for_flush.empty());
2266 waiting_for_flush.swap(waiting_for_peered);
2267 }
2268 }
2269
2270 ceph_assert(!dirty_info);
2271
2272 unlock();
2273 }
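// In short: the callback above fires once the activation transaction commits.
// On the primary we add ourselves to peer_activated and, once every member of
// acting_recovery_backfill has reported in, call all_activated_and_committed().
// On a replica we flip to ACTIVE or PEERED (depending on min_size) and send an
// MOSDPGInfo back to the primary so it can count us.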
2274
2275 /*
2276 * update info.history.last_epoch_started ONLY after we and all
2277 * replicas have activated AND committed the activate transaction
2278 * (i.e. the peering results are stable on disk).
2279 */
2280 void PG::all_activated_and_committed()
2281 {
2282 dout(10) << "all_activated_and_committed" << dendl;
2283 ceph_assert(is_primary());
2284 ceph_assert(peer_activated.size() == acting_recovery_backfill.size());
2285 ceph_assert(!acting_recovery_backfill.empty());
2286 ceph_assert(blocked_by.empty());
2287
2288 // Degraded?
2289 _update_calc_stats();
2290 if (info.stats.stats.sum.num_objects_degraded) {
2291 state_set(PG_STATE_DEGRADED);
2292 } else {
2293 state_clear(PG_STATE_DEGRADED);
2294 }
2295
2296 queue_peering_event(
2297 PGPeeringEventRef(
2298 std::make_shared<PGPeeringEvent>(
2299 get_osdmap_epoch(),
2300 get_osdmap_epoch(),
2301 AllReplicasActivated())));
2302 }
2303
2304 bool PG::requeue_scrub(bool high_priority)
2305 {
2306 ceph_assert(is_locked());
2307 if (scrub_queued) {
2308 dout(10) << __func__ << ": already queued" << dendl;
2309 return false;
2310 } else {
2311 dout(10) << __func__ << ": queueing" << dendl;
2312 scrub_queued = true;
2313 osd->queue_for_scrub(this, high_priority);
2314 return true;
2315 }
2316 }
2317
2318 void PG::queue_recovery()
2319 {
2320 if (!is_primary() || !is_peered()) {
2321 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2322 ceph_assert(!recovery_queued);
2323 } else if (recovery_queued) {
2324 dout(10) << "queue_recovery -- already queued" << dendl;
2325 } else {
2326 dout(10) << "queue_recovery -- queuing" << dendl;
2327 recovery_queued = true;
2328 osd->queue_for_recovery(this);
2329 }
2330 }
2331
2332 bool PG::queue_scrub()
2333 {
2334 ceph_assert(is_locked());
2335 if (is_scrubbing()) {
2336 return false;
2337 }
2338 // An interrupted recovery repair could leave this set.
2339 state_clear(PG_STATE_REPAIR);
2340 scrubber.priority = scrubber.must_scrub ?
2341 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2342 scrubber.must_scrub = false;
2343 state_set(PG_STATE_SCRUBBING);
2344 if (scrubber.must_deep_scrub) {
2345 state_set(PG_STATE_DEEP_SCRUB);
2346 scrubber.must_deep_scrub = false;
2347 }
2348 if (scrubber.must_repair || scrubber.auto_repair) {
2349 state_set(PG_STATE_REPAIR);
2350 scrubber.must_repair = false;
2351 }
2352 requeue_scrub();
2353 return true;
2354 }
2355
2356 unsigned PG::get_scrub_priority()
2357 {
2358 // a higher value -> a higher priority
2359 int64_t pool_scrub_priority = 0;
2360 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2361 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2362 }
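// Example (values illustrative): after `ceph osd pool set <pool>
// scrub_priority 10` the pool option wins and the function above returns 10;
// with the option unset (0) we fall back to the osd_scrub_priority config
// value.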
2363
2364 void PG::try_mark_clean()
2365 {
2366 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2367 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2368 state_set(PG_STATE_CLEAN);
2369 info.history.last_epoch_clean = get_osdmap_epoch();
2370 info.history.last_interval_clean = info.history.same_interval_since;
2371 past_intervals.clear();
2372 dirty_big_info = true;
2373 dirty_info = true;
2374 }
2375
2376 if (is_active()) {
2377 kick_snap_trim();
2378 } else if (is_peered()) {
2379 if (is_clean()) {
2380 bool target;
2381 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2382 if (target) {
2383 ldout(cct, 10) << "ready to merge (target)" << dendl;
2384 osd->set_ready_to_merge_target(this,
2385 info.last_update,
2386 info.history.last_epoch_started,
2387 info.history.last_epoch_clean);
2388 } else {
2389 ldout(cct, 10) << "ready to merge (source)" << dendl;
2390 osd->set_ready_to_merge_source(this, info.last_update);
2391 }
2392 }
2393 } else {
2394 ldout(cct, 10) << "not clean, not ready to merge" << dendl;
2395 // we should have notified OSD in Active state entry point
2396 }
2397 }
2398
2399 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2400
2401 share_pg_info();
2402 publish_stats_to_osd();
2403 requeue_ops(waiting_for_clean_to_primary_repair);
2404 }
2405
2406 bool PG::set_force_recovery(bool b)
2407 {
2408 bool did = false;
2409 if (b) {
2410 if (!(state & PG_STATE_FORCED_RECOVERY) &&
2411 (state & (PG_STATE_DEGRADED |
2412 PG_STATE_RECOVERY_WAIT |
2413 PG_STATE_RECOVERING))) {
2414 dout(20) << __func__ << " set" << dendl;
2415 state_set(PG_STATE_FORCED_RECOVERY);
2416 publish_stats_to_osd();
2417 did = true;
2418 }
2419 } else if (state & PG_STATE_FORCED_RECOVERY) {
2420 dout(20) << __func__ << " clear" << dendl;
2421 state_clear(PG_STATE_FORCED_RECOVERY);
2422 publish_stats_to_osd();
2423 did = true;
2424 }
2425 if (did) {
2426 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2427 osd->local_reserver.update_priority(info.pgid, get_recovery_priority());
2428 }
2429 return did;
2430 }
2431
2432 bool PG::set_force_backfill(bool b)
2433 {
2434 bool did = false;
2435 if (b) {
2436 if (!(state & PG_STATE_FORCED_BACKFILL) &&
2437 (state & (PG_STATE_DEGRADED |
2438 PG_STATE_BACKFILL_WAIT |
2439 PG_STATE_BACKFILLING))) {
2440 dout(10) << __func__ << " set" << dendl;
2441 state_set(PG_STATE_FORCED_BACKFILL);
2442 publish_stats_to_osd();
2443 did = true;
2444 }
2445 } else if (state & PG_STATE_FORCED_BACKFILL) {
2446 dout(10) << __func__ << " clear" << dendl;
2447 state_clear(PG_STATE_FORCED_BACKFILL);
2448 publish_stats_to_osd();
2449 did = true;
2450 }
2451 if (did) {
2452 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2453 osd->local_reserver.update_priority(info.pgid, get_backfill_priority());
2454 }
2455 return did;
2456 }
2457
2458 inline int PG::clamp_recovery_priority(int priority)
2459 {
2460 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2461 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2462
2463 // Clamp to valid range
2464 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2465 return OSD_RECOVERY_PRIORITY_MAX;
2466 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2467 return OSD_RECOVERY_PRIORITY_MIN;
2468 } else {
2469 return priority;
2470 }
2471 }
2472
2473 unsigned PG::get_recovery_priority()
2474 {
2475 // a higher value -> a higher priority
2476 int64_t ret = 0;
2477
2478 if (state & PG_STATE_FORCED_RECOVERY) {
2479 ret = OSD_RECOVERY_PRIORITY_FORCED;
2480 } else {
2481 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2482 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2483 }
2484 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2485 return static_cast<unsigned>(ret);
2486 }
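// e.g. a pool with recovery_priority set to 5 yields
// clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + 5), while forced
// recovery skips the clamp and uses OSD_RECOVERY_PRIORITY_FORCED directly.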
2487
2488 unsigned PG::get_backfill_priority()
2489 {
2490 // a higher value -> a higher priority
2491 int ret = OSD_BACKFILL_PRIORITY_BASE;
2492 if (state & PG_STATE_FORCED_BACKFILL) {
2493 ret = OSD_BACKFILL_PRIORITY_FORCED;
2494 } else {
2495 if (acting.size() < pool.info.min_size) {
2496 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2497 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2498
2499 } else if (is_undersized()) {
2500 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2501 ceph_assert(pool.info.size > actingset.size());
2502 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2503
2504 } else if (is_degraded()) {
2505 // degraded: baseline degraded
2506 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2507 }
2508
2509 // Adjust with pool's recovery priority
2510 int64_t pool_recovery_priority = 0;
2511 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2512
2513 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2514 }
2515
2516 return static_cast<unsigned>(ret);
2517 }
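// Worked example for get_backfill_priority() above (constants referenced
// symbolically): a replicated pool with size=3, min_size=2 and only one OSD
// left in the acting set is below min_size, blocks IO, and gets
//   OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (2 - 1)
// before the pool recovery_priority adjustment and clamp.  A merely
// undersized PG (2 of 3 replicas present) gets
//   OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (3 - 2) instead.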
2518
2519 unsigned PG::get_delete_priority()
2520 {
2521 auto state = get_osdmap()->get_state(osd->whoami);
2522 if (state & (CEPH_OSD_BACKFILLFULL |
2523 CEPH_OSD_FULL)) {
2524 return OSD_DELETE_PRIORITY_FULL;
2525 } else if (state & CEPH_OSD_NEARFULL) {
2526 return OSD_DELETE_PRIORITY_FULLISH;
2527 } else {
2528 return OSD_DELETE_PRIORITY_NORMAL;
2529 }
2530 }
2531
2532 Context *PG::finish_recovery()
2533 {
2534 dout(10) << "finish_recovery" << dendl;
2535 ceph_assert(info.last_complete == info.last_update);
2536
2537 clear_recovery_state();
2538
2539 /*
2540 * sync all this before purging strays. but don't block!
2541 */
2542 finish_sync_event = new C_PG_FinishRecovery(this);
2543 return finish_sync_event;
2544 }
2545
2546 void PG::_finish_recovery(Context *c)
2547 {
2548 lock();
2549 // When recovery is initiated by a repair, that flag is left on
2550 state_clear(PG_STATE_REPAIR);
2551 if (deleting) {
2552 unlock();
2553 return;
2554 }
2555 if (c == finish_sync_event) {
2556 dout(10) << "_finish_recovery" << dendl;
2557 finish_sync_event = 0;
2558 purge_strays();
2559
2560 publish_stats_to_osd();
2561
2562 if (scrub_after_recovery) {
2563 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2564 scrub_after_recovery = false;
2565 scrubber.must_deep_scrub = true;
2566 scrubber.check_repair = true;
2567 queue_scrub();
2568 }
2569 } else {
2570 dout(10) << "_finish_recovery -- stale" << dendl;
2571 }
2572 unlock();
2573 }
2574
2575 void PG::start_recovery_op(const hobject_t& soid)
2576 {
2577 dout(10) << "start_recovery_op " << soid
2578 #ifdef DEBUG_RECOVERY_OIDS
2579 << " (" << recovering_oids << ")"
2580 #endif
2581 << dendl;
2582 ceph_assert(recovery_ops_active >= 0);
2583 recovery_ops_active++;
2584 #ifdef DEBUG_RECOVERY_OIDS
2585 recovering_oids.insert(soid);
2586 #endif
2587 osd->start_recovery_op(this, soid);
2588 }
2589
2590 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2591 {
2592 dout(10) << "finish_recovery_op " << soid
2593 #ifdef DEBUG_RECOVERY_OIDS
2594 << " (" << recovering_oids << ")"
2595 #endif
2596 << dendl;
2597 ceph_assert(recovery_ops_active > 0);
2598 recovery_ops_active--;
2599 #ifdef DEBUG_RECOVERY_OIDS
2600 ceph_assert(recovering_oids.count(soid));
2601 recovering_oids.erase(recovering_oids.find(soid));
2602 #endif
2603 osd->finish_recovery_op(this, soid, dequeue);
2604
2605 if (!dequeue) {
2606 queue_recovery();
2607 }
2608 }
2609
2610 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2611 {
2612 child->update_snap_mapper_bits(split_bits);
2613 child->update_osdmap_ref(get_osdmap());
2614
2615 child->pool = pool;
2616
2617 // Log
2618 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2619 child->info.last_complete = info.last_complete;
2620
2621 info.last_update = pg_log.get_head();
2622 child->info.last_update = child->pg_log.get_head();
2623
2624 child->info.last_user_version = info.last_user_version;
2625
2626 info.log_tail = pg_log.get_tail();
2627 child->info.log_tail = child->pg_log.get_tail();
2628
2629 if (info.last_complete < pg_log.get_tail())
2630 info.last_complete = pg_log.get_tail();
2631 if (child->info.last_complete < child->pg_log.get_tail())
2632 child->info.last_complete = child->pg_log.get_tail();
2633
2634 // Info
2635 child->info.history = info.history;
2636 child->info.history.epoch_created = get_osdmap_epoch();
2637 child->info.purged_snaps = info.purged_snaps;
2638
2639 if (info.last_backfill.is_max()) {
2640 child->info.set_last_backfill(hobject_t::get_max());
2641 } else {
2642 // restart backfill on parent and child to be safe. we could
2643 // probably do better in the bitwise sort case, but it's more
2644 // fragile (there may be special work to do on backfill completion
2645 // in the future).
2646 info.set_last_backfill(hobject_t());
2647 child->info.set_last_backfill(hobject_t());
2648 // restarting backfill implies that the missing set is empty,
2649 // since it is only used for objects prior to last_backfill
2650 pg_log.reset_backfill();
2651 child->pg_log.reset_backfill();
2652 }
2653
2654 child->info.stats = info.stats;
2655 child->info.stats.parent_split_bits = split_bits;
2656 info.stats.stats_invalid = true;
2657 child->info.stats.stats_invalid = true;
2658 child->info.last_epoch_started = info.last_epoch_started;
2659 child->info.last_interval_started = info.last_interval_started;
2660
2661 child->snap_trimq = snap_trimq;
2662
2663 // There can't be recovery/backfill going on now
2664 int primary, up_primary;
2665 vector<int> newup, newacting;
2666 get_osdmap()->pg_to_up_acting_osds(
2667 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2668 child->init_primary_up_acting(
2669 newup,
2670 newacting,
2671 up_primary,
2672 primary);
2673 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2674
2675 // this comparison includes primary rank via pg_shard_t
2676 if (get_primary() != child->get_primary())
2677 child->info.history.same_primary_since = get_osdmap_epoch();
2678
2679 child->info.stats.up = up;
2680 child->info.stats.up_primary = up_primary;
2681 child->info.stats.acting = acting;
2682 child->info.stats.acting_primary = primary;
2683 child->info.stats.mapping_epoch = get_osdmap_epoch();
2684
2685 // History
2686 child->past_intervals = past_intervals;
2687
2688 _split_into(child_pgid, child, split_bits);
2689
2690 // release all backoffs for simplicity
2691 release_backoffs(hobject_t(), hobject_t::get_max());
2692
2693 child->on_new_interval();
2694
2695 child->send_notify = !child->is_primary();
2696
2697 child->dirty_info = true;
2698 child->dirty_big_info = true;
2699 dirty_info = true;
2700 dirty_big_info = true;
2701 }
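// split_into() above is driven by pg_num increases: the parent keeps the
// objects that still hash to it and the child gets the rest, with the log
// divided according to split_bits.  Stats are marked invalid on both sides
// until they can be recomputed.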
2702
2703 void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
2704 {
2705 out->resize(childpgs.size() + 1);
2706 info.stats.stats.sum.split(*out);
2707 }
2708
2709 void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t)
2710 {
2711 info.stats.stats.sum = stats;
2712 write_if_dirty(*t);
2713 }
2714
2715 void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
2716 unsigned split_bits,
2717 const pg_merge_meta_t& last_pg_merge_meta)
2718 {
2719 dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
2720 << dendl;
2721 bool incomplete = false;
2722 if (info.last_complete != info.last_update ||
2723 info.is_incomplete() ||
2724 info.dne()) {
2725 dout(10) << __func__ << " target incomplete" << dendl;
2726 incomplete = true;
2727 }
2728 if (last_pg_merge_meta.source_pgid != pg_t()) {
2729 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2730 dout(10) << __func__ << " target doesn't match expected parent "
2731 << last_pg_merge_meta.source_pgid.get_parent()
2732 << " of source_pgid " << last_pg_merge_meta.source_pgid
2733 << dendl;
2734 incomplete = true;
2735 }
2736 if (info.last_update != last_pg_merge_meta.target_version) {
2737 dout(10) << __func__ << " target version doesn't match expected "
2738 << last_pg_merge_meta.target_version << dendl;
2739 incomplete = true;
2740 }
2741 }
2742
2743 PGLogEntryHandler handler{this, rctx->transaction};
2744 pg_log.roll_forward(&handler);
2745
2746 info.last_complete = info.last_update; // to fake out trim()
2747 pg_log.reset_recovery_pointers();
2748 pg_log.trim(info.last_update, info);
2749
2750 vector<PGLog*> log_from;
2751 for (auto& i : sources) {
2752 auto& source = i.second;
2753 if (!source) {
2754 dout(10) << __func__ << " source " << i.first << " missing" << dendl;
2755 incomplete = true;
2756 continue;
2757 }
2758 if (source->info.last_complete != source->info.last_update ||
2759 source->info.is_incomplete() ||
2760 source->info.dne()) {
2761 dout(10) << __func__ << " source " << source->pg_id << " incomplete"
2762 << dendl;
2763 incomplete = true;
2764 }
2765 if (last_pg_merge_meta.source_pgid != pg_t()) {
2766 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
2767 dout(10) << __func__ << " source " << source->info.pgid.pgid
2768 << " doesn't match expected source pgid "
2769 << last_pg_merge_meta.source_pgid << dendl;
2770 incomplete = true;
2771 }
2772 if (source->info.last_update != last_pg_merge_meta.source_version) {
2773 dout(10) << __func__ << " source version doesn't match expected "
2774 << last_pg_merge_meta.source_version << dendl;
2775 incomplete = true;
2776 }
2777 }
2778
2779 // prepare log
2780 PGLogEntryHandler handler{source.get(), rctx->transaction};
2781 source->pg_log.roll_forward(&handler);
2782 source->info.last_complete = source->info.last_update; // to fake out trim()
2783 source->pg_log.reset_recovery_pointers();
2784 source->pg_log.trim(source->info.last_update, source->info);
2785 log_from.push_back(&source->pg_log);
2786
2787 // wipe out source's pgmeta
2788 rctx->transaction->remove(source->coll, source->pgmeta_oid);
2789
2790 // merge (and destroy source collection)
2791 rctx->transaction->merge_collection(source->coll, coll, split_bits);
2792
2793 // combine stats
2794 info.stats.add(source->info.stats);
2795
2796 // pull up last_update
2797 info.last_update = std::max(info.last_update, source->info.last_update);
2798
2799 // adopt source's PastIntervals if target has none. we can do this since
2800 // pgp_num has been reduced prior to the merge, so the OSD mappings for
2801 // the PGs are identical.
2802 if (past_intervals.empty() && !source->past_intervals.empty()) {
2803 dout(10) << __func__ << " taking source's past_intervals" << dendl;
2804 past_intervals = source->past_intervals;
2805 }
2806 }
2807
2808 // merge_collection does this, but maybe all of our sources were missing.
2809 rctx->transaction->collection_set_bits(coll, split_bits);
2810
2811 info.last_complete = info.last_update;
2812 info.log_tail = info.last_update;
2813 if (incomplete) {
2814 info.last_backfill = hobject_t();
2815 }
2816
2817 snap_mapper.update_bits(split_bits);
2818
2819 // merge logs
2820 pg_log.merge_from(log_from, info.last_update);
2821
2822 // make sure we have a meaningful last_epoch_started/clean (if we were a
2823 // placeholder)
2824 if (info.last_epoch_started == 0) {
2825 // start with (a) source's history, since these PGs *should* have been
2826 // remapped in concert with each other...
2827 info.history = sources.begin()->second->info.history;
2828
2829 // we use the last_epoch_{started,clean} we got from
2830 // the caller, which are the epochs that were reported when the PGs were
2831 // found to be ready to merge.
2832 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
2833 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2834 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2835 dout(10) << __func__
2836 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
2837 << last_pg_merge_meta.last_epoch_clean
2838 << " from pool last_dec_*, source pg history was "
2839 << sources.begin()->second->info.history
2840 << dendl;
2841
2842 // if the past_intervals start is later than last_epoch_clean, it
2843 // implies the source re-peered again but the target didn't, or
2844 // that the source became clean in a later epoch than the target.
2845 // avoid the discrepancy by adjusting the interval start
2846 // backwards to match so that check_past_interval_bounds() will
2847 // not complain.
2848 auto pib = past_intervals.get_bounds();
2849 if (info.history.last_epoch_clean < pib.first) {
2850 dout(10) << __func__ << " last_epoch_clean "
2851 << info.history.last_epoch_clean << " < past_interval start "
2852 << pib.first << ", adjusting start backwards" << dendl;
2853 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
2854 }
2855
2856 // Similarly, if the same_interval_since value is later than
2857 // last_epoch_clean, the next interval change will result in a
2858 // past_interval start that is later than last_epoch_clean. This
2859 // can happen if we use the pg_history values from the merge
2860 // source. Adjust the same_interval_since value backwards if that
2861 // happens. (We trust the les and lec values more because they came from
2862 // the real target, whereas the history value we stole from the source.)
2863 if (info.history.last_epoch_started < info.history.same_interval_since) {
2864 dout(10) << __func__ << " last_epoch_started "
2865 << info.history.last_epoch_started << " < same_interval_since "
2866 << info.history.same_interval_since
2867 << ", adjusting pg_history backwards" << dendl;
2868 info.history.same_interval_since = info.history.last_epoch_clean;
2869 // make sure same_{up,primary}_since are <= same_interval_since
2870 info.history.same_up_since = std::min(
2871 info.history.same_up_since, info.history.same_interval_since);
2872 info.history.same_primary_since = std::min(
2873 info.history.same_primary_since, info.history.same_interval_since);
2874 }
2875 }
2876
2877 dirty_info = true;
2878 dirty_big_info = true;
2879 }
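// Note on the `incomplete` flag above: if the target or any source was
// missing, incomplete, or did not line up with last_pg_merge_meta, we set
// last_backfill back to hobject_t() so the merged PG will re-backfill rather
// than trust possibly partial contents.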
2880
2881 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2882 {
2883 ConnectionRef con = s->con;
2884 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2885 return;
2886 BackoffRef b(s->have_backoff(info.pgid, begin));
2887 if (b) {
2888 derr << __func__ << " already have backoff for " << s << " begin " << begin
2889 << " " << *b << dendl;
2890 ceph_abort();
2891 }
2892 std::lock_guard l(backoff_lock);
2893 {
2894 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2895 backoffs[begin].insert(b);
2896 s->add_backoff(b);
2897 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2898 }
2899 con->send_message(
2900 new MOSDBackoff(
2901 info.pgid,
2902 get_osdmap_epoch(),
2903 CEPH_OSD_BACKOFF_OP_BLOCK,
2904 b->id,
2905 begin,
2906 end));
2907 }
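// Client-visible effect of the BLOCK message above (sketch): the client stops
// sending ops for objects in [begin, end) on this PG until a matching
// CEPH_OSD_BACKOFF_OP_UNBLOCK with the same id arrives (sent from
// release_backoffs() below); ops the client had already queued for that range
// are expected to be resent once unblocked.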
2908
2909 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2910 {
2911 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2912 vector<BackoffRef> bv;
2913 {
2914 std::lock_guard l(backoff_lock);
2915 auto p = backoffs.lower_bound(begin);
2916 while (p != backoffs.end()) {
2917 int r = cmp(p->first, end);
2918 dout(20) << __func__ << " ? " << r << " " << p->first
2919 << " " << p->second << dendl;
2920 // note: must still examine begin=end=p->first case
2921 if (r > 0 || (r == 0 && begin < end)) {
2922 break;
2923 }
2924 dout(20) << __func__ << " checking " << p->first
2925 << " " << p->second << dendl;
2926 auto q = p->second.begin();
2927 while (q != p->second.end()) {
2928 dout(20) << __func__ << " checking " << *q << dendl;
2929 int r = cmp((*q)->begin, begin);
2930 if (r == 0 || (r > 0 && (*q)->end < end)) {
2931 bv.push_back(*q);
2932 q = p->second.erase(q);
2933 } else {
2934 ++q;
2935 }
2936 }
2937 if (p->second.empty()) {
2938 p = backoffs.erase(p);
2939 } else {
2940 ++p;
2941 }
2942 }
2943 }
2944 for (auto b : bv) {
2945 std::lock_guard l(b->lock);
2946 dout(10) << __func__ << " " << *b << dendl;
2947 if (b->session) {
2948 ceph_assert(b->pg == this);
2949 ConnectionRef con = b->session->con;
2950 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2951 con->send_message(
2952 new MOSDBackoff(
2953 info.pgid,
2954 get_osdmap_epoch(),
2955 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2956 b->id,
2957 b->begin,
2958 b->end));
2959 }
2960 if (b->is_new()) {
2961 b->state = Backoff::STATE_DELETING;
2962 } else {
2963 b->session->rm_backoff(b);
2964 b->session.reset();
2965 }
2966 b->pg.reset();
2967 }
2968 }
2969 }
2970
2971 void PG::clear_backoffs()
2972 {
2973 dout(10) << __func__ << " " << dendl;
2974 map<hobject_t,set<BackoffRef>> ls;
2975 {
2976 std::lock_guard l(backoff_lock);
2977 ls.swap(backoffs);
2978 }
2979 for (auto& p : ls) {
2980 for (auto& b : p.second) {
2981 std::lock_guard l(b->lock);
2982 dout(10) << __func__ << " " << *b << dendl;
2983 if (b->session) {
2984 ceph_assert(b->pg == this);
2985 if (b->is_new()) {
2986 b->state = Backoff::STATE_DELETING;
2987 } else {
2988 b->session->rm_backoff(b);
2989 b->session.reset();
2990 }
2991 b->pg.reset();
2992 }
2993 }
2994 }
2995 }
2996
2997 // called by Session::clear_backoffs()
2998 void PG::rm_backoff(BackoffRef b)
2999 {
3000 dout(10) << __func__ << " " << *b << dendl;
3001 std::lock_guard l(backoff_lock);
3002 ceph_assert(b->lock.is_locked_by_me());
3003 ceph_assert(b->pg == this);
3004 auto p = backoffs.find(b->begin);
3005 // may race with release_backoffs()
3006 if (p != backoffs.end()) {
3007 auto q = p->second.find(b);
3008 if (q != p->second.end()) {
3009 p->second.erase(q);
3010 if (p->second.empty()) {
3011 backoffs.erase(p);
3012 }
3013 }
3014 }
3015 }
3016
3017 void PG::clear_recovery_state()
3018 {
3019 dout(10) << "clear_recovery_state" << dendl;
3020
3021 pg_log.reset_recovery_pointers();
3022 finish_sync_event = 0;
3023
3024 hobject_t soid;
3025 while (recovery_ops_active > 0) {
3026 #ifdef DEBUG_RECOVERY_OIDS
3027 soid = *recovering_oids.begin();
3028 #endif
3029 finish_recovery_op(soid, true);
3030 }
3031
3032 async_recovery_targets.clear();
3033 backfill_targets.clear();
3034 backfill_info.clear();
3035 peer_backfill_info.clear();
3036 waiting_on_backfill.clear();
3037 _clear_recovery_state(); // pg impl specific hook
3038 }
3039
3040 void PG::cancel_recovery()
3041 {
3042 dout(10) << "cancel_recovery" << dendl;
3043 clear_recovery_state();
3044 }
3045
3046
3047 void PG::purge_strays()
3048 {
3049 if (is_premerge()) {
3050 dout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
3051 << dendl;
3052 return;
3053 }
3054 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
3055 return;
3056 }
3057 dout(10) << "purge_strays " << stray_set << dendl;
3058
3059 bool removed = false;
3060 for (set<pg_shard_t>::iterator p = stray_set.begin();
3061 p != stray_set.end();
3062 ++p) {
3063 ceph_assert(!is_acting_recovery_backfill(*p));
3064 if (get_osdmap()->is_up(p->osd)) {
3065 dout(10) << "sending PGRemove to osd." << *p << dendl;
3066 vector<spg_t> to_remove;
3067 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
3068 MOSDPGRemove *m = new MOSDPGRemove(
3069 get_osdmap_epoch(),
3070 to_remove);
3071 osd->send_message_osd_cluster(p->osd, m, get_osdmap_epoch());
3072 } else {
3073 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
3074 }
3075 peer_missing.erase(*p);
3076 peer_info.erase(*p);
3077 peer_purged.insert(*p);
3078 removed = true;
3079 }
3080
3081 // if we removed anyone, update peers (which include peer_info)
3082 if (removed)
3083 update_heartbeat_peers();
3084
3085 stray_set.clear();
3086
3087 // clear _requested maps; we may have to peer() again if we discover
3088 // (more) stray content
3089 peer_log_requested.clear();
3090 peer_missing_requested.clear();
3091 }
3092
3093 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
3094 {
3095 std::lock_guard l(heartbeat_peer_lock);
3096 probe_targets.clear();
3097 for (set<pg_shard_t>::iterator i = probe_set.begin();
3098 i != probe_set.end();
3099 ++i) {
3100 probe_targets.insert(i->osd);
3101 }
3102 }
3103
3104 void PG::clear_probe_targets()
3105 {
3106 std::lock_guard l(heartbeat_peer_lock);
3107 probe_targets.clear();
3108 }
3109
3110 void PG::update_heartbeat_peers()
3111 {
3112 ceph_assert(is_locked());
3113
3114 if (!is_primary())
3115 return;
3116
3117 set<int> new_peers;
3118 for (unsigned i=0; i<acting.size(); i++) {
3119 if (acting[i] != CRUSH_ITEM_NONE)
3120 new_peers.insert(acting[i]);
3121 }
3122 for (unsigned i=0; i<up.size(); i++) {
3123 if (up[i] != CRUSH_ITEM_NONE)
3124 new_peers.insert(up[i]);
3125 }
3126 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
3127 p != peer_info.end();
3128 ++p)
3129 new_peers.insert(p->first.osd);
3130
3131 bool need_update = false;
3132 heartbeat_peer_lock.Lock();
3133 if (new_peers == heartbeat_peers) {
3134 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
3135 } else {
3136 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
3137 heartbeat_peers.swap(new_peers);
3138 need_update = true;
3139 }
3140 heartbeat_peer_lock.Unlock();
3141
3142 if (need_update)
3143 osd->need_heartbeat_peer_update();
3144 }
3145
3146
3147 bool PG::check_in_progress_op(
3148 const osd_reqid_t &r,
3149 eversion_t *version,
3150 version_t *user_version,
3151 int *return_code) const
3152 {
3153 return (
3154 projected_log.get_request(r, version, user_version, return_code) ||
3155 pg_log.get_log().get_request(r, version, user_version, return_code));
3156 }
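// check_in_progress_op() above is used for dup-op detection: if a client
// resends a request whose reqid is still present in either the in-flight
// projected log or the durable pg log, we can return the recorded
// version/user_version/return code instead of re-executing the op.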
3157
3158 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3159 {
3160 for (auto&p : pgs)
3161 if (p.shard == shard)
3162 return true;
3163 return false;
3164 }
3165
3166 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3167 {
3168 for (auto&p : pgs) {
3169 if (p == skip)
3170 continue;
3171 if (p.shard == shard)
3172 return p;
3173 }
3174 return pg_shard_t();
3175 }
3176
3177 void PG::_update_calc_stats()
3178 {
3179 info.stats.version = info.last_update;
3180 info.stats.created = info.history.epoch_created;
3181 info.stats.last_scrub = info.history.last_scrub;
3182 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3183 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3184 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3185 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3186 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3187
3188 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3189 info.stats.ondisk_log_size = info.stats.log_size;
3190 info.stats.log_start = pg_log.get_tail();
3191 info.stats.ondisk_log_start = pg_log.get_tail();
3192 info.stats.snaptrimq_len = snap_trimq.size();
3193
3194 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3195
3196 // In the rare case that upset is too large (usually transient), use it as the
3197 // target for the calculations below.
3198 unsigned target = std::max(num_shards, (unsigned)upset.size());
3199 // For undersized PGs, actingset may be larger than upset when OSDs are out
3200 unsigned nrep = std::max(actingset.size(), upset.size());
3201 // calc num_object_copies
3202 info.stats.stats.calc_copies(std::max(target, nrep));
3203 info.stats.stats.sum.num_objects_degraded = 0;
3204 info.stats.stats.sum.num_objects_unfound = 0;
3205 info.stats.stats.sum.num_objects_misplaced = 0;
3206
3207 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
3208 dout(20) << __func__ << " actingset " << actingset << " upset "
3209 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3210 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
3211
3212 ceph_assert(!acting_recovery_backfill.empty());
3213
3214 bool estimate = false;
3215
3216 // NOTE: we only generate degraded, misplaced and unfound
3217 // values for the summation, not individual stat categories.
3218 int64_t num_objects = info.stats.stats.sum.num_objects;
3219
3220 // Objects missing from up nodes, sorted by # objects.
3221 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3222 // Objects missing from nodes not in up, sorted by # objects
3223 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3224
3225 // Fill missing_target_objects/acting_source_objects
3226
3227 {
3228 int64_t missing;
3229
3230 // Primary first
3231 missing = pg_log.get_missing().num_missing();
3232 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3233 if (upset.count(pg_whoami)) {
3234 missing_target_objects.insert(make_pair(missing, pg_whoami));
3235 } else {
3236 acting_source_objects.insert(make_pair(missing, pg_whoami));
3237 }
3238 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3239 dout(20) << __func__ << " shard " << pg_whoami
3240 << " primary objects " << num_objects
3241 << " missing " << missing
3242 << dendl;
3243 }
3244
3245 // All other peers
3246 for (auto& peer : peer_info) {
3247 // Primary should not be in the peer_info, skip if it is.
3248 if (peer.first == pg_whoami) continue;
3249 int64_t missing = 0;
3250 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3251 // Backfill targets always track num_objects accurately;
3252 // all other peers track missing accurately.
3253 if (is_backfill_targets(peer.first)) {
3254 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3255 } else {
3256 if (peer_missing.count(peer.first)) {
3257 missing = peer_missing[peer.first].num_missing();
3258 } else {
3259 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
3260 if (is_recovering()) {
3261 estimate = true;
3262 }
3263 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3264 }
3265 }
3266 if (upset.count(peer.first)) {
3267 missing_target_objects.insert(make_pair(missing, peer.first));
3268 } else if (actingset.count(peer.first)) {
3269 acting_source_objects.insert(make_pair(missing, peer.first));
3270 }
3271 peer.second.stats.stats.sum.num_objects_missing = missing;
3272 dout(20) << __func__ << " shard " << peer.first
3273 << " objects " << peer_num_objects
3274 << " missing " << missing
3275 << dendl;
3276 }
3277
3278 // A misplaced object is not stored on the correct OSD
3279 int64_t misplaced = 0;
3280 // a degraded object has fewer replicas or EC shards than the pool specifies.
3281 int64_t degraded = 0;
3282
3283 if (is_recovering()) {
3284 for (auto& sml: missing_loc.get_missing_by_count()) {
3285 for (auto& ml: sml.second) {
3286 int missing_shards;
3287 if (sml.first == shard_id_t::NO_SHARD) {
3288 dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
3289 missing_shards = (int)upset.size() - ml.first.up;
3290 } else {
3291 // Handle shards not even in upset below
3292 if (!find_shard(upset, sml.first))
3293 continue;
3294 missing_shards = std::max(0, 1 - ml.first.up);
3295 dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
3296 }
3297 int odegraded = ml.second * missing_shards;
3298 // Copies on other osds but limited to the possible degraded
3299 int more_osds = std::min(missing_shards, ml.first.other);
3300 int omisplaced = ml.second * more_osds;
3301 ceph_assert(omisplaced <= odegraded);
3302 odegraded -= omisplaced;
3303
3304 misplaced += omisplaced;
3305 degraded += odegraded;
3306 }
3307 }
3308
3309 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
3310 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
3311
3312 // Handle undersized case
3313 if (pool.info.is_replicated()) {
3314 // Add degraded for missing targets (num_objects missing)
3315 ceph_assert(target >= upset.size());
3316 unsigned needed = target - upset.size();
3317 degraded += num_objects * needed;
3318 } else {
3319 for (unsigned i = 0 ; i < num_shards; ++i) {
3320 shard_id_t shard(i);
3321
3322 if (!find_shard(upset, shard)) {
3323 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3324
3325 if (pgs != pg_shard_t()) {
3326 int64_t missing;
3327
3328 if (pgs == pg_whoami)
3329 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3330 else
3331 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3332
3333 degraded += missing;
3334 misplaced += std::max((int64_t)0, num_objects - missing);
3335 } else {
3336 // No shard anywhere
3337 degraded += num_objects;
3338 }
3339 }
3340 }
3341 }
3342 goto out;
3343 }
3344
3345 // Handle undersized case
3346 if (pool.info.is_replicated()) {
3347 // Add to missing_target_objects
3348 ceph_assert(target >= missing_target_objects.size());
3349 unsigned needed = target - missing_target_objects.size();
3350 if (needed)
3351 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
3352 } else {
3353 for (unsigned i = 0 ; i < num_shards; ++i) {
3354 shard_id_t shard(i);
3355 bool found = false;
3356 for (const auto& t : missing_target_objects) {
3357 if (std::get<1>(t).shard == shard) {
3358 found = true;
3359 break;
3360 }
3361 }
3362 if (!found)
3363 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
3364 }
3365 }
3366
3367 for (const auto& item : missing_target_objects)
3368 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3369 for (const auto& item : acting_source_objects)
3370 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3371
3372 // Handle all objects not in missing for remapped
3373 // or backfill
3374 for (auto m = missing_target_objects.rbegin();
3375 m != missing_target_objects.rend(); ++m) {
3376
3377 int64_t extra_missing = -1;
3378
3379 if (pool.info.is_replicated()) {
3380 if (!acting_source_objects.empty()) {
3381 auto extra_copy = acting_source_objects.begin();
3382 extra_missing = std::get<0>(*extra_copy);
3383 acting_source_objects.erase(extra_copy);
3384 }
3385 } else { // Erasure coded
3386 // Use corresponding shard
3387 for (const auto& a : acting_source_objects) {
3388 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3389 extra_missing = std::get<0>(a);
3390 acting_source_objects.erase(a);
3391 break;
3392 }
3393 }
3394 }
3395
3396 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3397 // We don't know which of the objects on the target
3398 // are part of extra_missing so assume they are all degraded.
3399 misplaced += std::get<0>(*m) - extra_missing;
3400 degraded += extra_missing;
3401 } else {
3402 // 1. extra_missing == -1: more targets than sources, so degraded
3403 // 2. extra_missing > std::get<0>(*m): some of the previously degraded
3404 // objects are now present on the target.
3405 degraded += std::get<0>(*m);
3406 }
3407 }
3408 // If there are still acting shards that haven't been accounted for,
3409 // then they are misplaced
3410 for (const auto& a : acting_source_objects) {
3411 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3412 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
3413 misplaced += extra_misplaced;
3414 }
3415 out:
3416 // NOTE: Tests use these messages to verify this code
3417 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
3418 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl;
3419
3420 info.stats.stats.sum.num_objects_degraded = degraded;
3421 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3422 info.stats.stats.sum.num_objects_misplaced = misplaced;
3423 }
3424 }
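
// Illustrative example (added commentary, not part of the original source) of
// the degraded/misplaced split computed above, using invented numbers: for a
// replicated pool of size 3 with num_objects = 100, up = [0,1,2] and
// acting = [0,1,3] (osd.3 standing in while osd.2 is backfilled), the copies
// held by osd.3 count as misplaced (present, but not on an up shard), while
// any objects still missing from an acting shard -- say 10 on osd.1 -- count
// as degraded. The exact split depends on the missing sets and peer stats
// gathered earlier in this function.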
3425
3426 void PG::_update_blocked_by()
3427 {
3428 // set a max on the number of blocking peers we report. if we go
3429 // over, report a random subset. keep the result sorted.
3430 unsigned keep = std::min<unsigned>(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3431 unsigned skip = blocked_by.size() - keep;
3432 info.stats.blocked_by.clear();
3433 info.stats.blocked_by.resize(keep);
3434 unsigned pos = 0;
3435 for (set<int>::iterator p = blocked_by.begin();
3436 p != blocked_by.end() && keep > 0;
3437 ++p) {
3438 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3439 --skip;
3440 } else {
3441 info.stats.blocked_by[pos++] = *p;
3442 --keep;
3443 }
3444 }
3445 }
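
// Added note (not part of the original source): the loop above is essentially
// Knuth's selection-sampling technique -- each element is kept with probability
// keep/(keep+skip) at the moment it is visited -- so the reported subset of
// size `keep` is (approximately, given rand()'s modulo bias) uniformly chosen
// from blocked_by, and it stays sorted because the set is walked in order.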
3446
3447 void PG::publish_stats_to_osd()
3448 {
3449 if (!is_primary())
3450 return;
3451
3452 pg_stats_publish_lock.Lock();
3453
3454 if (info.stats.stats.sum.num_scrub_errors)
3455 state_set(PG_STATE_INCONSISTENT);
3456 else {
3457 state_clear(PG_STATE_INCONSISTENT);
3458 state_clear(PG_STATE_FAILED_REPAIR);
3459 }
3460
3461 utime_t now = ceph_clock_now();
3462 if (info.stats.state != state) {
3463 info.stats.last_change = now;
3464 // Optimistic estimate: if we have just found out that a PG is inactive,
3465 // assume it was active until now.
3466 if (!(state & PG_STATE_ACTIVE) &&
3467 (info.stats.state & PG_STATE_ACTIVE))
3468 info.stats.last_active = now;
3469
3470 if ((state & PG_STATE_ACTIVE) &&
3471 !(info.stats.state & PG_STATE_ACTIVE))
3472 info.stats.last_became_active = now;
3473 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3474 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3475 info.stats.last_became_peered = now;
3476 info.stats.state = state;
3477 }
3478
3479 _update_calc_stats();
3480 if (info.stats.stats.sum.num_objects_degraded) {
3481 state_set(PG_STATE_DEGRADED);
3482 } else {
3483 state_clear(PG_STATE_DEGRADED);
3484 }
3485 _update_blocked_by();
3486
3487 pg_stat_t pre_publish = info.stats;
3488 pre_publish.stats.add(unstable_stats);
3489 utime_t cutoff = now;
3490 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3491
3492 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
3493 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3494 // because we don't want to make the pg_stat_t structures too expensive.
3495 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3496 unsigned num = 0;
3497 auto i = info.purged_snaps.begin();
3498 while (num < max && i != info.purged_snaps.end()) {
3499 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3500 ++num;
3501 ++i;
3502 }
3503 dout(20) << __func__ << " reporting purged_snaps "
3504 << pre_publish.purged_snaps << dendl;
3505 }
3506
3507 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3508 info.stats.last_fresh > cutoff) {
3509 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3510 << ": no change since " << info.stats.last_fresh << dendl;
3511 } else {
3512 // update our stat summary and timestamps
3513 info.stats.reported_epoch = get_osdmap_epoch();
3514 ++info.stats.reported_seq;
3515
3516 info.stats.last_fresh = now;
3517
3518 if (info.stats.state & PG_STATE_CLEAN)
3519 info.stats.last_clean = now;
3520 if (info.stats.state & PG_STATE_ACTIVE)
3521 info.stats.last_active = now;
3522 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3523 info.stats.last_peered = now;
3524 info.stats.last_unstale = now;
3525 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3526 info.stats.last_undegraded = now;
3527 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3528 info.stats.last_fullsized = now;
3529
3530 pg_stats_publish_valid = true;
3531 pg_stats_publish = pre_publish;
3532
3533 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3534 << ":" << pg_stats_publish.reported_seq << dendl;
3535 }
3536 pg_stats_publish_lock.Unlock();
3537 }
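
// Added note (not part of the original source): the cutoff logic above acts as
// a publish throttle -- if nothing in pre_publish changed, the stats are
// re-sent only once info.stats.last_fresh has fallen more than
// osd_pg_stat_report_interval_max seconds behind now; any actual change is
// published immediately.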
3538
3539 void PG::clear_publish_stats()
3540 {
3541 dout(15) << "clear_stats" << dendl;
3542 pg_stats_publish_lock.Lock();
3543 pg_stats_publish_valid = false;
3544 pg_stats_publish_lock.Unlock();
3545 }
3546
3547 /**
3548 * initialize a newly instantiated pg
3549 *
3550 * Initialize PG state, as when a PG is initially created, or when it
3551 * is first instantiated on the current node.
3552 *
3553 * @param role our role/rank
3554 * @param newup up set
3555 * @param newacting acting set
3556 * @param history pg history
3557 * @param pi past_intervals
3558 * @param backfill true if info should be marked as backfill
3559 * @param t transaction to write out our new state in
3560 */
3561 void PG::init(
3562 int role,
3563 const vector<int>& newup, int new_up_primary,
3564 const vector<int>& newacting, int new_acting_primary,
3565 const pg_history_t& history,
3566 const PastIntervals& pi,
3567 bool backfill,
3568 ObjectStore::Transaction *t)
3569 {
3570 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3571 << " history " << history
3572 << " past_intervals " << pi
3573 << dendl;
3574
3575 set_role(role);
3576 init_primary_up_acting(
3577 newup,
3578 newacting,
3579 new_up_primary,
3580 new_acting_primary);
3581
3582 info.history = history;
3583 past_intervals = pi;
3584
3585 info.stats.up = up;
3586 info.stats.up_primary = new_up_primary;
3587 info.stats.acting = acting;
3588 info.stats.acting_primary = new_acting_primary;
3589 info.stats.mapping_epoch = info.history.same_interval_since;
3590
3591 if (backfill) {
3592 dout(10) << __func__ << ": Setting backfill" << dendl;
3593 info.set_last_backfill(hobject_t());
3594 info.last_complete = info.last_update;
3595 pg_log.mark_log_for_rewrite();
3596 }
3597
3598 on_new_interval();
3599
3600 dirty_info = true;
3601 dirty_big_info = true;
3602 write_if_dirty(*t);
3603 }
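
// Hypothetical call (added for illustration, not part of the original source)
// showing how the parameters above pair up; the values are invented:
//
//   ObjectStore::Transaction t;
//   pg->init(/*role*/ 0,
//            /*newup*/ {1, 2, 3}, /*new_up_primary*/ 1,
//            /*newacting*/ {1, 2, 3}, /*new_acting_primary*/ 1,
//            history, past_intervals,
//            /*backfill*/ false, &t);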
3604
3605 void PG::shutdown()
3606 {
3607 ch->flush();
3608 lock();
3609 on_shutdown();
3610 unlock();
3611 }
3612
3613 #pragma GCC diagnostic ignored "-Wpragmas"
3614 #pragma GCC diagnostic push
3615 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3616
3617 void PG::upgrade(ObjectStore *store)
3618 {
3619 dout(0) << __func__ << " " << info_struct_v << " -> " << latest_struct_v
3620 << dendl;
3621 ceph_assert(info_struct_v <= 10);
3622 ObjectStore::Transaction t;
3623
3624 // <do upgrade steps here>
3625
3626 // finished upgrade!
3627 ceph_assert(info_struct_v == 10);
3628
3629 // update infover_key
3630 if (info_struct_v < latest_struct_v) {
3631 map<string,bufferlist> v;
3632 __u8 ver = latest_struct_v;
3633 encode(ver, v[infover_key]);
3634 t.omap_setkeys(coll, pgmeta_oid, v);
3635 }
3636
3637 dirty_info = true;
3638 dirty_big_info = true;
3639 write_if_dirty(t);
3640
3641 ObjectStore::CollectionHandle ch = store->open_collection(coll);
3642 int r = store->queue_transaction(ch, std::move(t));
3643 if (r != 0) {
3644 derr << __func__ << ": queue_transaction returned "
3645 << cpp_strerror(r) << dendl;
3646 ceph_abort();
3647 }
3648 ceph_assert(r == 0);
3649
3650 C_SaferCond waiter;
3651 if (!ch->flush_commit(&waiter)) {
3652 waiter.wait();
3653 }
3654 }
3655
3656 #pragma GCC diagnostic pop
3657 #pragma GCC diagnostic warning "-Wpragmas"
3658
3659 int PG::_prepare_write_info(CephContext* cct,
3660 map<string,bufferlist> *km,
3661 epoch_t epoch,
3662 pg_info_t &info, pg_info_t &last_written_info,
3663 PastIntervals &past_intervals,
3664 bool dirty_big_info,
3665 bool dirty_epoch,
3666 bool try_fast_info,
3667 PerfCounters *logger)
3668 {
3669 if (dirty_epoch) {
3670 encode(epoch, (*km)[epoch_key]);
3671 }
3672
3673 if (logger)
3674 logger->inc(l_osd_pg_info);
3675
3676 // try to do info efficiently?
3677 if (!dirty_big_info && try_fast_info &&
3678 info.last_update > last_written_info.last_update) {
3679 pg_fast_info_t fast;
3680 fast.populate_from(info);
3681 bool did = fast.try_apply_to(&last_written_info);
3682 ceph_assert(did); // we verified last_update increased above
3683 if (info == last_written_info) {
3684 encode(fast, (*km)[fastinfo_key]);
3685 if (logger)
3686 logger->inc(l_osd_pg_fastinfo);
3687 return 0;
3688 }
3689 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3690 {
3691 JSONFormatter jf(true);
3692 jf.dump_object("info", info);
3693 jf.flush(*_dout);
3694 }
3695 {
3696 *_dout << "\nlast_written_info:\n";
3697 JSONFormatter jf(true);
3698 jf.dump_object("last_written_info", last_written_info);
3699 jf.flush(*_dout);
3700 }
3701 *_dout << dendl;
3702 }
3703 last_written_info = info;
3704
3705 // encode info, but store purged_snaps separately (it goes into biginfo).
3706 interval_set<snapid_t> purged_snaps;
3707 purged_snaps.swap(info.purged_snaps);
3708 encode(info, (*km)[info_key]);
3709 purged_snaps.swap(info.purged_snaps);
3710
3711 if (dirty_big_info) {
3712 // potentially big stuff
3713 bufferlist& bigbl = (*km)[biginfo_key];
3714 encode(past_intervals, bigbl);
3715 encode(info.purged_snaps, bigbl);
3716 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3717 if (logger)
3718 logger->inc(l_osd_pg_biginfo);
3719 }
3720
3721 return 0;
3722 }
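
// Added summary (not part of the original source) of the pgmeta omap layout
// that _prepare_write_info() maintains, using the key constants referenced
// above:
//
//   infover_key  -> on-disk struct version (__u8)
//   epoch_key    -> osdmap epoch of the last write (when dirty_epoch)
//   info_key     -> pg_info_t, with purged_snaps stripped out
//   biginfo_key  -> past_intervals + purged_snaps (only when dirty_big_info)
//   fastinfo_key -> pg_fast_info_t delta, written instead of info_key when
//                   only the "fast" fields changed (the try_fast_info path)
//
// read_info() below reverses this: it decodes info_key and biginfo_key and,
// if fastinfo_key is present, applies it on top of the decoded info.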
3723
3724 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3725 {
3726 coll_t coll(pgid);
3727 t.create_collection(coll, bits);
3728 }
3729
3730 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3731 {
3732 coll_t coll(pgid);
3733
3734 if (pool) {
3735 // Give a hint to the PG collection
3736 bufferlist hint;
3737 uint32_t pg_num = pool->get_pg_num();
3738 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3739 encode(pg_num, hint);
3740 encode(expected_num_objects_pg, hint);
3741 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3742 t.collection_hint(coll, hint_type, hint);
3743 }
3744
3745 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3746 t.touch(coll, pgmeta_oid);
3747 map<string,bufferlist> values;
3748 __u8 struct_v = latest_struct_v;
3749 encode(struct_v, values[infover_key]);
3750 t.omap_setkeys(coll, pgmeta_oid, values);
3751 }
3752
3753 void PG::prepare_write_info(map<string,bufferlist> *km)
3754 {
3755 info.stats.stats.add(unstable_stats);
3756 unstable_stats.clear();
3757
3758 bool need_update_epoch = last_epoch < get_osdmap_epoch();
3759 int ret = _prepare_write_info(cct, km, get_osdmap_epoch(),
3760 info,
3761 last_written_info,
3762 past_intervals,
3763 dirty_big_info, need_update_epoch,
3764 cct->_conf->osd_fast_info,
3765 osd->logger);
3766 ceph_assert(ret == 0);
3767 if (need_update_epoch)
3768 last_epoch = get_osdmap_epoch();
3769 last_persisted_osdmap = last_epoch;
3770
3771 dirty_info = false;
3772 dirty_big_info = false;
3773 }
3774
3775 #pragma GCC diagnostic ignored "-Wpragmas"
3776 #pragma GCC diagnostic push
3777 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3778
3779 bool PG::_has_removal_flag(ObjectStore *store,
3780 spg_t pgid)
3781 {
3782 coll_t coll(pgid);
3783 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3784
3785 // first try new way
3786 set<string> keys;
3787 keys.insert("_remove");
3788 map<string,bufferlist> values;
3789 auto ch = store->open_collection(coll);
3790 ceph_assert(ch);
3791 if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
3792 values.size() == 1)
3793 return true;
3794
3795 return false;
3796 }
3797
3798 int PG::peek_map_epoch(ObjectStore *store,
3799 spg_t pgid,
3800 epoch_t *pepoch)
3801 {
3802 coll_t coll(pgid);
3803 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3804 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3805 epoch_t cur_epoch = 0;
3806
3807 // validate collection name
3808 ceph_assert(coll.is_pg());
3809
3810 // try for v8
3811 set<string> keys;
3812 keys.insert(infover_key);
3813 keys.insert(epoch_key);
3814 map<string,bufferlist> values;
3815 auto ch = store->open_collection(coll);
3816 ceph_assert(ch);
3817 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
3818 if (r == 0) {
3819 ceph_assert(values.size() == 2);
3820
3821 // sanity check version
3822 auto bp = values[infover_key].cbegin();
3823 __u8 struct_v = 0;
3824 decode(struct_v, bp);
3825 ceph_assert(struct_v >= 8);
3826
3827 // get epoch
3828 bp = values[epoch_key].begin();
3829 decode(cur_epoch, bp);
3830 } else {
3831 // probably bug 10617; see OSD::load_pgs()
3832 return -1;
3833 }
3834
3835 *pepoch = cur_epoch;
3836 return 0;
3837 }
3838
3839 #pragma GCC diagnostic pop
3840 #pragma GCC diagnostic warning "-Wpragmas"
3841
3842 void PG::write_if_dirty(ObjectStore::Transaction& t)
3843 {
3844 map<string,bufferlist> km;
3845 if (dirty_big_info || dirty_info)
3846 prepare_write_info(&km);
3847 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3848 if (!km.empty())
3849 t.omap_setkeys(coll, pgmeta_oid, km);
3850 }
3851
3852 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3853 {
3854 // raise last_complete only if we were previously up to date
3855 if (info.last_complete == info.last_update)
3856 info.last_complete = e.version;
3857
3858 // raise last_update.
3859 ceph_assert(e.version > info.last_update);
3860 info.last_update = e.version;
3861
3862 // raise user_version, if it increased (it may not have been bumped
3863 // by all logged updates)
3864 if (e.user_version > info.last_user_version)
3865 info.last_user_version = e.user_version;
3866
3867 // log mutation
3868 pg_log.add(e, applied);
3869 dout(10) << "add_log_entry " << e << dendl;
3870 }
3871
3872
3873 void PG::append_log(
3874 const vector<pg_log_entry_t>& logv,
3875 eversion_t trim_to,
3876 eversion_t roll_forward_to,
3877 ObjectStore::Transaction &t,
3878 bool transaction_applied,
3879 bool async)
3880 {
3881 if (transaction_applied)
3882 update_snap_map(logv, t);
3883
3884 /* The primary has sent an info updating the history, but it may not
3885 * have arrived yet. We want to make sure that we cannot remember this
3886 * write without remembering that it happened in an interval which went
3887 * active in epoch history.last_epoch_started.
3888 */
3889 if (info.last_epoch_started != info.history.last_epoch_started) {
3890 info.history.last_epoch_started = info.last_epoch_started;
3891 }
3892 if (info.last_interval_started != info.history.last_interval_started) {
3893 info.history.last_interval_started = info.last_interval_started;
3894 }
3895 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3896
3897 PGLogEntryHandler handler{this, &t};
3898 if (!transaction_applied) {
3899 /* We must be a backfill peer, so it's ok if we apply
3900 * out-of-turn since we won't be considered when
3901 * determining a min possible last_update.
3902 */
3903 pg_log.roll_forward(&handler);
3904 }
3905
3906 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3907 p != logv.end();
3908 ++p) {
3909 add_log_entry(*p, transaction_applied);
3910
3911 /* We don't want to leave the rollforward artifacts around
3912 * here past last_backfill. It's ok for the same reason as
3913 * above */
3914 if (transaction_applied &&
3915 p->soid > info.last_backfill) {
3916 pg_log.roll_forward(&handler);
3917 }
3918 }
3919 auto last = logv.rbegin();
3920 if (is_primary() && last != logv.rend()) {
3921 projected_log.skip_can_rollback_to_to_head();
3922 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3923 }
3924
3925 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3926 pg_log.roll_forward_to(
3927 roll_forward_to,
3928 &handler);
3929 last_rollback_info_trimmed_to_applied = roll_forward_to;
3930 }
3931
3932 dout(10) << __func__ << " approx pg log length = "
3933 << pg_log.get_log().approx_size() << dendl;
3934 dout(10) << __func__ << " transaction_applied = "
3935 << transaction_applied << dendl;
3936 if (!transaction_applied || async)
3937 dout(10) << __func__ << " " << pg_whoami
3938 << " is async_recovery or backfill target" << dendl;
3939 pg_log.trim(trim_to, info, transaction_applied, async);
3940
3941 // update the local pg, pg log
3942 dirty_info = true;
3943 write_if_dirty(t);
3944 }
3945
3946 bool PG::check_log_for_corruption(ObjectStore *store)
3947 {
3948 /// TODO: this method needs to work with the omap log
3949 return true;
3950 }
3951
3952 //! Get the name we're going to save our corrupt pg log as
3953 std::string PG::get_corrupt_pg_log_name() const
3954 {
3955 const int MAX_BUF = 512;
3956 char buf[MAX_BUF];
3957 struct tm tm_buf;
3958 time_t my_time(time(NULL));
3959 const struct tm *t = localtime_r(&my_time, &tm_buf);
3960 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3961 if (ret == 0) {
3962 dout(0) << "strftime failed" << dendl;
3963 return "corrupt_log_unknown_time";
3964 }
3965 string out(buf);
3966 out += stringify(info.pgid);
3967 return out;
3968 }
3969
3970 int PG::read_info(
3971 ObjectStore *store, spg_t pgid, const coll_t &coll,
3972 pg_info_t &info, PastIntervals &past_intervals,
3973 __u8 &struct_v)
3974 {
3975 set<string> keys;
3976 keys.insert(infover_key);
3977 keys.insert(info_key);
3978 keys.insert(biginfo_key);
3979 keys.insert(fastinfo_key);
3980 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3981 map<string,bufferlist> values;
3982 auto ch = store->open_collection(coll);
3983 ceph_assert(ch);
3984 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
3985 ceph_assert(r == 0);
3986 ceph_assert(values.size() == 3 ||
3987 values.size() == 4);
3988
3989 auto p = values[infover_key].cbegin();
3990 decode(struct_v, p);
3991 ceph_assert(struct_v >= 10);
3992
3993 p = values[info_key].begin();
3994 decode(info, p);
3995
3996 p = values[biginfo_key].begin();
3997 decode(past_intervals, p);
3998 decode(info.purged_snaps, p);
3999
4000 p = values[fastinfo_key].begin();
4001 if (!p.end()) {
4002 pg_fast_info_t fast;
4003 decode(fast, p);
4004 fast.try_apply_to(&info);
4005 }
4006 return 0;
4007 }
4008
4009 void PG::read_state(ObjectStore *store)
4010 {
4011 int r = read_info(store, pg_id, coll, info, past_intervals,
4012 info_struct_v);
4013 ceph_assert(r >= 0);
4014
4015 if (info_struct_v < compat_struct_v) {
4016 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4017 << " an older version first." << dendl;
4018 ceph_abort_msg("PG too old to upgrade");
4019 }
4020
4021 last_written_info = info;
4022
4023 ostringstream oss;
4024 pg_log.read_log_and_missing(
4025 store,
4026 ch,
4027 pgmeta_oid,
4028 info,
4029 oss,
4030 cct->_conf->osd_ignore_stale_divergent_priors,
4031 cct->_conf->osd_debug_verify_missing_on_start);
4032 if (oss.tellp())
4033 osd->clog->error() << oss.str();
4034
4035 // log any weirdness
4036 log_weirdness();
4037
4038 if (info_struct_v < latest_struct_v) {
4039 upgrade(store);
4040 }
4041
4042 // initialize current mapping
4043 {
4044 int primary, up_primary;
4045 vector<int> acting, up;
4046 get_osdmap()->pg_to_up_acting_osds(
4047 pg_id.pgid, &up, &up_primary, &acting, &primary);
4048 init_primary_up_acting(
4049 up,
4050 acting,
4051 up_primary,
4052 primary);
4053 int rr = OSDMap::calc_pg_role(osd->whoami, acting);
4054 if (pool.info.is_replicated() || rr == pg_whoami.shard)
4055 set_role(rr);
4056 else
4057 set_role(-1);
4058 }
4059
4060 PG::RecoveryCtx rctx(0, 0, 0, new ObjectStore::Transaction);
4061 handle_initialize(&rctx);
4062 // note: we don't activate here because we know the OSD will advance maps
4063 // during boot.
4064 write_if_dirty(*rctx.transaction);
4065 store->queue_transaction(ch, std::move(*rctx.transaction));
4066 delete rctx.transaction;
4067 }
4068
4069 void PG::log_weirdness()
4070 {
4071 if (pg_log.get_tail() != info.log_tail)
4072 osd->clog->error() << info.pgid
4073 << " info mismatch, log.tail " << pg_log.get_tail()
4074 << " != info.log_tail " << info.log_tail;
4075 if (pg_log.get_head() != info.last_update)
4076 osd->clog->error() << info.pgid
4077 << " info mismatch, log.head " << pg_log.get_head()
4078 << " != info.last_update " << info.last_update;
4079
4080 if (!pg_log.get_log().empty()) {
4081 // sloppy check
4082 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
4083 osd->clog->error() << info.pgid
4084 << " log bound mismatch, info (tail,head] ("
4085 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
4086 << " actual ["
4087 << pg_log.get_log().log.begin()->version << ","
4088 << pg_log.get_log().log.rbegin()->version << "]";
4089 }
4090
4091 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
4092 osd->clog->error() << info.pgid
4093 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
4094 << " > log size " << pg_log.get_log().log.size();
4095 }
4096 }
4097
4098 void PG::update_snap_map(
4099 const vector<pg_log_entry_t> &log_entries,
4100 ObjectStore::Transaction &t)
4101 {
4102 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
4103 i != log_entries.end();
4104 ++i) {
4105 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4106 if (i->soid.snap < CEPH_MAXSNAP) {
4107 if (i->is_delete()) {
4108 int r = snap_mapper.remove_oid(
4109 i->soid,
4110 &_t);
4111 if (r != 0)
4112 derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
4113 // On removal tolerate missing key corruption
4114 ceph_assert(r == 0 || r == -ENOENT);
4115 } else if (i->is_update()) {
4116 ceph_assert(i->snaps.length() > 0);
4117 vector<snapid_t> snaps;
4118 bufferlist snapbl = i->snaps;
4119 auto p = snapbl.cbegin();
4120 try {
4121 decode(snaps, p);
4122 } catch (...) {
4123 derr << __func__ << " decode snaps failure on " << *i << dendl;
4124 snaps.clear();
4125 }
4126 set<snapid_t> _snaps(snaps.begin(), snaps.end());
4127
4128 if (i->is_clone() || i->is_promote()) {
4129 snap_mapper.add_oid(
4130 i->soid,
4131 _snaps,
4132 &_t);
4133 } else if (i->is_modify()) {
4134 int r = snap_mapper.update_snaps(
4135 i->soid,
4136 _snaps,
4137 0,
4138 &_t);
4139 ceph_assert(r == 0);
4140 } else {
4141 ceph_assert(i->is_clean());
4142 }
4143 }
4144 }
4145 }
4146 }
4147
4148 /**
4149 * filter trimming|trimmed snaps out of snapcontext
4150 */
4151 void PG::filter_snapc(vector<snapid_t> &snaps)
4152 {
4153 // nothing needs to trim, we can return immediately
4154 if (snap_trimq.empty() && info.purged_snaps.empty())
4155 return;
4156
4157 bool filtering = false;
4158 vector<snapid_t> newsnaps;
4159 for (vector<snapid_t>::iterator p = snaps.begin();
4160 p != snaps.end();
4161 ++p) {
4162 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
4163 if (!filtering) {
4164 // start building a new vector with what we've seen so far
4165 dout(10) << "filter_snapc filtering " << snaps << dendl;
4166 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
4167 filtering = true;
4168 }
4169 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
4170 } else {
4171 if (filtering)
4172 newsnaps.push_back(*p); // continue building new vector
4173 }
4174 }
4175 if (filtering) {
4176 snaps.swap(newsnaps);
4177 dout(10) << "filter_snapc result " << snaps << dendl;
4178 }
4179 }
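
// Worked example (added, not part of the original source): with
// snaps = [12, 9, 7, 4], snap_trimq containing 9 and info.purged_snaps
// containing 4, the loop above starts copying at the first hit (9):
// newsnaps picks up the untouched prefix [12], 9 and 4 are dropped, 7 is
// appended, and snaps ends up as [12, 7]. If nothing matches, the vector
// is left untouched and no copy is made.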
4180
4181 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
4182 {
4183 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
4184 it != m.end();
4185 ++it)
4186 requeue_ops(it->second);
4187 m.clear();
4188 }
4189
4190 void PG::requeue_op(OpRequestRef op)
4191 {
4192 auto p = waiting_for_map.find(op->get_source());
4193 if (p != waiting_for_map.end()) {
4194 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
4195 << dendl;
4196 p->second.push_front(op);
4197 } else {
4198 dout(20) << __func__ << " " << op << dendl;
4199 osd->enqueue_front(
4200 OpQueueItem(
4201 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
4202 op->get_req()->get_cost(),
4203 op->get_req()->get_priority(),
4204 op->get_req()->get_recv_stamp(),
4205 op->get_req()->get_source().num(),
4206 get_osdmap_epoch()));
4207 }
4208 }
4209
4210 void PG::requeue_ops(list<OpRequestRef> &ls)
4211 {
4212 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
4213 i != ls.rend();
4214 ++i) {
4215 requeue_op(*i);
4216 }
4217 ls.clear();
4218 }
4219
4220 void PG::requeue_map_waiters()
4221 {
4222 epoch_t epoch = get_osdmap_epoch();
4223 auto p = waiting_for_map.begin();
4224 while (p != waiting_for_map.end()) {
4225 if (epoch < p->second.front()->min_epoch) {
4226 dout(20) << __func__ << " " << p->first << " front op "
4227 << p->second.front() << " must still wait, doing nothing"
4228 << dendl;
4229 ++p;
4230 } else {
4231 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
4232 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
4233 auto req = *q;
4234 osd->enqueue_front(OpQueueItem(
4235 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, req)),
4236 req->get_req()->get_cost(),
4237 req->get_req()->get_priority(),
4238 req->get_req()->get_recv_stamp(),
4239 req->get_req()->get_source().num(),
4240 epoch));
4241 }
4242 p = waiting_for_map.erase(p);
4243 }
4244 }
4245 }
4246
4247
4248 // ==========================================================================================
4249 // SCRUB
4250
4251 /*
4252 * when holding pg and sched_scrub_lock, then the states are:
4253 * scheduling:
4254 * scrubber.reserved = true
4255 * scrubber.reserved_peers includes whoami
4256 * osd->scrub_pending++
4257 * scheduling, replica declined:
4258 * scrubber.reserved = true
4259 * scrubber.reserved_peers includes -1
4260 * osd->scrub_pending++
4261 * pending:
4262 * scrubber.reserved = true
4263 * scrubber.reserved_peers.size() == acting.size();
4264 * pg on scrub_wq
4265 * osd->scrub_pending++
4266 * scrubbing:
4267 * scrubber.reserved = false;
4268 * scrubber.reserved_peers empty
4269 * osd->scrubber.active++
4270 */
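
// Rough message flow for the reservation handshake described above (added
// commentary, not part of the original source):
//
//   primary: sched_scrub()
//     -> inc_scrubs_pending(), scrubber.reserved = true
//     -> scrub_reserve_replicas(): MOSDScrubReserve(REQUEST) to every other
//        shard in acting_recovery_backfill
//   replica: handle_scrub_reserve_request()
//     -> replies GRANT if it can inc_scrubs_pending(), otherwise REJECT
//   primary: handle_scrub_reserve_grant() / handle_scrub_reserve_reject()
//     -> re-enters sched_scrub(); once reserved_peers covers acting,
//        queue_scrub()
//     -> on any REJECT, clear_scrub_reserved() + scrub_unreserve_replicas()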
4271
4272 // returns true if a scrub has been newly kicked off
4273 bool PG::sched_scrub()
4274 {
4275 bool nodeep_scrub = false;
4276 ceph_assert(is_locked());
4277 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
4278 return false;
4279 }
4280
4281 double deep_scrub_interval = 0;
4282 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
4283 if (deep_scrub_interval <= 0) {
4284 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
4285 }
4286 bool time_for_deep = ceph_clock_now() >=
4287 info.history.last_deep_scrub_stamp + deep_scrub_interval;
4288
4289 bool deep_coin_flip = false;
4290 // Only add random deep scrubs when NOT user initiated scrub
4291 if (!scrubber.must_scrub)
4292 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
4293 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
4294
4295 time_for_deep = (time_for_deep || deep_coin_flip);
4296
4297 //NODEEP_SCRUB so ignore time-initiated deep-scrub
4298 if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
4299 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
4300 time_for_deep = false;
4301 nodeep_scrub = true;
4302 }
4303
4304 if (!scrubber.must_scrub) {
4305 ceph_assert(!scrubber.must_deep_scrub);
4306
4307 //NOSCRUB so skip regular scrubs
4308 if ((get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
4309 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
4310 if (scrubber.reserved) {
4311 // cancel the scrub if it is still being scheduled,
4312 // so pgs from other pools where scrubs are still legal
4313 // have a chance to go ahead with scrubbing.
4314 clear_scrub_reserved();
4315 scrub_unreserve_replicas();
4316 }
4317 return false;
4318 }
4319 }
4320
4321 // Clear these in case user issues the scrub/repair command during
4322 // the scheduling of the scrub/repair (e.g. request reservation)
4323 scrubber.deep_scrub_on_error = false;
4324 scrubber.auto_repair = false;
4325 if (cct->_conf->osd_scrub_auto_repair
4326 && get_pgbackend()->auto_repair_supported()
4327 // respect the command from the user, and do not auto-repair
4328 && !scrubber.must_repair
4329 && !scrubber.must_scrub
4330 && !scrubber.must_deep_scrub) {
4331 if (time_for_deep) {
4332 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
4333 scrubber.auto_repair = true;
4334 } else {
4335 dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
4336 scrubber.deep_scrub_on_error = true;
4337 }
4338 }
4339
4340 bool ret = true;
4341 if (!scrubber.reserved) {
4342 ceph_assert(scrubber.reserved_peers.empty());
4343 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4344 osd->inc_scrubs_pending()) {
4345 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
4346 scrubber.reserved = true;
4347 scrubber.reserved_peers.insert(pg_whoami);
4348 scrub_reserve_replicas();
4349 } else {
4350 dout(20) << __func__ << ": failed to reserve locally" << dendl;
4351 ret = false;
4352 }
4353 }
4354 if (scrubber.reserved) {
4355 if (scrubber.reserve_failed) {
4356 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
4357 clear_scrub_reserved();
4358 scrub_unreserve_replicas();
4359 ret = false;
4360 } else if (scrubber.reserved_peers.size() == acting.size()) {
4361 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
4362 if (time_for_deep) {
4363 dout(10) << "sched_scrub: scrub will be deep" << dendl;
4364 state_set(PG_STATE_DEEP_SCRUB);
4365 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
4366 if (!nodeep_scrub) {
4367 osd->clog->info() << "osd." << osd->whoami
4368 << " pg " << info.pgid
4369 << " Deep scrub errors, upgrading scrub to deep-scrub";
4370 state_set(PG_STATE_DEEP_SCRUB);
4371 } else if (!scrubber.must_scrub) {
4372 osd->clog->error() << "osd." << osd->whoami
4373 << " pg " << info.pgid
4374 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
4375 clear_scrub_reserved();
4376 scrub_unreserve_replicas();
4377 return false;
4378 } else {
4379 osd->clog->error() << "osd." << osd->whoami
4380 << " pg " << info.pgid
4381 << " Regular scrub request, deep-scrub details will be lost";
4382 }
4383 }
4384 queue_scrub();
4385 } else {
4386 // none declined so far (reserve_failed not set); keep waiting for replicas
4387 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
4388 }
4389 }
4390
4391 return ret;
4392 }
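
// Added note (not part of the original source): for scrubs that are not
// user-initiated, the deep-scrub decision above reduces to
//
//   time_for_deep = (now >= last_deep_scrub_stamp + deep_scrub_interval)
//                   || (rand() % 100 < osd_deep_scrub_randomize_ratio * 100)
//
// and is then forced off when nodeep-scrub is set on the cluster or the pool.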
4393
4394 void PG::reg_next_scrub()
4395 {
4396 if (!is_primary())
4397 return;
4398
4399 utime_t reg_stamp;
4400 bool must = false;
4401 if (scrubber.must_scrub) {
4402 // Set the smallest time that isn't utime_t()
4403 reg_stamp = utime_t(0,1);
4404 must = true;
4405 } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
4406 reg_stamp = ceph_clock_now();
4407 must = true;
4408 } else {
4409 reg_stamp = info.history.last_scrub_stamp;
4410 }
4411 // note down the sched_time, so we can locate this scrub, and remove it
4412 // later on.
4413 double scrub_min_interval = 0, scrub_max_interval = 0;
4414 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
4415 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
4416 ceph_assert(scrubber.scrub_reg_stamp == utime_t());
4417 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
4418 reg_stamp,
4419 scrub_min_interval,
4420 scrub_max_interval,
4421 must);
4422 dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
4423 << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
4424 }
4425
4426 void PG::unreg_next_scrub()
4427 {
4428 if (is_primary()) {
4429 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
4430 scrubber.scrub_reg_stamp = utime_t();
4431 }
4432 }
4433
4434 void PG::do_replica_scrub_map(OpRequestRef op)
4435 {
4436 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
4437 dout(7) << __func__ << " " << *m << dendl;
4438 if (m->map_epoch < info.history.same_interval_since) {
4439 dout(10) << __func__ << " discarding old from "
4440 << m->map_epoch << " < " << info.history.same_interval_since
4441 << dendl;
4442 return;
4443 }
4444 if (!scrubber.is_chunky_scrub_active()) {
4445 dout(10) << __func__ << " scrub isn't active" << dendl;
4446 return;
4447 }
4448
4449 op->mark_started();
4450
4451 auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
4452 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4453 dout(10) << "map version is "
4454 << scrubber.received_maps[m->from].valid_through
4455 << dendl;
4456
4457 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
4458 << dendl;
4459 ceph_assert(scrubber.waiting_on_whom.count(m->from));
4460 scrubber.waiting_on_whom.erase(m->from);
4461 if (m->preempted) {
4462 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
4463 scrub_preempted = true;
4464 }
4465 if (scrubber.waiting_on_whom.empty()) {
4466 requeue_scrub(ops_blocked_by_scrub());
4467 }
4468 }
4469
4470 // send scrub v3 messages (chunky scrub)
4471 void PG::_request_scrub_map(
4472 pg_shard_t replica, eversion_t version,
4473 hobject_t start, hobject_t end,
4474 bool deep,
4475 bool allow_preemption)
4476 {
4477 ceph_assert(replica != pg_whoami);
4478 dout(10) << "scrub requesting scrubmap from osd." << replica
4479 << " deep " << (int)deep << dendl;
4480 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4481 spg_t(info.pgid.pgid, replica.shard), version,
4482 get_osdmap_epoch(),
4483 get_last_peering_reset(),
4484 start, end, deep,
4485 allow_preemption,
4486 scrubber.priority,
4487 ops_blocked_by_scrub());
4488 // default priority, we want the rep scrub processed prior to any recovery
4489 // or client io messages (we are holding a lock!)
4490 osd->send_message_osd_cluster(
4491 replica.osd, repscrubop, get_osdmap_epoch());
4492 }
4493
4494 void PG::handle_scrub_reserve_request(OpRequestRef op)
4495 {
4496 dout(7) << __func__ << " " << *op->get_req() << dendl;
4497 op->mark_started();
4498 if (scrubber.reserved) {
4499 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4500 << dendl;
4501 return;
4502 }
4503 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4504 osd->inc_scrubs_pending()) {
4505 scrubber.reserved = true;
4506 } else {
4507 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4508 scrubber.reserved = false;
4509 }
4510 const MOSDScrubReserve *m =
4511 static_cast<const MOSDScrubReserve*>(op->get_req());
4512 Message *reply = new MOSDScrubReserve(
4513 spg_t(info.pgid.pgid, primary.shard),
4514 m->map_epoch,
4515 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4516 pg_whoami);
4517 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4518 }
4519
4520 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4521 {
4522 dout(7) << __func__ << " " << *op->get_req() << dendl;
4523 op->mark_started();
4524 if (!scrubber.reserved) {
4525 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4526 return;
4527 }
4528 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4529 dout(10) << " already had osd." << from << " reserved" << dendl;
4530 } else {
4531 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4532 scrubber.reserved_peers.insert(from);
4533 sched_scrub();
4534 }
4535 }
4536
4537 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4538 {
4539 dout(7) << __func__ << " " << *op->get_req() << dendl;
4540 op->mark_started();
4541 if (!scrubber.reserved) {
4542 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4543 return;
4544 }
4545 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4546 dout(10) << " already had osd." << from << " reserved" << dendl;
4547 } else {
4548 /* One decline stops this pg from being scheduled for scrubbing. */
4549 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4550 scrubber.reserve_failed = true;
4551 sched_scrub();
4552 }
4553 }
4554
4555 void PG::handle_scrub_reserve_release(OpRequestRef op)
4556 {
4557 dout(7) << __func__ << " " << *op->get_req() << dendl;
4558 op->mark_started();
4559 clear_scrub_reserved();
4560 }
4561
4562 // We can zero primary_num_bytes with just an atomic store.
4563 // However, setting it above zero reserves space for backfill and requires
4564 // the OSDService::stat_lock, which protects all OSD usage
4565 void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
4566 ceph_assert(osd->stat_lock.is_locked_by_me());
4567 primary_num_bytes.store(primary);
4568 local_num_bytes.store(local);
4569 return;
4570 }
4571
4572 void PG::clear_reserved_num_bytes() {
4573 primary_num_bytes.store(0);
4574 local_num_bytes.store(0);
4575 return;
4576 }
4577
4578 void PG::reject_reservation()
4579 {
4580 clear_reserved_num_bytes();
4581 osd->send_message_osd_cluster(
4582 primary.osd,
4583 new MBackfillReserve(
4584 MBackfillReserve::REJECT,
4585 spg_t(info.pgid.pgid, primary.shard),
4586 get_osdmap_epoch()),
4587 get_osdmap_epoch());
4588 }
4589
4590 void PG::schedule_backfill_retry(float delay)
4591 {
4592 std::lock_guard lock(osd->recovery_request_lock);
4593 osd->recovery_request_timer.add_event_after(
4594 delay,
4595 new QueuePeeringEvt<RequestBackfill>(
4596 this, get_osdmap_epoch(),
4597 RequestBackfill()));
4598 }
4599
4600 void PG::schedule_recovery_retry(float delay)
4601 {
4602 std::lock_guard lock(osd->recovery_request_lock);
4603 osd->recovery_request_timer.add_event_after(
4604 delay,
4605 new QueuePeeringEvt<DoRecovery>(
4606 this, get_osdmap_epoch(),
4607 DoRecovery()));
4608 }
4609
4610 void PG::clear_scrub_reserved()
4611 {
4612 scrubber.reserved_peers.clear();
4613 scrubber.reserve_failed = false;
4614
4615 if (scrubber.reserved) {
4616 scrubber.reserved = false;
4617 osd->dec_scrubs_pending();
4618 }
4619 }
4620
4621 void PG::scrub_reserve_replicas()
4622 {
4623 ceph_assert(backfill_targets.empty());
4624 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
4625 i != acting_recovery_backfill.end();
4626 ++i) {
4627 if (*i == pg_whoami) continue;
4628 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4629 osd->send_message_osd_cluster(
4630 i->osd,
4631 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4632 get_osdmap_epoch(),
4633 MOSDScrubReserve::REQUEST, pg_whoami),
4634 get_osdmap_epoch());
4635 }
4636 }
4637
4638 void PG::scrub_unreserve_replicas()
4639 {
4640 ceph_assert(backfill_targets.empty());
4641 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
4642 i != acting_recovery_backfill.end();
4643 ++i) {
4644 if (*i == pg_whoami) continue;
4645 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4646 osd->send_message_osd_cluster(
4647 i->osd,
4648 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4649 get_osdmap_epoch(),
4650 MOSDScrubReserve::RELEASE, pg_whoami),
4651 get_osdmap_epoch());
4652 }
4653 }
4654
4655 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
4656 {
4657 ObjectStore::Transaction t;
4658 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4659 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4660 i != rollback_obs.end();
4661 ++i) {
4662 if (i->generation < trimmed_to.version) {
4663 osd->clog->error() << "osd." << osd->whoami
4664 << " pg " << info.pgid
4665 << " found obsolete rollback obj "
4666 << *i << " generation < trimmed_to "
4667 << trimmed_to
4668 << "...repaired";
4669 t.remove(coll, *i);
4670 }
4671 }
4672 if (!t.empty()) {
4673 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4674 << dendl;
4675 osd->store->queue_transaction(ch, std::move(t), NULL);
4676 }
4677 }
4678
4679 void PG::_scan_snaps(ScrubMap &smap)
4680 {
4681 hobject_t head;
4682 SnapSet snapset;
4683
4684 // The test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4685 // that the caller is using clean_meta_map() and that it works properly.
4686 dout(20) << __func__ << " start" << dendl;
4687
4688 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4689 i != smap.objects.rend();
4690 ++i) {
4691 const hobject_t &hoid = i->first;
4692 ScrubMap::object &o = i->second;
4693
4694 dout(20) << __func__ << " " << hoid << dendl;
4695
4696 ceph_assert(!hoid.is_snapdir());
4697 if (hoid.is_head()) {
4698 // parse the SnapSet
4699 bufferlist bl;
4700 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4701 continue;
4702 }
4703 bl.push_back(o.attrs[SS_ATTR]);
4704 auto p = bl.cbegin();
4705 try {
4706 decode(snapset, p);
4707 } catch(...) {
4708 continue;
4709 }
4710 head = hoid.get_head();
4711 continue;
4712 }
4713 if (hoid.snap < CEPH_MAXSNAP) {
4714 // check and if necessary fix snap_mapper
4715 if (hoid.get_head() != head) {
4716 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4717 << dendl;
4718 continue;
4719 }
4720 set<snapid_t> obj_snaps;
4721 auto p = snapset.clone_snaps.find(hoid.snap);
4722 if (p == snapset.clone_snaps.end()) {
4723 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4724 << dendl;
4725 continue;
4726 }
4727 obj_snaps.insert(p->second.begin(), p->second.end());
4728 set<snapid_t> cur_snaps;
4729 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4730 if (r != 0 && r != -ENOENT) {
4731 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4732 ceph_abort();
4733 }
4734 if (r == -ENOENT || cur_snaps != obj_snaps) {
4735 ObjectStore::Transaction t;
4736 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4737 if (r == 0) {
4738 r = snap_mapper.remove_oid(hoid, &_t);
4739 if (r != 0) {
4740 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4741 << dendl;
4742 ceph_abort();
4743 }
4744 osd->clog->error() << "osd." << osd->whoami
4745 << " found snap mapper error on pg "
4746 << info.pgid
4747 << " oid " << hoid << " snaps in mapper: "
4748 << cur_snaps << ", oi: "
4749 << obj_snaps
4750 << "...repaired";
4751 } else {
4752 osd->clog->error() << "osd." << osd->whoami
4753 << " found snap mapper error on pg "
4754 << info.pgid
4755 << " oid " << hoid << " snaps missing in mapper"
4756 << ", should be: "
4757 << obj_snaps
4758 << " was " << cur_snaps << " r " << r
4759 << "...repaired";
4760 }
4761 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4762
4763 // wait for repair to apply to avoid confusing other bits of the system.
4764 {
4765 Cond my_cond;
4766 Mutex my_lock("PG::_scan_snaps my_lock");
4767 int r = 0;
4768 bool done;
4769 t.register_on_applied_sync(
4770 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4771 r = osd->store->queue_transaction(ch, std::move(t));
4772 if (r != 0) {
4773 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4774 << dendl;
4775 } else {
4776 my_lock.Lock();
4777 while (!done)
4778 my_cond.Wait(my_lock);
4779 my_lock.Unlock();
4780 }
4781 }
4782 }
4783 }
4784 }
4785 }
4786
4787 void PG::_repair_oinfo_oid(ScrubMap &smap)
4788 {
4789 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4790 i != smap.objects.rend();
4791 ++i) {
4792 const hobject_t &hoid = i->first;
4793 ScrubMap::object &o = i->second;
4794
4795 bufferlist bl;
4796 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4797 continue;
4798 }
4799 bl.push_back(o.attrs[OI_ATTR]);
4800 object_info_t oi;
4801 try {
4802 oi.decode(bl);
4803 } catch(...) {
4804 continue;
4805 }
4806 if (oi.soid != hoid) {
4807 ObjectStore::Transaction t;
4808 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4809 osd->clog->error() << "osd." << osd->whoami
4810 << " found object info error on pg "
4811 << info.pgid
4812 << " oid " << hoid << " oid in object info: "
4813 << oi.soid
4814 << "...repaired";
4815 // Fix object info
4816 oi.soid = hoid;
4817 bl.clear();
4818 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4819
4820 bufferptr bp(bl.c_str(), bl.length());
4821 o.attrs[OI_ATTR] = bp;
4822
4823 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4824 int r = osd->store->queue_transaction(ch, std::move(t));
4825 if (r != 0) {
4826 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4827 << dendl;
4828 }
4829 }
4830 }
4831 }
4832 int PG::build_scrub_map_chunk(
4833 ScrubMap &map,
4834 ScrubMapBuilder &pos,
4835 hobject_t start,
4836 hobject_t end,
4837 bool deep,
4838 ThreadPool::TPHandle &handle)
4839 {
4840 dout(10) << __func__ << " [" << start << "," << end << ") "
4841 << " pos " << pos
4842 << dendl;
4843
4844 // start
4845 while (pos.empty()) {
4846 pos.deep = deep;
4847 map.valid_through = info.last_update;
4848
4849 // objects
4850 vector<ghobject_t> rollback_obs;
4851 pos.ret = get_pgbackend()->objects_list_range(
4852 start,
4853 end,
4854 &pos.ls,
4855 &rollback_obs);
4856 if (pos.ret < 0) {
4857 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4858 return pos.ret;
4859 }
4860 if (pos.ls.empty()) {
4861 break;
4862 }
4863 _scan_rollback_obs(rollback_obs);
4864 pos.pos = 0;
4865 return -EINPROGRESS;
4866 }
4867
4868 // scan objects
4869 while (!pos.done()) {
4870 int r = get_pgbackend()->be_scan_list(map, pos);
4871 if (r == -EINPROGRESS) {
4872 return r;
4873 }
4874 }
4875
4876 // finish
4877 dout(20) << __func__ << " finishing" << dendl;
4878 ceph_assert(pos.done());
4879 _repair_oinfo_oid(map);
4880 if (!is_primary()) {
4881 ScrubMap for_meta_scrub;
4882 // In case we restarted with a smaller chunk, clear old data
4883 scrubber.cleaned_meta_map.clear_from(scrubber.start);
4884 scrubber.cleaned_meta_map.insert(map);
4885 scrubber.clean_meta_map(for_meta_scrub);
4886 _scan_snaps(for_meta_scrub);
4887 }
4888
4889 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
4890 << dendl;
4891 return 0;
4892 }
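
// Added note (not part of the original source): a return value of -EINPROGRESS
// above appears to mean "requeue and call again with the same ScrubMapBuilder";
// pos keeps the object listing and per-object scan cursor so a chunk can be
// built incrementally across ThreadPool handle yields.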
4893
4894 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4895 if (!store)
4896 return;
4897 struct OnComplete : Context {
4898 std::unique_ptr<Scrub::Store> store;
4899 explicit OnComplete(
4900 std::unique_ptr<Scrub::Store> &&store)
4901 : store(std::move(store)) {}
4902 void finish(int) override {}
4903 };
4904 store->cleanup(t);
4905 t->register_on_complete(new OnComplete(std::move(store)));
4906 ceph_assert(!store);
4907 }
4908
4909 void PG::repair_object(
4910 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4911 pg_shard_t bad_peer)
4912 {
4913 list<pg_shard_t> op_shards;
4914 for (auto i : *ok_peers) {
4915 op_shards.push_back(i.second);
4916 }
4917 dout(10) << "repair_object " << soid << " bad_peer osd."
4918 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4919 ScrubMap::object &po = ok_peers->back().first;
4920 eversion_t v;
4921 bufferlist bv;
4922 bv.push_back(po.attrs[OI_ATTR]);
4923 object_info_t oi;
4924 try {
4925 auto bliter = bv.cbegin();
4926 decode(oi, bliter);
4927 } catch (...) {
4928 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4929 ceph_abort();
4930 }
4931 if (bad_peer != primary) {
4932 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4933 } else {
4934 // We should only be scrubbing if the PG is clean.
4935 ceph_assert(waiting_for_unreadable_object.empty());
4936
4937 pg_log.missing_add(soid, oi.version, eversion_t());
4938
4939 pg_log.set_last_requested(0);
4940 dout(10) << __func__ << ": primary = " << primary << dendl;
4941 }
4942
4943 if (is_ec_pg() || bad_peer == primary) {
4944 // we'd better collect all shards for an EC pg, and prepare good peers as the
4945 // source of the pull in the case of a replicated pg.
4946 missing_loc.add_missing(soid, oi.version, eversion_t());
4947 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4948 for (i = ok_peers->begin();
4949 i != ok_peers->end();
4950 ++i)
4951 missing_loc.add_location(soid, i->second);
4952 }
4953 }
4954
4955 /* replica_scrub
4956 *
4957 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4958 * for pushes to complete in case of recent recovery. Build a single
4959 * scrubmap of objects that are in the range [msg->start, msg->end).
4960 */
4961 void PG::replica_scrub(
4962 OpRequestRef op,
4963 ThreadPool::TPHandle &handle)
4964 {
4965 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4966 ceph_assert(!scrubber.active_rep_scrub);
4967 dout(7) << "replica_scrub" << dendl;
4968
4969 if (msg->map_epoch < info.history.same_interval_since) {
4970 dout(10) << "replica_scrub discarding old replica_scrub from "
4971 << msg->map_epoch << " < " << info.history.same_interval_since
4972 << dendl;
4973 return;
4974 }
4975
4976 ceph_assert(msg->chunky);
4977 if (active_pushes > 0) {
4978 dout(10) << "waiting for active pushes to finish" << dendl;
4979 scrubber.active_rep_scrub = op;
4980 return;
4981 }
4982
4983 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
4984 scrubber.replica_scrub_start = msg->min_epoch;
4985 scrubber.start = msg->start;
4986 scrubber.end = msg->end;
4987 scrubber.max_end = msg->end;
4988 scrubber.deep = msg->deep;
4989 scrubber.epoch_start = info.history.same_interval_since;
4990 if (msg->priority) {
4991 scrubber.priority = msg->priority;
4992 } else {
4993 scrubber.priority = get_scrub_priority();
4994 }
4995
4996 scrub_can_preempt = msg->allow_preemption;
4997 scrub_preempted = false;
4998 scrubber.replica_scrubmap_pos.reset();
4999
5000 requeue_scrub(msg->high_priority);
5001 }
5002
5003 /* Scrub:
5004 * PG_STATE_SCRUBBING is set when the scrub is queued
5005 *
5006 * scrub will be chunky if all OSDs in PG support chunky scrub
5007 * scrub will fail if OSDs are too old.
5008 */
5009 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
5010 {
5011 if (cct->_conf->osd_scrub_sleep > 0 &&
5012 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
5013 scrubber.state == PG::Scrubber::INACTIVE) &&
5014 scrubber.needs_sleep) {
5015 ceph_assert(!scrubber.sleeping);
5016 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
5017
5018 // Do an async sleep so we don't block the op queue
5019 OSDService *osds = osd;
5020 spg_t pgid = get_pgid();
5021 int state = scrubber.state;
5022 auto scrub_requeue_callback =
5023 new FunctionContext([osds, pgid, state](int r) {
5024 PGRef pg = osds->osd->lookup_lock_pg(pgid);
5025 if (pg == nullptr) {
5026 lgeneric_dout(osds->osd->cct, 20)
5027 << "scrub_requeue_callback: Could not find "
5028 << "PG " << pgid << " can't complete scrub requeue after sleep"
5029 << dendl;
5030 return;
5031 }
5032 pg->scrubber.sleeping = false;
5033 pg->scrubber.needs_sleep = false;
5034 lgeneric_dout(pg->cct, 20)
5035 << "scrub_requeue_callback: slept for "
5036 << ceph_clock_now() - pg->scrubber.sleep_start
5037 << ", re-queuing scrub with state " << state << dendl;
5038 pg->scrub_queued = false;
5039 pg->requeue_scrub();
5040 pg->scrubber.sleep_start = utime_t();
5041 pg->unlock();
5042 });
5043 std::lock_guard l(osd->sleep_lock);
5044 osd->sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
5045 scrub_requeue_callback);
5046 scrubber.sleeping = true;
5047 scrubber.sleep_start = ceph_clock_now();
5048 return;
5049 }
5050 if (pg_has_reset_since(queued)) {
5051 return;
5052 }
5053 ceph_assert(scrub_queued);
5054 scrub_queued = false;
5055 scrubber.needs_sleep = true;
5056
5057 // for the replica
5058 if (!is_primary() &&
5059 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
5060 chunky_scrub(handle);
5061 return;
5062 }
5063
5064 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
5065 dout(10) << "scrub -- not primary or active or not clean" << dendl;
5066 state_clear(PG_STATE_SCRUBBING);
5067 state_clear(PG_STATE_REPAIR);
5068 state_clear(PG_STATE_DEEP_SCRUB);
5069 publish_stats_to_osd();
5070 return;
5071 }
5072
5073 if (!scrubber.active) {
5074 ceph_assert(backfill_targets.empty());
5075
5076 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
5077
5078 dout(10) << "starting a new chunky scrub" << dendl;
5079 }
5080
5081 chunky_scrub(handle);
5082 }
5083
5084 /*
5085 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
5086 * chunk.
5087 *
5088 * The object store is partitioned into chunks which end on hash boundaries. For
5089 * each chunk, the following logic is performed:
5090 *
5091 * (1) Block writes on the chunk
5092 * (2) Request maps from replicas
5093 * (3) Wait for pushes to be applied (after recovery)
5094 * (4) Wait for writes to flush on the chunk
5095 * (5) Wait for maps from replicas
5096 * (6) Compare / repair all scrub maps
5097 * (7) Wait for digest updates to apply
5098 *
5099 * This logic is encoded in the mostly linear state machine:
5100 *
5101 * +------------------+
5102 * _________v__________ |
5103 * | | |
5104 * | INACTIVE | |
5105 * |____________________| |
5106 * | |
5107 * | +----------+ |
5108 * _________v___v______ | |
5109 * | | | |
5110 * | NEW_CHUNK | | |
5111 * |____________________| | |
5112 * | | |
5113 * _________v__________ | |
5114 * | | | |
5115 * | WAIT_PUSHES | | |
5116 * |____________________| | |
5117 * | | |
5118 * _________v__________ | |
5119 * | | | |
5120 * | WAIT_LAST_UPDATE | | |
5121 * |____________________| | |
5122 * | | |
5123 * _________v__________ | |
5124 * | | | |
5125 * | BUILD_MAP | | |
5126 * |____________________| | |
5127 * | | |
5128 * _________v__________ | |
5129 * | | | |
5130 * | WAIT_REPLICAS | | |
5131 * |____________________| | |
5132 * | | |
5133 * _________v__________ | |
5134 * | | | |
5135 * | COMPARE_MAPS | | |
5136 * |____________________| | |
5137 * | | |
5138 * | | |
5139 * _________v__________ | |
5140 * | | | |
5141 * |WAIT_DIGEST_UPDATES | | |
5142 * |____________________| | |
5143 * | | | |
5144 * | +----------+ |
5145 * _________v__________ |
5146 * | | |
5147 * | FINISH | |
5148 * |____________________| |
5149 * | |
5150 * +------------------+
5151 *
5152 * The primary determines the last update from the subset by walking the log. If
5153 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
5154 * to wait until that update is applied before building a scrub map. Both the
5155 * primary and replicas will wait for any active pushes to be applied.
5156 *
5157 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
5158 *
5159 * scrubber.state encodes the current state of the scrub (refer to state diagram
5160 * for details).
5161 */
5162 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
5163 {
5164 // check for map changes
5165 if (scrubber.is_chunky_scrub_active()) {
5166 if (scrubber.epoch_start != info.history.same_interval_since) {
5167 dout(10) << "scrub pg changed, aborting" << dendl;
5168 scrub_clear_state();
5169 scrub_unreserve_replicas();
5170 return;
5171 }
5172 }
5173
5174 bool done = false;
5175 int ret;
5176
5177 while (!done) {
5178 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
5179 << " [" << scrubber.start << "," << scrubber.end << ")"
5180 << " max_end " << scrubber.max_end << dendl;
5181
5182 switch (scrubber.state) {
5183 case PG::Scrubber::INACTIVE:
5184 dout(10) << "scrub start" << dendl;
5185 ceph_assert(is_primary());
5186
5187 publish_stats_to_osd();
5188 scrubber.epoch_start = info.history.same_interval_since;
5189 scrubber.active = true;
5190
5191 osd->inc_scrubs_active(scrubber.reserved);
5192 if (scrubber.reserved) {
5193 scrubber.reserved = false;
5194 scrubber.reserved_peers.clear();
5195 }
5196
5197 {
5198 ObjectStore::Transaction t;
5199 scrubber.cleanup_store(&t);
5200 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
5201 info.pgid, coll));
5202 osd->store->queue_transaction(ch, std::move(t), nullptr);
5203 }
5204
5205 // Don't include temporary objects when scrubbing
5206 scrubber.start = info.pgid.pgid.get_hobj_start();
5207 scrubber.state = PG::Scrubber::NEW_CHUNK;
5208
5209 {
5210 bool repair = state_test(PG_STATE_REPAIR);
5211 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5212 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5213 stringstream oss;
5214 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
5215 osd->clog->debug(oss);
5216 }
5217
5218 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5219 "osd_scrub_max_preemptions");
5220 scrubber.preempt_divisor = 1;
5221 break;
5222
5223 case PG::Scrubber::NEW_CHUNK:
5224 scrubber.primary_scrubmap = ScrubMap();
5225 scrubber.received_maps.clear();
5226
5227 // begin (possible) preemption window
5228 if (scrub_preempted) {
5229 scrubber.preempt_left--;
5230 scrubber.preempt_divisor *= 2;
5231 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
5232 << " left" << dendl;
5233 scrub_preempted = false;
5234 }
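// Editor's note (illustrative, hypothetical config values): each preemption
// doubles preempt_divisor, which shrinks the next chunk. With
// osd_scrub_chunk_min = 5 and osd_scrub_chunk_max = 25, two preemptions give
// a divisor of 4, so the bounds computed below become min = max(3, 5/4) = 3
// and max = max(3, 25/4) = 6.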
5235 scrub_can_preempt = scrubber.preempt_left > 0;
5236
5237 {
5238 /* get the start and end of our scrub chunk
5239 *
5240 * Our scrub chunk has an important restriction we're going to need to
5241 * respect. We can't let head be start or end.
5242 * Using a half-open interval means that if end == head,
5243 * we'd scrub/lock head and the clone right next to head in different
5244 * chunks which would allow us to miss clones created between
5245 * scrubbing that chunk and scrubbing the chunk including head.
5246 * This isn't true for any of the other clones since clones can
5247 * only be created "just to the left of" head. There is one exception
5248 * to this: promotion of clones which always happens to the left of the
5249 * left-most clone, but promote_object checks the scrubber in that
5250 * case, so it should be ok. Also, it's ok to "miss" clones at the
5251 * left end of the range if we are a tier because they may legitimately
5252 * not exist (see _scrub).
5253 */
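/* Editor's note (illustrative example, hypothetical object names): if the
 * listing ends [..., foo:clone] and objects_list_partial() returned
 * candidate_end == foo:head, the loop below pulls candidate_end back to
 * foo:clone (excluding it from this chunk), so foo:head, its newest clone,
 * and anything created between them are scrubbed together in the next chunk.
 * If candidate_end is the head of an object that was not listed at all, it is
 * clipped to that object's boundary instead, for the same reason. */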
5254 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
5255 scrubber.preempt_divisor);
5256 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
5257 scrubber.preempt_divisor);
5258 hobject_t start = scrubber.start;
5259 hobject_t candidate_end;
5260 vector<hobject_t> objects;
5261 ret = get_pgbackend()->objects_list_partial(
5262 start,
5263 min,
5264 max,
5265 &objects,
5266 &candidate_end);
5267 ceph_assert(ret >= 0);
5268
5269 if (!objects.empty()) {
5270 hobject_t back = objects.back();
5271 while (candidate_end.is_head() &&
5272 candidate_end == back.get_head()) {
5273 candidate_end = back;
5274 objects.pop_back();
5275 if (objects.empty()) {
5276 ceph_assert(0 ==
5277 "Somehow we got more than 2 objects which"
5278 "have the same head but are not clones");
5279 }
5280 back = objects.back();
5281 }
5282 if (candidate_end.is_head()) {
5283 ceph_assert(candidate_end != back.get_head());
5284 candidate_end = candidate_end.get_object_boundary();
5285 }
5286 } else {
5287 ceph_assert(candidate_end.is_max());
5288 }
5289
5290 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
5291 // we'll be requeued by whatever made us unavailable for scrub
5292 dout(10) << __func__ << ": scrub blocked somewhere in range "
5293 << "[" << scrubber.start << ", " << candidate_end << ")"
5294 << dendl;
5295 done = true;
5296 break;
5297 }
5298 scrubber.end = candidate_end;
5299 if (scrubber.end > scrubber.max_end)
5300 scrubber.max_end = scrubber.end;
5301 }
5302
5303 // walk the log to find the latest update that affects our chunk
5304 scrubber.subset_last_update = eversion_t();
5305 for (auto p = projected_log.log.rbegin();
5306 p != projected_log.log.rend();
5307 ++p) {
5308 if (p->soid >= scrubber.start &&
5309 p->soid < scrubber.end) {
5310 scrubber.subset_last_update = p->version;
5311 break;
5312 }
5313 }
5314 if (scrubber.subset_last_update == eversion_t()) {
5315 for (list<pg_log_entry_t>::const_reverse_iterator p =
5316 pg_log.get_log().log.rbegin();
5317 p != pg_log.get_log().log.rend();
5318 ++p) {
5319 if (p->soid >= scrubber.start &&
5320 p->soid < scrubber.end) {
5321 scrubber.subset_last_update = p->version;
5322 break;
5323 }
5324 }
5325 }
5326
5327 scrubber.state = PG::Scrubber::WAIT_PUSHES;
5328 break;
5329
5330 case PG::Scrubber::WAIT_PUSHES:
5331 if (active_pushes == 0) {
5332 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
5333 } else {
5334 dout(15) << "wait for pushes to apply" << dendl;
5335 done = true;
5336 }
5337 break;
5338
5339 case PG::Scrubber::WAIT_LAST_UPDATE:
5340 if (last_update_applied < scrubber.subset_last_update) {
5341 // will be requeued by op_applied
5342 dout(15) << "wait for EC read/modify/writes to queue" << dendl;
5343 done = true;
5344 break;
5345 }
5346
5347 // ask replicas to scan
5348 scrubber.waiting_on_whom.insert(pg_whoami);
5349
5350 // request maps from replicas
5351 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5352 i != acting_recovery_backfill.end();
5353 ++i) {
5354 if (*i == pg_whoami) continue;
5355 _request_scrub_map(*i, scrubber.subset_last_update,
5356 scrubber.start, scrubber.end, scrubber.deep,
5357 scrubber.preempt_left > 0);
5358 scrubber.waiting_on_whom.insert(*i);
5359 }
5360 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
5361 << dendl;
5362
5363 scrubber.state = PG::Scrubber::BUILD_MAP;
5364 scrubber.primary_scrubmap_pos.reset();
5365 break;
5366
5367 case PG::Scrubber::BUILD_MAP:
5368 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5369
5370 // build my own scrub map
5371 if (scrub_preempted) {
5372 dout(10) << __func__ << " preempted" << dendl;
5373 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5374 break;
5375 }
5376 ret = build_scrub_map_chunk(
5377 scrubber.primary_scrubmap,
5378 scrubber.primary_scrubmap_pos,
5379 scrubber.start, scrubber.end,
5380 scrubber.deep,
5381 handle);
5382 if (ret == -EINPROGRESS) {
5383 requeue_scrub();
5384 done = true;
5385 break;
5386 }
5387 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5388 break;
5389
5390 case PG::Scrubber::BUILD_MAP_DONE:
5391 if (scrubber.primary_scrubmap_pos.ret < 0) {
5392 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5393 << ", aborting" << dendl;
5394 scrub_clear_state();
5395 scrub_unreserve_replicas();
5396 return;
5397 }
5398 dout(10) << __func__ << " waiting_on_whom was "
5399 << scrubber.waiting_on_whom << dendl;
5400 ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
5401 scrubber.waiting_on_whom.erase(pg_whoami);
5402
5403 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5404 break;
5405
5406 case PG::Scrubber::WAIT_REPLICAS:
5407 if (!scrubber.waiting_on_whom.empty()) {
5408 // will be requeued by sub_op_scrub_map
5409 dout(10) << "wait for replicas to build scrub map" << dendl;
5410 done = true;
5411 break;
5412 }
5413 // end (possible) preemption window
5414 scrub_can_preempt = false;
5415 if (scrub_preempted) {
5416 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5417 scrubber.state = PG::Scrubber::NEW_CHUNK;
5418 } else {
5419 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5420 }
5421 break;
5422
5423 case PG::Scrubber::COMPARE_MAPS:
5424 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5425 ceph_assert(scrubber.waiting_on_whom.empty());
5426
5427 scrub_compare_maps();
5428 scrubber.start = scrubber.end;
5429 scrubber.run_callbacks();
5430
5431 // requeue the writes from the chunk that just finished
5432 requeue_ops(waiting_for_scrub);
5433
5434 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5435
5436 // fall-thru
5437
5438 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5439 if (scrubber.num_digest_updates_pending) {
5440 dout(10) << __func__ << " waiting on "
5441 << scrubber.num_digest_updates_pending
5442 << " digest updates" << dendl;
5443 done = true;
5444 break;
5445 }
5446
5447 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5448 "osd_scrub_max_preemptions");
5449 scrubber.preempt_divisor = 1;
5450
5451 if (!(scrubber.end.is_max())) {
5452 scrubber.state = PG::Scrubber::NEW_CHUNK;
5453 requeue_scrub();
5454 done = true;
5455 } else {
5456 scrubber.state = PG::Scrubber::FINISH;
5457 }
5458
5459 break;
5460
5461 case PG::Scrubber::FINISH:
5462 scrub_finish();
5463 scrubber.state = PG::Scrubber::INACTIVE;
5464 done = true;
5465
5466 if (!snap_trimq.empty()) {
5467 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5468 snap_trimmer_scrub_complete();
5469 }
5470
5471 break;
5472
5473 case PG::Scrubber::BUILD_MAP_REPLICA:
5474 // build my own scrub map
5475 if (scrub_preempted) {
5476 dout(10) << __func__ << " preempted" << dendl;
5477 ret = 0;
5478 } else {
5479 ret = build_scrub_map_chunk(
5480 scrubber.replica_scrubmap,
5481 scrubber.replica_scrubmap_pos,
5482 scrubber.start, scrubber.end,
5483 scrubber.deep,
5484 handle);
5485 }
5486 if (ret == -EINPROGRESS) {
5487 requeue_scrub();
5488 done = true;
5489 break;
5490 }
5491 // reply
5492 {
5493 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5494 spg_t(info.pgid.pgid, get_primary().shard),
5495 scrubber.replica_scrub_start,
5496 pg_whoami);
5497 reply->preempted = scrub_preempted;
5498 ::encode(scrubber.replica_scrubmap, reply->get_data());
5499 osd->send_message_osd_cluster(
5500 get_primary().osd, reply,
5501 scrubber.replica_scrub_start);
5502 }
5503 scrub_preempted = false;
5504 scrub_can_preempt = false;
5505 scrubber.state = PG::Scrubber::INACTIVE;
5506 scrubber.replica_scrubmap = ScrubMap();
5507 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5508 scrubber.start = hobject_t();
5509 scrubber.end = hobject_t();
5510 scrubber.max_end = hobject_t();
5511 done = true;
5512 break;
5513
5514 default:
5515 ceph_abort();
5516 }
5517 }
5518 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5519 << " [" << scrubber.start << "," << scrubber.end << ")"
5520 << " max_end " << scrubber.max_end << dendl;
5521 }
5522
5523 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5524 {
5525 if (soid < scrubber.start || soid >= scrubber.end) {
5526 return false;
5527 }
5528 if (scrub_can_preempt) {
5529 if (!scrub_preempted) {
5530 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5531 scrub_preempted = true;
5532 } else {
5533 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5534 }
5535 return false;
5536 }
5537 return true;
5538 }
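// Editor's note: when preemption is allowed, a write that falls inside the
// active chunk is not blocked; the scrub is flagged preempted instead and the
// current chunk is restarted later (see WAIT_REPLICAS above), trading scrub
// progress for client write latency.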
5539
5540 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5541 {
5542 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5543 return (start < scrubber.max_end &&
5544 end >= scrubber.start);
5545 }
5546
5547 void PG::scrub_clear_state(bool has_error)
5548 {
5549 ceph_assert(is_locked());
5550 state_clear(PG_STATE_SCRUBBING);
5551 if (!has_error)
5552 state_clear(PG_STATE_REPAIR);
5553 state_clear(PG_STATE_DEEP_SCRUB);
5554 publish_stats_to_osd();
5555
5556 // active -> nothing.
5557 if (scrubber.active)
5558 osd->dec_scrubs_active();
5559
5560 requeue_ops(waiting_for_scrub);
5561
5562 scrubber.reset();
5563
5564 // type-specific state clear
5565 _scrub_clear_state();
5566 }
5567
5568 void PG::scrub_compare_maps()
5569 {
5570 dout(10) << __func__ << " has maps, analyzing" << dendl;
5571
5572 // construct authoritative scrub map for type specific scrubbing
5573 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5574 map<hobject_t,
5575 pair<boost::optional<uint32_t>,
5576 boost::optional<uint32_t>>> missing_digest;
5577
5578 map<pg_shard_t, ScrubMap *> maps;
5579 maps[pg_whoami] = &scrubber.primary_scrubmap;
5580
5581 for (const auto& i : acting_recovery_backfill) {
5582 if (i == pg_whoami) continue;
5583 dout(2) << __func__ << " replica " << i << " has "
5584 << scrubber.received_maps[i].objects.size()
5585 << " items" << dendl;
5586 maps[i] = &scrubber.received_maps[i];
5587 }
5588
5589 set<hobject_t> master_set;
5590
5591 // Construct master set
5592 for (const auto map : maps) {
5593 for (const auto i : map.second->objects) {
5594 master_set.insert(i.first);
5595 }
5596 }
5597
5598 stringstream ss;
5599 get_pgbackend()->be_omap_checks(maps, master_set,
5600 scrubber.omap_stats, ss);
5601
5602 if (!ss.str().empty()) {
5603 osd->clog->warn(ss);
5604 }
5605
5606 if (acting.size() > 1) {
5607 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5608
5609 // Map from object with errors to good peer
5610 map<hobject_t, list<pg_shard_t>> authoritative;
5611
5612 dout(2) << __func__ << " osd." << acting[0] << " has "
5613 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5614
5615 ss.str("");
5616 ss.clear();
5617
5618 get_pgbackend()->be_compare_scrubmaps(
5619 maps,
5620 master_set,
5621 state_test(PG_STATE_REPAIR),
5622 scrubber.missing,
5623 scrubber.inconsistent,
5624 authoritative,
5625 missing_digest,
5626 scrubber.shallow_errors,
5627 scrubber.deep_errors,
5628 scrubber.store.get(),
5629 info.pgid, acting,
5630 ss);
5631 dout(2) << ss.str() << dendl;
5632
5633 if (!ss.str().empty()) {
5634 osd->clog->error(ss);
5635 }
5636
5637 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5638 i != authoritative.end();
5639 ++i) {
5640 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5641 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5642 j != i->second.end();
5643 ++j) {
5644 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5645 }
5646 scrubber.authoritative.insert(
5647 make_pair(
5648 i->first,
5649 good_peers));
5650 }
5651
5652 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5653 i != authoritative.end();
5654 ++i) {
5655 scrubber.cleaned_meta_map.objects.erase(i->first);
5656 scrubber.cleaned_meta_map.objects.insert(
5657 *(maps[i->second.back()]->objects.find(i->first))
5658 );
5659 }
5660 }
5661
5662 ScrubMap for_meta_scrub;
5663 scrubber.clean_meta_map(for_meta_scrub);
5664
5665 // ok, do the pg-type specific scrubbing
5666 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5667 // Called here on the primary; when invoked on a non-primary it can use an authoritative map
5668 _scan_snaps(for_meta_scrub);
5669 if (!scrubber.store->empty()) {
5670 if (state_test(PG_STATE_REPAIR)) {
5671 dout(10) << __func__ << ": discarding scrub results" << dendl;
5672 scrubber.store->flush(nullptr);
5673 } else {
5674 dout(10) << __func__ << ": updating scrub object" << dendl;
5675 ObjectStore::Transaction t;
5676 scrubber.store->flush(&t);
5677 osd->store->queue_transaction(ch, std::move(t), nullptr);
5678 }
5679 }
5680 }
5681
5682 bool PG::scrub_process_inconsistent()
5683 {
5684 dout(10) << __func__ << ": checking authoritative" << dendl;
5685 bool repair = state_test(PG_STATE_REPAIR);
5686 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5687 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5688
5689 // scrubber.authoritative only stores objects which are missing or inconsistent.
5690 if (!scrubber.authoritative.empty()) {
5691 stringstream ss;
5692 ss << info.pgid << " " << mode << " "
5693 << scrubber.missing.size() << " missing, "
5694 << scrubber.inconsistent.size() << " inconsistent objects";
5695 dout(2) << ss.str() << dendl;
5696 osd->clog->error(ss);
5697 if (repair) {
5698 state_clear(PG_STATE_CLEAN);
5699 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5700 scrubber.authoritative.begin();
5701 i != scrubber.authoritative.end();
5702 ++i) {
5703 set<pg_shard_t>::iterator j;
5704
5705 auto missing_entry = scrubber.missing.find(i->first);
5706 if (missing_entry != scrubber.missing.end()) {
5707 for (j = missing_entry->second.begin();
5708 j != missing_entry->second.end();
5709 ++j) {
5710 repair_object(
5711 i->first,
5712 &(i->second),
5713 *j);
5714 ++scrubber.fixed;
5715 }
5716 }
5717 if (scrubber.inconsistent.count(i->first)) {
5718 for (j = scrubber.inconsistent[i->first].begin();
5719 j != scrubber.inconsistent[i->first].end();
5720 ++j) {
5721 repair_object(i->first,
5722 &(i->second),
5723 *j);
5724 ++scrubber.fixed;
5725 }
5726 }
5727 }
5728 }
5729 }
5730 return (!scrubber.authoritative.empty() && repair);
5731 }
5732
5733 bool PG::ops_blocked_by_scrub() const {
5734 return (waiting_for_scrub.size() != 0);
5735 }
5736
5737 // the part that actually finalizes a scrub
5738 void PG::scrub_finish()
5739 {
5740 dout(20) << __func__ << dendl;
5741 bool repair = state_test(PG_STATE_REPAIR);
5742 bool do_deep_scrub = false;
5743 // if the repair request comes from auto-repair and there are a large number of errors,
5744 // we would like to cancel auto-repair
5745 if (repair && scrubber.auto_repair
5746 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5747 state_clear(PG_STATE_REPAIR);
5748 repair = false;
5749 }
5750 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5751 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5752
5753 // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
5754 if (scrubber.deep_scrub_on_error
5755 && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
5756 ceph_assert(!deep_scrub);
5757 scrubber.deep_scrub_on_error = false;
5758 do_deep_scrub = true;
5759 dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
5760 }
5761
5762 // type-specific finish (can tally more errors)
5763 _scrub_finish();
5764
5765 bool has_error = scrub_process_inconsistent();
5766
5767 {
5768 stringstream oss;
5769 oss << info.pgid.pgid << " " << mode << " ";
5770 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5771 if (total_errors)
5772 oss << total_errors << " errors";
5773 else
5774 oss << "ok";
5775 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5776 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5777 << " remaining deep scrub error details lost)";
5778 if (repair)
5779 oss << ", " << scrubber.fixed << " fixed";
5780 if (total_errors)
5781 osd->clog->error(oss);
5782 else
5783 osd->clog->debug(oss);
5784 }
5785
5786 // finish up
5787 unreg_next_scrub();
5788 utime_t now = ceph_clock_now();
5789 info.history.last_scrub = info.last_update;
5790 info.history.last_scrub_stamp = now;
5791 if (scrubber.deep) {
5792 info.history.last_deep_scrub = info.last_update;
5793 info.history.last_deep_scrub_stamp = now;
5794 }
5795 // Since we don't know which errors were fixed, we can only clear them
5796 // when every one has been fixed.
5797 if (repair) {
5798 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5799 ceph_assert(deep_scrub);
5800 scrubber.shallow_errors = scrubber.deep_errors = 0;
5801 dout(20) << __func__ << " All may be fixed" << dendl;
5802 } else if (has_error) {
5803 // Deep scrub in order to get corrected error counts
5804 scrub_after_recovery = true;
5805 dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
5806 } else if (scrubber.shallow_errors || scrubber.deep_errors) {
5807 // We have errors but nothing can be fixed, so there is no repair
5808 // possible.
5809 state_set(PG_STATE_FAILED_REPAIR);
5810 dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
5811 << " error(s) present with no repair possible" << dendl;
5812 }
5813 }
5814 if (deep_scrub) {
5815 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5816 info.history.last_clean_scrub_stamp = now;
5817 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5818 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5819 info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
5820 info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
5821 info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
5822 dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = "
5823 << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = "
5824 << info.stats.stats.sum.num_omap_keys << dendl;
5825 } else {
5826 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5827 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5828 // because of deep-scrub errors
5829 if (scrubber.shallow_errors == 0)
5830 info.history.last_clean_scrub_stamp = now;
5831 }
5832 info.stats.stats.sum.num_scrub_errors =
5833 info.stats.stats.sum.num_shallow_scrub_errors +
5834 info.stats.stats.sum.num_deep_scrub_errors;
5835 if (scrubber.check_repair) {
5836 scrubber.check_repair = false;
5837 if (info.stats.stats.sum.num_scrub_errors) {
5838 state_set(PG_STATE_FAILED_REPAIR);
5839 dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors
5840 << " error(s) still present after re-scrub" << dendl;
5841 }
5842 }
5843 publish_stats_to_osd();
5844 if (do_deep_scrub) {
5845 // XXX: Auto scrub won't activate if must_scrub is set, but
5846 // setting the scrub stamps affects what users see.
5847 utime_t stamp = utime_t(0,1);
5848 set_last_scrub_stamp(stamp);
5849 set_last_deep_scrub_stamp(stamp);
5850 }
5851 reg_next_scrub();
5852
5853 {
5854 ObjectStore::Transaction t;
5855 dirty_info = true;
5856 write_if_dirty(t);
5857 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
5858 ceph_assert(tr == 0);
5859 }
5860
5861
5862 if (has_error) {
5863 queue_peering_event(
5864 PGPeeringEventRef(
5865 std::make_shared<PGPeeringEvent>(
5866 get_osdmap_epoch(),
5867 get_osdmap_epoch(),
5868 DoRecovery())));
5869 }
5870
5871 scrub_clear_state(has_error);
5872 scrub_unreserve_replicas();
5873
5874 if (is_active() && is_primary()) {
5875 share_pg_info();
5876 }
5877 }
5878
5879 void PG::share_pg_info()
5880 {
5881 dout(10) << "share_pg_info" << dendl;
5882
5883 // share new pg_info_t with replicas
5884 ceph_assert(!acting_recovery_backfill.empty());
5885 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5886 i != acting_recovery_backfill.end();
5887 ++i) {
5888 if (*i == pg_whoami) continue;
5889 auto pg_shard = *i;
5890 auto peer = peer_info.find(pg_shard);
5891 if (peer != peer_info.end()) {
5892 peer->second.last_epoch_started = info.last_epoch_started;
5893 peer->second.last_interval_started = info.last_interval_started;
5894 peer->second.history.merge(info.history);
5895 }
5896 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap_epoch());
5897 m->pg_list.push_back(
5898 make_pair(
5899 pg_notify_t(
5900 pg_shard.shard, pg_whoami.shard,
5901 get_osdmap_epoch(),
5902 get_osdmap_epoch(),
5903 info),
5904 past_intervals));
5905 osd->send_message_osd_cluster(pg_shard.osd, m, get_osdmap_epoch());
5906 }
5907 }
5908
5909 bool PG::append_log_entries_update_missing(
5910 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5911 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5912 boost::optional<eversion_t> roll_forward_to)
5913 {
5914 ceph_assert(!entries.empty());
5915 ceph_assert(entries.begin()->version > info.last_update);
5916
5917 PGLogEntryHandler rollbacker{this, &t};
5918 bool invalidate_stats =
5919 pg_log.append_new_log_entries(info.last_backfill,
5920 info.last_backfill_bitwise,
5921 entries,
5922 &rollbacker);
5923
5924 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5925 pg_log.roll_forward(&rollbacker);
5926 }
5927 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5928 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5929 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5930 }
5931
5932 info.last_update = pg_log.get_head();
5933
5934 if (pg_log.get_missing().num_missing() == 0) {
5935 // advance last_complete since nothing else is missing!
5936 info.last_complete = info.last_update;
5937 }
5938 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5939
5940 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5941 if (trim_to)
5942 pg_log.trim(*trim_to, info);
5943 dirty_info = true;
5944 write_if_dirty(t);
5945 return invalidate_stats;
5946 }
5947
5948
5949 void PG::merge_new_log_entries(
5950 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5951 ObjectStore::Transaction &t,
5952 boost::optional<eversion_t> trim_to,
5953 boost::optional<eversion_t> roll_forward_to)
5954 {
5955 dout(10) << __func__ << " " << entries << dendl;
5956 ceph_assert(is_primary());
5957
5958 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5959 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
5960 i != acting_recovery_backfill.end();
5961 ++i) {
5962 pg_shard_t peer(*i);
5963 if (peer == pg_whoami) continue;
5964 ceph_assert(peer_missing.count(peer));
5965 ceph_assert(peer_info.count(peer));
5966 pg_missing_t& pmissing(peer_missing[peer]);
5967 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5968 pg_info_t& pinfo(peer_info[peer]);
5969 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5970 pinfo.last_backfill,
5971 info.last_backfill_bitwise,
5972 entries,
5973 true,
5974 NULL,
5975 pmissing,
5976 NULL,
5977 this);
5978 pinfo.last_update = info.last_update;
5979 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5980 rebuild_missing = rebuild_missing || invalidate_stats;
5981 }
5982
5983 if (!rebuild_missing) {
5984 return;
5985 }
5986
5987 for (auto &&i: entries) {
5988 missing_loc.rebuild(
5989 i.soid,
5990 pg_whoami,
5991 acting_recovery_backfill,
5992 info,
5993 pg_log.get_missing(),
5994 peer_missing,
5995 peer_info);
5996 }
5997 }
5998
5999 void PG::update_history(const pg_history_t& new_history)
6000 {
6001 unreg_next_scrub();
6002 if (info.history.merge(new_history)) {
6003 dout(20) << __func__ << " advanced history from " << new_history << dendl;
6004 dirty_info = true;
6005 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
6006 dout(20) << __func__ << " clearing past_intervals" << dendl;
6007 past_intervals.clear();
6008 dirty_big_info = true;
6009 }
6010 }
6011 reg_next_scrub();
6012 }
6013
6014 void PG::fulfill_info(
6015 pg_shard_t from, const pg_query_t &query,
6016 pair<pg_shard_t, pg_info_t> &notify_info)
6017 {
6018 ceph_assert(from == primary);
6019 ceph_assert(query.type == pg_query_t::INFO);
6020
6021 // info
6022 dout(10) << "sending info" << dendl;
6023 notify_info = make_pair(from, info);
6024 }
6025
6026 void PG::fulfill_log(
6027 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
6028 {
6029 dout(10) << "log request from " << from << dendl;
6030 ceph_assert(from == primary);
6031 ceph_assert(query.type != pg_query_t::INFO);
6032 ConnectionRef con = osd->get_con_osd_cluster(
6033 from.osd, get_osdmap_epoch());
6034 if (!con) return;
6035
6036 MOSDPGLog *mlog = new MOSDPGLog(
6037 from.shard, pg_whoami.shard,
6038 get_osdmap_epoch(),
6039 info, query_epoch);
6040 mlog->missing = pg_log.get_missing();
6041
6042 // primary -> other, when building master log
6043 if (query.type == pg_query_t::LOG) {
6044 dout(10) << " sending info+missing+log since " << query.since
6045 << dendl;
6046 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
6047 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
6048 << " when my log.tail is " << pg_log.get_tail()
6049 << ", sending full log instead";
6050 mlog->log = pg_log.get_log(); // primary should not have requested this!!
6051 } else
6052 mlog->log.copy_after(pg_log.get_log(), query.since);
6053 }
6054 else if (query.type == pg_query_t::FULLLOG) {
6055 dout(10) << " sending info+missing+full log" << dendl;
6056 mlog->log = pg_log.get_log();
6057 }
6058
6059 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
6060
6061 osd->share_map_peer(from.osd, con.get(), get_osdmap());
6062 osd->send_message_osd_cluster(mlog, con.get());
6063 }
6064
6065 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
6066 {
6067 if (query.query.type == pg_query_t::INFO) {
6068 pair<pg_shard_t, pg_info_t> notify_info;
6069 update_history(query.query.history);
6070 fulfill_info(query.from, query.query, notify_info);
6071 rctx->send_notify(
6072 notify_info.first,
6073 pg_notify_t(
6074 notify_info.first.shard, pg_whoami.shard,
6075 query.query_epoch,
6076 get_osdmap_epoch(),
6077 notify_info.second),
6078 past_intervals);
6079 } else {
6080 update_history(query.query.history);
6081 fulfill_log(query.from, query.query, query.query_epoch);
6082 }
6083 }
6084
6085 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
6086 {
6087 bool changed = false;
6088 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
6089 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
6090 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
6091 changed = true;
6092 }
6093 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6094 if (!pi) {
6095 return; // pool deleted
6096 }
6097 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
6098 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
6099 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
6100 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
6101 changed = true;
6102 }
6103 }
6104 if (changed) {
6105 info.history.last_epoch_marked_full = osdmap->get_epoch();
6106 dirty_info = true;
6107 }
6108 }
6109
6110 bool PG::should_restart_peering(
6111 int newupprimary,
6112 int newactingprimary,
6113 const vector<int>& newup,
6114 const vector<int>& newacting,
6115 OSDMapRef lastmap,
6116 OSDMapRef osdmap)
6117 {
6118 if (PastIntervals::is_new_interval(
6119 primary.osd,
6120 newactingprimary,
6121 acting,
6122 newacting,
6123 up_primary.osd,
6124 newupprimary,
6125 up,
6126 newup,
6127 osdmap,
6128 lastmap,
6129 info.pgid.pgid)) {
6130 dout(20) << "new interval newup " << newup
6131 << " newacting " << newacting << dendl;
6132 return true;
6133 }
6134 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
6135 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
6136 return true;
6137 }
6138 return false;
6139 }
6140
6141 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
6142 {
6143 if (last_peering_reset > reply_epoch ||
6144 last_peering_reset > query_epoch) {
6145 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
6146 << " last_peering_reset " << last_peering_reset
6147 << dendl;
6148 return true;
6149 }
6150 return false;
6151 }
6152
6153 void PG::set_last_peering_reset()
6154 {
6155 dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
6156 if (last_peering_reset != get_osdmap_epoch()) {
6157 last_peering_reset = get_osdmap_epoch();
6158 reset_interval_flush();
6159 }
6160 }
6161
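// Editor's note: FlushState acts as a shared completion trigger. start_flush()
// below registers one shared_ptr on both the on_applied and on_commit callbacks
// of a transaction; when the last reference is released the destructor runs,
// takes the PG lock and calls on_flushed() unless the PG has been reset since
// the recorded epoch.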
6162 struct FlushState {
6163 PGRef pg;
6164 epoch_t epoch;
6165 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
6166 ~FlushState() {
6167 pg->lock();
6168 if (!pg->pg_has_reset_since(epoch))
6169 pg->on_flushed();
6170 pg->unlock();
6171 }
6172 };
6173 typedef std::shared_ptr<FlushState> FlushStateRef;
6174
6175 void PG::start_flush(ObjectStore::Transaction *t)
6176 {
6177 // flush in progress ops
6178 FlushStateRef flush_trigger (std::make_shared<FlushState>(
6179 this, get_osdmap_epoch()));
6180 flushes_in_progress++;
6181 t->register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
6182 t->register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
6183 }
6184
6185 void PG::reset_interval_flush()
6186 {
6187 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
6188 recovery_state.clear_blocked_outgoing();
6189
6190 Context *c = new QueuePeeringEvt<IntervalFlush>(
6191 this, get_osdmap_epoch(), IntervalFlush());
6192 if (!ch->flush_commit(c)) {
6193 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
6194 recovery_state.begin_block_outgoing();
6195 } else {
6196 dout(10) << "Not blocking outgoing recovery messages" << dendl;
6197 delete c;
6198 }
6199 }
6200
6201 /* Called before initializing peering during advance_map */
6202 void PG::start_peering_interval(
6203 const OSDMapRef lastmap,
6204 const vector<int>& newup, int new_up_primary,
6205 const vector<int>& newacting, int new_acting_primary,
6206 ObjectStore::Transaction *t)
6207 {
6208 const OSDMapRef osdmap = get_osdmap();
6209
6210 set_last_peering_reset();
6211
6212 vector<int> oldacting, oldup;
6213 int oldrole = get_role();
6214
6215 unreg_next_scrub();
6216
6217 if (is_primary()) {
6218 osd->clear_ready_to_merge(this);
6219 }
6220
6221 pg_shard_t old_acting_primary = get_primary();
6222 pg_shard_t old_up_primary = up_primary;
6223 bool was_old_primary = is_primary();
6224 bool was_old_replica = is_replica();
6225
6226 acting.swap(oldacting);
6227 up.swap(oldup);
6228 init_primary_up_acting(
6229 newup,
6230 newacting,
6231 new_up_primary,
6232 new_acting_primary);
6233
6234 if (info.stats.up != up ||
6235 info.stats.acting != acting ||
6236 info.stats.up_primary != new_up_primary ||
6237 info.stats.acting_primary != new_acting_primary) {
6238 info.stats.up = up;
6239 info.stats.up_primary = new_up_primary;
6240 info.stats.acting = acting;
6241 info.stats.acting_primary = new_acting_primary;
6242 info.stats.mapping_epoch = osdmap->get_epoch();
6243 }
6244
6245 pg_stats_publish_lock.Lock();
6246 pg_stats_publish_valid = false;
6247 pg_stats_publish_lock.Unlock();
6248
6249 // This will now be remapped during a backfill in cases
6250 // where it would not have been before.
6251 if (up != acting)
6252 state_set(PG_STATE_REMAPPED);
6253 else
6254 state_clear(PG_STATE_REMAPPED);
6255
6256 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
6257 if (pool.info.is_replicated() || role == pg_whoami.shard)
6258 set_role(role);
6259 else
6260 set_role(-1);
6261
6262 // did acting, up, primary|acker change?
6263 if (!lastmap) {
6264 dout(10) << " no lastmap" << dendl;
6265 dirty_info = true;
6266 dirty_big_info = true;
6267 info.history.same_interval_since = osdmap->get_epoch();
6268 } else {
6269 std::stringstream debug;
6270 ceph_assert(info.history.same_interval_since != 0);
6271 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
6272 get_is_recoverable_predicate());
6273 bool new_interval = PastIntervals::check_new_interval(
6274 old_acting_primary.osd,
6275 new_acting_primary,
6276 oldacting, newacting,
6277 old_up_primary.osd,
6278 new_up_primary,
6279 oldup, newup,
6280 info.history.same_interval_since,
6281 info.history.last_epoch_clean,
6282 osdmap,
6283 lastmap,
6284 info.pgid.pgid,
6285 recoverable.get(),
6286 &past_intervals,
6287 &debug);
6288 dout(10) << __func__ << ": check_new_interval output: "
6289 << debug.str() << dendl;
6290 if (new_interval) {
6291 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
6292 info.history.last_epoch_clean < osdmap->get_epoch()) {
6293 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
6294 // our information is incomplete and useless; someone else was clean
6295 // after everything we know if osdmaps were trimmed.
6296 past_intervals.clear();
6297 } else {
6298 dout(10) << " noting past " << past_intervals << dendl;
6299 }
6300 dirty_info = true;
6301 dirty_big_info = true;
6302 info.history.same_interval_since = osdmap->get_epoch();
6303 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
6304 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
6305 osdmap->get_pg_num(info.pgid.pgid.pool()),
6306 nullptr)) {
6307 info.history.last_epoch_split = osdmap->get_epoch();
6308 }
6309 }
6310 }
6311
6312 if (old_up_primary != up_primary ||
6313 oldup != up) {
6314 info.history.same_up_since = osdmap->get_epoch();
6315 }
6316 // this comparison includes primary rank via pg_shard_t
6317 if (old_acting_primary != get_primary()) {
6318 info.history.same_primary_since = osdmap->get_epoch();
6319 }
6320
6321 on_new_interval();
6322
6323 dout(1) << __func__ << " up " << oldup << " -> " << up
6324 << ", acting " << oldacting << " -> " << acting
6325 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
6326 << ", up_primary " << old_up_primary << " -> " << new_up_primary
6327 << ", role " << oldrole << " -> " << role
6328 << ", features acting " << acting_features
6329 << " upacting " << upacting_features
6330 << dendl;
6331
6332 // deactivate.
6333 state_clear(PG_STATE_ACTIVE);
6334 state_clear(PG_STATE_PEERED);
6335 state_clear(PG_STATE_PREMERGE);
6336 state_clear(PG_STATE_DOWN);
6337 state_clear(PG_STATE_RECOVERY_WAIT);
6338 state_clear(PG_STATE_RECOVERY_TOOFULL);
6339 state_clear(PG_STATE_RECOVERING);
6340
6341 peer_purged.clear();
6342 acting_recovery_backfill.clear();
6343 scrub_queued = false;
6344
6345 // reset primary/replica state?
6346 if (was_old_primary || is_primary()) {
6347 osd->remove_want_pg_temp(info.pgid.pgid);
6348 } else if (was_old_replica || is_replica()) {
6349 osd->remove_want_pg_temp(info.pgid.pgid);
6350 }
6351 clear_primary_state();
6352
6353
6354 // pg->on_*
6355 on_change(t);
6356
6357 projected_last_update = eversion_t();
6358
6359 ceph_assert(!deleting);
6360
6361 // should we tell the primary we are here?
6362 send_notify = !is_primary();
6363
6364 if (role != oldrole ||
6365 was_old_primary != is_primary()) {
6366 // did primary change?
6367 if (was_old_primary != is_primary()) {
6368 state_clear(PG_STATE_CLEAN);
6369 clear_publish_stats();
6370 }
6371
6372 on_role_change();
6373
6374 // take active waiters
6375 requeue_ops(waiting_for_peered);
6376
6377 } else {
6378 // no role change.
6379 // did primary change?
6380 if (get_primary() != old_acting_primary) {
6381 dout(10) << *this << " " << oldacting << " -> " << acting
6382 << ", acting primary "
6383 << old_acting_primary << " -> " << get_primary()
6384 << dendl;
6385 } else {
6386 // primary is the same.
6387 if (is_primary()) {
6388 // i am (still) primary. but my replica set changed.
6389 state_clear(PG_STATE_CLEAN);
6390
6391 dout(10) << oldacting << " -> " << acting
6392 << ", replicas changed" << dendl;
6393 }
6394 }
6395 }
6396 cancel_recovery();
6397
6398 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
6399 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
6400 osd->queue_want_pg_temp(info.pgid.pgid, acting);
6401 }
6402 }
6403
6404 void PG::on_new_interval()
6405 {
6406 const OSDMapRef osdmap = get_osdmap();
6407
6408 reg_next_scrub();
6409
6410 // initialize features
6411 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6412 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6413 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
6414 if (*p == CRUSH_ITEM_NONE)
6415 continue;
6416 uint64_t f = osdmap->get_xinfo(*p).features;
6417 acting_features &= f;
6418 upacting_features &= f;
6419 }
6420 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6421 if (*p == CRUSH_ITEM_NONE)
6422 continue;
6423 upacting_features &= osdmap->get_xinfo(*p).features;
6424 }
6425
6426 _on_new_interval();
6427 }
6428
6429 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6430 {
6431 ceph_assert(!is_primary());
6432
6433 update_history(oinfo.history);
6434 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6435 info.stats.stats.sum.num_scrub_errors = 0;
6436 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6437 info.stats.stats.sum.num_deep_scrub_errors = 0;
6438 dirty_info = true;
6439 }
6440
6441 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6442 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6443 << dendl;
6444 info.purged_snaps = oinfo.purged_snaps;
6445 dirty_info = true;
6446 dirty_big_info = true;
6447 }
6448 }
6449
6450 ostream& operator<<(ostream& out, const PG& pg)
6451 {
6452 out << "pg[" << pg.info
6453 << " " << pg.up;
6454 if (pg.acting != pg.up)
6455 out << "/" << pg.acting;
6456 if (pg.is_ec_pg())
6457 out << "p" << pg.get_primary();
6458 if (!pg.async_recovery_targets.empty())
6459 out << " async=[" << pg.async_recovery_targets << "]";
6460 if (!pg.backfill_targets.empty())
6461 out << " backfill=[" << pg.backfill_targets << "]";
6462 out << " r=" << pg.get_role();
6463 out << " lpr=" << pg.get_last_peering_reset();
6464
6465 if (pg.deleting)
6466 out << " DELETING";
6467
6468 if (!pg.past_intervals.empty()) {
6469 out << " pi=[" << pg.past_intervals.get_bounds()
6470 << ")/" << pg.past_intervals.size();
6471 }
6472
6473 if (pg.is_peered()) {
6474 if (pg.last_update_ondisk != pg.info.last_update)
6475 out << " luod=" << pg.last_update_ondisk;
6476 if (pg.last_update_applied != pg.info.last_update)
6477 out << " lua=" << pg.last_update_applied;
6478 }
6479
6480 if (pg.recovery_ops_active)
6481 out << " rops=" << pg.recovery_ops_active;
6482
6483 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6484 pg.pg_log.get_head() != pg.info.last_update)
6485 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6486
6487 if (!pg.pg_log.get_log().empty()) {
6488 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6489 out << " (log bound mismatch, actual=["
6490 << pg.pg_log.get_log().log.begin()->version << ","
6491 << pg.pg_log.get_log().log.rbegin()->version << "]";
6492 out << ")";
6493 }
6494 }
6495
6496 out << " crt=" << pg.pg_log.get_can_rollback_to();
6497
6498 if (pg.last_complete_ondisk != pg.info.last_complete)
6499 out << " lcod " << pg.last_complete_ondisk;
6500
6501 if (pg.is_primary()) {
6502 out << " mlcod " << pg.min_last_complete_ondisk;
6503 }
6504
6505 out << " " << pg_state_string(pg.get_state());
6506 if (pg.should_send_notify())
6507 out << " NOTIFY";
6508
6509 if (pg.scrubber.must_repair)
6510 out << " MUST_REPAIR";
6511 if (pg.scrubber.auto_repair)
6512 out << " AUTO_REPAIR";
6513 if (pg.scrubber.check_repair)
6514 out << " CHECK_REPAIR";
6515 if (pg.scrubber.deep_scrub_on_error)
6516 out << " DEEP_SCRUB_ON_ERROR";
6517 if (pg.scrubber.must_deep_scrub)
6518 out << " MUST_DEEP_SCRUB";
6519 if (pg.scrubber.must_scrub)
6520 out << " MUST_SCRUB";
6521
6522 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6523 if (pg.pg_log.get_missing().num_missing()) {
6524 out << " m=" << pg.pg_log.get_missing().num_missing();
6525 if (pg.is_primary()) {
6526 uint64_t unfound = pg.get_num_unfound();
6527 if (unfound)
6528 out << " u=" << unfound;
6529 }
6530 }
6531 if (!pg.is_clean()) {
6532 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6533 }
6534 if (!pg.snap_trimq.empty()) {
6535 out << " trimq=";
6536 // only show a count if the set is large
6537 if (pg.snap_trimq.num_intervals() > 16) {
6538 out << pg.snap_trimq.size();
6539 } else {
6540 out << pg.snap_trimq;
6541 }
6542 }
6543 if (!pg.info.purged_snaps.empty()) {
6544 out << " ps="; // snap trim queue / purged snaps
6545 if (pg.info.purged_snaps.num_intervals() > 16) {
6546 out << pg.info.purged_snaps.size();
6547 } else {
6548 out << pg.info.purged_snaps;
6549 }
6550 }
6551
6552 out << "]";
6553
6554
6555 return out;
6556 }
6557
6558 bool PG::can_discard_op(OpRequestRef& op)
6559 {
6560 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6561 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6562 dout(20) << " discard " << *m << dendl;
6563 return true;
6564 }
6565
6566 if (m->get_map_epoch() < info.history.same_primary_since) {
6567 dout(7) << " changed after " << m->get_map_epoch()
6568 << ", dropping " << *m << dendl;
6569 return true;
6570 }
6571
6572 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6573 // >= luminous client
6574 if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
6575 // >= nautilus client
6576 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6577 dout(7) << __func__ << " sent before last_force_op_resend "
6578 << pool.info.last_force_op_resend
6579 << ", dropping" << *m << dendl;
6580 return true;
6581 }
6582 } else {
6583 // < nautilus client (luminous or mimic)
6584 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
6585 dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
6586 << pool.info.last_force_op_resend_prenautilus
6587 << ", dropping" << *m << dendl;
6588 return true;
6589 }
6590 }
6591 if (m->get_map_epoch() < info.history.last_epoch_split) {
6592 dout(7) << __func__ << " pg split in "
6593 << info.history.last_epoch_split << ", dropping" << dendl;
6594 return true;
6595 }
6596 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6597 // < luminous client
6598 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6599 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6600 << pool.info.last_force_op_resend_preluminous
6601 << ", dropping" << *m << dendl;
6602 return true;
6603 }
6604 }
6605
6606 return false;
6607 }
6608
6609 template<typename T, int MSGTYPE>
6610 bool PG::can_discard_replica_op(OpRequestRef& op)
6611 {
6612 const T *m = static_cast<const T *>(op->get_req());
6613 ceph_assert(m->get_type() == MSGTYPE);
6614
6615 int from = m->get_source().num();
6616
6617 // if a repop reply arrives after a replica goes down in a new osdmap, and
6618 // before the pg advances to this new osdmap, the replies to repops issued
6619 // before this one can be discarded by that replica OSD, because the primary
6620 // resets the connection to it when handling the new osdmap marking it down,
6621 // and also resets the messenger session when the replica reconnects. To avoid
6622 // out-of-order replies, the messages from that replica should be discarded.
6623 OSDMapRef next_map = osd->get_next_osdmap();
6624 if (next_map->is_down(from))
6625 return true;
6626 /* Mostly, this overlaps with the old_peering_msg
6627 * condition. An important exception is pushes
6628 * sent by replicas not in the acting set, since
6629 * if such a replica goes down it does not cause
6630 * a new interval. */
6631 if (next_map->get_down_at(from) >= m->map_epoch)
6632 return true;
6633
6634 // same pg?
6635 // if pg changes _at all_, we reset and repeer!
6636 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6637 dout(10) << "can_discard_replica_op pg changed " << info.history
6638 << " after " << m->map_epoch
6639 << ", dropping" << dendl;
6640 return true;
6641 }
6642 return false;
6643 }
6644
6645 bool PG::can_discard_scan(OpRequestRef op)
6646 {
6647 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6648 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
6649
6650 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6651 dout(10) << " got old scan, ignoring" << dendl;
6652 return true;
6653 }
6654 return false;
6655 }
6656
6657 bool PG::can_discard_backfill(OpRequestRef op)
6658 {
6659 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6660 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6661
6662 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6663 dout(10) << " got old backfill, ignoring" << dendl;
6664 return true;
6665 }
6666
6667 return false;
6668
6669 }
6670
6671 bool PG::can_discard_request(OpRequestRef& op)
6672 {
6673 switch (op->get_req()->get_type()) {
6674 case CEPH_MSG_OSD_OP:
6675 return can_discard_op(op);
6676 case CEPH_MSG_OSD_BACKOFF:
6677 return false; // never discard
6678 case MSG_OSD_REPOP:
6679 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6680 case MSG_OSD_PG_PUSH:
6681 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6682 case MSG_OSD_PG_PULL:
6683 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6684 case MSG_OSD_PG_PUSH_REPLY:
6685 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6686 case MSG_OSD_REPOPREPLY:
6687 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6688 case MSG_OSD_PG_RECOVERY_DELETE:
6689 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6690
6691 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6692 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6693
6694 case MSG_OSD_EC_WRITE:
6695 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6696 case MSG_OSD_EC_WRITE_REPLY:
6697 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6698 case MSG_OSD_EC_READ:
6699 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6700 case MSG_OSD_EC_READ_REPLY:
6701 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6702 case MSG_OSD_REP_SCRUB:
6703 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6704 case MSG_OSD_SCRUB_RESERVE:
6705 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6706 case MSG_OSD_REP_SCRUBMAP:
6707 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6708 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6709 return can_discard_replica_op<
6710 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6711 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6712 return can_discard_replica_op<
6713 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6714
6715 case MSG_OSD_PG_SCAN:
6716 return can_discard_scan(op);
6717 case MSG_OSD_PG_BACKFILL:
6718 return can_discard_backfill(op);
6719 case MSG_OSD_PG_BACKFILL_REMOVE:
6720 return can_discard_replica_op<MOSDPGBackfillRemove,
6721 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6722 }
6723 return true;
6724 }
6725
6726 void PG::take_waiters()
6727 {
6728 dout(10) << "take_waiters" << dendl;
6729 requeue_map_waiters();
6730 }
6731
6732 void PG::do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rctx)
6733 {
6734 dout(10) << __func__ << ": " << evt->get_desc() << dendl;
6735 ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
6736 if (old_peering_evt(evt)) {
6737 dout(10) << "discard old " << evt->get_desc() << dendl;
6738 } else {
6739 recovery_state.handle_event(evt, rctx);
6740 }
6741 // write_if_dirty regardless of path above to ensure we capture any work
6742 // done by OSD::advance_pg().
6743 write_if_dirty(*rctx->transaction);
6744 }
6745
6746 void PG::queue_peering_event(PGPeeringEventRef evt)
6747 {
6748 if (old_peering_evt(evt))
6749 return;
6750 osd->osd->enqueue_peering_evt(info.pgid, evt);
6751 }
6752
6753 void PG::queue_null(epoch_t msg_epoch,
6754 epoch_t query_epoch)
6755 {
6756 dout(10) << "null" << dendl;
6757 queue_peering_event(
6758 PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
6759 NullEvt())));
6760 }
6761
6762 void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx)
6763 {
6764 /*
6765 * if we couldn't start any recovery ops and things are still
6766 * unfound, see if we can discover more missing object locations.
6767 * It may be that our initial locations were bad and we errored
6768 * out while trying to pull.
6769 */
6770 discover_all_missing(*rctx->query_map);
6771 if (rctx->query_map->empty()) {
6772 string action;
6773 if (state_test(PG_STATE_BACKFILLING)) {
6774 auto evt = PGPeeringEventRef(
6775 new PGPeeringEvent(
6776 queued,
6777 queued,
6778 PG::UnfoundBackfill()));
6779 queue_peering_event(evt);
6780 action = "in backfill";
6781 } else if (state_test(PG_STATE_RECOVERING)) {
6782 auto evt = PGPeeringEventRef(
6783 new PGPeeringEvent(
6784 queued,
6785 queued,
6786 PG::UnfoundRecovery()));
6787 queue_peering_event(evt);
6788 action = "in recovery";
6789 } else {
6790 action = "already out of recovery/backfill";
6791 }
6792 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
6793 } else {
6794 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
6795 queue_recovery();
6796 }
6797 }
6798
6799 void PG::handle_advance_map(
6800 OSDMapRef osdmap, OSDMapRef lastmap,
6801 vector<int>& newup, int up_primary,
6802 vector<int>& newacting, int acting_primary,
6803 RecoveryCtx *rctx)
6804 {
6805 ceph_assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6806 ceph_assert(lastmap == osdmap_ref);
6807 dout(10) << "handle_advance_map "
6808 << newup << "/" << newacting
6809 << " -- " << up_primary << "/" << acting_primary
6810 << dendl;
6811 update_osdmap_ref(osdmap);
6812 osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
6813
6814 pool.update(cct, osdmap);
6815
6816 AdvMap evt(
6817 osdmap, lastmap, newup, up_primary,
6818 newacting, acting_primary);
6819 recovery_state.handle_event(evt, rctx);
6820 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6821 on_pool_change();
6822 update_store_with_options();
6823 }
6824 last_require_osd_release = osdmap->require_osd_release;
6825 }
6826
6827 void PG::handle_activate_map(RecoveryCtx *rctx)
6828 {
6829 dout(10) << "handle_activate_map " << dendl;
6830 ActMap evt;
6831 recovery_state.handle_event(evt, rctx);
6832 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
6833 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6834 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6835 << last_persisted_osdmap
6836 << " while current is " << osdmap_ref->get_epoch() << dendl;
6837 dirty_info = true;
6838 } else {
6839 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6840 << last_persisted_osdmap
6841 << " while current is " << osdmap_ref->get_epoch() << dendl;
6842 }
6843 if (osdmap_ref->check_new_blacklist_entries()) {
6844 check_blacklisted_watchers();
6845 }
6846 write_if_dirty(*rctx->transaction);
6847 }
6848
6849 void PG::handle_initialize(RecoveryCtx *rctx)
6850 {
6851 dout(10) << __func__ << dendl;
6852 Initialize evt;
6853 recovery_state.handle_event(evt, rctx);
6854 }
6855
6856 void PG::handle_query_state(Formatter *f)
6857 {
6858 dout(10) << "handle_query_state" << dendl;
6859 QueryState q(f);
6860 recovery_state.handle_event(q, 0);
6861 }
6862
6863 void PG::update_store_with_options()
6864 {
6865 auto r = osd->store->set_collection_opts(ch, pool.info.opts);
6866 if(r < 0 && r != -EOPNOTSUPP) {
6867 derr << __func__ << " set_collection_opts returns error:" << r << dendl;
6868 }
6869 }
6870
6871 struct C_DeleteMore : public Context {
6872 PGRef pg;
6873 epoch_t epoch;
6874 C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
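// Editor's note: complete() is overridden below and never calls finish(), so
// reaching finish() would indicate a Context misuse; hence the ceph_abort().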
6875 void finish(int r) override {
6876 ceph_abort();
6877 }
6878 void complete(int r) override {
6879 ceph_assert(r == 0);
6880 pg->lock();
6881 if (!pg->pg_has_reset_since(epoch)) {
6882 pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
6883 }
6884 pg->unlock();
6885 delete this;
6886 }
6887 };
6888
6889 void PG::_delete_some(ObjectStore::Transaction *t)
6890 {
6891 dout(10) << __func__ << dendl;
6892
6893 {
6894 float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
6895 if (osd_delete_sleep > 0 && delete_needs_sleep) {
6896 epoch_t e = get_osdmap()->get_epoch();
6897 PGRef pgref(this);
6898 auto delete_requeue_callback = new FunctionContext([this, pgref, e](int r) {
6899 dout(20) << __func__ << " wake up at "
6900 << ceph_clock_now()
6901 << ", re-queuing delete" << dendl;
6902 lock();
6903 delete_needs_sleep = false;
6904 if (!pg_has_reset_since(e)) {
6905 osd->queue_for_pg_delete(get_pgid(), e);
6906 }
6907 unlock();
6908 });
6909
6910 utime_t delete_schedule_time = ceph_clock_now();
6911 delete_schedule_time += osd_delete_sleep;
6912 Mutex::Locker l(osd->sleep_lock);
6913 osd->sleep_timer.add_event_at(delete_schedule_time,
6914 delete_requeue_callback);
6915 dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
6916 return;
6917 }
6918 }
6919
6920 delete_needs_sleep = true;
6921
6922 vector<ghobject_t> olist;
6923 int max = std::min(osd->store->get_ideal_list_max(),
6924 (int)cct->_conf->osd_target_transaction_size);
6925 ghobject_t next;
6926 osd->store->collection_list(
6927 ch,
6928 next,
6929 ghobject_t::get_max(),
6930 max,
6931 &olist,
6932 &next);
6933 dout(20) << __func__ << " " << olist << dendl;
6934
6935 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
6936 int64_t num = 0;
6937 for (auto& oid : olist) {
6938 if (oid.is_pgmeta()) {
6939 continue;
6940 }
6941 int r = snap_mapper.remove_oid(oid.hobj, &_t);
6942 if (r != 0 && r != -ENOENT) {
6943 ceph_abort();
6944 }
6945 t->remove(coll, oid);
6946 ++num;
6947 }
6948 if (num) {
6949 dout(20) << __func__ << " deleting " << num << " objects" << dendl;
6950 Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
6951 t->register_on_commit(fin);
6952 } else {
6953 dout(20) << __func__ << " finished" << dendl;
6954 if (cct->_conf->osd_inject_failure_on_pg_removal) {
6955 _exit(1);
6956 }
6957
6958 // final flush here to ensure completions drop refs. Of particular concern
6959 // are the SnapMapper ContainerContexts.
6960 {
6961 PGRef pgref(this);
6962 PGLog::clear_info_log(info.pgid, t);
6963 t->remove_collection(coll);
6964 t->register_on_commit(new ContainerContext<PGRef>(pgref));
6965 t->register_on_applied(new ContainerContext<PGRef>(pgref));
6966 osd->store->queue_transaction(ch, std::move(*t));
6967 }
6968 ch->flush();
6969
6970 if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
6971 dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
6972 ch = osd->store->create_new_collection(coll);
6973 _create(*t,
6974 info.pgid,
6975 info.pgid.get_split_bits(pool.info.get_pg_num()));
6976 _init(*t, info.pgid, &pool.info);
6977 last_epoch = 0; // to ensure pg epoch is also written
6978 dirty_info = true;
6979 dirty_big_info = true;
6980 } else {
6981 deleted = true;
6982
6983 // cancel reserver here, since the PG is about to get deleted and the
6984 // exit() methods don't run when that happens.
6985 osd->local_reserver.cancel_reservation(info.pgid);
6986
6987 osd->logger->dec(l_osd_pg_removing);
6988 }
6989 }
6990 }
6991
6992 // Compute pending backfill data
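// e.g. with 100 MiB on the primary (bf_bytes) and 30 MiB already present
// locally (local_bytes), roughly 70 MiB of backfill data is still expected.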
6993 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
6994 {
6995 lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
6996 << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
6997 return std::max((int64_t)0, bf_bytes - local_bytes);
6998 }
6999
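// pg_stat_adjust: on a non-primary OSD, charge any reserved backfill bytes
// against the reported free space so that full/backfillfull checks account
// for data that is still going to arrive.  Returns 1 if the stats were
// adjusted, 0 otherwise.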
7000 int PG::pg_stat_adjust(osd_stat_t *ns)
7001 {
7002 osd_stat_t &new_stat = *ns;
7003 if (is_primary()) {
7004 return 0;
7005 }
7006 // Adjust the kb_used by adding pending backfill data
7007 uint64_t reserved_num_bytes = get_reserved_num_bytes();
7008
7009   // For now we don't consider projected space gains here.
7010   // One option would be a two-pass backfill that frees up
7011   // space in a first pass; this could be triggered when at nearfull
7012   // or close to backfillfull.
7013 if (reserved_num_bytes > 0) {
7014     // TODO: Handle compression by adjusting by the PG's average
7015     // compression percentage.
7016 dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
7017 << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7018 if (new_stat.statfs.available > reserved_num_bytes)
7019 new_stat.statfs.available -= reserved_num_bytes;
7020 else
7021 new_stat.statfs.available = 0;
7022 dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7023 return 1;
7024 }
7025 return 0;
7026 }
7027
7028
7029 /*------------ Recovery State Machine----------------*/
7030 #undef dout_prefix
7031 #define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \
7032 << "state<" << get_state_name() << ">: ")
7033
7034 /*------Crashed-------*/
7035 PG::RecoveryState::Crashed::Crashed(my_context ctx)
7036 : my_base(ctx),
7037 NamedState(context< RecoveryMachine >().pg, "Crashed")
7038 {
7039 context< RecoveryMachine >().log_enter(state_name);
7040 ceph_abort_msg("we got a bad state machine event");
7041 }
7042
7043
7044 /*------Initial-------*/
7045 PG::RecoveryState::Initial::Initial(my_context ctx)
7046 : my_base(ctx),
7047 NamedState(context< RecoveryMachine >().pg, "Initial")
7048 {
7049 context< RecoveryMachine >().log_enter(state_name);
7050 }
7051
7052 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
7053 {
7054 PG *pg = context< RecoveryMachine >().pg;
7055 pg->proc_replica_info(
7056 notify.from, notify.notify.info, notify.notify.epoch_sent);
7057 pg->set_last_peering_reset();
7058 return transit< Primary >();
7059 }
7060
7061 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
7062 {
7063 PG *pg = context< RecoveryMachine >().pg;
7064 ceph_assert(!pg->is_primary());
7065 post_event(i);
7066 return transit< Stray >();
7067 }
7068
7069 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
7070 {
7071 PG *pg = context< RecoveryMachine >().pg;
7072 ceph_assert(!pg->is_primary());
7073 post_event(i);
7074 return transit< Stray >();
7075 }
7076
7077 void PG::RecoveryState::Initial::exit()
7078 {
7079 context< RecoveryMachine >().log_exit(state_name, enter_time);
7080 PG *pg = context< RecoveryMachine >().pg;
7081 utime_t dur = ceph_clock_now() - enter_time;
7082 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
7083 }
7084
7085 /*------Started-------*/
7086 PG::RecoveryState::Started::Started(my_context ctx)
7087 : my_base(ctx),
7088 NamedState(context< RecoveryMachine >().pg, "Started")
7089 {
7090 context< RecoveryMachine >().log_enter(state_name);
7091 }
7092
7093 boost::statechart::result
7094 PG::RecoveryState::Started::react(const IntervalFlush&)
7095 {
7096 PG *pg = context< RecoveryMachine >().pg;
7097 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7098 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7099 return discard_event();
7100 }
7101
7102 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
7103 {
7104 PG *pg = context< RecoveryMachine >().pg;
7105 ldout(pg->cct, 10) << "Started advmap" << dendl;
7106 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7107 if (pg->should_restart_peering(
7108 advmap.up_primary,
7109 advmap.acting_primary,
7110 advmap.newup,
7111 advmap.newacting,
7112 advmap.lastmap,
7113 advmap.osdmap)) {
7114 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
7115 << dendl;
7116 post_event(advmap);
7117 return transit< Reset >();
7118 }
7119 pg->remove_down_peer_info(advmap.osdmap);
7120 return discard_event();
7121 }
7122
7123 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
7124 {
7125 q.f->open_object_section("state");
7126 q.f->dump_string("name", state_name);
7127 q.f->dump_stream("enter_time") << enter_time;
7128 q.f->close_section();
7129 return discard_event();
7130 }
7131
7132 void PG::RecoveryState::Started::exit()
7133 {
7134 context< RecoveryMachine >().log_exit(state_name, enter_time);
7135 PG *pg = context< RecoveryMachine >().pg;
7136 utime_t dur = ceph_clock_now() - enter_time;
7137 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
7138 }
7139
7140 /*--------Reset---------*/
7141 PG::RecoveryState::Reset::Reset(my_context ctx)
7142 : my_base(ctx),
7143 NamedState(context< RecoveryMachine >().pg, "Reset")
7144 {
7145 context< RecoveryMachine >().log_enter(state_name);
7146 PG *pg = context< RecoveryMachine >().pg;
7147
7148 pg->flushes_in_progress = 0;
7149 pg->set_last_peering_reset();
7150 }
7151
7152 boost::statechart::result
7153 PG::RecoveryState::Reset::react(const IntervalFlush&)
7154 {
7155 PG *pg = context< RecoveryMachine >().pg;
7156 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7157 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7158 return discard_event();
7159 }
7160
7161 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
7162 {
7163 PG *pg = context< RecoveryMachine >().pg;
7164 ldout(pg->cct, 10) << "Reset advmap" << dendl;
7165
7166 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7167
7168 if (pg->should_restart_peering(
7169 advmap.up_primary,
7170 advmap.acting_primary,
7171 advmap.newup,
7172 advmap.newacting,
7173 advmap.lastmap,
7174 advmap.osdmap)) {
7175 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
7176 << dendl;
7177 pg->start_peering_interval(
7178 advmap.lastmap,
7179 advmap.newup, advmap.up_primary,
7180 advmap.newacting, advmap.acting_primary,
7181 context< RecoveryMachine >().get_cur_transaction());
7182 }
7183 pg->remove_down_peer_info(advmap.osdmap);
7184 pg->check_past_interval_bounds();
7185 return discard_event();
7186 }
7187
7188 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
7189 {
7190 PG *pg = context< RecoveryMachine >().pg;
7191 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7192 context< RecoveryMachine >().send_notify(
7193 pg->get_primary(),
7194 pg_notify_t(
7195 pg->get_primary().shard, pg->pg_whoami.shard,
7196 pg->get_osdmap_epoch(),
7197 pg->get_osdmap_epoch(),
7198 pg->info),
7199 pg->past_intervals);
7200 }
7201
7202 pg->update_heartbeat_peers();
7203 pg->take_waiters();
7204
7205 return transit< Started >();
7206 }
7207
7208 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
7209 {
7210 q.f->open_object_section("state");
7211 q.f->dump_string("name", state_name);
7212 q.f->dump_stream("enter_time") << enter_time;
7213 q.f->close_section();
7214 return discard_event();
7215 }
7216
7217 void PG::RecoveryState::Reset::exit()
7218 {
7219 context< RecoveryMachine >().log_exit(state_name, enter_time);
7220 PG *pg = context< RecoveryMachine >().pg;
7221 utime_t dur = ceph_clock_now() - enter_time;
7222 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
7223 }
7224
7225 /*-------Start---------*/
7226 PG::RecoveryState::Start::Start(my_context ctx)
7227 : my_base(ctx),
7228 NamedState(context< RecoveryMachine >().pg, "Start")
7229 {
7230 context< RecoveryMachine >().log_enter(state_name);
7231
7232 PG *pg = context< RecoveryMachine >().pg;
7233 if (pg->is_primary()) {
7234 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
7235 post_event(MakePrimary());
7236 } else { //is_stray
7237 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
7238 post_event(MakeStray());
7239 }
7240 }
7241
7242 void PG::RecoveryState::Start::exit()
7243 {
7244 context< RecoveryMachine >().log_exit(state_name, enter_time);
7245 PG *pg = context< RecoveryMachine >().pg;
7246 utime_t dur = ceph_clock_now() - enter_time;
7247 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
7248 }
7249
7250 /*---------Primary--------*/
7251 PG::RecoveryState::Primary::Primary(my_context ctx)
7252 : my_base(ctx),
7253 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
7254 {
7255 context< RecoveryMachine >().log_enter(state_name);
7256 PG *pg = context< RecoveryMachine >().pg;
7257 ceph_assert(pg->want_acting.empty());
7258
7259 // set CREATING bit until we have peered for the first time.
7260 if (pg->info.history.last_epoch_started == 0) {
7261 pg->state_set(PG_STATE_CREATING);
7262 // use the history timestamp, which ultimately comes from the
7263 // monitor in the create case.
7264 utime_t t = pg->info.history.last_scrub_stamp;
7265 pg->info.stats.last_fresh = t;
7266 pg->info.stats.last_active = t;
7267 pg->info.stats.last_change = t;
7268 pg->info.stats.last_peered = t;
7269 pg->info.stats.last_clean = t;
7270 pg->info.stats.last_unstale = t;
7271 pg->info.stats.last_undegraded = t;
7272 pg->info.stats.last_fullsized = t;
7273 pg->info.stats.last_scrub_stamp = t;
7274 pg->info.stats.last_deep_scrub_stamp = t;
7275 pg->info.stats.last_clean_scrub_stamp = t;
7276 }
7277 }
7278
7279 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
7280 {
7281 PG *pg = context< RecoveryMachine >().pg;
7282 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7283 pg->proc_replica_info(
7284 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7285 return discard_event();
7286 }
7287
7288 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
7289 {
7290 PG *pg = context< RecoveryMachine >().pg;
7291 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
7292 pg->publish_stats_to_osd();
7293 pg->take_waiters();
7294 return discard_event();
7295 }
7296
7297 boost::statechart::result PG::RecoveryState::Primary::react(
7298 const SetForceRecovery&)
7299 {
7300 PG *pg = context< RecoveryMachine >().pg;
7301 pg->set_force_recovery(true);
7302 return discard_event();
7303 }
7304
7305 boost::statechart::result PG::RecoveryState::Primary::react(
7306 const UnsetForceRecovery&)
7307 {
7308 PG *pg = context< RecoveryMachine >().pg;
7309 pg->set_force_recovery(false);
7310 return discard_event();
7311 }
7312
7313 boost::statechart::result PG::RecoveryState::Primary::react(
7314 const RequestScrub& evt)
7315 {
7316 PG *pg = context< RecoveryMachine >().pg;
7317 if (pg->is_primary()) {
7318 pg->unreg_next_scrub();
7319 pg->scrubber.must_scrub = true;
7320 pg->scrubber.must_deep_scrub = evt.deep || evt.repair;
7321 pg->scrubber.must_repair = evt.repair;
7322 pg->reg_next_scrub();
7323 ldout(pg->cct,10) << "marking for scrub" << dendl;
7324 }
7325 return discard_event();
7326 }
7327
7328 boost::statechart::result PG::RecoveryState::Primary::react(
7329 const SetForceBackfill&)
7330 {
7331 PG *pg = context< RecoveryMachine >().pg;
7332 pg->set_force_backfill(true);
7333 return discard_event();
7334 }
7335
7336 boost::statechart::result PG::RecoveryState::Primary::react(
7337 const UnsetForceBackfill&)
7338 {
7339 PG *pg = context< RecoveryMachine >().pg;
7340 pg->set_force_backfill(false);
7341 return discard_event();
7342 }
7343
7344 void PG::RecoveryState::Primary::exit()
7345 {
7346 context< RecoveryMachine >().log_exit(state_name, enter_time);
7347 PG *pg = context< RecoveryMachine >().pg;
7348 pg->want_acting.clear();
7349 utime_t dur = ceph_clock_now() - enter_time;
7350 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
7351 pg->clear_primary_state();
7352 pg->state_clear(PG_STATE_CREATING);
7353 }
7354
7355 /*---------Peering--------*/
7356 PG::RecoveryState::Peering::Peering(my_context ctx)
7357 : my_base(ctx),
7358 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
7359 history_les_bound(false)
7360 {
7361 context< RecoveryMachine >().log_enter(state_name);
7362
7363 PG *pg = context< RecoveryMachine >().pg;
7364 ceph_assert(!pg->is_peered());
7365 ceph_assert(!pg->is_peering());
7366 ceph_assert(pg->is_primary());
7367 pg->state_set(PG_STATE_PEERING);
7368 }
7369
7370 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
7371 {
7372 PG *pg = context< RecoveryMachine >().pg;
7373 ldout(pg->cct, 10) << "Peering advmap" << dendl;
7374 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
7375 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
7376 post_event(advmap);
7377 return transit< Reset >();
7378 }
7379
7380 pg->adjust_need_up_thru(advmap.osdmap);
7381
7382 return forward_event();
7383 }
7384
7385 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
7386 {
7387 PG *pg = context< RecoveryMachine >().pg;
7388
7389 q.f->open_object_section("state");
7390 q.f->dump_string("name", state_name);
7391 q.f->dump_stream("enter_time") << enter_time;
7392
7393 q.f->open_array_section("past_intervals");
7394 pg->past_intervals.dump(q.f);
7395 q.f->close_section();
7396
7397 q.f->open_array_section("probing_osds");
7398 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
7399 p != prior_set.probe.end();
7400 ++p)
7401 q.f->dump_stream("osd") << *p;
7402 q.f->close_section();
7403
7404 if (prior_set.pg_down)
7405 q.f->dump_string("blocked", "peering is blocked due to down osds");
7406
7407 q.f->open_array_section("down_osds_we_would_probe");
7408 for (set<int>::iterator p = prior_set.down.begin();
7409 p != prior_set.down.end();
7410 ++p)
7411 q.f->dump_int("osd", *p);
7412 q.f->close_section();
7413
7414 q.f->open_array_section("peering_blocked_by");
7415 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
7416 p != prior_set.blocked_by.end();
7417 ++p) {
7418 q.f->open_object_section("osd");
7419 q.f->dump_int("osd", p->first);
7420 q.f->dump_int("current_lost_at", p->second);
7421 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
7422 q.f->close_section();
7423 }
7424 q.f->close_section();
7425
7426 if (history_les_bound) {
7427 q.f->open_array_section("peering_blocked_by_detail");
7428 q.f->open_object_section("item");
7429 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
7430 q.f->close_section();
7431 q.f->close_section();
7432 }
7433
7434 q.f->close_section();
7435 return forward_event();
7436 }
7437
7438 void PG::RecoveryState::Peering::exit()
7439 {
7440 PG *pg = context< RecoveryMachine >().pg;
7441 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
7442 context< RecoveryMachine >().log_exit(state_name, enter_time);
7443 pg->state_clear(PG_STATE_PEERING);
7444 pg->clear_probe_targets();
7445
7446 utime_t dur = ceph_clock_now() - enter_time;
7447 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
7448 }
7449
7450
7451 /*------Backfilling-------*/
7452 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
7453 : my_base(ctx),
7454 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
7455 {
7456 context< RecoveryMachine >().log_enter(state_name);
7457 PG *pg = context< RecoveryMachine >().pg;
7458 pg->backfill_reserved = true;
7459 pg->queue_recovery();
7460 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7461 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7462 pg->state_set(PG_STATE_BACKFILLING);
7463 pg->publish_stats_to_osd();
7464 }
7465
7466 void PG::RecoveryState::Backfilling::backfill_release_reservations()
7467 {
7468 PG *pg = context< RecoveryMachine >().pg;
7469 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7470 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
7471 it != pg->backfill_targets.end();
7472 ++it) {
7473 ceph_assert(*it != pg->pg_whoami);
7474 ConnectionRef con = pg->osd->get_con_osd_cluster(
7475 it->osd, pg->get_osdmap_epoch());
7476 if (con) {
7477 pg->osd->send_message_osd_cluster(
7478 new MBackfillReserve(
7479 MBackfillReserve::RELEASE,
7480 spg_t(pg->info.pgid.pgid, it->shard),
7481 pg->get_osdmap_epoch()),
7482 con.get());
7483 }
7484 }
7485 }
7486
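// cancel_backfill: drop the local and remote backfill reservations and, if
// we were still waiting on backfill targets, close out the outstanding
// recovery op so the backfill can be restarted or abandoned cleanly.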
7487 void PG::RecoveryState::Backfilling::cancel_backfill()
7488 {
7489 PG *pg = context< RecoveryMachine >().pg;
7490 backfill_release_reservations();
7491 if (!pg->waiting_on_backfill.empty()) {
7492 pg->waiting_on_backfill.clear();
7493 pg->finish_recovery_op(hobject_t::get_max());
7494 }
7495 }
7496
7497 boost::statechart::result
7498 PG::RecoveryState::Backfilling::react(const Backfilled &c)
7499 {
7500 backfill_release_reservations();
7501 return transit<Recovered>();
7502 }
7503
7504 boost::statechart::result
7505 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
7506 {
7507 PG *pg = context< RecoveryMachine >().pg;
7508 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
7509 pg->state_set(PG_STATE_BACKFILL_WAIT);
7510 pg->state_clear(PG_STATE_BACKFILLING);
7511 cancel_backfill();
7512 pg->schedule_backfill_retry(c.delay);
7513 return transit<NotBackfilling>();
7514 }
7515
7516 boost::statechart::result
7517 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
7518 {
7519 PG *pg = context< RecoveryMachine >().pg;
7520 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
7521 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
7522 pg->state_clear(PG_STATE_BACKFILLING);
7523 cancel_backfill();
7524 return transit<NotBackfilling>();
7525 }
7526
7527 boost::statechart::result
7528 PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &)
7529 {
7530 PG *pg = context< RecoveryMachine >().pg;
7531 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7532 pg->state_clear(PG_STATE_BACKFILLING);
7533 cancel_backfill();
7534 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7535 return transit<NotBackfilling>();
7536 }
7537
7538 boost::statechart::result
7539 PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &)
7540 {
7541 PG *pg = context< RecoveryMachine >().pg;
7542 pg->state_set(PG_STATE_BACKFILL_WAIT);
7543 cancel_backfill();
7544 if (pg->needs_backfill()) {
7545 return transit<WaitLocalBackfillReserved>();
7546 } else {
7547 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
7548 return discard_event();
7549 }
7550 }
7551
7552 void PG::RecoveryState::Backfilling::exit()
7553 {
7554 context< RecoveryMachine >().log_exit(state_name, enter_time);
7555 PG *pg = context< RecoveryMachine >().pg;
7556 pg->backfill_reserved = false;
7557 pg->backfill_reserving = false;
7558 pg->state_clear(PG_STATE_BACKFILLING);
7559 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7560 utime_t dur = ceph_clock_now() - enter_time;
7561 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
7562 }
7563
7564 /*--WaitRemoteBackfillReserved--*/
7565
7566 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
7567 : my_base(ctx),
7568 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7569 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
7570 {
7571 context< RecoveryMachine >().log_enter(state_name);
7572 PG *pg = context< RecoveryMachine >().pg;
7573 pg->state_set(PG_STATE_BACKFILL_WAIT);
7574 pg->publish_stats_to_osd();
7575 post_event(RemoteBackfillReserved());
7576 }
7577
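// Each RemoteBackfillReserved event advances backfill_osd_it by one: a
// reservation REQUEST (including the expected byte count) is sent to the
// next backfill target, and once every target has been asked,
// AllBackfillsReserved is posted.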
7578 boost::statechart::result
7579 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
7580 {
7581 PG *pg = context< RecoveryMachine >().pg;
7582
7583 int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
7584 ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
7585 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
7586     // The primary never backfills itself
7587 ceph_assert(*backfill_osd_it != pg->pg_whoami);
7588 ConnectionRef con = pg->osd->get_con_osd_cluster(
7589 backfill_osd_it->osd, pg->get_osdmap_epoch());
7590 if (con) {
7591 pg->osd->send_message_osd_cluster(
7592 new MBackfillReserve(
7593 MBackfillReserve::REQUEST,
7594 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7595 pg->get_osdmap_epoch(),
7596 pg->get_backfill_priority(),
7597 num_bytes,
7598 pg->peer_bytes[*backfill_osd_it]),
7599 con.get());
7600 }
7601 ++backfill_osd_it;
7602 } else {
7603 pg->peer_bytes.clear();
7604 post_event(AllBackfillsReserved());
7605 }
7606 return discard_event();
7607 }
7608
7609 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7610 {
7611 context< RecoveryMachine >().log_exit(state_name, enter_time);
7612 PG *pg = context< RecoveryMachine >().pg;
7613 utime_t dur = ceph_clock_now() - enter_time;
7614 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7615 }
7616
7617 void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
7618 {
7619 PG *pg = context< RecoveryMachine >().pg;
7620 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7621
7622 // Send CANCEL to all previously acquired reservations
7623 set<pg_shard_t>::const_iterator it, begin, end;
7624 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7625 end = context< Active >().remote_shards_to_reserve_backfill.end();
7626 ceph_assert(begin != end);
7627 for (it = begin; it != backfill_osd_it; ++it) {
7628     // The primary never backfills itself
7629 ceph_assert(*it != pg->pg_whoami);
7630 ConnectionRef con = pg->osd->get_con_osd_cluster(
7631 it->osd, pg->get_osdmap_epoch());
7632 if (con) {
7633 pg->osd->send_message_osd_cluster(
7634 new MBackfillReserve(
7635 MBackfillReserve::RELEASE,
7636 spg_t(pg->info.pgid.pgid, it->shard),
7637 pg->get_osdmap_epoch()),
7638 con.get());
7639 }
7640 }
7641
7642 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7643 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7644 pg->publish_stats_to_osd();
7645
7646 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7647 }
7648
7649 boost::statechart::result
7650 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
7651 {
7652 retry();
7653 return transit<NotBackfilling>();
7654 }
7655
7656 boost::statechart::result
7657 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
7658 {
7659 retry();
7660 return transit<NotBackfilling>();
7661 }
7662
7663 /*--WaitLocalBackfillReserved--*/
7664 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7665 : my_base(ctx),
7666 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7667 {
7668 context< RecoveryMachine >().log_enter(state_name);
7669 PG *pg = context< RecoveryMachine >().pg;
7670 pg->state_set(PG_STATE_BACKFILL_WAIT);
7671 pg->osd->local_reserver.request_reservation(
7672 pg->info.pgid,
7673 new QueuePeeringEvt<LocalBackfillReserved>(
7674 pg, pg->get_osdmap_epoch(),
7675 LocalBackfillReserved()),
7676 pg->get_backfill_priority(),
7677 new QueuePeeringEvt<DeferBackfill>(
7678 pg, pg->get_osdmap_epoch(),
7679 DeferBackfill(0.0)));
7680 pg->publish_stats_to_osd();
7681 }
7682
7683 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7684 {
7685 context< RecoveryMachine >().log_exit(state_name, enter_time);
7686 PG *pg = context< RecoveryMachine >().pg;
7687 utime_t dur = ceph_clock_now() - enter_time;
7688 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7689 }
7690
7691 /*----NotBackfilling------*/
7692 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7693 : my_base(ctx),
7694 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7695 {
7696 context< RecoveryMachine >().log_enter(state_name);
7697 PG *pg = context< RecoveryMachine >().pg;
7698 pg->state_clear(PG_STATE_REPAIR);
7699 pg->publish_stats_to_osd();
7700 }
7701
7702 boost::statechart::result
7703 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7704 {
7705 return discard_event();
7706 }
7707
7708 boost::statechart::result
7709 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
7710 {
7711 return discard_event();
7712 }
7713
7714 void PG::RecoveryState::NotBackfilling::exit()
7715 {
7716 context< RecoveryMachine >().log_exit(state_name, enter_time);
7717 PG *pg = context< RecoveryMachine >().pg;
7718 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7719 utime_t dur = ceph_clock_now() - enter_time;
7720 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7721 }
7722
7723 /*----NotRecovering------*/
7724 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7725 : my_base(ctx),
7726 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7727 {
7728 context< RecoveryMachine >().log_enter(state_name);
7729 PG *pg = context< RecoveryMachine >().pg;
7730 pg->publish_stats_to_osd();
7731 }
7732
7733 void PG::RecoveryState::NotRecovering::exit()
7734 {
7735 context< RecoveryMachine >().log_exit(state_name, enter_time);
7736 PG *pg = context< RecoveryMachine >().pg;
7737 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7738 utime_t dur = ceph_clock_now() - enter_time;
7739 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7740 }
7741
7742 /*---RepNotRecovering----*/
7743 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7744 : my_base(ctx),
7745 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7746 {
7747 context< RecoveryMachine >().log_enter(state_name);
7748 }
7749
7750 boost::statechart::result
7751 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
7752 {
7753 PG *pg = context< RecoveryMachine >().pg;
7754 pg->reject_reservation();
7755 post_event(RemoteReservationRejected());
7756 return discard_event();
7757 }
7758
7759 void PG::RecoveryState::RepNotRecovering::exit()
7760 {
7761 context< RecoveryMachine >().log_exit(state_name, enter_time);
7762 PG *pg = context< RecoveryMachine >().pg;
7763 utime_t dur = ceph_clock_now() - enter_time;
7764 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7765 }
7766
7767 /*---RepWaitRecoveryReserved--*/
7768 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7769 : my_base(ctx),
7770 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7771 {
7772 context< RecoveryMachine >().log_enter(state_name);
7773 }
7774
7775 boost::statechart::result
7776 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7777 {
7778 PG *pg = context< RecoveryMachine >().pg;
7779 pg->osd->send_message_osd_cluster(
7780 pg->primary.osd,
7781 new MRecoveryReserve(
7782 MRecoveryReserve::GRANT,
7783 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7784 pg->get_osdmap_epoch()),
7785 pg->get_osdmap_epoch());
7786 return transit<RepRecovering>();
7787 }
7788
7789 boost::statechart::result
7790 PG::RecoveryState::RepWaitRecoveryReserved::react(
7791 const RemoteReservationCanceled &evt)
7792 {
7793 PG *pg = context< RecoveryMachine >().pg;
7794 pg->clear_reserved_num_bytes();
7795 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7796 return transit<RepNotRecovering>();
7797 }
7798
7799 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7800 {
7801 context< RecoveryMachine >().log_exit(state_name, enter_time);
7802 PG *pg = context< RecoveryMachine >().pg;
7803 utime_t dur = ceph_clock_now() - enter_time;
7804 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7805 }
7806
7807 /*-RepWaitBackfillReserved*/
7808 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7809 : my_base(ctx),
7810 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7811 {
7812 context< RecoveryMachine >().log_enter(state_name);
7813 }
7814
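// A replica received a backfill reservation request from the primary.
// Estimate whether there is room for the incoming data (scaling the byte
// counts for EC pools), possibly reject via failure injection or the
// backfill-full check, and otherwise record the reserved bytes and ask the
// remote reserver for a slot.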
7815 boost::statechart::result
7816 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7817 {
7818 PG *pg = context< RecoveryMachine >().pg;
7819   // Use tentative_backfill_full() to make sure enough
7820   // space is available to handle the target bytes from the primary.
7821
7822 // TODO: If we passed num_objects from primary we could account for
7823 // an estimate of the metadata overhead.
7824
7825 // TODO: If we had compressed_allocated and compressed_original from primary
7826 // we could compute compression ratio and adjust accordingly.
7827
7828   // XXX: There is no way to get the omap overhead, and it would only apply
7829   // to whatever (possibly different) partition is storing the database.
7830
7831 // update_osd_stat() from heartbeat will do this on a new
7832 // statfs using pg->primary_num_bytes.
7833 uint64_t pending_adjustment = 0;
7834 int64_t primary_num_bytes = evt.primary_num_bytes;
7835 int64_t local_num_bytes = evt.local_num_bytes;
7836 if (primary_num_bytes) {
7837     // For an erasure coded pool, overestimate by a full stripe per object
7838     // because we don't know how each object rounded to the nearest stripe.
7839 if (pg->pool.info.is_erasure()) {
7840 primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7841 primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7842 local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7843 local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7844 }
7845 pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
7846 ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
7847 << " local " << (local_num_bytes >> 10) << "KiB"
7848 << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
7849 << dendl;
7850 }
7851   // This lock protects not only the OSDService stats but also the setting of pg primary_num_bytes.
7852   // That's why we don't unlock it immediately.
7853 Mutex::Locker l(pg->osd->stat_lock);
7854 osd_stat_t cur_stat = pg->osd->osd_stat;
7855 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7856 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7857 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7858 << dendl;
7859 post_event(RejectRemoteReservation());
7860 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7861 pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
7862 ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
7863 << dendl;
7864 post_event(RejectRemoteReservation());
7865 } else {
7866 Context *preempt = nullptr;
7867     // Don't reserve space if we skipped the reservation check; this is used
7868     // to test the other backfill-full check, AND in case corruption
7869     // of num_bytes requires ignoring that value and trying the
7870     // backfill anyway.
7871 if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
7872 pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
7873 else
7874 pg->clear_reserved_num_bytes();
7875 // Use un-ec-adjusted bytes for stats.
7876 pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
7877 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
7878 // older peers will interpret preemption as TOOFULL
7879 preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
7880 pg, pg->get_osdmap_epoch(),
7881 RemoteBackfillPreempted());
7882 }
7883 pg->osd->remote_reserver.request_reservation(
7884 pg->info.pgid,
7885 new QueuePeeringEvt<RemoteBackfillReserved>(
7886 pg, pg->get_osdmap_epoch(),
7887 RemoteBackfillReserved()),
7888 evt.priority,
7889 preempt);
7890 }
7891 return transit<RepWaitBackfillReserved>();
7892 }
7893
7894 boost::statechart::result
7895 PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
7896 {
7897 PG *pg = context< RecoveryMachine >().pg;
7898
7899   // fall back to a local reckoning of priority if the primary doesn't pass one
7900   // (pre-mimic compat)
7901 int prio = evt.priority ? evt.priority : pg->get_recovery_priority();
7902
7903 Context *preempt = nullptr;
7904 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
7905 // older peers can't handle this
7906 preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>(
7907 pg, pg->get_osdmap_epoch(),
7908 RemoteRecoveryPreempted());
7909 }
7910
7911 pg->osd->remote_reserver.request_reservation(
7912 pg->info.pgid,
7913 new QueuePeeringEvt<RemoteRecoveryReserved>(
7914 pg, pg->get_osdmap_epoch(),
7915 RemoteRecoveryReserved()),
7916 prio,
7917 preempt);
7918 return transit<RepWaitRecoveryReserved>();
7919 }
7920
7921 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7922 {
7923 context< RecoveryMachine >().log_exit(state_name, enter_time);
7924 PG *pg = context< RecoveryMachine >().pg;
7925 utime_t dur = ceph_clock_now() - enter_time;
7926 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
7927 }
7928
7929 boost::statechart::result
7930 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
7931 {
7932 PG *pg = context< RecoveryMachine >().pg;
7933
7934 pg->osd->send_message_osd_cluster(
7935 pg->primary.osd,
7936 new MBackfillReserve(
7937 MBackfillReserve::GRANT,
7938 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7939 pg->get_osdmap_epoch()),
7940 pg->get_osdmap_epoch());
7941 return transit<RepRecovering>();
7942 }
7943
7944 boost::statechart::result
7945 PG::RecoveryState::RepWaitBackfillReserved::react(
7946 const RejectRemoteReservation &evt)
7947 {
7948 PG *pg = context< RecoveryMachine >().pg;
7949 pg->reject_reservation();
7950 post_event(RemoteReservationRejected());
7951 return discard_event();
7952 }
7953
7954 boost::statechart::result
7955 PG::RecoveryState::RepWaitBackfillReserved::react(
7956 const RemoteReservationRejected &evt)
7957 {
7958 PG *pg = context< RecoveryMachine >().pg;
7959 pg->clear_reserved_num_bytes();
7960 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7961 return transit<RepNotRecovering>();
7962 }
7963
7964 boost::statechart::result
7965 PG::RecoveryState::RepWaitBackfillReserved::react(
7966 const RemoteReservationCanceled &evt)
7967 {
7968 PG *pg = context< RecoveryMachine >().pg;
7969 pg->clear_reserved_num_bytes();
7970 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7971 return transit<RepNotRecovering>();
7972 }
7973
7974 /*---RepRecovering-------*/
7975 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
7976 : my_base(ctx),
7977 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
7978 {
7979 context< RecoveryMachine >().log_enter(state_name);
7980 }
7981
7982 boost::statechart::result
7983 PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
7984 {
7985 PG *pg = context< RecoveryMachine >().pg;
7986 pg->clear_reserved_num_bytes();
7987 pg->osd->send_message_osd_cluster(
7988 pg->primary.osd,
7989 new MRecoveryReserve(
7990 MRecoveryReserve::REVOKE,
7991 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7992 pg->get_osdmap_epoch()),
7993 pg->get_osdmap_epoch());
7994 return discard_event();
7995 }
7996
7997 boost::statechart::result
7998 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
7999 {
8000 PG *pg = context< RecoveryMachine >().pg;
8001 pg->clear_reserved_num_bytes();
8002 pg->osd->send_message_osd_cluster(
8003 pg->primary.osd,
8004 new MBackfillReserve(
8005 MBackfillReserve::TOOFULL,
8006 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8007 pg->get_osdmap_epoch()),
8008 pg->get_osdmap_epoch());
8009 return discard_event();
8010 }
8011
8012 boost::statechart::result
8013 PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
8014 {
8015 PG *pg = context< RecoveryMachine >().pg;
8016 pg->clear_reserved_num_bytes();
8017 pg->osd->send_message_osd_cluster(
8018 pg->primary.osd,
8019 new MBackfillReserve(
8020 MBackfillReserve::REVOKE,
8021 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8022 pg->get_osdmap_epoch()),
8023 pg->get_osdmap_epoch());
8024 return discard_event();
8025 }
8026
8027 void PG::RecoveryState::RepRecovering::exit()
8028 {
8029 context< RecoveryMachine >().log_exit(state_name, enter_time);
8030 PG *pg = context< RecoveryMachine >().pg;
8031 pg->clear_reserved_num_bytes();
8032 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8033 utime_t dur = ceph_clock_now() - enter_time;
8034 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
8035 }
8036
8037 /*------Activating--------*/
8038 PG::RecoveryState::Activating::Activating(my_context ctx)
8039 : my_base(ctx),
8040 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
8041 {
8042 context< RecoveryMachine >().log_enter(state_name);
8043 }
8044
8045 void PG::RecoveryState::Activating::exit()
8046 {
8047 context< RecoveryMachine >().log_exit(state_name, enter_time);
8048 PG *pg = context< RecoveryMachine >().pg;
8049 utime_t dur = ceph_clock_now() - enter_time;
8050 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
8051 }
8052
8053 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
8054 : my_base(ctx),
8055 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
8056 {
8057 context< RecoveryMachine >().log_enter(state_name);
8058 PG *pg = context< RecoveryMachine >().pg;
8059
8060   // Make sure all nodes that are part of the recovery aren't full
8061 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
8062 pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) {
8063 post_event(RecoveryTooFull());
8064 return;
8065 }
8066
8067 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8068 pg->state_set(PG_STATE_RECOVERY_WAIT);
8069 pg->osd->local_reserver.request_reservation(
8070 pg->info.pgid,
8071 new QueuePeeringEvt<LocalRecoveryReserved>(
8072 pg, pg->get_osdmap_epoch(),
8073 LocalRecoveryReserved()),
8074 pg->get_recovery_priority(),
8075 new QueuePeeringEvt<DeferRecovery>(
8076 pg, pg->get_osdmap_epoch(),
8077 DeferRecovery(0.0)));
8078 pg->publish_stats_to_osd();
8079 }
8080
8081 boost::statechart::result
8082 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
8083 {
8084 PG *pg = context< RecoveryMachine >().pg;
8085 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
8086 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
8087 return transit<NotRecovering>();
8088 }
8089
8090 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
8091 {
8092 context< RecoveryMachine >().log_exit(state_name, enter_time);
8093 PG *pg = context< RecoveryMachine >().pg;
8094 utime_t dur = ceph_clock_now() - enter_time;
8095 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
8096 }
8097
8098 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
8099 : my_base(ctx),
8100 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
8101 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
8102 {
8103 context< RecoveryMachine >().log_enter(state_name);
8104 post_event(RemoteRecoveryReserved());
8105 }
8106
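// As with backfill reservations above: each RemoteRecoveryReserved event
// sends a recovery reservation REQUEST to the next shard in
// remote_shards_to_reserve_recovery, and AllRemotesReserved is posted when
// the list is exhausted.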
8107 boost::statechart::result
8108 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
8109 PG *pg = context< RecoveryMachine >().pg;
8110
8111 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
8112 ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami);
8113 ConnectionRef con = pg->osd->get_con_osd_cluster(
8114 remote_recovery_reservation_it->osd, pg->get_osdmap_epoch());
8115 if (con) {
8116 pg->osd->send_message_osd_cluster(
8117 new MRecoveryReserve(
8118 MRecoveryReserve::REQUEST,
8119 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
8120 pg->get_osdmap_epoch(),
8121 pg->get_recovery_priority()),
8122 con.get());
8123 }
8124 ++remote_recovery_reservation_it;
8125 } else {
8126 post_event(AllRemotesReserved());
8127 }
8128 return discard_event();
8129 }
8130
8131 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
8132 {
8133 context< RecoveryMachine >().log_exit(state_name, enter_time);
8134 PG *pg = context< RecoveryMachine >().pg;
8135 utime_t dur = ceph_clock_now() - enter_time;
8136 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
8137 }
8138
8139 PG::RecoveryState::Recovering::Recovering(my_context ctx)
8140 : my_base(ctx),
8141 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
8142 {
8143 context< RecoveryMachine >().log_enter(state_name);
8144
8145 PG *pg = context< RecoveryMachine >().pg;
8146 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8147 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8148 pg->state_set(PG_STATE_RECOVERING);
8149 ceph_assert(!pg->state_test(PG_STATE_ACTIVATING));
8150 pg->publish_stats_to_osd();
8151 pg->queue_recovery();
8152 }
8153
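// release_reservations: send a RELEASE to every remote shard we reserved
// for recovery (skipping ourselves).  'cancel' indicates the reservations
// are being dropped before recovery actually finished, so missing objects
// may remain.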
8154 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
8155 {
8156 PG *pg = context< RecoveryMachine >().pg;
8157 ceph_assert(cancel || !pg->pg_log.get_missing().have_missing());
8158
8159 // release remote reservations
8160 for (set<pg_shard_t>::const_iterator i =
8161 context< Active >().remote_shards_to_reserve_recovery.begin();
8162 i != context< Active >().remote_shards_to_reserve_recovery.end();
8163 ++i) {
8164 if (*i == pg->pg_whoami) // skip myself
8165 continue;
8166 ConnectionRef con = pg->osd->get_con_osd_cluster(
8167 i->osd, pg->get_osdmap_epoch());
8168 if (con) {
8169 pg->osd->send_message_osd_cluster(
8170 new MRecoveryReserve(
8171 MRecoveryReserve::RELEASE,
8172 spg_t(pg->info.pgid.pgid, i->shard),
8173 pg->get_osdmap_epoch()),
8174 con.get());
8175 }
8176 }
8177 }
8178
8179 boost::statechart::result
8180 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
8181 {
8182 PG *pg = context< RecoveryMachine >().pg;
8183 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8184 release_reservations();
8185 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8186 return transit<Recovered>();
8187 }
8188
8189 boost::statechart::result
8190 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
8191 {
8192 PG *pg = context< RecoveryMachine >().pg;
8193 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8194 release_reservations();
8195 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8196 // XXX: Is this needed?
8197 pg->publish_stats_to_osd();
8198 return transit<WaitLocalBackfillReserved>();
8199 }
8200
8201 boost::statechart::result
8202 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
8203 {
8204 PG *pg = context< RecoveryMachine >().pg;
8205 if (!pg->state_test(PG_STATE_RECOVERING)) {
8206 // we may have finished recovery and have an AllReplicasRecovered
8207 // event queued to move us to the next state.
8208 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
8209 return discard_event();
8210 }
8211 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
8212 pg->state_set(PG_STATE_RECOVERY_WAIT);
8213 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8214 release_reservations(true);
8215 pg->schedule_recovery_retry(evt.delay);
8216 return transit<NotRecovering>();
8217 }
8218
8219 boost::statechart::result
8220 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
8221 {
8222 PG *pg = context< RecoveryMachine >().pg;
8223 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
8224 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
8225 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8226 release_reservations(true);
8227 return transit<NotRecovering>();
8228 }
8229
8230 void PG::RecoveryState::Recovering::exit()
8231 {
8232 context< RecoveryMachine >().log_exit(state_name, enter_time);
8233 PG *pg = context< RecoveryMachine >().pg;
8234 utime_t dur = ceph_clock_now() - enter_time;
8235 pg->state_clear(PG_STATE_RECOVERING);
8236 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
8237 }
8238
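// Recovered: recovery (and any backfill) has completed.  If the acting set
// is no longer undersized, clear the forced recovery/backfill flags;
// re-evaluate the acting set (e.g. now that backfill has completed) and
// post GoClean once all replicas have activated and no async recovery
// targets remain.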
8239 PG::RecoveryState::Recovered::Recovered(my_context ctx)
8240 : my_base(ctx),
8241 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
8242 {
8243 pg_shard_t auth_log_shard;
8244
8245 context< RecoveryMachine >().log_enter(state_name);
8246
8247 PG *pg = context< RecoveryMachine >().pg;
8248
8249 ceph_assert(!pg->needs_recovery());
8250
8251 // if we finished backfill, all acting are active; recheck if
8252 // DEGRADED | UNDERSIZED is appropriate.
8253 ceph_assert(!pg->acting_recovery_backfill.empty());
8254 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
8255 pg->acting_recovery_backfill.size()) {
8256 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
8257 pg->publish_stats_to_osd();
8258 }
8259
8260 // adjust acting set? (e.g. because backfill completed...)
8261 bool history_les_bound = false;
8262 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
8263 true, &history_les_bound)) {
8264 ceph_assert(pg->want_acting.size());
8265 } else if (!pg->async_recovery_targets.empty()) {
8266 pg->choose_acting(auth_log_shard, true, &history_les_bound);
8267 }
8268
8269 if (context< Active >().all_replicas_activated &&
8270 pg->async_recovery_targets.empty())
8271 post_event(GoClean());
8272 }
8273
8274 void PG::RecoveryState::Recovered::exit()
8275 {
8276 context< RecoveryMachine >().log_exit(state_name, enter_time);
8277 PG *pg = context< RecoveryMachine >().pg;
8278 utime_t dur = ceph_clock_now() - enter_time;
8279 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
8280 }
8281
8282 PG::RecoveryState::Clean::Clean(my_context ctx)
8283 : my_base(ctx),
8284 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
8285 {
8286 context< RecoveryMachine >().log_enter(state_name);
8287
8288 PG *pg = context< RecoveryMachine >().pg;
8289
8290 if (pg->info.last_complete != pg->info.last_update) {
8291 ceph_abort();
8292 }
8293 Context *c = pg->finish_recovery();
8294 context< RecoveryMachine >().get_cur_transaction()->register_on_commit(c);
8295
8296 pg->try_mark_clean();
8297 }
8298
8299 void PG::RecoveryState::Clean::exit()
8300 {
8301 context< RecoveryMachine >().log_exit(state_name, enter_time);
8302 PG *pg = context< RecoveryMachine >().pg;
8303 pg->state_clear(PG_STATE_CLEAN);
8304 utime_t dur = ceph_clock_now() - enter_time;
8305 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
8306 }
8307
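// Build a set of shards containing at most one shard per OSD, skipping
// 'skip' (normally ourselves); used below so reservations are only
// requested once per remote OSD even when it hosts several shards of the PG.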
8308 template <typename T>
8309 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
8310 {
8311 set<int> osds_found;
8312 set<pg_shard_t> out;
8313 for (typename T::const_iterator i = in.begin();
8314 i != in.end();
8315 ++i) {
8316 if (*i != skip && !osds_found.count(i->osd)) {
8317 osds_found.insert(i->osd);
8318 out.insert(*i);
8319 }
8320 }
8321 return out;
8322 }
8323
8324 /*---------Active---------*/
8325 PG::RecoveryState::Active::Active(my_context ctx)
8326 : my_base(ctx),
8327 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
8328 remote_shards_to_reserve_recovery(
8329 unique_osd_shard_set(
8330 context< RecoveryMachine >().pg->pg_whoami,
8331 context< RecoveryMachine >().pg->acting_recovery_backfill)),
8332 remote_shards_to_reserve_backfill(
8333 unique_osd_shard_set(
8334 context< RecoveryMachine >().pg->pg_whoami,
8335 context< RecoveryMachine >().pg->backfill_targets)),
8336 all_replicas_activated(false)
8337 {
8338 context< RecoveryMachine >().log_enter(state_name);
8339
8340 PG *pg = context< RecoveryMachine >().pg;
8341
8342 ceph_assert(!pg->backfill_reserving);
8343 ceph_assert(!pg->backfill_reserved);
8344 ceph_assert(pg->is_primary());
8345 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
8346 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8347 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8348 pg->get_osdmap_epoch(),
8349 *context< RecoveryMachine >().get_query_map(),
8350 context< RecoveryMachine >().get_info_map(),
8351 context< RecoveryMachine >().get_recovery_ctx());
8352
8353 // everyone has to commit/ack before we are truly active
8354 pg->blocked_by.clear();
8355 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
8356 p != pg->acting_recovery_backfill.end();
8357 ++p) {
8358 if (p->shard != pg->pg_whoami.shard) {
8359 pg->blocked_by.insert(p->shard);
8360 }
8361 }
8362 pg->publish_stats_to_osd();
8363 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8364 }
8365
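// Active on a new OSDMap: bail out early (forwarding the event) if peering
// must restart; otherwise fold newly removed/purged snaps into snap_trimq
// and purged_snaps (mimic+ maps), react to pool size changes by toggling
// UNDERSIZED, and republish stats when anything relevant changed or they
// have not been reported for a long time.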
8366 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
8367 {
8368 PG *pg = context< RecoveryMachine >().pg;
8369 if (pg->should_restart_peering(
8370 advmap.up_primary,
8371 advmap.acting_primary,
8372 advmap.newup,
8373 advmap.newacting,
8374 advmap.lastmap,
8375 advmap.osdmap)) {
8376 ldout(pg->cct, 10) << "Active advmap interval change, fast return" << dendl;
8377 return forward_event();
8378 }
8379 ldout(pg->cct, 10) << "Active advmap" << dendl;
8380 bool need_publish = false;
8381
8382 if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) {
8383 const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps();
8384 auto i = new_removed_snaps.find(pg->info.pgid.pool());
8385 if (i != new_removed_snaps.end()) {
8386 bool bad = false;
8387 for (auto j : i->second) {
8388 if (pg->snap_trimq.intersects(j.first, j.second)) {
8389 decltype(pg->snap_trimq) added, overlap;
8390 added.insert(j.first, j.second);
8391 overlap.intersection_of(pg->snap_trimq, added);
8392 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8393 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8394 << overlap << ", but this is the first mimic+ osdmap,"
8395 << " so it's expected" << dendl;
8396 } else {
8397 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8398 << overlap << dendl;
8399 bad = true;
8400 }
8401 pg->snap_trimq.union_of(added);
8402 } else {
8403 pg->snap_trimq.insert(j.first, j.second);
8404 }
8405 }
8406 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8407 // at upgrade, we report *all* previously removed snaps as removed in
8408 // the first mimic epoch. remove the ones we previously divined were
8409 // removed (and subsequently purged) from the trimq.
8410 lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps"
8411 << " from new removed_snaps" << dendl;
8412 pg->snap_trimq.subtract(pg->info.purged_snaps);
8413 }
8414 ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second
8415 << ", snap_trimq now " << pg->snap_trimq << dendl;
8416 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8417 pg->dirty_info = true;
8418 pg->dirty_big_info = true;
8419 }
8420
8421 const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps();
8422 auto j = new_purged_snaps.find(pg->info.pgid.pool());
8423 if (j != new_purged_snaps.end()) {
8424 bool bad = false;
8425 for (auto k : j->second) {
8426 if (!pg->info.purged_snaps.contains(k.first, k.second)) {
8427 decltype(pg->info.purged_snaps) rm, overlap;
8428 rm.insert(k.first, k.second);
8429 overlap.intersection_of(pg->info.purged_snaps, rm);
8430 lderr(pg->cct) << __func__ << " purged_snaps does not contain "
8431 << rm << ", only " << overlap << dendl;
8432 pg->info.purged_snaps.subtract(overlap);
8433 // This can currently happen in the normal (if unlikely) course of
8434 // events. Because adding snaps to purged_snaps does not increase
8435 // the pg version or add a pg log entry, we don't reliably propagate
8436 // purged_snaps additions to other OSDs.
8437 // One example:
8438 // - purge S
8439 // - primary and replicas update purged_snaps
8440 // - no object updates
8441 // - pg mapping changes, new primary on different node
8442 // - new primary pg version == eversion_t(), so info is not
8443 // propagated.
8444 //bad = true;
8445 } else {
8446 pg->info.purged_snaps.erase(k.first, k.second);
8447 }
8448 }
8449 ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second
8450 << ", now " << pg->info.purged_snaps << dendl;
8451 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8452 pg->dirty_info = true;
8453 pg->dirty_big_info = true;
8454 }
8455 if (pg->dirty_big_info) {
8456 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
8457 // purged snaps and (b) perhaps share more snaps that we have purged
8458 // but didn't fit in pg_stat_t.
8459 need_publish = true;
8460 pg->share_pg_info();
8461 }
8462 } else if (!pg->pool.newly_removed_snaps.empty()) {
8463 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
8464 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
8465 pg->dirty_info = true;
8466 pg->dirty_big_info = true;
8467 }
8468
8469 for (size_t i = 0; i < pg->want_acting.size(); i++) {
8470 int osd = pg->want_acting[i];
8471 if (!advmap.osdmap->is_up(osd)) {
8472 pg_shard_t osd_with_shard(osd, shard_id_t(i));
8473 ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
8474 }
8475 }
8476
8477 /* Check for changes in pool size (if the acting set changed as a result,
8478 * this does not matter) */
8479 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
8480 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
8481 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
8482 pg->state_clear(PG_STATE_UNDERSIZED);
8483 } else {
8484 pg->state_set(PG_STATE_UNDERSIZED);
8485 }
8486     // degraded changes will be detected by the call to publish_stats_to_osd()
8487 need_publish = true;
8488 }
8489
8490 // if we haven't reported our PG stats in a long time, do so now.
8491 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
8492 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
8493 << " epochs" << dendl;
8494 need_publish = true;
8495 }
8496
8497 if (need_publish)
8498 pg->publish_stats_to_osd();
8499
8500 return forward_event();
8501 }
8502
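// Periodic ActMap processing while active: re-query peers for unfound
// objects, warn (via clog) when unfound objects appear lost, kick the snap
// trimmer, and, while peered but not yet clean, queue recovery unless the
// nobackfill/norebalance flags forbid it.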
8503 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
8504 {
8505 PG *pg = context< RecoveryMachine >().pg;
8506 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
8507 ceph_assert(pg->is_primary());
8508
8509 if (pg->have_unfound()) {
8510 // object may have become unfound
8511 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8512 }
8513
8514 if (pg->cct->_conf->osd_check_for_log_corruption)
8515 pg->check_log_for_corruption(pg->osd->store);
8516
8517 uint64_t unfound = pg->missing_loc.num_unfound();
8518 if (unfound > 0 &&
8519 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
8520 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
8521 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
8522 << " objects unfound and apparently lost, would automatically "
8523 << "mark these objects lost but this feature is not yet implemented "
8524 << "(osd_auto_mark_unfound_lost)";
8525 } else
8526 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
8527 << unfound << " objects unfound and apparently lost";
8528 }
8529
8530 if (pg->is_active()) {
8531 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
8532 pg->kick_snap_trim();
8533 }
8534
8535 if (pg->is_peered() &&
8536 !pg->is_clean() &&
8537 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
8538 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
8539 pg->queue_recovery();
8540 }
8541 return forward_event();
8542 }
8543
8544 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
8545 {
8546 PG *pg = context< RecoveryMachine >().pg;
8547 ceph_assert(pg->is_primary());
8548 if (pg->peer_info.count(notevt.from)) {
8549 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8550 << ", already have info from that osd, ignoring"
8551 << dendl;
8552 } else if (pg->peer_purged.count(notevt.from)) {
8553 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8554 << ", already purged that peer, ignoring"
8555 << dendl;
8556 } else {
8557 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8558 << ", calling proc_replica_info and discover_all_missing"
8559 << dendl;
8560 pg->proc_replica_info(
8561 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
8562 if (pg->have_unfound()) {
8563 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8564 }
8565 }
8566 return discard_event();
8567 }
8568
8569 boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim)
8570 {
8571 PG *pg = context< RecoveryMachine >().pg;
8572 ceph_assert(pg->is_primary());
8573
8574 // peer is informing us of their last_complete_ondisk
8575 ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
8576 pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to;
8577
8578 // trim log when the pg is recovered
8579 pg->calc_min_last_complete_ondisk();
8580 return discard_event();
8581 }
8582
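// An MInfoRec from an acting/recovery/backfill peer while Active means that
// peer has activated and committed. The primary records it in peer_activated
// and, once every member of acting_recovery_backfill has reported in, calls
// all_activated_and_committed() (see the inline comment below about not
// updating history early).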
8583 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
8584 {
8585 PG *pg = context< RecoveryMachine >().pg;
8586 ceph_assert(pg->is_primary());
8587
8588 ceph_assert(!pg->acting_recovery_backfill.empty());
8589 // don't update history (yet) if we are active and primary; the replica
8590 // may be telling us they have activated (and committed) but we can't
8591 // share that until _everyone_ does the same.
8592 if (pg->is_acting_recovery_backfill(infoevt.from) &&
8593 pg->peer_activated.count(infoevt.from) == 0) {
8594 ldout(pg->cct, 10) << " peer osd." << infoevt.from
8595 << " activated and committed" << dendl;
8596 pg->peer_activated.insert(infoevt.from);
8597 pg->blocked_by.erase(infoevt.from.shard);
8598 pg->publish_stats_to_osd();
8599 if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) {
8600 pg->all_activated_and_committed();
8601 }
8602 }
8603 return discard_event();
8604 }
8605
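// A log received from a replica while Active: fold it in via
// proc_replica_log(), then search_for_missing() to see whether this peer can
// supply objects we are missing. If it can and the PG is fully active,
// recovery is re-kicked immediately via DoRecovery.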
8606 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
8607 {
8608 PG *pg = context< RecoveryMachine >().pg;
8609 ldout(pg->cct, 10) << "searching osd." << logevt.from
8610 << " log for unfound items" << dendl;
8611 pg->proc_replica_log(
8612 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8613 bool got_missing = pg->search_for_missing(
8614 pg->peer_info[logevt.from],
8615 pg->peer_missing[logevt.from],
8616 logevt.from,
8617 context< RecoveryMachine >().get_recovery_ctx());
8618 // If there are missing AND we are "fully" active then start recovery now
8619 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
8620 post_event(DoRecovery());
8621 }
8622 return discard_event();
8623 }
8624
8625 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
8626 {
8627 PG *pg = context< RecoveryMachine >().pg;
8628
8629 q.f->open_object_section("state");
8630 q.f->dump_string("name", state_name);
8631 q.f->dump_stream("enter_time") << enter_time;
8632
8633 {
8634 q.f->open_array_section("might_have_unfound");
8635 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
8636 p != pg->might_have_unfound.end();
8637 ++p) {
8638 q.f->open_object_section("osd");
8639 q.f->dump_stream("osd") << *p;
8640 if (pg->peer_missing.count(*p)) {
8641 q.f->dump_string("status", "already probed");
8642 } else if (pg->peer_missing_requested.count(*p)) {
8643 q.f->dump_string("status", "querying");
8644 } else if (!pg->get_osdmap()->is_up(p->osd)) {
8645 q.f->dump_string("status", "osd is down");
8646 } else {
8647 q.f->dump_string("status", "not queried");
8648 }
8649 q.f->close_section();
8650 }
8651 q.f->close_section();
8652 }
8653 {
8654 q.f->open_object_section("recovery_progress");
8655 pg->dump_recovery_info(q.f);
8656 q.f->close_section();
8657 }
8658
8659 {
8660 q.f->open_object_section("scrub");
8661 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
8662 q.f->dump_bool("scrubber.active", pg->scrubber.active);
8663 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
8664 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
8665 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
8666 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
8667 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
8668 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
8669 {
8670 q.f->open_array_section("scrubber.waiting_on_whom");
8671 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
8672 p != pg->scrubber.waiting_on_whom.end();
8673 ++p) {
8674 q.f->dump_stream("shard") << *p;
8675 }
8676 q.f->close_section();
8677 }
8678 q.f->close_section();
8679 }
8680
8681 q.f->close_section();
8682 return forward_event();
8683 }
8684
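// All replicas have activated and committed. The PG becomes ACTIVE unless it
// must stay merely PEERED: either a merge of this PG is pending (PREMERGE;
// the merge is additionally stalled via set_not_ready_to_merge_* if the
// acting set is not at full size), or the acting set is smaller than the
// pool's min_size.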
8685 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
8686 {
8687 PG *pg = context< RecoveryMachine >().pg;
8688 pg_t pgid = pg->info.pgid.pgid;
8689
8690 all_replicas_activated = true;
8691
8692 pg->state_clear(PG_STATE_ACTIVATING);
8693 pg->state_clear(PG_STATE_CREATING);
8694 pg->state_clear(PG_STATE_PREMERGE);
8695
8696 bool merge_target;
8697 if (pg->pool.info.is_pending_merge(pgid, &merge_target)) {
8698 pg->state_set(PG_STATE_PEERED);
8699 pg->state_set(PG_STATE_PREMERGE);
8700
8701 if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) {
8702 if (merge_target) {
8703 pg_t src = pgid;
8704 src.set_ps(pg->pool.info.get_pg_num_pending());
8705 ceph_assert(src.get_parent() == pgid);
8706 pg->osd->set_not_ready_to_merge_target(pgid, src);
8707 } else {
8708 pg->osd->set_not_ready_to_merge_source(pgid);
8709 }
8710 }
8711 } else if (pg->acting.size() < pg->pool.info.min_size) {
8712 pg->state_set(PG_STATE_PEERED);
8713 } else {
8714 pg->state_set(PG_STATE_ACTIVE);
8715 }
8716
8717 if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
8718 pg->osd->send_pg_created(pgid);
8719 }
8720
8721 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
8722 pg->info.history.last_interval_started = pg->info.last_interval_started;
8723 pg->dirty_info = true;
8724
8725 pg->share_pg_info();
8726 pg->publish_stats_to_osd();
8727
8728 pg->check_local();
8729
8730 // waiters
8731 if (pg->flushes_in_progress == 0) {
8732 pg->requeue_ops(pg->waiting_for_peered);
8733 } else if (!pg->waiting_for_peered.empty()) {
8734 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
8735 << pg->waiting_for_peered.size()
8736 << " items to waiting_for_flush"
8737 << dendl;
8738 ceph_assert(pg->waiting_for_flush.empty());
8739 pg->waiting_for_flush.swap(pg->waiting_for_peered);
8740 }
8741
8742 pg->on_activate();
8743
8744 return discard_event();
8745 }
8746
8747 void PG::RecoveryState::Active::exit()
8748 {
8749 context< RecoveryMachine >().log_exit(state_name, enter_time);
8750 PG *pg = context< RecoveryMachine >().pg;
8751 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8752
8753 pg->blocked_by.clear();
8754 pg->backfill_reserved = false;
8755 pg->backfill_reserving = false;
8756 pg->state_clear(PG_STATE_ACTIVATING);
8757 pg->state_clear(PG_STATE_DEGRADED);
8758 pg->state_clear(PG_STATE_UNDERSIZED);
8759 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
8760 pg->state_clear(PG_STATE_BACKFILL_WAIT);
8761 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8762 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8763 utime_t dur = ceph_clock_now() - enter_time;
8764 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
8765 pg->agent_stop();
8766 }
8767
8768 /*------ReplicaActive-----*/
8769 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
8770 : my_base(ctx),
8771 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
8772 {
8773 context< RecoveryMachine >().log_enter(state_name);
8774
8775 PG *pg = context< RecoveryMachine >().pg;
8776 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8777 }
8778
8779
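// Activate (posted locally after receiving authoritative info/log from the
// primary; see Stray below) carries the epoch at which to activate. On a
// replica the query map and info map outputs of activate() are unused, hence
// the throwaway query_map and NULL arguments.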
8780 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8781 const Activate& actevt) {
8782 PG *pg = context< RecoveryMachine >().pg;
8783 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
8784 map<int, map<spg_t, pg_query_t> > query_map;
8785 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8786 actevt.activation_epoch,
8787 query_map, NULL, NULL);
8788 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8789 return discard_event();
8790 }
8791
8792 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
8793 {
8794 PG *pg = context< RecoveryMachine >().pg;
8795 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
8796 infoevt.info);
8797 return discard_event();
8798 }
8799
8800 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
8801 {
8802 PG *pg = context< RecoveryMachine >().pg;
8803 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
8804 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8805 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
8806 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8807
8808 return discard_event();
8809 }
8810
8811 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim)
8812 {
8813 PG *pg = context< RecoveryMachine >().pg;
8814 // primary is instructing us to trim
8815 pg->pg_log.trim(trim.trim_to, pg->info);
8816 pg->dirty_info = true;
8817 return discard_event();
8818 }
8819
8820 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
8821 {
8822 PG *pg = context< RecoveryMachine >().pg;
8823 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8824 context< RecoveryMachine >().send_notify(
8825 pg->get_primary(),
8826 pg_notify_t(
8827 pg->get_primary().shard, pg->pg_whoami.shard,
8828 pg->get_osdmap_epoch(),
8829 pg->get_osdmap_epoch(),
8830 pg->info),
8831 pg->past_intervals);
8832 }
8833 pg->take_waiters();
8834 return discard_event();
8835 }
8836
8837 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8838 const MQuery& query)
8839 {
8840 PG *pg = context< RecoveryMachine >().pg;
8841 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8842 return discard_event();
8843 }
8844
8845 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
8846 {
8847 q.f->open_object_section("state");
8848 q.f->dump_string("name", state_name);
8849 q.f->dump_stream("enter_time") << enter_time;
8850 q.f->close_section();
8851 return forward_event();
8852 }
8853
8854 void PG::RecoveryState::ReplicaActive::exit()
8855 {
8856 context< RecoveryMachine >().log_exit(state_name, enter_time);
8857 PG *pg = context< RecoveryMachine >().pg;
8858 pg->clear_reserved_num_bytes();
8859 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8860 utime_t dur = ceph_clock_now() - enter_time;
8861 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
8862 }
8863
8864 /*-------Stray---*/
8865 PG::RecoveryState::Stray::Stray(my_context ctx)
8866 : my_base(ctx),
8867 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
8868 {
8869 context< RecoveryMachine >().log_enter(state_name);
8870
8871 PG *pg = context< RecoveryMachine >().pg;
8872 ceph_assert(!pg->is_peered());
8873 ceph_assert(!pg->is_peering());
8874 ceph_assert(!pg->is_primary());
8875
8876 if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
8877 ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
8878 post_event(DeleteStart());
8879 } else {
8880 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8881 }
8882 }
8883
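// info+log from the primary while Stray. If the primary set last_backfill to
// hobject_t() it is restarting backfill on us: adopt its info, claim its log
// outright and reset our backfill state; otherwise merge the log normally.
// Either way we then activate as a replica.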
8884 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
8885 {
8886 PG *pg = context< RecoveryMachine >().pg;
8887 MOSDPGLog *msg = logevt.msg.get();
8888 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
8889
8890 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8891 if (msg->info.last_backfill == hobject_t()) {
8892 // restart backfill
8893 pg->unreg_next_scrub();
8894 pg->info = msg->info;
8895 pg->reg_next_scrub();
8896 pg->dirty_info = true;
8897 pg->dirty_big_info = true; // maybe.
8898
8899 PGLogEntryHandler rollbacker{pg, t};
8900 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
8901
8902 pg->pg_log.reset_backfill();
8903 } else {
8904 pg->merge_log(*t, msg->info, msg->log, logevt.from);
8905 }
8906
8907 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8908
8909 post_event(Activate(logevt.msg->info.last_epoch_started));
8910 return transit<ReplicaActive>();
8911 }
8912
8913 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
8914 {
8915 PG *pg = context< RecoveryMachine >().pg;
8916 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
8917
8918 if (pg->info.last_update > infoevt.info.last_update) {
8919 // rewind divergent log entries
8920 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8921 pg->rewind_divergent_log(*t, infoevt.info.last_update);
8922 pg->info.stats = infoevt.info.stats;
8923 pg->info.hit_set = infoevt.info.hit_set;
8924 }
8925
8926 ceph_assert(infoevt.info.last_update == pg->info.last_update);
8927 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8928
8929 post_event(Activate(infoevt.info.last_epoch_started));
8930 return transit<ReplicaActive>();
8931 }
8932
8933 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
8934 {
8935 PG *pg = context< RecoveryMachine >().pg;
8936 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8937 return discard_event();
8938 }
8939
8940 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
8941 {
8942 PG *pg = context< RecoveryMachine >().pg;
8943 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8944 context< RecoveryMachine >().send_notify(
8945 pg->get_primary(),
8946 pg_notify_t(
8947 pg->get_primary().shard, pg->pg_whoami.shard,
8948 pg->get_osdmap_epoch(),
8949 pg->get_osdmap_epoch(),
8950 pg->info),
8951 pg->past_intervals);
8952 }
8953 pg->take_waiters();
8954 return discard_event();
8955 }
8956
8957 void PG::RecoveryState::Stray::exit()
8958 {
8959 context< RecoveryMachine >().log_exit(state_name, enter_time);
8960 PG *pg = context< RecoveryMachine >().pg;
8961 utime_t dur = ceph_clock_now() - enter_time;
8962 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
8963 }
8964
8965
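// PG deletion is throttled through the OSD's local_reserver: ToDelete asks
// for a delete reservation (WaitDeleteReserved) at get_delete_priority(),
// re-requesting if that priority changes on a later ActMap, and only once
// DeleteReserved is granted does the PG enter Deleting and start removing
// data via _delete_some().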
8966 /*--------ToDelete----------*/
8967 PG::RecoveryState::ToDelete::ToDelete(my_context ctx)
8968 : my_base(ctx),
8969 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete")
8970 {
8971 context< RecoveryMachine >().log_enter(state_name);
8972 PG *pg = context< RecoveryMachine >().pg;
8973 pg->osd->logger->inc(l_osd_pg_removing);
8974 }
8975
8976 void PG::RecoveryState::ToDelete::exit()
8977 {
8978 context< RecoveryMachine >().log_exit(state_name, enter_time);
8979 PG *pg = context< RecoveryMachine >().pg;
8980 // note: on a successful removal, this path doesn't execute. see
8981 // _delete_some().
8982 pg->osd->logger->dec(l_osd_pg_removing);
8983 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8984 }
8985
8986 /*----WaitDeleteReserved----*/
8987 PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
8988 : my_base(ctx),
8989 NamedState(context< RecoveryMachine >().pg,
8990 "Started/ToDelete/WaitDeleteReseved")
8991 {
8992 context< RecoveryMachine >().log_enter(state_name);
8993 PG *pg = context< RecoveryMachine >().pg;
8994 context<ToDelete>().priority = pg->get_delete_priority();
8995 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8996 pg->osd->local_reserver.request_reservation(
8997 pg->info.pgid,
8998 new QueuePeeringEvt<DeleteReserved>(
8999 pg, pg->get_osdmap_epoch(),
9000 DeleteReserved()),
9001 context<ToDelete>().priority,
9002 new QueuePeeringEvt<DeleteInterrupted>(
9003 pg, pg->get_osdmap_epoch(),
9004 DeleteInterrupted()));
9005 }
9006
9007 boost::statechart::result PG::RecoveryState::ToDelete::react(
9008 const ActMap& evt)
9009 {
9010 PG *pg = context< RecoveryMachine >().pg;
9011 if (pg->get_delete_priority() != priority) {
9012 ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
9013 << dendl;
9014 return transit<ToDelete>();
9015 }
9016 return discard_event();
9017 }
9018
9019 void PG::RecoveryState::WaitDeleteReserved::exit()
9020 {
9021 context< RecoveryMachine >().log_exit(state_name, enter_time);
9022 }
9023
9024 /*----Deleting-----*/
9025 PG::RecoveryState::Deleting::Deleting(my_context ctx)
9026 : my_base(ctx),
9027 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting")
9028 {
9029 context< RecoveryMachine >().log_enter(state_name);
9030 PG *pg = context< RecoveryMachine >().pg;
9031 pg->deleting = true;
9032 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9033 pg->on_removal(t);
9034 t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch()));
9035 }
9036
9037 boost::statechart::result PG::RecoveryState::Deleting::react(
9038 const DeleteSome& evt)
9039 {
9040 PG *pg = context< RecoveryMachine >().pg;
9041 pg->_delete_some(context<RecoveryMachine>().get_cur_transaction());
9042 return discard_event();
9043 }
9044
9045 void PG::RecoveryState::Deleting::exit()
9046 {
9047 context< RecoveryMachine >().log_exit(state_name, enter_time);
9048 PG *pg = context< RecoveryMachine >().pg;
9049 pg->deleting = false;
9050 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9051 }
9052
9053 /*--------GetInfo---------*/
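// GetInfo: build the prior set from past intervals and query pg_info_t from
// every probe target we do not already have info for. IsDown is posted if
// the prior set reports pg_down; GotInfo is posted once no info requests
// remain outstanding.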
9054 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
9055 : my_base(ctx),
9056 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
9057 {
9058 context< RecoveryMachine >().log_enter(state_name);
9059
9060 PG *pg = context< RecoveryMachine >().pg;
9061 pg->check_past_interval_bounds();
9062 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9063
9064 ceph_assert(pg->blocked_by.empty());
9065
9066 prior_set = pg->build_prior();
9067
9068 pg->reset_min_peer_features();
9069 get_infos();
9070 if (prior_set.pg_down) {
9071 post_event(IsDown());
9072 } else if (peer_info_requested.empty()) {
9073 post_event(GotInfo());
9074 }
9075 }
9076
9077 void PG::RecoveryState::GetInfo::get_infos()
9078 {
9079 PG *pg = context< RecoveryMachine >().pg;
9080 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9081
9082 pg->blocked_by.clear();
9083 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
9084 it != prior_set.probe.end();
9085 ++it) {
9086 pg_shard_t peer = *it;
9087 if (peer == pg->pg_whoami) {
9088 continue;
9089 }
9090 if (pg->peer_info.count(peer)) {
9091 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
9092 continue;
9093 }
9094 if (peer_info_requested.count(peer)) {
9095 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
9096 pg->blocked_by.insert(peer.osd);
9097 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
9098 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
9099 } else {
9100 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
9101 context< RecoveryMachine >().send_query(
9102 peer, pg_query_t(pg_query_t::INFO,
9103 it->shard, pg->pg_whoami.shard,
9104 pg->info.history,
9105 pg->get_osdmap_epoch()));
9106 peer_info_requested.insert(peer);
9107 pg->blocked_by.insert(peer.osd);
9108 }
9109 }
9110
9111 pg->publish_stats_to_osd();
9112 }
9113
9114 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
9115 {
9116 PG *pg = context< RecoveryMachine >().pg;
9117
9118 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
9119 if (p != peer_info_requested.end()) {
9120 peer_info_requested.erase(p);
9121 pg->blocked_by.erase(infoevt.from.osd);
9122 }
9123
9124 epoch_t old_start = pg->info.history.last_epoch_started;
9125 if (pg->proc_replica_info(
9126 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
9127 // we got something new ...
9128 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9129 if (old_start < pg->info.history.last_epoch_started) {
9130 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
9131 prior_set = pg->build_prior();
9132
9133 // filter out any osds that got dropped from the probe set from
9134 // peer_info_requested. this is less expensive than restarting
9135 // peering (which would re-probe everyone).
9136 set<pg_shard_t>::iterator p = peer_info_requested.begin();
9137 while (p != peer_info_requested.end()) {
9138 if (prior_set.probe.count(*p) == 0) {
9139 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
9140 peer_info_requested.erase(p++);
9141 } else {
9142 ++p;
9143 }
9144 }
9145 get_infos();
9146 }
9147 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
9148 << hex << infoevt.features << dec << dendl;
9149 pg->apply_peer_features(infoevt.features);
9150
9151 // are we done getting everything?
9152 if (peer_info_requested.empty() && !prior_set.pg_down) {
9153 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
9154 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
9155 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
9156 post_event(GotInfo());
9157 }
9158 }
9159 return discard_event();
9160 }
9161
9162 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
9163 {
9164 PG *pg = context< RecoveryMachine >().pg;
9165 q.f->open_object_section("state");
9166 q.f->dump_string("name", state_name);
9167 q.f->dump_stream("enter_time") << enter_time;
9168
9169 q.f->open_array_section("requested_info_from");
9170 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
9171 p != peer_info_requested.end();
9172 ++p) {
9173 q.f->open_object_section("osd");
9174 q.f->dump_stream("osd") << *p;
9175 if (pg->peer_info.count(*p)) {
9176 q.f->open_object_section("got_info");
9177 pg->peer_info[*p].dump(q.f);
9178 q.f->close_section();
9179 }
9180 q.f->close_section();
9181 }
9182 q.f->close_section();
9183
9184 q.f->close_section();
9185 return forward_event();
9186 }
9187
9188 void PG::RecoveryState::GetInfo::exit()
9189 {
9190 context< RecoveryMachine >().log_exit(state_name, enter_time);
9191 PG *pg = context< RecoveryMachine >().pg;
9192 utime_t dur = ceph_clock_now() - enter_time;
9193 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
9194 pg->blocked_by.clear();
9195 }
9196
9197 /*------GetLog------------*/
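// GetLog: choose_acting() selects the shard with the authoritative log
// (auth_log_shard). If that is us, we already have it; otherwise we ask that
// shard for log entries back to the oldest last_update of any
// acting/recovery/backfill peer that falls below our own log_tail (but still
// within the auth shard's log), presumably so the resulting log is long
// enough to later catch those peers up.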
9198 PG::RecoveryState::GetLog::GetLog(my_context ctx)
9199 : my_base(ctx),
9200 NamedState(
9201 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
9202 msg(0)
9203 {
9204 context< RecoveryMachine >().log_enter(state_name);
9205
9206 PG *pg = context< RecoveryMachine >().pg;
9207
9208 // adjust acting?
9209 if (!pg->choose_acting(auth_log_shard, false,
9210 &context< Peering >().history_les_bound)) {
9211 if (!pg->want_acting.empty()) {
9212 post_event(NeedActingChange());
9213 } else {
9214 post_event(IsIncomplete());
9215 }
9216 return;
9217 }
9218
9219 // am i the best?
9220 if (auth_log_shard == pg->pg_whoami) {
9221 post_event(GotLog());
9222 return;
9223 }
9224
9225 const pg_info_t& best = pg->peer_info[auth_log_shard];
9226
9227 // am i broken?
9228 if (pg->info.last_update < best.log_tail) {
9229 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
9230 post_event(IsIncomplete());
9231 return;
9232 }
9233
9234 // how much log to request?
9235 eversion_t request_log_from = pg->info.last_update;
9236 ceph_assert(!pg->acting_recovery_backfill.empty());
9237 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
9238 p != pg->acting_recovery_backfill.end();
9239 ++p) {
9240 if (*p == pg->pg_whoami) continue;
9241 pg_info_t& ri = pg->peer_info[*p];
9242 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
9243 ri.last_update < request_log_from)
9244 request_log_from = ri.last_update;
9245 }
9246
9247 // request the log from the authoritative shard
9248 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
9249 context<RecoveryMachine>().send_query(
9250 auth_log_shard,
9251 pg_query_t(
9252 pg_query_t::LOG,
9253 auth_log_shard.shard, pg->pg_whoami.shard,
9254 request_log_from, pg->info.history,
9255 pg->get_osdmap_epoch()));
9256
9257 ceph_assert(pg->blocked_by.empty());
9258 pg->blocked_by.insert(auth_log_shard.osd);
9259 pg->publish_stats_to_osd();
9260 }
9261
9262 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
9263 {
9264 PG *pg = context< RecoveryMachine >().pg;
9265 // make sure our log source didn't go down. we need to check
9266 // explicitly because it may not be part of the prior set, which
9267 // means the Peering state check won't catch it going down.
9268 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
9269 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
9270 << auth_log_shard.osd << " went down" << dendl;
9271 post_event(advmap);
9272 return transit< Reset >();
9273 }
9274
9275 // let the Peering state do its checks.
9276 return forward_event();
9277 }
9278
9279 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
9280 {
9281 PG *pg = context< RecoveryMachine >().pg;
9282 ceph_assert(!msg);
9283 if (logevt.from != auth_log_shard) {
9284 ldout(pg->cct, 10) << "GetLog: discarding log from "
9285 << "non-auth_log_shard osd." << logevt.from << dendl;
9286 return discard_event();
9287 }
9288 ldout(pg->cct, 10) << "GetLog: received master log from osd."
9289 << logevt.from << dendl;
9290 msg = logevt.msg;
9291 post_event(GotLog());
9292 return discard_event();
9293 }
9294
9295 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
9296 {
9297 PG *pg = context< RecoveryMachine >().pg;
9298 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
9299 if (msg) {
9300 ldout(pg->cct, 10) << "processing master log" << dendl;
9301 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
9302 msg->info, msg->log, msg->missing,
9303 auth_log_shard);
9304 }
9305 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9306 return transit< GetMissing >();
9307 }
9308
9309 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
9310 {
9311 q.f->open_object_section("state");
9312 q.f->dump_string("name", state_name);
9313 q.f->dump_stream("enter_time") << enter_time;
9314 q.f->dump_stream("auth_log_shard") << auth_log_shard;
9315 q.f->close_section();
9316 return forward_event();
9317 }
9318
9319 void PG::RecoveryState::GetLog::exit()
9320 {
9321 context< RecoveryMachine >().log_exit(state_name, enter_time);
9322 PG *pg = context< RecoveryMachine >().pg;
9323 utime_t dur = ceph_clock_now() - enter_time;
9324 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
9325 pg->blocked_by.clear();
9326 }
9327
9328 /*------WaitActingChange--------*/
9329 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
9330 : my_base(ctx),
9331 NamedState(context< RecoveryMachine >().pg, "Started/Primary/WaitActingChange")
9332 {
9333 context< RecoveryMachine >().log_enter(state_name);
9334 }
9335
9336 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
9337 {
9338 PG *pg = context< RecoveryMachine >().pg;
9339 OSDMapRef osdmap = advmap.osdmap;
9340
9341 ldout(pg->cct, 10) << "verifying want_acting " << pg->want_acting << " targets didn't go down" << dendl;
9342 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
9343 if (!osdmap->is_up(*p)) {
9344 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
9345 post_event(advmap);
9346 return transit< Reset >();
9347 }
9348 }
9349 return forward_event();
9350 }
9351
9352 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
9353 {
9354 PG *pg = context< RecoveryMachine >().pg;
9355 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
9356 return discard_event();
9357 }
9358
9359 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
9360 {
9361 PG *pg = context< RecoveryMachine >().pg;
9362 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
9363 return discard_event();
9364 }
9365
9366 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
9367 {
9368 PG *pg = context< RecoveryMachine >().pg;
9369 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
9370 return discard_event();
9371 }
9372
9373 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
9374 {
9375 q.f->open_object_section("state");
9376 q.f->dump_string("name", state_name);
9377 q.f->dump_stream("enter_time") << enter_time;
9378 q.f->dump_string("comment", "waiting for pg acting set to change");
9379 q.f->close_section();
9380 return forward_event();
9381 }
9382
9383 void PG::RecoveryState::WaitActingChange::exit()
9384 {
9385 context< RecoveryMachine >().log_exit(state_name, enter_time);
9386 PG *pg = context< RecoveryMachine >().pg;
9387 utime_t dur = ceph_clock_now() - enter_time;
9388 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
9389 }
9390
9391 /*------Down--------*/
9392 PG::RecoveryState::Down::Down(my_context ctx)
9393 : my_base(ctx),
9394 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
9395 {
9396 context< RecoveryMachine >().log_enter(state_name);
9397 PG *pg = context< RecoveryMachine >().pg;
9398
9399 pg->state_clear(PG_STATE_PEERING);
9400 pg->state_set(PG_STATE_DOWN);
9401
9402 auto &prior_set = context< Peering >().prior_set;
9403 ceph_assert(pg->blocked_by.empty());
9404 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9405 pg->publish_stats_to_osd();
9406 }
9407
9408 void PG::RecoveryState::Down::exit()
9409 {
9410 context< RecoveryMachine >().log_exit(state_name, enter_time);
9411 PG *pg = context< RecoveryMachine >().pg;
9412
9413 pg->state_clear(PG_STATE_DOWN);
9414 utime_t dur = ceph_clock_now() - enter_time;
9415 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
9416
9417 pg->blocked_by.clear();
9418 }
9419
9420 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
9421 {
9422 q.f->open_object_section("state");
9423 q.f->dump_string("name", state_name);
9424 q.f->dump_stream("enter_time") << enter_time;
9425 q.f->dump_string("comment",
9426 "not enough up instances of this PG to go active");
9427 q.f->close_section();
9428 return forward_event();
9429 }
9430
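// While Down, a notify from an OSD we have no info for, and which has been up
// continuously since it sent the notify, may carry a newer
// last_epoch_started; if history moves forward the PG drops DOWN and
// re-enters GetInfo to retry peering.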
9431 boost::statechart::result PG::RecoveryState::Down::react(const MNotifyRec& infoevt)
9432 {
9433 PG *pg = context< RecoveryMachine >().pg;
9434
9435 ceph_assert(pg->is_primary());
9436 epoch_t old_start = pg->info.history.last_epoch_started;
9437 if (!pg->peer_info.count(infoevt.from) &&
9438 pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
9439 pg->update_history(infoevt.notify.info.history);
9440 }
9441 // if we got something new that lets the pg escape the down state
9442 if (pg->info.history.last_epoch_started > old_start) {
9443 ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
9444 pg->state_clear(PG_STATE_DOWN);
9445 pg->state_set(PG_STATE_PEERING);
9446 return transit< GetInfo >();
9447 }
9448
9449 return discard_event();
9450 }
9451
9452
9453 /*------Incomplete--------*/
9454 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
9455 : my_base(ctx),
9456 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
9457 {
9458 context< RecoveryMachine >().log_enter(state_name);
9459 PG *pg = context< RecoveryMachine >().pg;
9460
9461 pg->state_clear(PG_STATE_PEERING);
9462 pg->state_set(PG_STATE_INCOMPLETE);
9463
9464 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9465 ceph_assert(pg->blocked_by.empty());
9466 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9467 pg->publish_stats_to_osd();
9468 }
9469
9470 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
9471 PG *pg = context< RecoveryMachine >().pg;
9472 int64_t poolnum = pg->info.pgid.pool();
9473
9474 // Reset if min_size turned smaller than the previous value; the pg might now be able to go active
9475 if (!advmap.osdmap->have_pg_pool(poolnum) ||
9476 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
9477 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
9478 post_event(advmap);
9479 return transit< Reset >();
9480 }
9481
9482 return forward_event();
9483 }
9484
9485 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
9486 PG *pg = context< RecoveryMachine >().pg;
9487 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
9488 if (pg->proc_replica_info(
9489 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
9490 // We got something new, try again!
9491 return transit< GetLog >();
9492 } else {
9493 return discard_event();
9494 }
9495 }
9496
9497 boost::statechart::result PG::RecoveryState::Incomplete::react(
9498 const QueryState& q)
9499 {
9500 q.f->open_object_section("state");
9501 q.f->dump_string("name", state_name);
9502 q.f->dump_stream("enter_time") << enter_time;
9503 q.f->dump_string("comment", "not enough complete instances of this PG");
9504 q.f->close_section();
9505 return forward_event();
9506 }
9507
9508 void PG::RecoveryState::Incomplete::exit()
9509 {
9510 context< RecoveryMachine >().log_exit(state_name, enter_time);
9511 PG *pg = context< RecoveryMachine >().pg;
9512
9513 pg->state_clear(PG_STATE_INCOMPLETE);
9514 utime_t dur = ceph_clock_now() - enter_time;
9515 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
9516
9517 pg->blocked_by.clear();
9518 }
9519
9520 /*------GetMissing--------*/
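// GetMissing: for every acting/recovery/backfill peer (other than the
// primary), decide whether its missing set can be inferred locally (empty PG,
// a backfill restart because the log is not contiguous, a full backfill, or
// an identical log) or must be fetched with a LOG/FULLLOG query starting at
// the peer's last_epoch_started. When nothing is outstanding, either wait for
// up_thru or post Activate.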
9521 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
9522 : my_base(ctx),
9523 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
9524 {
9525 context< RecoveryMachine >().log_enter(state_name);
9526
9527 PG *pg = context< RecoveryMachine >().pg;
9528 ceph_assert(!pg->acting_recovery_backfill.empty());
9529 eversion_t since;
9530 for (set<pg_shard_t>::iterator i = pg->acting_recovery_backfill.begin();
9531 i != pg->acting_recovery_backfill.end();
9532 ++i) {
9533 if (*i == pg->get_primary()) continue;
9534 const pg_info_t& pi = pg->peer_info[*i];
9535 // reset this to make sure the pg_missing_t is initialized and
9536 // has the correct semantics even if we don't need to get a
9537 // missing set from a shard. This way later additions due to
9538 // lost+unfound delete work properly.
9539 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
9540
9541 if (pi.is_empty())
9542 continue; // no pg data, nothing divergent
9543
9544 if (pi.last_update < pg->pg_log.get_tail()) {
9545 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
9546 pg->peer_missing[*i].clear();
9547 continue;
9548 }
9549 if (pi.last_backfill == hobject_t()) {
9550 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
9551 pg->peer_missing[*i].clear();
9552 continue;
9553 }
9554
9555 if (pi.last_update == pi.last_complete && // peer has no missing
9556 pi.last_update == pg->info.last_update) { // peer is up to date
9557 // replica has no missing and identical log as us. no need to
9558 // pull anything.
9559 // FIXME: we can do better here. if last_update==last_complete we
9560 // can infer the rest!
9561 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
9562 pg->peer_missing[*i].clear();
9563 continue;
9564 }
9565
9566 // We pull the log from the peer's last_epoch_started to ensure we
9567 // get enough log to detect divergent updates.
9568 since.epoch = pi.last_epoch_started;
9569 ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
9570 if (pi.log_tail <= since) {
9571 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
9572 context< RecoveryMachine >().send_query(
9573 *i,
9574 pg_query_t(
9575 pg_query_t::LOG,
9576 i->shard, pg->pg_whoami.shard,
9577 since, pg->info.history,
9578 pg->get_osdmap_epoch()));
9579 } else {
9580 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
9581 << " (want since " << since << " < log.tail "
9582 << pi.log_tail << ")" << dendl;
9583 context< RecoveryMachine >().send_query(
9584 *i, pg_query_t(
9585 pg_query_t::FULLLOG,
9586 i->shard, pg->pg_whoami.shard,
9587 pg->info.history, pg->get_osdmap_epoch()));
9588 }
9589 peer_missing_requested.insert(*i);
9590 pg->blocked_by.insert(i->osd);
9591 }
9592
9593 if (peer_missing_requested.empty()) {
9594 if (pg->need_up_thru) {
9595 ldout(pg->cct, 10) << " still need up_thru update before going active"
9596 << dendl;
9597 post_event(NeedUpThru());
9598 return;
9599 }
9600
9601 // all good!
9602 post_event(Activate(pg->get_osdmap_epoch()));
9603 } else {
9604 pg->publish_stats_to_osd();
9605 }
9606 }
9607
9608 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
9609 {
9610 PG *pg = context< RecoveryMachine >().pg;
9611
9612 peer_missing_requested.erase(logevt.from);
9613 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
9614
9615 if (peer_missing_requested.empty()) {
9616 if (pg->need_up_thru) {
9617 ldout(pg->cct, 10) << " still need up_thru update before going active"
9618 << dendl;
9619 post_event(NeedUpThru());
9620 } else {
9621 ldout(pg->cct, 10) << "Got last missing, don't need up_thru, "
9622 << "posting Activate" << dendl;
9623 post_event(Activate(pg->get_osdmap_epoch()));
9624 }
9625 }
9626 return discard_event();
9627 }
9628
9629 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
9630 {
9631 PG *pg = context< RecoveryMachine >().pg;
9632 q.f->open_object_section("state");
9633 q.f->dump_string("name", state_name);
9634 q.f->dump_stream("enter_time") << enter_time;
9635
9636 q.f->open_array_section("peer_missing_requested");
9637 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
9638 p != peer_missing_requested.end();
9639 ++p) {
9640 q.f->open_object_section("osd");
9641 q.f->dump_stream("osd") << *p;
9642 if (pg->peer_missing.count(*p)) {
9643 q.f->open_object_section("got_missing");
9644 pg->peer_missing[*p].dump(q.f);
9645 q.f->close_section();
9646 }
9647 q.f->close_section();
9648 }
9649 q.f->close_section();
9650
9651 q.f->close_section();
9652 return forward_event();
9653 }
9654
9655 void PG::RecoveryState::GetMissing::exit()
9656 {
9657 context< RecoveryMachine >().log_exit(state_name, enter_time);
9658 PG *pg = context< RecoveryMachine >().pg;
9659 utime_t dur = ceph_clock_now() - enter_time;
9660 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
9661 pg->blocked_by.clear();
9662 }
9663
9664 /*------WaitUpThru--------*/
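// WaitUpThru: peering is otherwise complete, but we hold off activating until
// the osdmap reflects a new up_thru for this OSD (see the QueryState comment
// below); each ActMap re-checks need_up_thru and posts Activate once it
// clears.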
9665 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
9666 : my_base(ctx),
9667 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
9668 {
9669 context< RecoveryMachine >().log_enter(state_name);
9670 }
9671
9672 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
9673 {
9674 PG *pg = context< RecoveryMachine >().pg;
9675 if (!pg->need_up_thru) {
9676 post_event(Activate(pg->get_osdmap_epoch()));
9677 }
9678 return forward_event();
9679 }
9680
9681 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
9682 {
9683 PG *pg = context< RecoveryMachine >().pg;
9684 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
9685 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
9686 pg->peer_info[logevt.from] = logevt.msg->info;
9687 return discard_event();
9688 }
9689
9690 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
9691 {
9692 q.f->open_object_section("state");
9693 q.f->dump_string("name", state_name);
9694 q.f->dump_stream("enter_time") << enter_time;
9695 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
9696 q.f->close_section();
9697 return forward_event();
9698 }
9699
9700 void PG::RecoveryState::WaitUpThru::exit()
9701 {
9702 context< RecoveryMachine >().log_exit(state_name, enter_time);
9703 PG *pg = context< RecoveryMachine >().pg;
9704 utime_t dur = ceph_clock_now() - enter_time;
9705 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
9706 }
9707
9708 /*----RecoveryState::RecoveryMachine Methods-----*/
9709 #undef dout_prefix
9710 #define dout_prefix pg->gen_prefix(*_dout)
9711
9712 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
9713 {
9714 PG *pg = context< RecoveryMachine >().pg;
9715 ldout(pg->cct, 5) << "enter " << state_name << dendl;
9716 pg->osd->pg_recovery_stats.log_enter(state_name);
9717 }
9718
9719 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
9720 {
9721 utime_t dur = ceph_clock_now() - enter_time;
9722 PG *pg = context< RecoveryMachine >().pg;
9723 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
9724 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
9725 event_count, event_time);
9726 event_count = 0;
9727 event_time = utime_t();
9728 }
9729
9730
9731 /*---------------------------------------------------*/
9732 #undef dout_prefix
9733 #define dout_prefix ((debug_pg ? debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
9734
9735 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
9736 ceph_assert(!rctx);
9737 ceph_assert(!orig_ctx);
9738 orig_ctx = new_ctx;
9739 if (new_ctx) {
9740 if (messages_pending_flush) {
9741 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
9742 } else {
9743 rctx = *new_ctx;
9744 }
9745 rctx->start_time = ceph_clock_now();
9746 }
9747 }
9748
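// While outgoing messages are "blocked", notifies/queries/infos generated by
// event handlers accumulate in messages_pending_flush (the RecoveryCtx built
// from it appears to capture them) rather than in the caller's context;
// end_block_outgoing() replays them into the original RecoveryCtx via
// accept_buffered_messages(), while clear_blocked_outgoing() just drops them.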
9749 void PG::RecoveryState::begin_block_outgoing() {
9750 ceph_assert(!messages_pending_flush);
9751 ceph_assert(orig_ctx);
9752 ceph_assert(rctx);
9753 messages_pending_flush = BufferedRecoveryMessages();
9754 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
9755 }
9756
9757 void PG::RecoveryState::clear_blocked_outgoing() {
9758 ceph_assert(orig_ctx);
9759 ceph_assert(rctx);
9760 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9761 }
9762
9763 void PG::RecoveryState::end_block_outgoing() {
9764 ceph_assert(messages_pending_flush);
9765 ceph_assert(orig_ctx);
9766 ceph_assert(rctx);
9767
9768 rctx = RecoveryCtx(*orig_ctx);
9769 rctx->accept_buffered_messages(*messages_pending_flush);
9770 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9771 }
9772
9773 void PG::RecoveryState::end_handle() {
9774 if (rctx) {
9775 utime_t dur = ceph_clock_now() - rctx->start_time;
9776 machine.event_time += dur;
9777 }
9778
9779 machine.event_count++;
9780 rctx = boost::optional<RecoveryCtx>();
9781 orig_ctx = NULL;
9782 }
9783
9784 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
9785 {
9786 out << "BackfillInfo(" << bi.begin << "-" << bi.end
9787 << " " << bi.objects.size() << " objects";
9788 if (!bi.objects.empty())
9789 out << " " << bi.objects;
9790 out << ")";
9791 return out;
9792 }
9793
9794 void PG::dump_pgstate_history(Formatter *f)
9795 {
9796 lock();
9797 pgstate_history.dump(f);
9798 unlock();
9799 }
9800
9801 void PG::dump_missing(Formatter *f)
9802 {
9803 for (auto& i : pg_log.get_missing().get_items()) {
9804 f->open_object_section("object");
9805 f->dump_object("oid", i.first);
9806 f->dump_object("missing_info", i.second);
9807 if (missing_loc.needs_recovery(i.first)) {
9808 f->dump_bool("unfound", missing_loc.is_unfound(i.first));
9809 f->open_array_section("locations");
9810 for (auto l : missing_loc.get_locations(i.first)) {
9811 f->dump_object("shard", l);
9812 }
9813 f->close_section();
9814 }
9815 f->close_section();
9816 }
9817 }
9818
9819 void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
9820 {
9821 pg_stats_publish_lock.Lock();
9822 if (pg_stats_publish_valid) {
9823 f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
9824 }
9825 pg_stats_publish_lock.Unlock();
9826 }
9827
9828 void PG::with_heartbeat_peers(std::function<void(int)> f)
9829 {
9830 heartbeat_peer_lock.Lock();
9831 for (auto p : heartbeat_peers) {
9832 f(p);
9833 }
9834 for (auto p : probe_targets) {
9835 f(p);
9836 }
9837 heartbeat_peer_lock.Unlock();
9838 }