ceph.git: ceph/src/osd/PG.cc (import 14.2.4 nautilus point release)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDRepOp.h"
54 #include "messages/MOSDRepOpReply.h"
55 #include "messages/MOSDRepScrubMap.h"
56 #include "messages/MOSDPGRecoveryDelete.h"
57 #include "messages/MOSDPGRecoveryDeleteReply.h"
58
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
61
62 #ifdef WITH_LTTNG
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
68 #else
69 #define tracepoint(...)
70 #endif
71
72 #include <sstream>
73
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
76 #undef dout_prefix
77 #define dout_prefix _prefix(_dout, this)
78
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80 // easily skip them
81 const string infover_key("_infover");
82 const string info_key("_info");
83 const string biginfo_key("_biginfo");
84 const string epoch_key("_epoch");
85 const string fastinfo_key("_fastinfo");
86
87 template <class T>
88 static ostream& _prefix(std::ostream *_dout, T *t)
89 {
90 return t->gen_prefix(*_dout);
91 }
92
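// PGStateHistory tracks recent state-machine transitions for this PG.  enter()
// may be called in contexts where the PG lock cannot reliably be taken, so the
// transition is staged in tmppi; exit() takes the PG lock if needed, moves the
// staged instance into the history buffer, and records the exit timestamp.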
93 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
94 {
95 // Ignore trimming state machine for now
96 if (::strstr(state, "Trimming") != NULL) {
97 return;
98 } else if (pi != nullptr) {
99 pi->enter_state(entime, state);
100 } else {
101 // Store current state since we can't reliably take the PG lock here
102     if (tmppi == nullptr) {
103 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
104 }
105
106 thispg = pg;
107 tmppi->enter_state(entime, state);
108 }
109 }
110
111 void PGStateHistory::exit(const char* state) {
112 // Ignore trimming state machine for now
113 // Do nothing if PG is being destroyed!
114 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
115 return;
116 } else {
117 bool ilocked = false;
118     if (!thispg->is_locked()) {
119 thispg->lock();
120 ilocked = true;
121 }
122 if (pi == nullptr) {
123 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
124 pi = buffer.back().get();
125 pi->setepoch(thispg->get_osdmap_epoch());
126 }
127
128 pi->exit_state(ceph_clock_now());
129 if (::strcmp(state, "Reset") == 0) {
130 this->reset();
131 }
132     if (ilocked) {
133 thispg->unlock();
134 }
135 }
136 }
137
138 void PGStateHistory::dump(Formatter* f) const {
139 f->open_array_section("history");
140 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
141 f->open_object_section("states");
142 f->dump_stream("epoch") << (*pi)->this_epoch;
143 for (auto she : (*pi)->state_history) {
144 f->dump_string("state", std::get<2>(she));
145 f->dump_stream("enter") << std::get<0>(she);
146 f->dump_stream("exit") << std::get<1>(she);
147 }
148 f->close_section();
149 }
150 f->close_section();
151 }
152
153 void PG::get(const char* tag)
154 {
155 int after = ++ref;
156 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
157 << "tag " << (tag ? tag : "(none)") << " "
158 << (after - 1) << " -> " << after << dendl;
159 #ifdef PG_DEBUG_REFS
160 std::lock_guard l(_ref_id_lock);
161 _tag_counts[tag]++;
162 #endif
163 }
164
165 void PG::put(const char* tag)
166 {
167 #ifdef PG_DEBUG_REFS
168 {
169 std::lock_guard l(_ref_id_lock);
170 auto tag_counts_entry = _tag_counts.find(tag);
171 ceph_assert(tag_counts_entry != _tag_counts.end());
172 --tag_counts_entry->second;
173 if (tag_counts_entry->second == 0) {
174 _tag_counts.erase(tag_counts_entry);
175 }
176 }
177 #endif
178 auto local_cct = cct;
179 int after = --ref;
180 lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
181 << "tag " << (tag ? tag : "(none)") << " "
182 << (after + 1) << " -> " << after
183 << dendl;
184 if (after == 0)
185 delete this;
186 }
187
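// When built with PG_DEBUG_REFS, get_with_id()/put_with_id() additionally
// record a unique id and a backtrace per reference so outstanding references
// (and the per-tag counts above) can be listed with dump_live_ids().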
188 #ifdef PG_DEBUG_REFS
189 uint64_t PG::get_with_id()
190 {
191 ref++;
192 std::lock_guard l(_ref_id_lock);
193 uint64_t id = ++_ref_id;
194 BackTrace bt(0);
195 stringstream ss;
196 bt.print(ss);
197 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
198 << " got id " << id << " "
199 << (ref - 1) << " -> " << ref
200 << dendl;
201 ceph_assert(!_live_ids.count(id));
202 _live_ids.insert(make_pair(id, ss.str()));
203 return id;
204 }
205
206 void PG::put_with_id(uint64_t id)
207 {
208 int newref = --ref;
209 lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
210 << " put id " << id << " "
211 << (newref + 1) << " -> " << newref
212 << dendl;
213 {
214 std::lock_guard l(_ref_id_lock);
215 ceph_assert(_live_ids.count(id));
216 _live_ids.erase(id);
217 }
218   if (!newref)
219 delete this;
220 }
221
222 void PG::dump_live_ids()
223 {
224 std::lock_guard l(_ref_id_lock);
225 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
226 for (map<uint64_t, string>::iterator i = _live_ids.begin();
227 i != _live_ids.end();
228 ++i) {
229 dout(0) << "\t\tid: " << *i << dendl;
230 }
231 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
232 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
233 i != _tag_counts.end();
234 ++i) {
235 dout(0) << "\t\tid: " << *i << dendl;
236 }
237 }
238 #endif
239
240
241 void PGPool::update(CephContext *cct, OSDMapRef map)
242 {
243 const pg_pool_t *pi = map->get_pg_pool(id);
244 if (!pi) {
245 return; // pool has been deleted
246 }
247 info = *pi;
248 name = map->get_pool_name(id);
249
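  // 'updated' means either we skipped over one or more map epochs (so the
  // cached snap state may be stale) or this map changed the pool's snapshot
  // set (the pool's snap_epoch is this map's epoch).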
250 bool updated = false;
251 if ((map->get_epoch() != cached_epoch + 1) ||
252 (pi->get_snap_epoch() == map->get_epoch())) {
253 updated = true;
254 }
255
256 if (map->require_osd_release >= CEPH_RELEASE_MIMIC) {
257 // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps
258 // in the pg_info_t, with deltas for both in each OSDMap. we don't
259 // need to (and can't) track it here.
260 cached_removed_snaps.clear();
261 newly_removed_snaps.clear();
262 } else {
263 // legacy (<= luminous) removed_snaps tracking
264 if (updated) {
265 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
266 pi->build_removed_snaps(newly_removed_snaps);
267 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
268 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
269 newly_removed_snaps.subtract(cached_removed_snaps);
270 cached_removed_snaps.swap(removed_snaps);
271 } else {
272 lgeneric_subdout(cct, osd, 0) << __func__
273 << " cached_removed_snaps shrank from " << cached_removed_snaps
274 << " to " << newly_removed_snaps << dendl;
275 cached_removed_snaps.swap(newly_removed_snaps);
276 newly_removed_snaps.clear();
277 }
278 } else {
279 newly_removed_snaps.clear();
280 }
281 } else {
282 /* 1) map->get_epoch() == cached_epoch + 1 &&
283 * 2) pi->get_snap_epoch() != map->get_epoch()
284 *
285      * Since we did not take the branch above, 1 && 2 must be true.  From 2, we
286 * this map didn't change the set of removed snaps. From 1, we
287 * know that our cached_removed_snaps matches the previous map.
288 * Thus, from 1 && 2, cached_removed snaps matches the current
289 * set of removed snaps and all we have to do is clear
290 * newly_removed_snaps.
291 */
292 newly_removed_snaps.clear();
293 }
294 lgeneric_subdout(cct, osd, 20)
295 << "PGPool::update cached_removed_snaps "
296 << cached_removed_snaps
297 << " newly_removed_snaps "
298 << newly_removed_snaps
299 << " snapc " << snapc
300 << (updated ? " (updated)":" (no change)")
301 << dendl;
302 if (cct->_conf->osd_debug_verify_cached_snaps) {
303 interval_set<snapid_t> actual_removed_snaps;
304 pi->build_removed_snaps(actual_removed_snaps);
305 if (!(actual_removed_snaps == cached_removed_snaps)) {
306 lgeneric_derr(cct) << __func__
307 << ": mismatch between the actual removed snaps "
308 << actual_removed_snaps
309                        << " and pool.cached_removed_snaps "
310                        << cached_removed_snaps
311 << dendl;
312 }
313 ceph_assert(actual_removed_snaps == cached_removed_snaps);
314 }
315 }
316 if (info.is_pool_snaps_mode() && updated) {
317 snapc = pi->get_snap_context();
318 }
319 cached_epoch = map->get_epoch();
320 }
321
322 PG::PG(OSDService *o, OSDMapRef curmap,
323 const PGPool &_pool, spg_t p) :
324 pg_id(p),
325 coll(p),
326 osd(o),
327 cct(o->cct),
328 osdmap_ref(curmap),
329 pool(_pool),
330 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
331 snap_mapper(
332 cct,
333 &osdriver,
334 p.ps(),
335 p.get_split_bits(_pool.info.get_pg_num()),
336 _pool.id,
337 p.shard),
338 last_persisted_osdmap(curmap->get_epoch()),
339 deleting(false),
340 trace_endpoint("0.0.0.0", 0, "PG"),
341 dirty_info(false), dirty_big_info(false),
342 info(p),
343 info_struct_v(0),
344 pg_log(cct),
345 pgmeta_oid(p.make_pgmeta_oid()),
346 missing_loc(this),
347 stat_queue_item(this),
348 scrub_queued(false),
349 recovery_queued(false),
350 recovery_ops_active(0),
351 role(-1),
352 state(0),
353 send_notify(false),
354 pg_whoami(osd->whoami, p.shard),
355 need_up_thru(false),
356 last_peering_reset(0),
357 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
358 backfill_reserved(false),
359 backfill_reserving(false),
360 flushes_in_progress(0),
361 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
362 pg_stats_publish_valid(false),
363 finish_sync_event(NULL),
364 backoff_lock("PG::backoff_lock"),
365 scrub_after_recovery(false),
366 active_pushes(0),
367 recovery_state(this),
368 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
369 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
370 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
371 last_epoch(0),
372 last_require_osd_release(curmap->require_osd_release)
373 {
374 #ifdef PG_DEBUG_REFS
375 osd->add_pgid(p, this);
376 #endif
377 #ifdef WITH_BLKIN
378 std::stringstream ss;
379 ss << "PG " << info.pgid;
380 trace_endpoint.copy_name(ss.str());
381 #endif
382 }
383
384 PG::~PG()
385 {
386 pgstate_history.set_pg_in_destructor();
387 #ifdef PG_DEBUG_REFS
388 osd->remove_pgid(info.pgid, this);
389 #endif
390 }
391
392 void PG::lock(bool no_lockdep) const
393 {
394 _lock.Lock(no_lockdep);
395 // if we have unrecorded dirty state with the lock dropped, there is a bug
396 ceph_assert(!dirty_info);
397 ceph_assert(!dirty_big_info);
398
399 dout(30) << "lock" << dendl;
400 }
401
402 std::ostream& PG::gen_prefix(std::ostream& out) const
403 {
404 OSDMapRef mapref = osdmap_ref;
405 if (_lock.is_locked_by_me()) {
406 out << "osd." << osd->whoami
407 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
408 << " " << *this << " ";
409 } else {
410 out << "osd." << osd->whoami
411 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
412 << " pg[" << info.pgid << "(unlocked)] ";
413 }
414 return out;
415 }
416
417 /********* PG **********/
418
419 void PG::proc_master_log(
420 ObjectStore::Transaction& t, pg_info_t &oinfo,
421 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
422 {
423 dout(10) << "proc_master_log for osd." << from << ": "
424 << olog << " " << omissing << dendl;
425 ceph_assert(!is_peered() && is_primary());
426
427 // merge log into our own log to build master log. no need to
428 // make any adjustments to their missing map; we are taking their
429   // log to be authoritative (i.e., their entries are by definition
430 // non-divergent).
431 merge_log(t, oinfo, olog, from);
432 peer_info[from] = oinfo;
433 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
434 might_have_unfound.insert(from);
435
436 // See doc/dev/osd_internals/last_epoch_started
437 if (oinfo.last_epoch_started > info.last_epoch_started) {
438 info.last_epoch_started = oinfo.last_epoch_started;
439 dirty_info = true;
440 }
441 if (oinfo.last_interval_started > info.last_interval_started) {
442 info.last_interval_started = oinfo.last_interval_started;
443 dirty_info = true;
444 }
445 update_history(oinfo.history);
446 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
447 info.last_epoch_started >= info.history.last_epoch_started);
448
449 peer_missing[from].claim(omissing);
450 }
451
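// Unlike proc_master_log(), a replica's log is not merged into ours; it is
// only compared against our (authoritative) log so the replica's info and
// missing set can be adjusted for divergent entries before being recorded in
// peer_info/peer_missing.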
452 void PG::proc_replica_log(
453 pg_info_t &oinfo,
454 const pg_log_t &olog,
455 pg_missing_t& omissing,
456 pg_shard_t from)
457 {
458 dout(10) << "proc_replica_log for osd." << from << ": "
459 << oinfo << " " << olog << " " << omissing << dendl;
460
461 pg_log.proc_replica_log(oinfo, olog, omissing, from);
462
463 peer_info[from] = oinfo;
464 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
465 might_have_unfound.insert(from);
466
467 for (map<hobject_t, pg_missing_item>::const_iterator i =
468 omissing.get_items().begin();
469 i != omissing.get_items().end();
470 ++i) {
471 dout(20) << " after missing " << i->first << " need " << i->second.need
472 << " have " << i->second.have << dendl;
473 }
474 peer_missing[from].claim(omissing);
475 }
476
477 bool PG::proc_replica_info(
478 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
479 {
480 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
481 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
482 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
483 return false;
484 }
485
486 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
487 dout(10) << " got info " << oinfo << " from down osd." << from
488 << " discarding" << dendl;
489 return false;
490 }
491
492 dout(10) << " got osd." << from << " " << oinfo << dendl;
493 ceph_assert(is_primary());
494 peer_info[from] = oinfo;
495 might_have_unfound.insert(from);
496
497 update_history(oinfo.history);
498
499 // stray?
500 if (!is_up(from) && !is_acting(from)) {
501 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
502 stray_set.insert(from);
503 if (is_clean()) {
504 purge_strays();
505 }
506 }
507
508 // was this a new info? if so, update peers!
509 if (p == peer_info.end())
510 update_heartbeat_peers();
511
512 return true;
513 }
514
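// The helpers below keep the SnapMapper (the object<->snap index persisted via
// osdriver) in sync with object removals and snapshot updates.  Only clones
// (snap < CEPH_MAXSNAP) are tracked, and a missing mapping (-ENOENT) is
// tolerated on removal.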
515 void PG::remove_snap_mapped_object(
516 ObjectStore::Transaction &t, const hobject_t &soid)
517 {
518 t.remove(
519 coll,
520 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
521 clear_object_snap_mapping(&t, soid);
522 }
523
524 void PG::clear_object_snap_mapping(
525 ObjectStore::Transaction *t, const hobject_t &soid)
526 {
527 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
528 if (soid.snap < CEPH_MAXSNAP) {
529 int r = snap_mapper.remove_oid(
530 soid,
531 &_t);
532 if (!(r == 0 || r == -ENOENT)) {
533 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
534 ceph_abort();
535 }
536 }
537 }
538
539 void PG::update_object_snap_mapping(
540 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
541 {
542 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
543 ceph_assert(soid.snap < CEPH_MAXSNAP);
544 int r = snap_mapper.remove_oid(
545 soid,
546 &_t);
547 if (!(r == 0 || r == -ENOENT)) {
548 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
549 ceph_abort();
550 }
551 snap_mapper.add_oid(
552 soid,
553 snaps,
554 &_t);
555 }
556
557 void PG::merge_log(
558 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
559 {
560 PGLogEntryHandler rollbacker{this, &t};
561 pg_log.merge_log(
562 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
563 }
564
565 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
566 {
567 PGLogEntryHandler rollbacker{this, &t};
568 pg_log.rewind_divergent_log(
569 newhead, info, &rollbacker, dirty_info, dirty_big_info);
570 }
571
572 /*
573 * Process information from a replica to determine if it could have any
574 * objects that i need.
575 *
576 * TODO: if the missing set becomes very large, this could get expensive.
577 * Instead, we probably want to just iterate over our unfound set.
578 */
579 bool PG::search_for_missing(
580 const pg_info_t &oinfo, const pg_missing_t &omissing,
581 pg_shard_t from,
582 RecoveryCtx *ctx)
583 {
584 uint64_t num_unfound_before = missing_loc.num_unfound();
585 bool found_missing = missing_loc.add_source_info(
586 from, oinfo, omissing, ctx->handle);
587 if (found_missing && num_unfound_before != missing_loc.num_unfound())
588 publish_stats_to_osd();
589   // avoid doing this if the peer is empty. This is a bit of paranoia
590 // to avoid doing something rash if add_source_info() above
591 // incorrectly decided we found something new. (if the peer has
592 // last_update=0'0 that's impossible.)
593 if (found_missing &&
594 oinfo.last_update != eversion_t()) {
595 pg_info_t tinfo(oinfo);
596 tinfo.pgid.shard = pg_whoami.shard;
597 (*(ctx->info_map))[from.osd].push_back(
598 make_pair(
599 pg_notify_t(
600 from.shard, pg_whoami.shard,
601 get_osdmap_epoch(),
602 get_osdmap_epoch(),
603 tinfo),
604 past_intervals));
605 }
606 return found_missing;
607 }
608
609
610 // MissingLoc
611
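// Can 'hoid' be served with only the shards in 'acting'?  An object that still
// needs recovery is readable iff enough of the shards known to hold it
// (missing_loc) are in the acting set to satisfy the backend's is_readable
// predicate; deleted objects and objects with no known location are not.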
612 bool PG::MissingLoc::readable_with_acting(
613 const hobject_t &hoid,
614 const set<pg_shard_t> &acting) const {
615 if (!needs_recovery(hoid))
616 return true;
617 if (is_deleted(hoid))
618 return false;
619 auto missing_loc_entry = missing_loc.find(hoid);
620 if (missing_loc_entry == missing_loc.end())
621 return false;
622 const set<pg_shard_t> &locs = missing_loc_entry->second;
623 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
624 set<pg_shard_t> have_acting;
625 for (set<pg_shard_t>::const_iterator i = locs.begin();
626 i != locs.end();
627 ++i) {
628 if (acting.count(*i))
629 have_acting.insert(*i);
630 }
631 return (*is_readable)(have_acting);
632 }
633
634 void PG::MissingLoc::add_batch_sources_info(
635 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
636 {
637 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
638 << sources.size() << dendl;
639 unsigned loop = 0;
640 bool sources_updated = false;
641 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
642 i != needs_recovery_map.end();
643 ++i) {
644 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
645 handle->reset_tp_timeout();
646 loop = 0;
647 }
648 if (i->second.is_delete())
649 continue;
650
651 auto p = missing_loc.find(i->first);
652 if (p == missing_loc.end()) {
653 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
654 } else {
655 _dec_count(p->second);
656 }
657 missing_loc[i->first].insert(sources.begin(), sources.end());
658 _inc_count(p->second);
659
660 if (!sources_updated) {
661 missing_loc_sources.insert(sources.begin(), sources.end());
662 sources_updated = true;
663 }
664 }
665 }
666
667 bool PG::MissingLoc::add_source_info(
668 pg_shard_t fromosd,
669 const pg_info_t &oinfo,
670 const pg_missing_t &omissing,
671 ThreadPool::TPHandle* handle)
672 {
673 bool found_missing = false;
674 unsigned loop = 0;
675 bool sources_updated = false;
676 // found items?
677 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
678 p != needs_recovery_map.end();
679 ++p) {
680 const hobject_t &soid(p->first);
681 eversion_t need = p->second.need;
682 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
683 handle->reset_tp_timeout();
684 loop = 0;
685 }
686 if (p->second.is_delete()) {
687 ldout(pg->cct, 10) << __func__ << " " << soid
688 << " delete, ignoring source" << dendl;
689 continue;
690 }
691 if (oinfo.last_update < need) {
692 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
693 << " also missing on osd." << fromosd
694 << " (last_update " << oinfo.last_update
695 << " < needed " << need << ")" << dendl;
696 continue;
697 }
698 if (!oinfo.last_backfill.is_max() &&
699 !oinfo.last_backfill_bitwise) {
700 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
701 << " also missing on osd." << fromosd
702 << " (last_backfill " << oinfo.last_backfill
703 << " but with wrong sort order)"
704 << dendl;
705 continue;
706 }
707 if (p->first >= oinfo.last_backfill) {
708 // FIXME: this is _probably_ true, although it could conceivably
709 // be in the undefined region! Hmm!
710 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
711 << " also missing on osd." << fromosd
712 << " (past last_backfill " << oinfo.last_backfill
713 << ")" << dendl;
714 continue;
715 }
716 if (omissing.is_missing(soid)) {
717 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
718 << " also missing on osd." << fromosd << dendl;
719 continue;
720 }
721
722 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
723 << " is on osd." << fromosd << dendl;
724
725 {
726 auto p = missing_loc.find(soid);
727 if (p == missing_loc.end()) {
728 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
729 } else {
730 _dec_count(p->second);
731 }
732 p->second.insert(fromosd);
733 _inc_count(p->second);
734 }
735
736 if (!sources_updated) {
737 missing_loc_sources.insert(fromosd);
738 sources_updated = true;
739 }
740 found_missing = true;
741 }
742
743 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
744 << dendl;
745 return found_missing;
746 }
747
748 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
749 {
750 set<pg_shard_t> now_down;
751 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
752 p != missing_loc_sources.end();
753 ) {
754 if (osdmap->is_up(p->osd)) {
755 ++p;
756 continue;
757 }
758 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
759 now_down.insert(*p);
760 missing_loc_sources.erase(p++);
761 }
762
763 if (now_down.empty()) {
764 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
765 } else {
766 ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
767 << missing_loc_sources << dendl;
768
769 // filter missing_loc
770 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
771 while (p != missing_loc.end()) {
772 set<pg_shard_t>::iterator q = p->second.begin();
773 bool changed = false;
774 while (q != p->second.end()) {
775 if (now_down.count(*q)) {
776 if (!changed) {
777 changed = true;
778 _dec_count(p->second);
779 }
780 p->second.erase(q++);
781 } else {
782 ++q;
783 }
784 }
785 if (p->second.empty()) {
786 missing_loc.erase(p++);
787 } else {
788 if (changed) {
789 _inc_count(p->second);
790 }
791 ++p;
792 }
793 }
794 }
795 }
796
797 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
798 {
799 auto &missing = pg_log.get_missing();
800 uint64_t unfound = get_num_unfound();
801
802 dout(10) << __func__ << " "
803 << missing.num_missing() << " missing, "
804 << unfound << " unfound"
805 << dendl;
806
807 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
808 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
809 for (; m != mend; ++m) {
810 pg_shard_t peer(*m);
811
812 if (!get_osdmap()->is_up(peer.osd)) {
813 dout(20) << __func__ << " skipping down osd." << peer << dendl;
814 continue;
815 }
816
817 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
818 if (iter != peer_info.end() &&
819 (iter->second.is_empty() || iter->second.dne())) {
820 // ignore empty peers
821 continue;
822 }
823
824 // If we've requested any of this stuff, the pg_missing_t information
825 // should be on its way.
826   // TODO: coalesce requested_* into a single data structure
827 if (peer_missing.find(peer) != peer_missing.end()) {
828 dout(20) << __func__ << ": osd." << peer
829 << ": we already have pg_missing_t" << dendl;
830 continue;
831 }
832 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
833 dout(20) << __func__ << ": osd." << peer
834 << ": in peer_log_requested" << dendl;
835 continue;
836 }
837 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
838 dout(20) << __func__ << ": osd." << peer
839 << ": in peer_missing_requested" << dendl;
840 continue;
841 }
842
843 // Request missing
844 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
845 << dendl;
846 peer_missing_requested.insert(peer);
847 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
848 pg_query_t(
849 pg_query_t::FULLLOG,
850 peer.shard, pg_whoami.shard,
851 info.history, get_osdmap_epoch());
852 }
853 }
854
855 /******* PG ***********/
856 bool PG::needs_recovery() const
857 {
858 ceph_assert(is_primary());
859
860 auto &missing = pg_log.get_missing();
861
862 if (missing.num_missing()) {
863 dout(10) << __func__ << " primary has " << missing.num_missing()
864 << " missing" << dendl;
865 return true;
866 }
867
868 ceph_assert(!acting_recovery_backfill.empty());
869 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
870 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
871 for (; a != end; ++a) {
872 if (*a == get_primary()) continue;
873 pg_shard_t peer = *a;
874 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
875 if (pm == peer_missing.end()) {
876 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
877 << dendl;
878 continue;
879 }
880 if (pm->second.num_missing()) {
881 dout(10) << __func__ << " osd." << peer << " has "
882 << pm->second.num_missing() << " missing" << dendl;
883 return true;
884 }
885 }
886
887 dout(10) << __func__ << " is recovered" << dendl;
888 return false;
889 }
890
891 bool PG::needs_backfill() const
892 {
893 ceph_assert(is_primary());
894
895 // We can assume that only possible osds that need backfill
896 // are on the backfill_targets vector nodes.
897 set<pg_shard_t>::const_iterator end = backfill_targets.end();
898 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
899 for (; a != end; ++a) {
900 pg_shard_t peer = *a;
901 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
902 if (!pi->second.last_backfill.is_max()) {
903 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
904 return true;
905 }
906 }
907
908 dout(10) << __func__ << " does not need backfill" << dendl;
909 return false;
910 }
911
912
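// Sanity-check that the stored past_intervals exactly cover the required
// bounds derived from our info and the oldest map we still have.  Mismatches
// are reported to the cluster log; a missing or misaligned interval set is
// fatal.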
913 void PG::check_past_interval_bounds() const
914 {
915 auto rpib = get_required_past_interval_bounds(
916 info,
917 osd->get_superblock().oldest_map);
918 if (rpib.first >= rpib.second) {
919 if (!past_intervals.empty()) {
920 osd->clog->error() << info.pgid << " required past_interval bounds are"
921 << " empty [" << rpib << ") but past_intervals is not: "
922 << past_intervals;
923 derr << info.pgid << " required past_interval bounds are"
924 << " empty [" << rpib << ") but past_intervals is not: "
925 << past_intervals << dendl;
926 }
927 } else {
928 if (past_intervals.empty()) {
929 osd->clog->error() << info.pgid << " required past_interval bounds are"
930 << " not empty [" << rpib << ") but past_intervals "
931 << past_intervals << " is empty";
932 derr << info.pgid << " required past_interval bounds are"
933 << " not empty [" << rpib << ") but past_intervals "
934 << past_intervals << " is empty" << dendl;
935 ceph_assert(!past_intervals.empty());
936 }
937
938 auto apib = past_intervals.get_bounds();
939 if (apib.first > rpib.first) {
940 osd->clog->error() << info.pgid << " past_intervals [" << apib
941 << ") start interval does not contain the required"
942 << " bound [" << rpib << ") start";
943 derr << info.pgid << " past_intervals [" << apib
944 << ") start interval does not contain the required"
945 << " bound [" << rpib << ") start" << dendl;
946 ceph_abort_msg("past_interval start interval mismatch");
947 }
948 if (apib.second != rpib.second) {
949     osd->clog->error() << info.pgid << " past_interval bound [" << apib
950 << ") end does not match required [" << rpib
951 << ") end";
952     derr << info.pgid << " past_interval bound [" << apib
953 << ") end does not match required [" << rpib
954 << ") end" << dendl;
955 ceph_abort_msg("past_interval end mismatch");
956 }
957 }
958 }
959
960 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
961 {
962 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
963 if (need_up_thru &&
964 up_thru >= info.history.same_interval_since) {
965 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
966 need_up_thru = false;
967 return true;
968 }
969 return false;
970 }
971
972 void PG::remove_down_peer_info(const OSDMapRef osdmap)
973 {
974 // Remove any downed osds from peer_info
975 bool removed = false;
976 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
977 while (p != peer_info.end()) {
978 if (!osdmap->is_up(p->first.osd)) {
979 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
980 peer_missing.erase(p->first);
981 peer_log_requested.erase(p->first);
982 peer_missing_requested.erase(p->first);
983 peer_purged.erase(p->first); // so we can re-purge if necessary
984 peer_info.erase(p++);
985 removed = true;
986 } else
987 ++p;
988 }
989
990 // if we removed anyone, update peers (which include peer_info)
991 if (removed)
992 update_heartbeat_peers();
993 check_recovery_sources(osdmap);
994 }
995
996 /*
997 * Returns true unless there is a non-lost OSD in might_have_unfound.
998 */
999 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
1000 {
1001 ceph_assert(is_primary());
1002
1003 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1004 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1005 for (; peer != mend; ++peer) {
1006 if (peer_missing.count(*peer))
1007 continue;
1008 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1009 if (iter != peer_info.end() &&
1010 (iter->second.is_empty() || iter->second.dne()))
1011 continue;
1012 if (!osdmap->exists(peer->osd))
1013 continue;
1014 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1015 if (osd_info.lost_at <= osd_info.up_from) {
1016 // If there is even one OSD in might_have_unfound that isn't lost, we
1017 // still might retrieve our unfound.
1018 return false;
1019 }
1020 }
1021 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
1022 << " have been queried or are marked lost" << dendl;
1023 return true;
1024 }
1025
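// Compute the set of OSDs (prior.probe) that must be queried before peering
// can proceed, based on past intervals.  Flags the PG as DOWN if peering is
// blocked by down OSDs (prior.pg_down), and decides whether we must wait for
// the monitor to record a newer up_thru for us.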
1026 PastIntervals::PriorSet PG::build_prior()
1027 {
1028 if (1) {
1029 // sanity check
1030 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1031 it != peer_info.end();
1032 ++it) {
1033 ceph_assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
1034 }
1035 }
1036
1037 const OSDMap &osdmap = *get_osdmap();
1038 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1039 pool.info.is_erasure(),
1040 info.history.last_epoch_started,
1041 get_pgbackend()->get_is_recoverable_predicate(),
1042 [&](epoch_t start, int osd, epoch_t *lost_at) {
1043 const osd_info_t *pinfo = 0;
1044 if (osdmap.exists(osd)) {
1045 pinfo = &osdmap.get_info(osd);
1046 if (lost_at)
1047 *lost_at = pinfo->lost_at;
1048 }
1049
1050 if (osdmap.is_up(osd)) {
1051 return PastIntervals::UP;
1052 } else if (!pinfo) {
1053 return PastIntervals::DNE;
1054 } else if (pinfo->lost_at > start) {
1055 return PastIntervals::LOST;
1056 } else {
1057 return PastIntervals::DOWN;
1058 }
1059 },
1060 up,
1061 acting,
1062 this);
1063
1064 if (prior.pg_down) {
1065 state_set(PG_STATE_DOWN);
1066 }
1067
1068 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1069 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1070 << " < same_since " << info.history.same_interval_since
1071 << ", must notify monitor" << dendl;
1072 need_up_thru = true;
1073 } else {
1074 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1075 << " >= same_since " << info.history.same_interval_since
1076 << ", all is well" << dendl;
1077 need_up_thru = false;
1078 }
1079 set_probe_targets(prior.probe);
1080 return prior;
1081 }
1082
1083 void PG::clear_primary_state()
1084 {
1085 dout(10) << "clear_primary_state" << dendl;
1086
1087 // clear peering state
1088 stray_set.clear();
1089 peer_log_requested.clear();
1090 peer_missing_requested.clear();
1091 peer_info.clear();
1092 peer_bytes.clear();
1093 peer_missing.clear();
1094 need_up_thru = false;
1095 peer_last_complete_ondisk.clear();
1096 peer_activated.clear();
1097 min_last_complete_ondisk = eversion_t();
1098 pg_trim_to = eversion_t();
1099 might_have_unfound.clear();
1100 projected_log = PGLog::IndexedLog();
1101
1102 last_update_ondisk = eversion_t();
1103
1104 snap_trimq.clear();
1105
1106 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1107
1108 missing_loc.clear();
1109
1110 release_pg_backoffs();
1111
1112 pg_log.reset_recovery_pointers();
1113
1114 scrubber.reserved_peers.clear();
1115 scrub_after_recovery = false;
1116
1117 agent_clear();
1118 }
1119
1120 PG::Scrubber::Scrubber()
1121 : reserved(false), reserve_failed(false),
1122 epoch_start(0),
1123 active(false),
1124 shallow_errors(0), deep_errors(0), fixed(0),
1125 must_scrub(false), must_deep_scrub(false), must_repair(false),
1126 need_auto(false), time_for_deep(false),
1127 auto_repair(false),
1128 check_repair(false),
1129 deep_scrub_on_error(false),
1130 num_digest_updates_pending(0),
1131 state(INACTIVE),
1132 deep(false)
1133 {}
1134
1135 PG::Scrubber::~Scrubber() {}
1136
1137 /**
1138 * find_best_info
1139 *
1140 * Returns an iterator to the best info in infos sorted by:
1141  * 1) Prefer newer last_update (prefer the oldest last_update for ec pools)
1142 * 2) Prefer longer tail if it brings another info into contiguity
1143 * 3) Prefer current primary
1144 */
1145 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1146 const map<pg_shard_t, pg_info_t> &infos,
1147 bool restrict_to_up_acting,
1148 bool *history_les_bound) const
1149 {
1150 ceph_assert(history_les_bound);
1151 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1152 * to make changes to this process. Also, make sure to update it
1153 * when you find bugs! */
1154 eversion_t min_last_update_acceptable = eversion_t::max();
1155 epoch_t max_last_epoch_started_found = 0;
1156 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1157 i != infos.end();
1158 ++i) {
1159 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1160 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1161 *history_les_bound = true;
1162 max_last_epoch_started_found = i->second.history.last_epoch_started;
1163 }
1164 if (!i->second.is_incomplete() &&
1165 max_last_epoch_started_found < i->second.last_epoch_started) {
1166 *history_les_bound = false;
1167 max_last_epoch_started_found = i->second.last_epoch_started;
1168 }
1169 }
1170 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1171 i != infos.end();
1172 ++i) {
1173 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1174 if (min_last_update_acceptable > i->second.last_update)
1175 min_last_update_acceptable = i->second.last_update;
1176 }
1177 }
1178 if (min_last_update_acceptable == eversion_t::max())
1179 return infos.end();
1180
1181 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1182 // find osd with newest last_update (oldest for ec_pool).
1183 // if there are multiples, prefer
1184 // - a longer tail, if it brings another peer into log contiguity
1185 // - the current primary
1186 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1187 p != infos.end();
1188 ++p) {
1189 if (restrict_to_up_acting && !is_up(p->first) &&
1190 !is_acting(p->first))
1191 continue;
1192 // Only consider peers with last_update >= min_last_update_acceptable
1193 if (p->second.last_update < min_last_update_acceptable)
1194 continue;
1195 // Disqualify anyone with a too old last_epoch_started
1196 if (p->second.last_epoch_started < max_last_epoch_started_found)
1197 continue;
1198 // Disqualify anyone who is incomplete (not fully backfilled)
1199 if (p->second.is_incomplete())
1200 continue;
1201 if (best == infos.end()) {
1202 best = p;
1203 continue;
1204 }
1205 // Prefer newer last_update
1206 if (pool.info.require_rollback()) {
1207 if (p->second.last_update > best->second.last_update)
1208 continue;
1209 if (p->second.last_update < best->second.last_update) {
1210 best = p;
1211 continue;
1212 }
1213 } else {
1214 if (p->second.last_update < best->second.last_update)
1215 continue;
1216 if (p->second.last_update > best->second.last_update) {
1217 best = p;
1218 continue;
1219 }
1220 }
1221
1222 // Prefer longer tail
1223 if (p->second.log_tail > best->second.log_tail) {
1224 continue;
1225 } else if (p->second.log_tail < best->second.log_tail) {
1226 best = p;
1227 continue;
1228 }
1229
1230 if (!p->second.has_missing() && best->second.has_missing()) {
1231 dout(10) << __func__ << " prefer osd." << p->first
1232 << " because it is complete while best has missing"
1233 << dendl;
1234 best = p;
1235 continue;
1236 } else if (p->second.has_missing() && !best->second.has_missing()) {
1237 dout(10) << __func__ << " skipping osd." << p->first
1238 << " because it has missing while best is complete"
1239 << dendl;
1240 continue;
1241 } else {
1242 // both are complete or have missing
1243 // fall through
1244 }
1245
1246 // prefer current primary (usually the caller), all things being equal
1247 if (p->first == pg_whoami) {
1248 dout(10) << "calc_acting prefer osd." << p->first
1249 << " because it is current primary" << dendl;
1250 best = p;
1251 continue;
1252 }
1253 }
1254 return best;
1255 }
1256
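// For erasure-coded pools the acting set is positional: for each shard
// position i, prefer up[i] if it is complete and its log overlaps the
// authoritative log; otherwise schedule up[i] for backfill and fall back to
// acting[i], or (when not restricted to up/acting) to any other peer holding
// that shard with a sufficiently recent log.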
1257 void PG::calc_ec_acting(
1258 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1259 unsigned size,
1260 const vector<int> &acting,
1261 const vector<int> &up,
1262 const map<pg_shard_t, pg_info_t> &all_info,
1263 bool restrict_to_up_acting,
1264 vector<int> *_want,
1265 set<pg_shard_t> *backfill,
1266 set<pg_shard_t> *acting_backfill,
1267 ostream &ss)
1268 {
1269 vector<int> want(size, CRUSH_ITEM_NONE);
1270 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1271 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1272 i != all_info.end();
1273 ++i) {
1274 all_info_by_shard[i->first.shard].insert(i->first);
1275 }
1276 for (uint8_t i = 0; i < want.size(); ++i) {
1277 ss << "For position " << (unsigned)i << ": ";
1278 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1279 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1280 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1281 auth_log_shard->second.log_tail) {
1282 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1283 want[i] = up[i];
1284 continue;
1285 }
1286 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1287 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1288 << " and ";
1289 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1290 }
1291
1292 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1293 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1294 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1295 auth_log_shard->second.log_tail) {
1296 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1297 want[i] = acting[i];
1298 } else if (!restrict_to_up_acting) {
1299 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1300 j != all_info_by_shard[shard_id_t(i)].end();
1301 ++j) {
1302 ceph_assert(j->shard == i);
1303 if (!all_info.find(*j)->second.is_incomplete() &&
1304 all_info.find(*j)->second.last_update >=
1305 auth_log_shard->second.log_tail) {
1306 ss << " selecting stray: " << *j << std::endl;
1307 want[i] = j->osd;
1308 break;
1309 }
1310 }
1311 if (want[i] == CRUSH_ITEM_NONE)
1312 ss << " failed to fill position " << (int)i << std::endl;
1313 }
1314 }
1315
1316 for (uint8_t i = 0; i < want.size(); ++i) {
1317 if (want[i] != CRUSH_ITEM_NONE) {
1318 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1319 }
1320 }
1321 acting_backfill->insert(backfill->begin(), backfill->end());
1322 _want->swap(want);
1323 }
1324
1325 /**
1326 * calculate the desired acting set.
1327 *
1328 * Choose an appropriate acting set. Prefer up[0], unless it is
1329 * incomplete, or another osd has a longer tail that allows us to
1330 * bring other up nodes up to date.
1331 */
1332 void PG::calc_replicated_acting(
1333 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1334 uint64_t force_auth_primary_missing_objects,
1335 unsigned size,
1336 const vector<int> &acting,
1337 const vector<int> &up,
1338 pg_shard_t up_primary,
1339 const map<pg_shard_t, pg_info_t> &all_info,
1340 bool restrict_to_up_acting,
1341 vector<int> *want,
1342 set<pg_shard_t> *backfill,
1343 set<pg_shard_t> *acting_backfill,
1344 const OSDMapRef osdmap,
1345 ostream &ss)
1346 {
1347 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1348
1349 ss << __func__ << " newest update on osd." << auth_log_shard_id
1350 << " with " << auth_log_shard->second
1351 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1352
1353 // select primary
1354 auto primary = all_info.find(up_primary);
1355 if (up.size() &&
1356 !primary->second.is_incomplete() &&
1357 primary->second.last_update >=
1358 auth_log_shard->second.log_tail) {
1359 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1360 auto approx_missing_objects =
1361 primary->second.stats.stats.sum.num_objects_missing;
1362 auto auth_version = auth_log_shard->second.last_update.version;
1363 auto primary_version = primary->second.last_update.version;
1364 if (auth_version > primary_version) {
1365 approx_missing_objects += auth_version - primary_version;
1366 } else {
1367 approx_missing_objects += primary_version - auth_version;
1368 }
1369 if ((uint64_t)approx_missing_objects >
1370 force_auth_primary_missing_objects) {
1371 primary = auth_log_shard;
1372         ss << "up_primary: " << up_primary << " has approximate "
1373 << approx_missing_objects
1374 << "(>" << force_auth_primary_missing_objects <<") "
1375 << "missing objects, osd." << auth_log_shard_id
1376 << " selected as primary instead"
1377 << std::endl;
1378 } else {
1379         ss << "up_primary: " << up_primary << " selected as primary"
1380 << std::endl;
1381 }
1382 } else {
1383       ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
1384 }
1385 } else {
1386 ceph_assert(!auth_log_shard->second.is_incomplete());
1387 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1388 << " selected as primary instead" << std::endl;
1389 primary = auth_log_shard;
1390 }
1391
1392 ss << __func__ << " primary is osd." << primary->first
1393 << " with " << primary->second << std::endl;
1394 want->push_back(primary->first.osd);
1395 acting_backfill->insert(primary->first);
1396
1397 /* We include auth_log_shard->second.log_tail because in GetLog,
1398 * we will request logs back to the min last_update over our
1399 * acting_backfill set, which will result in our log being extended
1400 * as far backwards as necessary to pick up any peers which can
1401 * be log recovered by auth_log_shard's log */
1402 eversion_t oldest_auth_log_entry =
1403 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1404
1405 // select replicas that have log contiguity with primary.
1406 // prefer up, then acting, then any peer_info osds
1407 for (auto i : up) {
1408 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1409 if (up_cand == primary->first)
1410 continue;
1411 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1412 if (cur_info.is_incomplete() ||
1413 cur_info.last_update < oldest_auth_log_entry) {
1414 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1415 backfill->insert(up_cand);
1416 acting_backfill->insert(up_cand);
1417 } else {
1418 want->push_back(i);
1419 acting_backfill->insert(up_cand);
1420 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1421 }
1422 if (want->size() >= size) {
1423 break;
1424 }
1425 }
1426
1427 if (want->size() >= size) {
1428 return;
1429 }
1430
1431 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1432 candidate_by_last_update.reserve(acting.size());
1433 // This no longer has backfill OSDs, but they are covered above.
1434 for (auto i : acting) {
1435 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1436 // skip up osds we already considered above
1437 if (acting_cand == primary->first)
1438 continue;
1439 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1440 if (up_it != up.end())
1441 continue;
1442
1443 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1444 if (cur_info.is_incomplete() ||
1445 cur_info.last_update < oldest_auth_log_entry) {
1446 ss << " shard " << acting_cand << " (acting) REJECTED "
1447 << cur_info << std::endl;
1448 } else {
1449 candidate_by_last_update.push_back(make_pair(cur_info.last_update, i));
1450 }
1451 }
1452
1453 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1454 const std::pair<eversion_t, int> &rhs) {
1455 return lhs.first > rhs.first;
1456 };
1457 // sort by last_update, in descending order.
1458 std::sort(candidate_by_last_update.begin(),
1459 candidate_by_last_update.end(), sort_by_eversion);
1460 for (auto &p: candidate_by_last_update) {
1461 ceph_assert(want->size() < size);
1462 want->push_back(p.second);
1463 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1464 acting_backfill->insert(s);
1465 ss << " shard " << s << " (acting) accepted "
1466 << all_info.find(s)->second << std::endl;
1467 if (want->size() >= size) {
1468 return;
1469 }
1470 }
1471
1472 if (restrict_to_up_acting) {
1473 return;
1474 }
1475 candidate_by_last_update.clear();
1476 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1477 // continue to search stray to find more suitable peers
1478 for (auto &i : all_info) {
1479 // skip up osds we already considered above
1480 if (i.first == primary->first)
1481 continue;
1482 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1483 if (up_it != up.end())
1484 continue;
1485 vector<int>::const_iterator acting_it = find(
1486 acting.begin(), acting.end(), i.first.osd);
1487 if (acting_it != acting.end())
1488 continue;
1489
1490 if (i.second.is_incomplete() ||
1491 i.second.last_update < oldest_auth_log_entry) {
1492 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1493 << std::endl;
1494 } else {
1495 candidate_by_last_update.push_back(
1496 make_pair(i.second.last_update, i.first.osd));
1497 }
1498 }
1499
1500 if (candidate_by_last_update.empty()) {
1501 // save us some effort
1502 return;
1503 }
1504
1505 // sort by last_update, in descending order.
1506 std::sort(candidate_by_last_update.begin(),
1507 candidate_by_last_update.end(), sort_by_eversion);
1508
1509 for (auto &p: candidate_by_last_update) {
1510 ceph_assert(want->size() < size);
1511 want->push_back(p.second);
1512 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1513 acting_backfill->insert(s);
1514 ss << " shard " << s << " (stray) accepted "
1515 << all_info.find(s)->second << std::endl;
1516 if (want->size() >= size) {
1517 return;
1518 }
1519 }
1520 }
1521
1522 bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
1523 {
1524 unsigned num_want_acting = 0;
1525 set<pg_shard_t> have;
1526 for (int i = 0; i < (int)want.size(); ++i) {
1527 if (want[i] != CRUSH_ITEM_NONE) {
1528 ++num_want_acting;
1529 have.insert(
1530 pg_shard_t(
1531 want[i],
1532 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1533 }
1534 }
1535 // We go incomplete if below min_size for ec_pools since backfill
1536 // does not currently maintain rollbackability
1537 // Otherwise, we will go "peered", but not "active"
1538 if (num_want_acting < pool.info.min_size &&
1539 (pool.info.is_erasure() ||
1540 !cct->_conf->osd_allow_recovery_below_min_size)) {
1541 dout(10) << __func__ << " failed, below min size" << dendl;
1542 return false;
1543 }
1544
1545 /* Check whether we have enough acting shards to later perform recovery */
1546 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1547 get_pgbackend()->get_is_recoverable_predicate());
1548 if (!(*recoverable_predicate)(have)) {
1549 dout(10) << __func__ << " failed, not recoverable" << dendl;
1550 return false;
1551 }
1552
1553 return true;
1554 }
1555
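// Async recovery target selection: a candidate's recovery cost is estimated as
// its reported missing-object count plus the difference between its
// last_update and the authoritative last_update (pre-nautilus peers: log-length
// difference only).  Candidates whose cost exceeds osd_async_recovery_min_cost
// are removed from the acting set and recovered asynchronously, provided the
// remaining set stays recoverable and at or above min_size.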
1556 void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1557 const pg_info_t &auth_info,
1558 vector<int> *want,
1559 set<pg_shard_t> *async_recovery,
1560 const OSDMapRef osdmap) const
1561 {
1562 set<pair<int, pg_shard_t> > candidates_by_cost;
1563 for (uint8_t i = 0; i < want->size(); ++i) {
1564 if ((*want)[i] == CRUSH_ITEM_NONE)
1565 continue;
1566
1567 // Considering log entries to recover is accurate enough for
1568 // now. We could use minimum_to_decode_with_cost() later if
1569 // necessary.
1570 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1571 // do not include strays
1572 if (stray_set.find(shard_i) != stray_set.end())
1573 continue;
1574 // Do not include an osd that is not up, since choosing it as
1575 // an async_recovery_target will move it out of the acting set.
1576 // This results in it being identified as a stray during peering,
1577 // because it is no longer in the up or acting set.
1578 if (!is_up(shard_i))
1579 continue;
1580 auto shard_info = all_info.find(shard_i)->second;
1581 // for ec pools we rollback all entries past the authoritative
1582 // last_update *before* activation. This is relatively inexpensive
1583 // compared to recovery, since it is purely local, so treat shards
1584 // past the authoritative last_update the same as those equal to it.
1585 version_t auth_version = auth_info.last_update.version;
1586 version_t candidate_version = shard_info.last_update.version;
1587 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1588 auto approx_missing_objects =
1589 shard_info.stats.stats.sum.num_objects_missing;
1590 if (auth_version > candidate_version) {
1591 approx_missing_objects += auth_version - candidate_version;
1592 }
1593 if (static_cast<uint64_t>(approx_missing_objects) >
1594 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1595 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1596 }
1597 } else {
1598 if (auth_version > candidate_version &&
1599 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1600 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1601 }
1602 }
1603 }
1604
1605 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1606 << dendl;
1607
1608 // take out as many osds as we can for async recovery, in order of cost
1609 for (auto rit = candidates_by_cost.rbegin();
1610 rit != candidates_by_cost.rend(); ++rit) {
1611 pg_shard_t cur_shard = rit->second;
1612 vector<int> candidate_want(*want);
1613 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1614 if (recoverable_and_ge_min_size(candidate_want)) {
1615 want->swap(candidate_want);
1616 async_recovery->insert(cur_shard);
1617 }
1618 }
1619 dout(20) << __func__ << " result want=" << *want
1620 << " async_recovery=" << *async_recovery << dendl;
1621 }
1622
1623 void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1624 const pg_info_t &auth_info,
1625 vector<int> *want,
1626 set<pg_shard_t> *async_recovery,
1627 const OSDMapRef osdmap) const
1628 {
1629 set<pair<int, pg_shard_t> > candidates_by_cost;
1630 for (auto osd_num : *want) {
1631 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1632 // do not include strays
1633 if (stray_set.find(shard_i) != stray_set.end())
1634 continue;
1635 // Do not include an osd that is not up, since choosing it as
1636 // an async_recovery_target will move it out of the acting set.
1637 // This results in it being identified as a stray during peering,
1638 // because it is no longer in the up or acting set.
1639 if (!is_up(shard_i))
1640 continue;
1641 auto shard_info = all_info.find(shard_i)->second;
1642 // use the approximate magnitude of the difference in length of
1643 // logs plus historical missing objects as the cost of recovery
1644 version_t auth_version = auth_info.last_update.version;
1645 version_t candidate_version = shard_info.last_update.version;
1646 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1647 auto approx_missing_objects =
1648 shard_info.stats.stats.sum.num_objects_missing;
1649 if (auth_version > candidate_version) {
1650 approx_missing_objects += auth_version - candidate_version;
1651 } else {
1652 approx_missing_objects += candidate_version - auth_version;
1653 }
1654 if (static_cast<uint64_t>(approx_missing_objects) >
1655 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1656 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1657 }
1658 } else {
1659 size_t approx_entries;
1660 if (auth_version > candidate_version) {
1661 approx_entries = auth_version - candidate_version;
1662 } else {
1663 approx_entries = candidate_version - auth_version;
1664 }
1665 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1666 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1667 }
1668 }
1669 }
1670
1671 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1672 << dendl;
1673 // take out as many osds as we can for async recovery, in order of cost
1674 for (auto rit = candidates_by_cost.rbegin();
1675 rit != candidates_by_cost.rend(); ++rit) {
1676 if (want->size() <= pool.info.min_size) {
1677 break;
1678 }
1679 pg_shard_t cur_shard = rit->second;
1680 vector<int> candidate_want(*want);
1681 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1682 if (*it == cur_shard.osd) {
1683 candidate_want.erase(it);
1684 want->swap(candidate_want);
1685 async_recovery->insert(cur_shard);
1686 break;
1687 }
1688 }
1689 }
1690 dout(20) << __func__ << " result want=" << *want
1691 << " async_recovery=" << *async_recovery << dendl;
1692 }
1693
1694 /**
1695 * choose acting
1696 *
1697 * calculate the desired acting, and request a change with the monitor
1698 * if it differs from the current acting.
1699 *
1700 * if restrict_to_up_acting=true, we filter out anything that's not in
1701 * up/acting. in order to lift this restriction, we need to
1702 * 1) check whether it's worth switching the acting set any time we get
1703 * a new pg info (not just here, when recovery finishes)
1704 * 2) check whether anything in want_acting went down on each new map
1705 * (and, if so, calculate a new want_acting)
1706 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1707 * TODO!
1708 */
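// Flow: find the authoritative log shard with find_best_info(), build the
// desired acting/backfill sets (replicated or EC variant), verify the result
// is recoverable and large enough, optionally carve out async recovery
// targets, and queue a pg_temp request if the desired set differs from the
// current acting set.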
1709 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1710 bool restrict_to_up_acting,
1711 bool *history_les_bound)
1712 {
1713 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1714 all_info[pg_whoami] = info;
1715
1716 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
1717 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1718 p != all_info.end();
1719 ++p) {
1720 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1721 }
1722 }
1723
1724 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1725 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1726
1727 if (auth_log_shard == all_info.end()) {
1728 if (up != acting) {
1729 dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
1730 << " reverting to up" << dendl;
1731 want_acting = up;
1732 vector<int> empty;
1733 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1734 } else {
1735 dout(10) << __func__ << " failed" << dendl;
1736 ceph_assert(want_acting.empty());
1737 }
1738 return false;
1739 }
1740
1741 ceph_assert(!auth_log_shard->second.is_incomplete());
1742 auth_log_shard_id = auth_log_shard->first;
1743
1744 set<pg_shard_t> want_backfill, want_acting_backfill;
1745 vector<int> want;
1746 stringstream ss;
1747 if (!pool.info.is_erasure())
1748 calc_replicated_acting(
1749 auth_log_shard,
1750 cct->_conf.get_val<uint64_t>(
1751 "osd_force_auth_primary_missing_objects"),
1752 get_osdmap()->get_pg_size(info.pgid.pgid),
1753 acting,
1754 up,
1755 up_primary,
1756 all_info,
1757 restrict_to_up_acting,
1758 &want,
1759 &want_backfill,
1760 &want_acting_backfill,
1761 get_osdmap(),
1762 ss);
1763 else
1764 calc_ec_acting(
1765 auth_log_shard,
1766 get_osdmap()->get_pg_size(info.pgid.pgid),
1767 acting,
1768 up,
1769 all_info,
1770 restrict_to_up_acting,
1771 &want,
1772 &want_backfill,
1773 &want_acting_backfill,
1774 ss);
1775 dout(10) << ss.str() << dendl;
1776
1777 if (!recoverable_and_ge_min_size(want)) {
1778 want_acting.clear();
1779 return false;
1780 }
1781
1782 set<pg_shard_t> want_async_recovery;
1783 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
1784 if (pool.info.is_erasure()) {
1785 choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1786 } else {
1787 choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1788 }
1789 }
1790 if (want != acting) {
1791 dout(10) << __func__ << " want " << want << " != acting " << acting
1792 << ", requesting pg_temp change" << dendl;
1793 want_acting = want;
1794
1795 if (!cct->_conf->osd_debug_no_acting_change) {
1796 if (want_acting == up) {
1797 // There can't be any pending backfill if
1798 // want is the same as crush map up OSDs.
1799 ceph_assert(want_backfill.empty());
1800 vector<int> empty;
1801 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1802 } else
1803 osd->queue_want_pg_temp(info.pgid.pgid, want);
1804 }
1805 return false;
1806 }
1807 want_acting.clear();
1808 acting_recovery_backfill = want_acting_backfill;
1809 dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
1810 ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
1811 if (backfill_targets.empty()) {
1812 // Caller is GetInfo
1813 backfill_targets = want_backfill;
1814 }
1815 // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
1816 ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
1817 if (async_recovery_targets.empty() || !needs_recovery()) {
1818 async_recovery_targets = want_async_recovery;
1819 }
1820 // Will not change if already set because up would have had to change
1821 // Verify that nothing in backfill is in stray_set
1822 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1823 i != want_backfill.end();
1824 ++i) {
1825 ceph_assert(stray_set.find(*i) == stray_set.end());
1826 }
1827 dout(10) << "choose_acting want=" << want << " backfill_targets="
1828 << want_backfill << " async_recovery_targets="
1829 << async_recovery_targets << dendl;
1830 return true;
1831 }
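
// Hedged usage sketch (hypothetical caller, not the actual peering state
// machine): a false return from choose_acting() means either that no
// authoritative info was found or that a pg_temp change was just
// requested, so a caller would typically wait for a newer osdmap.
//
//   pg_shard_t auth_shard;
//   bool les_bound = false;
//   if (!pg->choose_acting(auth_shard, false, &les_bound)) {
//     return;  // stay in the current state until the map changes
//   }
//   // on true: want_acting is clear and acting_recovery_backfill,
//   // backfill_targets and async_recovery_targets are committed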
1832
1833 /* Build the might_have_unfound set.
1834 *
1835 * This is used by the primary OSD during recovery.
1836 *
1837 * This set tracks the OSDs which might have unfound objects that the primary
1838 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1839 * will remove the OSD from the set.
1840 */
1841 void PG::build_might_have_unfound()
1842 {
1843 ceph_assert(might_have_unfound.empty());
1844 ceph_assert(is_primary());
1845
1846 dout(10) << __func__ << dendl;
1847
1848 check_past_interval_bounds();
1849
1850 might_have_unfound = past_intervals.get_might_have_unfound(
1851 pg_whoami,
1852 pool.info.is_erasure());
1853
1854 // include any (stray) peers
1855 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1856 p != peer_info.end();
1857 ++p)
1858 might_have_unfound.insert(p->first);
1859
1860 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1861 }
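
// Hedged sketch of how the set is consumed (hypothetical handler, not the
// actual recovery code): once a peer in might_have_unfound has reported
// its pg_missing_t, it no longer needs to be queried for unfound objects.
//
//   void on_peer_missing_received(pg_shard_t from) {
//     might_have_unfound.erase(from);  // one fewer place left to search
//   }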
1862
1863 void PG::activate(ObjectStore::Transaction& t,
1864 epoch_t activation_epoch,
1865 map<int, map<spg_t,pg_query_t> >& query_map,
1866 map<int,
1867 vector<
1868 pair<pg_notify_t,
1869 PastIntervals> > > *activator_map,
1870 RecoveryCtx *ctx)
1871 {
1872 ceph_assert(!is_peered());
1873 ceph_assert(scrubber.callbacks.empty());
1874 ceph_assert(callbacks_for_degraded_object.empty());
1875
1876 // twiddle pg state
1877 state_clear(PG_STATE_DOWN);
1878
1879 send_notify = false;
1880
1881 if (is_primary()) {
1882 // only update primary last_epoch_started if we will go active
1883 if (acting.size() >= pool.info.min_size) {
1884 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1885 info.last_epoch_started <= activation_epoch);
1886 info.last_epoch_started = activation_epoch;
1887 info.last_interval_started = info.history.same_interval_since;
1888 }
1889 } else if (is_acting(pg_whoami)) {
1890 /* update last_epoch_started on acting replica to whatever the primary sent
1891 * unless it's smaller (could happen if we are going peered rather than
1892 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1893 if (info.last_epoch_started < activation_epoch) {
1894 info.last_epoch_started = activation_epoch;
1895 info.last_interval_started = info.history.same_interval_since;
1896 }
1897 }
1898
1899 auto &missing = pg_log.get_missing();
1900
1901 if (is_primary()) {
1902 last_update_ondisk = info.last_update;
1903 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1904 }
1905 last_update_applied = info.last_update;
1906 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1907
1908 need_up_thru = false;
1909
1910 // write pg info, log
1911 dirty_info = true;
1912 dirty_big_info = true; // maybe
1913
1914 // find out when we commit
1915 t.register_on_complete(
1916 new C_PG_ActivateCommitted(
1917 this,
1918 get_osdmap_epoch(),
1919 activation_epoch));
1920
1921 if (is_primary()) {
1922 // initialize snap_trimq
1923 if (get_osdmap()->require_osd_release < CEPH_RELEASE_MIMIC) {
1924 dout(20) << "activate - purged_snaps " << info.purged_snaps
1925 << " cached_removed_snaps " << pool.cached_removed_snaps
1926 << dendl;
1927 snap_trimq = pool.cached_removed_snaps;
1928 } else {
1929 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
1930 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
1931 snap_trimq.clear();
1932 if (p != removed_snaps_queue.end()) {
1933 dout(20) << "activate - purged_snaps " << info.purged_snaps
1934 << " removed_snaps " << p->second
1935 << dendl;
1936 for (auto q : p->second) {
1937 snap_trimq.insert(q.first, q.second);
1938 }
1939 }
1940 }
1941 interval_set<snapid_t> purged;
1942 purged.intersection_of(snap_trimq, info.purged_snaps);
1943 snap_trimq.subtract(purged);
1944
1945 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
1946 // adjust purged_snaps: PG may have been inactive while snaps were pruned
1947 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
1948 // reflect only those snaps that we thought were pruned and were still in
1949 // the queue.
1950 info.purged_snaps.swap(purged);
1951 }
1952 }
1953
1954 // init complete pointer
1955 if (missing.num_missing() == 0) {
1956 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1957 << " -> " << info.last_update << dendl;
1958 info.last_complete = info.last_update;
1959 info.stats.stats.sum.num_objects_missing = 0;
1960 pg_log.reset_recovery_pointers();
1961 } else {
1962 dout(10) << "activate - not complete, " << missing << dendl;
1963 info.stats.stats.sum.num_objects_missing = missing.num_missing();
1964 pg_log.activate_not_complete(info);
1965 }
1966
1967 log_weirdness();
1968
1969 // if primary..
1970 if (is_primary()) {
1971 ceph_assert(ctx);
1972 // start up replicas
1973
1974 ceph_assert(!acting_recovery_backfill.empty());
1975 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1976 i != acting_recovery_backfill.end();
1977 ++i) {
1978 if (*i == pg_whoami) continue;
1979 pg_shard_t peer = *i;
1980 ceph_assert(peer_info.count(peer));
1981 pg_info_t& pi = peer_info[peer];
1982
1983 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1984
1985 MOSDPGLog *m = 0;
1986 ceph_assert(peer_missing.count(peer));
1987 pg_missing_t& pm = peer_missing[peer];
1988
1989 bool needs_past_intervals = pi.dne();
1990
1991 /*
1992 * cover case where peer sort order was different and
1993 * last_backfill cannot be interpreted
1994 */
1995 bool force_restart_backfill =
1996 !pi.last_backfill.is_max() &&
1997 !pi.last_backfill_bitwise;
1998
1999 if (pi.last_update == info.last_update && !force_restart_backfill) {
2000 // empty log
2001 if (!pi.last_backfill.is_max())
2002 osd->clog->info() << info.pgid << " continuing backfill to osd."
2003 << peer
2004 << " from (" << pi.log_tail << "," << pi.last_update
2005 << "] " << pi.last_backfill
2006 << " to " << info.last_update;
2007 if (!pi.is_empty() && activator_map) {
2008 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
2009 (*activator_map)[peer.osd].push_back(
2010 make_pair(
2011 pg_notify_t(
2012 peer.shard, pg_whoami.shard,
2013 get_osdmap_epoch(),
2014 get_osdmap_epoch(),
2015 info),
2016 past_intervals));
2017 } else {
2018 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
2019 m = new MOSDPGLog(
2020 i->shard, pg_whoami.shard,
2021 get_osdmap_epoch(), info,
2022 last_peering_reset);
2023 }
2024 } else if (
2025 pg_log.get_tail() > pi.last_update ||
2026 pi.last_backfill == hobject_t() ||
2027 force_restart_backfill ||
2028 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2029 /* ^ This last case covers a situation where a replica is not contiguous
2030 * with the auth_log, but is contiguous with our own log. Reshuffling
2031 * the active set to handle this would be tricky, so instead we just go
2032 * ahead and backfill it anyway. This is probably preferable in any
2033 * case since the replica in question would have to be significantly
2034 * behind.
2035 */
2036 // backfill
2037 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
2038 << " from (" << pi.log_tail << "," << pi.last_update
2039 << "] " << pi.last_backfill
2040 << " to " << info.last_update;
2041
2042 pi.last_update = info.last_update;
2043 pi.last_complete = info.last_update;
2044 pi.set_last_backfill(hobject_t());
2045 pi.last_epoch_started = info.last_epoch_started;
2046 pi.last_interval_started = info.last_interval_started;
2047 pi.history = info.history;
2048 pi.hit_set = info.hit_set;
2049 // Save num_bytes for reservation request, can't be negative
2050 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2051 pi.stats.stats.clear();
2052
2053 // initialize peer with our purged_snaps.
2054 pi.purged_snaps = info.purged_snaps;
2055
2056 m = new MOSDPGLog(
2057 i->shard, pg_whoami.shard,
2058 get_osdmap_epoch(), pi,
2059 last_peering_reset /* epoch to create pg at */);
2060
2061 // send some recent log, so that op dup detection works well.
2062 m->log.copy_up_to(cct, pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
2063 m->info.log_tail = m->log.tail;
2064 pi.log_tail = m->log.tail; // sigh...
2065
2066 pm.clear();
2067 } else {
2068 // catch up
2069 ceph_assert(pg_log.get_tail() <= pi.last_update);
2070 m = new MOSDPGLog(
2071 i->shard, pg_whoami.shard,
2072 get_osdmap_epoch(), info,
2073 last_peering_reset /* epoch to create pg at */);
2074 // send new stuff to append to the replica's log
2075 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2076 }
2077
2078 // share past_intervals if we are creating the pg on the replica
2079 // based on whether our info for that peer was dne() *before*
2080 // updating pi.history in the backfill block above.
2081 if (m && needs_past_intervals)
2082 m->past_intervals = past_intervals;
2083
2084 // update local version of peer's missing list!
2085 if (m && pi.last_backfill != hobject_t()) {
2086 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2087 p != m->log.log.end();
2088 ++p) {
2089 if (p->soid <= pi.last_backfill &&
2090 !p->is_error()) {
2091 if (perform_deletes_during_peering() && p->is_delete()) {
2092 pm.rm(p->soid, p->version);
2093 } else {
2094 pm.add_next_event(*p);
2095 }
2096 }
2097 }
2098 }
2099
2100 if (m) {
2101 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
2102 //m->log.print(cout);
2103 osd->send_message_osd_cluster(peer.osd, m, get_osdmap_epoch());
2104 }
2105
2106 // peer now has
2107 pi.last_update = info.last_update;
2108
2109 // update our missing
2110 if (pm.num_missing() == 0) {
2111 pi.last_complete = pi.last_update;
2112 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
2113 } else {
2114 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
2115 }
2116 }
2117
2118 // Set up missing_loc
2119 set<pg_shard_t> complete_shards;
2120 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2121 i != acting_recovery_backfill.end();
2122 ++i) {
2123 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
2124 if (*i == get_primary()) {
2125 missing_loc.add_active_missing(missing);
2126 if (!missing.have_missing())
2127 complete_shards.insert(*i);
2128 } else {
2129 auto peer_missing_entry = peer_missing.find(*i);
2130 ceph_assert(peer_missing_entry != peer_missing.end());
2131 missing_loc.add_active_missing(peer_missing_entry->second);
2132 if (!peer_missing_entry->second.have_missing() &&
2133 peer_info[*i].last_backfill.is_max())
2134 complete_shards.insert(*i);
2135 }
2136 }
2137
2138 // If necessary, create might_have_unfound to help us find our unfound objects.
2139 // NOTE: It's important that we build might_have_unfound before trimming the
2140 // past intervals.
2141 might_have_unfound.clear();
2142 if (needs_recovery()) {
2143 // If only one shard has missing objects, we add all the others as
2144 // recovery sources; this is considered safe since the PG logs have been
2145 // merged locally, and it covers the vast majority of use cases, e.g. one
2146 // OSD/host being down for a while for hardware repair.
2147 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2148 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
2149 } else {
2150 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2151 ctx->handle);
2152 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2153 i != acting_recovery_backfill.end();
2154 ++i) {
2155 if (*i == pg_whoami) continue;
2156 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2157 ceph_assert(peer_missing.count(*i));
2158 ceph_assert(peer_info.count(*i));
2159 missing_loc.add_source_info(
2160 *i,
2161 peer_info[*i],
2162 peer_missing[*i],
2163 ctx->handle);
2164 }
2165 }
2166 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2167 i != peer_missing.end();
2168 ++i) {
2169 if (is_acting_recovery_backfill(i->first))
2170 continue;
2171 ceph_assert(peer_info.count(i->first));
2172 search_for_missing(
2173 peer_info[i->first],
2174 i->second,
2175 i->first,
2176 ctx);
2177 }
2178
2179 build_might_have_unfound();
2180
2181 // Always call now so _update_calc_stats() will be accurate
2182 discover_all_missing(query_map);
2183 }
2184
2185 // num_objects_degraded, if calculated, should reflect this too, unless
2186 // nothing is missing and we are about to go clean.
2187 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2188 state_set(PG_STATE_UNDERSIZED);
2189 }
2190
2191 state_set(PG_STATE_ACTIVATING);
2192 release_pg_backoffs();
2193 projected_last_update = info.last_update;
2194 }
2195 if (acting.size() >= pool.info.min_size) {
2196 PGLogEntryHandler handler{this, &t};
2197 pg_log.roll_forward(&handler);
2198 }
2199 }
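
// Recap sketch (illustrative only) of the per-peer decision made above for
// each member of acting_recovery_backfill:
//
//   if (pi.last_update == info.last_update && !force_restart_backfill)
//     -> peer log already up to date (possibly still mid-backfill)
//   else if (pg_log.get_tail() > pi.last_update ||
//            pi.last_backfill == hobject_t() || force_restart_backfill ||
//            (backfill_targets.count(peer) && pi.last_backfill.is_max()))
//     -> (re)start backfill from scratch
//   else
//     -> catch the peer up by sending log entries after pi.last_update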
2200
2201 bool PG::op_has_sufficient_caps(OpRequestRef& op)
2202 {
2203 // only check MOSDOp
2204 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
2205 return true;
2206
2207 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
2208
2209 auto priv = req->get_connection()->get_priv();
2210 auto session = static_cast<Session*>(priv.get());
2211 if (!session) {
2212 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
2213 return false;
2214 }
2215 OSDCap& caps = session->caps;
2216 priv.reset();
2217
2218 const string &key = req->get_hobj().get_key().empty() ?
2219 req->get_oid().name :
2220 req->get_hobj().get_key();
2221
2222 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
2223 pool.info.application_metadata,
2224 key,
2225 op->need_read_cap(),
2226 op->need_write_cap(),
2227 op->classes(),
2228 session->get_peer_socket_addr());
2229
2230 dout(20) << "op_has_sufficient_caps "
2231 << "session=" << session
2232 << " pool=" << pool.id << " (" << pool.name
2233 << " " << req->get_hobj().nspace
2234 << ")"
2235 << " pool_app_metadata=" << pool.info.application_metadata
2236 << " need_read_cap=" << op->need_read_cap()
2237 << " need_write_cap=" << op->need_write_cap()
2238 << " classes=" << op->classes()
2239 << " -> " << (cap ? "yes" : "NO")
2240 << dendl;
2241 return cap;
2242 }
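
// Hedged example (the cap string is hypothetical): a client authenticated
// with an OSD cap such as
//
//   osd 'allow rw pool=rbd namespace=ns1'
//
// would pass the is_capable() check above for reads and writes to objects
// in pool "rbd", namespace "ns1", and would fail it for other
// pools/namespaces or for operations needing caps it does not grant
// (e.g. class execute).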
2243
2244 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
2245 {
2246 lock();
2247 if (pg_has_reset_since(epoch)) {
2248 dout(10) << "_activate_committed " << epoch
2249 << ", that was an old interval" << dendl;
2250 } else if (is_primary()) {
2251 ceph_assert(!peer_activated.count(pg_whoami));
2252 peer_activated.insert(pg_whoami);
2253 dout(10) << "_activate_committed " << epoch
2254 << " peer_activated now " << peer_activated
2255 << " last_interval_started " << info.history.last_interval_started
2256 << " last_epoch_started " << info.history.last_epoch_started
2257 << " same_interval_since " << info.history.same_interval_since << dendl;
2258 ceph_assert(!acting_recovery_backfill.empty());
2259 if (peer_activated.size() == acting_recovery_backfill.size())
2260 all_activated_and_committed();
2261 } else {
2262 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
2263 MOSDPGInfo *m = new MOSDPGInfo(epoch);
2264 pg_notify_t i = pg_notify_t(
2265 get_primary().shard, pg_whoami.shard,
2266 get_osdmap_epoch(),
2267 get_osdmap_epoch(),
2268 info);
2269
2270 i.info.history.last_epoch_started = activation_epoch;
2271 i.info.history.last_interval_started = i.info.history.same_interval_since;
2272 if (acting.size() >= pool.info.min_size) {
2273 state_set(PG_STATE_ACTIVE);
2274 } else {
2275 state_set(PG_STATE_PEERED);
2276 }
2277
2278 m->pg_list.push_back(make_pair(i, PastIntervals()));
2279 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap_epoch());
2280
2281 // waiters
2282 if (flushes_in_progress == 0) {
2283 requeue_ops(waiting_for_peered);
2284 } else if (!waiting_for_peered.empty()) {
2285 dout(10) << __func__ << " flushes in progress, moving "
2286 << waiting_for_peered.size() << " items to waiting_for_flush"
2287 << dendl;
2288 ceph_assert(waiting_for_flush.empty());
2289 waiting_for_flush.swap(waiting_for_peered);
2290 }
2291 }
2292
2293 ceph_assert(!dirty_info);
2294
2295 unlock();
2296 }
2297
2298 /*
2299 * update info.history.last_epoch_started ONLY after we and all
2300 * replicas have activated AND committed the activate transaction
2301 * (i.e. the peering results are stable on disk).
2302 */
2303 void PG::all_activated_and_committed()
2304 {
2305 dout(10) << "all_activated_and_committed" << dendl;
2306 ceph_assert(is_primary());
2307 ceph_assert(peer_activated.size() == acting_recovery_backfill.size());
2308 ceph_assert(!acting_recovery_backfill.empty());
2309 ceph_assert(blocked_by.empty());
2310
2311 // Degraded?
2312 _update_calc_stats();
2313 if (info.stats.stats.sum.num_objects_degraded) {
2314 state_set(PG_STATE_DEGRADED);
2315 } else {
2316 state_clear(PG_STATE_DEGRADED);
2317 }
2318
2319 queue_peering_event(
2320 PGPeeringEventRef(
2321 std::make_shared<PGPeeringEvent>(
2322 get_osdmap_epoch(),
2323 get_osdmap_epoch(),
2324 AllReplicasActivated())));
2325 }
2326
2327 bool PG::requeue_scrub(bool high_priority)
2328 {
2329 ceph_assert(is_locked());
2330 if (scrub_queued) {
2331 dout(10) << __func__ << ": already queued" << dendl;
2332 return false;
2333 } else {
2334 dout(10) << __func__ << ": queueing" << dendl;
2335 scrub_queued = true;
2336 osd->queue_for_scrub(this, high_priority);
2337 return true;
2338 }
2339 }
2340
2341 void PG::queue_recovery()
2342 {
2343 if (!is_primary() || !is_peered()) {
2344 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2345 ceph_assert(!recovery_queued);
2346 } else if (recovery_queued) {
2347 dout(10) << "queue_recovery -- already queued" << dendl;
2348 } else {
2349 dout(10) << "queue_recovery -- queuing" << dendl;
2350 recovery_queued = true;
2351 osd->queue_for_recovery(this);
2352 }
2353 }
2354
2355 bool PG::queue_scrub()
2356 {
2357 ceph_assert(is_locked());
2358 if (is_scrubbing()) {
2359 return false;
2360 }
2361 // An interrupted recovery repair could leave this set.
2362 state_clear(PG_STATE_REPAIR);
2363 if (scrubber.need_auto) {
2364 scrubber.must_scrub = true;
2365 scrubber.must_deep_scrub = true;
2366 scrubber.auto_repair = true;
2367 scrubber.need_auto = false;
2368 }
2369 scrubber.priority = scrubber.must_scrub ?
2370 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2371 scrubber.must_scrub = false;
2372 state_set(PG_STATE_SCRUBBING);
2373 if (scrubber.must_deep_scrub) {
2374 state_set(PG_STATE_DEEP_SCRUB);
2375 scrubber.must_deep_scrub = false;
2376 }
2377 if (scrubber.must_repair || scrubber.auto_repair) {
2378 state_set(PG_STATE_REPAIR);
2379 scrubber.must_repair = false;
2380 }
2381 requeue_scrub();
2382 return true;
2383 }
2384
2385 unsigned PG::get_scrub_priority()
2386 {
2387 // a higher value -> a higher priority
2388 int64_t pool_scrub_priority = 0;
2389 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2390 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2391 }
2392
2393 void PG::try_mark_clean()
2394 {
2395 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2396 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2397 state_set(PG_STATE_CLEAN);
2398 info.history.last_epoch_clean = get_osdmap_epoch();
2399 info.history.last_interval_clean = info.history.same_interval_since;
2400 past_intervals.clear();
2401 dirty_big_info = true;
2402 dirty_info = true;
2403 }
2404
2405 if (is_active()) {
2406 kick_snap_trim();
2407 } else if (is_peered()) {
2408 if (is_clean()) {
2409 bool target;
2410 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2411 if (target) {
2412 ldout(cct, 10) << "ready to merge (target)" << dendl;
2413 osd->set_ready_to_merge_target(this,
2414 info.last_update,
2415 info.history.last_epoch_started,
2416 info.history.last_epoch_clean);
2417 } else {
2418 ldout(cct, 10) << "ready to merge (source)" << dendl;
2419 osd->set_ready_to_merge_source(this, info.last_update);
2420 }
2421 }
2422 } else {
2423 ldout(cct, 10) << "not clean, not ready to merge" << dendl;
2424 // we should have notified OSD in Active state entry point
2425 }
2426 }
2427
2428 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2429
2430 share_pg_info();
2431 publish_stats_to_osd();
2432 requeue_ops(waiting_for_clean_to_primary_repair);
2433 }
2434
2435 bool PG::set_force_recovery(bool b)
2436 {
2437 bool did = false;
2438 if (b) {
2439 if (!(state & PG_STATE_FORCED_RECOVERY) &&
2440 (state & (PG_STATE_DEGRADED |
2441 PG_STATE_RECOVERY_WAIT |
2442 PG_STATE_RECOVERING))) {
2443 dout(20) << __func__ << " set" << dendl;
2444 state_set(PG_STATE_FORCED_RECOVERY);
2445 publish_stats_to_osd();
2446 did = true;
2447 }
2448 } else if (state & PG_STATE_FORCED_RECOVERY) {
2449 dout(20) << __func__ << " clear" << dendl;
2450 state_clear(PG_STATE_FORCED_RECOVERY);
2451 publish_stats_to_osd();
2452 did = true;
2453 }
2454 if (did) {
2455 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2456 osd->local_reserver.update_priority(info.pgid, get_recovery_priority());
2457 }
2458 return did;
2459 }
2460
2461 bool PG::set_force_backfill(bool b)
2462 {
2463 bool did = false;
2464 if (b) {
2465 if (!(state & PG_STATE_FORCED_BACKFILL) &&
2466 (state & (PG_STATE_DEGRADED |
2467 PG_STATE_BACKFILL_WAIT |
2468 PG_STATE_BACKFILLING))) {
2469 dout(10) << __func__ << " set" << dendl;
2470 state_set(PG_STATE_FORCED_BACKFILL);
2471 publish_stats_to_osd();
2472 did = true;
2473 }
2474 } else if (state & PG_STATE_FORCED_BACKFILL) {
2475 dout(10) << __func__ << " clear" << dendl;
2476 state_clear(PG_STATE_FORCED_BACKFILL);
2477 publish_stats_to_osd();
2478 did = true;
2479 }
2480 if (did) {
2481 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2482 osd->local_reserver.update_priority(info.pgid, get_backfill_priority());
2483 }
2484 return did;
2485 }
2486
2487 int PG::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
2488 {
2489 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2490 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2491
2492 ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
2493
2494 // User can't set this too high anymore, but might be a legacy value
2495 if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
2496 pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
2497 if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
2498 pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
2499 // Shift range from [min, max] to [0, max - min]
2500 pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
2501 ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
2502
2503 priority += pool_recovery_priority;
2504
2505 // Clamp to valid range
2506 if (priority > max) {
2507 return max;
2508 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2509 return OSD_RECOVERY_PRIORITY_MIN;
2510 } else {
2511 return priority;
2512 }
2513 }
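
// Worked example (the constant values are assumptions for illustration,
// e.g. OSD_POOL_PRIORITY_MIN = -10, OSD_RECOVERY_PRIORITY_MIN = 0): with
// priority = 180 and a pool recovery_priority of -3, the pool value is
// first shifted into the non-negative range, -3 + (0 - (-10)) = 7, then
// added, 180 + 7 = 187, and finally clamped to
// [OSD_RECOVERY_PRIORITY_MIN, max] if it fell outside that range.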
2514
2515 unsigned PG::get_recovery_priority()
2516 {
2517 // a higher value -> a higher priority
2518 int ret = OSD_RECOVERY_PRIORITY_BASE;
2519 int base = ret;
2520
2521 if (state & PG_STATE_FORCED_RECOVERY) {
2522 ret = OSD_RECOVERY_PRIORITY_FORCED;
2523 } else {
2524 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
2525 if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
2526 base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
2527 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2528 ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
2529 }
2530
2531 int64_t pool_recovery_priority = 0;
2532 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2533
2534 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2535 }
2536 dout(20) << __func__ << " recovery priority is " << ret << dendl;
2537 return static_cast<unsigned>(ret);
2538 }
2539
2540 unsigned PG::get_backfill_priority()
2541 {
2542 // a higher value -> a higher priority
2543 int ret = OSD_BACKFILL_PRIORITY_BASE;
2544 int base = ret;
2545
2546 if (state & PG_STATE_FORCED_BACKFILL) {
2547 ret = OSD_BACKFILL_PRIORITY_FORCED;
2548 } else {
2549 if (acting.size() < pool.info.min_size) {
2550 base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
2551 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2552 ret = base + (pool.info.min_size - acting.size());
2553
2554 } else if (is_undersized()) {
2555 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2556 ceph_assert(pool.info.size > actingset.size());
2557 base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2558 ret = base + (pool.info.size - actingset.size());
2559
2560 } else if (is_degraded()) {
2561 // degraded: baseline degraded
2562 base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2563 }
2564
2565 // Adjust with pool's recovery priority
2566 int64_t pool_recovery_priority = 0;
2567 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2568
2569 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2570 }
2571
2572 dout(20) << __func__ << " backfill priority is " << ret << dendl;
2573 return static_cast<unsigned>(ret);
2574 }
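
// Summary sketch (illustrative; the *_PRIORITY_* constants are defined in
// the OSD headers): both priority functions above produce an ordering of
// roughly
//
//   forced (highest) > inactive (below min_size) > undersized/degraded > base
//
// with the pool's recovery_priority option applied as a bounded offset via
// clamp_recovery_priority().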
2575
2576 unsigned PG::get_delete_priority()
2577 {
2578 auto state = get_osdmap()->get_state(osd->whoami);
2579 if (state & (CEPH_OSD_BACKFILLFULL |
2580 CEPH_OSD_FULL)) {
2581 return OSD_DELETE_PRIORITY_FULL;
2582 } else if (state & CEPH_OSD_NEARFULL) {
2583 return OSD_DELETE_PRIORITY_FULLISH;
2584 } else {
2585 return OSD_DELETE_PRIORITY_NORMAL;
2586 }
2587 }
2588
2589 Context *PG::finish_recovery()
2590 {
2591 dout(10) << "finish_recovery" << dendl;
2592 ceph_assert(info.last_complete == info.last_update);
2593
2594 clear_recovery_state();
2595
2596 /*
2597 * sync all this before purging strays. but don't block!
2598 */
2599 finish_sync_event = new C_PG_FinishRecovery(this);
2600 return finish_sync_event;
2601 }
2602
2603 void PG::_finish_recovery(Context *c)
2604 {
2605 lock();
2606 // When recovery is initiated by a repair, that flag is left on
2607 state_clear(PG_STATE_REPAIR);
2608 if (deleting) {
2609 unlock();
2610 return;
2611 }
2612 if (c == finish_sync_event) {
2613 dout(10) << "_finish_recovery" << dendl;
2614 finish_sync_event = 0;
2615 purge_strays();
2616
2617 publish_stats_to_osd();
2618
2619 if (scrub_after_recovery) {
2620 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2621 scrub_after_recovery = false;
2622 scrubber.must_deep_scrub = true;
2623 scrubber.check_repair = true;
2624 queue_scrub();
2625 }
2626 } else {
2627 dout(10) << "_finish_recovery -- stale" << dendl;
2628 }
2629 unlock();
2630 }
2631
2632 void PG::start_recovery_op(const hobject_t& soid)
2633 {
2634 dout(10) << "start_recovery_op " << soid
2635 #ifdef DEBUG_RECOVERY_OIDS
2636 << " (" << recovering_oids << ")"
2637 #endif
2638 << dendl;
2639 ceph_assert(recovery_ops_active >= 0);
2640 recovery_ops_active++;
2641 #ifdef DEBUG_RECOVERY_OIDS
2642 recovering_oids.insert(soid);
2643 #endif
2644 osd->start_recovery_op(this, soid);
2645 }
2646
2647 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2648 {
2649 dout(10) << "finish_recovery_op " << soid
2650 #ifdef DEBUG_RECOVERY_OIDS
2651 << " (" << recovering_oids << ")"
2652 #endif
2653 << dendl;
2654 ceph_assert(recovery_ops_active > 0);
2655 recovery_ops_active--;
2656 #ifdef DEBUG_RECOVERY_OIDS
2657 ceph_assert(recovering_oids.count(soid));
2658 recovering_oids.erase(recovering_oids.find(soid));
2659 #endif
2660 osd->finish_recovery_op(this, soid, dequeue);
2661
2662 if (!dequeue) {
2663 queue_recovery();
2664 }
2665 }
2666
2667 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2668 {
2669 child->update_snap_mapper_bits(split_bits);
2670 child->update_osdmap_ref(get_osdmap());
2671
2672 child->pool = pool;
2673
2674 // Log
2675 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2676 child->info.last_complete = info.last_complete;
2677
2678 info.last_update = pg_log.get_head();
2679 child->info.last_update = child->pg_log.get_head();
2680
2681 child->info.last_user_version = info.last_user_version;
2682
2683 info.log_tail = pg_log.get_tail();
2684 child->info.log_tail = child->pg_log.get_tail();
2685
2686 // reset last_complete, we might have modified pg_log & missing above
2687 pg_log.reset_complete_to(&info);
2688 child->pg_log.reset_complete_to(&child->info);
2689
2690 // Info
2691 child->info.history = info.history;
2692 child->info.history.epoch_created = get_osdmap_epoch();
2693 child->info.purged_snaps = info.purged_snaps;
2694
2695 if (info.last_backfill.is_max()) {
2696 child->info.set_last_backfill(hobject_t::get_max());
2697 } else {
2698 // restart backfill on parent and child to be safe. we could
2699 // probably do better in the bitwise sort case, but it's more
2700 // fragile (there may be special work to do on backfill completion
2701 // in the future).
2702 info.set_last_backfill(hobject_t());
2703 child->info.set_last_backfill(hobject_t());
2704 // restarting backfill implies that the missing set is empty,
2705 // since it is only used for objects prior to last_backfill
2706 pg_log.reset_backfill();
2707 child->pg_log.reset_backfill();
2708 }
2709
2710 child->info.stats = info.stats;
2711 child->info.stats.parent_split_bits = split_bits;
2712 info.stats.stats_invalid = true;
2713 child->info.stats.stats_invalid = true;
2714 child->info.last_epoch_started = info.last_epoch_started;
2715 child->info.last_interval_started = info.last_interval_started;
2716
2717 child->snap_trimq = snap_trimq;
2718
2719 // There can't be recovery/backfill going on now
2720 int primary, up_primary;
2721 vector<int> newup, newacting;
2722 get_osdmap()->pg_to_up_acting_osds(
2723 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2724 child->init_primary_up_acting(
2725 newup,
2726 newacting,
2727 up_primary,
2728 primary);
2729 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2730
2731 // this comparison includes primary rank via pg_shard_t
2732 if (get_primary() != child->get_primary())
2733 child->info.history.same_primary_since = get_osdmap_epoch();
2734
2735 child->info.stats.up = up;
2736 child->info.stats.up_primary = up_primary;
2737 child->info.stats.acting = acting;
2738 child->info.stats.acting_primary = primary;
2739 child->info.stats.mapping_epoch = get_osdmap_epoch();
2740
2741 // History
2742 child->past_intervals = past_intervals;
2743
2744 _split_into(child_pgid, child, split_bits);
2745
2746 // release all backoffs for simplicity
2747 release_backoffs(hobject_t(), hobject_t::get_max());
2748
2749 child->on_new_interval();
2750
2751 child->send_notify = !child->is_primary();
2752
2753 child->dirty_info = true;
2754 child->dirty_big_info = true;
2755 dirty_info = true;
2756 dirty_big_info = true;
2757 }
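
// Hedged illustration (helper name and details are assumptions): split_bits
// governs which child PG an object follows after a split.  Conceptually, an
// object whose hash is H ends up in the child whose pg seed matches the low
// split_bits bits of H, which is what the snap mapper and log split above
// rely on.
//
//   uint32_t child_seed_for(uint32_t hash, unsigned split_bits) {
//     return hash & ((1u << split_bits) - 1);  // low split_bits bits of H
//   }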
2758
2759 void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
2760 {
2761 out->resize(childpgs.size() + 1);
2762 info.stats.stats.sum.split(*out);
2763 }
2764
2765 void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t)
2766 {
2767 info.stats.stats.sum = stats;
2768 write_if_dirty(*t);
2769 }
2770
2771 void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
2772 unsigned split_bits,
2773 const pg_merge_meta_t& last_pg_merge_meta)
2774 {
2775 dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
2776 << dendl;
2777 bool incomplete = false;
2778 if (info.last_complete != info.last_update ||
2779 info.is_incomplete() ||
2780 info.dne()) {
2781 dout(10) << __func__ << " target incomplete" << dendl;
2782 incomplete = true;
2783 }
2784 if (last_pg_merge_meta.source_pgid != pg_t()) {
2785 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2786 dout(10) << __func__ << " target doesn't match expected parent "
2787 << last_pg_merge_meta.source_pgid.get_parent()
2788 << " of source_pgid " << last_pg_merge_meta.source_pgid
2789 << dendl;
2790 incomplete = true;
2791 }
2792 if (info.last_update != last_pg_merge_meta.target_version) {
2793 dout(10) << __func__ << " target version doesn't match expected "
2794 << last_pg_merge_meta.target_version << dendl;
2795 incomplete = true;
2796 }
2797 }
2798
2799 PGLogEntryHandler handler{this, rctx->transaction};
2800 pg_log.roll_forward(&handler);
2801
2802 info.last_complete = info.last_update; // to fake out trim()
2803 pg_log.reset_recovery_pointers();
2804 pg_log.trim(info.last_update, info);
2805
2806 vector<PGLog*> log_from;
2807 for (auto& i : sources) {
2808 auto& source = i.second;
2809 if (!source) {
2810 dout(10) << __func__ << " source " << i.first << " missing" << dendl;
2811 incomplete = true;
2812 continue;
2813 }
2814 if (source->info.last_complete != source->info.last_update ||
2815 source->info.is_incomplete() ||
2816 source->info.dne()) {
2817 dout(10) << __func__ << " source " << source->pg_id << " incomplete"
2818 << dendl;
2819 incomplete = true;
2820 }
2821 if (last_pg_merge_meta.source_pgid != pg_t()) {
2822 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
2823 dout(10) << __func__ << " source " << source->info.pgid.pgid
2824 << " doesn't match expected source pgid "
2825 << last_pg_merge_meta.source_pgid << dendl;
2826 incomplete = true;
2827 }
2828 if (source->info.last_update != last_pg_merge_meta.source_version) {
2829 dout(10) << __func__ << " source version doesn't match expected "
2830 << last_pg_merge_meta.source_version << dendl;
2831 incomplete = true;
2832 }
2833 }
2834
2835 // prepare log
2836 PGLogEntryHandler handler{source.get(), rctx->transaction};
2837 source->pg_log.roll_forward(&handler);
2838 source->info.last_complete = source->info.last_update; // to fake out trim()
2839 source->pg_log.reset_recovery_pointers();
2840 source->pg_log.trim(source->info.last_update, source->info);
2841 log_from.push_back(&source->pg_log);
2842
2843 // wipe out source's pgmeta
2844 rctx->transaction->remove(source->coll, source->pgmeta_oid);
2845
2846 // merge (and destroy source collection)
2847 rctx->transaction->merge_collection(source->coll, coll, split_bits);
2848
2849 // combine stats
2850 info.stats.add(source->info.stats);
2851
2852 // pull up last_update
2853 info.last_update = std::max(info.last_update, source->info.last_update);
2854
2855 // adopt source's PastIntervals if target has none. we can do this since
2856 // pgp_num has been reduced prior to the merge, so the OSD mappings for
2857 // the PGs are identical.
2858 if (past_intervals.empty() && !source->past_intervals.empty()) {
2859 dout(10) << __func__ << " taking source's past_intervals" << dendl;
2860 past_intervals = source->past_intervals;
2861 }
2862 }
2863
2864 // merge_collection does this, but maybe all of our sources were missing.
2865 rctx->transaction->collection_set_bits(coll, split_bits);
2866
2867 info.last_complete = info.last_update;
2868 info.log_tail = info.last_update;
2869 if (incomplete) {
2870 info.last_backfill = hobject_t();
2871 }
2872
2873 snap_mapper.update_bits(split_bits);
2874
2875 // merge logs
2876 pg_log.merge_from(log_from, info.last_update);
2877
2878 // make sure we have a meaningful last_epoch_started/clean (if we were a
2879 // placeholder)
2880 if (info.last_epoch_started == 0) {
2881 // start with (a) source's history, since these PGs *should* have been
2882 // remapped in concert with each other...
2883 info.history = sources.begin()->second->info.history;
2884
2885 // we use the last_epoch_{started,clean} we got from
2886 // the caller, which are the epochs that were reported when the PGs were
2887 // found to be ready for merge.
2888 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
2889 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2890 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2891 dout(10) << __func__
2892 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
2893 << last_pg_merge_meta.last_epoch_clean
2894 << " from pool last_dec_*, source pg history was "
2895 << sources.begin()->second->info.history
2896 << dendl;
2897
2898 // if the past_intervals start is later than last_epoch_clean, it
2899 // implies the source re-peered again but the target didn't, or
2900 // that the source became clean in a later epoch than the target.
2901 // avoid the discrepancy by adjusting the interval start
2902 // backwards to match so that check_past_interval_bounds() will
2903 // not complain.
2904 auto pib = past_intervals.get_bounds();
2905 if (info.history.last_epoch_clean < pib.first) {
2906 dout(10) << __func__ << " last_epoch_clean "
2907 << info.history.last_epoch_clean << " < past_interval start "
2908 << pib.first << ", adjusting start backwards" << dendl;
2909 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
2910 }
2911
2912 // Similarly, if the same_interval_since value is later than
2913 // last_epoch_clean, the next interval change will result in a
2914 // past_interval start that is later than last_epoch_clean. This
2915 // can happen if we use the pg_history values from the merge
2916 // source. Adjust the same_interval_since value backwards if that
2917 // happens. (We trust the les and lec values more because they came from
2918 // the real target, whereas the history value we stole from the source.)
2919 if (info.history.last_epoch_started < info.history.same_interval_since) {
2920 dout(10) << __func__ << " last_epoch_started "
2921 << info.history.last_epoch_started << " < same_interval_since "
2922 << info.history.same_interval_since
2923 << ", adjusting pg_history backwards" << dendl;
2924 info.history.same_interval_since = info.history.last_epoch_clean;
2925 // make sure same_{up,primary}_since are <= same_interval_since
2926 info.history.same_up_since = std::min(
2927 info.history.same_up_since, info.history.same_interval_since);
2928 info.history.same_primary_since = std::min(
2929 info.history.same_primary_since, info.history.same_interval_since);
2930 }
2931 }
2932
2933 dirty_info = true;
2934 dirty_big_info = true;
2935 }
2936
2937 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2938 {
2939 ConnectionRef con = s->con;
2940 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2941 return;
2942 BackoffRef b(s->have_backoff(info.pgid, begin));
2943 if (b) {
2944 derr << __func__ << " already have backoff for " << s << " begin " << begin
2945 << " " << *b << dendl;
2946 ceph_abort();
2947 }
2948 std::lock_guard l(backoff_lock);
2949 {
2950 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2951 backoffs[begin].insert(b);
2952 s->add_backoff(b);
2953 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2954 }
2955 con->send_message(
2956 new MOSDBackoff(
2957 info.pgid,
2958 get_osdmap_epoch(),
2959 CEPH_OSD_BACKOFF_OP_BLOCK,
2960 b->id,
2961 begin,
2962 end));
2963 }
2964
2965 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2966 {
2967 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2968 vector<BackoffRef> bv;
2969 {
2970 std::lock_guard l(backoff_lock);
2971 auto p = backoffs.lower_bound(begin);
2972 while (p != backoffs.end()) {
2973 int r = cmp(p->first, end);
2974 dout(20) << __func__ << " ? " << r << " " << p->first
2975 << " " << p->second << dendl;
2976 // note: must still examine begin=end=p->first case
2977 if (r > 0 || (r == 0 && begin < end)) {
2978 break;
2979 }
2980 dout(20) << __func__ << " checking " << p->first
2981 << " " << p->second << dendl;
2982 auto q = p->second.begin();
2983 while (q != p->second.end()) {
2984 dout(20) << __func__ << " checking " << *q << dendl;
2985 int r = cmp((*q)->begin, begin);
2986 if (r == 0 || (r > 0 && (*q)->end < end)) {
2987 bv.push_back(*q);
2988 q = p->second.erase(q);
2989 } else {
2990 ++q;
2991 }
2992 }
2993 if (p->second.empty()) {
2994 p = backoffs.erase(p);
2995 } else {
2996 ++p;
2997 }
2998 }
2999 }
3000 for (auto b : bv) {
3001 std::lock_guard l(b->lock);
3002 dout(10) << __func__ << " " << *b << dendl;
3003 if (b->session) {
3004 ceph_assert(b->pg == this);
3005 ConnectionRef con = b->session->con;
3006 if (con) { // OSD::ms_handle_reset clears s->con without a lock
3007 con->send_message(
3008 new MOSDBackoff(
3009 info.pgid,
3010 get_osdmap_epoch(),
3011 CEPH_OSD_BACKOFF_OP_UNBLOCK,
3012 b->id,
3013 b->begin,
3014 b->end));
3015 }
3016 if (b->is_new()) {
3017 b->state = Backoff::STATE_DELETING;
3018 } else {
3019 b->session->rm_backoff(b);
3020 b->session.reset();
3021 }
3022 b->pg.reset();
3023 }
3024 }
3025 }
3026
3027 void PG::clear_backoffs()
3028 {
3029 dout(10) << __func__ << " " << dendl;
3030 map<hobject_t,set<BackoffRef>> ls;
3031 {
3032 std::lock_guard l(backoff_lock);
3033 ls.swap(backoffs);
3034 }
3035 for (auto& p : ls) {
3036 for (auto& b : p.second) {
3037 std::lock_guard l(b->lock);
3038 dout(10) << __func__ << " " << *b << dendl;
3039 if (b->session) {
3040 ceph_assert(b->pg == this);
3041 if (b->is_new()) {
3042 b->state = Backoff::STATE_DELETING;
3043 } else {
3044 b->session->rm_backoff(b);
3045 b->session.reset();
3046 }
3047 b->pg.reset();
3048 }
3049 }
3050 }
3051 }
3052
3053 // called by Session::clear_backoffs()
3054 void PG::rm_backoff(BackoffRef b)
3055 {
3056 dout(10) << __func__ << " " << *b << dendl;
3057 std::lock_guard l(backoff_lock);
3058 ceph_assert(b->lock.is_locked_by_me());
3059 ceph_assert(b->pg == this);
3060 auto p = backoffs.find(b->begin);
3061 // may race with release_backoffs()
3062 if (p != backoffs.end()) {
3063 auto q = p->second.find(b);
3064 if (q != p->second.end()) {
3065 p->second.erase(q);
3066 if (p->second.empty()) {
3067 backoffs.erase(p);
3068 }
3069 }
3070 }
3071 }
3072
3073 void PG::clear_recovery_state()
3074 {
3075 dout(10) << "clear_recovery_state" << dendl;
3076
3077 pg_log.reset_recovery_pointers();
3078 finish_sync_event = 0;
3079
3080 hobject_t soid;
3081 while (recovery_ops_active > 0) {
3082 #ifdef DEBUG_RECOVERY_OIDS
3083 soid = *recovering_oids.begin();
3084 #endif
3085 finish_recovery_op(soid, true);
3086 }
3087
3088 async_recovery_targets.clear();
3089 backfill_targets.clear();
3090 backfill_info.clear();
3091 peer_backfill_info.clear();
3092 waiting_on_backfill.clear();
3093 _clear_recovery_state(); // pg impl specific hook
3094 }
3095
3096 void PG::cancel_recovery()
3097 {
3098 dout(10) << "cancel_recovery" << dendl;
3099 clear_recovery_state();
3100 }
3101
3102
3103 void PG::purge_strays()
3104 {
3105 if (is_premerge()) {
3106 dout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
3107 << dendl;
3108 return;
3109 }
3110 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
3111 return;
3112 }
3113 dout(10) << "purge_strays " << stray_set << dendl;
3114
3115 bool removed = false;
3116 for (set<pg_shard_t>::iterator p = stray_set.begin();
3117 p != stray_set.end();
3118 ++p) {
3119 ceph_assert(!is_acting_recovery_backfill(*p));
3120 if (get_osdmap()->is_up(p->osd)) {
3121 dout(10) << "sending PGRemove to osd." << *p << dendl;
3122 vector<spg_t> to_remove;
3123 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
3124 MOSDPGRemove *m = new MOSDPGRemove(
3125 get_osdmap_epoch(),
3126 to_remove);
3127 osd->send_message_osd_cluster(p->osd, m, get_osdmap_epoch());
3128 } else {
3129 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
3130 }
3131 peer_missing.erase(*p);
3132 peer_info.erase(*p);
3133 peer_purged.insert(*p);
3134 removed = true;
3135 }
3136
3137 // if we removed anyone, update peers (which include peer_info)
3138 if (removed)
3139 update_heartbeat_peers();
3140
3141 stray_set.clear();
3142
3143 // clear _requested maps; we may have to peer() again if we discover
3144 // (more) stray content
3145 peer_log_requested.clear();
3146 peer_missing_requested.clear();
3147 }
3148
3149 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
3150 {
3151 std::lock_guard l(heartbeat_peer_lock);
3152 probe_targets.clear();
3153 for (set<pg_shard_t>::iterator i = probe_set.begin();
3154 i != probe_set.end();
3155 ++i) {
3156 probe_targets.insert(i->osd);
3157 }
3158 }
3159
3160 void PG::clear_probe_targets()
3161 {
3162 std::lock_guard l(heartbeat_peer_lock);
3163 probe_targets.clear();
3164 }
3165
3166 void PG::update_heartbeat_peers()
3167 {
3168 ceph_assert(is_locked());
3169
3170 if (!is_primary())
3171 return;
3172
3173 set<int> new_peers;
3174 for (unsigned i=0; i<acting.size(); i++) {
3175 if (acting[i] != CRUSH_ITEM_NONE)
3176 new_peers.insert(acting[i]);
3177 }
3178 for (unsigned i=0; i<up.size(); i++) {
3179 if (up[i] != CRUSH_ITEM_NONE)
3180 new_peers.insert(up[i]);
3181 }
3182 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
3183 p != peer_info.end();
3184 ++p)
3185 new_peers.insert(p->first.osd);
3186
3187 bool need_update = false;
3188 heartbeat_peer_lock.Lock();
3189 if (new_peers == heartbeat_peers) {
3190 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
3191 } else {
3192 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
3193 heartbeat_peers.swap(new_peers);
3194 need_update = true;
3195 }
3196 heartbeat_peer_lock.Unlock();
3197
3198 if (need_update)
3199 osd->need_heartbeat_peer_update();
3200 }
3201
3202
3203 bool PG::check_in_progress_op(
3204 const osd_reqid_t &r,
3205 eversion_t *version,
3206 version_t *user_version,
3207 int *return_code) const
3208 {
3209 return (
3210 projected_log.get_request(r, version, user_version, return_code) ||
3211 pg_log.get_log().get_request(r, version, user_version, return_code));
3212 }
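
// Hedged usage sketch (hypothetical caller): a resent client request can be
// answered from the recorded result instead of being re-executed.
//
//   eversion_t ver;
//   version_t user_ver;
//   int ret;
//   if (pg->check_in_progress_op(reqid, &ver, &user_ver, &ret)) {
//     // reply with the previously recorded version / return code
//   }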
3213
3214 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3215 {
3216 for (auto&p : pgs)
3217 if (p.shard == shard)
3218 return true;
3219 return false;
3220 }
3221
3222 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3223 {
3224 for (auto&p : pgs) {
3225 if (p == skip)
3226 continue;
3227 if (p.shard == shard)
3228 return p;
3229 }
3230 return pg_shard_t();
3231 }
3232
3233 void PG::_update_calc_stats()
3234 {
3235 info.stats.version = info.last_update;
3236 info.stats.created = info.history.epoch_created;
3237 info.stats.last_scrub = info.history.last_scrub;
3238 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3239 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3240 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3241 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3242 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3243
3244 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3245 info.stats.ondisk_log_size = info.stats.log_size;
3246 info.stats.log_start = pg_log.get_tail();
3247 info.stats.ondisk_log_start = pg_log.get_tail();
3248 info.stats.snaptrimq_len = snap_trimq.size();
3249
3250 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3251
3252 // In the rare case that upset is too large (usually transient), use it
3253 // as the target for the calculations below.
3254 // For an undersized PG, actingset may be larger than upset with OSDs out
3255 // For undersized actingset may be larger with OSDs out
3256 unsigned nrep = std::max(actingset.size(), upset.size());
3257 // calc num_object_copies
3258 info.stats.stats.calc_copies(std::max(target, nrep));
3259 info.stats.stats.sum.num_objects_degraded = 0;
3260 info.stats.stats.sum.num_objects_unfound = 0;
3261 info.stats.stats.sum.num_objects_misplaced = 0;
3262 info.stats.avail_no_missing.clear();
3263 info.stats.object_location_counts.clear();
3264
3265 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
3266 dout(20) << __func__ << " actingset " << actingset << " upset "
3267 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3268 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
3269
3270 ceph_assert(!acting_recovery_backfill.empty());
3271
3272 bool estimate = false;
3273
3274 // NOTE: we only generate degraded, misplaced and unfound
3275 // values for the summation, not individual stat categories.
3276 int64_t num_objects = info.stats.stats.sum.num_objects;
3277
3278 // Objects missing from up nodes, sorted by # objects.
3279 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3280 // Objects missing from nodes not in up, sort by # objects
3281 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3282
3283 // Fill missing_target_objects/acting_source_objects
3284
3285 {
3286 int64_t missing;
3287
3288 // Primary first
3289 missing = pg_log.get_missing().num_missing();
3290 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3291 if (upset.count(pg_whoami)) {
3292 missing_target_objects.insert(make_pair(missing, pg_whoami));
3293 } else {
3294 acting_source_objects.insert(make_pair(missing, pg_whoami));
3295 }
3296 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3297 if (missing == 0)
3298 info.stats.avail_no_missing.push_back(pg_whoami);
3299 dout(20) << __func__ << " shard " << pg_whoami
3300 << " primary objects " << num_objects
3301 << " missing " << missing
3302 << dendl;
3303 }
3304
3305 // All other peers
3306 for (auto& peer : peer_info) {
3307 // Primary should not be in the peer_info, skip if it is.
3308 if (peer.first == pg_whoami) continue;
3309 int64_t missing = 0;
3310 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3311 // Backfill targets always track num_objects accurately
3312 // all other peers track missing accurately.
3313 if (is_backfill_targets(peer.first)) {
3314 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3315 } else {
3316 if (peer_missing.count(peer.first)) {
3317 missing = peer_missing[peer.first].num_missing();
3318 } else {
3319 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
3320 if (is_recovering()) {
3321 estimate = true;
3322 }
3323 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3324 }
3325 }
3326 if (upset.count(peer.first)) {
3327 missing_target_objects.insert(make_pair(missing, peer.first));
3328 } else if (actingset.count(peer.first)) {
3329 acting_source_objects.insert(make_pair(missing, peer.first));
3330 }
3331 peer.second.stats.stats.sum.num_objects_missing = missing;
3332 if (missing == 0)
3333 info.stats.avail_no_missing.push_back(peer.first);
3334 dout(20) << __func__ << " shard " << peer.first
3335 << " objects " << peer_num_objects
3336 << " missing " << missing
3337 << dendl;
3338 }
3339
3340 // Compute object_location_counts
3341 for (auto& ml: missing_loc.get_missing_locs()) {
3342 info.stats.object_location_counts[ml.second]++;
3343 dout(30) << __func__ << " " << ml.first << " object_location_counts["
3344 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3345 << dendl;
3346 }
3347 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3348 if (not_missing) {
3349 // During recovery we know upset == actingset and is being populated
3350 // During backfill we know that all non-missing objects are in the actingset
3351 info.stats.object_location_counts[actingset] = not_missing;
3352 }
3353 dout(30) << __func__ << " object_location_counts["
3354 << upset << "]=" << info.stats.object_location_counts[upset]
3355 << dendl;
3356 dout(20) << __func__ << " object_location_counts "
3357 << info.stats.object_location_counts << dendl;
3358
3359 // A misplaced object is not stored on the correct OSD
3360 int64_t misplaced = 0;
3361 // a degraded object has fewer replicas or EC shards than the pool specifies.
3362 int64_t degraded = 0;
3363
3364 if (is_recovering()) {
3365 for (auto& sml: missing_loc.get_missing_by_count()) {
3366 for (auto& ml: sml.second) {
3367 int missing_shards;
3368 if (sml.first == shard_id_t::NO_SHARD) {
3369 dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
3370 missing_shards = (int)upset.size() - ml.first.up;
3371 } else {
3372 // Handle shards not even in upset below
3373 if (!find_shard(upset, sml.first))
3374 continue;
3375 missing_shards = std::max(0, 1 - ml.first.up);
3376 dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
3377 }
3378 int odegraded = ml.second * missing_shards;
3379 // Copies on other OSDs, but limited to the possible degraded count
3380 int more_osds = std::min(missing_shards, ml.first.other);
3381 int omisplaced = ml.second * more_osds;
3382 ceph_assert(omisplaced <= odegraded);
3383 odegraded -= omisplaced;
3384
3385 misplaced += omisplaced;
3386 degraded += odegraded;
3387 }
3388 }
3389
3390 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
3391 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
3392
3393 // Handle undersized case
3394 if (pool.info.is_replicated()) {
3395 // Add degraded for missing targets (num_objects missing)
3396 ceph_assert(target >= upset.size());
3397 unsigned needed = target - upset.size();
3398 degraded += num_objects * needed;
3399 } else {
3400 for (unsigned i = 0 ; i < num_shards; ++i) {
3401 shard_id_t shard(i);
3402
3403 if (!find_shard(upset, shard)) {
3404 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3405
3406 if (pgs != pg_shard_t()) {
3407 int64_t missing;
3408
3409 if (pgs == pg_whoami)
3410 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3411 else
3412 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3413
3414 degraded += missing;
3415 misplaced += std::max((int64_t)0, num_objects - missing);
3416 } else {
3417 // No shard anywhere
3418 degraded += num_objects;
3419 }
3420 }
3421 }
3422 }
3423 goto out;
3424 }
3425
3426 // Handle undersized case
3427 if (pool.info.is_replicated()) {
3428 // Add to missing_target_objects
3429 ceph_assert(target >= missing_target_objects.size());
3430 unsigned needed = target - missing_target_objects.size();
3431 if (needed)
3432 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
3433 } else {
3434 for (unsigned i = 0 ; i < num_shards; ++i) {
3435 shard_id_t shard(i);
3436 bool found = false;
3437 for (const auto& t : missing_target_objects) {
3438 if (std::get<1>(t).shard == shard) {
3439 found = true;
3440 break;
3441 }
3442 }
3443 if (!found)
3444 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
3445 }
3446 }
3447
3448 for (const auto& item : missing_target_objects)
3449 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3450 for (const auto& item : acting_source_objects)
3451 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3452
3453 // Handle all objects not in missing for remapped
3454 // or backfill
3455 for (auto m = missing_target_objects.rbegin();
3456 m != missing_target_objects.rend(); ++m) {
3457
3458 int64_t extra_missing = -1;
3459
3460 if (pool.info.is_replicated()) {
3461 if (!acting_source_objects.empty()) {
3462 auto extra_copy = acting_source_objects.begin();
3463 extra_missing = std::get<0>(*extra_copy);
3464 acting_source_objects.erase(extra_copy);
3465 }
3466 } else { // Erasure coded
3467 // Use corresponding shard
3468 for (const auto& a : acting_source_objects) {
3469 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3470 extra_missing = std::get<0>(a);
3471 acting_source_objects.erase(a);
3472 break;
3473 }
3474 }
3475 }
3476
3477 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3478 // We don't know which of the objects on the target
3479 // are part of extra_missing, so assume they are all degraded.
3480 misplaced += std::get<0>(*m) - extra_missing;
3481 degraded += extra_missing;
3482 } else {
3483 // 1. extra_missing == -1: more targets than sources, so all are degraded
3484 // 2. extra_missing > std::get<0>(*m): some objects that were previously
3485 // degraded are now present on the target.
3486 degraded += std::get<0>(*m);
3487 }
3488 }
3489 // If there are still acting sources that haven't been accounted for,
3490 // then they are misplaced
3491 for (const auto& a : acting_source_objects) {
3492 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3493 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
3494 misplaced += extra_misplaced;
3495 }
3496 out:
3497 // NOTE: Tests use these messages to verify this code
3498 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
3499 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl;
3500
3501 info.stats.stats.sum.num_objects_degraded = degraded;
3502 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3503 info.stats.stats.sum.num_objects_misplaced = misplaced;
3504 }
3505 }
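
// Worked example of the accounting above (all numbers are hypothetical):
//  - undersized: a replicated pool of size 3 with num_objects = 100 and only
//    |upset| = 2 accrues 100 * (3 - 2) = 100 degraded object copies;
//  - remapped: the same 100 objects held on an acting-only source that is no
//    longer in the up set count as 100 misplaced (they exist, but on the
//    wrong OSD), not degraded.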
3506
3507 void PG::_update_blocked_by()
3508 {
3509 // set a max on the number of blocking peers we report. if we go
3510 // over, report a random subset. keep the result sorted.
3511 unsigned keep = std::min<unsigned>(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3512 unsigned skip = blocked_by.size() - keep;
3513 info.stats.blocked_by.clear();
3514 info.stats.blocked_by.resize(keep);
3515 unsigned pos = 0;
3516 for (set<int>::iterator p = blocked_by.begin();
3517 p != blocked_by.end() && keep > 0;
3518 ++p) {
3519 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3520 --skip;
3521 } else {
3522 info.stats.blocked_by[pos++] = *p;
3523 --keep;
3524 }
3525 }
3526 }
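
// A minimal stand-alone sketch of the subset selection used above (not part
// of the build; the helper name is hypothetical): with `skip` elements left
// to drop and `keep` left to take, each element is dropped with probability
// skip / (skip + keep), which yields a uniformly random subset while keeping
// the input order, so the reported blocked_by list stays sorted.
#if 0
#include <cstdlib>
#include <vector>

template <typename T>
std::vector<T> sample_sorted_subset(const std::vector<T>& in, unsigned keep)
{
  if (keep > in.size())
    keep = (unsigned)in.size();
  unsigned skip = (unsigned)in.size() - keep;
  std::vector<T> out;
  out.reserve(keep);
  for (const T& v : in) {
    if (keep == 0)
      break;
    if (skip > 0 && (rand() % (skip + keep) < skip)) {
      --skip;            // drop this element
    } else {
      out.push_back(v);  // keep this element
      --keep;
    }
  }
  return out;
}
#endif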
3527
3528 void PG::publish_stats_to_osd()
3529 {
3530 if (!is_primary())
3531 return;
3532
3533 pg_stats_publish_lock.Lock();
3534
3535 if (info.stats.stats.sum.num_scrub_errors)
3536 state_set(PG_STATE_INCONSISTENT);
3537 else {
3538 state_clear(PG_STATE_INCONSISTENT);
3539 state_clear(PG_STATE_FAILED_REPAIR);
3540 }
3541
3542 utime_t now = ceph_clock_now();
3543 if (info.stats.state != state) {
3544 info.stats.last_change = now;
3545 // Optimistic estimation: if we just found out that a PG is inactive,
3546 // assume it was active until now.
3547 if (!(state & PG_STATE_ACTIVE) &&
3548 (info.stats.state & PG_STATE_ACTIVE))
3549 info.stats.last_active = now;
3550
3551 if ((state & PG_STATE_ACTIVE) &&
3552 !(info.stats.state & PG_STATE_ACTIVE))
3553 info.stats.last_became_active = now;
3554 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3555 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3556 info.stats.last_became_peered = now;
3557 info.stats.state = state;
3558 }
3559
3560 _update_calc_stats();
3561 if (info.stats.stats.sum.num_objects_degraded) {
3562 state_set(PG_STATE_DEGRADED);
3563 } else {
3564 state_clear(PG_STATE_DEGRADED);
3565 }
3566 _update_blocked_by();
3567
3568 pg_stat_t pre_publish = info.stats;
3569 pre_publish.stats.add(unstable_stats);
3570 utime_t cutoff = now;
3571 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3572
3573 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
3574 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3575 // because we don't want to make the pg_stat_t structures too expensive.
3576 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3577 unsigned num = 0;
3578 auto i = info.purged_snaps.begin();
3579 while (num < max && i != info.purged_snaps.end()) {
3580 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3581 ++num;
3582 ++i;
3583 }
3584 dout(20) << __func__ << " reporting purged_snaps "
3585 << pre_publish.purged_snaps << dendl;
3586 }
3587
3588 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3589 info.stats.last_fresh > cutoff) {
3590 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3591 << ": no change since " << info.stats.last_fresh << dendl;
3592 } else {
3593 // update our stat summary and timestamps
3594 info.stats.reported_epoch = get_osdmap_epoch();
3595 ++info.stats.reported_seq;
3596
3597 info.stats.last_fresh = now;
3598
3599 if (info.stats.state & PG_STATE_CLEAN)
3600 info.stats.last_clean = now;
3601 if (info.stats.state & PG_STATE_ACTIVE)
3602 info.stats.last_active = now;
3603 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3604 info.stats.last_peered = now;
3605 info.stats.last_unstale = now;
3606 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3607 info.stats.last_undegraded = now;
3608 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3609 info.stats.last_fullsized = now;
3610
3611 pg_stats_publish_valid = true;
3612 pg_stats_publish = pre_publish;
3613
3614 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3615 << ":" << pg_stats_publish.reported_seq << dendl;
3616 }
3617 pg_stats_publish_lock.Unlock();
3618 }
3619
3620 void PG::clear_publish_stats()
3621 {
3622 dout(15) << "clear_stats" << dendl;
3623 pg_stats_publish_lock.Lock();
3624 pg_stats_publish_valid = false;
3625 pg_stats_publish_lock.Unlock();
3626 }
3627
3628 /**
3629 * initialize a newly instantiated pg
3630 *
3631 * Initialize PG state, as when a PG is initially created, or when it
3632 * is first instantiated on the current node.
3633 *
3634 * @param role our role/rank
3635 * @param newup up set
3636 * @param newacting acting set
3637 * @param history pg history
3638 * @param pi past_intervals
3639 * @param backfill true if info should be marked as backfill
3640 * @param t transaction to write out our new state in
3641 */
3642 void PG::init(
3643 int role,
3644 const vector<int>& newup, int new_up_primary,
3645 const vector<int>& newacting, int new_acting_primary,
3646 const pg_history_t& history,
3647 const PastIntervals& pi,
3648 bool backfill,
3649 ObjectStore::Transaction *t)
3650 {
3651 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3652 << " history " << history
3653 << " past_intervals " << pi
3654 << dendl;
3655
3656 set_role(role);
3657 init_primary_up_acting(
3658 newup,
3659 newacting,
3660 new_up_primary,
3661 new_acting_primary);
3662
3663 info.history = history;
3664 past_intervals = pi;
3665
3666 info.stats.up = up;
3667 info.stats.up_primary = new_up_primary;
3668 info.stats.acting = acting;
3669 info.stats.acting_primary = new_acting_primary;
3670 info.stats.mapping_epoch = info.history.same_interval_since;
3671
3672 if (backfill) {
3673 dout(10) << __func__ << ": Setting backfill" << dendl;
3674 info.set_last_backfill(hobject_t());
3675 info.last_complete = info.last_update;
3676 pg_log.mark_log_for_rewrite();
3677 }
3678
3679 on_new_interval();
3680
3681 dirty_info = true;
3682 dirty_big_info = true;
3683 write_if_dirty(*t);
3684 }
3685
3686 void PG::shutdown()
3687 {
3688 ch->flush();
3689 lock();
3690 on_shutdown();
3691 unlock();
3692 }
3693
3694 #pragma GCC diagnostic ignored "-Wpragmas"
3695 #pragma GCC diagnostic push
3696 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3697
3698 void PG::upgrade(ObjectStore *store)
3699 {
3700 dout(0) << __func__ << " " << info_struct_v << " -> " << latest_struct_v
3701 << dendl;
3702 ceph_assert(info_struct_v <= 10);
3703 ObjectStore::Transaction t;
3704
3705 // <do upgrade steps here>
3706
3707 // finished upgrade!
3708 ceph_assert(info_struct_v == 10);
3709
3710 // update infover_key
3711 if (info_struct_v < latest_struct_v) {
3712 map<string,bufferlist> v;
3713 __u8 ver = latest_struct_v;
3714 encode(ver, v[infover_key]);
3715 t.omap_setkeys(coll, pgmeta_oid, v);
3716 }
3717
3718 dirty_info = true;
3719 dirty_big_info = true;
3720 write_if_dirty(t);
3721
3722 ObjectStore::CollectionHandle ch = store->open_collection(coll);
3723 int r = store->queue_transaction(ch, std::move(t));
3724 if (r != 0) {
3725 derr << __func__ << ": queue_transaction returned "
3726 << cpp_strerror(r) << dendl;
3727 ceph_abort();
3728 }
3729 ceph_assert(r == 0);
3730
3731 C_SaferCond waiter;
3732 if (!ch->flush_commit(&waiter)) {
3733 waiter.wait();
3734 }
3735 }
3736
3737 #pragma GCC diagnostic pop
3738 #pragma GCC diagnostic warning "-Wpragmas"
3739
3740 int PG::_prepare_write_info(CephContext* cct,
3741 map<string,bufferlist> *km,
3742 epoch_t epoch,
3743 pg_info_t &info, pg_info_t &last_written_info,
3744 PastIntervals &past_intervals,
3745 bool dirty_big_info,
3746 bool dirty_epoch,
3747 bool try_fast_info,
3748 PerfCounters *logger)
3749 {
3750 if (dirty_epoch) {
3751 encode(epoch, (*km)[epoch_key]);
3752 }
3753
3754 if (logger)
3755 logger->inc(l_osd_pg_info);
3756
3757 // try to do info efficiently?
3758 if (!dirty_big_info && try_fast_info &&
3759 info.last_update > last_written_info.last_update) {
3760 pg_fast_info_t fast;
3761 fast.populate_from(info);
3762 bool did = fast.try_apply_to(&last_written_info);
3763 ceph_assert(did); // we verified last_update increased above
3764 if (info == last_written_info) {
3765 encode(fast, (*km)[fastinfo_key]);
3766 if (logger)
3767 logger->inc(l_osd_pg_fastinfo);
3768 return 0;
3769 }
3770 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3771 {
3772 JSONFormatter jf(true);
3773 jf.dump_object("info", info);
3774 jf.flush(*_dout);
3775 }
3776 {
3777 *_dout << "\nlast_written_info:\n";
3778 JSONFormatter jf(true);
3779 jf.dump_object("last_written_info", last_written_info);
3780 jf.flush(*_dout);
3781 }
3782 *_dout << dendl;
3783 }
3784 last_written_info = info;
3785
3786 // info. store purged_snaps separately.
3787 interval_set<snapid_t> purged_snaps;
3788 purged_snaps.swap(info.purged_snaps);
3789 encode(info, (*km)[info_key]);
3790 purged_snaps.swap(info.purged_snaps);
3791
3792 if (dirty_big_info) {
3793 // potentially big stuff
3794 bufferlist& bigbl = (*km)[biginfo_key];
3795 encode(past_intervals, bigbl);
3796 encode(info.purged_snaps, bigbl);
3797 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3798 if (logger)
3799 logger->inc(l_osd_pg_biginfo);
3800 }
3801
3802 return 0;
3803 }
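
// Summary of the pgmeta omap layout written above, as a minimal sketch (not
// part of the build; the helper is hypothetical). On the fast path only the
// small "_fastinfo" delta is rewritten; otherwise the full "_info" value is
// rewritten, plus "_biginfo" (past_intervals + purged_snaps) when
// dirty_big_info is set. "_epoch" is refreshed whenever dirty_epoch is set.
#if 0
static void example_pgmeta_keys(map<string,bufferlist>* km,
                                bool dirty_epoch, bool fast_path,
                                bool dirty_big_info)
{
  if (dirty_epoch)
    (*km)[epoch_key];      // osdmap epoch of this write
  if (fast_path) {
    (*km)[fastinfo_key];   // pg_fast_info_t: last_update, stats delta, ...
  } else {
    (*km)[info_key];       // full pg_info_t (purged_snaps stored separately)
    if (dirty_big_info)
      (*km)[biginfo_key];  // past_intervals + purged_snaps
  }
  // the caller then applies *km with t.omap_setkeys(coll, pgmeta_oid, *km)
}
#endif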
3804
3805 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3806 {
3807 coll_t coll(pgid);
3808 t.create_collection(coll, bits);
3809 }
3810
3811 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3812 {
3813 coll_t coll(pgid);
3814
3815 if (pool) {
3816 // Give a hint to the PG collection
3817 bufferlist hint;
3818 uint32_t pg_num = pool->get_pg_num();
3819 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3820 encode(pg_num, hint);
3821 encode(expected_num_objects_pg, hint);
3822 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3823 t.collection_hint(coll, hint_type, hint);
3824 }
3825
3826 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3827 t.touch(coll, pgmeta_oid);
3828 map<string,bufferlist> values;
3829 __u8 struct_v = latest_struct_v;
3830 encode(struct_v, values[infover_key]);
3831 t.omap_setkeys(coll, pgmeta_oid, values);
3832 }
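
// Worked example for the collection hint above (numbers are hypothetical):
// with pool->expected_num_objects = 1000000 and pg_num = 256, each PG
// collection is hinted to expect 1000000 / 256 = 3906 objects (integer
// division), which lets the object store pre-split the collection.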
3833
3834 void PG::prepare_write_info(map<string,bufferlist> *km)
3835 {
3836 info.stats.stats.add(unstable_stats);
3837 unstable_stats.clear();
3838
3839 bool need_update_epoch = last_epoch < get_osdmap_epoch();
3840 int ret = _prepare_write_info(cct, km, get_osdmap_epoch(),
3841 info,
3842 last_written_info,
3843 past_intervals,
3844 dirty_big_info, need_update_epoch,
3845 cct->_conf->osd_fast_info,
3846 osd->logger);
3847 ceph_assert(ret == 0);
3848 if (need_update_epoch)
3849 last_epoch = get_osdmap_epoch();
3850 last_persisted_osdmap = last_epoch;
3851
3852 dirty_info = false;
3853 dirty_big_info = false;
3854 }
3855
3856 #pragma GCC diagnostic ignored "-Wpragmas"
3857 #pragma GCC diagnostic push
3858 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3859
3860 bool PG::_has_removal_flag(ObjectStore *store,
3861 spg_t pgid)
3862 {
3863 coll_t coll(pgid);
3864 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3865
3866 // first try new way
3867 set<string> keys;
3868 keys.insert("_remove");
3869 map<string,bufferlist> values;
3870 auto ch = store->open_collection(coll);
3871 ceph_assert(ch);
3872 if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
3873 values.size() == 1)
3874 return true;
3875
3876 return false;
3877 }
3878
3879 int PG::peek_map_epoch(ObjectStore *store,
3880 spg_t pgid,
3881 epoch_t *pepoch)
3882 {
3883 coll_t coll(pgid);
3884 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3885 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3886 epoch_t cur_epoch = 0;
3887
3888 // validate collection name
3889 ceph_assert(coll.is_pg());
3890
3891 // try for v8
3892 set<string> keys;
3893 keys.insert(infover_key);
3894 keys.insert(epoch_key);
3895 map<string,bufferlist> values;
3896 auto ch = store->open_collection(coll);
3897 ceph_assert(ch);
3898 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
3899 if (r == 0) {
3900 ceph_assert(values.size() == 2);
3901
3902 // sanity check version
3903 auto bp = values[infover_key].cbegin();
3904 __u8 struct_v = 0;
3905 decode(struct_v, bp);
3906 ceph_assert(struct_v >= 8);
3907
3908 // get epoch
3909 bp = values[epoch_key].begin();
3910 decode(cur_epoch, bp);
3911 } else {
3912 // probably bug 10617; see OSD::load_pgs()
3913 return -1;
3914 }
3915
3916 *pepoch = cur_epoch;
3917 return 0;
3918 }
3919
3920 #pragma GCC diagnostic pop
3921 #pragma GCC diagnostic warning "-Wpragmas"
3922
3923 void PG::write_if_dirty(ObjectStore::Transaction& t)
3924 {
3925 map<string,bufferlist> km;
3926 if (dirty_big_info || dirty_info)
3927 prepare_write_info(&km);
3928 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3929 if (!km.empty())
3930 t.omap_setkeys(coll, pgmeta_oid, km);
3931 }
3932
3933 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3934 {
3935 // raise last_complete only if we were previously up to date
3936 if (info.last_complete == info.last_update)
3937 info.last_complete = e.version;
3938
3939 // raise last_update.
3940 ceph_assert(e.version > info.last_update);
3941 info.last_update = e.version;
3942
3943 // raise user_version, if it increased (it may not have been bumped
3944 // by all logged updates)
3945 if (e.user_version > info.last_user_version)
3946 info.last_user_version = e.user_version;
3947
3948 // log mutation
3949 pg_log.add(e, applied);
3950 dout(10) << "add_log_entry " << e << dendl;
3951 }
3952
3953
3954 void PG::append_log(
3955 const vector<pg_log_entry_t>& logv,
3956 eversion_t trim_to,
3957 eversion_t roll_forward_to,
3958 ObjectStore::Transaction &t,
3959 bool transaction_applied,
3960 bool async)
3961 {
3962 if (transaction_applied)
3963 update_snap_map(logv, t);
3964
3965 /* The primary has sent an info updating the history, but it may not
3966 * have arrived yet. We want to make sure that we cannot remember this
3967 * write without remembering that it happened in an interval which went
3968 * active in epoch history.last_epoch_started.
3969 */
3970 if (info.last_epoch_started != info.history.last_epoch_started) {
3971 info.history.last_epoch_started = info.last_epoch_started;
3972 }
3973 if (info.last_interval_started != info.history.last_interval_started) {
3974 info.history.last_interval_started = info.last_interval_started;
3975 }
3976 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3977
3978 PGLogEntryHandler handler{this, &t};
3979 if (!transaction_applied) {
3980 /* We must be a backfill or async recovery peer, so it's ok if we apply
3981 * out-of-turn since we won't be considered when
3982 * determining a min possible last_update.
3983 *
3984 * We skip_rollforward() here, which advances the crt, without
3985 * doing an actual rollforward. This avoids cleaning up entries
3986 * from the backend and we do not end up in a situation, where the
3987 * object is deleted before we can _merge_object_divergent_entries().
3988 */
3989 pg_log.skip_rollforward();
3990 }
3991
3992 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3993 p != logv.end();
3994 ++p) {
3995 add_log_entry(*p, transaction_applied);
3996
3997 /* We don't want to leave the rollforward artifacts around
3998 * here past last_backfill. It's ok for the same reason as
3999 * above */
4000 if (transaction_applied &&
4001 p->soid > info.last_backfill) {
4002 pg_log.roll_forward(&handler);
4003 }
4004 }
4005 auto last = logv.rbegin();
4006 if (is_primary() && last != logv.rend()) {
4007 projected_log.skip_can_rollback_to_to_head();
4008 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
4009 }
4010
4011 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4012 pg_log.roll_forward_to(
4013 roll_forward_to,
4014 &handler);
4015 last_rollback_info_trimmed_to_applied = roll_forward_to;
4016 }
4017
4018 dout(10) << __func__ << " approx pg log length = "
4019 << pg_log.get_log().approx_size() << dendl;
4020 dout(10) << __func__ << " transaction_applied = "
4021 << transaction_applied << dendl;
4022 if (!transaction_applied || async)
4023 dout(10) << __func__ << " " << pg_whoami
4024 << " is async_recovery or backfill target" << dendl;
4025 pg_log.trim(trim_to, info, transaction_applied, async);
4026
4027 // update the local pg, pg log
4028 dirty_info = true;
4029 write_if_dirty(t);
4030 }
4031
4032 bool PG::check_log_for_corruption(ObjectStore *store)
4033 {
4034 /// TODO: this method needs to work with the omap log
4035 return true;
4036 }
4037
4038 //! Get the name we're going to save our corrupt pg log as
4039 std::string PG::get_corrupt_pg_log_name() const
4040 {
4041 const int MAX_BUF = 512;
4042 char buf[MAX_BUF];
4043 struct tm tm_buf;
4044 time_t my_time(time(NULL));
4045 const struct tm *t = localtime_r(&my_time, &tm_buf);
4046 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
4047 if (ret == 0) {
4048 dout(0) << "strftime failed" << dendl;
4049 return "corrupt_log_unknown_time";
4050 }
4051 string out(buf);
4052 out += stringify(info.pgid);
4053 return out;
4054 }
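
// Worked example (hypothetical values): for pg 1.2a at 14:32 on 2019-10-07
// the name produced above is "corrupt_log_2019-10-07_14:32_1.2a"
// (strftime's %k pads hours below 10 with a leading space).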
4055
4056 int PG::read_info(
4057 ObjectStore *store, spg_t pgid, const coll_t &coll,
4058 pg_info_t &info, PastIntervals &past_intervals,
4059 __u8 &struct_v)
4060 {
4061 set<string> keys;
4062 keys.insert(infover_key);
4063 keys.insert(info_key);
4064 keys.insert(biginfo_key);
4065 keys.insert(fastinfo_key);
4066 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
4067 map<string,bufferlist> values;
4068 auto ch = store->open_collection(coll);
4069 ceph_assert(ch);
4070 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
4071 ceph_assert(r == 0);
4072 ceph_assert(values.size() == 3 ||
4073 values.size() == 4);
4074
4075 auto p = values[infover_key].cbegin();
4076 decode(struct_v, p);
4077 ceph_assert(struct_v >= 10);
4078
4079 p = values[info_key].begin();
4080 decode(info, p);
4081
4082 p = values[biginfo_key].begin();
4083 decode(past_intervals, p);
4084 decode(info.purged_snaps, p);
4085
4086 p = values[fastinfo_key].begin();
4087 if (!p.end()) {
4088 pg_fast_info_t fast;
4089 decode(fast, p);
4090 fast.try_apply_to(&info);
4091 }
4092 return 0;
4093 }
4094
4095 void PG::read_state(ObjectStore *store)
4096 {
4097 int r = read_info(store, pg_id, coll, info, past_intervals,
4098 info_struct_v);
4099 ceph_assert(r >= 0);
4100
4101 if (info_struct_v < compat_struct_v) {
4102 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4103 << " an older version first." << dendl;
4104 ceph_abort_msg("PG too old to upgrade");
4105 }
4106
4107 last_written_info = info;
4108
4109 ostringstream oss;
4110 pg_log.read_log_and_missing(
4111 store,
4112 ch,
4113 pgmeta_oid,
4114 info,
4115 oss,
4116 cct->_conf->osd_ignore_stale_divergent_priors,
4117 cct->_conf->osd_debug_verify_missing_on_start);
4118 if (oss.tellp())
4119 osd->clog->error() << oss.str();
4120
4121 // log any weirdness
4122 log_weirdness();
4123
4124 if (info_struct_v < latest_struct_v) {
4125 upgrade(store);
4126 }
4127
4128 // initialize current mapping
4129 {
4130 int primary, up_primary;
4131 vector<int> acting, up;
4132 get_osdmap()->pg_to_up_acting_osds(
4133 pg_id.pgid, &up, &up_primary, &acting, &primary);
4134 init_primary_up_acting(
4135 up,
4136 acting,
4137 up_primary,
4138 primary);
4139 int rr = OSDMap::calc_pg_role(osd->whoami, acting);
4140 if (pool.info.is_replicated() || rr == pg_whoami.shard)
4141 set_role(rr);
4142 else
4143 set_role(-1);
4144 }
4145
4146 PG::RecoveryCtx rctx(0, 0, 0, new ObjectStore::Transaction);
4147 handle_initialize(&rctx);
4148 // note: we don't activate here because we know the OSD will advance maps
4149 // during boot.
4150 write_if_dirty(*rctx.transaction);
4151 store->queue_transaction(ch, std::move(*rctx.transaction));
4152 delete rctx.transaction;
4153 }
4154
4155 void PG::log_weirdness()
4156 {
4157 if (pg_log.get_tail() != info.log_tail)
4158 osd->clog->error() << info.pgid
4159 << " info mismatch, log.tail " << pg_log.get_tail()
4160 << " != info.log_tail " << info.log_tail;
4161 if (pg_log.get_head() != info.last_update)
4162 osd->clog->error() << info.pgid
4163 << " info mismatch, log.head " << pg_log.get_head()
4164 << " != info.last_update " << info.last_update;
4165
4166 if (!pg_log.get_log().empty()) {
4167 // sloppy check
4168 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
4169 osd->clog->error() << info.pgid
4170 << " log bound mismatch, info (tail,head] ("
4171 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
4172 << " actual ["
4173 << pg_log.get_log().log.begin()->version << ","
4174 << pg_log.get_log().log.rbegin()->version << "]";
4175 }
4176
4177 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
4178 osd->clog->error() << info.pgid
4179 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
4180 << " > log size " << pg_log.get_log().log.size();
4181 }
4182 }
4183
4184 void PG::update_snap_map(
4185 const vector<pg_log_entry_t> &log_entries,
4186 ObjectStore::Transaction &t)
4187 {
4188 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
4189 i != log_entries.end();
4190 ++i) {
4191 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4192 if (i->soid.snap < CEPH_MAXSNAP) {
4193 if (i->is_delete()) {
4194 int r = snap_mapper.remove_oid(
4195 i->soid,
4196 &_t);
4197 if (r != 0)
4198 derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
4199 // On removal tolerate missing key corruption
4200 ceph_assert(r == 0 || r == -ENOENT);
4201 } else if (i->is_update()) {
4202 ceph_assert(i->snaps.length() > 0);
4203 vector<snapid_t> snaps;
4204 bufferlist snapbl = i->snaps;
4205 auto p = snapbl.cbegin();
4206 try {
4207 decode(snaps, p);
4208 } catch (...) {
4209 derr << __func__ << " decode snaps failure on " << *i << dendl;
4210 snaps.clear();
4211 }
4212 set<snapid_t> _snaps(snaps.begin(), snaps.end());
4213
4214 if (i->is_clone() || i->is_promote()) {
4215 snap_mapper.add_oid(
4216 i->soid,
4217 _snaps,
4218 &_t);
4219 } else if (i->is_modify()) {
4220 int r = snap_mapper.update_snaps(
4221 i->soid,
4222 _snaps,
4223 0,
4224 &_t);
4225 ceph_assert(r == 0);
4226 } else {
4227 ceph_assert(i->is_clean());
4228 }
4229 }
4230 }
4231 }
4232 }
4233
4234 /**
4235 * filter trimming|trimmed snaps out of snapcontext
4236 */
4237 void PG::filter_snapc(vector<snapid_t> &snaps)
4238 {
4239 // nothing needs to trim, we can return immediately
4240 if (snap_trimq.empty() && info.purged_snaps.empty())
4241 return;
4242
4243 bool filtering = false;
4244 vector<snapid_t> newsnaps;
4245 for (vector<snapid_t>::iterator p = snaps.begin();
4246 p != snaps.end();
4247 ++p) {
4248 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
4249 if (!filtering) {
4250 // start building a new vector with what we've seen so far
4251 dout(10) << "filter_snapc filtering " << snaps << dendl;
4252 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
4253 filtering = true;
4254 }
4255 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
4256 } else {
4257 if (filtering)
4258 newsnaps.push_back(*p); // continue building new vector
4259 }
4260 }
4261 if (filtering) {
4262 snaps.swap(newsnaps);
4263 dout(10) << "filter_snapc result " << snaps << dendl;
4264 }
4265 }
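
// Worked example (hypothetical values): with snaps = [8, 6, 4, 2] and snap 6
// in snap_trimq or info.purged_snaps, filtering starts at 6: the prefix [8]
// is copied into newsnaps, 6 is dropped, 4 and 2 are appended, and snaps is
// swapped to [8, 4, 2].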
4266
4267 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
4268 {
4269 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
4270 it != m.end();
4271 ++it)
4272 requeue_ops(it->second);
4273 m.clear();
4274 }
4275
4276 void PG::requeue_op(OpRequestRef op)
4277 {
4278 auto p = waiting_for_map.find(op->get_source());
4279 if (p != waiting_for_map.end()) {
4280 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
4281 << dendl;
4282 p->second.push_front(op);
4283 } else {
4284 dout(20) << __func__ << " " << op << dendl;
4285 osd->enqueue_front(
4286 OpQueueItem(
4287 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
4288 op->get_req()->get_cost(),
4289 op->get_req()->get_priority(),
4290 op->get_req()->get_recv_stamp(),
4291 op->get_req()->get_source().num(),
4292 get_osdmap_epoch()));
4293 }
4294 }
4295
4296 void PG::requeue_ops(list<OpRequestRef> &ls)
4297 {
4298 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
4299 i != ls.rend();
4300 ++i) {
4301 requeue_op(*i);
4302 }
4303 ls.clear();
4304 }
4305
4306 void PG::requeue_map_waiters()
4307 {
4308 epoch_t epoch = get_osdmap_epoch();
4309 auto p = waiting_for_map.begin();
4310 while (p != waiting_for_map.end()) {
4311 if (epoch < p->second.front()->min_epoch) {
4312 dout(20) << __func__ << " " << p->first << " front op "
4313 << p->second.front() << " must still wait, doing nothing"
4314 << dendl;
4315 ++p;
4316 } else {
4317 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
4318 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
4319 auto req = *q;
4320 osd->enqueue_front(OpQueueItem(
4321 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, req)),
4322 req->get_req()->get_cost(),
4323 req->get_req()->get_priority(),
4324 req->get_req()->get_recv_stamp(),
4325 req->get_req()->get_source().num(),
4326 epoch));
4327 }
4328 p = waiting_for_map.erase(p);
4329 }
4330 }
4331 }
4332
4333
4334 // ==========================================================================================
4335 // SCRUB
4336
4337 /*
4338 * when holding pg and sched_scrub_lock, then the states are:
4339 * scheduling:
4340 * scrubber.reserved = true
4341 * scrubber.reserved_peers includes whoami
4342 * osd->scrub_pending++
4343 * scheduling, replica declined:
4344 * scrubber.reserved = true
4345 * scrubber.reserved_peers includes -1
4346 * osd->scrub_pending++
4347 * pending:
4348 * scrubber.reserved = true
4349 * scrubber.reserved_peers.size() == acting.size();
4350 * pg on scrub_wq
4351 * osd->scrub_pending++
4352 * scrubbing:
4353 * scrubber.reserved = false;
4354 * scrubber.reserved_peers empty
4355 * osd->scrubber.active++
4356 */
4357
4358 // returns true if a scrub has been newly kicked off
4359 bool PG::sched_scrub()
4360 {
4361 ceph_assert(is_locked());
4362 ceph_assert(!is_scrubbing());
4363 if (!(is_primary() && is_active() && is_clean())) {
4364 return false;
4365 }
4366
4367 // All processing the first time through commits us to whatever
4368 // choices are made.
4369 if (!scrubber.reserved) {
4370 dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
4371
4372 bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
4373 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
4374 bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
4375 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
4376 bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
4377 bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair
4378 && get_pgbackend()->auto_repair_supported());
4379
4380 scrubber.time_for_deep = false;
4381 // Clear these in case user issues the scrub/repair command during
4382 // the scheduling of the scrub/repair (e.g. request reservation)
4383 scrubber.deep_scrub_on_error = false;
4384 scrubber.auto_repair = false;
4385
4386 // All periodic scrub handling goes here because must_scrub is
4387 // always set for must_deep_scrub and must_repair.
4388 if (!scrubber.must_scrub) {
4389 ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair);
4390 // Handle deep scrub determination only if allowed
4391 if (allow_deep_scrub) {
4392 // Initial entry and scheduled scrubs without nodeep_scrub set get here
4393 if (scrubber.need_auto) {
4394 dout(20) << __func__ << ": need repair after scrub errors" << dendl;
4395 scrubber.time_for_deep = true;
4396 } else {
4397 double deep_scrub_interval = 0;
4398 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
4399 if (deep_scrub_interval <= 0) {
4400 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
4401 }
4402 scrubber.time_for_deep = ceph_clock_now() >=
4403 info.history.last_deep_scrub_stamp + deep_scrub_interval;
4404
4405 bool deep_coin_flip = false;
4406 // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees
4407 // we will deep scrub because this function is called often.
4408 if (!scrubber.time_for_deep && allow_scrub)
4409 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
4410 dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
4411
4412 scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip);
4413 }
4414
4415 if (!scrubber.time_for_deep && has_deep_errors) {
4416 osd->clog->info() << "osd." << osd->whoami
4417 << " pg " << info.pgid
4418 << " Deep scrub errors, upgrading scrub to deep-scrub";
4419 scrubber.time_for_deep = true;
4420 }
4421
4422 if (try_to_auto_repair) {
4423 if (scrubber.time_for_deep) {
4424 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
4425 scrubber.auto_repair = true;
4426 } else if (allow_scrub) {
4427 dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
4428 scrubber.deep_scrub_on_error = true;
4429 }
4430 }
4431 } else { // !allow_deep_scrub
4432 dout(20) << __func__ << ": nodeep_scrub set" << dendl;
4433 if (has_deep_errors) {
4434 osd->clog->error() << "osd." << osd->whoami
4435 << " pg " << info.pgid
4436 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
4437 return false;
4438 }
4439 }
4440
4441 // NOSCRUB set, so skip regular scrubs
4442 if (!allow_scrub && !scrubber.time_for_deep) {
4443 return false;
4444 }
4445 // scrubber.must_scrub
4446 } else if (!scrubber.must_deep_scrub && has_deep_errors) {
4447 osd->clog->error() << "osd." << osd->whoami
4448 << " pg " << info.pgid
4449 << " Regular scrub request, deep-scrub details will be lost";
4450 }
4451 // Unless precluded, this was handled above
4452 scrubber.need_auto = false;
4453
4454 ceph_assert(scrubber.reserved_peers.empty());
4455 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4456 osd->inc_scrubs_pending()) {
4457 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
4458 scrubber.reserved = true;
4459 scrubber.reserved_peers.insert(pg_whoami);
4460 scrub_reserve_replicas();
4461 } else {
4462 dout(20) << __func__ << ": failed to reserve locally" << dendl;
4463 return false;
4464 }
4465 }
4466
4467 if (scrubber.reserved) {
4468 if (scrubber.reserve_failed) {
4469 dout(20) << __func__ << ": failed, a peer declined" << dendl;
4470 clear_scrub_reserved();
4471 scrub_unreserve_replicas();
4472 return false;
4473 } else if (scrubber.reserved_peers.size() == actingset.size()) {
4474 dout(20) << __func__ << ": success, reserved self and replicas" << dendl;
4475 if (scrubber.time_for_deep) {
4476 dout(10) << __func__ << ": scrub will be deep" << dendl;
4477 state_set(PG_STATE_DEEP_SCRUB);
4478 scrubber.time_for_deep = false;
4479 }
4480 queue_scrub();
4481 } else {
4482 // none declined, since scrubber.reserved is set
4483 dout(20) << __func__ << ": reserved " << scrubber.reserved_peers
4484 << ", waiting for replicas" << dendl;
4485 }
4486 }
4487 return true;
4488 }
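
// A minimal stand-alone sketch of the deep-scrub decision above (not part of
// the build; the helper name and parameters are hypothetical): a periodic
// scrub is promoted to a deep scrub when the deep interval has elapsed since
// the last deep scrub, or otherwise by a coin flip weighted by
// osd_deep_scrub_randomize_ratio.
#if 0
#include <cstdlib>

static bool example_time_for_deep(double now, double last_deep_scrub_stamp,
                                  double deep_scrub_interval,
                                  double randomize_ratio, bool allow_scrub)
{
  bool due = now >= last_deep_scrub_stamp + deep_scrub_interval;
  bool deep_coin_flip = false;
  // only flip the coin when regular scrubs are allowed; otherwise every call
  // would eventually force a deep scrub (see the comment in sched_scrub())
  if (!due && allow_scrub)
    deep_coin_flip = (rand() % 100) < randomize_ratio * 100;
  return due || deep_coin_flip;
}
#endif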
4489
4490 bool PG::is_scrub_registered()
4491 {
4492 return !scrubber.scrub_reg_stamp.is_zero();
4493 }
4494
4495 void PG::reg_next_scrub()
4496 {
4497 if (!is_primary())
4498 return;
4499
4500 utime_t reg_stamp;
4501 bool must = false;
4502 if (scrubber.must_scrub || scrubber.need_auto) {
4503 // Set the smallest time that isn't utime_t()
4504 reg_stamp = Scrubber::scrub_must_stamp();
4505 must = true;
4506 } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
4507 reg_stamp = ceph_clock_now();
4508 must = true;
4509 } else {
4510 reg_stamp = info.history.last_scrub_stamp;
4511 }
4512 // note down the sched_time, so we can locate this scrub, and remove it
4513 // later on.
4514 double scrub_min_interval = 0, scrub_max_interval = 0;
4515 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
4516 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
4517 ceph_assert(!is_scrub_registered());
4518 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
4519 reg_stamp,
4520 scrub_min_interval,
4521 scrub_max_interval,
4522 must);
4523 dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
4524 << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
4525 }
4526
4527 void PG::unreg_next_scrub()
4528 {
4529 if (is_scrub_registered()) {
4530 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
4531 scrubber.scrub_reg_stamp = utime_t();
4532 }
4533 }
4534
4535 void PG::on_info_history_change()
4536 {
4537 unreg_next_scrub();
4538 reg_next_scrub();
4539 }
4540
4541 void PG::scrub_requested(bool deep, bool repair, bool need_auto)
4542 {
4543 unreg_next_scrub();
4544 if (need_auto) {
4545 scrubber.need_auto = true;
4546 } else {
4547 scrubber.must_scrub = true;
4548 scrubber.must_deep_scrub = deep || repair;
4549 scrubber.must_repair = repair;
4550 // User might intervene, so clear this
4551 scrubber.need_auto = false;
4552 }
4553 reg_next_scrub();
4554 }
4555
4556 void PG::do_replica_scrub_map(OpRequestRef op)
4557 {
4558 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
4559 dout(7) << __func__ << " " << *m << dendl;
4560 if (m->map_epoch < info.history.same_interval_since) {
4561 dout(10) << __func__ << " discarding old from "
4562 << m->map_epoch << " < " << info.history.same_interval_since
4563 << dendl;
4564 return;
4565 }
4566 if (!scrubber.is_chunky_scrub_active()) {
4567 dout(10) << __func__ << " scrub isn't active" << dendl;
4568 return;
4569 }
4570
4571 op->mark_started();
4572
4573 auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
4574 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4575 dout(10) << "map version is "
4576 << scrubber.received_maps[m->from].valid_through
4577 << dendl;
4578
4579 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
4580 << dendl;
4581 ceph_assert(scrubber.waiting_on_whom.count(m->from));
4582 scrubber.waiting_on_whom.erase(m->from);
4583 if (m->preempted) {
4584 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
4585 scrub_preempted = true;
4586 }
4587 if (scrubber.waiting_on_whom.empty()) {
4588 requeue_scrub(ops_blocked_by_scrub());
4589 }
4590 }
4591
4592 // send scrub v3 messages (chunky scrub)
4593 void PG::_request_scrub_map(
4594 pg_shard_t replica, eversion_t version,
4595 hobject_t start, hobject_t end,
4596 bool deep,
4597 bool allow_preemption)
4598 {
4599 ceph_assert(replica != pg_whoami);
4600 dout(10) << "scrub requesting scrubmap from osd." << replica
4601 << " deep " << (int)deep << dendl;
4602 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4603 spg_t(info.pgid.pgid, replica.shard), version,
4604 get_osdmap_epoch(),
4605 get_last_peering_reset(),
4606 start, end, deep,
4607 allow_preemption,
4608 scrubber.priority,
4609 ops_blocked_by_scrub());
4610 // default priority, we want the rep scrub processed prior to any recovery
4611 // or client io messages (we are holding a lock!)
4612 osd->send_message_osd_cluster(
4613 replica.osd, repscrubop, get_osdmap_epoch());
4614 }
4615
4616 void PG::handle_scrub_reserve_request(OpRequestRef op)
4617 {
4618 dout(7) << __func__ << " " << *op->get_req() << dendl;
4619 op->mark_started();
4620 if (scrubber.reserved) {
4621 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4622 << dendl;
4623 return;
4624 }
4625 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4626 osd->inc_scrubs_pending()) {
4627 scrubber.reserved = true;
4628 } else {
4629 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4630 scrubber.reserved = false;
4631 }
4632 const MOSDScrubReserve *m =
4633 static_cast<const MOSDScrubReserve*>(op->get_req());
4634 Message *reply = new MOSDScrubReserve(
4635 spg_t(info.pgid.pgid, primary.shard),
4636 m->map_epoch,
4637 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4638 pg_whoami);
4639 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4640 }
4641
4642 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4643 {
4644 dout(7) << __func__ << " " << *op->get_req() << dendl;
4645 op->mark_started();
4646 if (!scrubber.reserved) {
4647 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4648 return;
4649 }
4650 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4651 dout(10) << " already had osd." << from << " reserved" << dendl;
4652 } else {
4653 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4654 scrubber.reserved_peers.insert(from);
4655 sched_scrub();
4656 }
4657 }
4658
4659 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4660 {
4661 dout(7) << __func__ << " " << *op->get_req() << dendl;
4662 op->mark_started();
4663 if (!scrubber.reserved) {
4664 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4665 return;
4666 }
4667 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4668 dout(10) << " already had osd." << from << " reserved" << dendl;
4669 } else {
4670 /* One decline stops this pg from being scheduled for scrubbing. */
4671 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4672 scrubber.reserve_failed = true;
4673 sched_scrub();
4674 }
4675 }
4676
4677 void PG::handle_scrub_reserve_release(OpRequestRef op)
4678 {
4679 dout(7) << __func__ << " " << *op->get_req() << dendl;
4680 op->mark_started();
4681 clear_scrub_reserved();
4682 }
4683
4684 // We can zero the value of primary num_bytes with just an atomic store.
4685 // However, setting it above zero reserves space for backfill and requires
4686 // the OSDService::stat_lock, which protects all OSD usage
4687 void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
4688 ceph_assert(osd->stat_lock.is_locked_by_me());
4689 primary_num_bytes.store(primary);
4690 local_num_bytes.store(local);
4691 return;
4692 }
4693
4694 void PG::clear_reserved_num_bytes() {
4695 primary_num_bytes.store(0);
4696 local_num_bytes.store(0);
4697 return;
4698 }
4699
4700 void PG::reject_reservation()
4701 {
4702 clear_reserved_num_bytes();
4703 osd->send_message_osd_cluster(
4704 primary.osd,
4705 new MBackfillReserve(
4706 MBackfillReserve::REJECT,
4707 spg_t(info.pgid.pgid, primary.shard),
4708 get_osdmap_epoch()),
4709 get_osdmap_epoch());
4710 }
4711
4712 void PG::schedule_backfill_retry(float delay)
4713 {
4714 std::lock_guard lock(osd->recovery_request_lock);
4715 osd->recovery_request_timer.add_event_after(
4716 delay,
4717 new QueuePeeringEvt<RequestBackfill>(
4718 this, get_osdmap_epoch(),
4719 RequestBackfill()));
4720 }
4721
4722 void PG::schedule_recovery_retry(float delay)
4723 {
4724 std::lock_guard lock(osd->recovery_request_lock);
4725 osd->recovery_request_timer.add_event_after(
4726 delay,
4727 new QueuePeeringEvt<DoRecovery>(
4728 this, get_osdmap_epoch(),
4729 DoRecovery()));
4730 }
4731
4732 void PG::clear_scrub_reserved()
4733 {
4734 scrubber.reserved_peers.clear();
4735 scrubber.reserve_failed = false;
4736
4737 if (scrubber.reserved) {
4738 scrubber.reserved = false;
4739 osd->dec_scrubs_pending();
4740 }
4741 }
4742
4743 void PG::scrub_reserve_replicas()
4744 {
4745 ceph_assert(backfill_targets.empty());
4746 for (set<pg_shard_t>::iterator i = actingset.begin();
4747 i != actingset.end();
4748 ++i) {
4749 if (*i == pg_whoami) continue;
4750 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4751 osd->send_message_osd_cluster(
4752 i->osd,
4753 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4754 get_osdmap_epoch(),
4755 MOSDScrubReserve::REQUEST, pg_whoami),
4756 get_osdmap_epoch());
4757 }
4758 }
4759
4760 void PG::scrub_unreserve_replicas()
4761 {
4762 ceph_assert(backfill_targets.empty());
4763 for (set<pg_shard_t>::iterator i = actingset.begin();
4764 i != actingset.end();
4765 ++i) {
4766 if (*i == pg_whoami) continue;
4767 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4768 osd->send_message_osd_cluster(
4769 i->osd,
4770 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4771 get_osdmap_epoch(),
4772 MOSDScrubReserve::RELEASE, pg_whoami),
4773 get_osdmap_epoch());
4774 }
4775 }
4776
4777 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
4778 {
4779 ObjectStore::Transaction t;
4780 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4781 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4782 i != rollback_obs.end();
4783 ++i) {
4784 if (i->generation < trimmed_to.version) {
4785 dout(10) << __func__ << " osd." << osd->whoami
4786 << " pg " << info.pgid
4787 << " found obsolete rollback obj "
4788 << *i << " generation < trimmed_to "
4789 << trimmed_to
4790 << "...repaired" << dendl;
4791 t.remove(coll, *i);
4792 }
4793 }
4794 if (!t.empty()) {
4795 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4796 << dendl;
4797 osd->store->queue_transaction(ch, std::move(t), NULL);
4798 }
4799 }
4800
4801 void PG::_scan_snaps(ScrubMap &smap)
4802 {
4803 hobject_t head;
4804 SnapSet snapset;
4805
4806 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4807 // that the caller is using clean_meta_map(), and that it works properly.
4808 dout(20) << __func__ << " start" << dendl;
4809
4810 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4811 i != smap.objects.rend();
4812 ++i) {
4813 const hobject_t &hoid = i->first;
4814 ScrubMap::object &o = i->second;
4815
4816 dout(20) << __func__ << " " << hoid << dendl;
4817
4818 ceph_assert(!hoid.is_snapdir());
4819 if (hoid.is_head()) {
4820 // parse the SnapSet
4821 bufferlist bl;
4822 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4823 continue;
4824 }
4825 bl.push_back(o.attrs[SS_ATTR]);
4826 auto p = bl.cbegin();
4827 try {
4828 decode(snapset, p);
4829 } catch(...) {
4830 continue;
4831 }
4832 head = hoid.get_head();
4833 continue;
4834 }
4835 if (hoid.snap < CEPH_MAXSNAP) {
4836 // check and if necessary fix snap_mapper
4837 if (hoid.get_head() != head) {
4838 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4839 << dendl;
4840 continue;
4841 }
4842 set<snapid_t> obj_snaps;
4843 auto p = snapset.clone_snaps.find(hoid.snap);
4844 if (p == snapset.clone_snaps.end()) {
4845 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4846 << dendl;
4847 continue;
4848 }
4849 obj_snaps.insert(p->second.begin(), p->second.end());
4850 set<snapid_t> cur_snaps;
4851 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4852 if (r != 0 && r != -ENOENT) {
4853 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4854 ceph_abort();
4855 }
4856 if (r == -ENOENT || cur_snaps != obj_snaps) {
4857 ObjectStore::Transaction t;
4858 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4859 if (r == 0) {
4860 r = snap_mapper.remove_oid(hoid, &_t);
4861 if (r != 0) {
4862 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4863 << dendl;
4864 ceph_abort();
4865 }
4866 osd->clog->error() << "osd." << osd->whoami
4867 << " found snap mapper error on pg "
4868 << info.pgid
4869 << " oid " << hoid << " snaps in mapper: "
4870 << cur_snaps << ", oi: "
4871 << obj_snaps
4872 << "...repaired";
4873 } else {
4874 osd->clog->error() << "osd." << osd->whoami
4875 << " found snap mapper error on pg "
4876 << info.pgid
4877 << " oid " << hoid << " snaps missing in mapper"
4878 << ", should be: "
4879 << obj_snaps
4880 << " was " << cur_snaps << " r " << r
4881 << "...repaired";
4882 }
4883 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4884
4885 // wait for repair to apply to avoid confusing other bits of the system.
4886 {
4887 Cond my_cond;
4888 Mutex my_lock("PG::_scan_snaps my_lock");
4889 int r = 0;
4890 bool done;
4891 t.register_on_applied_sync(
4892 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4893 r = osd->store->queue_transaction(ch, std::move(t));
4894 if (r != 0) {
4895 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4896 << dendl;
4897 } else {
4898 my_lock.Lock();
4899 while (!done)
4900 my_cond.Wait(my_lock);
4901 my_lock.Unlock();
4902 }
4903 }
4904 }
4905 }
4906 }
4907 }
4908
4909 void PG::_repair_oinfo_oid(ScrubMap &smap)
4910 {
4911 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4912 i != smap.objects.rend();
4913 ++i) {
4914 const hobject_t &hoid = i->first;
4915 ScrubMap::object &o = i->second;
4916
4917 bufferlist bl;
4918 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4919 continue;
4920 }
4921 bl.push_back(o.attrs[OI_ATTR]);
4922 object_info_t oi;
4923 try {
4924 oi.decode(bl);
4925 } catch(...) {
4926 continue;
4927 }
4928 if (oi.soid != hoid) {
4929 ObjectStore::Transaction t;
4930 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4931 osd->clog->error() << "osd." << osd->whoami
4932 << " found object info error on pg "
4933 << info.pgid
4934 << " oid " << hoid << " oid in object info: "
4935 << oi.soid
4936 << "...repaired";
4937 // Fix object info
4938 oi.soid = hoid;
4939 bl.clear();
4940 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4941
4942 bufferptr bp(bl.c_str(), bl.length());
4943 o.attrs[OI_ATTR] = bp;
4944
4945 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4946 int r = osd->store->queue_transaction(ch, std::move(t));
4947 if (r != 0) {
4948 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4949 << dendl;
4950 }
4951 }
4952 }
4953 }
4954 int PG::build_scrub_map_chunk(
4955 ScrubMap &map,
4956 ScrubMapBuilder &pos,
4957 hobject_t start,
4958 hobject_t end,
4959 bool deep,
4960 ThreadPool::TPHandle &handle)
4961 {
4962 dout(10) << __func__ << " [" << start << "," << end << ") "
4963 << " pos " << pos
4964 << dendl;
4965
4966 // start
4967 while (pos.empty()) {
4968 pos.deep = deep;
4969 map.valid_through = info.last_update;
4970
4971 // objects
4972 vector<ghobject_t> rollback_obs;
4973 pos.ret = get_pgbackend()->objects_list_range(
4974 start,
4975 end,
4976 &pos.ls,
4977 &rollback_obs);
4978 if (pos.ret < 0) {
4979 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4980 return pos.ret;
4981 }
4982 if (pos.ls.empty()) {
4983 break;
4984 }
4985 _scan_rollback_obs(rollback_obs);
4986 pos.pos = 0;
4987 return -EINPROGRESS;
4988 }
4989
4990 // scan objects
4991 while (!pos.done()) {
4992 int r = get_pgbackend()->be_scan_list(map, pos);
4993 if (r == -EINPROGRESS) {
4994 return r;
4995 }
4996 }
4997
4998 // finish
4999 dout(20) << __func__ << " finishing" << dendl;
5000 ceph_assert(pos.done());
5001 _repair_oinfo_oid(map);
5002 if (!is_primary()) {
5003 ScrubMap for_meta_scrub;
5004 // In case we restarted smaller chunk, clear old data
5005 scrubber.cleaned_meta_map.clear_from(scrubber.start);
5006 scrubber.cleaned_meta_map.insert(map);
5007 scrubber.clean_meta_map(for_meta_scrub);
5008 _scan_snaps(for_meta_scrub);
5009 }
5010
5011 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
5012 << dendl;
5013 return 0;
5014 }
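
// A minimal sketch of how the incremental interface above is driven (not part
// of the build; the helper is hypothetical -- in the real scrubber
// chunky_scrub() requeues itself between calls instead of looping inline):
#if 0
static int example_drive_chunk(PG *pg, ScrubMap &map, ScrubMapBuilder &pos,
                               const hobject_t &start, const hobject_t &end,
                               bool deep, ThreadPool::TPHandle &handle)
{
  int r;
  do {
    // each call makes bounded progress and returns -EINPROGRESS until the
    // whole [start, end) range has been listed and scanned
    r = pg->build_scrub_map_chunk(map, pos, start, end, deep, handle);
  } while (r == -EINPROGRESS);
  return r;  // 0 on success, negative errno on listing failure
}
#endif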
5015
5016 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
5017 if (!store)
5018 return;
5019 struct OnComplete : Context {
5020 std::unique_ptr<Scrub::Store> store;
5021 explicit OnComplete(
5022 std::unique_ptr<Scrub::Store> &&store)
5023 : store(std::move(store)) {}
5024 void finish(int) override {}
5025 };
5026 store->cleanup(t);
5027 t->register_on_complete(new OnComplete(std::move(store)));
5028 ceph_assert(!store);
5029 }
5030
5031 void PG::repair_object(
5032 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
5033 pg_shard_t bad_peer)
5034 {
5035 list<pg_shard_t> op_shards;
5036 for (auto i : *ok_peers) {
5037 op_shards.push_back(i.second);
5038 }
5039 dout(10) << "repair_object " << soid << " bad_peer osd."
5040 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
5041 ScrubMap::object &po = ok_peers->back().first;
5042 eversion_t v;
5043 bufferlist bv;
5044 bv.push_back(po.attrs[OI_ATTR]);
5045 object_info_t oi;
5046 try {
5047 auto bliter = bv.cbegin();
5048 decode(oi, bliter);
5049 } catch (...) {
5050 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
5051 ceph_abort();
5052 }
5053 if (bad_peer != primary) {
5054 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
5055 } else {
5056 // We should only be scrubbing if the PG is clean.
5057 ceph_assert(waiting_for_unreadable_object.empty());
5058
5059 pg_log.missing_add(soid, oi.version, eversion_t());
5060
5061 pg_log.set_last_requested(0);
5062 dout(10) << __func__ << ": primary = " << primary << dendl;
5063 }
5064
5065 if (is_ec_pg() || bad_peer == primary) {
5066 // we'd better collect all shards for an EC pg, and prepare good peers as the
5067 // source of the pull in the case of a replicated pg.
5068 missing_loc.add_missing(soid, oi.version, eversion_t());
5069 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
5070 for (i = ok_peers->begin();
5071 i != ok_peers->end();
5072 ++i)
5073 missing_loc.add_location(soid, i->second);
5074 }
5075 }
5076
5077 /* replica_scrub
5078 *
5079 * Wait for last_update_applied to match msg->scrub_to as above. Wait
5080 * for pushes to complete in case of recent recovery. Build a single
5081 * scrubmap of objects that are in the range [msg->start, msg->end).
5082 */
5083 void PG::replica_scrub(
5084 OpRequestRef op,
5085 ThreadPool::TPHandle &handle)
5086 {
5087 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
5088 ceph_assert(!scrubber.active_rep_scrub);
5089 dout(7) << "replica_scrub" << dendl;
5090
5091 if (msg->map_epoch < info.history.same_interval_since) {
5092 dout(10) << "replica_scrub discarding old replica_scrub from "
5093 << msg->map_epoch << " < " << info.history.same_interval_since
5094 << dendl;
5095 return;
5096 }
5097
5098 ceph_assert(msg->chunky);
5099 if (active_pushes > 0) {
5100 dout(10) << "waiting for active pushes to finish" << dendl;
5101 scrubber.active_rep_scrub = op;
5102 return;
5103 }
5104
5105 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
5106 scrubber.replica_scrub_start = msg->min_epoch;
5107 scrubber.start = msg->start;
5108 scrubber.end = msg->end;
5109 scrubber.max_end = msg->end;
5110 scrubber.deep = msg->deep;
5111 scrubber.epoch_start = info.history.same_interval_since;
5112 if (msg->priority) {
5113 scrubber.priority = msg->priority;
5114 } else {
5115 scrubber.priority = get_scrub_priority();
5116 }
5117
5118 scrub_can_preempt = msg->allow_preemption;
5119 scrub_preempted = false;
5120 scrubber.replica_scrubmap_pos.reset();
5121
5122 requeue_scrub(msg->high_priority);
5123 }
5124
5125 /* Scrub:
5126 * PG_STATE_SCRUBBING is set when the scrub is queued
5127 *
5128 * scrub will be chunky if all OSDs in PG support chunky scrub
5129 * scrub will fail if OSDs are too old.
5130 */
5131 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
5132 {
5133 if (cct->_conf->osd_scrub_sleep > 0 &&
5134 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
5135 scrubber.state == PG::Scrubber::INACTIVE) &&
5136 scrubber.needs_sleep) {
5137 ceph_assert(!scrubber.sleeping);
5138 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
5139
5140 // Do an async sleep so we don't block the op queue
5141 OSDService *osds = osd;
5142 spg_t pgid = get_pgid();
5143 int state = scrubber.state;
5144 auto scrub_requeue_callback =
5145 new FunctionContext([osds, pgid, state](int r) {
5146 PGRef pg = osds->osd->lookup_lock_pg(pgid);
5147 if (pg == nullptr) {
5148 lgeneric_dout(osds->osd->cct, 20)
5149 << "scrub_requeue_callback: Could not find "
5150 << "PG " << pgid << " can't complete scrub requeue after sleep"
5151 << dendl;
5152 return;
5153 }
5154 pg->scrubber.sleeping = false;
5155 pg->scrubber.needs_sleep = false;
5156 lgeneric_dout(pg->cct, 20)
5157 << "scrub_requeue_callback: slept for "
5158 << ceph_clock_now() - pg->scrubber.sleep_start
5159 << ", re-queuing scrub with state " << state << dendl;
5160 pg->scrub_queued = false;
5161 pg->requeue_scrub();
5162 pg->scrubber.sleep_start = utime_t();
5163 pg->unlock();
5164 });
5165 std::lock_guard l(osd->sleep_lock);
5166 osd->sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
5167 scrub_requeue_callback);
5168 scrubber.sleeping = true;
5169 scrubber.sleep_start = ceph_clock_now();
5170 return;
5171 }
5172 if (pg_has_reset_since(queued)) {
5173 return;
5174 }
5175 ceph_assert(scrub_queued);
5176 scrub_queued = false;
5177 scrubber.needs_sleep = true;
5178
5179 // for the replica
5180 if (!is_primary() &&
5181 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
5182 chunky_scrub(handle);
5183 return;
5184 }
5185
5186 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
5187 dout(10) << "scrub -- not primary or active or not clean" << dendl;
5188 state_clear(PG_STATE_SCRUBBING);
5189 state_clear(PG_STATE_REPAIR);
5190 state_clear(PG_STATE_DEEP_SCRUB);
5191 publish_stats_to_osd();
5192 return;
5193 }
5194
5195 if (!scrubber.active) {
5196 ceph_assert(backfill_targets.empty());
5197
5198 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
5199
5200 dout(10) << "starting a new chunky scrub" << dendl;
5201 }
5202
5203 chunky_scrub(handle);
5204 }
5205
5206 /*
5207 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
5208 * chunk.
5209 *
5210 * The object store is partitioned into chunks which end on hash boundaries. For
5211 * each chunk, the following logic is performed:
5212 *
5213 * (1) Block writes on the chunk
5214 * (2) Request maps from replicas
5215 * (3) Wait for pushes to be applied (after recovery)
5216 * (4) Wait for writes to flush on the chunk
5217 * (5) Wait for maps from replicas
5218 * (6) Compare / repair all scrub maps
5219 * (7) Wait for digest updates to apply
5220 *
5221 * This logic is encoded in the mostly linear state machine:
5222 *
5223 * +------------------+
5224 * _________v__________ |
5225 * | | |
5226 * | INACTIVE | |
5227 * |____________________| |
5228 * | |
5229 * | +----------+ |
5230 * _________v___v______ | |
5231 * | | | |
5232 * | NEW_CHUNK | | |
5233 * |____________________| | |
5234 * | | |
5235 * _________v__________ | |
5236 * | | | |
5237 * | WAIT_PUSHES | | |
5238 * |____________________| | |
5239 * | | |
5240 * _________v__________ | |
5241 * | | | |
5242 * | WAIT_LAST_UPDATE | | |
5243 * |____________________| | |
5244 * | | |
5245 * _________v__________ | |
5246 * | | | |
5247 * | BUILD_MAP | | |
5248 * |____________________| | |
5249 * | | |
5250 * _________v__________ | |
5251 * | | | |
5252 * | WAIT_REPLICAS | | |
5253 * |____________________| | |
5254 * | | |
5255 * _________v__________ | |
5256 * | | | |
5257 * | COMPARE_MAPS | | |
5258 * |____________________| | |
5259 * | | |
5260 * | | |
5261 * _________v__________ | |
5262 * | | | |
5263 * |WAIT_DIGEST_UPDATES | | |
5264 * |____________________| | |
5265 * | | | |
5266 * | +----------+ |
5267 * _________v__________ |
5268 * | | |
5269 * | FINISH | |
5270 * |____________________| |
5271 * | |
5272 * +------------------+
5273 *
5274 * The primary determines the last update affecting the chunk (subset_last_update) by
5275 * walking the log. If it sees a log entry for an object in the chunk, it tells the replicas
5276 * to wait until that update is applied before building a scrub map. Both the
5277 * primary and replicas will wait for any active pushes to be applied.
5278 *
5279 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
5280 *
5281 * scrubber.state encodes the current state of the scrub (refer to state diagram
5282 * for details).
5283 */
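// Preemption (not shown in the diagram above): while a chunk is being built, a
// client write targeting the chunk may preempt it (see write_blocked_by_scrub).
// Each preemption doubles scrubber.preempt_divisor, so the retried chunk is
// roughly half as large; for example, with osd_scrub_chunk_min = 5 and
// osd_scrub_chunk_max = 25 (illustrative values), one preemption yields bounds
// max(3, 5/2) = 3 and max(3, 25/2) = 12. Once scrubber.preempt_left reaches
// zero, the remaining chunks are no longer preemptible.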
5284 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
5285 {
5286 // check for map changes
5287 if (scrubber.is_chunky_scrub_active()) {
5288 if (scrubber.epoch_start != info.history.same_interval_since) {
5289 dout(10) << "scrub pg changed, aborting" << dendl;
5290 scrub_clear_state();
5291 scrub_unreserve_replicas();
5292 return;
5293 }
5294 }
5295
5296 bool done = false;
5297 int ret;
5298
5299 while (!done) {
5300 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
5301 << " [" << scrubber.start << "," << scrubber.end << ")"
5302 << " max_end " << scrubber.max_end << dendl;
5303
5304 switch (scrubber.state) {
5305 case PG::Scrubber::INACTIVE:
5306 dout(10) << "scrub start" << dendl;
5307 ceph_assert(is_primary());
5308
5309 publish_stats_to_osd();
5310 scrubber.epoch_start = info.history.same_interval_since;
5311 scrubber.active = true;
5312
5313 osd->inc_scrubs_active(scrubber.reserved);
5314 if (scrubber.reserved) {
5315 scrubber.reserved = false;
5316 scrubber.reserved_peers.clear();
5317 }
5318
5319 {
5320 ObjectStore::Transaction t;
5321 scrubber.cleanup_store(&t);
5322 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
5323 info.pgid, coll));
5324 osd->store->queue_transaction(ch, std::move(t), nullptr);
5325 }
5326
5327 // Don't include temporary objects when scrubbing
5328 scrubber.start = info.pgid.pgid.get_hobj_start();
5329 scrubber.state = PG::Scrubber::NEW_CHUNK;
5330
5331 {
5332 bool repair = state_test(PG_STATE_REPAIR);
5333 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5334 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5335 stringstream oss;
5336 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
5337 osd->clog->debug(oss);
5338 }
5339
5340 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5341 "osd_scrub_max_preemptions");
5342 scrubber.preempt_divisor = 1;
5343 break;
5344
5345 case PG::Scrubber::NEW_CHUNK:
5346 scrubber.primary_scrubmap = ScrubMap();
5347 scrubber.received_maps.clear();
5348
5349 // begin (possible) preemption window
5350 if (scrub_preempted) {
5351 scrubber.preempt_left--;
5352 scrubber.preempt_divisor *= 2;
5353 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
5354 << " left" << dendl;
5355 scrub_preempted = false;
5356 }
5357 scrub_can_preempt = scrubber.preempt_left > 0;
5358
5359 {
5360 /* get the start and end of our scrub chunk
5361 *
5362 * Our scrub chunk has an important restriction we're going to need to
5363 * respect. We can't let head be start or end.
5364 * Using a half-open interval means that if end == head,
5365 * we'd scrub/lock head and the clone right next to head in different
5366 * chunks which would allow us to miss clones created between
5367 * scrubbing that chunk and scrubbing the chunk including head.
5368 * This isn't true for any of the other clones since clones can
5369 * only be created "just to the left of" head. There is one exception
5370 * to this: promotion of clones which always happens to the left of the
5371 * left-most clone, but promote_object checks the scrubber in that
5372 * case, so it should be ok. Also, it's ok to "miss" clones at the
5373 * left end of the range if we are a tier because they may legitimately
5374 * not exist (see _scrub).
5375 */
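// Roughly: if the listing stops right at some object's head, pull the end of
// the chunk back (to the preceding clone, or to the object boundary) so that
// the head and any clone created immediately to its left are scrubbed in the
// same chunk rather than split across two chunks.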
5376 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
5377 scrubber.preempt_divisor);
5378 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
5379 scrubber.preempt_divisor);
5380 hobject_t start = scrubber.start;
5381 hobject_t candidate_end;
5382 vector<hobject_t> objects;
5383 ret = get_pgbackend()->objects_list_partial(
5384 start,
5385 min,
5386 max,
5387 &objects,
5388 &candidate_end);
5389 ceph_assert(ret >= 0);
5390
5391 if (!objects.empty()) {
5392 hobject_t back = objects.back();
5393 while (candidate_end.is_head() &&
5394 candidate_end == back.get_head()) {
5395 candidate_end = back;
5396 objects.pop_back();
5397 if (objects.empty()) {
5398 ceph_assert(0 ==
5399 "Somehow we got more than 2 objects which"
5400 "have the same head but are not clones");
5401 }
5402 back = objects.back();
5403 }
5404 if (candidate_end.is_head()) {
5405 ceph_assert(candidate_end != back.get_head());
5406 candidate_end = candidate_end.get_object_boundary();
5407 }
5408 } else {
5409 ceph_assert(candidate_end.is_max());
5410 }
5411
5412 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
5413 // we'll be requeued by whatever made us unavailable for scrub
5414 dout(10) << __func__ << ": scrub blocked somewhere in range "
5415 << "[" << scrubber.start << ", " << candidate_end << ")"
5416 << dendl;
5417 done = true;
5418 break;
5419 }
5420 scrubber.end = candidate_end;
5421 if (scrubber.end > scrubber.max_end)
5422 scrubber.max_end = scrubber.end;
5423 }
5424
5425 // walk the log to find the latest update that affects our chunk
5426 scrubber.subset_last_update = eversion_t();
5427 for (auto p = projected_log.log.rbegin();
5428 p != projected_log.log.rend();
5429 ++p) {
5430 if (p->soid >= scrubber.start &&
5431 p->soid < scrubber.end) {
5432 scrubber.subset_last_update = p->version;
5433 break;
5434 }
5435 }
5436 if (scrubber.subset_last_update == eversion_t()) {
5437 for (list<pg_log_entry_t>::const_reverse_iterator p =
5438 pg_log.get_log().log.rbegin();
5439 p != pg_log.get_log().log.rend();
5440 ++p) {
5441 if (p->soid >= scrubber.start &&
5442 p->soid < scrubber.end) {
5443 scrubber.subset_last_update = p->version;
5444 break;
5445 }
5446 }
5447 }
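// If neither the projected log nor the pg log touches this chunk,
// subset_last_update stays at eversion_t() and WAIT_LAST_UPDATE passes
// immediately.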
5448
5449 scrubber.state = PG::Scrubber::WAIT_PUSHES;
5450 break;
5451
5452 case PG::Scrubber::WAIT_PUSHES:
5453 if (active_pushes == 0) {
5454 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
5455 } else {
5456 dout(15) << "wait for pushes to apply" << dendl;
5457 done = true;
5458 }
5459 break;
5460
5461 case PG::Scrubber::WAIT_LAST_UPDATE:
5462 if (last_update_applied < scrubber.subset_last_update) {
5463 // will be requeued by op_applied
5464 dout(15) << "wait for EC read/modify/writes to queue" << dendl;
5465 done = true;
5466 break;
5467 }
5468
5469 // ask replicas to scan
5470 scrubber.waiting_on_whom.insert(pg_whoami);
5471
5472 // request maps from replicas
5473 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5474 i != acting_recovery_backfill.end();
5475 ++i) {
5476 if (*i == pg_whoami) continue;
5477 _request_scrub_map(*i, scrubber.subset_last_update,
5478 scrubber.start, scrubber.end, scrubber.deep,
5479 scrubber.preempt_left > 0);
5480 scrubber.waiting_on_whom.insert(*i);
5481 }
5482 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
5483 << dendl;
5484
5485 scrubber.state = PG::Scrubber::BUILD_MAP;
5486 scrubber.primary_scrubmap_pos.reset();
5487 break;
5488
5489 case PG::Scrubber::BUILD_MAP:
5490 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5491
5492 // build my own scrub map
5493 if (scrub_preempted) {
5494 dout(10) << __func__ << " preempted" << dendl;
5495 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5496 break;
5497 }
5498 ret = build_scrub_map_chunk(
5499 scrubber.primary_scrubmap,
5500 scrubber.primary_scrubmap_pos,
5501 scrubber.start, scrubber.end,
5502 scrubber.deep,
5503 handle);
5504 if (ret == -EINPROGRESS) {
5505 requeue_scrub();
5506 done = true;
5507 break;
5508 }
5509 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5510 break;
5511
5512 case PG::Scrubber::BUILD_MAP_DONE:
5513 if (scrubber.primary_scrubmap_pos.ret < 0) {
5514 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5515 << ", aborting" << dendl;
5516 scrub_clear_state();
5517 scrub_unreserve_replicas();
5518 return;
5519 }
5520 dout(10) << __func__ << " waiting_on_whom was "
5521 << scrubber.waiting_on_whom << dendl;
5522 ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
5523 scrubber.waiting_on_whom.erase(pg_whoami);
5524
5525 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5526 break;
5527
5528 case PG::Scrubber::WAIT_REPLICAS:
5529 if (!scrubber.waiting_on_whom.empty()) {
5530 // will be requeued by sub_op_scrub_map
5531 dout(10) << "wait for replicas to build scrub map" << dendl;
5532 done = true;
5533 break;
5534 }
5535 // end (possible) preemption window
5536 scrub_can_preempt = false;
5537 if (scrub_preempted) {
5538 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5539 scrubber.state = PG::Scrubber::NEW_CHUNK;
5540 } else {
5541 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5542 }
5543 break;
5544
5545 case PG::Scrubber::COMPARE_MAPS:
5546 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5547 ceph_assert(scrubber.waiting_on_whom.empty());
5548
5549 scrub_compare_maps();
5550 scrubber.start = scrubber.end;
5551 scrubber.run_callbacks();
5552
5553 // requeue the writes from the chunk that just finished
5554 requeue_ops(waiting_for_scrub);
5555
5556 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5557
5558 // fall-thru
5559
5560 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5561 if (scrubber.num_digest_updates_pending) {
5562 dout(10) << __func__ << " waiting on "
5563 << scrubber.num_digest_updates_pending
5564 << " digest updates" << dendl;
5565 done = true;
5566 break;
5567 }
5568
5569 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5570 "osd_scrub_max_preemptions");
5571 scrubber.preempt_divisor = 1;
5572
5573 if (!(scrubber.end.is_max())) {
5574 scrubber.state = PG::Scrubber::NEW_CHUNK;
5575 requeue_scrub();
5576 done = true;
5577 } else {
5578 scrubber.state = PG::Scrubber::FINISH;
5579 }
5580
5581 break;
5582
5583 case PG::Scrubber::FINISH:
5584 scrub_finish();
5585 scrubber.state = PG::Scrubber::INACTIVE;
5586 done = true;
5587
5588 if (!snap_trimq.empty()) {
5589 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5590 snap_trimmer_scrub_complete();
5591 }
5592
5593 break;
5594
5595 case PG::Scrubber::BUILD_MAP_REPLICA:
5596 // build my own scrub map
5597 if (scrub_preempted) {
5598 dout(10) << __func__ << " preempted" << dendl;
5599 ret = 0;
5600 } else {
5601 ret = build_scrub_map_chunk(
5602 scrubber.replica_scrubmap,
5603 scrubber.replica_scrubmap_pos,
5604 scrubber.start, scrubber.end,
5605 scrubber.deep,
5606 handle);
5607 }
5608 if (ret == -EINPROGRESS) {
5609 requeue_scrub();
5610 done = true;
5611 break;
5612 }
5613 // reply
5614 {
5615 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5616 spg_t(info.pgid.pgid, get_primary().shard),
5617 scrubber.replica_scrub_start,
5618 pg_whoami);
5619 reply->preempted = scrub_preempted;
5620 ::encode(scrubber.replica_scrubmap, reply->get_data());
5621 osd->send_message_osd_cluster(
5622 get_primary().osd, reply,
5623 scrubber.replica_scrub_start);
5624 }
5625 scrub_preempted = false;
5626 scrub_can_preempt = false;
5627 scrubber.state = PG::Scrubber::INACTIVE;
5628 scrubber.replica_scrubmap = ScrubMap();
5629 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5630 scrubber.start = hobject_t();
5631 scrubber.end = hobject_t();
5632 scrubber.max_end = hobject_t();
5633 done = true;
5634 break;
5635
5636 default:
5637 ceph_abort();
5638 }
5639 }
5640 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5641 << " [" << scrubber.start << "," << scrubber.end << ")"
5642 << " max_end " << scrubber.max_end << dendl;
5643 }
5644
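// Returns true if a write to soid must wait for the current scrub chunk.
// If preemption is still allowed, mark the scrub preempted instead and let
// the write proceed.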
5645 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5646 {
5647 if (soid < scrubber.start || soid >= scrubber.end) {
5648 return false;
5649 }
5650 if (scrub_can_preempt) {
5651 if (!scrub_preempted) {
5652 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5653 scrub_preempted = true;
5654 } else {
5655 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5656 }
5657 return false;
5658 }
5659 return true;
5660 }
5661
5662 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5663 {
5664 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5665 return (start < scrubber.max_end &&
5666 end >= scrubber.start);
5667 }
5668
5669 void PG::scrub_clear_state(bool has_error)
5670 {
5671 ceph_assert(is_locked());
5672 state_clear(PG_STATE_SCRUBBING);
5673 if (!has_error)
5674 state_clear(PG_STATE_REPAIR);
5675 state_clear(PG_STATE_DEEP_SCRUB);
5676 publish_stats_to_osd();
5677
5678 // active -> nothing.
5679 if (scrubber.active)
5680 osd->dec_scrubs_active();
5681
5682 requeue_ops(waiting_for_scrub);
5683
5684 scrubber.reset();
5685
5686 // type-specific state clear
5687 _scrub_clear_state();
5688 }
5689
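// Merge the primary's map into cleaned_meta_map, gather the replica maps,
// build the union of all scanned objects, run the omap checks, and (when more
// than one OSD is acting) compare the maps to pick authoritative copies before
// doing the type-specific snapshot-metadata scrub and flushing any
// inconsistency records to the scrub store.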
5690 void PG::scrub_compare_maps()
5691 {
5692 dout(10) << __func__ << " has maps, analyzing" << dendl;
5693
5694 // construct authoritative scrub map for type specific scrubbing
5695 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5696 map<hobject_t,
5697 pair<boost::optional<uint32_t>,
5698 boost::optional<uint32_t>>> missing_digest;
5699
5700 map<pg_shard_t, ScrubMap *> maps;
5701 maps[pg_whoami] = &scrubber.primary_scrubmap;
5702
5703 for (const auto& i : acting_recovery_backfill) {
5704 if (i == pg_whoami) continue;
5705 dout(2) << __func__ << " replica " << i << " has "
5706 << scrubber.received_maps[i].objects.size()
5707 << " items" << dendl;
5708 maps[i] = &scrubber.received_maps[i];
5709 }
5710
5711 set<hobject_t> master_set;
5712
5713 // Construct master set
5714 for (const auto& map : maps) {
5715 for (const auto& i : map.second->objects) {
5716 master_set.insert(i.first);
5717 }
5718 }
5719
5720 stringstream ss;
5721 get_pgbackend()->be_omap_checks(maps, master_set,
5722 scrubber.omap_stats, ss);
5723
5724 if (!ss.str().empty()) {
5725 osd->clog->warn(ss);
5726 }
5727
5728 if (acting.size() > 1) {
5729 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5730
5731 // Map from object with errors to good peer
5732 map<hobject_t, list<pg_shard_t>> authoritative;
5733
5734 dout(2) << __func__ << " osd." << acting[0] << " has "
5735 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5736
5737 ss.str("");
5738 ss.clear();
5739
5740 get_pgbackend()->be_compare_scrubmaps(
5741 maps,
5742 master_set,
5743 state_test(PG_STATE_REPAIR),
5744 scrubber.missing,
5745 scrubber.inconsistent,
5746 authoritative,
5747 missing_digest,
5748 scrubber.shallow_errors,
5749 scrubber.deep_errors,
5750 scrubber.store.get(),
5751 info.pgid, acting,
5752 ss);
5753 dout(2) << ss.str() << dendl;
5754
5755 if (!ss.str().empty()) {
5756 osd->clog->error(ss);
5757 }
5758
5759 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5760 i != authoritative.end();
5761 ++i) {
5762 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5763 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5764 j != i->second.end();
5765 ++j) {
5766 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5767 }
5768 scrubber.authoritative.insert(
5769 make_pair(
5770 i->first,
5771 good_peers));
5772 }
5773
5774 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5775 i != authoritative.end();
5776 ++i) {
5777 scrubber.cleaned_meta_map.objects.erase(i->first);
5778 scrubber.cleaned_meta_map.objects.insert(
5779 *(maps[i->second.back()]->objects.find(i->first))
5780 );
5781 }
5782 }
5783
5784 ScrubMap for_meta_scrub;
5785 scrubber.clean_meta_map(for_meta_scrub);
5786
5787 // ok, do the pg-type specific scrubbing
5788 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5789 // Called here on the primary; a caller that isn't the primary can use an authoritative map
5790 _scan_snaps(for_meta_scrub);
5791 if (!scrubber.store->empty()) {
5792 if (state_test(PG_STATE_REPAIR)) {
5793 dout(10) << __func__ << ": discarding scrub results" << dendl;
5794 scrubber.store->flush(nullptr);
5795 } else {
5796 dout(10) << __func__ << ": updating scrub object" << dendl;
5797 ObjectStore::Transaction t;
5798 scrubber.store->flush(&t);
5799 osd->store->queue_transaction(ch, std::move(t), nullptr);
5800 }
5801 }
5802 }
5803
5804 bool PG::scrub_process_inconsistent()
5805 {
5806 dout(10) << __func__ << ": checking authoritative" << dendl;
5807 bool repair = state_test(PG_STATE_REPAIR);
5808 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5809 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5810
5811 // scrubber.authoritative only stores objects which are missing or inconsistent.
5812 if (!scrubber.authoritative.empty()) {
5813 stringstream ss;
5814 ss << info.pgid << " " << mode << " "
5815 << scrubber.missing.size() << " missing, "
5816 << scrubber.inconsistent.size() << " inconsistent objects";
5817 dout(2) << ss.str() << dendl;
5818 osd->clog->error(ss);
5819 if (repair) {
5820 state_clear(PG_STATE_CLEAN);
5821 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5822 scrubber.authoritative.begin();
5823 i != scrubber.authoritative.end();
5824 ++i) {
5825 set<pg_shard_t>::iterator j;
5826
5827 auto missing_entry = scrubber.missing.find(i->first);
5828 if (missing_entry != scrubber.missing.end()) {
5829 for (j = missing_entry->second.begin();
5830 j != missing_entry->second.end();
5831 ++j) {
5832 repair_object(
5833 i->first,
5834 &(i->second),
5835 *j);
5836 ++scrubber.fixed;
5837 }
5838 }
5839 if (scrubber.inconsistent.count(i->first)) {
5840 for (j = scrubber.inconsistent[i->first].begin();
5841 j != scrubber.inconsistent[i->first].end();
5842 ++j) {
5843 repair_object(i->first,
5844 &(i->second),
5845 *j);
5846 ++scrubber.fixed;
5847 }
5848 }
5849 }
5850 }
5851 }
5852 return (!scrubber.authoritative.empty() && repair);
5853 }
5854
5855 bool PG::ops_blocked_by_scrub() const {
5856 return !waiting_for_scrub.empty();
5857 }
5858
5859 // the part that actually finalizes a scrub
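// It cancels auto-repair when the error count exceeds the configured limit,
// optionally schedules an automatic deep scrub after shallow-scrub errors,
// repairs inconsistent objects, logs the result, updates the scrub stamps and
// error counters, persists the info, queues recovery if anything was repaired,
// clears scrub state and shares the updated pg_info with the replicas.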
5860 void PG::scrub_finish()
5861 {
5862 dout(20) << __func__ << dendl;
5863 bool repair = state_test(PG_STATE_REPAIR);
5864 bool do_auto_scrub = false;
5865 // if the repair request comes from auto-repair and there is a large number of
5866 // errors, we would like to cancel the auto-repair
5867 if (repair && scrubber.auto_repair
5868 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5869 state_clear(PG_STATE_REPAIR);
5870 repair = false;
5871 }
5872 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5873 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5874
5875 // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
5876 if (scrubber.deep_scrub_on_error
5877 && scrubber.authoritative.size()
5878 && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
5879 ceph_assert(!deep_scrub);
5880 do_auto_scrub = true;
5881 dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
5882 }
5883 scrubber.deep_scrub_on_error = false;
5884
5885 // type-specific finish (can tally more errors)
5886 _scrub_finish();
5887
5888 bool has_error = scrub_process_inconsistent();
5889
5890 {
5891 stringstream oss;
5892 oss << info.pgid.pgid << " " << mode << " ";
5893 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5894 if (total_errors)
5895 oss << total_errors << " errors";
5896 else
5897 oss << "ok";
5898 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5899 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5900 << " remaining deep scrub error details lost)";
5901 if (repair)
5902 oss << ", " << scrubber.fixed << " fixed";
5903 if (total_errors)
5904 osd->clog->error(oss);
5905 else
5906 osd->clog->debug(oss);
5907 }
5908
5909 // finish up
5910 unreg_next_scrub();
5911 utime_t now = ceph_clock_now();
5912 info.history.last_scrub = info.last_update;
5913 info.history.last_scrub_stamp = now;
5914 if (scrubber.deep) {
5915 info.history.last_deep_scrub = info.last_update;
5916 info.history.last_deep_scrub_stamp = now;
5917 }
5918 // Since we don't know which errors were fixed, we can only clear them
5919 // when every one has been fixed.
5920 if (repair) {
5921 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5922 ceph_assert(deep_scrub);
5923 scrubber.shallow_errors = scrubber.deep_errors = 0;
5924 dout(20) << __func__ << " All may be fixed" << dendl;
5925 } else if (has_error) {
5926 // Deep scrub in order to get corrected error counts
5927 scrub_after_recovery = true;
5928 dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
5929 } else if (scrubber.shallow_errors || scrubber.deep_errors) {
5930 // We have errors but nothing can be fixed, so there is no repair
5931 // possible.
5932 state_set(PG_STATE_FAILED_REPAIR);
5933 dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
5934 << " error(s) present with no repair possible" << dendl;
5935 }
5936 }
5937 if (deep_scrub) {
5938 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5939 info.history.last_clean_scrub_stamp = now;
5940 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5941 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5942 info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
5943 info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
5944 info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
5945 dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = "
5946 << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = "
5947 << info.stats.stats.sum.num_omap_keys << dendl;
5948 } else {
5949 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5950 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5951 // because of deep-scrub errors
5952 if (scrubber.shallow_errors == 0)
5953 info.history.last_clean_scrub_stamp = now;
5954 }
5955 info.stats.stats.sum.num_scrub_errors =
5956 info.stats.stats.sum.num_shallow_scrub_errors +
5957 info.stats.stats.sum.num_deep_scrub_errors;
5958 if (scrubber.check_repair) {
5959 scrubber.check_repair = false;
5960 if (info.stats.stats.sum.num_scrub_errors) {
5961 state_set(PG_STATE_FAILED_REPAIR);
5962 dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors
5963 << " error(s) still present after re-scrub" << dendl;
5964 }
5965 }
5966 publish_stats_to_osd();
5967
5968 {
5969 ObjectStore::Transaction t;
5970 dirty_info = true;
5971 write_if_dirty(t);
5972 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
5973 ceph_assert(tr == 0);
5974 }
5975
5976
5977 if (has_error) {
5978 queue_peering_event(
5979 PGPeeringEventRef(
5980 std::make_shared<PGPeeringEvent>(
5981 get_osdmap_epoch(),
5982 get_osdmap_epoch(),
5983 DoRecovery())));
5984 }
5985
5986 scrub_clear_state(has_error);
5987 scrub_unreserve_replicas();
5988
5989 if (do_auto_scrub) {
5990 scrub_requested(false, false, true);
5991 } else {
5992 reg_next_scrub();
5993 }
5994
5995 if (is_active() && is_primary()) {
5996 share_pg_info();
5997 }
5998 }
5999
6000 void PG::share_pg_info()
6001 {
6002 dout(10) << "share_pg_info" << dendl;
6003
6004 // share new pg_info_t with replicas
6005 ceph_assert(!acting_recovery_backfill.empty());
6006 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
6007 i != acting_recovery_backfill.end();
6008 ++i) {
6009 if (*i == pg_whoami) continue;
6010 auto pg_shard = *i;
6011 auto peer = peer_info.find(pg_shard);
6012 if (peer != peer_info.end()) {
6013 peer->second.last_epoch_started = info.last_epoch_started;
6014 peer->second.last_interval_started = info.last_interval_started;
6015 peer->second.history.merge(info.history);
6016 }
6017 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap_epoch());
6018 m->pg_list.push_back(
6019 make_pair(
6020 pg_notify_t(
6021 pg_shard.shard, pg_whoami.shard,
6022 get_osdmap_epoch(),
6023 get_osdmap_epoch(),
6024 info),
6025 past_intervals));
6026 osd->send_message_osd_cluster(pg_shard.osd, m, get_osdmap_epoch());
6027 }
6028 }
6029
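// Append the given log entries to our log, update the missing set accordingly,
// optionally roll forward and trim, advance last_update (and last_complete
// when nothing is missing), and report whether the stats were invalidated.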
6030 bool PG::append_log_entries_update_missing(
6031 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6032 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
6033 boost::optional<eversion_t> roll_forward_to)
6034 {
6035 ceph_assert(!entries.empty());
6036 ceph_assert(entries.begin()->version > info.last_update);
6037
6038 PGLogEntryHandler rollbacker{this, &t};
6039 bool invalidate_stats =
6040 pg_log.append_new_log_entries(info.last_backfill,
6041 info.last_backfill_bitwise,
6042 entries,
6043 &rollbacker);
6044
6045 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
6046 pg_log.roll_forward(&rollbacker);
6047 }
6048 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
6049 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
6050 last_rollback_info_trimmed_to_applied = *roll_forward_to;
6051 }
6052
6053 info.last_update = pg_log.get_head();
6054
6055 if (pg_log.get_missing().num_missing() == 0) {
6056 // advance last_complete since nothing else is missing!
6057 info.last_complete = info.last_update;
6058 }
6059 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
6060
6061 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
6062 if (trim_to)
6063 pg_log.trim(*trim_to, info);
6064 dirty_info = true;
6065 write_if_dirty(t);
6066 return invalidate_stats;
6067 }
6068
6069
6070 void PG::merge_new_log_entries(
6071 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6072 ObjectStore::Transaction &t,
6073 boost::optional<eversion_t> trim_to,
6074 boost::optional<eversion_t> roll_forward_to)
6075 {
6076 dout(10) << __func__ << " " << entries << dendl;
6077 ceph_assert(is_primary());
6078
6079 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
6080 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
6081 i != acting_recovery_backfill.end();
6082 ++i) {
6083 pg_shard_t peer(*i);
6084 if (peer == pg_whoami) continue;
6085 ceph_assert(peer_missing.count(peer));
6086 ceph_assert(peer_info.count(peer));
6087 pg_missing_t& pmissing(peer_missing[peer]);
6088 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
6089 pg_info_t& pinfo(peer_info[peer]);
6090 bool invalidate_stats = PGLog::append_log_entries_update_missing(
6091 pinfo.last_backfill,
6092 info.last_backfill_bitwise,
6093 entries,
6094 true,
6095 NULL,
6096 pmissing,
6097 NULL,
6098 this);
6099 pinfo.last_update = info.last_update;
6100 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
6101 rebuild_missing = rebuild_missing || invalidate_stats;
6102 }
6103
6104 if (!rebuild_missing) {
6105 return;
6106 }
6107
6108 for (auto &&i: entries) {
6109 missing_loc.rebuild(
6110 i.soid,
6111 pg_whoami,
6112 acting_recovery_backfill,
6113 info,
6114 pg_log.get_missing(),
6115 peer_missing,
6116 peer_info);
6117 }
6118 }
6119
6120 void PG::update_history(const pg_history_t& new_history)
6121 {
6122 if (info.history.merge(new_history)) {
6123 dout(20) << __func__ << " advanced history from " << new_history << dendl;
6124 dirty_info = true;
6125 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
6126 dout(20) << __func__ << " clearing past_intervals" << dendl;
6127 past_intervals.clear();
6128 dirty_big_info = true;
6129 }
6130 }
6131 on_info_history_change();
6132 }
6133
6134 void PG::fulfill_info(
6135 pg_shard_t from, const pg_query_t &query,
6136 pair<pg_shard_t, pg_info_t> &notify_info)
6137 {
6138 ceph_assert(from == primary);
6139 ceph_assert(query.type == pg_query_t::INFO);
6140
6141 // info
6142 dout(10) << "sending info" << dendl;
6143 notify_info = make_pair(from, info);
6144 }
6145
6146 void PG::fulfill_log(
6147 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
6148 {
6149 dout(10) << "log request from " << from << dendl;
6150 ceph_assert(from == primary);
6151 ceph_assert(query.type != pg_query_t::INFO);
6152 ConnectionRef con = osd->get_con_osd_cluster(
6153 from.osd, get_osdmap_epoch());
6154 if (!con) return;
6155
6156 MOSDPGLog *mlog = new MOSDPGLog(
6157 from.shard, pg_whoami.shard,
6158 get_osdmap_epoch(),
6159 info, query_epoch);
6160 mlog->missing = pg_log.get_missing();
6161
6162 // primary -> other, when building master log
6163 if (query.type == pg_query_t::LOG) {
6164 dout(10) << " sending info+missing+log since " << query.since
6165 << dendl;
6166 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
6167 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
6168 << " when my log.tail is " << pg_log.get_tail()
6169 << ", sending full log instead";
6170 mlog->log = pg_log.get_log(); // primary should not have requested this!!
6171 } else
6172 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
6173 }
6174 else if (query.type == pg_query_t::FULLLOG) {
6175 dout(10) << " sending info+missing+full log" << dendl;
6176 mlog->log = pg_log.get_log();
6177 }
6178
6179 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
6180
6181 osd->share_map_peer(from.osd, con.get(), get_osdmap());
6182 osd->send_message_osd_cluster(mlog, con.get());
6183 }
6184
6185 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
6186 {
6187 if (query.query.type == pg_query_t::INFO) {
6188 pair<pg_shard_t, pg_info_t> notify_info;
6189 update_history(query.query.history);
6190 fulfill_info(query.from, query.query, notify_info);
6191 rctx->send_notify(
6192 notify_info.first,
6193 pg_notify_t(
6194 notify_info.first.shard, pg_whoami.shard,
6195 query.query_epoch,
6196 get_osdmap_epoch(),
6197 notify_info.second),
6198 past_intervals);
6199 } else {
6200 update_history(query.query.history);
6201 fulfill_log(query.from, query.query, query.query_epoch);
6202 }
6203 }
6204
6205 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
6206 {
6207 bool changed = false;
6208 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
6209 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
6210 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
6211 changed = true;
6212 }
6213 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6214 if (!pi) {
6215 return; // pool deleted
6216 }
6217 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
6218 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
6219 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
6220 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
6221 changed = true;
6222 }
6223 }
6224 if (changed) {
6225 info.history.last_epoch_marked_full = osdmap->get_epoch();
6226 dirty_info = true;
6227 }
6228 }
6229
6230 bool PG::should_restart_peering(
6231 int newupprimary,
6232 int newactingprimary,
6233 const vector<int>& newup,
6234 const vector<int>& newacting,
6235 OSDMapRef lastmap,
6236 OSDMapRef osdmap)
6237 {
6238 if (PastIntervals::is_new_interval(
6239 primary.osd,
6240 newactingprimary,
6241 acting,
6242 newacting,
6243 up_primary.osd,
6244 newupprimary,
6245 up,
6246 newup,
6247 osdmap,
6248 lastmap,
6249 info.pgid.pgid)) {
6250 dout(20) << "new interval newup " << newup
6251 << " newacting " << newacting << dendl;
6252 return true;
6253 }
6254 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
6255 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
6256 return true;
6257 }
6258 return false;
6259 }
6260
6261 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
6262 {
6263 if (last_peering_reset > reply_epoch ||
6264 last_peering_reset > query_epoch) {
6265 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
6266 << " last_peering_reset " << last_peering_reset
6267 << dendl;
6268 return true;
6269 }
6270 return false;
6271 }
6272
6273 void PG::set_last_peering_reset()
6274 {
6275 dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
6276 if (last_peering_reset != get_osdmap_epoch()) {
6277 last_peering_reset = get_osdmap_epoch();
6278 reset_interval_flush();
6279 }
6280 }
6281
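// FlushState is registered below as both an on_applied and an on_commit
// context; when the last shared reference goes away, the destructor re-takes
// the PG lock and calls on_flushed(), unless the PG has been reset since the
// flush was started.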
6282 struct FlushState {
6283 PGRef pg;
6284 epoch_t epoch;
6285 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
6286 ~FlushState() {
6287 pg->lock();
6288 if (!pg->pg_has_reset_since(epoch))
6289 pg->on_flushed();
6290 pg->unlock();
6291 }
6292 };
6293 typedef std::shared_ptr<FlushState> FlushStateRef;
6294
6295 void PG::start_flush(ObjectStore::Transaction *t)
6296 {
6297 // flush in progress ops
6298 FlushStateRef flush_trigger (std::make_shared<FlushState>(
6299 this, get_osdmap_epoch()));
6300 flushes_in_progress++;
6301 t->register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
6302 t->register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
6303 }
6304
6305 void PG::reset_interval_flush()
6306 {
6307 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
6308 recovery_state.clear_blocked_outgoing();
6309
6310 Context *c = new QueuePeeringEvt<IntervalFlush>(
6311 this, get_osdmap_epoch(), IntervalFlush());
6312 if (!ch->flush_commit(c)) {
6313 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
6314 recovery_state.begin_block_outgoing();
6315 } else {
6316 dout(10) << "Not blocking outgoing recovery messages" << dendl;
6317 delete c;
6318 }
6319 }
6320
6321 /* Called before initializing peering during advance_map */
6322 void PG::start_peering_interval(
6323 const OSDMapRef lastmap,
6324 const vector<int>& newup, int new_up_primary,
6325 const vector<int>& newacting, int new_acting_primary,
6326 ObjectStore::Transaction *t)
6327 {
6328 const OSDMapRef osdmap = get_osdmap();
6329
6330 set_last_peering_reset();
6331
6332 vector<int> oldacting, oldup;
6333 int oldrole = get_role();
6334
6335 if (is_primary()) {
6336 osd->clear_ready_to_merge(this);
6337 }
6338
6339 pg_shard_t old_acting_primary = get_primary();
6340 pg_shard_t old_up_primary = up_primary;
6341 bool was_old_primary = is_primary();
6342 bool was_old_replica = is_replica();
6343
6344 acting.swap(oldacting);
6345 up.swap(oldup);
6346 init_primary_up_acting(
6347 newup,
6348 newacting,
6349 new_up_primary,
6350 new_acting_primary);
6351
6352 if (info.stats.up != up ||
6353 info.stats.acting != acting ||
6354 info.stats.up_primary != new_up_primary ||
6355 info.stats.acting_primary != new_acting_primary) {
6356 info.stats.up = up;
6357 info.stats.up_primary = new_up_primary;
6358 info.stats.acting = acting;
6359 info.stats.acting_primary = new_acting_primary;
6360 info.stats.mapping_epoch = osdmap->get_epoch();
6361 }
6362
6363 pg_stats_publish_lock.Lock();
6364 pg_stats_publish_valid = false;
6365 pg_stats_publish_lock.Unlock();
6366
6367 // PG_STATE_REMAPPED will now be set during a backfill in cases
6368 // where it would not have been before.
6369 if (up != acting)
6370 state_set(PG_STATE_REMAPPED);
6371 else
6372 state_clear(PG_STATE_REMAPPED);
6373
6374 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
6375 if (pool.info.is_replicated() || role == pg_whoami.shard)
6376 set_role(role);
6377 else
6378 set_role(-1);
6379
6380 // did acting, up, primary|acker change?
6381 if (!lastmap) {
6382 dout(10) << " no lastmap" << dendl;
6383 dirty_info = true;
6384 dirty_big_info = true;
6385 info.history.same_interval_since = osdmap->get_epoch();
6386 } else {
6387 std::stringstream debug;
6388 ceph_assert(info.history.same_interval_since != 0);
6389 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
6390 get_is_recoverable_predicate());
6391 bool new_interval = PastIntervals::check_new_interval(
6392 old_acting_primary.osd,
6393 new_acting_primary,
6394 oldacting, newacting,
6395 old_up_primary.osd,
6396 new_up_primary,
6397 oldup, newup,
6398 info.history.same_interval_since,
6399 info.history.last_epoch_clean,
6400 osdmap,
6401 lastmap,
6402 info.pgid.pgid,
6403 recoverable.get(),
6404 &past_intervals,
6405 &debug);
6406 dout(10) << __func__ << ": check_new_interval output: "
6407 << debug.str() << dendl;
6408 if (new_interval) {
6409 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
6410 info.history.last_epoch_clean < osdmap->get_epoch()) {
6411 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
6412 // our information is incomplete and useless; if osdmaps were trimmed,
6413 // someone else was clean after everything we know about.
6414 past_intervals.clear();
6415 } else {
6416 dout(10) << " noting past " << past_intervals << dendl;
6417 }
6418 dirty_info = true;
6419 dirty_big_info = true;
6420 info.history.same_interval_since = osdmap->get_epoch();
6421 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
6422 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
6423 osdmap->get_pg_num(info.pgid.pgid.pool()),
6424 nullptr)) {
6425 info.history.last_epoch_split = osdmap->get_epoch();
6426 }
6427 }
6428 }
6429
6430 if (old_up_primary != up_primary ||
6431 oldup != up) {
6432 info.history.same_up_since = osdmap->get_epoch();
6433 }
6434 // this comparison includes primary rank via pg_shard_t
6435 if (old_acting_primary != get_primary()) {
6436 info.history.same_primary_since = osdmap->get_epoch();
6437 }
6438
6439 on_new_interval();
6440
6441 dout(1) << __func__ << " up " << oldup << " -> " << up
6442 << ", acting " << oldacting << " -> " << acting
6443 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
6444 << ", up_primary " << old_up_primary << " -> " << new_up_primary
6445 << ", role " << oldrole << " -> " << role
6446 << ", features acting " << acting_features
6447 << " upacting " << upacting_features
6448 << dendl;
6449
6450 // deactivate.
6451 state_clear(PG_STATE_ACTIVE);
6452 state_clear(PG_STATE_PEERED);
6453 state_clear(PG_STATE_PREMERGE);
6454 state_clear(PG_STATE_DOWN);
6455 state_clear(PG_STATE_RECOVERY_WAIT);
6456 state_clear(PG_STATE_RECOVERY_TOOFULL);
6457 state_clear(PG_STATE_RECOVERING);
6458
6459 peer_purged.clear();
6460 acting_recovery_backfill.clear();
6461 scrub_queued = false;
6462
6463 // reset primary/replica state?
6464 if (was_old_primary || is_primary()) {
6465 osd->remove_want_pg_temp(info.pgid.pgid);
6466 } else if (was_old_replica || is_replica()) {
6467 osd->remove_want_pg_temp(info.pgid.pgid);
6468 }
6469 clear_primary_state();
6470
6471
6472 // pg->on_*
6473 on_change(t);
6474
6475 projected_last_update = eversion_t();
6476
6477 ceph_assert(!deleting);
6478
6479 // should we tell the primary we are here?
6480 send_notify = !is_primary();
6481
6482 if (role != oldrole ||
6483 was_old_primary != is_primary()) {
6484 // did primary change?
6485 if (was_old_primary != is_primary()) {
6486 state_clear(PG_STATE_CLEAN);
6487 clear_publish_stats();
6488 }
6489
6490 on_role_change();
6491
6492 // take active waiters
6493 requeue_ops(waiting_for_peered);
6494
6495 } else {
6496 // no role change.
6497 // did primary change?
6498 if (get_primary() != old_acting_primary) {
6499 dout(10) << *this << " " << oldacting << " -> " << acting
6500 << ", acting primary "
6501 << old_acting_primary << " -> " << get_primary()
6502 << dendl;
6503 } else {
6504 // primary is the same.
6505 if (is_primary()) {
6506 // i am (still) primary. but my replica set changed.
6507 state_clear(PG_STATE_CLEAN);
6508
6509 dout(10) << oldacting << " -> " << acting
6510 << ", replicas changed" << dendl;
6511 }
6512 }
6513 }
6514 cancel_recovery();
6515
6516 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
6517 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
6518 osd->queue_want_pg_temp(info.pgid.pgid, acting);
6519 }
6520 }
6521
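// Recompute the feature sets for the new interval: acting_features is the
// intersection of the features of all acting OSDs, and upacting_features
// additionally intersects the features of the up OSDs.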
6522 void PG::on_new_interval()
6523 {
6524 const OSDMapRef osdmap = get_osdmap();
6525
6526 on_info_history_change();
6527
6528 // initialize features
6529 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6530 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6531 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
6532 if (*p == CRUSH_ITEM_NONE)
6533 continue;
6534 uint64_t f = osdmap->get_xinfo(*p).features;
6535 acting_features &= f;
6536 upacting_features &= f;
6537 }
6538 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6539 if (*p == CRUSH_ITEM_NONE)
6540 continue;
6541 upacting_features &= osdmap->get_xinfo(*p).features;
6542 }
6543
6544 _on_new_interval();
6545 }
6546
6547 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6548 {
6549 ceph_assert(!is_primary());
6550
6551 update_history(oinfo.history);
6552 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6553 info.stats.stats.sum.num_scrub_errors = 0;
6554 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6555 info.stats.stats.sum.num_deep_scrub_errors = 0;
6556 dirty_info = true;
6557 }
6558
6559 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6560 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6561 << dendl;
6562 info.purged_snaps = oinfo.purged_snaps;
6563 dirty_info = true;
6564 dirty_big_info = true;
6565 }
6566 }
6567
6568 ostream& operator<<(ostream& out, const PG& pg)
6569 {
6570 out << "pg[" << pg.info
6571 << " " << pg.up;
6572 if (pg.acting != pg.up)
6573 out << "/" << pg.acting;
6574 if (pg.is_ec_pg())
6575 out << "p" << pg.get_primary();
6576 if (!pg.async_recovery_targets.empty())
6577 out << " async=[" << pg.async_recovery_targets << "]";
6578 if (!pg.backfill_targets.empty())
6579 out << " backfill=[" << pg.backfill_targets << "]";
6580 out << " r=" << pg.get_role();
6581 out << " lpr=" << pg.get_last_peering_reset();
6582
6583 if (pg.deleting)
6584 out << " DELETING";
6585
6586 if (!pg.past_intervals.empty()) {
6587 out << " pi=[" << pg.past_intervals.get_bounds()
6588 << ")/" << pg.past_intervals.size();
6589 }
6590
6591 if (pg.is_peered()) {
6592 if (pg.last_update_ondisk != pg.info.last_update)
6593 out << " luod=" << pg.last_update_ondisk;
6594 if (pg.last_update_applied != pg.info.last_update)
6595 out << " lua=" << pg.last_update_applied;
6596 }
6597
6598 if (pg.recovery_ops_active)
6599 out << " rops=" << pg.recovery_ops_active;
6600
6601 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6602 pg.pg_log.get_head() != pg.info.last_update)
6603 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6604
6605 if (!pg.pg_log.get_log().empty()) {
6606 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6607 out << " (log bound mismatch, actual=["
6608 << pg.pg_log.get_log().log.begin()->version << ","
6609 << pg.pg_log.get_log().log.rbegin()->version << "]";
6610 out << ")";
6611 }
6612 }
6613
6614 out << " crt=" << pg.pg_log.get_can_rollback_to();
6615
6616 if (pg.last_complete_ondisk != pg.info.last_complete)
6617 out << " lcod " << pg.last_complete_ondisk;
6618
6619 if (pg.is_primary()) {
6620 out << " mlcod " << pg.min_last_complete_ondisk;
6621 }
6622
6623 out << " " << pg_state_string(pg.get_state());
6624 if (pg.should_send_notify())
6625 out << " NOTIFY";
6626
6627 if (pg.scrubber.must_repair)
6628 out << " MUST_REPAIR";
6629 if (pg.scrubber.auto_repair)
6630 out << " AUTO_REPAIR";
6631 if (pg.scrubber.check_repair)
6632 out << " CHECK_REPAIR";
6633 if (pg.scrubber.deep_scrub_on_error)
6634 out << " DEEP_SCRUB_ON_ERROR";
6635 if (pg.scrubber.must_deep_scrub)
6636 out << " MUST_DEEP_SCRUB";
6637 if (pg.scrubber.must_scrub)
6638 out << " MUST_SCRUB";
6639 if (pg.scrubber.time_for_deep)
6640 out << " TIME_FOR_DEEP";
6641 if (pg.scrubber.need_auto)
6642 out << " NEED_AUTO";
6643
6644 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6645 if (pg.pg_log.get_missing().num_missing()) {
6646 out << " m=" << pg.pg_log.get_missing().num_missing();
6647 if (pg.is_primary()) {
6648 uint64_t unfound = pg.get_num_unfound();
6649 if (unfound)
6650 out << " u=" << unfound;
6651 }
6652 }
6653 if (!pg.is_clean()) {
6654 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6655 }
6656 if (!pg.snap_trimq.empty()) {
6657 out << " trimq=";
6658 // only show a count if the set is large
6659 if (pg.snap_trimq.num_intervals() > 16) {
6660 out << pg.snap_trimq.size();
6661 } else {
6662 out << pg.snap_trimq;
6663 }
6664 }
6665 if (!pg.info.purged_snaps.empty()) {
6666 out << " ps="; // snap trim queue / purged snaps
6667 if (pg.info.purged_snaps.num_intervals() > 16) {
6668 out << pg.info.purged_snaps.size();
6669 } else {
6670 out << pg.info.purged_snaps;
6671 }
6672 }
6673
6674 out << "]";
6675
6676
6677 return out;
6678 }
6679
6680 bool PG::can_discard_op(OpRequestRef& op)
6681 {
6682 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6683 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6684 dout(20) << " discard " << *m << dendl;
6685 return true;
6686 }
6687
6688 if (m->get_map_epoch() < info.history.same_primary_since) {
6689 dout(7) << " changed after " << m->get_map_epoch()
6690 << ", dropping " << *m << dendl;
6691 return true;
6692 }
6693
6694 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6695 // >= luminous client
6696 if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
6697 // >= nautilus client
6698 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6699 dout(7) << __func__ << " sent before last_force_op_resend "
6700 << pool.info.last_force_op_resend
6701 << ", dropping" << *m << dendl;
6702 return true;
6703 }
6704 } else {
6705 // pre-nautilus client (luminous or mimic)
6706 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
6707 dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
6708 << pool.info.last_force_op_resend_prenautilus
6709 << ", dropping" << *m << dendl;
6710 return true;
6711 }
6712 }
6713 if (m->get_map_epoch() < info.history.last_epoch_split) {
6714 dout(7) << __func__ << " pg split in "
6715 << info.history.last_epoch_split << ", dropping" << dendl;
6716 return true;
6717 }
6718 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6719 // < luminous client
6720 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6721 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6722 << pool.info.last_force_op_resend_preluminous
6723 << ", dropping" << *m << dendl;
6724 return true;
6725 }
6726 }
6727
6728 return false;
6729 }
6730
6731 template<typename T, int MSGTYPE>
6732 bool PG::can_discard_replica_op(OpRequestRef& op)
6733 {
6734 const T *m = static_cast<const T *>(op->get_req());
6735 ceph_assert(m->get_type() == MSGTYPE);
6736
6737 int from = m->get_source().num();
6738
6739 // if a repop is replied after a replica goes down in a new osdmap, and
6740 // before the pg advances to this new osdmap, the repop replies before this
6741 // repop can be discarded by that replica OSD, because the primary resets the
6742 // connection to it when handling the new osdmap marking it down, and also
6743 // resets the messenger session when the replica reconnects. To avoid the
6744 // out-of-order replies, the messages from that replica should be discarded.
6745 OSDMapRef next_map = osd->get_next_osdmap();
6746 if (next_map->is_down(from))
6747 return true;
6748 /* Mostly, this overlaps with the old_peering_msg
6749 * condition. An important exception is pushes
6750 * sent by replicas not in the acting set, since
6751 * if such a replica goes down it does not cause
6752 * a new interval. */
6753 if (next_map->get_down_at(from) >= m->map_epoch)
6754 return true;
6755
6756 // same pg?
6757 // if pg changes _at all_, we reset and repeer!
6758 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6759 dout(10) << "can_discard_replica_op pg changed " << info.history
6760 << " after " << m->map_epoch
6761 << ", dropping" << dendl;
6762 return true;
6763 }
6764 return false;
6765 }
6766
6767 bool PG::can_discard_scan(OpRequestRef op)
6768 {
6769 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6770 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
6771
6772 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6773 dout(10) << " got old scan, ignoring" << dendl;
6774 return true;
6775 }
6776 return false;
6777 }
6778
6779 bool PG::can_discard_backfill(OpRequestRef op)
6780 {
6781 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6782 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6783
6784 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6785 dout(10) << " got old backfill, ignoring" << dendl;
6786 return true;
6787 }
6788
6789 return false;
6790
6791 }
6792
6793 bool PG::can_discard_request(OpRequestRef& op)
6794 {
6795 switch (op->get_req()->get_type()) {
6796 case CEPH_MSG_OSD_OP:
6797 return can_discard_op(op);
6798 case CEPH_MSG_OSD_BACKOFF:
6799 return false; // never discard
6800 case MSG_OSD_REPOP:
6801 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6802 case MSG_OSD_PG_PUSH:
6803 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6804 case MSG_OSD_PG_PULL:
6805 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6806 case MSG_OSD_PG_PUSH_REPLY:
6807 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6808 case MSG_OSD_REPOPREPLY:
6809 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6810 case MSG_OSD_PG_RECOVERY_DELETE:
6811 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6812
6813 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6814 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6815
6816 case MSG_OSD_EC_WRITE:
6817 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6818 case MSG_OSD_EC_WRITE_REPLY:
6819 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6820 case MSG_OSD_EC_READ:
6821 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6822 case MSG_OSD_EC_READ_REPLY:
6823 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6824 case MSG_OSD_REP_SCRUB:
6825 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6826 case MSG_OSD_SCRUB_RESERVE:
6827 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6828 case MSG_OSD_REP_SCRUBMAP:
6829 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6830 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6831 return can_discard_replica_op<
6832 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6833 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6834 return can_discard_replica_op<
6835 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6836
6837 case MSG_OSD_PG_SCAN:
6838 return can_discard_scan(op);
6839 case MSG_OSD_PG_BACKFILL:
6840 return can_discard_backfill(op);
6841 case MSG_OSD_PG_BACKFILL_REMOVE:
6842 return can_discard_replica_op<MOSDPGBackfillRemove,
6843 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6844 }
6845 return true;
6846 }
6847
6848 void PG::take_waiters()
6849 {
6850 dout(10) << "take_waiters" << dendl;
6851 requeue_map_waiters();
6852 }
6853
6854 void PG::do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rctx)
6855 {
6856 dout(10) << __func__ << ": " << evt->get_desc() << dendl;
6857 ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
6858 if (old_peering_evt(evt)) {
6859 dout(10) << "discard old " << evt->get_desc() << dendl;
6860 } else {
6861 recovery_state.handle_event(evt, rctx);
6862 }
6863 // write_if_dirty regardless of path above to ensure we capture any work
6864 // done by OSD::advance_pg().
6865 write_if_dirty(*rctx->transaction);
6866 }
6867
6868 void PG::queue_peering_event(PGPeeringEventRef evt)
6869 {
6870 if (old_peering_evt(evt))
6871 return;
6872 osd->osd->enqueue_peering_evt(info.pgid, evt);
6873 }
6874
6875 void PG::queue_null(epoch_t msg_epoch,
6876 epoch_t query_epoch)
6877 {
6878 dout(10) << "null" << dendl;
6879 queue_peering_event(
6880 PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
6881 NullEvt())));
6882 }
6883
6884 void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx)
6885 {
6886 /*
6887 * if we couldn't start any recovery ops and things are still
6888 * unfound, see if we can discover more missing object locations.
6889 * It may be that our initial locations were bad and we errored
6890 * out while trying to pull.
6891 */
6892 discover_all_missing(*rctx->query_map);
6893 if (rctx->query_map->empty()) {
6894 string action;
6895 if (state_test(PG_STATE_BACKFILLING)) {
6896 auto evt = PGPeeringEventRef(
6897 new PGPeeringEvent(
6898 queued,
6899 queued,
6900 PG::UnfoundBackfill()));
6901 queue_peering_event(evt);
6902 action = "in backfill";
6903 } else if (state_test(PG_STATE_RECOVERING)) {
6904 auto evt = PGPeeringEventRef(
6905 new PGPeeringEvent(
6906 queued,
6907 queued,
6908 PG::UnfoundRecovery()));
6909 queue_peering_event(evt);
6910 action = "in recovery";
6911 } else {
6912 action = "already out of recovery/backfill";
6913 }
6914 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
6915 } else {
6916 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
6917 queue_recovery();
6918 }
6919 }
6920
6921 void PG::handle_advance_map(
6922 OSDMapRef osdmap, OSDMapRef lastmap,
6923 vector<int>& newup, int up_primary,
6924 vector<int>& newacting, int acting_primary,
6925 RecoveryCtx *rctx)
6926 {
6927 ceph_assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6928 ceph_assert(lastmap == osdmap_ref);
6929 dout(10) << "handle_advance_map "
6930 << newup << "/" << newacting
6931 << " -- " << up_primary << "/" << acting_primary
6932 << dendl;
6933 update_osdmap_ref(osdmap);
6934 osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
6935
6936 pool.update(cct, osdmap);
6937
6938 AdvMap evt(
6939 osdmap, lastmap, newup, up_primary,
6940 newacting, acting_primary);
6941 recovery_state.handle_event(evt, rctx);
6942 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6943 on_pool_change();
6944 update_store_with_options();
6945 }
6946 last_require_osd_release = osdmap->require_osd_release;
6947 }
6948
6949 void PG::handle_activate_map(RecoveryCtx *rctx)
6950 {
6951 dout(10) << "handle_activate_map " << dendl;
6952 ActMap evt;
6953 recovery_state.handle_event(evt, rctx);
6954 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
6955 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6956 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6957 << last_persisted_osdmap
6958 << " while current is " << osdmap_ref->get_epoch() << dendl;
6959 dirty_info = true;
6960 } else {
6961 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6962 << last_persisted_osdmap
6963 << " while current is " << osdmap_ref->get_epoch() << dendl;
6964 }
6965 if (osdmap_ref->check_new_blacklist_entries()) {
6966 check_blacklisted_watchers();
6967 }
6968 write_if_dirty(*rctx->transaction);
6969 }
6970
6971 void PG::handle_initialize(RecoveryCtx *rctx)
6972 {
6973 dout(10) << __func__ << dendl;
6974 Initialize evt;
6975 recovery_state.handle_event(evt, rctx);
6976 }
6977
6978 void PG::handle_query_state(Formatter *f)
6979 {
6980 dout(10) << "handle_query_state" << dendl;
6981 QueryState q(f);
6982 recovery_state.handle_event(q, 0);
6983 }
6984
6985 void PG::update_store_with_options()
6986 {
6987 auto r = osd->store->set_collection_opts(ch, pool.info.opts);
6988 if (r < 0 && r != -EOPNOTSUPP) {
6989 derr << __func__ << " set_collection_opts returns error: " << r << dendl;
6990 }
6991 }
6992
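// C_DeleteMore re-queues the PG for another round of deletion work once the
// preceding transaction commits (unless the PG has been reset since 'epoch').
// complete() is overridden to do that work and delete this directly, so
// finish() should never be reached; hence the ceph_abort() below.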
6993 struct C_DeleteMore : public Context {
6994 PGRef pg;
6995 epoch_t epoch;
6996 C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
6997 void finish(int r) override {
6998 ceph_abort();
6999 }
7000 void complete(int r) override {
7001 ceph_assert(r == 0);
7002 pg->lock();
7003 if (!pg->pg_has_reset_since(epoch)) {
7004 pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
7005 }
7006 pg->unlock();
7007 delete this;
7008 }
7009 };
7010
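// _delete_some removes up to osd_target_transaction_size objects per call
// (capped by the store's ideal list size), optionally sleeping between rounds
// when osd_delete_sleep is set. Once the collection listing comes back empty
// it clears the log/info keys, removes the collection and marks the PG
// deleted, unless it raced with a PG merge, in which case the collection is
// re-created and the deletion state is reset.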
7011 void PG::_delete_some(ObjectStore::Transaction *t)
7012 {
7013 dout(10) << __func__ << dendl;
7014
7015 {
7016 float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
7017 if (osd_delete_sleep > 0 && delete_needs_sleep) {
7018 epoch_t e = get_osdmap()->get_epoch();
7019 PGRef pgref(this);
7020 auto delete_requeue_callback = new FunctionContext([this, pgref, e](int r) {
7021 dout(20) << __func__ << " wake up at "
7022 << ceph_clock_now()
7023 << ", re-queuing delete" << dendl;
7024 lock();
7025 delete_needs_sleep = false;
7026 if (!pg_has_reset_since(e)) {
7027 osd->queue_for_pg_delete(get_pgid(), e);
7028 }
7029 unlock();
7030 });
7031
7032 utime_t delete_schedule_time = ceph_clock_now();
7033 delete_schedule_time += osd_delete_sleep;
7034 Mutex::Locker l(osd->sleep_lock);
7035 osd->sleep_timer.add_event_at(delete_schedule_time,
7036 delete_requeue_callback);
7037 dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
7038 return;
7039 }
7040 }
7041
7042 delete_needs_sleep = true;
7043
7044 vector<ghobject_t> olist;
7045 int max = std::min(osd->store->get_ideal_list_max(),
7046 (int)cct->_conf->osd_target_transaction_size);
7047 ghobject_t next;
7048 osd->store->collection_list(
7049 ch,
7050 next,
7051 ghobject_t::get_max(),
7052 max,
7053 &olist,
7054 &next);
7055 dout(20) << __func__ << " " << olist << dendl;
7056
7057 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
7058 int64_t num = 0;
7059 for (auto& oid : olist) {
7060 if (oid == pgmeta_oid) {
7061 continue;
7062 }
7063 if (oid.is_pgmeta()) {
7064 osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
7065 << " during PG removal";
7066 }
7067 int r = snap_mapper.remove_oid(oid.hobj, &_t);
7068 if (r != 0 && r != -ENOENT) {
7069 ceph_abort();
7070 }
7071 t->remove(coll, oid);
7072 ++num;
7073 }
7074 if (num) {
7075 dout(20) << __func__ << " deleting " << num << " objects" << dendl;
7076 Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
7077 t->register_on_commit(fin);
7078 } else {
7079 dout(20) << __func__ << " finished" << dendl;
7080 if (cct->_conf->osd_inject_failure_on_pg_removal) {
7081 _exit(1);
7082 }
7083
7084 // final flush here to ensure completions drop refs. Of particular concern
7085 // are the SnapMapper ContainerContexts.
7086 {
7087 PGRef pgref(this);
7088 PGLog::clear_info_log(info.pgid, t);
7089 t->remove_collection(coll);
7090 t->register_on_commit(new ContainerContext<PGRef>(pgref));
7091 t->register_on_applied(new ContainerContext<PGRef>(pgref));
7092 osd->store->queue_transaction(ch, std::move(*t));
7093 }
7094 ch->flush();
7095
7096 if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
7097 dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
7098 ch = osd->store->create_new_collection(coll);
7099 _create(*t,
7100 info.pgid,
7101 info.pgid.get_split_bits(pool.info.get_pg_num()));
7102 _init(*t, info.pgid, &pool.info);
7103 last_epoch = 0; // to ensure pg epoch is also written
7104 dirty_info = true;
7105 dirty_big_info = true;
7106 } else {
7107 deleted = true;
7108
7109 // cancel reserver here, since the PG is about to get deleted and the
7110 // exit() methods don't run when that happens.
7111 osd->local_reserver.cancel_reservation(info.pgid);
7112
7113 osd->logger->dec(l_osd_pg_removing);
7114 }
7115 }
7116 }
7117
7118 // Compute pending backfill data
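// For example, if the primary reports bf_bytes of 10 GiB while this replica
// already holds local_bytes of 3 GiB, the pending adjustment is 7 GiB; once
// local usage catches up to the primary's, the result clamps to zero.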
7119 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
7120 {
7121 lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
7122 << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
7123 return std::max((int64_t)0, bf_bytes - local_bytes);
7124 }
7125
7126 int PG::pg_stat_adjust(osd_stat_t *ns)
7127 {
7128 osd_stat_t &new_stat = *ns;
7129 if (is_primary()) {
7130 return 0;
7131 }
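  // Only replicas adjust their stats: the backfill reservation (set via
  // set_reserved_num_bytes() in RepNotRecovering below) is held on the
  // backfill target, so the primary has nothing to account for here.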
7132 // Adjust the kb_used by adding pending backfill data
7133 uint64_t reserved_num_bytes = get_reserved_num_bytes();
7134
7135 // For now we don't consider projected space gains here.
7136 // I suggest we have an optional two-pass backfill that frees up
7137 // space in a first pass. This could be triggered when the OSD is nearfull
7138 // or close to backfillfull.
7139 if (reserved_num_bytes > 0) {
7140 // TODO: Handle compression by adjusting by the PG's average
7141 // compression percentage.
7142 dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
7143 << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7144 if (new_stat.statfs.available > reserved_num_bytes)
7145 new_stat.statfs.available -= reserved_num_bytes;
7146 else
7147 new_stat.statfs.available = 0;
7148 dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7149 return 1;
7150 }
7151 return 0;
7152 }
7153
7154
7155 /*------------ Recovery State Machine----------------*/
7156 #undef dout_prefix
7157 #define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \
7158 << "state<" << get_state_name() << ">: ")
7159
7160 /*------Crashed-------*/
7161 PG::RecoveryState::Crashed::Crashed(my_context ctx)
7162 : my_base(ctx),
7163 NamedState(context< RecoveryMachine >().pg, "Crashed")
7164 {
7165 context< RecoveryMachine >().log_enter(state_name);
7166 ceph_abort_msg("we got a bad state machine event");
7167 }
7168
7169
7170 /*------Initial-------*/
7171 PG::RecoveryState::Initial::Initial(my_context ctx)
7172 : my_base(ctx),
7173 NamedState(context< RecoveryMachine >().pg, "Initial")
7174 {
7175 context< RecoveryMachine >().log_enter(state_name);
7176 }
7177
7178 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
7179 {
7180 PG *pg = context< RecoveryMachine >().pg;
7181 pg->proc_replica_info(
7182 notify.from, notify.notify.info, notify.notify.epoch_sent);
7183 pg->set_last_peering_reset();
7184 return transit< Primary >();
7185 }
7186
7187 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
7188 {
7189 PG *pg = context< RecoveryMachine >().pg;
7190 ceph_assert(!pg->is_primary());
7191 post_event(i);
7192 return transit< Stray >();
7193 }
7194
7195 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
7196 {
7197 PG *pg = context< RecoveryMachine >().pg;
7198 ceph_assert(!pg->is_primary());
7199 post_event(i);
7200 return transit< Stray >();
7201 }
7202
7203 void PG::RecoveryState::Initial::exit()
7204 {
7205 context< RecoveryMachine >().log_exit(state_name, enter_time);
7206 PG *pg = context< RecoveryMachine >().pg;
7207 utime_t dur = ceph_clock_now() - enter_time;
7208 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
7209 }
7210
7211 /*------Started-------*/
7212 PG::RecoveryState::Started::Started(my_context ctx)
7213 : my_base(ctx),
7214 NamedState(context< RecoveryMachine >().pg, "Started")
7215 {
7216 context< RecoveryMachine >().log_enter(state_name);
7217 }
7218
7219 boost::statechart::result
7220 PG::RecoveryState::Started::react(const IntervalFlush&)
7221 {
7222 PG *pg = context< RecoveryMachine >().pg;
7223 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7224 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7225 return discard_event();
7226 }
7227
7228 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
7229 {
7230 PG *pg = context< RecoveryMachine >().pg;
7231 ldout(pg->cct, 10) << "Started advmap" << dendl;
7232 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7233 if (pg->should_restart_peering(
7234 advmap.up_primary,
7235 advmap.acting_primary,
7236 advmap.newup,
7237 advmap.newacting,
7238 advmap.lastmap,
7239 advmap.osdmap)) {
7240 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
7241 << dendl;
7242 post_event(advmap);
7243 return transit< Reset >();
7244 }
7245 pg->remove_down_peer_info(advmap.osdmap);
7246 return discard_event();
7247 }
7248
7249 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
7250 {
7251 q.f->open_object_section("state");
7252 q.f->dump_string("name", state_name);
7253 q.f->dump_stream("enter_time") << enter_time;
7254 q.f->close_section();
7255 return discard_event();
7256 }
7257
7258 void PG::RecoveryState::Started::exit()
7259 {
7260 context< RecoveryMachine >().log_exit(state_name, enter_time);
7261 PG *pg = context< RecoveryMachine >().pg;
7262 utime_t dur = ceph_clock_now() - enter_time;
7263 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
7264 }
7265
7266 /*--------Reset---------*/
7267 PG::RecoveryState::Reset::Reset(my_context ctx)
7268 : my_base(ctx),
7269 NamedState(context< RecoveryMachine >().pg, "Reset")
7270 {
7271 context< RecoveryMachine >().log_enter(state_name);
7272 PG *pg = context< RecoveryMachine >().pg;
7273
7274 pg->flushes_in_progress = 0;
7275 pg->set_last_peering_reset();
7276 }
7277
7278 boost::statechart::result
7279 PG::RecoveryState::Reset::react(const IntervalFlush&)
7280 {
7281 PG *pg = context< RecoveryMachine >().pg;
7282 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7283 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7284 return discard_event();
7285 }
7286
7287 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
7288 {
7289 PG *pg = context< RecoveryMachine >().pg;
7290 ldout(pg->cct, 10) << "Reset advmap" << dendl;
7291
7292 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7293
7294 if (pg->should_restart_peering(
7295 advmap.up_primary,
7296 advmap.acting_primary,
7297 advmap.newup,
7298 advmap.newacting,
7299 advmap.lastmap,
7300 advmap.osdmap)) {
7301 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
7302 << dendl;
7303 pg->start_peering_interval(
7304 advmap.lastmap,
7305 advmap.newup, advmap.up_primary,
7306 advmap.newacting, advmap.acting_primary,
7307 context< RecoveryMachine >().get_cur_transaction());
7308 }
7309 pg->remove_down_peer_info(advmap.osdmap);
7310 pg->check_past_interval_bounds();
7311 return discard_event();
7312 }
7313
7314 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
7315 {
7316 PG *pg = context< RecoveryMachine >().pg;
7317 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7318 context< RecoveryMachine >().send_notify(
7319 pg->get_primary(),
7320 pg_notify_t(
7321 pg->get_primary().shard, pg->pg_whoami.shard,
7322 pg->get_osdmap_epoch(),
7323 pg->get_osdmap_epoch(),
7324 pg->info),
7325 pg->past_intervals);
7326 }
7327
7328 pg->update_heartbeat_peers();
7329 pg->take_waiters();
7330
7331 return transit< Started >();
7332 }
7333
7334 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
7335 {
7336 q.f->open_object_section("state");
7337 q.f->dump_string("name", state_name);
7338 q.f->dump_stream("enter_time") << enter_time;
7339 q.f->close_section();
7340 return discard_event();
7341 }
7342
7343 void PG::RecoveryState::Reset::exit()
7344 {
7345 context< RecoveryMachine >().log_exit(state_name, enter_time);
7346 PG *pg = context< RecoveryMachine >().pg;
7347 utime_t dur = ceph_clock_now() - enter_time;
7348 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
7349 }
7350
7351 /*-------Start---------*/
7352 PG::RecoveryState::Start::Start(my_context ctx)
7353 : my_base(ctx),
7354 NamedState(context< RecoveryMachine >().pg, "Start")
7355 {
7356 context< RecoveryMachine >().log_enter(state_name);
7357
7358 PG *pg = context< RecoveryMachine >().pg;
7359 if (pg->is_primary()) {
7360 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
7361 post_event(MakePrimary());
7362 } else { //is_stray
7363 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
7364 post_event(MakeStray());
7365 }
7366 }
7367
7368 void PG::RecoveryState::Start::exit()
7369 {
7370 context< RecoveryMachine >().log_exit(state_name, enter_time);
7371 PG *pg = context< RecoveryMachine >().pg;
7372 utime_t dur = ceph_clock_now() - enter_time;
7373 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
7374 }
7375
7376 /*---------Primary--------*/
7377 PG::RecoveryState::Primary::Primary(my_context ctx)
7378 : my_base(ctx),
7379 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
7380 {
7381 context< RecoveryMachine >().log_enter(state_name);
7382 PG *pg = context< RecoveryMachine >().pg;
7383 ceph_assert(pg->want_acting.empty());
7384
7385 // set CREATING bit until we have peered for the first time.
7386 if (pg->info.history.last_epoch_started == 0) {
7387 pg->state_set(PG_STATE_CREATING);
7388 // use the history timestamp, which ultimately comes from the
7389 // monitor in the create case.
7390 utime_t t = pg->info.history.last_scrub_stamp;
7391 pg->info.stats.last_fresh = t;
7392 pg->info.stats.last_active = t;
7393 pg->info.stats.last_change = t;
7394 pg->info.stats.last_peered = t;
7395 pg->info.stats.last_clean = t;
7396 pg->info.stats.last_unstale = t;
7397 pg->info.stats.last_undegraded = t;
7398 pg->info.stats.last_fullsized = t;
7399 pg->info.stats.last_scrub_stamp = t;
7400 pg->info.stats.last_deep_scrub_stamp = t;
7401 pg->info.stats.last_clean_scrub_stamp = t;
7402 }
7403 }
7404
7405 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
7406 {
7407 PG *pg = context< RecoveryMachine >().pg;
7408 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7409 pg->proc_replica_info(
7410 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7411 return discard_event();
7412 }
7413
7414 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
7415 {
7416 PG *pg = context< RecoveryMachine >().pg;
7417 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
7418 pg->publish_stats_to_osd();
7419 pg->take_waiters();
7420 return discard_event();
7421 }
7422
7423 boost::statechart::result PG::RecoveryState::Primary::react(
7424 const SetForceRecovery&)
7425 {
7426 PG *pg = context< RecoveryMachine >().pg;
7427 pg->set_force_recovery(true);
7428 return discard_event();
7429 }
7430
7431 boost::statechart::result PG::RecoveryState::Primary::react(
7432 const UnsetForceRecovery&)
7433 {
7434 PG *pg = context< RecoveryMachine >().pg;
7435 pg->set_force_recovery(false);
7436 return discard_event();
7437 }
7438
7439 boost::statechart::result PG::RecoveryState::Primary::react(
7440 const RequestScrub& evt)
7441 {
7442 PG *pg = context< RecoveryMachine >().pg;
7443 if (pg->is_primary()) {
7444 pg->scrub_requested(evt.deep, evt.repair);
7445 ldout(pg->cct,10) << "marking for scrub" << dendl;
7446 }
7447 return discard_event();
7448 }
7449
7450 boost::statechart::result PG::RecoveryState::Primary::react(
7451 const SetForceBackfill&)
7452 {
7453 PG *pg = context< RecoveryMachine >().pg;
7454 pg->set_force_backfill(true);
7455 return discard_event();
7456 }
7457
7458 boost::statechart::result PG::RecoveryState::Primary::react(
7459 const UnsetForceBackfill&)
7460 {
7461 PG *pg = context< RecoveryMachine >().pg;
7462 pg->set_force_backfill(false);
7463 return discard_event();
7464 }
7465
7466 void PG::RecoveryState::Primary::exit()
7467 {
7468 context< RecoveryMachine >().log_exit(state_name, enter_time);
7469 PG *pg = context< RecoveryMachine >().pg;
7470 pg->want_acting.clear();
7471 utime_t dur = ceph_clock_now() - enter_time;
7472 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
7473 pg->clear_primary_state();
7474 pg->state_clear(PG_STATE_CREATING);
7475 }
7476
7477 /*---------Peering--------*/
7478 PG::RecoveryState::Peering::Peering(my_context ctx)
7479 : my_base(ctx),
7480 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
7481 history_les_bound(false)
7482 {
7483 context< RecoveryMachine >().log_enter(state_name);
7484
7485 PG *pg = context< RecoveryMachine >().pg;
7486 ceph_assert(!pg->is_peered());
7487 ceph_assert(!pg->is_peering());
7488 ceph_assert(pg->is_primary());
7489 pg->state_set(PG_STATE_PEERING);
7490 }
7491
7492 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
7493 {
7494 PG *pg = context< RecoveryMachine >().pg;
7495 ldout(pg->cct, 10) << "Peering advmap" << dendl;
7496 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
7497 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
7498 post_event(advmap);
7499 return transit< Reset >();
7500 }
7501
7502 pg->adjust_need_up_thru(advmap.osdmap);
7503
7504 return forward_event();
7505 }
7506
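// The QueryState handler below dumps, roughly, under "state":
//   { "name": ..., "enter_time": ...,
//     "past_intervals": [...], "probing_osds": [...],
//     "blocked": ... (only when peering is blocked on down osds),
//     "down_osds_we_would_probe": [...], "peering_blocked_by": [...],
//     "peering_blocked_by_detail": [...] (only when history_les_bound) }
// This is the structure typically surfaced via 'ceph pg <pgid> query'.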
7507 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
7508 {
7509 PG *pg = context< RecoveryMachine >().pg;
7510
7511 q.f->open_object_section("state");
7512 q.f->dump_string("name", state_name);
7513 q.f->dump_stream("enter_time") << enter_time;
7514
7515 q.f->open_array_section("past_intervals");
7516 pg->past_intervals.dump(q.f);
7517 q.f->close_section();
7518
7519 q.f->open_array_section("probing_osds");
7520 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
7521 p != prior_set.probe.end();
7522 ++p)
7523 q.f->dump_stream("osd") << *p;
7524 q.f->close_section();
7525
7526 if (prior_set.pg_down)
7527 q.f->dump_string("blocked", "peering is blocked due to down osds");
7528
7529 q.f->open_array_section("down_osds_we_would_probe");
7530 for (set<int>::iterator p = prior_set.down.begin();
7531 p != prior_set.down.end();
7532 ++p)
7533 q.f->dump_int("osd", *p);
7534 q.f->close_section();
7535
7536 q.f->open_array_section("peering_blocked_by");
7537 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
7538 p != prior_set.blocked_by.end();
7539 ++p) {
7540 q.f->open_object_section("osd");
7541 q.f->dump_int("osd", p->first);
7542 q.f->dump_int("current_lost_at", p->second);
7543 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
7544 q.f->close_section();
7545 }
7546 q.f->close_section();
7547
7548 if (history_les_bound) {
7549 q.f->open_array_section("peering_blocked_by_detail");
7550 q.f->open_object_section("item");
7551 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
7552 q.f->close_section();
7553 q.f->close_section();
7554 }
7555
7556 q.f->close_section();
7557 return forward_event();
7558 }
7559
7560 void PG::RecoveryState::Peering::exit()
7561 {
7562 PG *pg = context< RecoveryMachine >().pg;
7563 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
7564 context< RecoveryMachine >().log_exit(state_name, enter_time);
7565 pg->state_clear(PG_STATE_PEERING);
7566 pg->clear_probe_targets();
7567
7568 utime_t dur = ceph_clock_now() - enter_time;
7569 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
7570 }
7571
7572
7573 /*------Backfilling-------*/
7574 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
7575 : my_base(ctx),
7576 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
7577 {
7578 context< RecoveryMachine >().log_enter(state_name);
7579 PG *pg = context< RecoveryMachine >().pg;
7580 pg->backfill_reserved = true;
7581 pg->queue_recovery();
7582 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7583 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7584 pg->state_set(PG_STATE_BACKFILLING);
7585 pg->publish_stats_to_osd();
7586 }
7587
7588 void PG::RecoveryState::Backfilling::backfill_release_reservations()
7589 {
7590 PG *pg = context< RecoveryMachine >().pg;
7591 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7592 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
7593 it != pg->backfill_targets.end();
7594 ++it) {
7595 ceph_assert(*it != pg->pg_whoami);
7596 ConnectionRef con = pg->osd->get_con_osd_cluster(
7597 it->osd, pg->get_osdmap_epoch());
7598 if (con) {
7599 pg->osd->send_message_osd_cluster(
7600 new MBackfillReserve(
7601 MBackfillReserve::RELEASE,
7602 spg_t(pg->info.pgid.pgid, it->shard),
7603 pg->get_osdmap_epoch()),
7604 con.get());
7605 }
7606 }
7607 }
7608
7609 void PG::RecoveryState::Backfilling::cancel_backfill()
7610 {
7611 PG *pg = context< RecoveryMachine >().pg;
7612 backfill_release_reservations();
7613 if (!pg->waiting_on_backfill.empty()) {
7614 pg->waiting_on_backfill.clear();
7615 pg->finish_recovery_op(hobject_t::get_max());
7616 }
7617 }
7618
7619 boost::statechart::result
7620 PG::RecoveryState::Backfilling::react(const Backfilled &c)
7621 {
7622 backfill_release_reservations();
7623 return transit<Recovered>();
7624 }
7625
7626 boost::statechart::result
7627 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
7628 {
7629 PG *pg = context< RecoveryMachine >().pg;
7630 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
7631 pg->state_set(PG_STATE_BACKFILL_WAIT);
7632 pg->state_clear(PG_STATE_BACKFILLING);
7633 cancel_backfill();
7634 pg->schedule_backfill_retry(c.delay);
7635 return transit<NotBackfilling>();
7636 }
7637
7638 boost::statechart::result
7639 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
7640 {
7641 PG *pg = context< RecoveryMachine >().pg;
7642 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
7643 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
7644 pg->state_clear(PG_STATE_BACKFILLING);
7645 cancel_backfill();
7646 return transit<NotBackfilling>();
7647 }
7648
7649 boost::statechart::result
7650 PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &)
7651 {
7652 PG *pg = context< RecoveryMachine >().pg;
7653 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7654 pg->state_clear(PG_STATE_BACKFILLING);
7655 cancel_backfill();
7656 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7657 return transit<NotBackfilling>();
7658 }
7659
7660 boost::statechart::result
7661 PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &)
7662 {
7663 PG *pg = context< RecoveryMachine >().pg;
7664 pg->state_set(PG_STATE_BACKFILL_WAIT);
7665 cancel_backfill();
7666 if (pg->needs_backfill()) {
7667 return transit<WaitLocalBackfillReserved>();
7668 } else {
7669 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
7670 return discard_event();
7671 }
7672 }
7673
7674 void PG::RecoveryState::Backfilling::exit()
7675 {
7676 context< RecoveryMachine >().log_exit(state_name, enter_time);
7677 PG *pg = context< RecoveryMachine >().pg;
7678 pg->backfill_reserved = false;
7679 pg->backfill_reserving = false;
7680 pg->state_clear(PG_STATE_BACKFILLING);
7681 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7682 utime_t dur = ceph_clock_now() - enter_time;
7683 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
7684 }
7685
7686 /*--WaitRemoteBackfillReserved--*/
7687
7688 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
7689 : my_base(ctx),
7690 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7691 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
7692 {
7693 context< RecoveryMachine >().log_enter(state_name);
7694 PG *pg = context< RecoveryMachine >().pg;
7695 pg->state_set(PG_STATE_BACKFILL_WAIT);
7696 pg->publish_stats_to_osd();
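  // Kick off the reservation loop by posting a synthetic RemoteBackfillReserved
  // to ourselves; each grant (including this initial one) advances
  // backfill_osd_it and sends the next MBackfillReserve REQUEST until all
  // remote shards are reserved.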
7697 post_event(RemoteBackfillReserved());
7698 }
7699
7700 boost::statechart::result
7701 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
7702 {
7703 PG *pg = context< RecoveryMachine >().pg;
7704
7705 int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
7706 ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
7707 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
7708 //The primary never backfills itself
7709 ceph_assert(*backfill_osd_it != pg->pg_whoami);
7710 ConnectionRef con = pg->osd->get_con_osd_cluster(
7711 backfill_osd_it->osd, pg->get_osdmap_epoch());
7712 if (con) {
7713 pg->osd->send_message_osd_cluster(
7714 new MBackfillReserve(
7715 MBackfillReserve::REQUEST,
7716 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7717 pg->get_osdmap_epoch(),
7718 pg->get_backfill_priority(),
7719 num_bytes,
7720 pg->peer_bytes[*backfill_osd_it]),
7721 con.get());
7722 }
7723 ++backfill_osd_it;
7724 } else {
7725 pg->peer_bytes.clear();
7726 post_event(AllBackfillsReserved());
7727 }
7728 return discard_event();
7729 }
7730
7731 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7732 {
7733 context< RecoveryMachine >().log_exit(state_name, enter_time);
7734 PG *pg = context< RecoveryMachine >().pg;
7735 utime_t dur = ceph_clock_now() - enter_time;
7736 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7737 }
7738
7739 void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
7740 {
7741 PG *pg = context< RecoveryMachine >().pg;
7742 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7743
7744 // Send CANCEL to all previously acquired reservations
7745 set<pg_shard_t>::const_iterator it, begin, end;
7746 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7747 end = context< Active >().remote_shards_to_reserve_backfill.end();
7748 ceph_assert(begin != end);
7749 for (it = begin; it != backfill_osd_it; ++it) {
7750 //The primary never backfills itself
7751 ceph_assert(*it != pg->pg_whoami);
7752 ConnectionRef con = pg->osd->get_con_osd_cluster(
7753 it->osd, pg->get_osdmap_epoch());
7754 if (con) {
7755 pg->osd->send_message_osd_cluster(
7756 new MBackfillReserve(
7757 MBackfillReserve::RELEASE,
7758 spg_t(pg->info.pgid.pgid, it->shard),
7759 pg->get_osdmap_epoch()),
7760 con.get());
7761 }
7762 }
7763
7764 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7765 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7766 pg->publish_stats_to_osd();
7767
7768 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7769 }
7770
7771 boost::statechart::result
7772 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
7773 {
7774 retry();
7775 return transit<NotBackfilling>();
7776 }
7777
7778 boost::statechart::result
7779 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
7780 {
7781 retry();
7782 return transit<NotBackfilling>();
7783 }
7784
7785 /*--WaitLocalBackfillReserved--*/
7786 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7787 : my_base(ctx),
7788 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7789 {
7790 context< RecoveryMachine >().log_enter(state_name);
7791 PG *pg = context< RecoveryMachine >().pg;
7792 pg->state_set(PG_STATE_BACKFILL_WAIT);
7793 pg->osd->local_reserver.request_reservation(
7794 pg->info.pgid,
7795 new QueuePeeringEvt<LocalBackfillReserved>(
7796 pg, pg->get_osdmap_epoch(),
7797 LocalBackfillReserved()),
7798 pg->get_backfill_priority(),
7799 new QueuePeeringEvt<DeferBackfill>(
7800 pg, pg->get_osdmap_epoch(),
7801 DeferBackfill(0.0)));
7802 pg->publish_stats_to_osd();
7803 }
7804
7805 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7806 {
7807 context< RecoveryMachine >().log_exit(state_name, enter_time);
7808 PG *pg = context< RecoveryMachine >().pg;
7809 utime_t dur = ceph_clock_now() - enter_time;
7810 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7811 }
7812
7813 /*----NotBackfilling------*/
7814 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7815 : my_base(ctx),
7816 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7817 {
7818 context< RecoveryMachine >().log_enter(state_name);
7819 PG *pg = context< RecoveryMachine >().pg;
7820 pg->state_clear(PG_STATE_REPAIR);
7821 pg->publish_stats_to_osd();
7822 }
7823
7824 boost::statechart::result
7825 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7826 {
7827 return discard_event();
7828 }
7829
7830 boost::statechart::result
7831 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
7832 {
7833 return discard_event();
7834 }
7835
7836 void PG::RecoveryState::NotBackfilling::exit()
7837 {
7838 context< RecoveryMachine >().log_exit(state_name, enter_time);
7839 PG *pg = context< RecoveryMachine >().pg;
7840 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7841 utime_t dur = ceph_clock_now() - enter_time;
7842 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7843 }
7844
7845 /*----NotRecovering------*/
7846 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7847 : my_base(ctx),
7848 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7849 {
7850 context< RecoveryMachine >().log_enter(state_name);
7851 PG *pg = context< RecoveryMachine >().pg;
7852 pg->publish_stats_to_osd();
7853 }
7854
7855 void PG::RecoveryState::NotRecovering::exit()
7856 {
7857 context< RecoveryMachine >().log_exit(state_name, enter_time);
7858 PG *pg = context< RecoveryMachine >().pg;
7859 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7860 utime_t dur = ceph_clock_now() - enter_time;
7861 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7862 }
7863
7864 /*---RepNotRecovering----*/
7865 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7866 : my_base(ctx),
7867 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7868 {
7869 context< RecoveryMachine >().log_enter(state_name);
7870 }
7871
7872 boost::statechart::result
7873 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
7874 {
7875 PG *pg = context< RecoveryMachine >().pg;
7876 pg->reject_reservation();
7877 post_event(RemoteReservationRejected());
7878 return discard_event();
7879 }
7880
7881 void PG::RecoveryState::RepNotRecovering::exit()
7882 {
7883 context< RecoveryMachine >().log_exit(state_name, enter_time);
7884 PG *pg = context< RecoveryMachine >().pg;
7885 utime_t dur = ceph_clock_now() - enter_time;
7886 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7887 }
7888
7889 /*---RepWaitRecoveryReserved--*/
7890 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7891 : my_base(ctx),
7892 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7893 {
7894 context< RecoveryMachine >().log_enter(state_name);
7895 }
7896
7897 boost::statechart::result
7898 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7899 {
7900 PG *pg = context< RecoveryMachine >().pg;
7901 pg->osd->send_message_osd_cluster(
7902 pg->primary.osd,
7903 new MRecoveryReserve(
7904 MRecoveryReserve::GRANT,
7905 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7906 pg->get_osdmap_epoch()),
7907 pg->get_osdmap_epoch());
7908 return transit<RepRecovering>();
7909 }
7910
7911 boost::statechart::result
7912 PG::RecoveryState::RepWaitRecoveryReserved::react(
7913 const RemoteReservationCanceled &evt)
7914 {
7915 PG *pg = context< RecoveryMachine >().pg;
7916 pg->clear_reserved_num_bytes();
7917 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7918 return transit<RepNotRecovering>();
7919 }
7920
7921 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7922 {
7923 context< RecoveryMachine >().log_exit(state_name, enter_time);
7924 PG *pg = context< RecoveryMachine >().pg;
7925 utime_t dur = ceph_clock_now() - enter_time;
7926 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7927 }
7928
7929 /*-RepWaitBackfillReserved*/
7930 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7931 : my_base(ctx),
7932 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7933 {
7934 context< RecoveryMachine >().log_enter(state_name);
7935 }
7936
7937 boost::statechart::result
7938 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7939 {
7940 PG *pg = context< RecoveryMachine >().pg;
7941 // Use tentative_backfill_full() to make sure enough
7942 // space is available to handle target bytes from primary.
7943
7944 // TODO: If we passed num_objects from primary we could account for
7945 // an estimate of the metadata overhead.
7946
7947 // TODO: If we had compressed_allocated and compressed_original from primary
7948 // we could compute compression ratio and adjust accordingly.
7949
7950 // XXX: There is no way to get omap overhead and this would only apply
7951 // to whatever (possibly different) partition is storing the database.
7952
7953 // update_osd_stat() from heartbeat will apply this adjustment on a new
7954 // statfs using pg->primary_num_bytes.
7955 uint64_t pending_adjustment = 0;
7956 int64_t primary_num_bytes = evt.primary_num_bytes;
7957 int64_t local_num_bytes = evt.local_num_bytes;
7958 if (primary_num_bytes) {
7959 // For an erasure-coded pool, overestimate by a full stripe per object
7960 // because we don't know how each object rounded to the nearest stripe
7961 if (pg->pool.info.is_erasure()) {
7962 primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7963 primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7964 local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7965 local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7966 }
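      // Illustrative numbers (not from the source): with k=4 data chunks and a
      // 16 KiB stripe chunk, 400 MiB of logical primary bytes spread over 1000
      // objects becomes 400 MiB / 4 + 1000 * 16 KiB, i.e. roughly 115.6 MiB
      // per shard.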
7967 pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
7968 ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
7969 << " local " << (local_num_bytes >> 10) << "KiB"
7970 << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
7971 << dendl;
7972 }
7973 // This lock protects not only the OSDService stats but also the setting of the pg primary_num_bytes.
7974 // That's why we don't unlock immediately.
7975 Mutex::Locker l(pg->osd->stat_lock);
7976 osd_stat_t cur_stat = pg->osd->osd_stat;
7977 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7978 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7979 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7980 << dendl;
7981 post_event(RejectRemoteReservation());
7982 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7983 pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
7984 ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
7985 << dendl;
7986 post_event(RejectRemoteReservation());
7987 } else {
7988 Context *preempt = nullptr;
7989 // Don't reserve space if the reservation check was skipped; this is used
7990 // to test the other backfill full check AND to handle the case where a
7991 // corruption of num_bytes requires ignoring that value and trying the
7992 // backfill anyway.
7993 if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
7994 pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
7995 else
7996 pg->clear_reserved_num_bytes();
7997 // Use un-ec-adjusted bytes for stats.
7998 pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
7999 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
8000 // older peers will interpret preemption as TOOFULL
8001 preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
8002 pg, pg->get_osdmap_epoch(),
8003 RemoteBackfillPreempted());
8004 }
8005 pg->osd->remote_reserver.request_reservation(
8006 pg->info.pgid,
8007 new QueuePeeringEvt<RemoteBackfillReserved>(
8008 pg, pg->get_osdmap_epoch(),
8009 RemoteBackfillReserved()),
8010 evt.priority,
8011 preempt);
8012 }
8013 return transit<RepWaitBackfillReserved>();
8014 }
8015
8016 boost::statechart::result
8017 PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
8018 {
8019 PG *pg = context< RecoveryMachine >().pg;
8020
8021 // fall back to a local reckoning of priority if the primary doesn't pass one
8022 // (pre-mimic compat)
8023 int prio = evt.priority ? evt.priority : pg->get_recovery_priority();
8024
8025 Context *preempt = nullptr;
8026 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
8027 // older peers can't handle this
8028 preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>(
8029 pg, pg->get_osdmap_epoch(),
8030 RemoteRecoveryPreempted());
8031 }
8032
8033 pg->osd->remote_reserver.request_reservation(
8034 pg->info.pgid,
8035 new QueuePeeringEvt<RemoteRecoveryReserved>(
8036 pg, pg->get_osdmap_epoch(),
8037 RemoteRecoveryReserved()),
8038 prio,
8039 preempt);
8040 return transit<RepWaitRecoveryReserved>();
8041 }
8042
8043 void PG::RecoveryState::RepWaitBackfillReserved::exit()
8044 {
8045 context< RecoveryMachine >().log_exit(state_name, enter_time);
8046 PG *pg = context< RecoveryMachine >().pg;
8047 utime_t dur = ceph_clock_now() - enter_time;
8048 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
8049 }
8050
8051 boost::statechart::result
8052 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
8053 {
8054 PG *pg = context< RecoveryMachine >().pg;
8055
8056 pg->osd->send_message_osd_cluster(
8057 pg->primary.osd,
8058 new MBackfillReserve(
8059 MBackfillReserve::GRANT,
8060 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8061 pg->get_osdmap_epoch()),
8062 pg->get_osdmap_epoch());
8063 return transit<RepRecovering>();
8064 }
8065
8066 boost::statechart::result
8067 PG::RecoveryState::RepWaitBackfillReserved::react(
8068 const RejectRemoteReservation &evt)
8069 {
8070 PG *pg = context< RecoveryMachine >().pg;
8071 pg->reject_reservation();
8072 post_event(RemoteReservationRejected());
8073 return discard_event();
8074 }
8075
8076 boost::statechart::result
8077 PG::RecoveryState::RepWaitBackfillReserved::react(
8078 const RemoteReservationRejected &evt)
8079 {
8080 PG *pg = context< RecoveryMachine >().pg;
8081 pg->clear_reserved_num_bytes();
8082 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8083 return transit<RepNotRecovering>();
8084 }
8085
8086 boost::statechart::result
8087 PG::RecoveryState::RepWaitBackfillReserved::react(
8088 const RemoteReservationCanceled &evt)
8089 {
8090 PG *pg = context< RecoveryMachine >().pg;
8091 pg->clear_reserved_num_bytes();
8092 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8093 return transit<RepNotRecovering>();
8094 }
8095
8096 /*---RepRecovering-------*/
8097 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
8098 : my_base(ctx),
8099 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
8100 {
8101 context< RecoveryMachine >().log_enter(state_name);
8102 }
8103
8104 boost::statechart::result
8105 PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
8106 {
8107 PG *pg = context< RecoveryMachine >().pg;
8108 pg->clear_reserved_num_bytes();
8109 pg->osd->send_message_osd_cluster(
8110 pg->primary.osd,
8111 new MRecoveryReserve(
8112 MRecoveryReserve::REVOKE,
8113 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8114 pg->get_osdmap_epoch()),
8115 pg->get_osdmap_epoch());
8116 return discard_event();
8117 }
8118
8119 boost::statechart::result
8120 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
8121 {
8122 PG *pg = context< RecoveryMachine >().pg;
8123 pg->clear_reserved_num_bytes();
8124 pg->osd->send_message_osd_cluster(
8125 pg->primary.osd,
8126 new MBackfillReserve(
8127 MBackfillReserve::TOOFULL,
8128 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8129 pg->get_osdmap_epoch()),
8130 pg->get_osdmap_epoch());
8131 return discard_event();
8132 }
8133
8134 boost::statechart::result
8135 PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
8136 {
8137 PG *pg = context< RecoveryMachine >().pg;
8138 pg->clear_reserved_num_bytes();
8139 pg->osd->send_message_osd_cluster(
8140 pg->primary.osd,
8141 new MBackfillReserve(
8142 MBackfillReserve::REVOKE,
8143 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8144 pg->get_osdmap_epoch()),
8145 pg->get_osdmap_epoch());
8146 return discard_event();
8147 }
8148
8149 void PG::RecoveryState::RepRecovering::exit()
8150 {
8151 context< RecoveryMachine >().log_exit(state_name, enter_time);
8152 PG *pg = context< RecoveryMachine >().pg;
8153 pg->clear_reserved_num_bytes();
8154 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8155 utime_t dur = ceph_clock_now() - enter_time;
8156 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
8157 }
8158
8159 /*------Activating--------*/
8160 PG::RecoveryState::Activating::Activating(my_context ctx)
8161 : my_base(ctx),
8162 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
8163 {
8164 context< RecoveryMachine >().log_enter(state_name);
8165 }
8166
8167 void PG::RecoveryState::Activating::exit()
8168 {
8169 context< RecoveryMachine >().log_exit(state_name, enter_time);
8170 PG *pg = context< RecoveryMachine >().pg;
8171 utime_t dur = ceph_clock_now() - enter_time;
8172 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
8173 }
8174
8175 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
8176 : my_base(ctx),
8177 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
8178 {
8179 context< RecoveryMachine >().log_enter(state_name);
8180 PG *pg = context< RecoveryMachine >().pg;
8181
8182 // Make sure all nodes that are part of the recovery aren't full
8183 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
8184 pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) {
8185 post_event(RecoveryTooFull());
8186 return;
8187 }
8188
8189 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8190 pg->state_set(PG_STATE_RECOVERY_WAIT);
8191 pg->osd->local_reserver.request_reservation(
8192 pg->info.pgid,
8193 new QueuePeeringEvt<LocalRecoveryReserved>(
8194 pg, pg->get_osdmap_epoch(),
8195 LocalRecoveryReserved()),
8196 pg->get_recovery_priority(),
8197 new QueuePeeringEvt<DeferRecovery>(
8198 pg, pg->get_osdmap_epoch(),
8199 DeferRecovery(0.0)));
8200 pg->publish_stats_to_osd();
8201 }
8202
8203 boost::statechart::result
8204 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
8205 {
8206 PG *pg = context< RecoveryMachine >().pg;
8207 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
8208 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
8209 return transit<NotRecovering>();
8210 }
8211
8212 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
8213 {
8214 context< RecoveryMachine >().log_exit(state_name, enter_time);
8215 PG *pg = context< RecoveryMachine >().pg;
8216 utime_t dur = ceph_clock_now() - enter_time;
8217 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
8218 }
8219
8220 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
8221 : my_base(ctx),
8222 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
8223 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
8224 {
8225 context< RecoveryMachine >().log_enter(state_name);
8226 post_event(RemoteRecoveryReserved());
8227 }
8228
8229 boost::statechart::result
8230 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
8231 PG *pg = context< RecoveryMachine >().pg;
8232
8233 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
8234 ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami);
8235 ConnectionRef con = pg->osd->get_con_osd_cluster(
8236 remote_recovery_reservation_it->osd, pg->get_osdmap_epoch());
8237 if (con) {
8238 pg->osd->send_message_osd_cluster(
8239 new MRecoveryReserve(
8240 MRecoveryReserve::REQUEST,
8241 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
8242 pg->get_osdmap_epoch(),
8243 pg->get_recovery_priority()),
8244 con.get());
8245 }
8246 ++remote_recovery_reservation_it;
8247 } else {
8248 post_event(AllRemotesReserved());
8249 }
8250 return discard_event();
8251 }
8252
8253 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
8254 {
8255 context< RecoveryMachine >().log_exit(state_name, enter_time);
8256 PG *pg = context< RecoveryMachine >().pg;
8257 utime_t dur = ceph_clock_now() - enter_time;
8258 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
8259 }
8260
8261 PG::RecoveryState::Recovering::Recovering(my_context ctx)
8262 : my_base(ctx),
8263 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
8264 {
8265 context< RecoveryMachine >().log_enter(state_name);
8266
8267 PG *pg = context< RecoveryMachine >().pg;
8268 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8269 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8270 pg->state_set(PG_STATE_RECOVERING);
8271 ceph_assert(!pg->state_test(PG_STATE_ACTIVATING));
8272 pg->publish_stats_to_osd();
8273 pg->queue_recovery();
8274 }
8275
8276 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
8277 {
8278 PG *pg = context< RecoveryMachine >().pg;
8279 ceph_assert(cancel || !pg->pg_log.get_missing().have_missing());
8280
8281 // release remote reservations
8282 for (set<pg_shard_t>::const_iterator i =
8283 context< Active >().remote_shards_to_reserve_recovery.begin();
8284 i != context< Active >().remote_shards_to_reserve_recovery.end();
8285 ++i) {
8286 if (*i == pg->pg_whoami) // skip myself
8287 continue;
8288 ConnectionRef con = pg->osd->get_con_osd_cluster(
8289 i->osd, pg->get_osdmap_epoch());
8290 if (con) {
8291 pg->osd->send_message_osd_cluster(
8292 new MRecoveryReserve(
8293 MRecoveryReserve::RELEASE,
8294 spg_t(pg->info.pgid.pgid, i->shard),
8295 pg->get_osdmap_epoch()),
8296 con.get());
8297 }
8298 }
8299 }
8300
8301 boost::statechart::result
8302 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
8303 {
8304 PG *pg = context< RecoveryMachine >().pg;
8305 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8306 release_reservations();
8307 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8308 return transit<Recovered>();
8309 }
8310
8311 boost::statechart::result
8312 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
8313 {
8314 PG *pg = context< RecoveryMachine >().pg;
8315 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8316 release_reservations();
8317 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8318 // XXX: Is this needed?
8319 pg->publish_stats_to_osd();
8320 return transit<WaitLocalBackfillReserved>();
8321 }
8322
8323 boost::statechart::result
8324 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
8325 {
8326 PG *pg = context< RecoveryMachine >().pg;
8327 if (!pg->state_test(PG_STATE_RECOVERING)) {
8328 // we may have finished recovery and have an AllReplicasRecovered
8329 // event queued to move us to the next state.
8330 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
8331 return discard_event();
8332 }
8333 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
8334 pg->state_set(PG_STATE_RECOVERY_WAIT);
8335 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8336 release_reservations(true);
8337 pg->schedule_recovery_retry(evt.delay);
8338 return transit<NotRecovering>();
8339 }
8340
8341 boost::statechart::result
8342 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
8343 {
8344 PG *pg = context< RecoveryMachine >().pg;
8345 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
8346 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
8347 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8348 release_reservations(true);
8349 return transit<NotRecovering>();
8350 }
8351
8352 void PG::RecoveryState::Recovering::exit()
8353 {
8354 context< RecoveryMachine >().log_exit(state_name, enter_time);
8355 PG *pg = context< RecoveryMachine >().pg;
8356 utime_t dur = ceph_clock_now() - enter_time;
8357 pg->state_clear(PG_STATE_RECOVERING);
8358 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
8359 }
8360
8361 PG::RecoveryState::Recovered::Recovered(my_context ctx)
8362 : my_base(ctx),
8363 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
8364 {
8365 pg_shard_t auth_log_shard;
8366
8367 context< RecoveryMachine >().log_enter(state_name);
8368
8369 PG *pg = context< RecoveryMachine >().pg;
8370
8371 ceph_assert(!pg->needs_recovery());
8372
8373 // if we finished backfill, all acting are active; recheck if
8374 // DEGRADED | UNDERSIZED is appropriate.
8375 ceph_assert(!pg->acting_recovery_backfill.empty());
8376 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
8377 pg->acting_recovery_backfill.size()) {
8378 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
8379 pg->publish_stats_to_osd();
8380 }
8381
8382 // adjust acting set? (e.g. because backfill completed...)
8383 bool history_les_bound = false;
8384 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
8385 true, &history_les_bound)) {
8386 ceph_assert(pg->want_acting.size());
8387 } else if (!pg->async_recovery_targets.empty()) {
8388 pg->choose_acting(auth_log_shard, true, &history_les_bound);
8389 }
8390
8391 if (context< Active >().all_replicas_activated &&
8392 pg->async_recovery_targets.empty())
8393 post_event(GoClean());
8394 }
8395
8396 void PG::RecoveryState::Recovered::exit()
8397 {
8398 context< RecoveryMachine >().log_exit(state_name, enter_time);
8399 PG *pg = context< RecoveryMachine >().pg;
8400 utime_t dur = ceph_clock_now() - enter_time;
8401 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
8402 }
8403
8404 PG::RecoveryState::Clean::Clean(my_context ctx)
8405 : my_base(ctx),
8406 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
8407 {
8408 context< RecoveryMachine >().log_enter(state_name);
8409
8410 PG *pg = context< RecoveryMachine >().pg;
8411
8412 if (pg->info.last_complete != pg->info.last_update) {
8413 ceph_abort();
8414 }
8415 Context *c = pg->finish_recovery();
8416 context< RecoveryMachine >().get_cur_transaction()->register_on_commit(c);
8417
8418 pg->try_mark_clean();
8419 }
8420
8421 void PG::RecoveryState::Clean::exit()
8422 {
8423 context< RecoveryMachine >().log_exit(state_name, enter_time);
8424 PG *pg = context< RecoveryMachine >().pg;
8425 pg->state_clear(PG_STATE_CLEAN);
8426 utime_t dur = ceph_clock_now() - enter_time;
8427 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
8428 }
8429
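// Helper used by the Active constructor below: collapse a set of pg_shard_t
// down to at most one shard per OSD, skipping 'skip' (ourselves), so that only
// one reservation request is sent per remote OSD.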
8430 template <typename T>
8431 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
8432 {
8433 set<int> osds_found;
8434 set<pg_shard_t> out;
8435 for (typename T::const_iterator i = in.begin();
8436 i != in.end();
8437 ++i) {
8438 if (*i != skip && !osds_found.count(i->osd)) {
8439 osds_found.insert(i->osd);
8440 out.insert(*i);
8441 }
8442 }
8443 return out;
8444 }
8445
8446 /*---------Active---------*/
8447 PG::RecoveryState::Active::Active(my_context ctx)
8448 : my_base(ctx),
8449 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
8450 remote_shards_to_reserve_recovery(
8451 unique_osd_shard_set(
8452 context< RecoveryMachine >().pg->pg_whoami,
8453 context< RecoveryMachine >().pg->acting_recovery_backfill)),
8454 remote_shards_to_reserve_backfill(
8455 unique_osd_shard_set(
8456 context< RecoveryMachine >().pg->pg_whoami,
8457 context< RecoveryMachine >().pg->backfill_targets)),
8458 all_replicas_activated(false)
8459 {
8460 context< RecoveryMachine >().log_enter(state_name);
8461
8462 PG *pg = context< RecoveryMachine >().pg;
8463
8464 ceph_assert(!pg->backfill_reserving);
8465 ceph_assert(!pg->backfill_reserved);
8466 ceph_assert(pg->is_primary());
8467 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
8468 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8469 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8470 pg->get_osdmap_epoch(),
8471 *context< RecoveryMachine >().get_query_map(),
8472 context< RecoveryMachine >().get_info_map(),
8473 context< RecoveryMachine >().get_recovery_ctx());
8474
8475 // everyone has to commit/ack before we are truly active
8476 pg->blocked_by.clear();
8477 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
8478 p != pg->acting_recovery_backfill.end();
8479 ++p) {
8480 if (p->shard != pg->pg_whoami.shard) {
8481 pg->blocked_by.insert(p->shard);
8482 }
8483 }
8484 pg->publish_stats_to_osd();
8485 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8486 }
8487
8488 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
8489 {
8490 PG *pg = context< RecoveryMachine >().pg;
8491 if (pg->should_restart_peering(
8492 advmap.up_primary,
8493 advmap.acting_primary,
8494 advmap.newup,
8495 advmap.newacting,
8496 advmap.lastmap,
8497 advmap.osdmap)) {
8498 ldout(pg->cct, 10) << "Active advmap interval change, fast return" << dendl;
8499 return forward_event();
8500 }
8501 ldout(pg->cct, 10) << "Active advmap" << dendl;
8502 bool need_publish = false;
8503
8504 if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) {
8505 const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps();
8506 auto i = new_removed_snaps.find(pg->info.pgid.pool());
8507 if (i != new_removed_snaps.end()) {
8508 bool bad = false;
8509 for (auto j : i->second) {
8510 if (pg->snap_trimq.intersects(j.first, j.second)) {
8511 decltype(pg->snap_trimq) added, overlap;
8512 added.insert(j.first, j.second);
8513 overlap.intersection_of(pg->snap_trimq, added);
8514 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8515 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8516 << overlap << ", but this is the first mimic+ osdmap,"
8517 << " so it's expected" << dendl;
8518 } else {
8519 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8520 << overlap << dendl;
8521 bad = true;
8522 }
8523 pg->snap_trimq.union_of(added);
8524 } else {
8525 pg->snap_trimq.insert(j.first, j.second);
8526 }
8527 }
8528 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8529 // at upgrade, we report *all* previously removed snaps as removed in
8530 // the first mimic epoch. remove the ones we previously divined were
8531 // removed (and subsequently purged) from the trimq.
8532 lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps"
8533 << " from new removed_snaps" << dendl;
8534 pg->snap_trimq.subtract(pg->info.purged_snaps);
8535 }
8536 ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second
8537 << ", snap_trimq now " << pg->snap_trimq << dendl;
8538 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8539 pg->dirty_info = true;
8540 pg->dirty_big_info = true;
8541 }
8542
8543 const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps();
8544 auto j = new_purged_snaps.find(pg->info.pgid.pool());
8545 if (j != new_purged_snaps.end()) {
8546 bool bad = false;
8547 for (auto k : j->second) {
8548 if (!pg->info.purged_snaps.contains(k.first, k.second)) {
8549 decltype(pg->info.purged_snaps) rm, overlap;
8550 rm.insert(k.first, k.second);
8551 overlap.intersection_of(pg->info.purged_snaps, rm);
8552 lderr(pg->cct) << __func__ << " purged_snaps does not contain "
8553 << rm << ", only " << overlap << dendl;
8554 pg->info.purged_snaps.subtract(overlap);
8555 // This can currently happen in the normal (if unlikely) course of
8556 // events. Because adding snaps to purged_snaps does not increase
8557 // the pg version or add a pg log entry, we don't reliably propagate
8558 // purged_snaps additions to other OSDs.
8559 // One example:
8560 // - purge S
8561 // - primary and replicas update purged_snaps
8562 // - no object updates
8563 // - pg mapping changes, new primary on different node
8564 // - new primary pg version == eversion_t(), so info is not
8565 // propagated.
8566 //bad = true;
8567 } else {
8568 pg->info.purged_snaps.erase(k.first, k.second);
8569 }
8570 }
8571 ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second
8572 << ", now " << pg->info.purged_snaps << dendl;
8573 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8574 pg->dirty_info = true;
8575 pg->dirty_big_info = true;
8576 }
8577 if (pg->dirty_big_info) {
8578 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
8579 // purged snaps and (b) perhaps share more snaps that we have purged
8580 // but didn't fit in pg_stat_t.
8581 need_publish = true;
8582 pg->share_pg_info();
8583 }
8584 } else if (!pg->pool.newly_removed_snaps.empty()) {
8585 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
8586 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
8587 pg->dirty_info = true;
8588 pg->dirty_big_info = true;
8589 }
8590
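// sanity check: if an osd we want in the acting set has since gone down,
// it must still be in the current up or acting set -- an up/acting change
// itself would have been caught by should_restart_peering() above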
8591 for (size_t i = 0; i < pg->want_acting.size(); i++) {
8592 int osd = pg->want_acting[i];
8593 if (!advmap.osdmap->is_up(osd)) {
8594 pg_shard_t osd_with_shard(osd, shard_id_t(i));
8595 ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
8596 }
8597 }
8598
8599 /* Check for changes in pool size (if the acting set changed as a result,
8600 * this does not matter) */
8601 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
8602 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
8603 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
8604 pg->state_clear(PG_STATE_UNDERSIZED);
8605 } else {
8606 pg->state_set(PG_STATE_UNDERSIZED);
8607 }
8608 // degraded changes will be detected by call from publish_stats_to_osd()
8609 need_publish = true;
8610 }
8611
8612 // if we haven't reported our PG stats in a long time, do so now.
8613 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
8614 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
8615 << " epochs" << dendl;
8616 need_publish = true;
8617 }
8618
8619 if (need_publish)
8620 pg->publish_stats_to_osd();
8621
8622 return forward_event();
8623 }
8624
8625 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
8626 {
8627 PG *pg = context< RecoveryMachine >().pg;
8628 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
8629 ceph_assert(pg->is_primary());
8630
8631 if (pg->have_unfound()) {
8632 // object may have become unfound
8633 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8634 }
8635
8636 if (pg->cct->_conf->osd_check_for_log_corruption)
8637 pg->check_log_for_corruption(pg->osd->store);
8638
8639 uint64_t unfound = pg->missing_loc.num_unfound();
8640 if (unfound > 0 &&
8641 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
8642 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
8643 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
8644 << " objects unfound and apparently lost, would automatically "
8645 << "mark these objects lost but this feature is not yet implemented "
8646 << "(osd_auto_mark_unfound_lost)";
8647 } else
8648 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
8649 << unfound << " objects unfound and apparently lost";
8650 }
8651
8652 if (pg->is_active()) {
8653 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
8654 pg->kick_snap_trim();
8655 }
8656
8657 if (pg->is_peered() &&
8658 !pg->is_clean() &&
8659 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
8660 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
8661 pg->queue_recovery();
8662 }
8663 return forward_event();
8664 }
8665
8666 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
8667 {
8668 PG *pg = context< RecoveryMachine >().pg;
8669 ceph_assert(pg->is_primary());
8670 if (pg->peer_info.count(notevt.from)) {
8671 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8672 << ", already have info from that osd, ignoring"
8673 << dendl;
8674 } else if (pg->peer_purged.count(notevt.from)) {
8675 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8676 << ", already purged that peer, ignoring"
8677 << dendl;
8678 } else {
8679 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8680 << ", calling proc_replica_info and discover_all_missing"
8681 << dendl;
8682 pg->proc_replica_info(
8683 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
8684 if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) {
8685 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8686 }
8687 }
8688 return discard_event();
8689 }
8690
8691 boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim)
8692 {
8693 PG *pg = context< RecoveryMachine >().pg;
8694 ceph_assert(pg->is_primary());
8695
8696 // peer is informing us of their last_complete_ondisk
8697 ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
8698 pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to;
8699
8700 // trim log when the pg is recovered
8701 pg->calc_min_last_complete_ondisk();
8702 return discard_event();
8703 }
8704
8705 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
8706 {
8707 PG *pg = context< RecoveryMachine >().pg;
8708 ceph_assert(pg->is_primary());
8709
8710 ceph_assert(!pg->acting_recovery_backfill.empty());
8711 // don't update history (yet) if we are active and primary; the replica
8712 // may be telling us they have activated (and committed) but we can't
8713 // share that until _everyone_ does the same.
8714 if (pg->is_acting_recovery_backfill(infoevt.from) &&
8715 pg->peer_activated.count(infoevt.from) == 0) {
8716 ldout(pg->cct, 10) << " peer osd." << infoevt.from
8717 << " activated and committed" << dendl;
8718 pg->peer_activated.insert(infoevt.from);
8719 pg->blocked_by.erase(infoevt.from.shard);
8720 pg->publish_stats_to_osd();
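// once every shard in acting_recovery_backfill has reported activation,
// the PG as a whole is activated and committed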
8721 if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) {
8722 pg->all_activated_and_committed();
8723 }
8724 }
8725 return discard_event();
8726 }
8727
8728 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
8729 {
8730 PG *pg = context< RecoveryMachine >().pg;
8731 ldout(pg->cct, 10) << "searching osd." << logevt.from
8732 << " log for unfound items" << dendl;
8733 pg->proc_replica_log(
8734 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8735 bool got_missing = pg->search_for_missing(
8736 pg->peer_info[logevt.from],
8737 pg->peer_missing[logevt.from],
8738 logevt.from,
8739 context< RecoveryMachine >().get_recovery_ctx());
8740 // If there are missing AND we are "fully" active then start recovery now
8741 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
8742 post_event(DoRecovery());
8743 }
8744 return discard_event();
8745 }
8746
8747 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
8748 {
8749 PG *pg = context< RecoveryMachine >().pg;
8750
8751 q.f->open_object_section("state");
8752 q.f->dump_string("name", state_name);
8753 q.f->dump_stream("enter_time") << enter_time;
8754
8755 {
8756 q.f->open_array_section("might_have_unfound");
8757 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
8758 p != pg->might_have_unfound.end();
8759 ++p) {
8760 q.f->open_object_section("osd");
8761 q.f->dump_stream("osd") << *p;
8762 if (pg->peer_missing.count(*p)) {
8763 q.f->dump_string("status", "already probed");
8764 } else if (pg->peer_missing_requested.count(*p)) {
8765 q.f->dump_string("status", "querying");
8766 } else if (!pg->get_osdmap()->is_up(p->osd)) {
8767 q.f->dump_string("status", "osd is down");
8768 } else {
8769 q.f->dump_string("status", "not queried");
8770 }
8771 q.f->close_section();
8772 }
8773 q.f->close_section();
8774 }
8775 {
8776 q.f->open_object_section("recovery_progress");
8777 pg->dump_recovery_info(q.f);
8778 q.f->close_section();
8779 }
8780
8781 {
8782 q.f->open_object_section("scrub");
8783 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
8784 q.f->dump_bool("scrubber.active", pg->scrubber.active);
8785 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
8786 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
8787 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
8788 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
8789 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
8790 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
8791 {
8792 q.f->open_array_section("scrubber.waiting_on_whom");
8793 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
8794 p != pg->scrubber.waiting_on_whom.end();
8795 ++p) {
8796 q.f->dump_stream("shard") << *p;
8797 }
8798 q.f->close_section();
8799 }
8800 q.f->close_section();
8801 }
8802
8803 q.f->close_section();
8804 return forward_event();
8805 }
8806
8807 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
8808 {
8809 PG *pg = context< RecoveryMachine >().pg;
8810 pg_t pgid = pg->info.pgid.pgid;
8811
8812 all_replicas_activated = true;
8813
8814 pg->state_clear(PG_STATE_ACTIVATING);
8815 pg->state_clear(PG_STATE_CREATING);
8816 pg->state_clear(PG_STATE_PREMERGE);
8817
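// a pending merge keeps the PG in PEERED+PREMERGE; if the acting set is not
// at the target pg size we also flag this merge source/target as not ready.
// short of min_size we likewise stay PEERED; otherwise the PG goes ACTIVE.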
8818 bool merge_target;
8819 if (pg->pool.info.is_pending_merge(pgid, &merge_target)) {
8820 pg->state_set(PG_STATE_PEERED);
8821 pg->state_set(PG_STATE_PREMERGE);
8822
8823 if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) {
8824 if (merge_target) {
8825 pg_t src = pgid;
8826 src.set_ps(pg->pool.info.get_pg_num_pending());
8827 ceph_assert(src.get_parent() == pgid);
8828 pg->osd->set_not_ready_to_merge_target(pgid, src);
8829 } else {
8830 pg->osd->set_not_ready_to_merge_source(pgid);
8831 }
8832 }
8833 } else if (pg->acting.size() < pg->pool.info.min_size) {
8834 pg->state_set(PG_STATE_PEERED);
8835 } else {
8836 pg->state_set(PG_STATE_ACTIVE);
8837 }
8838
8839 if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
8840 pg->osd->send_pg_created(pgid);
8841 }
8842
8843 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
8844 pg->info.history.last_interval_started = pg->info.last_interval_started;
8845 pg->dirty_info = true;
8846
8847 pg->share_pg_info();
8848 pg->publish_stats_to_osd();
8849
8850 pg->check_local();
8851
8852 // waiters
8853 if (pg->flushes_in_progress == 0) {
8854 pg->requeue_ops(pg->waiting_for_peered);
8855 } else if (!pg->waiting_for_peered.empty()) {
8856 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
8857 << pg->waiting_for_peered.size()
8858 << " items to waiting_for_flush"
8859 << dendl;
8860 ceph_assert(pg->waiting_for_flush.empty());
8861 pg->waiting_for_flush.swap(pg->waiting_for_peered);
8862 }
8863
8864 pg->on_activate();
8865
8866 return discard_event();
8867 }
8868
8869 void PG::RecoveryState::Active::exit()
8870 {
8871 context< RecoveryMachine >().log_exit(state_name, enter_time);
8872 PG *pg = context< RecoveryMachine >().pg;
8873 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8874
8875 pg->blocked_by.clear();
8876 pg->backfill_reserved = false;
8877 pg->backfill_reserving = false;
8878 pg->state_clear(PG_STATE_ACTIVATING);
8879 pg->state_clear(PG_STATE_DEGRADED);
8880 pg->state_clear(PG_STATE_UNDERSIZED);
8881 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
8882 pg->state_clear(PG_STATE_BACKFILL_WAIT);
8883 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8884 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8885 utime_t dur = ceph_clock_now() - enter_time;
8886 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
8887 pg->agent_stop();
8888 }
8889
8890 /*------ReplicaActive-----*/
8891 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
8892 : my_base(ctx),
8893 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
8894 {
8895 context< RecoveryMachine >().log_enter(state_name);
8896
8897 PG *pg = context< RecoveryMachine >().pg;
8898 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8899 }
8900
8901
8902 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8903 const Activate& actevt) {
8904 PG *pg = context< RecoveryMachine >().pg;
8905 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
8906 map<int, map<spg_t, pg_query_t> > query_map;
8907 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8908 actevt.activation_epoch,
8909 query_map, NULL, NULL);
8910 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8911 return discard_event();
8912 }
8913
8914 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
8915 {
8916 PG *pg = context< RecoveryMachine >().pg;
8917 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
8918 infoevt.info);
8919 return discard_event();
8920 }
8921
8922 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
8923 {
8924 PG *pg = context< RecoveryMachine >().pg;
8925 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
8926 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8927 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
8928 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8929
8930 return discard_event();
8931 }
8932
8933 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim)
8934 {
8935 PG *pg = context< RecoveryMachine >().pg;
8936 // primary is instructing us to trim
8937 pg->pg_log.trim(trim.trim_to, pg->info);
8938 pg->dirty_info = true;
8939 return discard_event();
8940 }
8941
8942 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
8943 {
8944 PG *pg = context< RecoveryMachine >().pg;
8945 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8946 context< RecoveryMachine >().send_notify(
8947 pg->get_primary(),
8948 pg_notify_t(
8949 pg->get_primary().shard, pg->pg_whoami.shard,
8950 pg->get_osdmap_epoch(),
8951 pg->get_osdmap_epoch(),
8952 pg->info),
8953 pg->past_intervals);
8954 }
8955 pg->take_waiters();
8956 return discard_event();
8957 }
8958
8959 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8960 const MQuery& query)
8961 {
8962 PG *pg = context< RecoveryMachine >().pg;
8963 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8964 return discard_event();
8965 }
8966
8967 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
8968 {
8969 q.f->open_object_section("state");
8970 q.f->dump_string("name", state_name);
8971 q.f->dump_stream("enter_time") << enter_time;
8972 q.f->close_section();
8973 return forward_event();
8974 }
8975
8976 void PG::RecoveryState::ReplicaActive::exit()
8977 {
8978 context< RecoveryMachine >().log_exit(state_name, enter_time);
8979 PG *pg = context< RecoveryMachine >().pg;
8980 pg->clear_reserved_num_bytes();
8981 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8982 utime_t dur = ceph_clock_now() - enter_time;
8983 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
8984 }
8985
8986 /*-------Stray---*/
8987 PG::RecoveryState::Stray::Stray(my_context ctx)
8988 : my_base(ctx),
8989 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
8990 {
8991 context< RecoveryMachine >().log_enter(state_name);
8992
8993 PG *pg = context< RecoveryMachine >().pg;
8994 ceph_assert(!pg->is_peered());
8995 ceph_assert(!pg->is_peering());
8996 ceph_assert(!pg->is_primary());
8997
8998 if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
8999 ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
9000 post_event(DeleteStart());
9001 } else {
9002 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9003 }
9004 }
9005
9006 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
9007 {
9008 PG *pg = context< RecoveryMachine >().pg;
9009 MOSDPGLog *msg = logevt.msg.get();
9010 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
9011
9012 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9013 if (msg->info.last_backfill == hobject_t()) {
9014 // restart backfill
9015 pg->info = msg->info;
9016 pg->on_info_history_change();
9017 pg->dirty_info = true;
9018 pg->dirty_big_info = true; // maybe.
9019
9020 PGLogEntryHandler rollbacker{pg, t};
9021 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
9022
9023 pg->pg_log.reset_backfill();
9024 } else {
9025 pg->merge_log(*t, msg->info, msg->log, logevt.from);
9026 }
9027
9028 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9029
9030 post_event(Activate(logevt.msg->info.last_epoch_started));
9031 return transit<ReplicaActive>();
9032 }
9033
9034 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
9035 {
9036 PG *pg = context< RecoveryMachine >().pg;
9037 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
9038
9039 if (pg->info.last_update > infoevt.info.last_update) {
9040 // rewind divergent log entries
9041 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9042 pg->rewind_divergent_log(*t, infoevt.info.last_update);
9043 pg->info.stats = infoevt.info.stats;
9044 pg->info.hit_set = infoevt.info.hit_set;
9045 }
9046
9047 ceph_assert(infoevt.info.last_update == pg->info.last_update);
9048 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9049
9050 post_event(Activate(infoevt.info.last_epoch_started));
9051 return transit<ReplicaActive>();
9052 }
9053
9054 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
9055 {
9056 PG *pg = context< RecoveryMachine >().pg;
9057 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
9058 return discard_event();
9059 }
9060
9061 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
9062 {
9063 PG *pg = context< RecoveryMachine >().pg;
9064 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
9065 context< RecoveryMachine >().send_notify(
9066 pg->get_primary(),
9067 pg_notify_t(
9068 pg->get_primary().shard, pg->pg_whoami.shard,
9069 pg->get_osdmap_epoch(),
9070 pg->get_osdmap_epoch(),
9071 pg->info),
9072 pg->past_intervals);
9073 }
9074 pg->take_waiters();
9075 return discard_event();
9076 }
9077
9078 void PG::RecoveryState::Stray::exit()
9079 {
9080 context< RecoveryMachine >().log_exit(state_name, enter_time);
9081 PG *pg = context< RecoveryMachine >().pg;
9082 utime_t dur = ceph_clock_now() - enter_time;
9083 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
9084 }
9085
9086
9087 /*--------ToDelete----------*/
9088 PG::RecoveryState::ToDelete::ToDelete(my_context ctx)
9089 : my_base(ctx),
9090 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete")
9091 {
9092 context< RecoveryMachine >().log_enter(state_name);
9093 PG *pg = context< RecoveryMachine >().pg;
9094 pg->osd->logger->inc(l_osd_pg_removing);
9095 }
9096
9097 void PG::RecoveryState::ToDelete::exit()
9098 {
9099 context< RecoveryMachine >().log_exit(state_name, enter_time);
9100 PG *pg = context< RecoveryMachine >().pg;
9101 // note: on a successful removal, this path doesn't execute. see
9102 // _delete_some().
9103 pg->osd->logger->dec(l_osd_pg_removing);
9104 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9105 }
9106
9107 /*----WaitDeleteReserved----*/
9108 PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
9109 : my_base(ctx),
9110 NamedState(context< RecoveryMachine >().pg,
9111 "Started/ToDelete/WaitDeleteReseved")
9112 {
9113 context< RecoveryMachine >().log_enter(state_name);
9114 PG *pg = context< RecoveryMachine >().pg;
9115 context<ToDelete>().priority = pg->get_delete_priority();
9116 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
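// request a local reservation for the delete work: DeleteReserved is queued
// when the slot is granted, DeleteInterrupted if the reservation is later
// preempted by higher-priority work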
9117 pg->osd->local_reserver.request_reservation(
9118 pg->info.pgid,
9119 new QueuePeeringEvt<DeleteReserved>(
9120 pg, pg->get_osdmap_epoch(),
9121 DeleteReserved()),
9122 context<ToDelete>().priority,
9123 new QueuePeeringEvt<DeleteInterrupted>(
9124 pg, pg->get_osdmap_epoch(),
9125 DeleteInterrupted()));
9126 }
9127
9128 boost::statechart::result PG::RecoveryState::ToDelete::react(
9129 const ActMap& evt)
9130 {
9131 PG *pg = context< RecoveryMachine >().pg;
9132 if (pg->get_delete_priority() != priority) {
9133 ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
9134 << dendl;
9135 return transit<ToDelete>();
9136 }
9137 return discard_event();
9138 }
9139
9140 void PG::RecoveryState::WaitDeleteReserved::exit()
9141 {
9142 context< RecoveryMachine >().log_exit(state_name, enter_time);
9143 }
9144
9145 /*----Deleting-----*/
9146 PG::RecoveryState::Deleting::Deleting(my_context ctx)
9147 : my_base(ctx),
9148 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting")
9149 {
9150 context< RecoveryMachine >().log_enter(state_name);
9151 PG *pg = context< RecoveryMachine >().pg;
9152 pg->deleting = true;
9153 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9154 pg->on_removal(t);
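// when this transaction commits, C_DeleteMore requeues the PG so that
// objects are removed in batches via DeleteSome events (see _delete_some())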
9155 t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch()));
9156 }
9157
9158 boost::statechart::result PG::RecoveryState::Deleting::react(
9159 const DeleteSome& evt)
9160 {
9161 PG *pg = context< RecoveryMachine >().pg;
9162 pg->_delete_some(context<RecoveryMachine>().get_cur_transaction());
9163 return discard_event();
9164 }
9165
9166 void PG::RecoveryState::Deleting::exit()
9167 {
9168 context< RecoveryMachine >().log_exit(state_name, enter_time);
9169 PG *pg = context< RecoveryMachine >().pg;
9170 pg->deleting = false;
9171 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9172 }
9173
9174 /*--------GetInfo---------*/
9175 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
9176 : my_base(ctx),
9177 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
9178 {
9179 context< RecoveryMachine >().log_enter(state_name);
9180
9181 PG *pg = context< RecoveryMachine >().pg;
9182 pg->check_past_interval_bounds();
9183 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9184
9185 ceph_assert(pg->blocked_by.empty());
9186
9187 prior_set = pg->build_prior();
9188
9189 pg->reset_min_peer_features();
9190 get_infos();
9191 if (prior_set.pg_down) {
9192 post_event(IsDown());
9193 } else if (peer_info_requested.empty()) {
9194 post_event(GotInfo());
9195 }
9196 }
9197
9198 void PG::RecoveryState::GetInfo::get_infos()
9199 {
9200 PG *pg = context< RecoveryMachine >().pg;
9201 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9202
9203 pg->blocked_by.clear();
9204 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
9205 it != prior_set.probe.end();
9206 ++it) {
9207 pg_shard_t peer = *it;
9208 if (peer == pg->pg_whoami) {
9209 continue;
9210 }
9211 if (pg->peer_info.count(peer)) {
9212 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
9213 continue;
9214 }
9215 if (peer_info_requested.count(peer)) {
9216 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
9217 pg->blocked_by.insert(peer.osd);
9218 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
9219 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
9220 } else {
9221 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
9222 context< RecoveryMachine >().send_query(
9223 peer, pg_query_t(pg_query_t::INFO,
9224 it->shard, pg->pg_whoami.shard,
9225 pg->info.history,
9226 pg->get_osdmap_epoch()));
9227 peer_info_requested.insert(peer);
9228 pg->blocked_by.insert(peer.osd);
9229 }
9230 }
9231
9232 pg->publish_stats_to_osd();
9233 }
9234
9235 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
9236 {
9237 PG *pg = context< RecoveryMachine >().pg;
9238
9239 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
9240 if (p != peer_info_requested.end()) {
9241 peer_info_requested.erase(p);
9242 pg->blocked_by.erase(infoevt.from.osd);
9243 }
9244
9245 epoch_t old_start = pg->info.history.last_epoch_started;
9246 if (pg->proc_replica_info(
9247 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
9248 // we got something new ...
9249 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9250 if (old_start < pg->info.history.last_epoch_started) {
9251 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
9252 prior_set = pg->build_prior();
9253
9254 // filter out any osds that got dropped from the probe set from
9255 // peer_info_requested. this is less expensive than restarting
9256 // peering (which would re-probe everyone).
9257 set<pg_shard_t>::iterator p = peer_info_requested.begin();
9258 while (p != peer_info_requested.end()) {
9259 if (prior_set.probe.count(*p) == 0) {
9260 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
9261 peer_info_requested.erase(p++);
9262 } else {
9263 ++p;
9264 }
9265 }
9266 get_infos();
9267 }
9268 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
9269 << hex << infoevt.features << dec << dendl;
9270 pg->apply_peer_features(infoevt.features);
9271
9272 // are we done getting everything?
9273 if (peer_info_requested.empty() && !prior_set.pg_down) {
9274 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
9275 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
9276 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
9277 post_event(GotInfo());
9278 }
9279 }
9280 return discard_event();
9281 }
9282
9283 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
9284 {
9285 PG *pg = context< RecoveryMachine >().pg;
9286 q.f->open_object_section("state");
9287 q.f->dump_string("name", state_name);
9288 q.f->dump_stream("enter_time") << enter_time;
9289
9290 q.f->open_array_section("requested_info_from");
9291 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
9292 p != peer_info_requested.end();
9293 ++p) {
9294 q.f->open_object_section("osd");
9295 q.f->dump_stream("osd") << *p;
9296 if (pg->peer_info.count(*p)) {
9297 q.f->open_object_section("got_info");
9298 pg->peer_info[*p].dump(q.f);
9299 q.f->close_section();
9300 }
9301 q.f->close_section();
9302 }
9303 q.f->close_section();
9304
9305 q.f->close_section();
9306 return forward_event();
9307 }
9308
9309 void PG::RecoveryState::GetInfo::exit()
9310 {
9311 context< RecoveryMachine >().log_exit(state_name, enter_time);
9312 PG *pg = context< RecoveryMachine >().pg;
9313 utime_t dur = ceph_clock_now() - enter_time;
9314 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
9315 pg->blocked_by.clear();
9316 }
9317
9318 /*------GetLog------------*/
9319 PG::RecoveryState::GetLog::GetLog(my_context ctx)
9320 : my_base(ctx),
9321 NamedState(
9322 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
9323 msg(0)
9324 {
9325 context< RecoveryMachine >().log_enter(state_name);
9326
9327 PG *pg = context< RecoveryMachine >().pg;
9328
9329 // adjust acting?
9330 if (!pg->choose_acting(auth_log_shard, false,
9331 &context< Peering >().history_les_bound)) {
9332 if (!pg->want_acting.empty()) {
9333 post_event(NeedActingChange());
9334 } else {
9335 post_event(IsIncomplete());
9336 }
9337 return;
9338 }
9339
9340 // am i the best?
9341 if (auth_log_shard == pg->pg_whoami) {
9342 post_event(GotLog());
9343 return;
9344 }
9345
9346 const pg_info_t& best = pg->peer_info[auth_log_shard];
9347
9348 // am i broken?
9349 if (pg->info.last_update < best.log_tail) {
9350 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
9351 post_event(IsIncomplete());
9352 return;
9353 }
9354
9355 // how much log to request?
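// start from our own last_update and pull request_log_from back to the
// oldest peer last_update that falls below our log tail but is still
// covered by the auth shard's log, so the fetched log reaches back far
// enough to bring those peers up to date later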
9356 eversion_t request_log_from = pg->info.last_update;
9357 ceph_assert(!pg->acting_recovery_backfill.empty());
9358 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
9359 p != pg->acting_recovery_backfill.end();
9360 ++p) {
9361 if (*p == pg->pg_whoami) continue;
9362 pg_info_t& ri = pg->peer_info[*p];
9363 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
9364 ri.last_update < request_log_from)
9365 request_log_from = ri.last_update;
9366 }
9367
9368 // how much?
9369 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
9370 context<RecoveryMachine>().send_query(
9371 auth_log_shard,
9372 pg_query_t(
9373 pg_query_t::LOG,
9374 auth_log_shard.shard, pg->pg_whoami.shard,
9375 request_log_from, pg->info.history,
9376 pg->get_osdmap_epoch()));
9377
9378 ceph_assert(pg->blocked_by.empty());
9379 pg->blocked_by.insert(auth_log_shard.osd);
9380 pg->publish_stats_to_osd();
9381 }
9382
9383 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
9384 {
9385 PG *pg = context< RecoveryMachine >().pg;
9386 // make sure our log source didn't go down. we need to check
9387 // explicitly because it may not be part of the prior set, which
9388 // means the Peering state check won't catch it going down.
9389 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
9390 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
9391 << auth_log_shard.osd << " went down" << dendl;
9392 post_event(advmap);
9393 return transit< Reset >();
9394 }
9395
9396 // let the Peering state do its checks.
9397 return forward_event();
9398 }
9399
9400 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
9401 {
9402 PG *pg = context< RecoveryMachine >().pg;
9403 ceph_assert(!msg);
9404 if (logevt.from != auth_log_shard) {
9405 ldout(pg->cct, 10) << "GetLog: discarding log from "
9406 << "non-auth_log_shard osd." << logevt.from << dendl;
9407 return discard_event();
9408 }
9409 ldout(pg->cct, 10) << "GetLog: received master log from osd."
9410 << logevt.from << dendl;
9411 msg = logevt.msg;
9412 post_event(GotLog());
9413 return discard_event();
9414 }
9415
9416 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
9417 {
9418 PG *pg = context< RecoveryMachine >().pg;
9419 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
9420 if (msg) {
9421 ldout(pg->cct, 10) << "processing master log" << dendl;
9422 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
9423 msg->info, msg->log, msg->missing,
9424 auth_log_shard);
9425 }
9426 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9427 return transit< GetMissing >();
9428 }
9429
9430 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
9431 {
9432 q.f->open_object_section("state");
9433 q.f->dump_string("name", state_name);
9434 q.f->dump_stream("enter_time") << enter_time;
9435 q.f->dump_stream("auth_log_shard") << auth_log_shard;
9436 q.f->close_section();
9437 return forward_event();
9438 }
9439
9440 void PG::RecoveryState::GetLog::exit()
9441 {
9442 context< RecoveryMachine >().log_exit(state_name, enter_time);
9443 PG *pg = context< RecoveryMachine >().pg;
9444 utime_t dur = ceph_clock_now() - enter_time;
9445 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
9446 pg->blocked_by.clear();
9447 }
9448
9449 /*------WaitActingChange--------*/
9450 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
9451 : my_base(ctx),
9452 NamedState(context< RecoveryMachine >().pg, "Started/Primary/WaitActingChange")
9453 {
9454 context< RecoveryMachine >().log_enter(state_name);
9455 }
9456
9457 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
9458 {
9459 PG *pg = context< RecoveryMachine >().pg;
9460 OSDMapRef osdmap = advmap.osdmap;
9461
9462 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets went down" << dendl;
9463 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
9464 if (!osdmap->is_up(*p)) {
9465 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
9466 post_event(advmap);
9467 return transit< Reset >();
9468 }
9469 }
9470 return forward_event();
9471 }
9472
9473 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
9474 {
9475 PG *pg = context< RecoveryMachine >().pg;
9476 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
9477 return discard_event();
9478 }
9479
9480 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
9481 {
9482 PG *pg = context< RecoveryMachine >().pg;
9483 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
9484 return discard_event();
9485 }
9486
9487 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
9488 {
9489 PG *pg = context< RecoveryMachine >().pg;
9490 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
9491 return discard_event();
9492 }
9493
9494 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
9495 {
9496 q.f->open_object_section("state");
9497 q.f->dump_string("name", state_name);
9498 q.f->dump_stream("enter_time") << enter_time;
9499 q.f->dump_string("comment", "waiting for pg acting set to change");
9500 q.f->close_section();
9501 return forward_event();
9502 }
9503
9504 void PG::RecoveryState::WaitActingChange::exit()
9505 {
9506 context< RecoveryMachine >().log_exit(state_name, enter_time);
9507 PG *pg = context< RecoveryMachine >().pg;
9508 utime_t dur = ceph_clock_now() - enter_time;
9509 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
9510 }
9511
9512 /*------Down--------*/
9513 PG::RecoveryState::Down::Down(my_context ctx)
9514 : my_base(ctx),
9515 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
9516 {
9517 context< RecoveryMachine >().log_enter(state_name);
9518 PG *pg = context< RecoveryMachine >().pg;
9519
9520 pg->state_clear(PG_STATE_PEERING);
9521 pg->state_set(PG_STATE_DOWN);
9522
9523 auto &prior_set = context< Peering >().prior_set;
9524 ceph_assert(pg->blocked_by.empty());
9525 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9526 pg->publish_stats_to_osd();
9527 }
9528
9529 void PG::RecoveryState::Down::exit()
9530 {
9531 context< RecoveryMachine >().log_exit(state_name, enter_time);
9532 PG *pg = context< RecoveryMachine >().pg;
9533
9534 pg->state_clear(PG_STATE_DOWN);
9535 utime_t dur = ceph_clock_now() - enter_time;
9536 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
9537
9538 pg->blocked_by.clear();
9539 }
9540
9541 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
9542 {
9543 q.f->open_object_section("state");
9544 q.f->dump_string("name", state_name);
9545 q.f->dump_stream("enter_time") << enter_time;
9546 q.f->dump_string("comment",
9547 "not enough up instances of this PG to go active");
9548 q.f->close_section();
9549 return forward_event();
9550 }
9551
9552 boost::statechart::result PG::RecoveryState::Down::react(const MNotifyRec& infoevt)
9553 {
9554 PG *pg = context< RecoveryMachine >().pg;
9555
9556 ceph_assert(pg->is_primary());
9557 epoch_t old_start = pg->info.history.last_epoch_started;
9558 if (!pg->peer_info.count(infoevt.from) &&
9559 pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
9560 pg->update_history(infoevt.notify.info.history);
9561 }
9562 // if we got something new to make pg escape down state
9563 if (pg->info.history.last_epoch_started > old_start) {
9564 ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
9565 pg->state_clear(PG_STATE_DOWN);
9566 pg->state_set(PG_STATE_PEERING);
9567 return transit< GetInfo >();
9568 }
9569
9570 return discard_event();
9571 }
9572
9573
9574 /*------Incomplete--------*/
9575 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
9576 : my_base(ctx),
9577 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
9578 {
9579 context< RecoveryMachine >().log_enter(state_name);
9580 PG *pg = context< RecoveryMachine >().pg;
9581
9582 pg->state_clear(PG_STATE_PEERING);
9583 pg->state_set(PG_STATE_INCOMPLETE);
9584
9585 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9586 ceph_assert(pg->blocked_by.empty());
9587 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9588 pg->publish_stats_to_osd();
9589 }
9590
9591 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
9592 PG *pg = context< RecoveryMachine >().pg;
9593 int64_t poolnum = pg->info.pgid.pool();
9594
9595 // Reset if min_size turned smaller than the previous value; the pg might now be able to go active
9596 if (!advmap.osdmap->have_pg_pool(poolnum) ||
9597 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
9598 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
9599 post_event(advmap);
9600 return transit< Reset >();
9601 }
9602
9603 return forward_event();
9604 }
9605
9606 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
9607 PG *pg = context< RecoveryMachine >().pg;
9608 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
9609 if (pg->proc_replica_info(
9610 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
9611 // We got something new, try again!
9612 return transit< GetLog >();
9613 } else {
9614 return discard_event();
9615 }
9616 }
9617
9618 boost::statechart::result PG::RecoveryState::Incomplete::react(
9619 const QueryState& q)
9620 {
9621 q.f->open_object_section("state");
9622 q.f->dump_string("name", state_name);
9623 q.f->dump_stream("enter_time") << enter_time;
9624 q.f->dump_string("comment", "not enough complete instances of this PG");
9625 q.f->close_section();
9626 return forward_event();
9627 }
9628
9629 void PG::RecoveryState::Incomplete::exit()
9630 {
9631 context< RecoveryMachine >().log_exit(state_name, enter_time);
9632 PG *pg = context< RecoveryMachine >().pg;
9633
9634 pg->state_clear(PG_STATE_INCOMPLETE);
9635 utime_t dur = ceph_clock_now() - enter_time;
9636 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
9637
9638 pg->blocked_by.clear();
9639 }
9640
9641 /*------GetMissing--------*/
9642 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
9643 : my_base(ctx),
9644 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
9645 {
9646 context< RecoveryMachine >().log_enter(state_name);
9647
9648 PG *pg = context< RecoveryMachine >().pg;
9649 ceph_assert(!pg->acting_recovery_backfill.empty());
9650 eversion_t since;
9651 for (set<pg_shard_t>::iterator i = pg->acting_recovery_backfill.begin();
9652 i != pg->acting_recovery_backfill.end();
9653 ++i) {
9654 if (*i == pg->get_primary()) continue;
9655 const pg_info_t& pi = pg->peer_info[*i];
9656 // reset this to make sure the pg_missing_t is initialized and
9657 // has the correct semantics even if we don't need to get a
9658 // missing set from a shard. This way later additions due to
9659 // lost+unfound delete work properly.
9660 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
9661
9662 if (pi.is_empty())
9663 continue; // no pg data, nothing divergent
9664
9665 if (pi.last_update < pg->pg_log.get_tail()) {
9666 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
9667 pg->peer_missing[*i].clear();
9668 continue;
9669 }
9670 if (pi.last_backfill == hobject_t()) {
9671 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
9672 pg->peer_missing[*i].clear();
9673 continue;
9674 }
9675
9676 if (pi.last_update == pi.last_complete && // peer has no missing
9677 pi.last_update == pg->info.last_update) { // peer is up to date
9678 // replica has no missing and identical log as us. no need to
9679 // pull anything.
9680 // FIXME: we can do better here. if last_update==last_complete we
9681 // can infer the rest!
9682 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
9683 pg->peer_missing[*i].clear();
9684 continue;
9685 }
9686
9687 // We pull the log from the peer's last_epoch_started to ensure we
9688 // get enough log to detect divergent updates.
9689 since.epoch = pi.last_epoch_started;
9690 ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
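// an incremental LOG query suffices if the peer's log reaches back to
// 'since'; otherwise we must ask for the peer's FULLLOG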
9691 if (pi.log_tail <= since) {
9692 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
9693 context< RecoveryMachine >().send_query(
9694 *i,
9695 pg_query_t(
9696 pg_query_t::LOG,
9697 i->shard, pg->pg_whoami.shard,
9698 since, pg->info.history,
9699 pg->get_osdmap_epoch()));
9700 } else {
9701 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
9702 << " (want since " << since << " < log.tail "
9703 << pi.log_tail << ")" << dendl;
9704 context< RecoveryMachine >().send_query(
9705 *i, pg_query_t(
9706 pg_query_t::FULLLOG,
9707 i->shard, pg->pg_whoami.shard,
9708 pg->info.history, pg->get_osdmap_epoch()));
9709 }
9710 peer_missing_requested.insert(*i);
9711 pg->blocked_by.insert(i->osd);
9712 }
9713
9714 if (peer_missing_requested.empty()) {
9715 if (pg->need_up_thru) {
9716 ldout(pg->cct, 10) << " still need up_thru update before going active"
9717 << dendl;
9718 post_event(NeedUpThru());
9719 return;
9720 }
9721
9722 // all good!
9723 post_event(Activate(pg->get_osdmap_epoch()));
9724 } else {
9725 pg->publish_stats_to_osd();
9726 }
9727 }
9728
9729 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
9730 {
9731 PG *pg = context< RecoveryMachine >().pg;
9732
9733 peer_missing_requested.erase(logevt.from);
9734 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
9735
9736 if (peer_missing_requested.empty()) {
9737 if (pg->need_up_thru) {
9738 ldout(pg->cct, 10) << " still need up_thru update before going active"
9739 << dendl;
9740 post_event(NeedUpThru());
9741 } else {
9742 ldout(pg->cct, 10) << "Got last missing, don't need any more; "
9743 << "posting Activate" << dendl;
9744 post_event(Activate(pg->get_osdmap_epoch()));
9745 }
9746 }
9747 return discard_event();
9748 }
9749
9750 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
9751 {
9752 PG *pg = context< RecoveryMachine >().pg;
9753 q.f->open_object_section("state");
9754 q.f->dump_string("name", state_name);
9755 q.f->dump_stream("enter_time") << enter_time;
9756
9757 q.f->open_array_section("peer_missing_requested");
9758 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
9759 p != peer_missing_requested.end();
9760 ++p) {
9761 q.f->open_object_section("osd");
9762 q.f->dump_stream("osd") << *p;
9763 if (pg->peer_missing.count(*p)) {
9764 q.f->open_object_section("got_missing");
9765 pg->peer_missing[*p].dump(q.f);
9766 q.f->close_section();
9767 }
9768 q.f->close_section();
9769 }
9770 q.f->close_section();
9771
9772 q.f->close_section();
9773 return forward_event();
9774 }
9775
9776 void PG::RecoveryState::GetMissing::exit()
9777 {
9778 context< RecoveryMachine >().log_exit(state_name, enter_time);
9779 PG *pg = context< RecoveryMachine >().pg;
9780 utime_t dur = ceph_clock_now() - enter_time;
9781 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
9782 pg->blocked_by.clear();
9783 }
9784
9785 /*------WaitUpThru--------*/
9786 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
9787 : my_base(ctx),
9788 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
9789 {
9790 context< RecoveryMachine >().log_enter(state_name);
9791 }
9792
9793 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
9794 {
9795 PG *pg = context< RecoveryMachine >().pg;
9796 if (!pg->need_up_thru) {
9797 post_event(Activate(pg->get_osdmap_epoch()));
9798 }
9799 return forward_event();
9800 }
9801
9802 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
9803 {
9804 PG *pg = context< RecoveryMachine >().pg;
9805 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
9806 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
9807 pg->peer_info[logevt.from] = logevt.msg->info;
9808 return discard_event();
9809 }
9810
9811 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
9812 {
9813 q.f->open_object_section("state");
9814 q.f->dump_string("name", state_name);
9815 q.f->dump_stream("enter_time") << enter_time;
9816 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
9817 q.f->close_section();
9818 return forward_event();
9819 }
9820
9821 void PG::RecoveryState::WaitUpThru::exit()
9822 {
9823 context< RecoveryMachine >().log_exit(state_name, enter_time);
9824 PG *pg = context< RecoveryMachine >().pg;
9825 utime_t dur = ceph_clock_now() - enter_time;
9826 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
9827 }
9828
9829 /*----RecoveryState::RecoveryMachine Methods-----*/
9830 #undef dout_prefix
9831 #define dout_prefix pg->gen_prefix(*_dout)
9832
9833 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
9834 {
9835 PG *pg = context< RecoveryMachine >().pg;
9836 ldout(pg->cct, 5) << "enter " << state_name << dendl;
9837 pg->osd->pg_recovery_stats.log_enter(state_name);
9838 }
9839
9840 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
9841 {
9842 utime_t dur = ceph_clock_now() - enter_time;
9843 PG *pg = context< RecoveryMachine >().pg;
9844 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
9845 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
9846 event_count, event_time);
9847 event_count = 0;
9848 event_time = utime_t();
9849 }
9850
9851
9852 /*---------------------------------------------------*/
9853 #undef dout_prefix
9854 #define dout_prefix ((debug_pg ? debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
9855
9856 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
9857 ceph_assert(!rctx);
9858 ceph_assert(!orig_ctx);
9859 orig_ctx = new_ctx;
9860 if (new_ctx) {
9861 if (messages_pending_flush) {
9862 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
9863 } else {
9864 rctx = *new_ctx;
9865 }
9866 rctx->start_time = ceph_clock_now();
9867 }
9868 }
9869
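// begin_block_outgoing() redirects outgoing recovery messages (notifies,
// queries, infos) into messages_pending_flush; end_block_outgoing() hands
// the buffered messages back to the caller's RecoveryCtx for delivery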
9870 void PG::RecoveryState::begin_block_outgoing() {
9871 ceph_assert(!messages_pending_flush);
9872 ceph_assert(orig_ctx);
9873 ceph_assert(rctx);
9874 messages_pending_flush = BufferedRecoveryMessages();
9875 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
9876 }
9877
9878 void PG::RecoveryState::clear_blocked_outgoing() {
9879 ceph_assert(orig_ctx);
9880 ceph_assert(rctx);
9881 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9882 }
9883
9884 void PG::RecoveryState::end_block_outgoing() {
9885 ceph_assert(messages_pending_flush);
9886 ceph_assert(orig_ctx);
9887 ceph_assert(rctx);
9888
9889 rctx = RecoveryCtx(*orig_ctx);
9890 rctx->accept_buffered_messages(*messages_pending_flush);
9891 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9892 }
9893
9894 void PG::RecoveryState::end_handle() {
9895 if (rctx) {
9896 utime_t dur = ceph_clock_now() - rctx->start_time;
9897 machine.event_time += dur;
9898 }
9899
9900 machine.event_count++;
9901 rctx = boost::optional<RecoveryCtx>();
9902 orig_ctx = NULL;
9903 }
9904
9905 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
9906 {
9907 out << "BackfillInfo(" << bi.begin << "-" << bi.end
9908 << " " << bi.objects.size() << " objects";
9909 if (!bi.objects.empty())
9910 out << " " << bi.objects;
9911 out << ")";
9912 return out;
9913 }
9914
9915 void PG::dump_pgstate_history(Formatter *f)
9916 {
9917 lock();
9918 pgstate_history.dump(f);
9919 unlock();
9920 }
9921
9922 void PG::dump_missing(Formatter *f)
9923 {
9924 for (auto& i : pg_log.get_missing().get_items()) {
9925 f->open_object_section("object");
9926 f->dump_object("oid", i.first);
9927 f->dump_object("missing_info", i.second);
9928 if (missing_loc.needs_recovery(i.first)) {
9929 f->dump_bool("unfound", missing_loc.is_unfound(i.first));
9930 f->open_array_section("locations");
9931 for (auto l : missing_loc.get_locations(i.first)) {
9932 f->dump_object("shard", l);
9933 }
9934 f->close_section();
9935 }
9936 f->close_section();
9937 }
9938 }
9939
9940 void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
9941 {
9942 pg_stats_publish_lock.Lock();
9943 if (pg_stats_publish_valid) {
9944 f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
9945 }
9946 pg_stats_publish_lock.Unlock();
9947 }
9948
9949 void PG::with_heartbeat_peers(std::function<void(int)> f)
9950 {
9951 heartbeat_peer_lock.Lock();
9952 for (auto p : heartbeat_peers) {
9953 f(p);
9954 }
9955 for (auto p : probe_targets) {
9956 f(p);
9957 }
9958 heartbeat_peer_lock.Unlock();
9959 }