1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDRepOp.h"
54 #include "messages/MOSDRepOpReply.h"
55 #include "messages/MOSDRepScrubMap.h"
56 #include "messages/MOSDPGRecoveryDelete.h"
57 #include "messages/MOSDPGRecoveryDeleteReply.h"
58
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
61
62 #ifdef WITH_LTTNG
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
68 #else
69 #define tracepoint(...)
70 #endif
71
72 #include <sstream>
73
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
76 #undef dout_prefix
77 #define dout_prefix _prefix(_dout, this)
78
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80 // easily skip them
81 const string infover_key("_infover");
82 const string info_key("_info");
83 const string biginfo_key("_biginfo");
84 const string epoch_key("_epoch");
85 const string fastinfo_key("_fastinfo");
86
87 template <class T>
88 static ostream& _prefix(std::ostream *_dout, T *t)
89 {
90 return t->gen_prefix(*_dout);
91 }
92
93 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
94 {
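  // enter() may be called from a state-machine constructor before it is safe
  // to take the PG lock, so when no PGStateInstance is active yet the state
  // is stashed in tmppi; exit() later moves it into the history buffer while
  // holding the PG lock.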
95 // Ignore trimming state machine for now
96 if (::strstr(state, "Trimming") != NULL) {
97 return;
98 } else if (pi != nullptr) {
99 pi->enter_state(entime, state);
100 } else {
101 // Store current state since we can't reliably take the PG lock here
102 if ( tmppi == nullptr) {
103 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
104 }
105
106 thispg = pg;
107 tmppi->enter_state(entime, state);
108 }
109 }
110
111 void PGStateHistory::exit(const char* state) {
112 // Ignore trimming state machine for now
113 // Do nothing if PG is being destroyed!
114 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
115 return;
116 } else {
117 bool ilocked = false;
118 if(!thispg->is_locked()) {
119 thispg->lock();
120 ilocked = true;
121 }
122 if (pi == nullptr) {
123 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
124 pi = buffer.back().get();
125 pi->setepoch(thispg->get_osdmap_epoch());
126 }
127
128 pi->exit_state(ceph_clock_now());
129 if (::strcmp(state, "Reset") == 0) {
130 this->reset();
131 }
132 if(ilocked) {
133 thispg->unlock();
134 }
135 }
136 }
137
138 void PGStateHistory::dump(Formatter* f) const {
139 f->open_array_section("history");
140 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
141 f->open_object_section("states");
142 f->dump_stream("epoch") << (*pi)->this_epoch;
143 for (auto she : (*pi)->state_history) {
144 f->dump_string("state", std::get<2>(she));
145 f->dump_stream("enter") << std::get<0>(she);
146 f->dump_stream("exit") << std::get<1>(she);
147 }
148 f->close_section();
149 }
150 f->close_section();
151 }
152
153 void PG::get(const char* tag)
154 {
155 int after = ++ref;
156 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
157 << "tag " << (tag ? tag : "(none)") << " "
158 << (after - 1) << " -> " << after << dendl;
159 #ifdef PG_DEBUG_REFS
160 std::lock_guard l(_ref_id_lock);
161 _tag_counts[tag]++;
162 #endif
163 }
164
165 void PG::put(const char* tag)
166 {
167 #ifdef PG_DEBUG_REFS
168 {
169 std::lock_guard l(_ref_id_lock);
170 auto tag_counts_entry = _tag_counts.find(tag);
171 ceph_assert(tag_counts_entry != _tag_counts.end());
172 --tag_counts_entry->second;
173 if (tag_counts_entry->second == 0) {
174 _tag_counts.erase(tag_counts_entry);
175 }
176 }
177 #endif
178 auto local_cct = cct;
179 int after = --ref;
180 lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
181 << "tag " << (tag ? tag : "(none)") << " "
182 << (after + 1) << " -> " << after
183 << dendl;
184 if (after == 0)
185 delete this;
186 }
187
188 #ifdef PG_DEBUG_REFS
189 uint64_t PG::get_with_id()
190 {
191 ref++;
192 std::lock_guard l(_ref_id_lock);
193 uint64_t id = ++_ref_id;
194 BackTrace bt(0);
195 stringstream ss;
196 bt.print(ss);
197 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
198 << " got id " << id << " "
199 << (ref - 1) << " -> " << ref
200 << dendl;
201 ceph_assert(!_live_ids.count(id));
202 _live_ids.insert(make_pair(id, ss.str()));
203 return id;
204 }
205
206 void PG::put_with_id(uint64_t id)
207 {
208 int newref = --ref;
209 lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
210 << " put id " << id << " "
211 << (newref + 1) << " -> " << newref
212 << dendl;
213 {
214 std::lock_guard l(_ref_id_lock);
215 ceph_assert(_live_ids.count(id));
216 _live_ids.erase(id);
217 }
218 if (!newref)
219 delete this;
220 }
221
222 void PG::dump_live_ids()
223 {
224 std::lock_guard l(_ref_id_lock);
225 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
226 for (map<uint64_t, string>::iterator i = _live_ids.begin();
227 i != _live_ids.end();
228 ++i) {
229 dout(0) << "\t\tid: " << *i << dendl;
230 }
231 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
232 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
233 i != _tag_counts.end();
234 ++i) {
235 dout(0) << "\t\tid: " << *i << dendl;
236 }
237 }
238 #endif
239
240
241 void PGPool::update(CephContext *cct, OSDMapRef map)
242 {
243 const pg_pool_t *pi = map->get_pg_pool(id);
244 if (!pi) {
245 return; // pool has been deleted
246 }
247 info = *pi;
248 name = map->get_pool_name(id);
249
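  // the cached removed-snaps info only needs refreshing if we skipped one or
  // more maps (non-consecutive epoch) or this map changed the pool's snaps
  // (snap_epoch == this map's epoch)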
250 bool updated = false;
251 if ((map->get_epoch() != cached_epoch + 1) ||
252 (pi->get_snap_epoch() == map->get_epoch())) {
253 updated = true;
254 }
255
256 if (map->require_osd_release >= CEPH_RELEASE_MIMIC) {
257 // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps
258 // in the pg_info_t, with deltas for both in each OSDMap. we don't
259 // need to (and can't) track it here.
260 cached_removed_snaps.clear();
261 newly_removed_snaps.clear();
262 } else {
263 // legacy (<= luminous) removed_snaps tracking
264 if (updated) {
265 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
266 pi->build_removed_snaps(newly_removed_snaps);
267 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
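        // normal case: the removed-snaps set only ever grows, so the new set
        // is a superset of the cached one.  e.g. cached {1,3} and new {1,3,5}
        // leaves newly_removed_snaps = {5} and cached_removed_snaps = {1,3,5}.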
268 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
269 newly_removed_snaps.subtract(cached_removed_snaps);
270 cached_removed_snaps.swap(removed_snaps);
271 } else {
272 lgeneric_subdout(cct, osd, 0) << __func__
273 << " cached_removed_snaps shrank from " << cached_removed_snaps
274 << " to " << newly_removed_snaps << dendl;
275 cached_removed_snaps.swap(newly_removed_snaps);
276 newly_removed_snaps.clear();
277 }
278 } else {
279 newly_removed_snaps.clear();
280 }
281 } else {
282 /* 1) map->get_epoch() == cached_epoch + 1 &&
283 * 2) pi->get_snap_epoch() != map->get_epoch()
284 *
285 * From the if branch, 1 && 2 must be true. From 2, we know that
286 * this map didn't change the set of removed snaps. From 1, we
287 * know that our cached_removed_snaps matches the previous map.
288 * Thus, from 1 && 2, cached_removed snaps matches the current
289 * set of removed snaps and all we have to do is clear
290 * newly_removed_snaps.
291 */
292 newly_removed_snaps.clear();
293 }
294 lgeneric_subdout(cct, osd, 20)
295 << "PGPool::update cached_removed_snaps "
296 << cached_removed_snaps
297 << " newly_removed_snaps "
298 << newly_removed_snaps
299 << " snapc " << snapc
300 << (updated ? " (updated)":" (no change)")
301 << dendl;
302 if (cct->_conf->osd_debug_verify_cached_snaps) {
303 interval_set<snapid_t> actual_removed_snaps;
304 pi->build_removed_snaps(actual_removed_snaps);
305 if (!(actual_removed_snaps == cached_removed_snaps)) {
306 lgeneric_derr(cct) << __func__
307 << ": mismatch between the actual removed snaps "
308 << actual_removed_snaps
309 << " and pool.cached_removed_snaps "
310 << cached_removed_snaps
311 << dendl;
312 }
313 ceph_assert(actual_removed_snaps == cached_removed_snaps);
314 }
315 }
316 if (info.is_pool_snaps_mode() && updated) {
317 snapc = pi->get_snap_context();
318 }
319 cached_epoch = map->get_epoch();
320 }
321
322 PG::PG(OSDService *o, OSDMapRef curmap,
323 const PGPool &_pool, spg_t p) :
324 pg_id(p),
325 coll(p),
326 osd(o),
327 cct(o->cct),
328 osdmap_ref(curmap),
329 pool(_pool),
330 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
331 snap_mapper(
332 cct,
333 &osdriver,
334 p.ps(),
335 p.get_split_bits(_pool.info.get_pg_num()),
336 _pool.id,
337 p.shard),
338 last_persisted_osdmap(curmap->get_epoch()),
339 deleting(false),
340 trace_endpoint("0.0.0.0", 0, "PG"),
341 dirty_info(false), dirty_big_info(false),
342 info(p),
343 info_struct_v(0),
344 pg_log(cct),
345 pgmeta_oid(p.make_pgmeta_oid()),
346 missing_loc(this),
347 stat_queue_item(this),
348 scrub_queued(false),
349 recovery_queued(false),
350 recovery_ops_active(0),
351 role(-1),
352 state(0),
353 send_notify(false),
354 pg_whoami(osd->whoami, p.shard),
355 need_up_thru(false),
356 last_peering_reset(0),
357 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
358 backfill_reserved(false),
359 backfill_reserving(false),
360 flushes_in_progress(0),
361 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
362 pg_stats_publish_valid(false),
363 finish_sync_event(NULL),
364 backoff_lock("PG::backoff_lock"),
365 scrub_after_recovery(false),
366 active_pushes(0),
367 recovery_state(this),
368 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
369 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
370 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
371 last_epoch(0),
372 last_require_osd_release(curmap->require_osd_release)
373 {
374 #ifdef PG_DEBUG_REFS
375 osd->add_pgid(p, this);
376 #endif
377 #ifdef WITH_BLKIN
378 std::stringstream ss;
379 ss << "PG " << info.pgid;
380 trace_endpoint.copy_name(ss.str());
381 #endif
382 }
383
384 PG::~PG()
385 {
386 pgstate_history.set_pg_in_destructor();
387 #ifdef PG_DEBUG_REFS
388 osd->remove_pgid(info.pgid, this);
389 #endif
390 }
391
392 void PG::lock(bool no_lockdep) const
393 {
394 _lock.Lock(no_lockdep);
395 // if we have unrecorded dirty state with the lock dropped, there is a bug
396 ceph_assert(!dirty_info);
397 ceph_assert(!dirty_big_info);
398
399 dout(30) << "lock" << dendl;
400 }
401
402 std::ostream& PG::gen_prefix(std::ostream& out) const
403 {
404 OSDMapRef mapref = osdmap_ref;
405 if (_lock.is_locked_by_me()) {
406 out << "osd." << osd->whoami
407 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
408 << " " << *this << " ";
409 } else {
410 out << "osd." << osd->whoami
411 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
412 << " pg[" << info.pgid << "(unlocked)] ";
413 }
414 return out;
415 }
416
417 /********* PG **********/
418
419 void PG::proc_master_log(
420 ObjectStore::Transaction& t, pg_info_t &oinfo,
421 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
422 {
423 dout(10) << "proc_master_log for osd." << from << ": "
424 << olog << " " << omissing << dendl;
425 ceph_assert(!is_peered() && is_primary());
426
427 // merge log into our own log to build master log. no need to
428 // make any adjustments to their missing map; we are taking their
429 // log to be authoritative (i.e., their entries are by definition
430 // non-divergent).
431 merge_log(t, oinfo, olog, from);
432 peer_info[from] = oinfo;
433 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
434 might_have_unfound.insert(from);
435
436 // See doc/dev/osd_internals/last_epoch_started
437 if (oinfo.last_epoch_started > info.last_epoch_started) {
438 info.last_epoch_started = oinfo.last_epoch_started;
439 dirty_info = true;
440 }
441 if (oinfo.last_interval_started > info.last_interval_started) {
442 info.last_interval_started = oinfo.last_interval_started;
443 dirty_info = true;
444 }
445 update_history(oinfo.history);
446 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
447 info.last_epoch_started >= info.history.last_epoch_started);
448
449 peer_missing[from].claim(omissing);
450 }
451
452 void PG::proc_replica_log(
453 pg_info_t &oinfo,
454 const pg_log_t &olog,
455 pg_missing_t& omissing,
456 pg_shard_t from)
457 {
458 dout(10) << "proc_replica_log for osd." << from << ": "
459 << oinfo << " " << olog << " " << omissing << dendl;
460
461 pg_log.proc_replica_log(oinfo, olog, omissing, from);
462
463 peer_info[from] = oinfo;
464 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
465 might_have_unfound.insert(from);
466
467 for (map<hobject_t, pg_missing_item>::const_iterator i =
468 omissing.get_items().begin();
469 i != omissing.get_items().end();
470 ++i) {
471 dout(20) << " after missing " << i->first << " need " << i->second.need
472 << " have " << i->second.have << dendl;
473 }
474 peer_missing[from].claim(omissing);
475 }
476
477 bool PG::proc_replica_info(
478 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
479 {
480 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
481 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
482 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
483 return false;
484 }
485
486 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
487 dout(10) << " got info " << oinfo << " from down osd." << from
488 << " discarding" << dendl;
489 return false;
490 }
491
492 dout(10) << " got osd." << from << " " << oinfo << dendl;
493 ceph_assert(is_primary());
494 peer_info[from] = oinfo;
495 might_have_unfound.insert(from);
496
497 update_history(oinfo.history);
498
499 // stray?
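  // a peer that is in neither up nor acting holds data this PG no longer
  // needs on it; remember it so it can be purged once the PG is clean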
500 if (!is_up(from) && !is_acting(from)) {
501 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
502 stray_set.insert(from);
503 if (is_clean()) {
504 purge_strays();
505 }
506 }
507
508 // was this a new info? if so, update peers!
509 if (p == peer_info.end())
510 update_heartbeat_peers();
511
512 return true;
513 }
514
515 void PG::remove_snap_mapped_object(
516 ObjectStore::Transaction &t, const hobject_t &soid)
517 {
518 t.remove(
519 coll,
520 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
521 clear_object_snap_mapping(&t, soid);
522 }
523
524 void PG::clear_object_snap_mapping(
525 ObjectStore::Transaction *t, const hobject_t &soid)
526 {
527 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
528 if (soid.snap < CEPH_MAXSNAP) {
529 int r = snap_mapper.remove_oid(
530 soid,
531 &_t);
532 if (!(r == 0 || r == -ENOENT)) {
533 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
534 ceph_abort();
535 }
536 }
537 }
538
539 void PG::update_object_snap_mapping(
540 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
541 {
542 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
543 ceph_assert(soid.snap < CEPH_MAXSNAP);
544 int r = snap_mapper.remove_oid(
545 soid,
546 &_t);
547 if (!(r == 0 || r == -ENOENT)) {
548 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
549 ceph_abort();
550 }
551 snap_mapper.add_oid(
552 soid,
553 snaps,
554 &_t);
555 }
556
557 void PG::merge_log(
558 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
559 {
560 PGLogEntryHandler rollbacker{this, &t};
561 pg_log.merge_log(
562 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
563 }
564
565 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
566 {
567 PGLogEntryHandler rollbacker{this, &t};
568 pg_log.rewind_divergent_log(
569 newhead, info, &rollbacker, dirty_info, dirty_big_info);
570 }
571
572 /*
573 * Process information from a replica to determine if it could have any
574 * objects that i need.
575 *
576 * TODO: if the missing set becomes very large, this could get expensive.
577 * Instead, we probably want to just iterate over our unfound set.
578 */
579 bool PG::search_for_missing(
580 const pg_info_t &oinfo, const pg_missing_t &omissing,
581 pg_shard_t from,
582 RecoveryCtx *ctx)
583 {
584 uint64_t num_unfound_before = missing_loc.num_unfound();
585 bool found_missing = missing_loc.add_source_info(
586 from, oinfo, omissing, ctx->handle);
587 if (found_missing && num_unfound_before != missing_loc.num_unfound())
588 publish_stats_to_osd();
589 // avoid doing this if the peer is empty. This is a bit of paranoia
590 // to avoid doing something rash if add_source_info() above
591 // incorrectly decided we found something new. (if the peer has
592 // last_update=0'0 that's impossible.)
593 if (found_missing &&
594 oinfo.last_update != eversion_t()) {
595 pg_info_t tinfo(oinfo);
596 tinfo.pgid.shard = pg_whoami.shard;
597 (*(ctx->info_map))[from.osd].push_back(
598 make_pair(
599 pg_notify_t(
600 from.shard, pg_whoami.shard,
601 get_osdmap_epoch(),
602 get_osdmap_epoch(),
603 tinfo),
604 past_intervals));
605 }
606 return found_missing;
607 }
608
609
610 // MissingLoc
611
612 bool PG::MissingLoc::readable_with_acting(
613 const hobject_t &hoid,
614 const set<pg_shard_t> &acting) const {
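  // an object is readable from the acting set iff the shards known to hold
  // it (missing_loc) include enough acting-set members to satisfy the
  // backend's readability predicate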
615 if (!needs_recovery(hoid))
616 return true;
617 if (is_deleted(hoid))
618 return false;
619 auto missing_loc_entry = missing_loc.find(hoid);
620 if (missing_loc_entry == missing_loc.end())
621 return false;
622 const set<pg_shard_t> &locs = missing_loc_entry->second;
623 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
624 set<pg_shard_t> have_acting;
625 for (set<pg_shard_t>::const_iterator i = locs.begin();
626 i != locs.end();
627 ++i) {
628 if (acting.count(*i))
629 have_acting.insert(*i);
630 }
631 return (*is_readable)(have_acting);
632 }
633
634 void PG::MissingLoc::add_batch_sources_info(
635 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
636 {
637 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
638 << sources.size() << dendl;
639 unsigned loop = 0;
640 bool sources_updated = false;
641 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
642 i != needs_recovery_map.end();
643 ++i) {
644 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
645 handle->reset_tp_timeout();
646 loop = 0;
647 }
648 if (i->second.is_delete())
649 continue;
650
651 auto p = missing_loc.find(i->first);
652 if (p == missing_loc.end()) {
653 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
654 } else {
655 _dec_count(p->second);
656 }
657 missing_loc[i->first].insert(sources.begin(), sources.end());
658 _inc_count(p->second);
659
660 if (!sources_updated) {
661 missing_loc_sources.insert(sources.begin(), sources.end());
662 sources_updated = true;
663 }
664 }
665 }
666
667 bool PG::MissingLoc::add_source_info(
668 pg_shard_t fromosd,
669 const pg_info_t &oinfo,
670 const pg_missing_t &omissing,
671 ThreadPool::TPHandle* handle)
672 {
673 bool found_missing = false;
674 unsigned loop = 0;
675 bool sources_updated = false;
676 // found items?
677 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
678 p != needs_recovery_map.end();
679 ++p) {
680 const hobject_t &soid(p->first);
681 eversion_t need = p->second.need;
682 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
683 handle->reset_tp_timeout();
684 loop = 0;
685 }
686 if (p->second.is_delete()) {
687 ldout(pg->cct, 10) << __func__ << " " << soid
688 << " delete, ignoring source" << dendl;
689 continue;
690 }
691 if (oinfo.last_update < need) {
692 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
693 << " also missing on osd." << fromosd
694 << " (last_update " << oinfo.last_update
695 << " < needed " << need << ")" << dendl;
696 continue;
697 }
698 if (!oinfo.last_backfill.is_max() &&
699 !oinfo.last_backfill_bitwise) {
700 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
701 << " also missing on osd." << fromosd
702 << " (last_backfill " << oinfo.last_backfill
703 << " but with wrong sort order)"
704 << dendl;
705 continue;
706 }
707 if (p->first >= oinfo.last_backfill) {
708 // FIXME: this is _probably_ true, although it could conceivably
709 // be in the undefined region! Hmm!
710 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
711 << " also missing on osd." << fromosd
712 << " (past last_backfill " << oinfo.last_backfill
713 << ")" << dendl;
714 continue;
715 }
716 if (omissing.is_missing(soid)) {
717 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
718 << " also missing on osd." << fromosd << dendl;
719 continue;
720 }
721
722 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
723 << " is on osd." << fromosd << dendl;
724
725 {
726 auto p = missing_loc.find(soid);
727 if (p == missing_loc.end()) {
728 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
729 } else {
730 _dec_count(p->second);
731 }
732 p->second.insert(fromosd);
733 _inc_count(p->second);
734 }
735
736 if (!sources_updated) {
737 missing_loc_sources.insert(fromosd);
738 sources_updated = true;
739 }
740 found_missing = true;
741 }
742
743 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
744 << dendl;
745 return found_missing;
746 }
747
748 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
749 {
750 set<pg_shard_t> now_down;
751 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
752 p != missing_loc_sources.end();
753 ) {
754 if (osdmap->is_up(p->osd)) {
755 ++p;
756 continue;
757 }
758 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
759 now_down.insert(*p);
760 missing_loc_sources.erase(p++);
761 }
762
763 if (now_down.empty()) {
764 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
765 } else {
766 ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
767 << missing_loc_sources << dendl;
768
769 // filter missing_loc
770 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
771 while (p != missing_loc.end()) {
772 set<pg_shard_t>::iterator q = p->second.begin();
773 bool changed = false;
774 while (q != p->second.end()) {
775 if (now_down.count(*q)) {
776 if (!changed) {
777 changed = true;
778 _dec_count(p->second);
779 }
780 p->second.erase(q++);
781 } else {
782 ++q;
783 }
784 }
785 if (p->second.empty()) {
786 missing_loc.erase(p++);
787 } else {
788 if (changed) {
789 _inc_count(p->second);
790 }
791 ++p;
792 }
793 }
794 }
795 }
796
797 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
798 {
799 auto &missing = pg_log.get_missing();
800 uint64_t unfound = get_num_unfound();
801
802 dout(10) << __func__ << " "
803 << missing.num_missing() << " missing, "
804 << unfound << " unfound"
805 << dendl;
806
807 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
808 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
809 for (; m != mend; ++m) {
810 pg_shard_t peer(*m);
811
812 if (!get_osdmap()->is_up(peer.osd)) {
813 dout(20) << __func__ << " skipping down osd." << peer << dendl;
814 continue;
815 }
816
817 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
818 if (iter != peer_info.end() &&
819 (iter->second.is_empty() || iter->second.dne())) {
820 // ignore empty peers
821 continue;
822 }
823
824 // If we've requested any of this stuff, the pg_missing_t information
825 // should be on its way.
826 // TODO: coalesce requested_* into a single data structure
827 if (peer_missing.find(peer) != peer_missing.end()) {
828 dout(20) << __func__ << ": osd." << peer
829 << ": we already have pg_missing_t" << dendl;
830 continue;
831 }
832 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
833 dout(20) << __func__ << ": osd." << peer
834 << ": in peer_log_requested" << dendl;
835 continue;
836 }
837 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
838 dout(20) << __func__ << ": osd." << peer
839 << ": in peer_missing_requested" << dendl;
840 continue;
841 }
842
843 // Request missing
844 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
845 << dendl;
846 peer_missing_requested.insert(peer);
847 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
848 pg_query_t(
849 pg_query_t::FULLLOG,
850 peer.shard, pg_whoami.shard,
851 info.history, get_osdmap_epoch());
852 }
853 }
854
855 /******* PG ***********/
856 bool PG::needs_recovery() const
857 {
858 ceph_assert(is_primary());
859
860 auto &missing = pg_log.get_missing();
861
862 if (missing.num_missing()) {
863 dout(10) << __func__ << " primary has " << missing.num_missing()
864 << " missing" << dendl;
865 return true;
866 }
867
868 ceph_assert(!acting_recovery_backfill.empty());
869 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
870 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
871 for (; a != end; ++a) {
872 if (*a == get_primary()) continue;
873 pg_shard_t peer = *a;
874 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
875 if (pm == peer_missing.end()) {
876 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
877 << dendl;
878 continue;
879 }
880 if (pm->second.num_missing()) {
881 dout(10) << __func__ << " osd." << peer << " has "
882 << pm->second.num_missing() << " missing" << dendl;
883 return true;
884 }
885 }
886
887 dout(10) << __func__ << " is recovered" << dendl;
888 return false;
889 }
890
891 bool PG::needs_backfill() const
892 {
893 ceph_assert(is_primary());
894
895 // We can assume that the only OSDs that might need backfill
896 // are those in backfill_targets.
897 set<pg_shard_t>::const_iterator end = backfill_targets.end();
898 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
899 for (; a != end; ++a) {
900 pg_shard_t peer = *a;
901 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
902 if (!pi->second.last_backfill.is_max()) {
903 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
904 return true;
905 }
906 }
907
908 dout(10) << __func__ << " does not need backfill" << dendl;
909 return false;
910 }
911
912
913 void PG::check_past_interval_bounds() const
914 {
915 auto rpib = get_required_past_interval_bounds(
916 info,
917 osd->get_superblock().oldest_map);
918 if (rpib.first >= rpib.second) {
919 if (!past_intervals.empty()) {
920 osd->clog->error() << info.pgid << " required past_interval bounds are"
921 << " empty [" << rpib << ") but past_intervals is not: "
922 << past_intervals;
923 derr << info.pgid << " required past_interval bounds are"
924 << " empty [" << rpib << ") but past_intervals is not: "
925 << past_intervals << dendl;
926 }
927 } else {
928 if (past_intervals.empty()) {
929 osd->clog->error() << info.pgid << " required past_interval bounds are"
930 << " not empty [" << rpib << ") but past_intervals "
931 << past_intervals << " is empty";
932 derr << info.pgid << " required past_interval bounds are"
933 << " not empty [" << rpib << ") but past_intervals "
934 << past_intervals << " is empty" << dendl;
935 ceph_assert(!past_intervals.empty());
936 }
937
938 auto apib = past_intervals.get_bounds();
939 if (apib.first > rpib.first) {
940 osd->clog->error() << info.pgid << " past_intervals [" << apib
941 << ") start interval does not contain the required"
942 << " bound [" << rpib << ") start";
943 derr << info.pgid << " past_intervals [" << apib
944 << ") start interval does not contain the required"
945 << " bound [" << rpib << ") start" << dendl;
946 ceph_abort_msg("past_interval start interval mismatch");
947 }
948 if (apib.second != rpib.second) {
949 osd->clog->error() << info.pgid << " past_interval bound [" << apib
950 << ") end does not match required [" << rpib
951 << ") end";
952 derr << info.pgid << " past_interval bound [" << apib
953 << ") end does not match required [" << rpib
954 << ") end" << dendl;
955 ceph_abort_msg("past_interval end mismatch");
956 }
957 }
958 }
959
960 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
961 {
962 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
963 if (need_up_thru &&
964 up_thru >= info.history.same_interval_since) {
965 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
966 need_up_thru = false;
967 return true;
968 }
969 return false;
970 }
971
972 void PG::remove_down_peer_info(const OSDMapRef osdmap)
973 {
974 // Remove any downed osds from peer_info
975 bool removed = false;
976 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
977 while (p != peer_info.end()) {
978 if (!osdmap->is_up(p->first.osd)) {
979 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
980 peer_missing.erase(p->first);
981 peer_log_requested.erase(p->first);
982 peer_missing_requested.erase(p->first);
983 peer_purged.erase(p->first); // so we can re-purge if necessary
984 peer_info.erase(p++);
985 removed = true;
986 } else
987 ++p;
988 }
989
990 // if we removed anyone, update peers (which include peer_info)
991 if (removed)
992 update_heartbeat_peers();
993 check_recovery_sources(osdmap);
994 }
995
996 /*
997 * Returns true unless there is a non-lost, not-yet-queried OSD in might_have_unfound.
998 */
999 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
1000 {
1001 ceph_assert(is_primary());
1002
1003 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1004 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1005 for (; peer != mend; ++peer) {
1006 if (peer_missing.count(*peer))
1007 continue;
1008 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1009 if (iter != peer_info.end() &&
1010 (iter->second.is_empty() || iter->second.dne()))
1011 continue;
1012 if (!osdmap->exists(peer->osd))
1013 continue;
1014 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1015 if (osd_info.lost_at <= osd_info.up_from) {
1016 // If there is even one OSD in might_have_unfound that isn't lost, we
1017 // still might retrieve our unfound.
1018 return false;
1019 }
1020 }
1021 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
1022 << " have been queried or are marked lost" << dendl;
1023 return true;
1024 }
1025
1026 PastIntervals::PriorSet PG::build_prior()
1027 {
1028 if (1) {
1029 // sanity check
1030 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1031 it != peer_info.end();
1032 ++it) {
1033 ceph_assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
1034 }
1035 }
1036
1037 const OSDMap &osdmap = *get_osdmap();
1038 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1039 pool.info.is_erasure(),
1040 info.history.last_epoch_started,
1041 get_pgbackend()->get_is_recoverable_predicate(),
1042 [&](epoch_t start, int osd, epoch_t *lost_at) {
1043 const osd_info_t *pinfo = 0;
1044 if (osdmap.exists(osd)) {
1045 pinfo = &osdmap.get_info(osd);
1046 if (lost_at)
1047 *lost_at = pinfo->lost_at;
1048 }
1049
1050 if (osdmap.is_up(osd)) {
1051 return PastIntervals::UP;
1052 } else if (!pinfo) {
1053 return PastIntervals::DNE;
1054 } else if (pinfo->lost_at > start) {
1055 return PastIntervals::LOST;
1056 } else {
1057 return PastIntervals::DOWN;
1058 }
1059 },
1060 up,
1061 acting,
1062 this);
1063
1064 if (prior.pg_down) {
1065 state_set(PG_STATE_DOWN);
1066 }
1067
1068 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1069 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1070 << " < same_since " << info.history.same_interval_since
1071 << ", must notify monitor" << dendl;
1072 need_up_thru = true;
1073 } else {
1074 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1075 << " >= same_since " << info.history.same_interval_since
1076 << ", all is well" << dendl;
1077 need_up_thru = false;
1078 }
1079 set_probe_targets(prior.probe);
1080 return prior;
1081 }
1082
1083 void PG::clear_primary_state()
1084 {
1085 dout(10) << "clear_primary_state" << dendl;
1086
1087 // clear peering state
1088 stray_set.clear();
1089 peer_log_requested.clear();
1090 peer_missing_requested.clear();
1091 peer_info.clear();
1092 peer_bytes.clear();
1093 peer_missing.clear();
1094 need_up_thru = false;
1095 peer_last_complete_ondisk.clear();
1096 peer_activated.clear();
1097 min_last_complete_ondisk = eversion_t();
1098 pg_trim_to = eversion_t();
1099 might_have_unfound.clear();
1100 projected_log = PGLog::IndexedLog();
1101
1102 last_update_ondisk = eversion_t();
1103
1104 snap_trimq.clear();
1105
1106 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1107
1108 missing_loc.clear();
1109
1110 release_pg_backoffs();
1111
1112 pg_log.reset_recovery_pointers();
1113
1114 scrubber.reserved_peers.clear();
1115 scrub_after_recovery = false;
1116
1117 agent_clear();
1118 }
1119
1120 PG::Scrubber::Scrubber()
1121 : reserved(false), reserve_failed(false),
1122 epoch_start(0),
1123 active(false),
1124 shallow_errors(0), deep_errors(0), fixed(0),
1125 must_scrub(false), must_deep_scrub(false), must_repair(false),
1126 auto_repair(false),
1127 check_repair(false),
1128 deep_scrub_on_error(false),
1129 num_digest_updates_pending(0),
1130 state(INACTIVE),
1131 deep(false)
1132 {}
1133
1134 PG::Scrubber::~Scrubber() {}
1135
1136 /**
1137 * find_best_info
1138 *
1139 * Returns an iterator to the best info in infos sorted by:
1140 * 1) Prefer newer last_update
1141 * 2) Prefer longer tail if it brings another info into contiguity
1142 * 3) Prefer current primary
1143 */
1144 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1145 const map<pg_shard_t, pg_info_t> &infos,
1146 bool restrict_to_up_acting,
1147 bool *history_les_bound) const
1148 {
1149 ceph_assert(history_les_bound);
1150 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1151 * to make changes to this process. Also, make sure to update it
1152 * when you find bugs! */
1153 eversion_t min_last_update_acceptable = eversion_t::max();
1154 epoch_t max_last_epoch_started_found = 0;
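  // First pass: find the newest last_epoch_started across all infos.  Second
  // pass: min_last_update_acceptable is the smallest last_update among infos
  // that reached that epoch; only peers at or beyond it are eligible below.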
1155 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1156 i != infos.end();
1157 ++i) {
1158 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1159 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1160 *history_les_bound = true;
1161 max_last_epoch_started_found = i->second.history.last_epoch_started;
1162 }
1163 if (!i->second.is_incomplete() &&
1164 max_last_epoch_started_found < i->second.last_epoch_started) {
1165 *history_les_bound = false;
1166 max_last_epoch_started_found = i->second.last_epoch_started;
1167 }
1168 }
1169 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1170 i != infos.end();
1171 ++i) {
1172 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1173 if (min_last_update_acceptable > i->second.last_update)
1174 min_last_update_acceptable = i->second.last_update;
1175 }
1176 }
1177 if (min_last_update_acceptable == eversion_t::max())
1178 return infos.end();
1179
1180 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1181 // find osd with newest last_update (oldest for ec_pool).
1182 // if there are multiples, prefer
1183 // - a longer tail, if it brings another peer into log contiguity
1184 // - the current primary
1185 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1186 p != infos.end();
1187 ++p) {
1188 if (restrict_to_up_acting && !is_up(p->first) &&
1189 !is_acting(p->first))
1190 continue;
1191 // Only consider peers with last_update >= min_last_update_acceptable
1192 if (p->second.last_update < min_last_update_acceptable)
1193 continue;
1194 // Disqualify anyone with a too old last_epoch_started
1195 if (p->second.last_epoch_started < max_last_epoch_started_found)
1196 continue;
1197 // Disqualify anyone who is incomplete (not fully backfilled)
1198 if (p->second.is_incomplete())
1199 continue;
1200 if (best == infos.end()) {
1201 best = p;
1202 continue;
1203 }
1204 // Prefer newer last_update
1205 if (pool.info.require_rollback()) {
1206 if (p->second.last_update > best->second.last_update)
1207 continue;
1208 if (p->second.last_update < best->second.last_update) {
1209 best = p;
1210 continue;
1211 }
1212 } else {
1213 if (p->second.last_update < best->second.last_update)
1214 continue;
1215 if (p->second.last_update > best->second.last_update) {
1216 best = p;
1217 continue;
1218 }
1219 }
1220
1221 // Prefer longer tail
1222 if (p->second.log_tail > best->second.log_tail) {
1223 continue;
1224 } else if (p->second.log_tail < best->second.log_tail) {
1225 best = p;
1226 continue;
1227 }
1228
1229 if (!p->second.has_missing() && best->second.has_missing()) {
1230 dout(10) << __func__ << " prefer osd." << p->first
1231 << " because it is complete while best has missing"
1232 << dendl;
1233 best = p;
1234 continue;
1235 } else if (p->second.has_missing() && !best->second.has_missing()) {
1236 dout(10) << __func__ << " skipping osd." << p->first
1237 << " because it has missing while best is complete"
1238 << dendl;
1239 continue;
1240 } else {
1241 // both are complete or have missing
1242 // fall through
1243 }
1244
1245 // prefer current primary (usually the caller), all things being equal
1246 if (p->first == pg_whoami) {
1247 dout(10) << "calc_acting prefer osd." << p->first
1248 << " because it is current primary" << dendl;
1249 best = p;
1250 continue;
1251 }
1252 }
1253 return best;
1254 }
1255
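/**
 * calc_ec_acting
 *
 * For erasure-coded pools each shard position is filled independently:
 * prefer the up osd at that position if it is complete and its log reaches
 * back to auth_log_shard's log_tail, otherwise the acting osd at that
 * position, otherwise (unless restrict_to_up_acting) any other peer holding
 * that shard.  An up osd that cannot be selected directly becomes a
 * backfill target.
 */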
1256 void PG::calc_ec_acting(
1257 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1258 unsigned size,
1259 const vector<int> &acting,
1260 const vector<int> &up,
1261 const map<pg_shard_t, pg_info_t> &all_info,
1262 bool restrict_to_up_acting,
1263 vector<int> *_want,
1264 set<pg_shard_t> *backfill,
1265 set<pg_shard_t> *acting_backfill,
1266 ostream &ss)
1267 {
1268 vector<int> want(size, CRUSH_ITEM_NONE);
1269 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1270 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1271 i != all_info.end();
1272 ++i) {
1273 all_info_by_shard[i->first.shard].insert(i->first);
1274 }
1275 for (uint8_t i = 0; i < want.size(); ++i) {
1276 ss << "For position " << (unsigned)i << ": ";
1277 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1278 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1279 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1280 auth_log_shard->second.log_tail) {
1281 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1282 want[i] = up[i];
1283 continue;
1284 }
1285 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1286 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1287 << " and ";
1288 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1289 }
1290
1291 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1292 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1293 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1294 auth_log_shard->second.log_tail) {
1295 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1296 want[i] = acting[i];
1297 } else if (!restrict_to_up_acting) {
1298 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1299 j != all_info_by_shard[shard_id_t(i)].end();
1300 ++j) {
1301 ceph_assert(j->shard == i);
1302 if (!all_info.find(*j)->second.is_incomplete() &&
1303 all_info.find(*j)->second.last_update >=
1304 auth_log_shard->second.log_tail) {
1305 ss << " selecting stray: " << *j << std::endl;
1306 want[i] = j->osd;
1307 break;
1308 }
1309 }
1310 if (want[i] == CRUSH_ITEM_NONE)
1311 ss << " failed to fill position " << (int)i << std::endl;
1312 }
1313 }
1314
1315 for (uint8_t i = 0; i < want.size(); ++i) {
1316 if (want[i] != CRUSH_ITEM_NONE) {
1317 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1318 }
1319 }
1320 acting_backfill->insert(backfill->begin(), backfill->end());
1321 _want->swap(want);
1322 }
1323
1324 /**
1325 * calculate the desired acting set.
1326 *
1327 * Choose an appropriate acting set. Prefer up[0], unless it is
1328 * incomplete, or another osd has a longer tail that allows us to
1329 * bring other up nodes up to date.
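 *
 * Selection order below: the primary (up_primary, unless it is incomplete,
 * not log-recoverable, or, on nautilus+, missing more than
 * osd_force_auth_primary_missing_objects objects, in which case the auth
 * log shard is used), then up replicas, then acting osds ordered by
 * descending last_update, then (unless restrict_to_up_acting) stray peers.
 * Peers that cannot be log-recovered become backfill targets.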
1330 */
1331 void PG::calc_replicated_acting(
1332 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1333 uint64_t force_auth_primary_missing_objects,
1334 unsigned size,
1335 const vector<int> &acting,
1336 const vector<int> &up,
1337 pg_shard_t up_primary,
1338 const map<pg_shard_t, pg_info_t> &all_info,
1339 bool restrict_to_up_acting,
1340 vector<int> *want,
1341 set<pg_shard_t> *backfill,
1342 set<pg_shard_t> *acting_backfill,
1343 const OSDMapRef osdmap,
1344 ostream &ss)
1345 {
1346 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1347
1348 ss << __func__ << " newest update on osd." << auth_log_shard_id
1349 << " with " << auth_log_shard->second
1350 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1351
1352 // select primary
1353 auto primary = all_info.find(up_primary);
1354 if (up.size() &&
1355 !primary->second.is_incomplete() &&
1356 primary->second.last_update >=
1357 auth_log_shard->second.log_tail) {
1358 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1359 auto approx_missing_objects =
1360 primary->second.stats.stats.sum.num_objects_missing;
1361 auto auth_version = auth_log_shard->second.last_update.version;
1362 auto primary_version = primary->second.last_update.version;
1363 if (auth_version > primary_version) {
1364 approx_missing_objects += auth_version - primary_version;
1365 } else {
1366 approx_missing_objects += primary_version - auth_version;
1367 }
1368 if ((uint64_t)approx_missing_objects >
1369 force_auth_primary_missing_objects) {
1370 primary = auth_log_shard;
1371 ss << "up_primary: " << up_primary << " has approximate "
1372 << approx_missing_objects
1373 << "(>" << force_auth_primary_missing_objects <<") "
1374 << "missing objects, osd." << auth_log_shard_id
1375 << " selected as primary instead"
1376 << std::endl;
1377 } else {
1378 ss << "up_primary: " << up_primary << " selected as primary"
1379 << std::endl;
1380 }
1381 } else {
1382 ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
1383 }
1384 } else {
1385 ceph_assert(!auth_log_shard->second.is_incomplete());
1386 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1387 << " selected as primary instead" << std::endl;
1388 primary = auth_log_shard;
1389 }
1390
1391 ss << __func__ << " primary is osd." << primary->first
1392 << " with " << primary->second << std::endl;
1393 want->push_back(primary->first.osd);
1394 acting_backfill->insert(primary->first);
1395
1396 /* We include auth_log_shard->second.log_tail because in GetLog,
1397 * we will request logs back to the min last_update over our
1398 * acting_backfill set, which will result in our log being extended
1399 * as far backwards as necessary to pick up any peers which can
1400 * be log recovered by auth_log_shard's log */
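  // a replica whose last_update is at least oldest_auth_log_entry can be
  // log-recovered; anything older (or incomplete) must be backfilled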
1401 eversion_t oldest_auth_log_entry =
1402 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1403
1404 // select replicas that have log contiguity with primary.
1405 // prefer up, then acting, then any peer_info osds
1406 for (auto i : up) {
1407 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1408 if (up_cand == primary->first)
1409 continue;
1410 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1411 if (cur_info.is_incomplete() ||
1412 cur_info.last_update < oldest_auth_log_entry) {
1413 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1414 backfill->insert(up_cand);
1415 acting_backfill->insert(up_cand);
1416 } else {
1417 want->push_back(i);
1418 acting_backfill->insert(up_cand);
1419 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1420 }
1421 if (want->size() >= size) {
1422 break;
1423 }
1424 }
1425
1426 if (want->size() >= size) {
1427 return;
1428 }
1429
1430 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1431 candidate_by_last_update.reserve(acting.size());
1432 // This no longer has backfill OSDs, but they are covered above.
1433 for (auto i : acting) {
1434 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1435 // skip up osds we already considered above
1436 if (acting_cand == primary->first)
1437 continue;
1438 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1439 if (up_it != up.end())
1440 continue;
1441
1442 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1443 if (cur_info.is_incomplete() ||
1444 cur_info.last_update < oldest_auth_log_entry) {
1445 ss << " shard " << acting_cand << " (acting) REJECTED "
1446 << cur_info << std::endl;
1447 } else {
1448 candidate_by_last_update.push_back(make_pair(cur_info.last_update, i));
1449 }
1450 }
1451
1452 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1453 const std::pair<eversion_t, int> &rhs) {
1454 return lhs.first > rhs.first;
1455 };
1456 // sort by last_update, in descending order.
1457 std::sort(candidate_by_last_update.begin(),
1458 candidate_by_last_update.end(), sort_by_eversion);
1459 for (auto &p: candidate_by_last_update) {
1460 ceph_assert(want->size() < size);
1461 want->push_back(p.second);
1462 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1463 acting_backfill->insert(s);
1464 ss << " shard " << s << " (acting) accepted "
1465 << all_info.find(s)->second << std::endl;
1466 if (want->size() >= size) {
1467 return;
1468 }
1469 }
1470
1471 if (restrict_to_up_acting) {
1472 return;
1473 }
1474 candidate_by_last_update.clear();
1475 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1476 // continue to search stray to find more suitable peers
1477 for (auto &i : all_info) {
1478 // skip up osds we already considered above
1479 if (i.first == primary->first)
1480 continue;
1481 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1482 if (up_it != up.end())
1483 continue;
1484 vector<int>::const_iterator acting_it = find(
1485 acting.begin(), acting.end(), i.first.osd);
1486 if (acting_it != acting.end())
1487 continue;
1488
1489 if (i.second.is_incomplete() ||
1490 i.second.last_update < oldest_auth_log_entry) {
1491 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1492 << std::endl;
1493 } else {
1494 candidate_by_last_update.push_back(
1495 make_pair(i.second.last_update, i.first.osd));
1496 }
1497 }
1498
1499 if (candidate_by_last_update.empty()) {
1500 // save us some effort
1501 return;
1502 }
1503
1504 // sort by last_update, in descending order.
1505 std::sort(candidate_by_last_update.begin(),
1506 candidate_by_last_update.end(), sort_by_eversion);
1507
1508 for (auto &p: candidate_by_last_update) {
1509 ceph_assert(want->size() < size);
1510 want->push_back(p.second);
1511 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1512 acting_backfill->insert(s);
1513 ss << " shard " << s << " (stray) accepted "
1514 << all_info.find(s)->second << std::endl;
1515 if (want->size() >= size) {
1516 return;
1517 }
1518 }
1519 }
1520
1521 bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
1522 {
1523 unsigned num_want_acting = 0;
1524 set<pg_shard_t> have;
1525 for (int i = 0; i < (int)want.size(); ++i) {
1526 if (want[i] != CRUSH_ITEM_NONE) {
1527 ++num_want_acting;
1528 have.insert(
1529 pg_shard_t(
1530 want[i],
1531 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1532 }
1533 }
1534 // We go incomplete if below min_size for ec_pools since backfill
1535 // does not currently maintain rollbackability
1536 // Otherwise, we will go "peered", but not "active"
1537 if (num_want_acting < pool.info.min_size &&
1538 (pool.info.is_erasure() ||
1539 !cct->_conf->osd_allow_recovery_below_min_size)) {
1540 dout(10) << __func__ << " failed, below min size" << dendl;
1541 return false;
1542 }
1543
1544 /* Check whether we have enough acting shards to later perform recovery */
1545 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1546 get_pgbackend()->get_is_recoverable_predicate());
1547 if (!(*recoverable_predicate)(have)) {
1548 dout(10) << __func__ << " failed, not recoverable" << dendl;
1549 return false;
1550 }
1551
1552 return true;
1553 }
1554
1555 void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1556 const pg_info_t &auth_info,
1557 vector<int> *want,
1558 set<pg_shard_t> *async_recovery,
1559 const OSDMapRef osdmap) const
1560 {
1561 set<pair<int, pg_shard_t> > candidates_by_cost;
1562 for (uint8_t i = 0; i < want->size(); ++i) {
1563 if ((*want)[i] == CRUSH_ITEM_NONE)
1564 continue;
1565
1566 // Considering log entries to recover is accurate enough for
1567 // now. We could use minimum_to_decode_with_cost() later if
1568 // necessary.
1569 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1570 // do not include strays
1571 if (stray_set.find(shard_i) != stray_set.end())
1572 continue;
1573 // Do not include an osd that is not up, since choosing it as
1574 // an async_recovery_target will move it out of the acting set.
1575 // This results in it being identified as a stray during peering,
1576 // because it is no longer in the up or acting set.
1577 if (!is_up(shard_i))
1578 continue;
1579 auto shard_info = all_info.find(shard_i)->second;
1580 // for ec pools we rollback all entries past the authoritative
1581 // last_update *before* activation. This is relatively inexpensive
1582 // compared to recovery, since it is purely local, so treat shards
1583 // past the authoritative last_update the same as those equal to it.
1584 version_t auth_version = auth_info.last_update.version;
1585 version_t candidate_version = shard_info.last_update.version;
1586 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1587 auto approx_missing_objects =
1588 shard_info.stats.stats.sum.num_objects_missing;
1589 if (auth_version > candidate_version) {
1590 approx_missing_objects += auth_version - candidate_version;
1591 }
1592 if (static_cast<uint64_t>(approx_missing_objects) >
1593 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1594 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1595 }
1596 } else {
1597 if (auth_version > candidate_version &&
1598 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1599 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1600 }
1601 }
1602 }
1603
1604 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1605 << dendl;
1606
1607 // take out as many osds as we can for async recovery, in order of cost
1608 for (auto rit = candidates_by_cost.rbegin();
1609 rit != candidates_by_cost.rend(); ++rit) {
1610 pg_shard_t cur_shard = rit->second;
1611 vector<int> candidate_want(*want);
1612 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1613 if (recoverable_and_ge_min_size(candidate_want)) {
1614 want->swap(candidate_want);
1615 async_recovery->insert(cur_shard);
1616 }
1617 }
1618 dout(20) << __func__ << " result want=" << *want
1619 << " async_recovery=" << *async_recovery << dendl;
1620 }
1621
1622 void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1623 const pg_info_t &auth_info,
1624 vector<int> *want,
1625 set<pg_shard_t> *async_recovery,
1626 const OSDMapRef osdmap) const
1627 {
1628 set<pair<int, pg_shard_t> > candidates_by_cost;
1629 for (auto osd_num : *want) {
1630 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1631 // do not include strays
1632 if (stray_set.find(shard_i) != stray_set.end())
1633 continue;
1634 // Do not include an osd that is not up, since choosing it as
1635 // an async_recovery_target will move it out of the acting set.
1636 // This results in it being identified as a stray during peering,
1637 // because it is no longer in the up or acting set.
1638 if (!is_up(shard_i))
1639 continue;
1640 auto shard_info = all_info.find(shard_i)->second;
1641 // use the approximate magnitude of the difference in length of
1642 // logs plus historical missing objects as the cost of recovery
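  // e.g. (nautilus+) a shard 100 log entries behind that already has 20
  // missing objects is costed at ~120; pre-nautilus only the entry delta
  // counts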
1643 version_t auth_version = auth_info.last_update.version;
1644 version_t candidate_version = shard_info.last_update.version;
1645 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1646 auto approx_missing_objects =
1647 shard_info.stats.stats.sum.num_objects_missing;
1648 if (auth_version > candidate_version) {
1649 approx_missing_objects += auth_version - candidate_version;
1650 } else {
1651 approx_missing_objects += candidate_version - auth_version;
1652 }
1653 if (static_cast<uint64_t>(approx_missing_objects) >
1654 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1655 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1656 }
1657 } else {
1658 size_t approx_entries;
1659 if (auth_version > candidate_version) {
1660 approx_entries = auth_version - candidate_version;
1661 } else {
1662 approx_entries = candidate_version - auth_version;
1663 }
1664 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1665 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1666 }
1667 }
1668 }
1669
1670 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1671 << dendl;
1672 // take out as many osds as we can for async recovery, in order of cost
1673 for (auto rit = candidates_by_cost.rbegin();
1674 rit != candidates_by_cost.rend(); ++rit) {
1675 if (want->size() <= pool.info.min_size) {
1676 break;
1677 }
1678 pg_shard_t cur_shard = rit->second;
1679 vector<int> candidate_want(*want);
1680 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1681 if (*it == cur_shard.osd) {
1682 candidate_want.erase(it);
1683 want->swap(candidate_want);
1684 async_recovery->insert(cur_shard);
1685 break;
1686 }
1687 }
1688 }
1689 dout(20) << __func__ << " result want=" << *want
1690 << " async_recovery=" << *async_recovery << dendl;
1691 }
1692
1693 /**
1694 * choose acting
1695 *
1696 * calculate the desired acting, and request a change with the monitor
1697 * if it differs from the current acting.
1698 *
1699 * if restrict_to_up_acting=true, we filter out anything that's not in
1700 * up/acting. in order to lift this restriction, we need to
1701 * 1) check whether it's worth switching the acting set any time we get
1702 * a new pg info (not just here, when recovery finishes)
1703 * 2) check whether anything in want_acting went down on each new map
1704 * (and, if so, calculate a new want_acting)
1705 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1706 * TODO!
1707 */
1708 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1709 bool restrict_to_up_acting,
1710 bool *history_les_bound)
1711 {
1712 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1713 all_info[pg_whoami] = info;
1714
1715 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
1716 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1717 p != all_info.end();
1718 ++p) {
1719 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1720 }
1721 }
1722
1723 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1724 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1725
1726 if (auth_log_shard == all_info.end()) {
1727 if (up != acting) {
1728 dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
1729 << " reverting to up" << dendl;
1730 want_acting = up;
1731 vector<int> empty;
1732 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1733 } else {
1734 dout(10) << __func__ << " failed" << dendl;
1735 ceph_assert(want_acting.empty());
1736 }
1737 return false;
1738 }
1739
1740 ceph_assert(!auth_log_shard->second.is_incomplete());
1741 auth_log_shard_id = auth_log_shard->first;
1742
1743 set<pg_shard_t> want_backfill, want_acting_backfill;
1744 vector<int> want;
1745 stringstream ss;
1746 if (!pool.info.is_erasure())
1747 calc_replicated_acting(
1748 auth_log_shard,
1749 cct->_conf.get_val<uint64_t>(
1750 "osd_force_auth_primary_missing_objects"),
1751 get_osdmap()->get_pg_size(info.pgid.pgid),
1752 acting,
1753 up,
1754 up_primary,
1755 all_info,
1756 restrict_to_up_acting,
1757 &want,
1758 &want_backfill,
1759 &want_acting_backfill,
1760 get_osdmap(),
1761 ss);
1762 else
1763 calc_ec_acting(
1764 auth_log_shard,
1765 get_osdmap()->get_pg_size(info.pgid.pgid),
1766 acting,
1767 up,
1768 all_info,
1769 restrict_to_up_acting,
1770 &want,
1771 &want_backfill,
1772 &want_acting_backfill,
1773 ss);
1774 dout(10) << ss.str() << dendl;
1775
1776 if (!recoverable_and_ge_min_size(want)) {
1777 want_acting.clear();
1778 return false;
1779 }
1780
1781 set<pg_shard_t> want_async_recovery;
1782 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
1783 if (pool.info.is_erasure()) {
1784 choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1785 } else {
1786 choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1787 }
1788 }
1789 if (want != acting) {
1790 dout(10) << __func__ << " want " << want << " != acting " << acting
1791 << ", requesting pg_temp change" << dendl;
1792 want_acting = want;
1793
1794 if (!cct->_conf->osd_debug_no_acting_change) {
1795 if (want_acting == up) {
1796 // There can't be any pending backfill if
1797 // want is the same as crush map up OSDs.
1798 ceph_assert(want_backfill.empty());
1799 vector<int> empty;
1800 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1801 } else
1802 osd->queue_want_pg_temp(info.pgid.pgid, want);
1803 }
1804 return false;
1805 }
1806 want_acting.clear();
1807 acting_recovery_backfill = want_acting_backfill;
1808 dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
1809 ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
1810 if (backfill_targets.empty()) {
1811 // Caller is GetInfo
1812 backfill_targets = want_backfill;
1813 }
1814 // The !needs_recovery() term lets async_recovery_targets be reset after recovery is complete
1815 ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
1816 if (async_recovery_targets.empty() || !needs_recovery()) {
1817 async_recovery_targets = want_async_recovery;
1818 }
1819 // Will not change if already set because up would have had to change
1820 // Verify that nothing in backfill is in stray_set
1821 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1822 i != want_backfill.end();
1823 ++i) {
1824 ceph_assert(stray_set.find(*i) == stray_set.end());
1825 }
1826 dout(10) << "choose_acting want=" << want << " backfill_targets="
1827 << want_backfill << " async_recovery_targets="
1828 << async_recovery_targets << dendl;
1829 return true;
1830 }
1831
1832 /* Build the might_have_unfound set.
1833 *
1834 * This is used by the primary OSD during recovery.
1835 *
1836 * This set tracks the OSDs which might have unfound objects that the primary
1837 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1838 * will remove the OSD from the set.
1839 */
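// Hypothetical example: if the past intervals record that osd.3 and osd.5 once
// served this PG, and osd.7 is currently a (stray) peer we have info from, then
// might_have_unfound = {3, 5, 7}; each entry is dropped once that OSD's
// pg_missing_t has been received (or it is known it cannot help).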
1840 void PG::build_might_have_unfound()
1841 {
1842 ceph_assert(might_have_unfound.empty());
1843 ceph_assert(is_primary());
1844
1845 dout(10) << __func__ << dendl;
1846
1847 check_past_interval_bounds();
1848
1849 might_have_unfound = past_intervals.get_might_have_unfound(
1850 pg_whoami,
1851 pool.info.is_erasure());
1852
1853 // include any (stray) peers
1854 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1855 p != peer_info.end();
1856 ++p)
1857 might_have_unfound.insert(p->first);
1858
1859 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1860 }
1861
1862 void PG::activate(ObjectStore::Transaction& t,
1863 epoch_t activation_epoch,
1864 map<int, map<spg_t,pg_query_t> >& query_map,
1865 map<int,
1866 vector<
1867 pair<pg_notify_t,
1868 PastIntervals> > > *activator_map,
1869 RecoveryCtx *ctx)
1870 {
1871 ceph_assert(!is_peered());
1872 ceph_assert(scrubber.callbacks.empty());
1873 ceph_assert(callbacks_for_degraded_object.empty());
1874
1875 // twiddle pg state
1876 state_clear(PG_STATE_DOWN);
1877
1878 send_notify = false;
1879
1880 if (is_primary()) {
1881 // only update primary last_epoch_started if we will go active
1882 if (acting.size() >= pool.info.min_size) {
1883 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1884 info.last_epoch_started <= activation_epoch);
1885 info.last_epoch_started = activation_epoch;
1886 info.last_interval_started = info.history.same_interval_since;
1887 }
1888 } else if (is_acting(pg_whoami)) {
1889 /* update last_epoch_started on acting replica to whatever the primary sent
1890 * unless it's smaller (could happen if we are going peered rather than
1891 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1892 if (info.last_epoch_started < activation_epoch) {
1893 info.last_epoch_started = activation_epoch;
1894 info.last_interval_started = info.history.same_interval_since;
1895 }
1896 }
1897
1898 auto &missing = pg_log.get_missing();
1899
1900 if (is_primary()) {
1901 last_update_ondisk = info.last_update;
1902 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1903 }
1904 last_update_applied = info.last_update;
1905 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1906
1907 need_up_thru = false;
1908
1909 // write pg info, log
1910 dirty_info = true;
1911 dirty_big_info = true; // maybe
1912
1913 // find out when we commit
1914 t.register_on_complete(
1915 new C_PG_ActivateCommitted(
1916 this,
1917 get_osdmap_epoch(),
1918 activation_epoch));
1919
1920 if (is_primary()) {
1921 // initialize snap_trimq
1922 if (get_osdmap()->require_osd_release < CEPH_RELEASE_MIMIC) {
1923 dout(20) << "activate - purged_snaps " << info.purged_snaps
1924 << " cached_removed_snaps " << pool.cached_removed_snaps
1925 << dendl;
1926 snap_trimq = pool.cached_removed_snaps;
1927 } else {
1928 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
1929 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
1930 snap_trimq.clear();
1931 if (p != removed_snaps_queue.end()) {
1932 dout(20) << "activate - purged_snaps " << info.purged_snaps
1933 << " removed_snaps " << p->second
1934 << dendl;
1935 for (auto q : p->second) {
1936 snap_trimq.insert(q.first, q.second);
1937 }
1938 }
1939 }
1940 interval_set<snapid_t> purged;
1941 purged.intersection_of(snap_trimq, info.purged_snaps);
1942 snap_trimq.subtract(purged);
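// Worked example (hypothetical values): if the removed snaps for this pool are
// [4~3] (snaps 4,5,6) and info.purged_snaps already contains [4~1] (snap 4),
// then snap_trimq starts as {4,5,6}, purged = {4}, and after the subtraction
// snap_trimq = {5,6}.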
1943
1944 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
1945 // adjust purged_snaps: PG may have been inactive while snaps were pruned
1946 // from the removed_snaps_queue in the osdmap. update local purged_snaps to
1947 // reflect only those snaps that we had marked purged and that are still in
1948 // the queue.
1949 info.purged_snaps.swap(purged);
1950 }
1951 }
1952
1953 // init complete pointer
1954 if (missing.num_missing() == 0) {
1955 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1956 << " -> " << info.last_update << dendl;
1957 info.last_complete = info.last_update;
1958 info.stats.stats.sum.num_objects_missing = 0;
1959 pg_log.reset_recovery_pointers();
1960 } else {
1961 dout(10) << "activate - not complete, " << missing << dendl;
1962 info.stats.stats.sum.num_objects_missing = missing.num_missing();
1963 pg_log.activate_not_complete(info);
1964 }
1965
1966 log_weirdness();
1967
1968 // if primary..
1969 if (is_primary()) {
1970 ceph_assert(ctx);
1971 // start up replicas
1972
1973 ceph_assert(!acting_recovery_backfill.empty());
1974 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1975 i != acting_recovery_backfill.end();
1976 ++i) {
1977 if (*i == pg_whoami) continue;
1978 pg_shard_t peer = *i;
1979 ceph_assert(peer_info.count(peer));
1980 pg_info_t& pi = peer_info[peer];
1981
1982 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1983
1984 MOSDPGLog *m = 0;
1985 ceph_assert(peer_missing.count(peer));
1986 pg_missing_t& pm = peer_missing[peer];
1987
1988 bool needs_past_intervals = pi.dne();
1989
1990 /*
1991 * cover case where peer sort order was different and
1992 * last_backfill cannot be interpreted
1993 */
1994 bool force_restart_backfill =
1995 !pi.last_backfill.is_max() &&
1996 !pi.last_backfill_bitwise;
1997
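// Orientation only (summary of the branches below): if the peer is already at
// our last_update we just notify it (or send an empty log); if its log is not
// contiguous with ours, or backfill must (re)start, we reset its info and start
// backfill; otherwise we send the log entries after pi.last_update so it can
// catch up.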
1998 if (pi.last_update == info.last_update && !force_restart_backfill) {
1999 // empty log
2000 if (!pi.last_backfill.is_max())
2001 osd->clog->info() << info.pgid << " continuing backfill to osd."
2002 << peer
2003 << " from (" << pi.log_tail << "," << pi.last_update
2004 << "] " << pi.last_backfill
2005 << " to " << info.last_update;
2006 if (!pi.is_empty() && activator_map) {
2007 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
2008 (*activator_map)[peer.osd].push_back(
2009 make_pair(
2010 pg_notify_t(
2011 peer.shard, pg_whoami.shard,
2012 get_osdmap_epoch(),
2013 get_osdmap_epoch(),
2014 info),
2015 past_intervals));
2016 } else {
2017 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
2018 m = new MOSDPGLog(
2019 i->shard, pg_whoami.shard,
2020 get_osdmap_epoch(), info,
2021 last_peering_reset);
2022 }
2023 } else if (
2024 pg_log.get_tail() > pi.last_update ||
2025 pi.last_backfill == hobject_t() ||
2026 force_restart_backfill ||
2027 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2028 /* ^ This last case covers a situation where a replica is not contiguous
2029 * with the auth_log, but is contiguous with this replica. Reshuffling
2030 * the active set to handle this would be tricky, so instead we just go
2031 * ahead and backfill it anyway. This is probably preferable in any
2032 * case since the replica in question would have to be significantly
2033 * behind.
2034 */
2035 // backfill
2036 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
2037 << " from (" << pi.log_tail << "," << pi.last_update
2038 << "] " << pi.last_backfill
2039 << " to " << info.last_update;
2040
2041 pi.last_update = info.last_update;
2042 pi.last_complete = info.last_update;
2043 pi.set_last_backfill(hobject_t());
2044 pi.last_epoch_started = info.last_epoch_started;
2045 pi.last_interval_started = info.last_interval_started;
2046 pi.history = info.history;
2047 pi.hit_set = info.hit_set;
2048 // Save num_bytes for reservation request, can't be negative
2049 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2050 pi.stats.stats.clear();
2051
2052 // initialize peer with our purged_snaps.
2053 pi.purged_snaps = info.purged_snaps;
2054
2055 m = new MOSDPGLog(
2056 i->shard, pg_whoami.shard,
2057 get_osdmap_epoch(), pi,
2058 last_peering_reset /* epoch to create pg at */);
2059
2060 // send some recent log, so that op dup detection works well.
2061 m->log.copy_up_to(cct, pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
2062 m->info.log_tail = m->log.tail;
2063 pi.log_tail = m->log.tail; // sigh...
2064
2065 pm.clear();
2066 } else {
2067 // catch up
2068 ceph_assert(pg_log.get_tail() <= pi.last_update);
2069 m = new MOSDPGLog(
2070 i->shard, pg_whoami.shard,
2071 get_osdmap_epoch(), info,
2072 last_peering_reset /* epoch to create pg at */);
2073 // send new stuff to append to replicas log
2074 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2075 }
2076
2077 // share past_intervals if we are creating the pg on the replica
2078 // based on whether our info for that peer was dne() *before*
2079 // updating pi.history in the backfill block above.
2080 if (m && needs_past_intervals)
2081 m->past_intervals = past_intervals;
2082
2083 // update local version of peer's missing list!
2084 if (m && pi.last_backfill != hobject_t()) {
2085 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2086 p != m->log.log.end();
2087 ++p) {
2088 if (p->soid <= pi.last_backfill &&
2089 !p->is_error()) {
2090 if (perform_deletes_during_peering() && p->is_delete()) {
2091 pm.rm(p->soid, p->version);
2092 } else {
2093 pm.add_next_event(*p);
2094 }
2095 }
2096 }
2097 }
2098
2099 if (m) {
2100 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
2101 //m->log.print(cout);
2102 osd->send_message_osd_cluster(peer.osd, m, get_osdmap_epoch());
2103 }
2104
2105 // peer now has
2106 pi.last_update = info.last_update;
2107
2108 // update our missing
2109 if (pm.num_missing() == 0) {
2110 pi.last_complete = pi.last_update;
2111 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
2112 } else {
2113 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
2114 }
2115 }
2116
2117 // Set up missing_loc
2118 set<pg_shard_t> complete_shards;
2119 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2120 i != acting_recovery_backfill.end();
2121 ++i) {
2122 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
2123 if (*i == get_primary()) {
2124 missing_loc.add_active_missing(missing);
2125 if (!missing.have_missing())
2126 complete_shards.insert(*i);
2127 } else {
2128 auto peer_missing_entry = peer_missing.find(*i);
2129 ceph_assert(peer_missing_entry != peer_missing.end());
2130 missing_loc.add_active_missing(peer_missing_entry->second);
2131 if (!peer_missing_entry->second.have_missing() &&
2132 peer_info[*i].last_backfill.is_max())
2133 complete_shards.insert(*i);
2134 }
2135 }
2136
2137 // If necessary, create might_have_unfound to help us find our unfound objects.
2138 // NOTE: It's important that we build might_have_unfound before trimming the
2139 // past intervals.
2140 might_have_unfound.clear();
2141 if (needs_recovery()) {
2142 // If only one shard has missing, we do a trick to add all others as recovery
2143 // sources. This is considered safe since the PGLogs have been merged locally,
2144 // and it covers the vast majority of use cases, like one OSD/host being down
2145 // for a while for hardware repair.
2146 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2147 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
2148 } else {
2149 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2150 ctx->handle);
2151 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2152 i != acting_recovery_backfill.end();
2153 ++i) {
2154 if (*i == pg_whoami) continue;
2155 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2156 ceph_assert(peer_missing.count(*i));
2157 ceph_assert(peer_info.count(*i));
2158 missing_loc.add_source_info(
2159 *i,
2160 peer_info[*i],
2161 peer_missing[*i],
2162 ctx->handle);
2163 }
2164 }
2165 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2166 i != peer_missing.end();
2167 ++i) {
2168 if (is_acting_recovery_backfill(i->first))
2169 continue;
2170 ceph_assert(peer_info.count(i->first));
2171 search_for_missing(
2172 peer_info[i->first],
2173 i->second,
2174 i->first,
2175 ctx);
2176 }
2177
2178 build_might_have_unfound();
2179
2180 // Always call now so _update_calc_stats() will be accurate
2181 discover_all_missing(query_map);
2182 }
2183
2184 // num_objects_degraded, if calculated, should reflect this too, unless nothing
2185 // is missing and we are about to go clean.
2186 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2187 state_set(PG_STATE_UNDERSIZED);
2188 }
2189
2190 state_set(PG_STATE_ACTIVATING);
2191 release_pg_backoffs();
2192 projected_last_update = info.last_update;
2193 }
2194 if (acting.size() >= pool.info.min_size) {
2195 PGLogEntryHandler handler{this, &t};
2196 pg_log.roll_forward(&handler);
2197 }
2198 }
2199
2200 bool PG::op_has_sufficient_caps(OpRequestRef& op)
2201 {
2202 // only check MOSDOp
2203 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
2204 return true;
2205
2206 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
2207
2208 auto priv = req->get_connection()->get_priv();
2209 auto session = static_cast<Session*>(priv.get());
2210 if (!session) {
2211 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
2212 return false;
2213 }
2214 OSDCap& caps = session->caps;
2215 priv.reset();
2216
2217 const string &key = req->get_hobj().get_key().empty() ?
2218 req->get_oid().name :
2219 req->get_hobj().get_key();
2220
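// For illustration (hypothetical cap string, not taken from this request):
// a session whose OSD cap is "allow rw pool=rbd namespace=ns1" would pass the
// check below for read/write ops on objects in that pool and namespace, and
// fail it for any other pool or namespace.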
2221 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
2222 pool.info.application_metadata,
2223 key,
2224 op->need_read_cap(),
2225 op->need_write_cap(),
2226 op->classes(),
2227 session->get_peer_socket_addr());
2228
2229 dout(20) << "op_has_sufficient_caps "
2230 << "session=" << session
2231 << " pool=" << pool.id << " (" << pool.name
2232 << " " << req->get_hobj().nspace
2233 << ")"
2234 << " pool_app_metadata=" << pool.info.application_metadata
2235 << " need_read_cap=" << op->need_read_cap()
2236 << " need_write_cap=" << op->need_write_cap()
2237 << " classes=" << op->classes()
2238 << " -> " << (cap ? "yes" : "NO")
2239 << dendl;
2240 return cap;
2241 }
2242
2243 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
2244 {
2245 lock();
2246 if (pg_has_reset_since(epoch)) {
2247 dout(10) << "_activate_committed " << epoch
2248 << ", that was an old interval" << dendl;
2249 } else if (is_primary()) {
2250 ceph_assert(!peer_activated.count(pg_whoami));
2251 peer_activated.insert(pg_whoami);
2252 dout(10) << "_activate_committed " << epoch
2253 << " peer_activated now " << peer_activated
2254 << " last_interval_started " << info.history.last_interval_started
2255 << " last_epoch_started " << info.history.last_epoch_started
2256 << " same_interval_since " << info.history.same_interval_since << dendl;
2257 ceph_assert(!acting_recovery_backfill.empty());
2258 if (peer_activated.size() == acting_recovery_backfill.size())
2259 all_activated_and_committed();
2260 } else {
2261 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
2262 MOSDPGInfo *m = new MOSDPGInfo(epoch);
2263 pg_notify_t i = pg_notify_t(
2264 get_primary().shard, pg_whoami.shard,
2265 get_osdmap_epoch(),
2266 get_osdmap_epoch(),
2267 info);
2268
2269 i.info.history.last_epoch_started = activation_epoch;
2270 i.info.history.last_interval_started = i.info.history.same_interval_since;
2271 if (acting.size() >= pool.info.min_size) {
2272 state_set(PG_STATE_ACTIVE);
2273 } else {
2274 state_set(PG_STATE_PEERED);
2275 }
2276
2277 m->pg_list.push_back(make_pair(i, PastIntervals()));
2278 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap_epoch());
2279
2280 // waiters
2281 if (flushes_in_progress == 0) {
2282 requeue_ops(waiting_for_peered);
2283 } else if (!waiting_for_peered.empty()) {
2284 dout(10) << __func__ << " flushes in progress, moving "
2285 << waiting_for_peered.size() << " items to waiting_for_flush"
2286 << dendl;
2287 ceph_assert(waiting_for_flush.empty());
2288 waiting_for_flush.swap(waiting_for_peered);
2289 }
2290 }
2291
2292 ceph_assert(!dirty_info);
2293
2294 unlock();
2295 }
2296
2297 /*
2298 * update info.history.last_epoch_started ONLY after we and all
2299 * replicas have activated AND committed the activate transaction
2300 * (i.e. the peering results are stable on disk).
2301 */
2302 void PG::all_activated_and_committed()
2303 {
2304 dout(10) << "all_activated_and_committed" << dendl;
2305 ceph_assert(is_primary());
2306 ceph_assert(peer_activated.size() == acting_recovery_backfill.size());
2307 ceph_assert(!acting_recovery_backfill.empty());
2308 ceph_assert(blocked_by.empty());
2309
2310 // Degraded?
2311 _update_calc_stats();
2312 if (info.stats.stats.sum.num_objects_degraded) {
2313 state_set(PG_STATE_DEGRADED);
2314 } else {
2315 state_clear(PG_STATE_DEGRADED);
2316 }
2317
2318 queue_peering_event(
2319 PGPeeringEventRef(
2320 std::make_shared<PGPeeringEvent>(
2321 get_osdmap_epoch(),
2322 get_osdmap_epoch(),
2323 AllReplicasActivated())));
2324 }
2325
2326 bool PG::requeue_scrub(bool high_priority)
2327 {
2328 ceph_assert(is_locked());
2329 if (scrub_queued) {
2330 dout(10) << __func__ << ": already queued" << dendl;
2331 return false;
2332 } else {
2333 dout(10) << __func__ << ": queueing" << dendl;
2334 scrub_queued = true;
2335 osd->queue_for_scrub(this, high_priority);
2336 return true;
2337 }
2338 }
2339
2340 void PG::queue_recovery()
2341 {
2342 if (!is_primary() || !is_peered()) {
2343 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2344 ceph_assert(!recovery_queued);
2345 } else if (recovery_queued) {
2346 dout(10) << "queue_recovery -- already queued" << dendl;
2347 } else {
2348 dout(10) << "queue_recovery -- queuing" << dendl;
2349 recovery_queued = true;
2350 osd->queue_for_recovery(this);
2351 }
2352 }
2353
2354 bool PG::queue_scrub()
2355 {
2356 ceph_assert(is_locked());
2357 if (is_scrubbing()) {
2358 return false;
2359 }
2360 // An interrupted recovery repair could leave this set.
2361 state_clear(PG_STATE_REPAIR);
2362 scrubber.priority = scrubber.must_scrub ?
2363 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2364 scrubber.must_scrub = false;
2365 state_set(PG_STATE_SCRUBBING);
2366 if (scrubber.must_deep_scrub) {
2367 state_set(PG_STATE_DEEP_SCRUB);
2368 scrubber.must_deep_scrub = false;
2369 }
2370 if (scrubber.must_repair || scrubber.auto_repair) {
2371 state_set(PG_STATE_REPAIR);
2372 scrubber.must_repair = false;
2373 }
2374 requeue_scrub();
2375 return true;
2376 }
2377
2378 unsigned PG::get_scrub_priority()
2379 {
2380 // a higher value -> a higher priority
2381 int64_t pool_scrub_priority = 0;
2382 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2383 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2384 }
2385
2386 void PG::try_mark_clean()
2387 {
2388 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2389 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2390 state_set(PG_STATE_CLEAN);
2391 info.history.last_epoch_clean = get_osdmap_epoch();
2392 info.history.last_interval_clean = info.history.same_interval_since;
2393 past_intervals.clear();
2394 dirty_big_info = true;
2395 dirty_info = true;
2396 }
2397
2398 if (is_active()) {
2399 kick_snap_trim();
2400 } else if (is_peered()) {
2401 if (is_clean()) {
2402 bool target;
2403 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2404 if (target) {
2405 ldout(cct, 10) << "ready to merge (target)" << dendl;
2406 osd->set_ready_to_merge_target(this,
2407 info.last_update,
2408 info.history.last_epoch_started,
2409 info.history.last_epoch_clean);
2410 } else {
2411 ldout(cct, 10) << "ready to merge (source)" << dendl;
2412 osd->set_ready_to_merge_source(this, info.last_update);
2413 }
2414 }
2415 } else {
2416 ldout(cct, 10) << "not clean, not ready to merge" << dendl;
2417 // we should have notified OSD in Active state entry point
2418 }
2419 }
2420
2421 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2422
2423 share_pg_info();
2424 publish_stats_to_osd();
2425 requeue_ops(waiting_for_clean_to_primary_repair);
2426 }
2427
2428 bool PG::set_force_recovery(bool b)
2429 {
2430 bool did = false;
2431 if (b) {
2432 if (!(state & PG_STATE_FORCED_RECOVERY) &&
2433 (state & (PG_STATE_DEGRADED |
2434 PG_STATE_RECOVERY_WAIT |
2435 PG_STATE_RECOVERING))) {
2436 dout(20) << __func__ << " set" << dendl;
2437 state_set(PG_STATE_FORCED_RECOVERY);
2438 publish_stats_to_osd();
2439 did = true;
2440 }
2441 } else if (state & PG_STATE_FORCED_RECOVERY) {
2442 dout(20) << __func__ << " clear" << dendl;
2443 state_clear(PG_STATE_FORCED_RECOVERY);
2444 publish_stats_to_osd();
2445 did = true;
2446 }
2447 if (did) {
2448 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2449 osd->local_reserver.update_priority(info.pgid, get_recovery_priority());
2450 }
2451 return did;
2452 }
2453
2454 bool PG::set_force_backfill(bool b)
2455 {
2456 bool did = false;
2457 if (b) {
2458 if (!(state & PG_STATE_FORCED_BACKFILL) &&
2459 (state & (PG_STATE_DEGRADED |
2460 PG_STATE_BACKFILL_WAIT |
2461 PG_STATE_BACKFILLING))) {
2462 dout(10) << __func__ << " set" << dendl;
2463 state_set(PG_STATE_FORCED_BACKFILL);
2464 publish_stats_to_osd();
2465 did = true;
2466 }
2467 } else if (state & PG_STATE_FORCED_BACKFILL) {
2468 dout(10) << __func__ << " clear" << dendl;
2469 state_clear(PG_STATE_FORCED_BACKFILL);
2470 publish_stats_to_osd();
2471 did = true;
2472 }
2473 if (did) {
2474 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2475 osd->local_reserver.update_priority(info.pgid, get_backfill_priority());
2476 }
2477 return did;
2478 }
2479
2480 int PG::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
2481 {
2482 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2483 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2484
2485 ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
2486
2487 // User can't set this too high anymore, but might be a legacy value
2488 if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
2489 pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
2490 if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
2491 pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
2492 // Shift the range from [min, max] to [0, max - min]
2493 pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
2494 ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
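// e.g. (hypothetical values) with OSD_POOL_PRIORITY_MIN = -10 and a pool
// priority of -4, pool_recovery_priority becomes 6 after the shift; it is then
// added to `priority` and the sum is clamped to [OSD_RECOVERY_PRIORITY_MIN, max]
// below.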
2495
2496 priority += pool_recovery_priority;
2497
2498 // Clamp to valid range
2499 if (priority > max) {
2500 return max;
2501 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2502 return OSD_RECOVERY_PRIORITY_MIN;
2503 } else {
2504 return priority;
2505 }
2506 }
2507
2508 unsigned PG::get_recovery_priority()
2509 {
2510 // a higher value -> a higher priority
2511 int ret = OSD_RECOVERY_PRIORITY_BASE;
2512 int base = ret;
2513
2514 if (state & PG_STATE_FORCED_RECOVERY) {
2515 ret = OSD_RECOVERY_PRIORITY_FORCED;
2516 } else {
2517 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
2518 if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
2519 base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
2520 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2521 ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
2522 }
2523
2524 int64_t pool_recovery_priority = 0;
2525 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2526
2527 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2528 }
2529 dout(20) << __func__ << " recovery priority is " << ret << dendl;
2530 return static_cast<unsigned>(ret);
2531 }
2532
2533 unsigned PG::get_backfill_priority()
2534 {
2535 // a higher value -> a higher priority
2536 int ret = OSD_BACKFILL_PRIORITY_BASE;
2537 int base = ret;
2538
2539 if (state & PG_STATE_FORCED_BACKFILL) {
2540 ret = OSD_BACKFILL_PRIORITY_FORCED;
2541 } else {
2542 if (acting.size() < pool.info.min_size) {
2543 base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
2544 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2545 ret = base + (pool.info.min_size - acting.size());
2546
2547 } else if (is_undersized()) {
2548 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2549 ceph_assert(pool.info.size > actingset.size());
2550 base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2551 ret = base + (pool.info.size - actingset.size());
2552
2553 } else if (is_degraded()) {
2554 // degraded: baseline degraded
2555 base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2556 }
2557
2558 // Adjust with pool's recovery priority
2559 int64_t pool_recovery_priority = 0;
2560 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2561
2562 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2563 }
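// Worked example (hypothetical sizes): a replicated pool with size=3, min_size=2
// and only 2 OSDs in actingset is undersized but not inactive, so
// ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (3 - 2), then adjusted by the
// pool's recovery_priority and clamped via clamp_recovery_priority() above.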
2564
2565 dout(20) << __func__ << " backfill priority is " << ret << dendl;
2566 return static_cast<unsigned>(ret);
2567 }
2568
2569 unsigned PG::get_delete_priority()
2570 {
2571 auto state = get_osdmap()->get_state(osd->whoami);
2572 if (state & (CEPH_OSD_BACKFILLFULL |
2573 CEPH_OSD_FULL)) {
2574 return OSD_DELETE_PRIORITY_FULL;
2575 } else if (state & CEPH_OSD_NEARFULL) {
2576 return OSD_DELETE_PRIORITY_FULLISH;
2577 } else {
2578 return OSD_DELETE_PRIORITY_NORMAL;
2579 }
2580 }
2581
2582 Context *PG::finish_recovery()
2583 {
2584 dout(10) << "finish_recovery" << dendl;
2585 ceph_assert(info.last_complete == info.last_update);
2586
2587 clear_recovery_state();
2588
2589 /*
2590 * sync all this before purging strays. but don't block!
2591 */
2592 finish_sync_event = new C_PG_FinishRecovery(this);
2593 return finish_sync_event;
2594 }
2595
2596 void PG::_finish_recovery(Context *c)
2597 {
2598 lock();
2599 // When recovery is initiated by a repair, that flag is left on
2600 state_clear(PG_STATE_REPAIR);
2601 if (deleting) {
2602 unlock();
2603 return;
2604 }
2605 if (c == finish_sync_event) {
2606 dout(10) << "_finish_recovery" << dendl;
2607 finish_sync_event = 0;
2608 purge_strays();
2609
2610 publish_stats_to_osd();
2611
2612 if (scrub_after_recovery) {
2613 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2614 scrub_after_recovery = false;
2615 scrubber.must_deep_scrub = true;
2616 scrubber.check_repair = true;
2617 queue_scrub();
2618 }
2619 } else {
2620 dout(10) << "_finish_recovery -- stale" << dendl;
2621 }
2622 unlock();
2623 }
2624
2625 void PG::start_recovery_op(const hobject_t& soid)
2626 {
2627 dout(10) << "start_recovery_op " << soid
2628 #ifdef DEBUG_RECOVERY_OIDS
2629 << " (" << recovering_oids << ")"
2630 #endif
2631 << dendl;
2632 ceph_assert(recovery_ops_active >= 0);
2633 recovery_ops_active++;
2634 #ifdef DEBUG_RECOVERY_OIDS
2635 recovering_oids.insert(soid);
2636 #endif
2637 osd->start_recovery_op(this, soid);
2638 }
2639
2640 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2641 {
2642 dout(10) << "finish_recovery_op " << soid
2643 #ifdef DEBUG_RECOVERY_OIDS
2644 << " (" << recovering_oids << ")"
2645 #endif
2646 << dendl;
2647 ceph_assert(recovery_ops_active > 0);
2648 recovery_ops_active--;
2649 #ifdef DEBUG_RECOVERY_OIDS
2650 ceph_assert(recovering_oids.count(soid));
2651 recovering_oids.erase(recovering_oids.find(soid));
2652 #endif
2653 osd->finish_recovery_op(this, soid, dequeue);
2654
2655 if (!dequeue) {
2656 queue_recovery();
2657 }
2658 }
2659
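// Hypothetical example of a split: when a pool's pg_num doubles from 8 to 16,
// pg 1.3 (bits 0011) keeps roughly half of its objects and a new child pg 1.b
// (bits 1011) gets the other half; split_bits would then be 4, the new number
// of significant hash bits.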
2660 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2661 {
2662 child->update_snap_mapper_bits(split_bits);
2663 child->update_osdmap_ref(get_osdmap());
2664
2665 child->pool = pool;
2666
2667 // Log
2668 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2669 child->info.last_complete = info.last_complete;
2670
2671 info.last_update = pg_log.get_head();
2672 child->info.last_update = child->pg_log.get_head();
2673
2674 child->info.last_user_version = info.last_user_version;
2675
2676 info.log_tail = pg_log.get_tail();
2677 child->info.log_tail = child->pg_log.get_tail();
2678
2679 // reset last_complete, we might have modified pg_log & missing above
2680 pg_log.reset_complete_to(&info);
2681 child->pg_log.reset_complete_to(&child->info);
2682
2683 // Info
2684 child->info.history = info.history;
2685 child->info.history.epoch_created = get_osdmap_epoch();
2686 child->info.purged_snaps = info.purged_snaps;
2687
2688 if (info.last_backfill.is_max()) {
2689 child->info.set_last_backfill(hobject_t::get_max());
2690 } else {
2691 // restart backfill on parent and child to be safe. we could
2692 // probably do better in the bitwise sort case, but it's more
2693 // fragile (there may be special work to do on backfill completion
2694 // in the future).
2695 info.set_last_backfill(hobject_t());
2696 child->info.set_last_backfill(hobject_t());
2697 // restarting backfill implies that the missing set is empty,
2698 // since it is only used for objects prior to last_backfill
2699 pg_log.reset_backfill();
2700 child->pg_log.reset_backfill();
2701 }
2702
2703 child->info.stats = info.stats;
2704 child->info.stats.parent_split_bits = split_bits;
2705 info.stats.stats_invalid = true;
2706 child->info.stats.stats_invalid = true;
2707 child->info.last_epoch_started = info.last_epoch_started;
2708 child->info.last_interval_started = info.last_interval_started;
2709
2710 child->snap_trimq = snap_trimq;
2711
2712 // There can't be recovery/backfill going on now
2713 int primary, up_primary;
2714 vector<int> newup, newacting;
2715 get_osdmap()->pg_to_up_acting_osds(
2716 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2717 child->init_primary_up_acting(
2718 newup,
2719 newacting,
2720 up_primary,
2721 primary);
2722 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2723
2724 // this comparison includes primary rank via pg_shard_t
2725 if (get_primary() != child->get_primary())
2726 child->info.history.same_primary_since = get_osdmap_epoch();
2727
2728 child->info.stats.up = up;
2729 child->info.stats.up_primary = up_primary;
2730 child->info.stats.acting = acting;
2731 child->info.stats.acting_primary = primary;
2732 child->info.stats.mapping_epoch = get_osdmap_epoch();
2733
2734 // History
2735 child->past_intervals = past_intervals;
2736
2737 _split_into(child_pgid, child, split_bits);
2738
2739 // release all backoffs for simplicity
2740 release_backoffs(hobject_t(), hobject_t::get_max());
2741
2742 child->on_new_interval();
2743
2744 child->send_notify = !child->is_primary();
2745
2746 child->dirty_info = true;
2747 child->dirty_big_info = true;
2748 dirty_info = true;
2749 dirty_big_info = true;
2750 }
2751
2752 void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
2753 {
2754 out->resize(childpgs.size() + 1);
2755 info.stats.stats.sum.split(*out);
2756 }
2757
2758 void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t)
2759 {
2760 info.stats.stats.sum = stats;
2761 write_if_dirty(*t);
2762 }
2763
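// Hypothetical example of a merge: when a pool's pg_num drops from 16 back to 8,
// source pg 1.b is merged into target pg 1.3 (its parent); `sources` would then
// contain 1.b and last_pg_merge_meta.source_pgid would be 1.b as well.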
2764 void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
2765 unsigned split_bits,
2766 const pg_merge_meta_t& last_pg_merge_meta)
2767 {
2768 dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
2769 << dendl;
2770 bool incomplete = false;
2771 if (info.last_complete != info.last_update ||
2772 info.is_incomplete() ||
2773 info.dne()) {
2774 dout(10) << __func__ << " target incomplete" << dendl;
2775 incomplete = true;
2776 }
2777 if (last_pg_merge_meta.source_pgid != pg_t()) {
2778 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2779 dout(10) << __func__ << " target doesn't match expected parent "
2780 << last_pg_merge_meta.source_pgid.get_parent()
2781 << " of source_pgid " << last_pg_merge_meta.source_pgid
2782 << dendl;
2783 incomplete = true;
2784 }
2785 if (info.last_update != last_pg_merge_meta.target_version) {
2786 dout(10) << __func__ << " target version doesn't match expected "
2787 << last_pg_merge_meta.target_version << dendl;
2788 incomplete = true;
2789 }
2790 }
2791
2792 PGLogEntryHandler handler{this, rctx->transaction};
2793 pg_log.roll_forward(&handler);
2794
2795 info.last_complete = info.last_update; // to fake out trim()
2796 pg_log.reset_recovery_pointers();
2797 pg_log.trim(info.last_update, info);
2798
2799 vector<PGLog*> log_from;
2800 for (auto& i : sources) {
2801 auto& source = i.second;
2802 if (!source) {
2803 dout(10) << __func__ << " source " << i.first << " missing" << dendl;
2804 incomplete = true;
2805 continue;
2806 }
2807 if (source->info.last_complete != source->info.last_update ||
2808 source->info.is_incomplete() ||
2809 source->info.dne()) {
2810 dout(10) << __func__ << " source " << source->pg_id << " incomplete"
2811 << dendl;
2812 incomplete = true;
2813 }
2814 if (last_pg_merge_meta.source_pgid != pg_t()) {
2815 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
2816 dout(10) << __func__ << " source " << source->info.pgid.pgid
2817 << " doesn't match expected source pgid "
2818 << last_pg_merge_meta.source_pgid << dendl;
2819 incomplete = true;
2820 }
2821 if (source->info.last_update != last_pg_merge_meta.source_version) {
2822 dout(10) << __func__ << " source version doesn't match expected "
2823 << last_pg_merge_meta.source_version << dendl;
2824 incomplete = true;
2825 }
2826 }
2827
2828 // prepare log
2829 PGLogEntryHandler handler{source.get(), rctx->transaction};
2830 source->pg_log.roll_forward(&handler);
2831 source->info.last_complete = source->info.last_update; // to fake out trim()
2832 source->pg_log.reset_recovery_pointers();
2833 source->pg_log.trim(source->info.last_update, source->info);
2834 log_from.push_back(&source->pg_log);
2835
2836 // wipe out source's pgmeta
2837 rctx->transaction->remove(source->coll, source->pgmeta_oid);
2838
2839 // merge (and destroy source collection)
2840 rctx->transaction->merge_collection(source->coll, coll, split_bits);
2841
2842 // combine stats
2843 info.stats.add(source->info.stats);
2844
2845 // pull up last_update
2846 info.last_update = std::max(info.last_update, source->info.last_update);
2847
2848 // adopt source's PastIntervals if target has none. we can do this since
2849 // pgp_num has been reduced prior to the merge, so the OSD mappings for
2850 // the PGs are identical.
2851 if (past_intervals.empty() && !source->past_intervals.empty()) {
2852 dout(10) << __func__ << " taking source's past_intervals" << dendl;
2853 past_intervals = source->past_intervals;
2854 }
2855 }
2856
2857 // merge_collection does this, but maybe all of our sources were missing.
2858 rctx->transaction->collection_set_bits(coll, split_bits);
2859
2860 info.last_complete = info.last_update;
2861 info.log_tail = info.last_update;
2862 if (incomplete) {
2863 info.last_backfill = hobject_t();
2864 }
2865
2866 snap_mapper.update_bits(split_bits);
2867
2868 // merge logs
2869 pg_log.merge_from(log_from, info.last_update);
2870
2871 // make sure we have a meaningful last_epoch_started/clean (if we were a
2872 // placeholder)
2873 if (info.last_epoch_started == 0) {
2874 // start with (a) source's history, since these PGs *should* have been
2875 // remapped in concert with each other...
2876 info.history = sources.begin()->second->info.history;
2877
2878 // we use the last_epoch_{started,clean} we got from
2879 // the caller, which are the epochs that were reported when the PGs were
2880 // found to be ready for merge.
2881 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
2882 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2883 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2884 dout(10) << __func__
2885 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
2886 << last_pg_merge_meta.last_epoch_clean
2887 << " from pool last_dec_*, source pg history was "
2888 << sources.begin()->second->info.history
2889 << dendl;
2890
2891 // if the past_intervals start is later than last_epoch_clean, it
2892 // implies the source repeered again but the target didn't, or
2893 // that the source became clean in a later epoch than the target.
2894 // avoid the discrepancy by adjusting the interval start
2895 // backwards to match so that check_past_interval_bounds() will
2896 // not complain.
2897 auto pib = past_intervals.get_bounds();
2898 if (info.history.last_epoch_clean < pib.first) {
2899 dout(10) << __func__ << " last_epoch_clean "
2900 << info.history.last_epoch_clean << " < past_interval start "
2901 << pib.first << ", adjusting start backwards" << dendl;
2902 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
2903 }
2904
2905 // Similarly, if the same_interval_since value is later than
2906 // last_epoch_clean, the next interval change will result in a
2907 // past_interval start that is later than last_epoch_clean. This
2908 // can happen if we use the pg_history values from the merge
2909 // source. Adjust the same_interval_since value backwards if that
2910 // happens. (We trust the les and lec values more because they came from
2911 // the real target, whereas the history value we stole from the source.)
2912 if (info.history.last_epoch_started < info.history.same_interval_since) {
2913 dout(10) << __func__ << " last_epoch_started "
2914 << info.history.last_epoch_started << " < same_interval_since "
2915 << info.history.same_interval_since
2916 << ", adjusting pg_history backwards" << dendl;
2917 info.history.same_interval_since = info.history.last_epoch_clean;
2918 // make sure same_{up,primary}_since are <= same_interval_since
2919 info.history.same_up_since = std::min(
2920 info.history.same_up_since, info.history.same_interval_since);
2921 info.history.same_primary_since = std::min(
2922 info.history.same_primary_since, info.history.same_interval_since);
2923 }
2924 }
2925
2926 dirty_info = true;
2927 dirty_big_info = true;
2928 }
2929
2930 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2931 {
2932 ConnectionRef con = s->con;
2933 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2934 return;
2935 BackoffRef b(s->have_backoff(info.pgid, begin));
2936 if (b) {
2937 derr << __func__ << " already have backoff for " << s << " begin " << begin
2938 << " " << *b << dendl;
2939 ceph_abort();
2940 }
2941 std::lock_guard l(backoff_lock);
2942 {
2943 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2944 backoffs[begin].insert(b);
2945 s->add_backoff(b);
2946 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2947 }
2948 con->send_message(
2949 new MOSDBackoff(
2950 info.pgid,
2951 get_osdmap_epoch(),
2952 CEPH_OSD_BACKOFF_OP_BLOCK,
2953 b->id,
2954 begin,
2955 end));
2956 }
2957
2958 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2959 {
2960 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2961 vector<BackoffRef> bv;
2962 {
2963 std::lock_guard l(backoff_lock);
2964 auto p = backoffs.lower_bound(begin);
2965 while (p != backoffs.end()) {
2966 int r = cmp(p->first, end);
2967 dout(20) << __func__ << " ? " << r << " " << p->first
2968 << " " << p->second << dendl;
2969 // note: must still examine begin=end=p->first case
2970 if (r > 0 || (r == 0 && begin < end)) {
2971 break;
2972 }
2973 dout(20) << __func__ << " checking " << p->first
2974 << " " << p->second << dendl;
2975 auto q = p->second.begin();
2976 while (q != p->second.end()) {
2977 dout(20) << __func__ << " checking " << *q << dendl;
2978 int r = cmp((*q)->begin, begin);
2979 if (r == 0 || (r > 0 && (*q)->end < end)) {
2980 bv.push_back(*q);
2981 q = p->second.erase(q);
2982 } else {
2983 ++q;
2984 }
2985 }
2986 if (p->second.empty()) {
2987 p = backoffs.erase(p);
2988 } else {
2989 ++p;
2990 }
2991 }
2992 }
2993 for (auto b : bv) {
2994 std::lock_guard l(b->lock);
2995 dout(10) << __func__ << " " << *b << dendl;
2996 if (b->session) {
2997 ceph_assert(b->pg == this);
2998 ConnectionRef con = b->session->con;
2999 if (con) { // OSD::ms_handle_reset clears s->con without a lock
3000 con->send_message(
3001 new MOSDBackoff(
3002 info.pgid,
3003 get_osdmap_epoch(),
3004 CEPH_OSD_BACKOFF_OP_UNBLOCK,
3005 b->id,
3006 b->begin,
3007 b->end));
3008 }
3009 if (b->is_new()) {
3010 b->state = Backoff::STATE_DELETING;
3011 } else {
3012 b->session->rm_backoff(b);
3013 b->session.reset();
3014 }
3015 b->pg.reset();
3016 }
3017 }
3018 }
3019
3020 void PG::clear_backoffs()
3021 {
3022 dout(10) << __func__ << " " << dendl;
3023 map<hobject_t,set<BackoffRef>> ls;
3024 {
3025 std::lock_guard l(backoff_lock);
3026 ls.swap(backoffs);
3027 }
3028 for (auto& p : ls) {
3029 for (auto& b : p.second) {
3030 std::lock_guard l(b->lock);
3031 dout(10) << __func__ << " " << *b << dendl;
3032 if (b->session) {
3033 ceph_assert(b->pg == this);
3034 if (b->is_new()) {
3035 b->state = Backoff::STATE_DELETING;
3036 } else {
3037 b->session->rm_backoff(b);
3038 b->session.reset();
3039 }
3040 b->pg.reset();
3041 }
3042 }
3043 }
3044 }
3045
3046 // called by Session::clear_backoffs()
3047 void PG::rm_backoff(BackoffRef b)
3048 {
3049 dout(10) << __func__ << " " << *b << dendl;
3050 std::lock_guard l(backoff_lock);
3051 ceph_assert(b->lock.is_locked_by_me());
3052 ceph_assert(b->pg == this);
3053 auto p = backoffs.find(b->begin);
3054 // may race with release_backoffs()
3055 if (p != backoffs.end()) {
3056 auto q = p->second.find(b);
3057 if (q != p->second.end()) {
3058 p->second.erase(q);
3059 if (p->second.empty()) {
3060 backoffs.erase(p);
3061 }
3062 }
3063 }
3064 }
3065
3066 void PG::clear_recovery_state()
3067 {
3068 dout(10) << "clear_recovery_state" << dendl;
3069
3070 pg_log.reset_recovery_pointers();
3071 finish_sync_event = 0;
3072
3073 hobject_t soid;
3074 while (recovery_ops_active > 0) {
3075 #ifdef DEBUG_RECOVERY_OIDS
3076 soid = *recovering_oids.begin();
3077 #endif
3078 finish_recovery_op(soid, true);
3079 }
3080
3081 async_recovery_targets.clear();
3082 backfill_targets.clear();
3083 backfill_info.clear();
3084 peer_backfill_info.clear();
3085 waiting_on_backfill.clear();
3086 _clear_recovery_state(); // pg impl specific hook
3087 }
3088
3089 void PG::cancel_recovery()
3090 {
3091 dout(10) << "cancel_recovery" << dendl;
3092 clear_recovery_state();
3093 }
3094
3095
3096 void PG::purge_strays()
3097 {
3098 if (is_premerge()) {
3099 dout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
3100 << dendl;
3101 return;
3102 }
3103 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
3104 return;
3105 }
3106 dout(10) << "purge_strays " << stray_set << dendl;
3107
3108 bool removed = false;
3109 for (set<pg_shard_t>::iterator p = stray_set.begin();
3110 p != stray_set.end();
3111 ++p) {
3112 ceph_assert(!is_acting_recovery_backfill(*p));
3113 if (get_osdmap()->is_up(p->osd)) {
3114 dout(10) << "sending PGRemove to osd." << *p << dendl;
3115 vector<spg_t> to_remove;
3116 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
3117 MOSDPGRemove *m = new MOSDPGRemove(
3118 get_osdmap_epoch(),
3119 to_remove);
3120 osd->send_message_osd_cluster(p->osd, m, get_osdmap_epoch());
3121 } else {
3122 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
3123 }
3124 peer_missing.erase(*p);
3125 peer_info.erase(*p);
3126 peer_purged.insert(*p);
3127 removed = true;
3128 }
3129
3130 // if we removed anyone, update peers (which include peer_info)
3131 if (removed)
3132 update_heartbeat_peers();
3133
3134 stray_set.clear();
3135
3136 // clear _requested maps; we may have to peer() again if we discover
3137 // (more) stray content
3138 peer_log_requested.clear();
3139 peer_missing_requested.clear();
3140 }
3141
3142 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
3143 {
3144 std::lock_guard l(heartbeat_peer_lock);
3145 probe_targets.clear();
3146 for (set<pg_shard_t>::iterator i = probe_set.begin();
3147 i != probe_set.end();
3148 ++i) {
3149 probe_targets.insert(i->osd);
3150 }
3151 }
3152
3153 void PG::clear_probe_targets()
3154 {
3155 std::lock_guard l(heartbeat_peer_lock);
3156 probe_targets.clear();
3157 }
3158
3159 void PG::update_heartbeat_peers()
3160 {
3161 ceph_assert(is_locked());
3162
3163 if (!is_primary())
3164 return;
3165
3166 set<int> new_peers;
3167 for (unsigned i=0; i<acting.size(); i++) {
3168 if (acting[i] != CRUSH_ITEM_NONE)
3169 new_peers.insert(acting[i]);
3170 }
3171 for (unsigned i=0; i<up.size(); i++) {
3172 if (up[i] != CRUSH_ITEM_NONE)
3173 new_peers.insert(up[i]);
3174 }
3175 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
3176 p != peer_info.end();
3177 ++p)
3178 new_peers.insert(p->first.osd);
3179
3180 bool need_update = false;
3181 heartbeat_peer_lock.Lock();
3182 if (new_peers == heartbeat_peers) {
3183 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
3184 } else {
3185 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
3186 heartbeat_peers.swap(new_peers);
3187 need_update = true;
3188 }
3189 heartbeat_peer_lock.Unlock();
3190
3191 if (need_update)
3192 osd->need_heartbeat_peer_update();
3193 }
3194
3195
3196 bool PG::check_in_progress_op(
3197 const osd_reqid_t &r,
3198 eversion_t *version,
3199 version_t *user_version,
3200 int *return_code) const
3201 {
3202 return (
3203 projected_log.get_request(r, version, user_version, return_code) ||
3204 pg_log.get_log().get_request(r, version, user_version, return_code));
3205 }
3206
3207 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3208 {
3209 for (auto&p : pgs)
3210 if (p.shard == shard)
3211 return true;
3212 return false;
3213 }
3214
3215 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3216 {
3217 for (auto&p : pgs) {
3218 if (p == skip)
3219 continue;
3220 if (p.shard == shard)
3221 return p;
3222 }
3223 return pg_shard_t();
3224 }
3225
3226 void PG::_update_calc_stats()
3227 {
3228 info.stats.version = info.last_update;
3229 info.stats.created = info.history.epoch_created;
3230 info.stats.last_scrub = info.history.last_scrub;
3231 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3232 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3233 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3234 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3235 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3236
3237 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3238 info.stats.ondisk_log_size = info.stats.log_size;
3239 info.stats.log_start = pg_log.get_tail();
3240 info.stats.ondisk_log_start = pg_log.get_tail();
3241 info.stats.snaptrimq_len = snap_trimq.size();
3242
3243 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3244
3245 // In the rare case that upset is too large (usually transient), use it as the
3246 // target for the calculations below.
3247 unsigned target = std::max(num_shards, (unsigned)upset.size());
3248 // When undersized, actingset may be larger than upset if OSDs are out
3249 unsigned nrep = std::max(actingset.size(), upset.size());
3250 // calc num_object_copies
3251 info.stats.stats.calc_copies(std::max(target, nrep));
3252 info.stats.stats.sum.num_objects_degraded = 0;
3253 info.stats.stats.sum.num_objects_unfound = 0;
3254 info.stats.stats.sum.num_objects_misplaced = 0;
3255 info.stats.avail_no_missing.clear();
3256 info.stats.object_location_counts.clear();
3257
3258 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
3259 dout(20) << __func__ << " actingset " << actingset << " upset "
3260 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3261 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
3262
3263 ceph_assert(!acting_recovery_backfill.empty());
3264
3265 bool estimate = false;
3266
3267 // NOTE: we only generate degraded, misplaced and unfound
3268 // values for the summation, not individual stat categories.
3269 int64_t num_objects = info.stats.stats.sum.num_objects;
3270
3271 // Objects missing from up nodes, sorted by # objects.
3272 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3273 // Objects missing from nodes not in up, sorted by # objects
3274 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3275
3276 // Fill missing_target_objects/acting_source_objects
3277
3278 {
3279 int64_t missing;
3280
3281 // Primary first
3282 missing = pg_log.get_missing().num_missing();
3283 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3284 if (upset.count(pg_whoami)) {
3285 missing_target_objects.insert(make_pair(missing, pg_whoami));
3286 } else {
3287 acting_source_objects.insert(make_pair(missing, pg_whoami));
3288 }
3289 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3290 if (missing == 0)
3291 info.stats.avail_no_missing.push_back(pg_whoami);
3292 dout(20) << __func__ << " shard " << pg_whoami
3293 << " primary objects " << num_objects
3294 << " missing " << missing
3295 << dendl;
3296 }
3297
3298 // All other peers
3299 for (auto& peer : peer_info) {
3300 // Primary should not be in the peer_info, skip if it is.
3301 if (peer.first == pg_whoami) continue;
3302 int64_t missing = 0;
3303 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3304 // Backfill targets always track num_objects accurately;
3305 // all other peers track missing accurately.
3306 if (is_backfill_targets(peer.first)) {
3307 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3308 } else {
3309 if (peer_missing.count(peer.first)) {
3310 missing = peer_missing[peer.first].num_missing();
3311 } else {
3312 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
3313 if (is_recovering()) {
3314 estimate = true;
3315 }
3316 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3317 }
3318 }
3319 if (upset.count(peer.first)) {
3320 missing_target_objects.insert(make_pair(missing, peer.first));
3321 } else if (actingset.count(peer.first)) {
3322 acting_source_objects.insert(make_pair(missing, peer.first));
3323 }
3324 peer.second.stats.stats.sum.num_objects_missing = missing;
3325 if (missing == 0)
3326 info.stats.avail_no_missing.push_back(peer.first);
3327 dout(20) << __func__ << " shard " << peer.first
3328 << " objects " << peer_num_objects
3329 << " missing " << missing
3330 << dendl;
3331 }
3332
3333 // Compute object_location_counts
3334 for (auto& ml: missing_loc.get_missing_locs()) {
3335 info.stats.object_location_counts[ml.second]++;
3336 dout(30) << __func__ << " " << ml.first << " object_location_counts["
3337 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3338 << dendl;
3339 }
3340 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3341 if (not_missing) {
3342 // During recovery we know upset == actingset and is being populated
3343 // During backfill we know that all non-missing objects are in the actingset
3344 info.stats.object_location_counts[actingset] = not_missing;
3345 }
3346 dout(30) << __func__ << " object_location_counts["
3347 << upset << "]=" << info.stats.object_location_counts[upset]
3348 << dendl;
3349 dout(20) << __func__ << " object_location_counts "
3350 << info.stats.object_location_counts << dendl;
3351
3352 // A misplaced object is not stored on the correct OSD
3353 int64_t misplaced = 0;
3354 // a degraded object has fewer replicas or EC shards than the pool specifies.
3355 int64_t degraded = 0;
3356
3357 if (is_recovering()) {
3358 for (auto& sml: missing_loc.get_missing_by_count()) {
3359 for (auto& ml: sml.second) {
3360 int missing_shards;
3361 if (sml.first == shard_id_t::NO_SHARD) {
3362 dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
3363 missing_shards = (int)upset.size() - ml.first.up;
3364 } else {
3365 // Handle shards not even in upset below
3366 if (!find_shard(upset, sml.first))
3367 continue;
3368 missing_shards = std::max(0, 1 - ml.first.up);
3369 dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
3370 }
3371 int odegraded = ml.second * missing_shards;
3372 // Copies on other OSDs, but limited to how many could be degraded
3373 int more_osds = std::min(missing_shards, ml.first.other);
3374 int omisplaced = ml.second * more_osds;
3375 ceph_assert(omisplaced <= odegraded);
3376 odegraded -= omisplaced;
3377
3378 misplaced += omisplaced;
3379 degraded += odegraded;
3380 }
3381 }
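// Worked example (hypothetical, replicated pool): 10 objects each have copies
// on 2 of the 3 up OSDs plus 1 copy on a non-up OSD:
// missing_shards = 3 - 2 = 1, odegraded = 10 * 1 = 10,
// more_osds = min(1, 1) = 1, omisplaced = 10, so odegraded becomes 0;
// the loop above then adds 10 to misplaced and 0 to degraded.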
3382
3383 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
3384 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
3385
3386 // Handle undersized case
3387 if (pool.info.is_replicated()) {
3388 // Add degraded for missing targets (num_objects missing)
3389 ceph_assert(target >= upset.size());
3390 unsigned needed = target - upset.size();
3391 degraded += num_objects * needed;
3392 } else {
3393 for (unsigned i = 0 ; i < num_shards; ++i) {
3394 shard_id_t shard(i);
3395
3396 if (!find_shard(upset, shard)) {
3397 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3398
3399 if (pgs != pg_shard_t()) {
3400 int64_t missing;
3401
3402 if (pgs == pg_whoami)
3403 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3404 else
3405 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3406
3407 degraded += missing;
3408 misplaced += std::max((int64_t)0, num_objects - missing);
3409 } else {
3410 // No shard anywhere
3411 degraded += num_objects;
3412 }
3413 }
3414 }
3415 }
3416 goto out;
3417 }
3418
3419 // Handle undersized case
3420 if (pool.info.is_replicated()) {
3421 // Add to missing_target_objects
3422 ceph_assert(target >= missing_target_objects.size());
3423 unsigned needed = target - missing_target_objects.size();
3424 if (needed)
3425 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
3426 } else {
3427 for (unsigned i = 0 ; i < num_shards; ++i) {
3428 shard_id_t shard(i);
3429 bool found = false;
3430 for (const auto& t : missing_target_objects) {
3431 if (std::get<1>(t).shard == shard) {
3432 found = true;
3433 break;
3434 }
3435 }
3436 if (!found)
3437 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
3438 }
3439 }
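// e.g. a size-3 replicated pool with only two up targets inserts one
// NO_OSD placeholder carrying num_objects "missing" objects, which the
// loop below counts as degraded (or misplaced, if a spare acting source
// covers them); EC pools insert one placeholder per absent shard.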
3440
3441 for (const auto& item : missing_target_objects)
3442 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3443 for (const auto& item : acting_source_objects)
3444 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3445
3446 // Handle all objects not in missing for remapped
3447 // or backfill
3448 for (auto m = missing_target_objects.rbegin();
3449 m != missing_target_objects.rend(); ++m) {
3450
3451 int64_t extra_missing = -1;
3452
3453 if (pool.info.is_replicated()) {
3454 if (!acting_source_objects.empty()) {
3455 auto extra_copy = acting_source_objects.begin();
3456 extra_missing = std::get<0>(*extra_copy);
3457 acting_source_objects.erase(extra_copy);
3458 }
3459 } else { // Erasure coded
3460 // Use corresponding shard
3461 for (const auto& a : acting_source_objects) {
3462 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3463 extra_missing = std::get<0>(a);
3464 acting_source_objects.erase(a);
3465 break;
3466 }
3467 }
3468 }
3469
3470 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3471 // We don't know which of the objects on the target
3472 // are part of extra_missing, so assume they are all degraded.
3473 misplaced += std::get<0>(*m) - extra_missing;
3474 degraded += extra_missing;
3475 } else {
3476 // 1. extra_missing == -1: more targets than sources, so all are degraded
3477 // 2. extra_missing > std::get<0>(*m): some of the previously degraded
3478 //    extra_missing objects are now present on the target.
3479 degraded += std::get<0>(*m);
3480 }
3481 }
3482 // If there are still acting that haven't been accounted for
3483 // then they are misplaced
3484 for (const auto& a : acting_source_objects) {
3485 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3486 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
3487 misplaced += extra_misplaced;
3488 }
3489 out:
3490 // NOTE: Tests use these messages to verify this code
3491 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
3492 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "") << dendl;
3493
3494 info.stats.stats.sum.num_objects_degraded = degraded;
3495 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3496 info.stats.stats.sum.num_objects_misplaced = misplaced;
3497 }
3498 }
3499
3500 void PG::_update_blocked_by()
3501 {
3502 // set a max on the number of blocking peers we report. if we go
3503 // over, report a random subset. keep the result sorted.
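// This is selection sampling: at each step the current peer is skipped
// with probability skip/(skip+keep), so every blocked-by peer is equally
// likely to be reported and the output stays sorted.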
3504 unsigned keep = std::min<unsigned>(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3505 unsigned skip = blocked_by.size() - keep;
3506 info.stats.blocked_by.clear();
3507 info.stats.blocked_by.resize(keep);
3508 unsigned pos = 0;
3509 for (set<int>::iterator p = blocked_by.begin();
3510 p != blocked_by.end() && keep > 0;
3511 ++p) {
3512 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3513 --skip;
3514 } else {
3515 info.stats.blocked_by[pos++] = *p;
3516 --keep;
3517 }
3518 }
3519 }
3520
3521 void PG::publish_stats_to_osd()
3522 {
3523 if (!is_primary())
3524 return;
3525
3526 pg_stats_publish_lock.Lock();
3527
3528 if (info.stats.stats.sum.num_scrub_errors)
3529 state_set(PG_STATE_INCONSISTENT);
3530 else {
3531 state_clear(PG_STATE_INCONSISTENT);
3532 state_clear(PG_STATE_FAILED_REPAIR);
3533 }
3534
3535 utime_t now = ceph_clock_now();
3536 if (info.stats.state != state) {
3537 info.stats.last_change = now;
3538 // Optimistic estimate: if we just found out that a PG is inactive,
3539 // assume it was active until now.
3540 if (!(state & PG_STATE_ACTIVE) &&
3541 (info.stats.state & PG_STATE_ACTIVE))
3542 info.stats.last_active = now;
3543
3544 if ((state & PG_STATE_ACTIVE) &&
3545 !(info.stats.state & PG_STATE_ACTIVE))
3546 info.stats.last_became_active = now;
3547 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3548 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3549 info.stats.last_became_peered = now;
3550 info.stats.state = state;
3551 }
3552
3553 _update_calc_stats();
3554 if (info.stats.stats.sum.num_objects_degraded) {
3555 state_set(PG_STATE_DEGRADED);
3556 } else {
3557 state_clear(PG_STATE_DEGRADED);
3558 }
3559 _update_blocked_by();
3560
3561 pg_stat_t pre_publish = info.stats;
3562 pre_publish.stats.add(unstable_stats);
3563 utime_t cutoff = now;
3564 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3565
3566 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
3567 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3568 // because we don't want to make the pg_stat_t structures too expensive.
3569 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3570 unsigned num = 0;
3571 auto i = info.purged_snaps.begin();
3572 while (num < max && i != info.purged_snaps.end()) {
3573 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3574 ++num;
3575 ++i;
3576 }
3577 dout(20) << __func__ << " reporting purged_snaps "
3578 << pre_publish.purged_snaps << dendl;
3579 }
3580
3581 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3582 info.stats.last_fresh > cutoff) {
3583 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3584 << ": no change since " << info.stats.last_fresh << dendl;
3585 } else {
3586 // update our stat summary and timestamps
3587 info.stats.reported_epoch = get_osdmap_epoch();
3588 ++info.stats.reported_seq;
3589
3590 info.stats.last_fresh = now;
3591
3592 if (info.stats.state & PG_STATE_CLEAN)
3593 info.stats.last_clean = now;
3594 if (info.stats.state & PG_STATE_ACTIVE)
3595 info.stats.last_active = now;
3596 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3597 info.stats.last_peered = now;
3598 info.stats.last_unstale = now;
3599 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3600 info.stats.last_undegraded = now;
3601 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3602 info.stats.last_fullsized = now;
3603
3604 pg_stats_publish_valid = true;
3605 pg_stats_publish = pre_publish;
3606
3607 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3608 << ":" << pg_stats_publish.reported_seq << dendl;
3609 }
3610 pg_stats_publish_lock.Unlock();
3611 }
3612
3613 void PG::clear_publish_stats()
3614 {
3615 dout(15) << "clear_stats" << dendl;
3616 pg_stats_publish_lock.Lock();
3617 pg_stats_publish_valid = false;
3618 pg_stats_publish_lock.Unlock();
3619 }
3620
3621 /**
3622 * initialize a newly instantiated pg
3623 *
3624 * Initialize PG state, as when a PG is initially created, or when it
3625 * is first instantiated on the current node.
3626 *
3627 * @param role our role/rank
3628 * @param newup up set
3629 * @param newacting acting set
3630 * @param history pg history
3631 * @param pi past_intervals
3632 * @param backfill true if info should be marked as backfill
3633 * @param t transaction to write out our new state in
3634 */
3635 void PG::init(
3636 int role,
3637 const vector<int>& newup, int new_up_primary,
3638 const vector<int>& newacting, int new_acting_primary,
3639 const pg_history_t& history,
3640 const PastIntervals& pi,
3641 bool backfill,
3642 ObjectStore::Transaction *t)
3643 {
3644 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3645 << " history " << history
3646 << " past_intervals " << pi
3647 << dendl;
3648
3649 set_role(role);
3650 init_primary_up_acting(
3651 newup,
3652 newacting,
3653 new_up_primary,
3654 new_acting_primary);
3655
3656 info.history = history;
3657 past_intervals = pi;
3658
3659 info.stats.up = up;
3660 info.stats.up_primary = new_up_primary;
3661 info.stats.acting = acting;
3662 info.stats.acting_primary = new_acting_primary;
3663 info.stats.mapping_epoch = info.history.same_interval_since;
3664
3665 if (backfill) {
3666 dout(10) << __func__ << ": Setting backfill" << dendl;
3667 info.set_last_backfill(hobject_t());
3668 info.last_complete = info.last_update;
3669 pg_log.mark_log_for_rewrite();
3670 }
3671
3672 on_new_interval();
3673
3674 dirty_info = true;
3675 dirty_big_info = true;
3676 write_if_dirty(*t);
3677 }
3678
3679 void PG::shutdown()
3680 {
3681 ch->flush();
3682 lock();
3683 on_shutdown();
3684 unlock();
3685 }
3686
3687 #pragma GCC diagnostic ignored "-Wpragmas"
3688 #pragma GCC diagnostic push
3689 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3690
3691 void PG::upgrade(ObjectStore *store)
3692 {
3693 dout(0) << __func__ << " " << info_struct_v << " -> " << latest_struct_v
3694 << dendl;
3695 ceph_assert(info_struct_v <= 10);
3696 ObjectStore::Transaction t;
3697
3698 // <do upgrade steps here>
3699
3700 // finished upgrade!
3701 ceph_assert(info_struct_v == 10);
3702
3703 // update infover_key
3704 if (info_struct_v < latest_struct_v) {
3705 map<string,bufferlist> v;
3706 __u8 ver = latest_struct_v;
3707 encode(ver, v[infover_key]);
3708 t.omap_setkeys(coll, pgmeta_oid, v);
3709 }
3710
3711 dirty_info = true;
3712 dirty_big_info = true;
3713 write_if_dirty(t);
3714
3715 ObjectStore::CollectionHandle ch = store->open_collection(coll);
3716 int r = store->queue_transaction(ch, std::move(t));
3717 if (r != 0) {
3718 derr << __func__ << ": queue_transaction returned "
3719 << cpp_strerror(r) << dendl;
3720 ceph_abort();
3721 }
3722 ceph_assert(r == 0);
3723
3724 C_SaferCond waiter;
3725 if (!ch->flush_commit(&waiter)) {
3726 waiter.wait();
3727 }
3728 }
3729
3730 #pragma GCC diagnostic pop
3731 #pragma GCC diagnostic warning "-Wpragmas"
3732
3733 int PG::_prepare_write_info(CephContext* cct,
3734 map<string,bufferlist> *km,
3735 epoch_t epoch,
3736 pg_info_t &info, pg_info_t &last_written_info,
3737 PastIntervals &past_intervals,
3738 bool dirty_big_info,
3739 bool dirty_epoch,
3740 bool try_fast_info,
3741 PerfCounters *logger)
3742 {
3743 if (dirty_epoch) {
3744 encode(epoch, (*km)[epoch_key]);
3745 }
3746
3747 if (logger)
3748 logger->inc(l_osd_pg_info);
3749
3750 // can we write just the compact fast info instead of the full pg_info_t?
3751 if (!dirty_big_info && try_fast_info &&
3752 info.last_update > last_written_info.last_update) {
3753 pg_fast_info_t fast;
3754 fast.populate_from(info);
3755 bool did = fast.try_apply_to(&last_written_info);
3756 ceph_assert(did); // we verified last_update increased above
3757 if (info == last_written_info) {
3758 encode(fast, (*km)[fastinfo_key]);
3759 if (logger)
3760 logger->inc(l_osd_pg_fastinfo);
3761 return 0;
3762 }
3763 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3764 {
3765 JSONFormatter jf(true);
3766 jf.dump_object("info", info);
3767 jf.flush(*_dout);
3768 }
3769 {
3770 *_dout << "\nlast_written_info:\n";
3771 JSONFormatter jf(true);
3772 jf.dump_object("last_written_info", last_written_info);
3773 jf.flush(*_dout);
3774 }
3775 *_dout << dendl;
3776 }
3777 last_written_info = info;
3778
3779 // info. store purged_snaps separately.
3780 interval_set<snapid_t> purged_snaps;
3781 purged_snaps.swap(info.purged_snaps);
3782 encode(info, (*km)[info_key]);
3783 purged_snaps.swap(info.purged_snaps);
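// purged_snaps is swapped out so the frequently rewritten info_key value
// stays small; it is persisted with the biginfo below, and only when
// dirty_big_info is set.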
3784
3785 if (dirty_big_info) {
3786 // potentially big stuff
3787 bufferlist& bigbl = (*km)[biginfo_key];
3788 encode(past_intervals, bigbl);
3789 encode(info.purged_snaps, bigbl);
3790 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3791 if (logger)
3792 logger->inc(l_osd_pg_biginfo);
3793 }
3794
3795 return 0;
3796 }
3797
3798 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3799 {
3800 coll_t coll(pgid);
3801 t.create_collection(coll, bits);
3802 }
3803
3804 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3805 {
3806 coll_t coll(pgid);
3807
3808 if (pool) {
3809 // Give a hint to the PG collection
3810 bufferlist hint;
3811 uint32_t pg_num = pool->get_pg_num();
3812 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
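// e.g. expected_num_objects = 1000000 with pg_num = 256 hints roughly
// 3906 objects for this PG's collection.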
3813 encode(pg_num, hint);
3814 encode(expected_num_objects_pg, hint);
3815 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3816 t.collection_hint(coll, hint_type, hint);
3817 }
3818
3819 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3820 t.touch(coll, pgmeta_oid);
3821 map<string,bufferlist> values;
3822 __u8 struct_v = latest_struct_v;
3823 encode(struct_v, values[infover_key]);
3824 t.omap_setkeys(coll, pgmeta_oid, values);
3825 }
3826
3827 void PG::prepare_write_info(map<string,bufferlist> *km)
3828 {
3829 info.stats.stats.add(unstable_stats);
3830 unstable_stats.clear();
3831
3832 bool need_update_epoch = last_epoch < get_osdmap_epoch();
3833 int ret = _prepare_write_info(cct, km, get_osdmap_epoch(),
3834 info,
3835 last_written_info,
3836 past_intervals,
3837 dirty_big_info, need_update_epoch,
3838 cct->_conf->osd_fast_info,
3839 osd->logger);
3840 ceph_assert(ret == 0);
3841 if (need_update_epoch)
3842 last_epoch = get_osdmap_epoch();
3843 last_persisted_osdmap = last_epoch;
3844
3845 dirty_info = false;
3846 dirty_big_info = false;
3847 }
3848
3849 #pragma GCC diagnostic ignored "-Wpragmas"
3850 #pragma GCC diagnostic push
3851 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3852
3853 bool PG::_has_removal_flag(ObjectStore *store,
3854 spg_t pgid)
3855 {
3856 coll_t coll(pgid);
3857 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3858
3859 // first try new way
3860 set<string> keys;
3861 keys.insert("_remove");
3862 map<string,bufferlist> values;
3863 auto ch = store->open_collection(coll);
3864 ceph_assert(ch);
3865 if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
3866 values.size() == 1)
3867 return true;
3868
3869 return false;
3870 }
3871
3872 int PG::peek_map_epoch(ObjectStore *store,
3873 spg_t pgid,
3874 epoch_t *pepoch)
3875 {
3876 coll_t coll(pgid);
3877 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3878 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3879 epoch_t cur_epoch = 0;
3880
3881 // validate collection name
3882 ceph_assert(coll.is_pg());
3883
3884 // try for v8
3885 set<string> keys;
3886 keys.insert(infover_key);
3887 keys.insert(epoch_key);
3888 map<string,bufferlist> values;
3889 auto ch = store->open_collection(coll);
3890 ceph_assert(ch);
3891 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
3892 if (r == 0) {
3893 ceph_assert(values.size() == 2);
3894
3895 // sanity check version
3896 auto bp = values[infover_key].cbegin();
3897 __u8 struct_v = 0;
3898 decode(struct_v, bp);
3899 ceph_assert(struct_v >= 8);
3900
3901 // get epoch
3902 bp = values[epoch_key].begin();
3903 decode(cur_epoch, bp);
3904 } else {
3905 // probably bug 10617; see OSD::load_pgs()
3906 return -1;
3907 }
3908
3909 *pepoch = cur_epoch;
3910 return 0;
3911 }
3912
3913 #pragma GCC diagnostic pop
3914 #pragma GCC diagnostic warning "-Wpragmas"
3915
3916 void PG::write_if_dirty(ObjectStore::Transaction& t)
3917 {
3918 map<string,bufferlist> km;
3919 if (dirty_big_info || dirty_info)
3920 prepare_write_info(&km);
3921 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3922 if (!km.empty())
3923 t.omap_setkeys(coll, pgmeta_oid, km);
3924 }
3925
3926 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3927 {
3928 // raise last_complete only if we were previously up to date
3929 if (info.last_complete == info.last_update)
3930 info.last_complete = e.version;
3931
3932 // raise last_update.
3933 ceph_assert(e.version > info.last_update);
3934 info.last_update = e.version;
3935
3936 // raise user_version, if it increased (it may not have been bumped
3937 // by every logged update)
3938 if (e.user_version > info.last_user_version)
3939 info.last_user_version = e.user_version;
3940
3941 // log mutation
3942 pg_log.add(e, applied);
3943 dout(10) << "add_log_entry " << e << dendl;
3944 }
3945
3946
3947 void PG::append_log(
3948 const vector<pg_log_entry_t>& logv,
3949 eversion_t trim_to,
3950 eversion_t roll_forward_to,
3951 ObjectStore::Transaction &t,
3952 bool transaction_applied,
3953 bool async)
3954 {
3955 if (transaction_applied)
3956 update_snap_map(logv, t);
3957
3958 /* The primary has sent an info updating the history, but it may not
3959 * have arrived yet. We want to make sure that we cannot remember this
3960 * write without remembering that it happened in an interval which went
3961 * active in epoch history.last_epoch_started.
3962 */
3963 if (info.last_epoch_started != info.history.last_epoch_started) {
3964 info.history.last_epoch_started = info.last_epoch_started;
3965 }
3966 if (info.last_interval_started != info.history.last_interval_started) {
3967 info.history.last_interval_started = info.last_interval_started;
3968 }
3969 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3970
3971 PGLogEntryHandler handler{this, &t};
3972 if (!transaction_applied) {
3973 /* We must be a backfill or async recovery peer, so it's ok if we apply
3974 * out-of-turn since we won't be considered when
3975 * determining a min possible last_update.
3976 *
3977 * We skip_rollforward() here, which advances the crt without
3978 * doing an actual rollforward. This avoids cleaning up entries
3979 * from the backend, so we do not end up in a situation where the
3980 * object is deleted before we can _merge_object_divergent_entries().
3981 */
3982 pg_log.skip_rollforward();
3983 }
3984
3985 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3986 p != logv.end();
3987 ++p) {
3988 add_log_entry(*p, transaction_applied);
3989
3990 /* We don't want to leave the rollforward artifacts around
3991 * here past last_backfill. It's ok for the same reason as
3992 * above */
3993 if (transaction_applied &&
3994 p->soid > info.last_backfill) {
3995 pg_log.roll_forward(&handler);
3996 }
3997 }
3998 auto last = logv.rbegin();
3999 if (is_primary() && last != logv.rend()) {
4000 projected_log.skip_can_rollback_to_to_head();
4001 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
4002 }
4003
4004 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4005 pg_log.roll_forward_to(
4006 roll_forward_to,
4007 &handler);
4008 last_rollback_info_trimmed_to_applied = roll_forward_to;
4009 }
4010
4011 dout(10) << __func__ << " approx pg log length = "
4012 << pg_log.get_log().approx_size() << dendl;
4013 dout(10) << __func__ << " transaction_applied = "
4014 << transaction_applied << dendl;
4015 if (!transaction_applied || async)
4016 dout(10) << __func__ << " " << pg_whoami
4017 << " is async_recovery or backfill target" << dendl;
4018 pg_log.trim(trim_to, info, transaction_applied, async);
4019
4020 // update the local pg, pg log
4021 dirty_info = true;
4022 write_if_dirty(t);
4023 }
4024
4025 bool PG::check_log_for_corruption(ObjectStore *store)
4026 {
4027 /// TODO: this method needs to work with the omap log
4028 return true;
4029 }
4030
4031 //! Get the name we're going to save our corrupt pg log as
4032 std::string PG::get_corrupt_pg_log_name() const
4033 {
4034 const int MAX_BUF = 512;
4035 char buf[MAX_BUF];
4036 struct tm tm_buf;
4037 time_t my_time(time(NULL));
4038 const struct tm *t = localtime_r(&my_time, &tm_buf);
4039 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
4040 if (ret == 0) {
4041 dout(0) << "strftime failed" << dendl;
4042 return "corrupt_log_unknown_time";
4043 }
4044 string out(buf);
4045 out += stringify(info.pgid);
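// e.g. this yields something like "corrupt_log_2019-07-15_14:30_1.7"
// for pg 1.7 (illustrative values).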
4046 return out;
4047 }
4048
4049 int PG::read_info(
4050 ObjectStore *store, spg_t pgid, const coll_t &coll,
4051 pg_info_t &info, PastIntervals &past_intervals,
4052 __u8 &struct_v)
4053 {
4054 set<string> keys;
4055 keys.insert(infover_key);
4056 keys.insert(info_key);
4057 keys.insert(biginfo_key);
4058 keys.insert(fastinfo_key);
4059 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
4060 map<string,bufferlist> values;
4061 auto ch = store->open_collection(coll);
4062 ceph_assert(ch);
4063 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
4064 ceph_assert(r == 0);
4065 ceph_assert(values.size() == 3 ||
4066 values.size() == 4);
4067
4068 auto p = values[infover_key].cbegin();
4069 decode(struct_v, p);
4070 ceph_assert(struct_v >= 10);
4071
4072 p = values[info_key].begin();
4073 decode(info, p);
4074
4075 p = values[biginfo_key].begin();
4076 decode(past_intervals, p);
4077 decode(info.purged_snaps, p);
4078
4079 p = values[fastinfo_key].begin();
4080 if (!p.end()) {
4081 pg_fast_info_t fast;
4082 decode(fast, p);
4083 fast.try_apply_to(&info);
4084 }
4085 return 0;
4086 }
4087
4088 void PG::read_state(ObjectStore *store)
4089 {
4090 int r = read_info(store, pg_id, coll, info, past_intervals,
4091 info_struct_v);
4092 ceph_assert(r >= 0);
4093
4094 if (info_struct_v < compat_struct_v) {
4095 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4096 << " an older version first." << dendl;
4097 ceph_abort_msg("PG too old to upgrade");
4098 }
4099
4100 last_written_info = info;
4101
4102 ostringstream oss;
4103 pg_log.read_log_and_missing(
4104 store,
4105 ch,
4106 pgmeta_oid,
4107 info,
4108 oss,
4109 cct->_conf->osd_ignore_stale_divergent_priors,
4110 cct->_conf->osd_debug_verify_missing_on_start);
4111 if (oss.tellp())
4112 osd->clog->error() << oss.str();
4113
4114 // log any weirdness
4115 log_weirdness();
4116
4117 if (info_struct_v < latest_struct_v) {
4118 upgrade(store);
4119 }
4120
4121 // initialize current mapping
4122 {
4123 int primary, up_primary;
4124 vector<int> acting, up;
4125 get_osdmap()->pg_to_up_acting_osds(
4126 pg_id.pgid, &up, &up_primary, &acting, &primary);
4127 init_primary_up_acting(
4128 up,
4129 acting,
4130 up_primary,
4131 primary);
4132 int rr = OSDMap::calc_pg_role(osd->whoami, acting);
4133 if (pool.info.is_replicated() || rr == pg_whoami.shard)
4134 set_role(rr);
4135 else
4136 set_role(-1);
4137 }
4138
4139 PG::RecoveryCtx rctx(0, 0, 0, new ObjectStore::Transaction);
4140 handle_initialize(&rctx);
4141 // note: we don't activate here because we know the OSD will advance maps
4142 // during boot.
4143 write_if_dirty(*rctx.transaction);
4144 store->queue_transaction(ch, std::move(*rctx.transaction));
4145 delete rctx.transaction;
4146 }
4147
4148 void PG::log_weirdness()
4149 {
4150 if (pg_log.get_tail() != info.log_tail)
4151 osd->clog->error() << info.pgid
4152 << " info mismatch, log.tail " << pg_log.get_tail()
4153 << " != info.log_tail " << info.log_tail;
4154 if (pg_log.get_head() != info.last_update)
4155 osd->clog->error() << info.pgid
4156 << " info mismatch, log.head " << pg_log.get_head()
4157 << " != info.last_update " << info.last_update;
4158
4159 if (!pg_log.get_log().empty()) {
4160 // sloppy check
4161 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
4162 osd->clog->error() << info.pgid
4163 << " log bound mismatch, info (tail,head] ("
4164 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
4165 << " actual ["
4166 << pg_log.get_log().log.begin()->version << ","
4167 << pg_log.get_log().log.rbegin()->version << "]";
4168 }
4169
4170 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
4171 osd->clog->error() << info.pgid
4172 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
4173 << " > log size " << pg_log.get_log().log.size();
4174 }
4175 }
4176
4177 void PG::update_snap_map(
4178 const vector<pg_log_entry_t> &log_entries,
4179 ObjectStore::Transaction &t)
4180 {
4181 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
4182 i != log_entries.end();
4183 ++i) {
4184 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4185 if (i->soid.snap < CEPH_MAXSNAP) {
4186 if (i->is_delete()) {
4187 int r = snap_mapper.remove_oid(
4188 i->soid,
4189 &_t);
4190 if (r != 0)
4191 derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
4192 // On removal tolerate missing key corruption
4193 ceph_assert(r == 0 || r == -ENOENT);
4194 } else if (i->is_update()) {
4195 ceph_assert(i->snaps.length() > 0);
4196 vector<snapid_t> snaps;
4197 bufferlist snapbl = i->snaps;
4198 auto p = snapbl.cbegin();
4199 try {
4200 decode(snaps, p);
4201 } catch (...) {
4202 derr << __func__ << " decode snaps failure on " << *i << dendl;
4203 snaps.clear();
4204 }
4205 set<snapid_t> _snaps(snaps.begin(), snaps.end());
4206
4207 if (i->is_clone() || i->is_promote()) {
4208 snap_mapper.add_oid(
4209 i->soid,
4210 _snaps,
4211 &_t);
4212 } else if (i->is_modify()) {
4213 int r = snap_mapper.update_snaps(
4214 i->soid,
4215 _snaps,
4216 0,
4217 &_t);
4218 ceph_assert(r == 0);
4219 } else {
4220 ceph_assert(i->is_clean());
4221 }
4222 }
4223 }
4224 }
4225 }
4226
4227 /**
4228 * filter trimming|trimmed snaps out of snapcontext
4229 */
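// e.g. snaps [8,6,4] with snap 6 in snap_trimq or purged_snaps is
// filtered down to [8,4]; the order of the surviving snaps is preserved.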
4230 void PG::filter_snapc(vector<snapid_t> &snaps)
4231 {
4232 // nothing needs to trim, we can return immediately
4233 if (snap_trimq.empty() && info.purged_snaps.empty())
4234 return;
4235
4236 bool filtering = false;
4237 vector<snapid_t> newsnaps;
4238 for (vector<snapid_t>::iterator p = snaps.begin();
4239 p != snaps.end();
4240 ++p) {
4241 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
4242 if (!filtering) {
4243 // start building a new vector with what we've seen so far
4244 dout(10) << "filter_snapc filtering " << snaps << dendl;
4245 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
4246 filtering = true;
4247 }
4248 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
4249 } else {
4250 if (filtering)
4251 newsnaps.push_back(*p); // continue building new vector
4252 }
4253 }
4254 if (filtering) {
4255 snaps.swap(newsnaps);
4256 dout(10) << "filter_snapc result " << snaps << dendl;
4257 }
4258 }
4259
4260 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
4261 {
4262 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
4263 it != m.end();
4264 ++it)
4265 requeue_ops(it->second);
4266 m.clear();
4267 }
4268
4269 void PG::requeue_op(OpRequestRef op)
4270 {
4271 auto p = waiting_for_map.find(op->get_source());
4272 if (p != waiting_for_map.end()) {
4273 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
4274 << dendl;
4275 p->second.push_front(op);
4276 } else {
4277 dout(20) << __func__ << " " << op << dendl;
4278 osd->enqueue_front(
4279 OpQueueItem(
4280 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
4281 op->get_req()->get_cost(),
4282 op->get_req()->get_priority(),
4283 op->get_req()->get_recv_stamp(),
4284 op->get_req()->get_source().num(),
4285 get_osdmap_epoch()));
4286 }
4287 }
4288
4289 void PG::requeue_ops(list<OpRequestRef> &ls)
4290 {
4291 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
4292 i != ls.rend();
4293 ++i) {
4294 requeue_op(*i);
4295 }
4296 ls.clear();
4297 }
4298
4299 void PG::requeue_map_waiters()
4300 {
4301 epoch_t epoch = get_osdmap_epoch();
4302 auto p = waiting_for_map.begin();
4303 while (p != waiting_for_map.end()) {
4304 if (epoch < p->second.front()->min_epoch) {
4305 dout(20) << __func__ << " " << p->first << " front op "
4306 << p->second.front() << " must still wait, doing nothing"
4307 << dendl;
4308 ++p;
4309 } else {
4310 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
4311 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
4312 auto req = *q;
4313 osd->enqueue_front(OpQueueItem(
4314 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, req)),
4315 req->get_req()->get_cost(),
4316 req->get_req()->get_priority(),
4317 req->get_req()->get_recv_stamp(),
4318 req->get_req()->get_source().num(),
4319 epoch));
4320 }
4321 p = waiting_for_map.erase(p);
4322 }
4323 }
4324 }
4325
4326
4327 // ==========================================================================================
4328 // SCRUB
4329
4330 /*
4331 * when holding pg and sched_scrub_lock, then the states are:
4332 * scheduling:
4333 * scrubber.reserved = true
4334 * scrubber.reserved_peers includes whoami
4335 * osd->scrub_pending++
4336 * scheduling, replica declined:
4337 * scrubber.reserved = true
4338 * scrubber.reserved_peers includes -1
4339 * osd->scrub_pending++
4340 * pending:
4341 * scrubber.reserved = true
4342 * scrubber.reserved_peers.size() == acting.size();
4343 * pg on scrub_wq
4344 * osd->scrub_pending++
4345 * scrubbing:
4346 * scrubber.reserved = false;
4347 * scrubber.reserved_peers empty
4348 * osd->scrubber.active++
4349 */
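// sched_scrub() below drives these transitions; the replica grant/reject
// replies are handled by handle_scrub_reserve_grant() and
// handle_scrub_reserve_reject() further down.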
4350
4351 // returns true if a scrub has been newly kicked off
4352 bool PG::sched_scrub()
4353 {
4354 bool nodeep_scrub = false;
4355 ceph_assert(is_locked());
4356 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
4357 return false;
4358 }
4359
4360 double deep_scrub_interval = 0;
4361 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
4362 if (deep_scrub_interval <= 0) {
4363 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
4364 }
4365 bool time_for_deep = ceph_clock_now() >=
4366 info.history.last_deep_scrub_stamp + deep_scrub_interval;
4367
4368 bool deep_coin_flip = false;
4369 // Only add random deep scrubs when the scrub is NOT user initiated
4370 if (!scrubber.must_scrub)
4371 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
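// e.g. a randomize ratio of 0.15 promotes roughly 15% of scheduled
// scrubs to deep scrubs even before the deep-scrub interval has expired.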
4372 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
4373
4374 time_for_deep = (time_for_deep || deep_coin_flip);
4375
4376 // NODEEP_SCRUB set, so ignore time-initiated deep scrubs
4377 if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
4378 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
4379 time_for_deep = false;
4380 nodeep_scrub = true;
4381 }
4382
4383 if (!scrubber.must_scrub) {
4384 ceph_assert(!scrubber.must_deep_scrub);
4385
4386 // NOSCRUB set, so skip regular scrubs
4387 if ((get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
4388 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
4389 if (scrubber.reserved) {
4390 // cancel scrub if it is still being scheduled,
4391 // so pgs from other pools where scrubbing is still allowed
4392 // have a chance to go ahead with scrubbing.
4393 clear_scrub_reserved();
4394 scrub_unreserve_replicas();
4395 }
4396 return false;
4397 }
4398 }
4399
4400 // Clear these in case user issues the scrub/repair command during
4401 // the scheduling of the scrub/repair (e.g. request reservation)
4402 scrubber.deep_scrub_on_error = false;
4403 scrubber.auto_repair = false;
4404 if (cct->_conf->osd_scrub_auto_repair
4405 && get_pgbackend()->auto_repair_supported()
4406 // respect the command from user, and not do auto-repair
4407 && !scrubber.must_repair
4408 && !scrubber.must_scrub
4409 && !scrubber.must_deep_scrub) {
4410 if (time_for_deep) {
4411 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
4412 scrubber.auto_repair = true;
4413 } else {
4414 dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
4415 scrubber.deep_scrub_on_error = true;
4416 }
4417 }
4418
4419 bool ret = true;
4420 if (!scrubber.reserved) {
4421 ceph_assert(scrubber.reserved_peers.empty());
4422 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4423 osd->inc_scrubs_pending()) {
4424 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
4425 scrubber.reserved = true;
4426 scrubber.reserved_peers.insert(pg_whoami);
4427 scrub_reserve_replicas();
4428 } else {
4429 dout(20) << __func__ << ": failed to reserve locally" << dendl;
4430 ret = false;
4431 }
4432 }
4433 if (scrubber.reserved) {
4434 if (scrubber.reserve_failed) {
4435 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
4436 clear_scrub_reserved();
4437 scrub_unreserve_replicas();
4438 ret = false;
4439 } else if (scrubber.reserved_peers.size() == acting.size()) {
4440 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
4441 if (time_for_deep) {
4442 dout(10) << "sched_scrub: scrub will be deep" << dendl;
4443 state_set(PG_STATE_DEEP_SCRUB);
4444 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
4445 if (!nodeep_scrub) {
4446 osd->clog->info() << "osd." << osd->whoami
4447 << " pg " << info.pgid
4448 << " Deep scrub errors, upgrading scrub to deep-scrub";
4449 state_set(PG_STATE_DEEP_SCRUB);
4450 } else if (!scrubber.must_scrub) {
4451 osd->clog->error() << "osd." << osd->whoami
4452 << " pg " << info.pgid
4453 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
4454 clear_scrub_reserved();
4455 scrub_unreserve_replicas();
4456 return false;
4457 } else {
4458 osd->clog->error() << "osd." << osd->whoami
4459 << " pg " << info.pgid
4460 << " Regular scrub request, deep-scrub details will be lost";
4461 }
4462 }
4463 queue_scrub();
4464 } else {
4465 // none declined, since scrubber.reserved is set
4466 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
4467 }
4468 }
4469
4470 return ret;
4471 }
4472
4473 void PG::reg_next_scrub()
4474 {
4475 if (!is_primary())
4476 return;
4477
4478 utime_t reg_stamp;
4479 bool must = false;
4480 if (scrubber.must_scrub) {
4481 // Set the smallest time that isn't utime_t()
4482 reg_stamp = utime_t(0,1);
4483 must = true;
4484 } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
4485 reg_stamp = ceph_clock_now();
4486 must = true;
4487 } else {
4488 reg_stamp = info.history.last_scrub_stamp;
4489 }
4490 // note down the sched_time, so we can locate this scrub, and remove it
4491 // later on.
4492 double scrub_min_interval = 0, scrub_max_interval = 0;
4493 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
4494 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
4495 ceph_assert(scrubber.scrub_reg_stamp == utime_t());
4496 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
4497 reg_stamp,
4498 scrub_min_interval,
4499 scrub_max_interval,
4500 must);
4501 dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
4502 << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
4503 }
4504
4505 void PG::unreg_next_scrub()
4506 {
4507 if (is_primary()) {
4508 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
4509 scrubber.scrub_reg_stamp = utime_t();
4510 }
4511 }
4512
4513 void PG::do_replica_scrub_map(OpRequestRef op)
4514 {
4515 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
4516 dout(7) << __func__ << " " << *m << dendl;
4517 if (m->map_epoch < info.history.same_interval_since) {
4518 dout(10) << __func__ << " discarding old from "
4519 << m->map_epoch << " < " << info.history.same_interval_since
4520 << dendl;
4521 return;
4522 }
4523 if (!scrubber.is_chunky_scrub_active()) {
4524 dout(10) << __func__ << " scrub isn't active" << dendl;
4525 return;
4526 }
4527
4528 op->mark_started();
4529
4530 auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
4531 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4532 dout(10) << "map version is "
4533 << scrubber.received_maps[m->from].valid_through
4534 << dendl;
4535
4536 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
4537 << dendl;
4538 ceph_assert(scrubber.waiting_on_whom.count(m->from));
4539 scrubber.waiting_on_whom.erase(m->from);
4540 if (m->preempted) {
4541 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
4542 scrub_preempted = true;
4543 }
4544 if (scrubber.waiting_on_whom.empty()) {
4545 requeue_scrub(ops_blocked_by_scrub());
4546 }
4547 }
4548
4549 // send scrub v3 messages (chunky scrub)
4550 void PG::_request_scrub_map(
4551 pg_shard_t replica, eversion_t version,
4552 hobject_t start, hobject_t end,
4553 bool deep,
4554 bool allow_preemption)
4555 {
4556 ceph_assert(replica != pg_whoami);
4557 dout(10) << "scrub requesting scrubmap from osd." << replica
4558 << " deep " << (int)deep << dendl;
4559 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4560 spg_t(info.pgid.pgid, replica.shard), version,
4561 get_osdmap_epoch(),
4562 get_last_peering_reset(),
4563 start, end, deep,
4564 allow_preemption,
4565 scrubber.priority,
4566 ops_blocked_by_scrub());
4567 // default priority, we want the rep scrub processed prior to any recovery
4568 // or client io messages (we are holding a lock!)
4569 osd->send_message_osd_cluster(
4570 replica.osd, repscrubop, get_osdmap_epoch());
4571 }
4572
4573 void PG::handle_scrub_reserve_request(OpRequestRef op)
4574 {
4575 dout(7) << __func__ << " " << *op->get_req() << dendl;
4576 op->mark_started();
4577 if (scrubber.reserved) {
4578 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4579 << dendl;
4580 return;
4581 }
4582 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4583 osd->inc_scrubs_pending()) {
4584 scrubber.reserved = true;
4585 } else {
4586 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4587 scrubber.reserved = false;
4588 }
4589 const MOSDScrubReserve *m =
4590 static_cast<const MOSDScrubReserve*>(op->get_req());
4591 Message *reply = new MOSDScrubReserve(
4592 spg_t(info.pgid.pgid, primary.shard),
4593 m->map_epoch,
4594 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4595 pg_whoami);
4596 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4597 }
4598
4599 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4600 {
4601 dout(7) << __func__ << " " << *op->get_req() << dendl;
4602 op->mark_started();
4603 if (!scrubber.reserved) {
4604 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4605 return;
4606 }
4607 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4608 dout(10) << " already had osd." << from << " reserved" << dendl;
4609 } else {
4610 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4611 scrubber.reserved_peers.insert(from);
4612 sched_scrub();
4613 }
4614 }
4615
4616 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4617 {
4618 dout(7) << __func__ << " " << *op->get_req() << dendl;
4619 op->mark_started();
4620 if (!scrubber.reserved) {
4621 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4622 return;
4623 }
4624 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4625 dout(10) << " already had osd." << from << " reserved" << dendl;
4626 } else {
4627 /* One decline stops this pg from being scheduled for scrubbing. */
4628 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4629 scrubber.reserve_failed = true;
4630 sched_scrub();
4631 }
4632 }
4633
4634 void PG::handle_scrub_reserve_release(OpRequestRef op)
4635 {
4636 dout(7) << __func__ << " " << *op->get_req() << dendl;
4637 op->mark_started();
4638 clear_scrub_reserved();
4639 }
4640
4641 // Zeroing primary_num_bytes needs only the atomic store.
4642 // Setting it above zero, however, reserves space for backfill and requires
4643 // OSDService::stat_lock, which protects all OSD usage accounting.
4644 void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
4645 ceph_assert(osd->stat_lock.is_locked_by_me());
4646 primary_num_bytes.store(primary);
4647 local_num_bytes.store(local);
4648 return;
4649 }
4650
4651 void PG::clear_reserved_num_bytes() {
4652 primary_num_bytes.store(0);
4653 local_num_bytes.store(0);
4654 return;
4655 }
4656
4657 void PG::reject_reservation()
4658 {
4659 clear_reserved_num_bytes();
4660 osd->send_message_osd_cluster(
4661 primary.osd,
4662 new MBackfillReserve(
4663 MBackfillReserve::REJECT,
4664 spg_t(info.pgid.pgid, primary.shard),
4665 get_osdmap_epoch()),
4666 get_osdmap_epoch());
4667 }
4668
4669 void PG::schedule_backfill_retry(float delay)
4670 {
4671 std::lock_guard lock(osd->recovery_request_lock);
4672 osd->recovery_request_timer.add_event_after(
4673 delay,
4674 new QueuePeeringEvt<RequestBackfill>(
4675 this, get_osdmap_epoch(),
4676 RequestBackfill()));
4677 }
4678
4679 void PG::schedule_recovery_retry(float delay)
4680 {
4681 std::lock_guard lock(osd->recovery_request_lock);
4682 osd->recovery_request_timer.add_event_after(
4683 delay,
4684 new QueuePeeringEvt<DoRecovery>(
4685 this, get_osdmap_epoch(),
4686 DoRecovery()));
4687 }
4688
4689 void PG::clear_scrub_reserved()
4690 {
4691 scrubber.reserved_peers.clear();
4692 scrubber.reserve_failed = false;
4693
4694 if (scrubber.reserved) {
4695 scrubber.reserved = false;
4696 osd->dec_scrubs_pending();
4697 }
4698 }
4699
4700 void PG::scrub_reserve_replicas()
4701 {
4702 ceph_assert(backfill_targets.empty());
4703 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
4704 i != acting_recovery_backfill.end();
4705 ++i) {
4706 if (*i == pg_whoami) continue;
4707 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4708 osd->send_message_osd_cluster(
4709 i->osd,
4710 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4711 get_osdmap_epoch(),
4712 MOSDScrubReserve::REQUEST, pg_whoami),
4713 get_osdmap_epoch());
4714 }
4715 }
4716
4717 void PG::scrub_unreserve_replicas()
4718 {
4719 ceph_assert(backfill_targets.empty());
4720 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
4721 i != acting_recovery_backfill.end();
4722 ++i) {
4723 if (*i == pg_whoami) continue;
4724 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4725 osd->send_message_osd_cluster(
4726 i->osd,
4727 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4728 get_osdmap_epoch(),
4729 MOSDScrubReserve::RELEASE, pg_whoami),
4730 get_osdmap_epoch());
4731 }
4732 }
4733
4734 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
4735 {
4736 ObjectStore::Transaction t;
4737 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4738 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4739 i != rollback_obs.end();
4740 ++i) {
4741 if (i->generation < trimmed_to.version) {
4742 dout(10) << __func__ << " osd." << osd->whoami
4743 << " pg " << info.pgid
4744 << " found obsolete rollback obj "
4745 << *i << " generation < trimmed_to "
4746 << trimmed_to
4747 << "...repaired" << dendl;
4748 t.remove(coll, *i);
4749 }
4750 }
4751 if (!t.empty()) {
4752 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4753 << dendl;
4754 osd->store->queue_transaction(ch, std::move(t), NULL);
4755 }
4756 }
4757
4758 void PG::_scan_snaps(ScrubMap &smap)
4759 {
4760 hobject_t head;
4761 SnapSet snapset;
4762
4763 // The test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4764 // that the caller used clean_meta_map() and that it works properly.
4765 dout(20) << __func__ << " start" << dendl;
4766
4767 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4768 i != smap.objects.rend();
4769 ++i) {
4770 const hobject_t &hoid = i->first;
4771 ScrubMap::object &o = i->second;
4772
4773 dout(20) << __func__ << " " << hoid << dendl;
4774
4775 ceph_assert(!hoid.is_snapdir());
4776 if (hoid.is_head()) {
4777 // parse the SnapSet
4778 bufferlist bl;
4779 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4780 continue;
4781 }
4782 bl.push_back(o.attrs[SS_ATTR]);
4783 auto p = bl.cbegin();
4784 try {
4785 decode(snapset, p);
4786 } catch(...) {
4787 continue;
4788 }
4789 head = hoid.get_head();
4790 continue;
4791 }
4792 if (hoid.snap < CEPH_MAXSNAP) {
4793 // check and if necessary fix snap_mapper
4794 if (hoid.get_head() != head) {
4795 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4796 << dendl;
4797 continue;
4798 }
4799 set<snapid_t> obj_snaps;
4800 auto p = snapset.clone_snaps.find(hoid.snap);
4801 if (p == snapset.clone_snaps.end()) {
4802 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4803 << dendl;
4804 continue;
4805 }
4806 obj_snaps.insert(p->second.begin(), p->second.end());
4807 set<snapid_t> cur_snaps;
4808 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4809 if (r != 0 && r != -ENOENT) {
4810 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4811 ceph_abort();
4812 }
4813 if (r == -ENOENT || cur_snaps != obj_snaps) {
4814 ObjectStore::Transaction t;
4815 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4816 if (r == 0) {
4817 r = snap_mapper.remove_oid(hoid, &_t);
4818 if (r != 0) {
4819 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4820 << dendl;
4821 ceph_abort();
4822 }
4823 osd->clog->error() << "osd." << osd->whoami
4824 << " found snap mapper error on pg "
4825 << info.pgid
4826 << " oid " << hoid << " snaps in mapper: "
4827 << cur_snaps << ", oi: "
4828 << obj_snaps
4829 << "...repaired";
4830 } else {
4831 osd->clog->error() << "osd." << osd->whoami
4832 << " found snap mapper error on pg "
4833 << info.pgid
4834 << " oid " << hoid << " snaps missing in mapper"
4835 << ", should be: "
4836 << obj_snaps
4837 << " was " << cur_snaps << " r " << r
4838 << "...repaired";
4839 }
4840 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4841
4842 // wait for repair to apply to avoid confusing other bits of the system.
4843 {
4844 Cond my_cond;
4845 Mutex my_lock("PG::_scan_snaps my_lock");
4846 int r = 0;
4847 bool done;
4848 t.register_on_applied_sync(
4849 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4850 r = osd->store->queue_transaction(ch, std::move(t));
4851 if (r != 0) {
4852 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4853 << dendl;
4854 } else {
4855 my_lock.Lock();
4856 while (!done)
4857 my_cond.Wait(my_lock);
4858 my_lock.Unlock();
4859 }
4860 }
4861 }
4862 }
4863 }
4864 }
4865
4866 void PG::_repair_oinfo_oid(ScrubMap &smap)
4867 {
4868 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4869 i != smap.objects.rend();
4870 ++i) {
4871 const hobject_t &hoid = i->first;
4872 ScrubMap::object &o = i->second;
4873
4874 bufferlist bl;
4875 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4876 continue;
4877 }
4878 bl.push_back(o.attrs[OI_ATTR]);
4879 object_info_t oi;
4880 try {
4881 oi.decode(bl);
4882 } catch(...) {
4883 continue;
4884 }
4885 if (oi.soid != hoid) {
4886 ObjectStore::Transaction t;
4887 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4888 osd->clog->error() << "osd." << osd->whoami
4889 << " found object info error on pg "
4890 << info.pgid
4891 << " oid " << hoid << " oid in object info: "
4892 << oi.soid
4893 << "...repaired";
4894 // Fix object info
4895 oi.soid = hoid;
4896 bl.clear();
4897 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4898
4899 bufferptr bp(bl.c_str(), bl.length());
4900 o.attrs[OI_ATTR] = bp;
4901
4902 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4903 int r = osd->store->queue_transaction(ch, std::move(t));
4904 if (r != 0) {
4905 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4906 << dendl;
4907 }
4908 }
4909 }
4910 }
4911 int PG::build_scrub_map_chunk(
4912 ScrubMap &map,
4913 ScrubMapBuilder &pos,
4914 hobject_t start,
4915 hobject_t end,
4916 bool deep,
4917 ThreadPool::TPHandle &handle)
4918 {
4919 dout(10) << __func__ << " [" << start << "," << end << ") "
4920 << " pos " << pos
4921 << dendl;
4922
4923 // start
4924 while (pos.empty()) {
4925 pos.deep = deep;
4926 map.valid_through = info.last_update;
4927
4928 // objects
4929 vector<ghobject_t> rollback_obs;
4930 pos.ret = get_pgbackend()->objects_list_range(
4931 start,
4932 end,
4933 &pos.ls,
4934 &rollback_obs);
4935 if (pos.ret < 0) {
4936 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4937 return pos.ret;
4938 }
4939 if (pos.ls.empty()) {
4940 break;
4941 }
4942 _scan_rollback_obs(rollback_obs);
4943 pos.pos = 0;
4944 return -EINPROGRESS;
4945 }
4946
4947 // scan objects
4948 while (!pos.done()) {
4949 int r = get_pgbackend()->be_scan_list(map, pos);
4950 if (r == -EINPROGRESS) {
4951 return r;
4952 }
4953 }
4954
4955 // finish
4956 dout(20) << __func__ << " finishing" << dendl;
4957 ceph_assert(pos.done());
4958 _repair_oinfo_oid(map);
4959 if (!is_primary()) {
4960 ScrubMap for_meta_scrub;
4961 // In case we restarted with a smaller chunk, clear old data
4962 scrubber.cleaned_meta_map.clear_from(scrubber.start);
4963 scrubber.cleaned_meta_map.insert(map);
4964 scrubber.clean_meta_map(for_meta_scrub);
4965 _scan_snaps(for_meta_scrub);
4966 }
4967
4968 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
4969 << dendl;
4970 return 0;
4971 }
4972
4973 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4974 if (!store)
4975 return;
4976 struct OnComplete : Context {
4977 std::unique_ptr<Scrub::Store> store;
4978 explicit OnComplete(
4979 std::unique_ptr<Scrub::Store> &&store)
4980 : store(std::move(store)) {}
4981 void finish(int) override {}
4982 };
4983 store->cleanup(t);
4984 t->register_on_complete(new OnComplete(std::move(store)));
4985 ceph_assert(!store);
4986 }
4987
4988 void PG::repair_object(
4989 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4990 pg_shard_t bad_peer)
4991 {
4992 list<pg_shard_t> op_shards;
4993 for (auto i : *ok_peers) {
4994 op_shards.push_back(i.second);
4995 }
4996 dout(10) << "repair_object " << soid << " bad_peer osd."
4997 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4998 ScrubMap::object &po = ok_peers->back().first;
4999 eversion_t v;
5000 bufferlist bv;
5001 bv.push_back(po.attrs[OI_ATTR]);
5002 object_info_t oi;
5003 try {
5004 auto bliter = bv.cbegin();
5005 decode(oi, bliter);
5006 } catch (...) {
5007 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
5008 ceph_abort();
5009 }
5010 if (bad_peer != primary) {
5011 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
5012 } else {
5013 // We should only be scrubbing if the PG is clean.
5014 ceph_assert(waiting_for_unreadable_object.empty());
5015
5016 pg_log.missing_add(soid, oi.version, eversion_t());
5017
5018 pg_log.set_last_requested(0);
5019 dout(10) << __func__ << ": primary = " << primary << dendl;
5020 }
5021
5022 if (is_ec_pg() || bad_peer == primary) {
5023 // we'd better collect all shards for an EC pg, and prepare good peers as the
5024 // source of the pull in the case of a replicated pg.
5025 missing_loc.add_missing(soid, oi.version, eversion_t());
5026 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
5027 for (i = ok_peers->begin();
5028 i != ok_peers->end();
5029 ++i)
5030 missing_loc.add_location(soid, i->second);
5031 }
5032 }
5033
5034 /* replica_scrub
5035 *
5036 * Wait for last_update_applied to match msg->scrub_to as above. Wait
5037 * for pushes to complete in case of recent recovery. Build a single
5038 * scrubmap of objects that are in the range [msg->start, msg->end).
5039 */
5040 void PG::replica_scrub(
5041 OpRequestRef op,
5042 ThreadPool::TPHandle &handle)
5043 {
5044 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
5045 ceph_assert(!scrubber.active_rep_scrub);
5046 dout(7) << "replica_scrub" << dendl;
5047
5048 if (msg->map_epoch < info.history.same_interval_since) {
5049 dout(10) << "replica_scrub discarding old replica_scrub from "
5050 << msg->map_epoch << " < " << info.history.same_interval_since
5051 << dendl;
5052 return;
5053 }
5054
5055 ceph_assert(msg->chunky);
5056 if (active_pushes > 0) {
5057 dout(10) << "waiting for active pushes to finish" << dendl;
5058 scrubber.active_rep_scrub = op;
5059 return;
5060 }
5061
5062 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
5063 scrubber.replica_scrub_start = msg->min_epoch;
5064 scrubber.start = msg->start;
5065 scrubber.end = msg->end;
5066 scrubber.max_end = msg->end;
5067 scrubber.deep = msg->deep;
5068 scrubber.epoch_start = info.history.same_interval_since;
5069 if (msg->priority) {
5070 scrubber.priority = msg->priority;
5071 } else {
5072 scrubber.priority = get_scrub_priority();
5073 }
5074
5075 scrub_can_preempt = msg->allow_preemption;
5076 scrub_preempted = false;
5077 scrubber.replica_scrubmap_pos.reset();
5078
5079 requeue_scrub(msg->high_priority);
5080 }
5081
5082 /* Scrub:
5083 * PG_STATE_SCRUBBING is set when the scrub is queued
5084 *
5085 * scrub will be chunky if all OSDs in PG support chunky scrub
5086 * scrub will fail if OSDs are too old.
5087 */
5088 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
5089 {
5090 if (cct->_conf->osd_scrub_sleep > 0 &&
5091 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
5092 scrubber.state == PG::Scrubber::INACTIVE) &&
5093 scrubber.needs_sleep) {
5094 ceph_assert(!scrubber.sleeping);
5095 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
5096
5097 // Do an async sleep so we don't block the op queue
5098 OSDService *osds = osd;
5099 spg_t pgid = get_pgid();
5100 int state = scrubber.state;
5101 auto scrub_requeue_callback =
5102 new FunctionContext([osds, pgid, state](int r) {
5103 PGRef pg = osds->osd->lookup_lock_pg(pgid);
5104 if (pg == nullptr) {
5105 lgeneric_dout(osds->osd->cct, 20)
5106 << "scrub_requeue_callback: Could not find "
5107 << "PG " << pgid << " can't complete scrub requeue after sleep"
5108 << dendl;
5109 return;
5110 }
5111 pg->scrubber.sleeping = false;
5112 pg->scrubber.needs_sleep = false;
5113 lgeneric_dout(pg->cct, 20)
5114 << "scrub_requeue_callback: slept for "
5115 << ceph_clock_now() - pg->scrubber.sleep_start
5116 << ", re-queuing scrub with state " << state << dendl;
5117 pg->scrub_queued = false;
5118 pg->requeue_scrub();
5119 pg->scrubber.sleep_start = utime_t();
5120 pg->unlock();
5121 });
5122 std::lock_guard l(osd->sleep_lock);
5123 osd->sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
5124 scrub_requeue_callback);
5125 scrubber.sleeping = true;
5126 scrubber.sleep_start = ceph_clock_now();
5127 return;
5128 }
5129 if (pg_has_reset_since(queued)) {
5130 return;
5131 }
5132 ceph_assert(scrub_queued);
5133 scrub_queued = false;
5134 scrubber.needs_sleep = true;
5135
5136 // for the replica
5137 if (!is_primary() &&
5138 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
5139 chunky_scrub(handle);
5140 return;
5141 }
5142
5143 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
5144 dout(10) << "scrub -- not primary or active or not clean" << dendl;
5145 state_clear(PG_STATE_SCRUBBING);
5146 state_clear(PG_STATE_REPAIR);
5147 state_clear(PG_STATE_DEEP_SCRUB);
5148 publish_stats_to_osd();
5149 return;
5150 }
5151
5152 if (!scrubber.active) {
5153 ceph_assert(backfill_targets.empty());
5154
5155 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
5156
5157 dout(10) << "starting a new chunky scrub" << dendl;
5158 }
5159
5160 chunky_scrub(handle);
5161 }
5162
5163 /*
5164 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
5165 * chunk.
5166 *
5167 * The object store is partitioned into chunks which end on hash boundaries. For
5168 * each chunk, the following logic is performed:
5169 *
5170 * (1) Block writes on the chunk
5171 * (2) Request maps from replicas
5172 * (3) Wait for pushes to be applied (after recovery)
5173 * (4) Wait for writes to flush on the chunk
5174 * (5) Wait for maps from replicas
5175 * (6) Compare / repair all scrub maps
5176 * (7) Wait for digest updates to apply
5177 *
5178 * This logic is encoded in the mostly linear state machine:
5179 *
5180 * +------------------+
5181 * _________v__________ |
5182 * | | |
5183 * | INACTIVE | |
5184 * |____________________| |
5185 * | |
5186 * | +----------+ |
5187 * _________v___v______ | |
5188 * | | | |
5189 * | NEW_CHUNK | | |
5190 * |____________________| | |
5191 * | | |
5192 * _________v__________ | |
5193 * | | | |
5194 * | WAIT_PUSHES | | |
5195 * |____________________| | |
5196 * | | |
5197 * _________v__________ | |
5198 * | | | |
5199 * | WAIT_LAST_UPDATE | | |
5200 * |____________________| | |
5201 * | | |
5202 * _________v__________ | |
5203 * | | | |
5204 * | BUILD_MAP | | |
5205 * |____________________| | |
5206 * | | |
5207 * _________v__________ | |
5208 * | | | |
5209 * | WAIT_REPLICAS | | |
5210 * |____________________| | |
5211 * | | |
5212 * _________v__________ | |
5213 * | | | |
5214 * | COMPARE_MAPS | | |
5215 * |____________________| | |
5216 * | | |
5217 * | | |
5218 * _________v__________ | |
5219 * | | | |
5220 * |WAIT_DIGEST_UPDATES | | |
5221 * |____________________| | |
5222 * | | | |
5223 * | +----------+ |
5224 * _________v__________ |
5225 * | | |
5226 * | FINISH | |
5227 * |____________________| |
5228 * | |
5229 * +------------------+
5230 *
5231  * The primary determines the last update affecting the chunk by walking the log. If
5232  * it sees a log entry pertaining to an object in the chunk, it tells the replicas
5233 * to wait until that update is applied before building a scrub map. Both the
5234 * primary and replicas will wait for any active pushes to be applied.
5235 *
5236 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
5237 *
5238 * scrubber.state encodes the current state of the scrub (refer to state diagram
5239 * for details).
5240 */
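/*
 * Illustrative sketch (not part of the implementation): chunky_scrub() below
 * is a run-to-completion loop over scrubber.state.  In simplified pseudocode:
 *
 *   while (!done) {
 *     switch (scrubber.state) {
 *     case INACTIVE:     claim the reservation (inc_scrubs_active), create the
 *                        Scrub::Store, state = NEW_CHUNK; break;
 *     case NEW_CHUNK:    pick [start, end) ending on an object boundary and
 *                        compute subset_last_update, state = WAIT_PUSHES; break;
 *     case WAIT_PUSHES:  if (active_pushes) done = true;
 *                        else state = WAIT_LAST_UPDATE; break;
 *     ... BUILD_MAP, WAIT_REPLICAS, COMPARE_MAPS, WAIT_DIGEST_UPDATES ...
 *     case FINISH:       scrub_finish(), state = INACTIVE, done = true; break;
 *     }
 *   }
 *
 * Every "done = true" exit relies on a later requeue (requeue_scrub(),
 * op_applied, or sub_op_scrub_map) to re-enter chunky_scrub() in the same state.
 */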
5241 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
5242 {
5243 // check for map changes
5244 if (scrubber.is_chunky_scrub_active()) {
5245 if (scrubber.epoch_start != info.history.same_interval_since) {
5246 dout(10) << "scrub pg changed, aborting" << dendl;
5247 scrub_clear_state();
5248 scrub_unreserve_replicas();
5249 return;
5250 }
5251 }
5252
5253 bool done = false;
5254 int ret;
5255
5256 while (!done) {
5257 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
5258 << " [" << scrubber.start << "," << scrubber.end << ")"
5259 << " max_end " << scrubber.max_end << dendl;
5260
5261 switch (scrubber.state) {
5262 case PG::Scrubber::INACTIVE:
5263 dout(10) << "scrub start" << dendl;
5264 ceph_assert(is_primary());
5265
5266 publish_stats_to_osd();
5267 scrubber.epoch_start = info.history.same_interval_since;
5268 scrubber.active = true;
5269
5270 osd->inc_scrubs_active(scrubber.reserved);
5271 if (scrubber.reserved) {
5272 scrubber.reserved = false;
5273 scrubber.reserved_peers.clear();
5274 }
5275
5276 {
5277 ObjectStore::Transaction t;
5278 scrubber.cleanup_store(&t);
5279 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
5280 info.pgid, coll));
5281 osd->store->queue_transaction(ch, std::move(t), nullptr);
5282 }
5283
5284 // Don't include temporary objects when scrubbing
5285 scrubber.start = info.pgid.pgid.get_hobj_start();
5286 scrubber.state = PG::Scrubber::NEW_CHUNK;
5287
5288 {
5289 bool repair = state_test(PG_STATE_REPAIR);
5290 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5291 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5292 stringstream oss;
5293 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
5294 osd->clog->debug(oss);
5295 }
5296
5297 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5298 "osd_scrub_max_preemptions");
5299 scrubber.preempt_divisor = 1;
5300 break;
5301
5302 case PG::Scrubber::NEW_CHUNK:
5303 scrubber.primary_scrubmap = ScrubMap();
5304 scrubber.received_maps.clear();
5305
5306 // begin (possible) preemption window
5307 if (scrub_preempted) {
5308 scrubber.preempt_left--;
5309 scrubber.preempt_divisor *= 2;
5310 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
5311 << " left" << dendl;
5312 scrub_preempted = false;
5313 }
5314 scrub_can_preempt = scrubber.preempt_left > 0;
5315
5316 {
5317 /* get the start and end of our scrub chunk
5318 *
5319 * Our scrub chunk has an important restriction we're going to need to
5320 * respect. We can't let head be start or end.
5321 * Using a half-open interval means that if end == head,
5322 * we'd scrub/lock head and the clone right next to head in different
5323 * chunks which would allow us to miss clones created between
5324 * scrubbing that chunk and scrubbing the chunk including head.
5325 * This isn't true for any of the other clones since clones can
5326 * only be created "just to the left of" head. There is one exception
5327 * to this: promotion of clones which always happens to the left of the
5328 * left-most clone, but promote_object checks the scrubber in that
5329 * case, so it should be ok. Also, it's ok to "miss" clones at the
5330 * left end of the range if we are a tier because they may legitimately
5331 * not exist (see _scrub).
5332 */
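	  /* A hypothetical example of the adjustment below (object names are
	   * made up): if the listing ends with obj2:4 (a clone) and
	   * candidate_end comes back as obj2:head, the loop moves candidate_end
	   * back to obj2:4 and drops obj2:4 from this chunk, so that the head
	   * and its most recent clone are scrubbed (and write-blocked) together
	   * in the next chunk rather than being split across two chunks.
	   */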
5333 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
5334 scrubber.preempt_divisor);
5335 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
5336 scrubber.preempt_divisor);
5337 hobject_t start = scrubber.start;
5338 hobject_t candidate_end;
5339 vector<hobject_t> objects;
5340 ret = get_pgbackend()->objects_list_partial(
5341 start,
5342 min,
5343 max,
5344 &objects,
5345 &candidate_end);
5346 ceph_assert(ret >= 0);
5347
5348 if (!objects.empty()) {
5349 hobject_t back = objects.back();
5350 while (candidate_end.is_head() &&
5351 candidate_end == back.get_head()) {
5352 candidate_end = back;
5353 objects.pop_back();
5354 if (objects.empty()) {
5355 ceph_assert(0 ==
5356 		     "Somehow we got more than 2 objects which "
5357 "have the same head but are not clones");
5358 }
5359 back = objects.back();
5360 }
5361 if (candidate_end.is_head()) {
5362 ceph_assert(candidate_end != back.get_head());
5363 candidate_end = candidate_end.get_object_boundary();
5364 }
5365 } else {
5366 ceph_assert(candidate_end.is_max());
5367 }
5368
5369 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
5370 // we'll be requeued by whatever made us unavailable for scrub
5371 dout(10) << __func__ << ": scrub blocked somewhere in range "
5372 << "[" << scrubber.start << ", " << candidate_end << ")"
5373 << dendl;
5374 done = true;
5375 break;
5376 }
5377 scrubber.end = candidate_end;
5378 if (scrubber.end > scrubber.max_end)
5379 scrubber.max_end = scrubber.end;
5380 }
5381
5382 // walk the log to find the latest update that affects our chunk
5383 scrubber.subset_last_update = eversion_t();
5384 for (auto p = projected_log.log.rbegin();
5385 p != projected_log.log.rend();
5386 ++p) {
5387 if (p->soid >= scrubber.start &&
5388 p->soid < scrubber.end) {
5389 scrubber.subset_last_update = p->version;
5390 break;
5391 }
5392 }
5393 if (scrubber.subset_last_update == eversion_t()) {
5394 for (list<pg_log_entry_t>::const_reverse_iterator p =
5395 pg_log.get_log().log.rbegin();
5396 p != pg_log.get_log().log.rend();
5397 ++p) {
5398 if (p->soid >= scrubber.start &&
5399 p->soid < scrubber.end) {
5400 scrubber.subset_last_update = p->version;
5401 break;
5402 }
5403 }
5404 }
5405
5406 scrubber.state = PG::Scrubber::WAIT_PUSHES;
5407 break;
5408
5409 case PG::Scrubber::WAIT_PUSHES:
5410 if (active_pushes == 0) {
5411 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
5412 } else {
5413 dout(15) << "wait for pushes to apply" << dendl;
5414 done = true;
5415 }
5416 break;
5417
5418 case PG::Scrubber::WAIT_LAST_UPDATE:
5419 if (last_update_applied < scrubber.subset_last_update) {
5420 // will be requeued by op_applied
5421 dout(15) << "wait for EC read/modify/writes to queue" << dendl;
5422 done = true;
5423 break;
5424 }
5425
5426 // ask replicas to scan
5427 scrubber.waiting_on_whom.insert(pg_whoami);
5428
5429 // request maps from replicas
5430 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5431 i != acting_recovery_backfill.end();
5432 ++i) {
5433 if (*i == pg_whoami) continue;
5434 _request_scrub_map(*i, scrubber.subset_last_update,
5435 scrubber.start, scrubber.end, scrubber.deep,
5436 scrubber.preempt_left > 0);
5437 scrubber.waiting_on_whom.insert(*i);
5438 }
5439 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
5440 << dendl;
5441
5442 scrubber.state = PG::Scrubber::BUILD_MAP;
5443 scrubber.primary_scrubmap_pos.reset();
5444 break;
5445
5446 case PG::Scrubber::BUILD_MAP:
5447 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5448
5449 // build my own scrub map
5450 if (scrub_preempted) {
5451 dout(10) << __func__ << " preempted" << dendl;
5452 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5453 break;
5454 }
5455 ret = build_scrub_map_chunk(
5456 scrubber.primary_scrubmap,
5457 scrubber.primary_scrubmap_pos,
5458 scrubber.start, scrubber.end,
5459 scrubber.deep,
5460 handle);
5461 if (ret == -EINPROGRESS) {
5462 requeue_scrub();
5463 done = true;
5464 break;
5465 }
5466 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5467 break;
5468
5469 case PG::Scrubber::BUILD_MAP_DONE:
5470 if (scrubber.primary_scrubmap_pos.ret < 0) {
5471 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5472 << ", aborting" << dendl;
5473 scrub_clear_state();
5474 scrub_unreserve_replicas();
5475 return;
5476 }
5477 dout(10) << __func__ << " waiting_on_whom was "
5478 << scrubber.waiting_on_whom << dendl;
5479 ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
5480 scrubber.waiting_on_whom.erase(pg_whoami);
5481
5482 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5483 break;
5484
5485 case PG::Scrubber::WAIT_REPLICAS:
5486 if (!scrubber.waiting_on_whom.empty()) {
5487 // will be requeued by sub_op_scrub_map
5488 dout(10) << "wait for replicas to build scrub map" << dendl;
5489 done = true;
5490 break;
5491 }
5492 // end (possible) preemption window
5493 scrub_can_preempt = false;
5494 if (scrub_preempted) {
5495 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5496 scrubber.state = PG::Scrubber::NEW_CHUNK;
5497 } else {
5498 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5499 }
5500 break;
5501
5502 case PG::Scrubber::COMPARE_MAPS:
5503 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5504 ceph_assert(scrubber.waiting_on_whom.empty());
5505
5506 scrub_compare_maps();
5507 scrubber.start = scrubber.end;
5508 scrubber.run_callbacks();
5509
5510 // requeue the writes from the chunk that just finished
5511 requeue_ops(waiting_for_scrub);
5512
5513 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5514
5515 // fall-thru
5516
5517 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5518 if (scrubber.num_digest_updates_pending) {
5519 dout(10) << __func__ << " waiting on "
5520 << scrubber.num_digest_updates_pending
5521 << " digest updates" << dendl;
5522 done = true;
5523 break;
5524 }
5525
5526 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5527 "osd_scrub_max_preemptions");
5528 scrubber.preempt_divisor = 1;
5529
5530 if (!(scrubber.end.is_max())) {
5531 scrubber.state = PG::Scrubber::NEW_CHUNK;
5532 requeue_scrub();
5533 done = true;
5534 } else {
5535 scrubber.state = PG::Scrubber::FINISH;
5536 }
5537
5538 break;
5539
5540 case PG::Scrubber::FINISH:
5541 scrub_finish();
5542 scrubber.state = PG::Scrubber::INACTIVE;
5543 done = true;
5544
5545 if (!snap_trimq.empty()) {
5546 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5547 snap_trimmer_scrub_complete();
5548 }
5549
5550 break;
5551
5552 case PG::Scrubber::BUILD_MAP_REPLICA:
5553 // build my own scrub map
5554 if (scrub_preempted) {
5555 dout(10) << __func__ << " preempted" << dendl;
5556 ret = 0;
5557 } else {
5558 ret = build_scrub_map_chunk(
5559 scrubber.replica_scrubmap,
5560 scrubber.replica_scrubmap_pos,
5561 scrubber.start, scrubber.end,
5562 scrubber.deep,
5563 handle);
5564 }
5565 if (ret == -EINPROGRESS) {
5566 requeue_scrub();
5567 done = true;
5568 break;
5569 }
5570 // reply
5571 {
5572 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5573 spg_t(info.pgid.pgid, get_primary().shard),
5574 scrubber.replica_scrub_start,
5575 pg_whoami);
5576 reply->preempted = scrub_preempted;
5577 ::encode(scrubber.replica_scrubmap, reply->get_data());
5578 osd->send_message_osd_cluster(
5579 get_primary().osd, reply,
5580 scrubber.replica_scrub_start);
5581 }
5582 scrub_preempted = false;
5583 scrub_can_preempt = false;
5584 scrubber.state = PG::Scrubber::INACTIVE;
5585 scrubber.replica_scrubmap = ScrubMap();
5586 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5587 scrubber.start = hobject_t();
5588 scrubber.end = hobject_t();
5589 scrubber.max_end = hobject_t();
5590 done = true;
5591 break;
5592
5593 default:
5594 ceph_abort();
5595 }
5596 }
5597 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5598 << " [" << scrubber.start << "," << scrubber.end << ")"
5599 << " max_end " << scrubber.max_end << dendl;
5600 }
5601
5602 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5603 {
5604 if (soid < scrubber.start || soid >= scrubber.end) {
5605 return false;
5606 }
5607 if (scrub_can_preempt) {
5608 if (!scrub_preempted) {
5609 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5610 scrub_preempted = true;
5611 } else {
5612 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5613 }
5614 return false;
5615 }
5616 return true;
5617 }
5618
5619 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5620 {
5621 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5622 return (start < scrubber.max_end &&
5623 end >= scrubber.start);
5624 }
5625
5626 void PG::scrub_clear_state(bool has_error)
5627 {
5628 ceph_assert(is_locked());
5629 state_clear(PG_STATE_SCRUBBING);
5630 if (!has_error)
5631 state_clear(PG_STATE_REPAIR);
5632 state_clear(PG_STATE_DEEP_SCRUB);
5633 publish_stats_to_osd();
5634
5635 // active -> nothing.
5636 if (scrubber.active)
5637 osd->dec_scrubs_active();
5638
5639 requeue_ops(waiting_for_scrub);
5640
5641 scrubber.reset();
5642
5643 // type-specific state clear
5644 _scrub_clear_state();
5645 }
5646
5647 void PG::scrub_compare_maps()
5648 {
5649 dout(10) << __func__ << " has maps, analyzing" << dendl;
5650
5651 // construct authoritative scrub map for type specific scrubbing
5652 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5653 map<hobject_t,
5654 pair<boost::optional<uint32_t>,
5655 boost::optional<uint32_t>>> missing_digest;
5656
5657 map<pg_shard_t, ScrubMap *> maps;
5658 maps[pg_whoami] = &scrubber.primary_scrubmap;
5659
5660 for (const auto& i : acting_recovery_backfill) {
5661 if (i == pg_whoami) continue;
5662 dout(2) << __func__ << " replica " << i << " has "
5663 << scrubber.received_maps[i].objects.size()
5664 << " items" << dendl;
5665 maps[i] = &scrubber.received_maps[i];
5666 }
5667
5668 set<hobject_t> master_set;
5669
5670 // Construct master set
5671   for (const auto& map : maps) {
5672     for (const auto& i : map.second->objects) {
5673 master_set.insert(i.first);
5674 }
5675 }
5676
5677 stringstream ss;
5678 get_pgbackend()->be_omap_checks(maps, master_set,
5679 scrubber.omap_stats, ss);
5680
5681 if (!ss.str().empty()) {
5682 osd->clog->warn(ss);
5683 }
5684
5685 if (acting.size() > 1) {
5686 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5687
5688 // Map from object with errors to good peer
5689 map<hobject_t, list<pg_shard_t>> authoritative;
5690
5691 dout(2) << __func__ << " osd." << acting[0] << " has "
5692 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5693
5694 ss.str("");
5695 ss.clear();
5696
5697 get_pgbackend()->be_compare_scrubmaps(
5698 maps,
5699 master_set,
5700 state_test(PG_STATE_REPAIR),
5701 scrubber.missing,
5702 scrubber.inconsistent,
5703 authoritative,
5704 missing_digest,
5705 scrubber.shallow_errors,
5706 scrubber.deep_errors,
5707 scrubber.store.get(),
5708 info.pgid, acting,
5709 ss);
5710 dout(2) << ss.str() << dendl;
5711
5712 if (!ss.str().empty()) {
5713 osd->clog->error(ss);
5714 }
5715
5716 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5717 i != authoritative.end();
5718 ++i) {
5719 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5720 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5721 j != i->second.end();
5722 ++j) {
5723 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5724 }
5725 scrubber.authoritative.insert(
5726 make_pair(
5727 i->first,
5728 good_peers));
5729 }
5730
5731 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5732 i != authoritative.end();
5733 ++i) {
5734 scrubber.cleaned_meta_map.objects.erase(i->first);
5735 scrubber.cleaned_meta_map.objects.insert(
5736 *(maps[i->second.back()]->objects.find(i->first))
5737 );
5738 }
5739 }
5740
5741 ScrubMap for_meta_scrub;
5742 scrubber.clean_meta_map(for_meta_scrub);
5743
5744 // ok, do the pg-type specific scrubbing
5745 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5746   // Called here on the primary; the cleaned map lets it use an authoritative copy when its own copy of an object isn't authoritative
5747 _scan_snaps(for_meta_scrub);
5748 if (!scrubber.store->empty()) {
5749 if (state_test(PG_STATE_REPAIR)) {
5750 dout(10) << __func__ << ": discarding scrub results" << dendl;
5751 scrubber.store->flush(nullptr);
5752 } else {
5753 dout(10) << __func__ << ": updating scrub object" << dendl;
5754 ObjectStore::Transaction t;
5755 scrubber.store->flush(&t);
5756 osd->store->queue_transaction(ch, std::move(t), nullptr);
5757 }
5758 }
5759 }
5760
5761 bool PG::scrub_process_inconsistent()
5762 {
5763 dout(10) << __func__ << ": checking authoritative" << dendl;
5764 bool repair = state_test(PG_STATE_REPAIR);
5765 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5766 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5767
5768   // scrubber.authoritative only stores objects which are missing or inconsistent.
5769 if (!scrubber.authoritative.empty()) {
5770 stringstream ss;
5771 ss << info.pgid << " " << mode << " "
5772 << scrubber.missing.size() << " missing, "
5773 << scrubber.inconsistent.size() << " inconsistent objects";
5774 dout(2) << ss.str() << dendl;
5775 osd->clog->error(ss);
5776 if (repair) {
5777 state_clear(PG_STATE_CLEAN);
5778 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5779 scrubber.authoritative.begin();
5780 i != scrubber.authoritative.end();
5781 ++i) {
5782 set<pg_shard_t>::iterator j;
5783
5784 auto missing_entry = scrubber.missing.find(i->first);
5785 if (missing_entry != scrubber.missing.end()) {
5786 for (j = missing_entry->second.begin();
5787 j != missing_entry->second.end();
5788 ++j) {
5789 repair_object(
5790 i->first,
5791 &(i->second),
5792 *j);
5793 ++scrubber.fixed;
5794 }
5795 }
5796 if (scrubber.inconsistent.count(i->first)) {
5797 for (j = scrubber.inconsistent[i->first].begin();
5798 j != scrubber.inconsistent[i->first].end();
5799 ++j) {
5800 repair_object(i->first,
5801 &(i->second),
5802 *j);
5803 ++scrubber.fixed;
5804 }
5805 }
5806 }
5807 }
5808 }
5809 return (!scrubber.authoritative.empty() && repair);
5810 }
5811
5812 bool PG::ops_blocked_by_scrub() const {
5813   return !waiting_for_scrub.empty();
5814 }
5815
5816 // the part that actually finalizes a scrub
5817 void PG::scrub_finish()
5818 {
5819 dout(20) << __func__ << dendl;
5820 bool repair = state_test(PG_STATE_REPAIR);
5821 bool do_deep_scrub = false;
5822   // if the repair request comes from auto-repair and there is a large number
5823   // of errors, we would like to cancel auto-repair
5824 if (repair && scrubber.auto_repair
5825 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5826 state_clear(PG_STATE_REPAIR);
5827 repair = false;
5828 }
5829 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5830 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5831
5832 // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
5833 if (scrubber.deep_scrub_on_error
5834 && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
5835 ceph_assert(!deep_scrub);
5836 scrubber.deep_scrub_on_error = false;
5837 do_deep_scrub = true;
5838 dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
5839 }
5840
5841 // type-specific finish (can tally more errors)
5842 _scrub_finish();
5843
5844 bool has_error = scrub_process_inconsistent();
5845
5846 {
5847 stringstream oss;
5848 oss << info.pgid.pgid << " " << mode << " ";
5849 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5850 if (total_errors)
5851 oss << total_errors << " errors";
5852 else
5853 oss << "ok";
5854 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5855 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5856 << " remaining deep scrub error details lost)";
5857 if (repair)
5858 oss << ", " << scrubber.fixed << " fixed";
5859 if (total_errors)
5860 osd->clog->error(oss);
5861 else
5862 osd->clog->debug(oss);
5863 }
5864
5865 // finish up
5866 unreg_next_scrub();
5867 utime_t now = ceph_clock_now();
5868 info.history.last_scrub = info.last_update;
5869 info.history.last_scrub_stamp = now;
5870 if (scrubber.deep) {
5871 info.history.last_deep_scrub = info.last_update;
5872 info.history.last_deep_scrub_stamp = now;
5873 }
5874 // Since we don't know which errors were fixed, we can only clear them
5875 // when every one has been fixed.
5876 if (repair) {
5877 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5878 ceph_assert(deep_scrub);
5879 scrubber.shallow_errors = scrubber.deep_errors = 0;
5880 dout(20) << __func__ << " All may be fixed" << dendl;
5881 } else if (has_error) {
5882 // Deep scrub in order to get corrected error counts
5883 scrub_after_recovery = true;
5884 dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
5885 } else if (scrubber.shallow_errors || scrubber.deep_errors) {
5886 // We have errors but nothing can be fixed, so there is no repair
5887 // possible.
5888 state_set(PG_STATE_FAILED_REPAIR);
5889 dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
5890 << " error(s) present with no repair possible" << dendl;
5891 }
5892 }
5893 if (deep_scrub) {
5894 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5895 info.history.last_clean_scrub_stamp = now;
5896 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5897 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5898 info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
5899 info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
5900 info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
5901 dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = "
5902 << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = "
5903 << info.stats.stats.sum.num_omap_keys << dendl;
5904 } else {
5905 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5906 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5907 // because of deep-scrub errors
5908 if (scrubber.shallow_errors == 0)
5909 info.history.last_clean_scrub_stamp = now;
5910 }
5911 info.stats.stats.sum.num_scrub_errors =
5912 info.stats.stats.sum.num_shallow_scrub_errors +
5913 info.stats.stats.sum.num_deep_scrub_errors;
5914 if (scrubber.check_repair) {
5915 scrubber.check_repair = false;
5916 if (info.stats.stats.sum.num_scrub_errors) {
5917 state_set(PG_STATE_FAILED_REPAIR);
5918 dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors
5919 << " error(s) still present after re-scrub" << dendl;
5920 }
5921 }
5922 publish_stats_to_osd();
5923 if (do_deep_scrub) {
5924 // XXX: Auto scrub won't activate if must_scrub is set, but
5925 // setting the scrub stamps affects what users see.
5926 utime_t stamp = utime_t(0,1);
5927 set_last_scrub_stamp(stamp);
5928 set_last_deep_scrub_stamp(stamp);
5929 }
5930 reg_next_scrub();
5931
5932 {
5933 ObjectStore::Transaction t;
5934 dirty_info = true;
5935 write_if_dirty(t);
5936 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
5937 ceph_assert(tr == 0);
5938 }
5939
5940
5941 if (has_error) {
5942 queue_peering_event(
5943 PGPeeringEventRef(
5944 std::make_shared<PGPeeringEvent>(
5945 get_osdmap_epoch(),
5946 get_osdmap_epoch(),
5947 DoRecovery())));
5948 }
5949
5950 scrub_clear_state(has_error);
5951 scrub_unreserve_replicas();
5952
5953 if (is_active() && is_primary()) {
5954 share_pg_info();
5955 }
5956 }
5957
5958 void PG::share_pg_info()
5959 {
5960 dout(10) << "share_pg_info" << dendl;
5961
5962 // share new pg_info_t with replicas
5963 ceph_assert(!acting_recovery_backfill.empty());
5964 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5965 i != acting_recovery_backfill.end();
5966 ++i) {
5967 if (*i == pg_whoami) continue;
5968 auto pg_shard = *i;
5969 auto peer = peer_info.find(pg_shard);
5970 if (peer != peer_info.end()) {
5971 peer->second.last_epoch_started = info.last_epoch_started;
5972 peer->second.last_interval_started = info.last_interval_started;
5973 peer->second.history.merge(info.history);
5974 }
5975 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap_epoch());
5976 m->pg_list.push_back(
5977 make_pair(
5978 pg_notify_t(
5979 pg_shard.shard, pg_whoami.shard,
5980 get_osdmap_epoch(),
5981 get_osdmap_epoch(),
5982 info),
5983 past_intervals));
5984 osd->send_message_osd_cluster(pg_shard.osd, m, get_osdmap_epoch());
5985 }
5986 }
5987
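// Append externally supplied log entries to pg_log, updating the local missing
// set via the rollbacker, advancing info.last_update (and last_complete when
// nothing is missing), and optionally rolling forward and trimming.  Returns
// true if the accumulated stats were invalidated by the new entries.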
5988 bool PG::append_log_entries_update_missing(
5989 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5990 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5991 boost::optional<eversion_t> roll_forward_to)
5992 {
5993 ceph_assert(!entries.empty());
5994 ceph_assert(entries.begin()->version > info.last_update);
5995
5996 PGLogEntryHandler rollbacker{this, &t};
5997 bool invalidate_stats =
5998 pg_log.append_new_log_entries(info.last_backfill,
5999 info.last_backfill_bitwise,
6000 entries,
6001 &rollbacker);
6002
6003 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
6004 pg_log.roll_forward(&rollbacker);
6005 }
6006 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
6007 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
6008 last_rollback_info_trimmed_to_applied = *roll_forward_to;
6009 }
6010
6011 info.last_update = pg_log.get_head();
6012
6013 if (pg_log.get_missing().num_missing() == 0) {
6014 // advance last_complete since nothing else is missing!
6015 info.last_complete = info.last_update;
6016 }
6017 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
6018
6019 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
6020 if (trim_to)
6021 pg_log.trim(*trim_to, info);
6022 dirty_info = true;
6023 write_if_dirty(t);
6024 return invalidate_stats;
6025 }
6026
6027
6028 void PG::merge_new_log_entries(
6029 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6030 ObjectStore::Transaction &t,
6031 boost::optional<eversion_t> trim_to,
6032 boost::optional<eversion_t> roll_forward_to)
6033 {
6034 dout(10) << __func__ << " " << entries << dendl;
6035 ceph_assert(is_primary());
6036
6037 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
6038 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
6039 i != acting_recovery_backfill.end();
6040 ++i) {
6041 pg_shard_t peer(*i);
6042 if (peer == pg_whoami) continue;
6043 ceph_assert(peer_missing.count(peer));
6044 ceph_assert(peer_info.count(peer));
6045 pg_missing_t& pmissing(peer_missing[peer]);
6046 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
6047 pg_info_t& pinfo(peer_info[peer]);
6048 bool invalidate_stats = PGLog::append_log_entries_update_missing(
6049 pinfo.last_backfill,
6050 info.last_backfill_bitwise,
6051 entries,
6052 true,
6053 NULL,
6054 pmissing,
6055 NULL,
6056 this);
6057 pinfo.last_update = info.last_update;
6058 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
6059 rebuild_missing = rebuild_missing || invalidate_stats;
6060 }
6061
6062 if (!rebuild_missing) {
6063 return;
6064 }
6065
6066 for (auto &&i: entries) {
6067 missing_loc.rebuild(
6068 i.soid,
6069 pg_whoami,
6070 acting_recovery_backfill,
6071 info,
6072 pg_log.get_missing(),
6073 peer_missing,
6074 peer_info);
6075 }
6076 }
6077
6078 void PG::update_history(const pg_history_t& new_history)
6079 {
6080 unreg_next_scrub();
6081 if (info.history.merge(new_history)) {
6082 dout(20) << __func__ << " advanced history from " << new_history << dendl;
6083 dirty_info = true;
6084 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
6085 dout(20) << __func__ << " clearing past_intervals" << dendl;
6086 past_intervals.clear();
6087 dirty_big_info = true;
6088 }
6089 }
6090 reg_next_scrub();
6091 }
6092
6093 void PG::fulfill_info(
6094 pg_shard_t from, const pg_query_t &query,
6095 pair<pg_shard_t, pg_info_t> &notify_info)
6096 {
6097 ceph_assert(from == primary);
6098 ceph_assert(query.type == pg_query_t::INFO);
6099
6100 // info
6101 dout(10) << "sending info" << dendl;
6102 notify_info = make_pair(from, info);
6103 }
6104
6105 void PG::fulfill_log(
6106 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
6107 {
6108 dout(10) << "log request from " << from << dendl;
6109 ceph_assert(from == primary);
6110 ceph_assert(query.type != pg_query_t::INFO);
6111 ConnectionRef con = osd->get_con_osd_cluster(
6112 from.osd, get_osdmap_epoch());
6113 if (!con) return;
6114
6115 MOSDPGLog *mlog = new MOSDPGLog(
6116 from.shard, pg_whoami.shard,
6117 get_osdmap_epoch(),
6118 info, query_epoch);
6119 mlog->missing = pg_log.get_missing();
6120
6121 // primary -> other, when building master log
6122 if (query.type == pg_query_t::LOG) {
6123 dout(10) << " sending info+missing+log since " << query.since
6124 << dendl;
6125 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
6126 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
6127 << " when my log.tail is " << pg_log.get_tail()
6128 << ", sending full log instead";
6129 mlog->log = pg_log.get_log(); // primary should not have requested this!!
6130 } else
6131 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
6132 }
6133 else if (query.type == pg_query_t::FULLLOG) {
6134 dout(10) << " sending info+missing+full log" << dendl;
6135 mlog->log = pg_log.get_log();
6136 }
6137
6138 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
6139
6140 osd->share_map_peer(from.osd, con.get(), get_osdmap());
6141 osd->send_message_osd_cluster(mlog, con.get());
6142 }
6143
6144 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
6145 {
6146 if (query.query.type == pg_query_t::INFO) {
6147 pair<pg_shard_t, pg_info_t> notify_info;
6148 update_history(query.query.history);
6149 fulfill_info(query.from, query.query, notify_info);
6150 rctx->send_notify(
6151 notify_info.first,
6152 pg_notify_t(
6153 notify_info.first.shard, pg_whoami.shard,
6154 query.query_epoch,
6155 get_osdmap_epoch(),
6156 notify_info.second),
6157 past_intervals);
6158 } else {
6159 update_history(query.query.history);
6160 fulfill_log(query.from, query.query, query.query_epoch);
6161 }
6162 }
6163
6164 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
6165 {
6166 bool changed = false;
6167 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
6168 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
6169 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
6170 changed = true;
6171 }
6172 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6173 if (!pi) {
6174 return; // pool deleted
6175 }
6176 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
6177 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
6178 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
6179 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
6180 changed = true;
6181 }
6182 }
6183 if (changed) {
6184 info.history.last_epoch_marked_full = osdmap->get_epoch();
6185 dirty_info = true;
6186 }
6187 }
6188
6189 bool PG::should_restart_peering(
6190 int newupprimary,
6191 int newactingprimary,
6192 const vector<int>& newup,
6193 const vector<int>& newacting,
6194 OSDMapRef lastmap,
6195 OSDMapRef osdmap)
6196 {
6197 if (PastIntervals::is_new_interval(
6198 primary.osd,
6199 newactingprimary,
6200 acting,
6201 newacting,
6202 up_primary.osd,
6203 newupprimary,
6204 up,
6205 newup,
6206 osdmap,
6207 lastmap,
6208 info.pgid.pgid)) {
6209 dout(20) << "new interval newup " << newup
6210 << " newacting " << newacting << dendl;
6211 return true;
6212 }
6213 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
6214 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
6215 return true;
6216 }
6217 return false;
6218 }
6219
6220 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
6221 {
6222 if (last_peering_reset > reply_epoch ||
6223 last_peering_reset > query_epoch) {
6224 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
6225 << " last_peering_reset " << last_peering_reset
6226 << dendl;
6227 return true;
6228 }
6229 return false;
6230 }
6231
6232 void PG::set_last_peering_reset()
6233 {
6234 dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
6235 if (last_peering_reset != get_osdmap_epoch()) {
6236 last_peering_reset = get_osdmap_epoch();
6237 reset_interval_flush();
6238 }
6239 }
6240
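// RAII flush barrier: start_flush() registers a FlushState on both the
// on_applied and on_commit callbacks of the transaction; when the last
// reference is dropped, the destructor calls on_flushed() unless the PG has
// been reset since the epoch captured at construction.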
6241 struct FlushState {
6242 PGRef pg;
6243 epoch_t epoch;
6244 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
6245 ~FlushState() {
6246 pg->lock();
6247 if (!pg->pg_has_reset_since(epoch))
6248 pg->on_flushed();
6249 pg->unlock();
6250 }
6251 };
6252 typedef std::shared_ptr<FlushState> FlushStateRef;
6253
6254 void PG::start_flush(ObjectStore::Transaction *t)
6255 {
6256 // flush in progress ops
6257 FlushStateRef flush_trigger (std::make_shared<FlushState>(
6258 this, get_osdmap_epoch()));
6259 flushes_in_progress++;
6260 t->register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
6261 t->register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
6262 }
6263
6264 void PG::reset_interval_flush()
6265 {
6266 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
6267 recovery_state.clear_blocked_outgoing();
6268
6269 Context *c = new QueuePeeringEvt<IntervalFlush>(
6270 this, get_osdmap_epoch(), IntervalFlush());
6271 if (!ch->flush_commit(c)) {
6272 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
6273 recovery_state.begin_block_outgoing();
6274 } else {
6275 dout(10) << "Not blocking outgoing recovery messages" << dendl;
6276 delete c;
6277 }
6278 }
6279
6280 /* Called before initializing peering during advance_map */
6281 void PG::start_peering_interval(
6282 const OSDMapRef lastmap,
6283 const vector<int>& newup, int new_up_primary,
6284 const vector<int>& newacting, int new_acting_primary,
6285 ObjectStore::Transaction *t)
6286 {
6287 const OSDMapRef osdmap = get_osdmap();
6288
6289 set_last_peering_reset();
6290
6291 vector<int> oldacting, oldup;
6292 int oldrole = get_role();
6293
6294 unreg_next_scrub();
6295
6296 if (is_primary()) {
6297 osd->clear_ready_to_merge(this);
6298 }
6299
6300 pg_shard_t old_acting_primary = get_primary();
6301 pg_shard_t old_up_primary = up_primary;
6302 bool was_old_primary = is_primary();
6303 bool was_old_replica = is_replica();
6304
6305 acting.swap(oldacting);
6306 up.swap(oldup);
6307 init_primary_up_acting(
6308 newup,
6309 newacting,
6310 new_up_primary,
6311 new_acting_primary);
6312
6313 if (info.stats.up != up ||
6314 info.stats.acting != acting ||
6315 info.stats.up_primary != new_up_primary ||
6316 info.stats.acting_primary != new_acting_primary) {
6317 info.stats.up = up;
6318 info.stats.up_primary = new_up_primary;
6319 info.stats.acting = acting;
6320 info.stats.acting_primary = new_acting_primary;
6321 info.stats.mapping_epoch = osdmap->get_epoch();
6322 }
6323
6324 pg_stats_publish_lock.Lock();
6325 pg_stats_publish_valid = false;
6326 pg_stats_publish_lock.Unlock();
6327
6328   // This will now be remapped during a backfill in cases
6329   // where it would not have been before.
6330 if (up != acting)
6331 state_set(PG_STATE_REMAPPED);
6332 else
6333 state_clear(PG_STATE_REMAPPED);
6334
6335 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
6336 if (pool.info.is_replicated() || role == pg_whoami.shard)
6337 set_role(role);
6338 else
6339 set_role(-1);
6340
6341 // did acting, up, primary|acker change?
6342 if (!lastmap) {
6343 dout(10) << " no lastmap" << dendl;
6344 dirty_info = true;
6345 dirty_big_info = true;
6346 info.history.same_interval_since = osdmap->get_epoch();
6347 } else {
6348 std::stringstream debug;
6349 ceph_assert(info.history.same_interval_since != 0);
6350 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
6351 get_is_recoverable_predicate());
6352 bool new_interval = PastIntervals::check_new_interval(
6353 old_acting_primary.osd,
6354 new_acting_primary,
6355 oldacting, newacting,
6356 old_up_primary.osd,
6357 new_up_primary,
6358 oldup, newup,
6359 info.history.same_interval_since,
6360 info.history.last_epoch_clean,
6361 osdmap,
6362 lastmap,
6363 info.pgid.pgid,
6364 recoverable.get(),
6365 &past_intervals,
6366 &debug);
6367 dout(10) << __func__ << ": check_new_interval output: "
6368 << debug.str() << dendl;
6369 if (new_interval) {
6370 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
6371 info.history.last_epoch_clean < osdmap->get_epoch()) {
6372 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
6373 	// our information is incomplete and useless; if osdmaps were trimmed,
6374 	// someone else was clean after everything we know about.
6375 past_intervals.clear();
6376 } else {
6377 dout(10) << " noting past " << past_intervals << dendl;
6378 }
6379 dirty_info = true;
6380 dirty_big_info = true;
6381 info.history.same_interval_since = osdmap->get_epoch();
6382 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
6383 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
6384 osdmap->get_pg_num(info.pgid.pgid.pool()),
6385 nullptr)) {
6386 info.history.last_epoch_split = osdmap->get_epoch();
6387 }
6388 }
6389 }
6390
6391 if (old_up_primary != up_primary ||
6392 oldup != up) {
6393 info.history.same_up_since = osdmap->get_epoch();
6394 }
6395 // this comparison includes primary rank via pg_shard_t
6396 if (old_acting_primary != get_primary()) {
6397 info.history.same_primary_since = osdmap->get_epoch();
6398 }
6399
6400 on_new_interval();
6401
6402 dout(1) << __func__ << " up " << oldup << " -> " << up
6403 << ", acting " << oldacting << " -> " << acting
6404 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
6405 << ", up_primary " << old_up_primary << " -> " << new_up_primary
6406 << ", role " << oldrole << " -> " << role
6407 << ", features acting " << acting_features
6408 << " upacting " << upacting_features
6409 << dendl;
6410
6411 // deactivate.
6412 state_clear(PG_STATE_ACTIVE);
6413 state_clear(PG_STATE_PEERED);
6414 state_clear(PG_STATE_PREMERGE);
6415 state_clear(PG_STATE_DOWN);
6416 state_clear(PG_STATE_RECOVERY_WAIT);
6417 state_clear(PG_STATE_RECOVERY_TOOFULL);
6418 state_clear(PG_STATE_RECOVERING);
6419
6420 peer_purged.clear();
6421 acting_recovery_backfill.clear();
6422 scrub_queued = false;
6423
6424 // reset primary/replica state?
6425 if (was_old_primary || is_primary()) {
6426 osd->remove_want_pg_temp(info.pgid.pgid);
6427 } else if (was_old_replica || is_replica()) {
6428 osd->remove_want_pg_temp(info.pgid.pgid);
6429 }
6430 clear_primary_state();
6431
6432
6433 // pg->on_*
6434 on_change(t);
6435
6436 projected_last_update = eversion_t();
6437
6438 ceph_assert(!deleting);
6439
6440 // should we tell the primary we are here?
6441 send_notify = !is_primary();
6442
6443 if (role != oldrole ||
6444 was_old_primary != is_primary()) {
6445 // did primary change?
6446 if (was_old_primary != is_primary()) {
6447 state_clear(PG_STATE_CLEAN);
6448 clear_publish_stats();
6449 }
6450
6451 on_role_change();
6452
6453 // take active waiters
6454 requeue_ops(waiting_for_peered);
6455
6456 } else {
6457 // no role change.
6458 // did primary change?
6459 if (get_primary() != old_acting_primary) {
6460 dout(10) << *this << " " << oldacting << " -> " << acting
6461 << ", acting primary "
6462 << old_acting_primary << " -> " << get_primary()
6463 << dendl;
6464 } else {
6465 // primary is the same.
6466 if (is_primary()) {
6467 // i am (still) primary. but my replica set changed.
6468 state_clear(PG_STATE_CLEAN);
6469
6470 dout(10) << oldacting << " -> " << acting
6471 << ", replicas changed" << dendl;
6472 }
6473 }
6474 }
6475 cancel_recovery();
6476
6477 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
6478 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
6479 osd->queue_want_pg_temp(info.pgid.pgid, acting);
6480 }
6481 }
6482
6483 void PG::on_new_interval()
6484 {
6485 const OSDMapRef osdmap = get_osdmap();
6486
6487 reg_next_scrub();
6488
6489 // initialize features
6490 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6491 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6492 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
6493 if (*p == CRUSH_ITEM_NONE)
6494 continue;
6495 uint64_t f = osdmap->get_xinfo(*p).features;
6496 acting_features &= f;
6497 upacting_features &= f;
6498 }
6499 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6500 if (*p == CRUSH_ITEM_NONE)
6501 continue;
6502 upacting_features &= osdmap->get_xinfo(*p).features;
6503 }
6504
6505 _on_new_interval();
6506 }
6507
6508 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6509 {
6510 ceph_assert(!is_primary());
6511
6512 update_history(oinfo.history);
6513 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6514 info.stats.stats.sum.num_scrub_errors = 0;
6515 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6516 info.stats.stats.sum.num_deep_scrub_errors = 0;
6517 dirty_info = true;
6518 }
6519
6520 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6521 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6522 << dendl;
6523 info.purged_snaps = oinfo.purged_snaps;
6524 dirty_info = true;
6525 dirty_big_info = true;
6526 }
6527 }
6528
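// Render a one-line summary of the PG for logging.  Abbreviations used below:
// r=role, lpr=last_peering_reset, pi=past_intervals, luod=last_update_ondisk,
// lua=last_update_applied, rops=recovery_ops_active, crt=can_rollback_to,
// lcod=last_complete_ondisk, mlcod=min_last_complete_ondisk, m=missing,
// u=unfound, mbc=missing_by_count, trimq=snap_trimq, ps=purged_snaps.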
6529 ostream& operator<<(ostream& out, const PG& pg)
6530 {
6531 out << "pg[" << pg.info
6532 << " " << pg.up;
6533 if (pg.acting != pg.up)
6534 out << "/" << pg.acting;
6535 if (pg.is_ec_pg())
6536 out << "p" << pg.get_primary();
6537 if (!pg.async_recovery_targets.empty())
6538 out << " async=[" << pg.async_recovery_targets << "]";
6539 if (!pg.backfill_targets.empty())
6540 out << " backfill=[" << pg.backfill_targets << "]";
6541 out << " r=" << pg.get_role();
6542 out << " lpr=" << pg.get_last_peering_reset();
6543
6544 if (pg.deleting)
6545 out << " DELETING";
6546
6547 if (!pg.past_intervals.empty()) {
6548 out << " pi=[" << pg.past_intervals.get_bounds()
6549 << ")/" << pg.past_intervals.size();
6550 }
6551
6552 if (pg.is_peered()) {
6553 if (pg.last_update_ondisk != pg.info.last_update)
6554 out << " luod=" << pg.last_update_ondisk;
6555 if (pg.last_update_applied != pg.info.last_update)
6556 out << " lua=" << pg.last_update_applied;
6557 }
6558
6559 if (pg.recovery_ops_active)
6560 out << " rops=" << pg.recovery_ops_active;
6561
6562 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6563 pg.pg_log.get_head() != pg.info.last_update)
6564 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6565
6566 if (!pg.pg_log.get_log().empty()) {
6567 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6568 out << " (log bound mismatch, actual=["
6569 << pg.pg_log.get_log().log.begin()->version << ","
6570 << pg.pg_log.get_log().log.rbegin()->version << "]";
6571 out << ")";
6572 }
6573 }
6574
6575 out << " crt=" << pg.pg_log.get_can_rollback_to();
6576
6577 if (pg.last_complete_ondisk != pg.info.last_complete)
6578 out << " lcod " << pg.last_complete_ondisk;
6579
6580 if (pg.is_primary()) {
6581 out << " mlcod " << pg.min_last_complete_ondisk;
6582 }
6583
6584 out << " " << pg_state_string(pg.get_state());
6585 if (pg.should_send_notify())
6586 out << " NOTIFY";
6587
6588 if (pg.scrubber.must_repair)
6589 out << " MUST_REPAIR";
6590 if (pg.scrubber.auto_repair)
6591 out << " AUTO_REPAIR";
6592 if (pg.scrubber.check_repair)
6593 out << " CHECK_REPAIR";
6594 if (pg.scrubber.deep_scrub_on_error)
6595 out << " DEEP_SCRUB_ON_ERROR";
6596 if (pg.scrubber.must_deep_scrub)
6597 out << " MUST_DEEP_SCRUB";
6598 if (pg.scrubber.must_scrub)
6599 out << " MUST_SCRUB";
6600
6601 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6602 if (pg.pg_log.get_missing().num_missing()) {
6603 out << " m=" << pg.pg_log.get_missing().num_missing();
6604 if (pg.is_primary()) {
6605 uint64_t unfound = pg.get_num_unfound();
6606 if (unfound)
6607 out << " u=" << unfound;
6608 }
6609 }
6610 if (!pg.is_clean()) {
6611 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6612 }
6613 if (!pg.snap_trimq.empty()) {
6614 out << " trimq=";
6615 // only show a count if the set is large
6616 if (pg.snap_trimq.num_intervals() > 16) {
6617 out << pg.snap_trimq.size();
6618 } else {
6619 out << pg.snap_trimq;
6620 }
6621 }
6622 if (!pg.info.purged_snaps.empty()) {
6623 out << " ps="; // snap trim queue / purged snaps
6624 if (pg.info.purged_snaps.num_intervals() > 16) {
6625 out << pg.info.purged_snaps.size();
6626 } else {
6627 out << pg.info.purged_snaps;
6628 }
6629 }
6630
6631 out << "]";
6632
6633
6634 return out;
6635 }
6636
6637 bool PG::can_discard_op(OpRequestRef& op)
6638 {
6639 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6640 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6641 dout(20) << " discard " << *m << dendl;
6642 return true;
6643 }
6644
6645 if (m->get_map_epoch() < info.history.same_primary_since) {
6646 dout(7) << " changed after " << m->get_map_epoch()
6647 << ", dropping " << *m << dendl;
6648 return true;
6649 }
6650
6651 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6652 // >= luminous client
6653 if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
6654 // >= nautilus client
6655 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6656 dout(7) << __func__ << " sent before last_force_op_resend "
6657 << pool.info.last_force_op_resend
6658 		<< ", dropping " << *m << dendl;
6659 return true;
6660 }
6661 } else {
6662       // < nautilus client (luminous or mimic)
6663 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
6664 dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
6665 << pool.info.last_force_op_resend_prenautilus
6666 		<< ", dropping " << *m << dendl;
6667 return true;
6668 }
6669 }
6670 if (m->get_map_epoch() < info.history.last_epoch_split) {
6671 dout(7) << __func__ << " pg split in "
6672 << info.history.last_epoch_split << ", dropping" << dendl;
6673 return true;
6674 }
6675 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6676 // < luminous client
6677 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6678 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6679 << pool.info.last_force_op_resend_preluminous
6680 	      << ", dropping " << *m << dendl;
6681 return true;
6682 }
6683 }
6684
6685 return false;
6686 }
6687
6688 template<typename T, int MSGTYPE>
6689 bool PG::can_discard_replica_op(OpRequestRef& op)
6690 {
6691 const T *m = static_cast<const T *>(op->get_req());
6692 ceph_assert(m->get_type() == MSGTYPE);
6693
6694 int from = m->get_source().num();
6695
6696 // if a repop is replied after a replica goes down in a new osdmap, and
6697 // before the pg advances to this new osdmap, the repop replies before this
6698 // repop can be discarded by that replica OSD, because the primary resets the
6699 // connection to it when handling the new osdmap marking it down, and also
6700   // resets the messenger session when the replica reconnects. to avoid the
6701 // out-of-order replies, the messages from that replica should be discarded.
6702 OSDMapRef next_map = osd->get_next_osdmap();
6703 if (next_map->is_down(from))
6704 return true;
6705 /* Mostly, this overlaps with the old_peering_msg
6706 * condition. An important exception is pushes
6707 * sent by replicas not in the acting set, since
6708 * if such a replica goes down it does not cause
6709 * a new interval. */
6710 if (next_map->get_down_at(from) >= m->map_epoch)
6711 return true;
6712
6713 // same pg?
6714 // if pg changes _at all_, we reset and repeer!
6715 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6716 dout(10) << "can_discard_replica_op pg changed " << info.history
6717 << " after " << m->map_epoch
6718 << ", dropping" << dendl;
6719 return true;
6720 }
6721 return false;
6722 }
6723
6724 bool PG::can_discard_scan(OpRequestRef op)
6725 {
6726 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6727 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
6728
6729 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6730 dout(10) << " got old scan, ignoring" << dendl;
6731 return true;
6732 }
6733 return false;
6734 }
6735
6736 bool PG::can_discard_backfill(OpRequestRef op)
6737 {
6738 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6739 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6740
6741 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6742 dout(10) << " got old backfill, ignoring" << dendl;
6743 return true;
6744 }
6745
6746 return false;
6747
6748 }
6749
6750 bool PG::can_discard_request(OpRequestRef& op)
6751 {
6752 switch (op->get_req()->get_type()) {
6753 case CEPH_MSG_OSD_OP:
6754 return can_discard_op(op);
6755 case CEPH_MSG_OSD_BACKOFF:
6756 return false; // never discard
6757 case MSG_OSD_REPOP:
6758 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6759 case MSG_OSD_PG_PUSH:
6760 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6761 case MSG_OSD_PG_PULL:
6762 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6763 case MSG_OSD_PG_PUSH_REPLY:
6764 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6765 case MSG_OSD_REPOPREPLY:
6766 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6767 case MSG_OSD_PG_RECOVERY_DELETE:
6768 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6769
6770 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6771 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6772
6773 case MSG_OSD_EC_WRITE:
6774 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6775 case MSG_OSD_EC_WRITE_REPLY:
6776 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6777 case MSG_OSD_EC_READ:
6778 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6779 case MSG_OSD_EC_READ_REPLY:
6780 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6781 case MSG_OSD_REP_SCRUB:
6782 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6783 case MSG_OSD_SCRUB_RESERVE:
6784 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6785 case MSG_OSD_REP_SCRUBMAP:
6786 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6787 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6788 return can_discard_replica_op<
6789 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6790 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6791 return can_discard_replica_op<
6792 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6793
6794 case MSG_OSD_PG_SCAN:
6795 return can_discard_scan(op);
6796 case MSG_OSD_PG_BACKFILL:
6797 return can_discard_backfill(op);
6798 case MSG_OSD_PG_BACKFILL_REMOVE:
6799 return can_discard_replica_op<MOSDPGBackfillRemove,
6800 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6801 }
6802 return true;
6803 }
6804
6805 void PG::take_waiters()
6806 {
6807 dout(10) << "take_waiters" << dendl;
6808 requeue_map_waiters();
6809 }
6810
6811 void PG::do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rctx)
6812 {
6813 dout(10) << __func__ << ": " << evt->get_desc() << dendl;
6814 ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
6815 if (old_peering_evt(evt)) {
6816 dout(10) << "discard old " << evt->get_desc() << dendl;
6817 } else {
6818 recovery_state.handle_event(evt, rctx);
6819 }
6820 // write_if_dirty regardless of path above to ensure we capture any work
6821 // done by OSD::advance_pg().
6822 write_if_dirty(*rctx->transaction);
6823 }
6824
6825 void PG::queue_peering_event(PGPeeringEventRef evt)
6826 {
6827 if (old_peering_evt(evt))
6828 return;
6829 osd->osd->enqueue_peering_evt(info.pgid, evt);
6830 }
6831
6832 void PG::queue_null(epoch_t msg_epoch,
6833 epoch_t query_epoch)
6834 {
6835 dout(10) << "null" << dendl;
6836 queue_peering_event(
6837 PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
6838 NullEvt())));
6839 }
6840
6841 void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx)
6842 {
6843 /*
6844 * if we couldn't start any recovery ops and things are still
6845 * unfound, see if we can discover more missing object locations.
6846 * It may be that our initial locations were bad and we errored
6847 * out while trying to pull.
6848 */
6849 discover_all_missing(*rctx->query_map);
6850 if (rctx->query_map->empty()) {
6851 string action;
6852 if (state_test(PG_STATE_BACKFILLING)) {
6853 auto evt = PGPeeringEventRef(
6854 new PGPeeringEvent(
6855 queued,
6856 queued,
6857 PG::UnfoundBackfill()));
6858 queue_peering_event(evt);
6859 action = "in backfill";
6860 } else if (state_test(PG_STATE_RECOVERING)) {
6861 auto evt = PGPeeringEventRef(
6862 new PGPeeringEvent(
6863 queued,
6864 queued,
6865 PG::UnfoundRecovery()));
6866 queue_peering_event(evt);
6867 action = "in recovery";
6868 } else {
6869 action = "already out of recovery/backfill";
6870 }
6871 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
6872 } else {
6873 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
6874 queue_recovery();
6875 }
6876 }
6877
6878 void PG::handle_advance_map(
6879 OSDMapRef osdmap, OSDMapRef lastmap,
6880 vector<int>& newup, int up_primary,
6881 vector<int>& newacting, int acting_primary,
6882 RecoveryCtx *rctx)
6883 {
6884 ceph_assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6885 ceph_assert(lastmap == osdmap_ref);
6886 dout(10) << "handle_advance_map "
6887 << newup << "/" << newacting
6888 << " -- " << up_primary << "/" << acting_primary
6889 << dendl;
6890 update_osdmap_ref(osdmap);
6891 osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
6892
6893 pool.update(cct, osdmap);
6894
6895 AdvMap evt(
6896 osdmap, lastmap, newup, up_primary,
6897 newacting, acting_primary);
6898 recovery_state.handle_event(evt, rctx);
6899 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6900 on_pool_change();
6901 update_store_with_options();
6902 }
6903 last_require_osd_release = osdmap->require_osd_release;
6904 }
6905
6906 void PG::handle_activate_map(RecoveryCtx *rctx)
6907 {
6908 dout(10) << "handle_activate_map " << dendl;
6909 ActMap evt;
6910 recovery_state.handle_event(evt, rctx);
6911 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
6912 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6913 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6914 << last_persisted_osdmap
6915 << " while current is " << osdmap_ref->get_epoch() << dendl;
6916 dirty_info = true;
6917 } else {
6918 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6919 << last_persisted_osdmap
6920 << " while current is " << osdmap_ref->get_epoch() << dendl;
6921 }
6922 if (osdmap_ref->check_new_blacklist_entries()) {
6923 check_blacklisted_watchers();
6924 }
6925 write_if_dirty(*rctx->transaction);
6926 }
6927
6928 void PG::handle_initialize(RecoveryCtx *rctx)
6929 {
6930 dout(10) << __func__ << dendl;
6931 Initialize evt;
6932 recovery_state.handle_event(evt, rctx);
6933 }
6934
6935 void PG::handle_query_state(Formatter *f)
6936 {
6937 dout(10) << "handle_query_state" << dendl;
6938 QueryState q(f);
6939 recovery_state.handle_event(q, 0);
6940 }
6941
6942 void PG::update_store_with_options()
6943 {
6944 auto r = osd->store->set_collection_opts(ch, pool.info.opts);
6945   if (r < 0 && r != -EOPNOTSUPP) {
6946     derr << __func__ << " set_collection_opts returns error: " << r << dendl;
6947 }
6948 }
6949
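// Completion used by _delete_some() below: once the removal transaction
// commits, re-queue the PG for more deletion work (unless the PG has been
// reset since).  Note that complete() is overridden directly, so finish()
// should never be reached.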
6950 struct C_DeleteMore : public Context {
6951 PGRef pg;
6952 epoch_t epoch;
6953 C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
6954 void finish(int r) override {
6955 ceph_abort();
6956 }
6957 void complete(int r) override {
6958 ceph_assert(r == 0);
6959 pg->lock();
6960 if (!pg->pg_has_reset_since(epoch)) {
6961 pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
6962 }
6963 pg->unlock();
6964 delete this;
6965 }
6966 };
6967
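// Remove a bounded batch of objects from the PG collection per transaction
// (at most min(ideal_list_max, osd_target_transaction_size) objects),
// optionally sleeping between batches when osd_delete_sleep is set.
// C_DeleteMore re-queues us until the collection is empty, at which point
// the collection itself is removed and the PG is marked deleted (or
// reinstantiated if we raced with a merge).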
6968 void PG::_delete_some(ObjectStore::Transaction *t)
6969 {
6970 dout(10) << __func__ << dendl;
6971
6972 {
6973 float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
6974 if (osd_delete_sleep > 0 && delete_needs_sleep) {
6975 epoch_t e = get_osdmap()->get_epoch();
6976 PGRef pgref(this);
6977 auto delete_requeue_callback = new FunctionContext([this, pgref, e](int r) {
6978 dout(20) << __func__ << " wake up at "
6979 << ceph_clock_now()
6980 << ", re-queuing delete" << dendl;
6981 lock();
6982 delete_needs_sleep = false;
6983 if (!pg_has_reset_since(e)) {
6984 osd->queue_for_pg_delete(get_pgid(), e);
6985 }
6986 unlock();
6987 });
6988
6989 utime_t delete_schedule_time = ceph_clock_now();
6990 delete_schedule_time += osd_delete_sleep;
6991 Mutex::Locker l(osd->sleep_lock);
6992 osd->sleep_timer.add_event_at(delete_schedule_time,
6993 delete_requeue_callback);
6994 dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
6995 return;
6996 }
6997 }
6998
6999 delete_needs_sleep = true;
7000
7001 vector<ghobject_t> olist;
7002 int max = std::min(osd->store->get_ideal_list_max(),
7003 (int)cct->_conf->osd_target_transaction_size);
7004 ghobject_t next;
7005 osd->store->collection_list(
7006 ch,
7007 next,
7008 ghobject_t::get_max(),
7009 max,
7010 &olist,
7011 &next);
7012 dout(20) << __func__ << " " << olist << dendl;
7013
7014 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
7015 int64_t num = 0;
7016 for (auto& oid : olist) {
7017 if (oid.is_pgmeta()) {
7018 continue;
7019 }
7020 int r = snap_mapper.remove_oid(oid.hobj, &_t);
7021 if (r != 0 && r != -ENOENT) {
7022 ceph_abort();
7023 }
7024 t->remove(coll, oid);
7025 ++num;
7026 }
7027 if (num) {
7028 dout(20) << __func__ << " deleting " << num << " objects" << dendl;
7029 Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
7030 t->register_on_commit(fin);
7031 } else {
7032 dout(20) << __func__ << " finished" << dendl;
7033 if (cct->_conf->osd_inject_failure_on_pg_removal) {
7034 _exit(1);
7035 }
7036
7037 // final flush here to ensure completions drop refs. Of particular concern
7038 // are the SnapMapper ContainerContexts.
7039 {
7040 PGRef pgref(this);
7041 PGLog::clear_info_log(info.pgid, t);
7042 t->remove_collection(coll);
7043 t->register_on_commit(new ContainerContext<PGRef>(pgref));
7044 t->register_on_applied(new ContainerContext<PGRef>(pgref));
7045 osd->store->queue_transaction(ch, std::move(*t));
7046 }
7047 ch->flush();
7048
7049 if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
7050 dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
7051 ch = osd->store->create_new_collection(coll);
7052 _create(*t,
7053 info.pgid,
7054 info.pgid.get_split_bits(pool.info.get_pg_num()));
7055 _init(*t, info.pgid, &pool.info);
7056 last_epoch = 0; // to ensure pg epoch is also written
7057 dirty_info = true;
7058 dirty_big_info = true;
7059 } else {
7060 deleted = true;
7061
7062 // cancel reserver here, since the PG is about to get deleted and the
7063 // exit() methods don't run when that happens.
7064 osd->local_reserver.cancel_reservation(info.pgid);
7065
7066 osd->logger->dec(l_osd_pg_removing);
7067 }
7068 }
7069 }
7070
7071 // Compute pending backfill data
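// Illustrative example of the formula below: if the primary reports
// bf_bytes = 100 MiB for this PG and we already hold local_bytes = 30 MiB,
// the pending backfill is 70 MiB; if local_bytes exceeds bf_bytes the
// result clamps to 0.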
7072 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
7073 {
7074 lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
7075 << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
7076 return std::max((int64_t)0, bf_bytes - local_bytes);
7077 }
7078
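// On a non-primary shard, shrink the statfs 'available' figure by the bytes
// reserved for an incoming backfill, so fullness checks account for space we
// are already committed to consuming.  Returns 1 if an adjustment was made,
// 0 otherwise (including on the primary, which never adjusts).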
7079 int PG::pg_stat_adjust(osd_stat_t *ns)
7080 {
7081 osd_stat_t &new_stat = *ns;
7082 if (is_primary()) {
7083 return 0;
7084 }
7085 // Adjust the kb_used by adding pending backfill data
7086 uint64_t reserved_num_bytes = get_reserved_num_bytes();
7087
7088 // For now we don't consider projected space gains here.
7089 // I suggest we have an optional 2-pass backfill that frees up
7090 // space in a first pass. This could be triggered when at nearfull
7091 // or near backfillfull.
7092 if (reserved_num_bytes > 0) {
7093 // TODO: Handle compression by adjusting by the PG's average
7094 // compression percentage.
7095 dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
7096 << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7097 if (new_stat.statfs.available > reserved_num_bytes)
7098 new_stat.statfs.available -= reserved_num_bytes;
7099 else
7100 new_stat.statfs.available = 0;
7101 dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7102 return 1;
7103 }
7104 return 0;
7105 }
7106
7107
7108 /*------------ Recovery State Machine----------------*/
7109 #undef dout_prefix
7110 #define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \
7111 << "state<" << get_state_name() << ">: ")
7112
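// Each state below is a boost::statechart state: its constructor logs the
// transition via log_enter(), and exit() logs log_exit() and records the
// time spent in the state in the corresponding rs_*_latency perf counter.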
7113 /*------Crashed-------*/
7114 PG::RecoveryState::Crashed::Crashed(my_context ctx)
7115 : my_base(ctx),
7116 NamedState(context< RecoveryMachine >().pg, "Crashed")
7117 {
7118 context< RecoveryMachine >().log_enter(state_name);
7119 ceph_abort_msg("we got a bad state machine event");
7120 }
7121
7122
7123 /*------Initial-------*/
7124 PG::RecoveryState::Initial::Initial(my_context ctx)
7125 : my_base(ctx),
7126 NamedState(context< RecoveryMachine >().pg, "Initial")
7127 {
7128 context< RecoveryMachine >().log_enter(state_name);
7129 }
7130
7131 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
7132 {
7133 PG *pg = context< RecoveryMachine >().pg;
7134 pg->proc_replica_info(
7135 notify.from, notify.notify.info, notify.notify.epoch_sent);
7136 pg->set_last_peering_reset();
7137 return transit< Primary >();
7138 }
7139
7140 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
7141 {
7142 PG *pg = context< RecoveryMachine >().pg;
7143 ceph_assert(!pg->is_primary());
7144 post_event(i);
7145 return transit< Stray >();
7146 }
7147
7148 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
7149 {
7150 PG *pg = context< RecoveryMachine >().pg;
7151 ceph_assert(!pg->is_primary());
7152 post_event(i);
7153 return transit< Stray >();
7154 }
7155
7156 void PG::RecoveryState::Initial::exit()
7157 {
7158 context< RecoveryMachine >().log_exit(state_name, enter_time);
7159 PG *pg = context< RecoveryMachine >().pg;
7160 utime_t dur = ceph_clock_now() - enter_time;
7161 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
7162 }
7163
7164 /*------Started-------*/
7165 PG::RecoveryState::Started::Started(my_context ctx)
7166 : my_base(ctx),
7167 NamedState(context< RecoveryMachine >().pg, "Started")
7168 {
7169 context< RecoveryMachine >().log_enter(state_name);
7170 }
7171
7172 boost::statechart::result
7173 PG::RecoveryState::Started::react(const IntervalFlush&)
7174 {
7175 PG *pg = context< RecoveryMachine >().pg;
7176 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7177 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7178 return discard_event();
7179 }
7180
7181 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
7182 {
7183 PG *pg = context< RecoveryMachine >().pg;
7184 ldout(pg->cct, 10) << "Started advmap" << dendl;
7185 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7186 if (pg->should_restart_peering(
7187 advmap.up_primary,
7188 advmap.acting_primary,
7189 advmap.newup,
7190 advmap.newacting,
7191 advmap.lastmap,
7192 advmap.osdmap)) {
7193 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
7194 << dendl;
7195 post_event(advmap);
7196 return transit< Reset >();
7197 }
7198 pg->remove_down_peer_info(advmap.osdmap);
7199 return discard_event();
7200 }
7201
7202 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
7203 {
7204 q.f->open_object_section("state");
7205 q.f->dump_string("name", state_name);
7206 q.f->dump_stream("enter_time") << enter_time;
7207 q.f->close_section();
7208 return discard_event();
7209 }
7210
7211 void PG::RecoveryState::Started::exit()
7212 {
7213 context< RecoveryMachine >().log_exit(state_name, enter_time);
7214 PG *pg = context< RecoveryMachine >().pg;
7215 utime_t dur = ceph_clock_now() - enter_time;
7216 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
7217 }
7218
7219 /*--------Reset---------*/
7220 PG::RecoveryState::Reset::Reset(my_context ctx)
7221 : my_base(ctx),
7222 NamedState(context< RecoveryMachine >().pg, "Reset")
7223 {
7224 context< RecoveryMachine >().log_enter(state_name);
7225 PG *pg = context< RecoveryMachine >().pg;
7226
7227 pg->flushes_in_progress = 0;
7228 pg->set_last_peering_reset();
7229 }
7230
7231 boost::statechart::result
7232 PG::RecoveryState::Reset::react(const IntervalFlush&)
7233 {
7234 PG *pg = context< RecoveryMachine >().pg;
7235 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7236 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7237 return discard_event();
7238 }
7239
7240 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
7241 {
7242 PG *pg = context< RecoveryMachine >().pg;
7243 ldout(pg->cct, 10) << "Reset advmap" << dendl;
7244
7245 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7246
7247 if (pg->should_restart_peering(
7248 advmap.up_primary,
7249 advmap.acting_primary,
7250 advmap.newup,
7251 advmap.newacting,
7252 advmap.lastmap,
7253 advmap.osdmap)) {
7254 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
7255 << dendl;
7256 pg->start_peering_interval(
7257 advmap.lastmap,
7258 advmap.newup, advmap.up_primary,
7259 advmap.newacting, advmap.acting_primary,
7260 context< RecoveryMachine >().get_cur_transaction());
7261 }
7262 pg->remove_down_peer_info(advmap.osdmap);
7263 pg->check_past_interval_bounds();
7264 return discard_event();
7265 }
7266
7267 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
7268 {
7269 PG *pg = context< RecoveryMachine >().pg;
7270 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7271 context< RecoveryMachine >().send_notify(
7272 pg->get_primary(),
7273 pg_notify_t(
7274 pg->get_primary().shard, pg->pg_whoami.shard,
7275 pg->get_osdmap_epoch(),
7276 pg->get_osdmap_epoch(),
7277 pg->info),
7278 pg->past_intervals);
7279 }
7280
7281 pg->update_heartbeat_peers();
7282 pg->take_waiters();
7283
7284 return transit< Started >();
7285 }
7286
7287 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
7288 {
7289 q.f->open_object_section("state");
7290 q.f->dump_string("name", state_name);
7291 q.f->dump_stream("enter_time") << enter_time;
7292 q.f->close_section();
7293 return discard_event();
7294 }
7295
7296 void PG::RecoveryState::Reset::exit()
7297 {
7298 context< RecoveryMachine >().log_exit(state_name, enter_time);
7299 PG *pg = context< RecoveryMachine >().pg;
7300 utime_t dur = ceph_clock_now() - enter_time;
7301 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
7302 }
7303
7304 /*-------Start---------*/
7305 PG::RecoveryState::Start::Start(my_context ctx)
7306 : my_base(ctx),
7307 NamedState(context< RecoveryMachine >().pg, "Start")
7308 {
7309 context< RecoveryMachine >().log_enter(state_name);
7310
7311 PG *pg = context< RecoveryMachine >().pg;
7312 if (pg->is_primary()) {
7313 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
7314 post_event(MakePrimary());
7315 } else { //is_stray
7316 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
7317 post_event(MakeStray());
7318 }
7319 }
7320
7321 void PG::RecoveryState::Start::exit()
7322 {
7323 context< RecoveryMachine >().log_exit(state_name, enter_time);
7324 PG *pg = context< RecoveryMachine >().pg;
7325 utime_t dur = ceph_clock_now() - enter_time;
7326 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
7327 }
7328
7329 /*---------Primary--------*/
7330 PG::RecoveryState::Primary::Primary(my_context ctx)
7331 : my_base(ctx),
7332 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
7333 {
7334 context< RecoveryMachine >().log_enter(state_name);
7335 PG *pg = context< RecoveryMachine >().pg;
7336 ceph_assert(pg->want_acting.empty());
7337
7338 // set CREATING bit until we have peered for the first time.
7339 if (pg->info.history.last_epoch_started == 0) {
7340 pg->state_set(PG_STATE_CREATING);
7341 // use the history timestamp, which ultimately comes from the
7342 // monitor in the create case.
7343 utime_t t = pg->info.history.last_scrub_stamp;
7344 pg->info.stats.last_fresh = t;
7345 pg->info.stats.last_active = t;
7346 pg->info.stats.last_change = t;
7347 pg->info.stats.last_peered = t;
7348 pg->info.stats.last_clean = t;
7349 pg->info.stats.last_unstale = t;
7350 pg->info.stats.last_undegraded = t;
7351 pg->info.stats.last_fullsized = t;
7352 pg->info.stats.last_scrub_stamp = t;
7353 pg->info.stats.last_deep_scrub_stamp = t;
7354 pg->info.stats.last_clean_scrub_stamp = t;
7355 }
7356 }
7357
7358 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
7359 {
7360 PG *pg = context< RecoveryMachine >().pg;
7361 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7362 pg->proc_replica_info(
7363 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7364 return discard_event();
7365 }
7366
7367 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
7368 {
7369 PG *pg = context< RecoveryMachine >().pg;
7370 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
7371 pg->publish_stats_to_osd();
7372 pg->take_waiters();
7373 return discard_event();
7374 }
7375
7376 boost::statechart::result PG::RecoveryState::Primary::react(
7377 const SetForceRecovery&)
7378 {
7379 PG *pg = context< RecoveryMachine >().pg;
7380 pg->set_force_recovery(true);
7381 return discard_event();
7382 }
7383
7384 boost::statechart::result PG::RecoveryState::Primary::react(
7385 const UnsetForceRecovery&)
7386 {
7387 PG *pg = context< RecoveryMachine >().pg;
7388 pg->set_force_recovery(false);
7389 return discard_event();
7390 }
7391
7392 boost::statechart::result PG::RecoveryState::Primary::react(
7393 const RequestScrub& evt)
7394 {
7395 PG *pg = context< RecoveryMachine >().pg;
7396 if (pg->is_primary()) {
7397 pg->unreg_next_scrub();
7398 pg->scrubber.must_scrub = true;
7399 pg->scrubber.must_deep_scrub = evt.deep || evt.repair;
7400 pg->scrubber.must_repair = evt.repair;
7401 pg->reg_next_scrub();
7402 ldout(pg->cct,10) << "marking for scrub" << dendl;
7403 }
7404 return discard_event();
7405 }
7406
7407 boost::statechart::result PG::RecoveryState::Primary::react(
7408 const SetForceBackfill&)
7409 {
7410 PG *pg = context< RecoveryMachine >().pg;
7411 pg->set_force_backfill(true);
7412 return discard_event();
7413 }
7414
7415 boost::statechart::result PG::RecoveryState::Primary::react(
7416 const UnsetForceBackfill&)
7417 {
7418 PG *pg = context< RecoveryMachine >().pg;
7419 pg->set_force_backfill(false);
7420 return discard_event();
7421 }
7422
7423 void PG::RecoveryState::Primary::exit()
7424 {
7425 context< RecoveryMachine >().log_exit(state_name, enter_time);
7426 PG *pg = context< RecoveryMachine >().pg;
7427 pg->want_acting.clear();
7428 utime_t dur = ceph_clock_now() - enter_time;
7429 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
7430 pg->clear_primary_state();
7431 pg->state_clear(PG_STATE_CREATING);
7432 }
7433
7434 /*---------Peering--------*/
7435 PG::RecoveryState::Peering::Peering(my_context ctx)
7436 : my_base(ctx),
7437 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
7438 history_les_bound(false)
7439 {
7440 context< RecoveryMachine >().log_enter(state_name);
7441
7442 PG *pg = context< RecoveryMachine >().pg;
7443 ceph_assert(!pg->is_peered());
7444 ceph_assert(!pg->is_peering());
7445 ceph_assert(pg->is_primary());
7446 pg->state_set(PG_STATE_PEERING);
7447 }
7448
7449 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
7450 {
7451 PG *pg = context< RecoveryMachine >().pg;
7452 ldout(pg->cct, 10) << "Peering advmap" << dendl;
7453 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
7454 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
7455 post_event(advmap);
7456 return transit< Reset >();
7457 }
7458
7459 pg->adjust_need_up_thru(advmap.osdmap);
7460
7461 return forward_event();
7462 }
7463
7464 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
7465 {
7466 PG *pg = context< RecoveryMachine >().pg;
7467
7468 q.f->open_object_section("state");
7469 q.f->dump_string("name", state_name);
7470 q.f->dump_stream("enter_time") << enter_time;
7471
7472 q.f->open_array_section("past_intervals");
7473 pg->past_intervals.dump(q.f);
7474 q.f->close_section();
7475
7476 q.f->open_array_section("probing_osds");
7477 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
7478 p != prior_set.probe.end();
7479 ++p)
7480 q.f->dump_stream("osd") << *p;
7481 q.f->close_section();
7482
7483 if (prior_set.pg_down)
7484 q.f->dump_string("blocked", "peering is blocked due to down osds");
7485
7486 q.f->open_array_section("down_osds_we_would_probe");
7487 for (set<int>::iterator p = prior_set.down.begin();
7488 p != prior_set.down.end();
7489 ++p)
7490 q.f->dump_int("osd", *p);
7491 q.f->close_section();
7492
7493 q.f->open_array_section("peering_blocked_by");
7494 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
7495 p != prior_set.blocked_by.end();
7496 ++p) {
7497 q.f->open_object_section("osd");
7498 q.f->dump_int("osd", p->first);
7499 q.f->dump_int("current_lost_at", p->second);
7500 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
7501 q.f->close_section();
7502 }
7503 q.f->close_section();
7504
7505 if (history_les_bound) {
7506 q.f->open_array_section("peering_blocked_by_detail");
7507 q.f->open_object_section("item");
7508 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
7509 q.f->close_section();
7510 q.f->close_section();
7511 }
7512
7513 q.f->close_section();
7514 return forward_event();
7515 }
7516
7517 void PG::RecoveryState::Peering::exit()
7518 {
7519 PG *pg = context< RecoveryMachine >().pg;
7520 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
7521 context< RecoveryMachine >().log_exit(state_name, enter_time);
7522 pg->state_clear(PG_STATE_PEERING);
7523 pg->clear_probe_targets();
7524
7525 utime_t dur = ceph_clock_now() - enter_time;
7526 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
7527 }
7528
7529
7530 /*------Backfilling-------*/
7531 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
7532 : my_base(ctx),
7533 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
7534 {
7535 context< RecoveryMachine >().log_enter(state_name);
7536 PG *pg = context< RecoveryMachine >().pg;
7537 pg->backfill_reserved = true;
7538 pg->queue_recovery();
7539 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7540 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7541 pg->state_set(PG_STATE_BACKFILLING);
7542 pg->publish_stats_to_osd();
7543 }
7544
7545 void PG::RecoveryState::Backfilling::backfill_release_reservations()
7546 {
7547 PG *pg = context< RecoveryMachine >().pg;
7548 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7549 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
7550 it != pg->backfill_targets.end();
7551 ++it) {
7552 ceph_assert(*it != pg->pg_whoami);
7553 ConnectionRef con = pg->osd->get_con_osd_cluster(
7554 it->osd, pg->get_osdmap_epoch());
7555 if (con) {
7556 pg->osd->send_message_osd_cluster(
7557 new MBackfillReserve(
7558 MBackfillReserve::RELEASE,
7559 spg_t(pg->info.pgid.pgid, it->shard),
7560 pg->get_osdmap_epoch()),
7561 con.get());
7562 }
7563 }
7564 }
7565
7566 void PG::RecoveryState::Backfilling::cancel_backfill()
7567 {
7568 PG *pg = context< RecoveryMachine >().pg;
7569 backfill_release_reservations();
7570 if (!pg->waiting_on_backfill.empty()) {
7571 pg->waiting_on_backfill.clear();
7572 pg->finish_recovery_op(hobject_t::get_max());
7573 }
7574 }
7575
7576 boost::statechart::result
7577 PG::RecoveryState::Backfilling::react(const Backfilled &c)
7578 {
7579 backfill_release_reservations();
7580 return transit<Recovered>();
7581 }
7582
7583 boost::statechart::result
7584 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
7585 {
7586 PG *pg = context< RecoveryMachine >().pg;
7587 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
7588 pg->state_set(PG_STATE_BACKFILL_WAIT);
7589 pg->state_clear(PG_STATE_BACKFILLING);
7590 cancel_backfill();
7591 pg->schedule_backfill_retry(c.delay);
7592 return transit<NotBackfilling>();
7593 }
7594
7595 boost::statechart::result
7596 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
7597 {
7598 PG *pg = context< RecoveryMachine >().pg;
7599 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
7600 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
7601 pg->state_clear(PG_STATE_BACKFILLING);
7602 cancel_backfill();
7603 return transit<NotBackfilling>();
7604 }
7605
7606 boost::statechart::result
7607 PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &)
7608 {
7609 PG *pg = context< RecoveryMachine >().pg;
7610 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7611 pg->state_clear(PG_STATE_BACKFILLING);
7612 cancel_backfill();
7613 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7614 return transit<NotBackfilling>();
7615 }
7616
7617 boost::statechart::result
7618 PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &)
7619 {
7620 PG *pg = context< RecoveryMachine >().pg;
7621 pg->state_set(PG_STATE_BACKFILL_WAIT);
7622 cancel_backfill();
7623 if (pg->needs_backfill()) {
7624 return transit<WaitLocalBackfillReserved>();
7625 } else {
7626 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
7627 return discard_event();
7628 }
7629 }
7630
7631 void PG::RecoveryState::Backfilling::exit()
7632 {
7633 context< RecoveryMachine >().log_exit(state_name, enter_time);
7634 PG *pg = context< RecoveryMachine >().pg;
7635 pg->backfill_reserved = false;
7636 pg->backfill_reserving = false;
7637 pg->state_clear(PG_STATE_BACKFILLING);
7638 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7639 utime_t dur = ceph_clock_now() - enter_time;
7640 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
7641 }
7642
7643 /*--WaitRemoteBackfillReserved--*/
7644
7645 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
7646 : my_base(ctx),
7647 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7648 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
7649 {
7650 context< RecoveryMachine >().log_enter(state_name);
7651 PG *pg = context< RecoveryMachine >().pg;
7652 pg->state_set(PG_STATE_BACKFILL_WAIT);
7653 pg->publish_stats_to_osd();
7654 post_event(RemoteBackfillReserved());
7655 }
7656
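// Remote backfill reservations are requested one shard at a time: the
// constructor posts an initial RemoteBackfillReserved event, and each grant
// below advances backfill_osd_it and sends the next MBackfillReserve REQUEST;
// once the iterator is exhausted, AllBackfillsReserved moves us on.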
7657 boost::statechart::result
7658 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
7659 {
7660 PG *pg = context< RecoveryMachine >().pg;
7661
7662 int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
7663 ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
7664 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
7665 //The primary never backfills itself
7666 ceph_assert(*backfill_osd_it != pg->pg_whoami);
7667 ConnectionRef con = pg->osd->get_con_osd_cluster(
7668 backfill_osd_it->osd, pg->get_osdmap_epoch());
7669 if (con) {
7670 pg->osd->send_message_osd_cluster(
7671 new MBackfillReserve(
7672 MBackfillReserve::REQUEST,
7673 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7674 pg->get_osdmap_epoch(),
7675 pg->get_backfill_priority(),
7676 num_bytes,
7677 pg->peer_bytes[*backfill_osd_it]),
7678 con.get());
7679 }
7680 ++backfill_osd_it;
7681 } else {
7682 pg->peer_bytes.clear();
7683 post_event(AllBackfillsReserved());
7684 }
7685 return discard_event();
7686 }
7687
7688 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7689 {
7690 context< RecoveryMachine >().log_exit(state_name, enter_time);
7691 PG *pg = context< RecoveryMachine >().pg;
7692 utime_t dur = ceph_clock_now() - enter_time;
7693 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7694 }
7695
7696 void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
7697 {
7698 PG *pg = context< RecoveryMachine >().pg;
7699 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7700
7701 // Send RELEASE to all previously acquired reservations
7702 set<pg_shard_t>::const_iterator it, begin, end;
7703 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7704 end = context< Active >().remote_shards_to_reserve_backfill.end();
7705 ceph_assert(begin != end);
7706 for (it = begin; it != backfill_osd_it; ++it) {
7707 //The primary never backfills itself
7708 ceph_assert(*it != pg->pg_whoami);
7709 ConnectionRef con = pg->osd->get_con_osd_cluster(
7710 it->osd, pg->get_osdmap_epoch());
7711 if (con) {
7712 pg->osd->send_message_osd_cluster(
7713 new MBackfillReserve(
7714 MBackfillReserve::RELEASE,
7715 spg_t(pg->info.pgid.pgid, it->shard),
7716 pg->get_osdmap_epoch()),
7717 con.get());
7718 }
7719 }
7720
7721 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7722 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7723 pg->publish_stats_to_osd();
7724
7725 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7726 }
7727
7728 boost::statechart::result
7729 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
7730 {
7731 retry();
7732 return transit<NotBackfilling>();
7733 }
7734
7735 boost::statechart::result
7736 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
7737 {
7738 retry();
7739 return transit<NotBackfilling>();
7740 }
7741
7742 /*--WaitLocalBackfillReserved--*/
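// Ask the local AsyncReserver for a backfill slot: the first callback queues
// a LocalBackfillReserved peering event when the reservation is granted; the
// second queues DeferBackfill if the reservation is preempted.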
7743 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7744 : my_base(ctx),
7745 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7746 {
7747 context< RecoveryMachine >().log_enter(state_name);
7748 PG *pg = context< RecoveryMachine >().pg;
7749 pg->state_set(PG_STATE_BACKFILL_WAIT);
7750 pg->osd->local_reserver.request_reservation(
7751 pg->info.pgid,
7752 new QueuePeeringEvt<LocalBackfillReserved>(
7753 pg, pg->get_osdmap_epoch(),
7754 LocalBackfillReserved()),
7755 pg->get_backfill_priority(),
7756 new QueuePeeringEvt<DeferBackfill>(
7757 pg, pg->get_osdmap_epoch(),
7758 DeferBackfill(0.0)));
7759 pg->publish_stats_to_osd();
7760 }
7761
7762 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7763 {
7764 context< RecoveryMachine >().log_exit(state_name, enter_time);
7765 PG *pg = context< RecoveryMachine >().pg;
7766 utime_t dur = ceph_clock_now() - enter_time;
7767 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7768 }
7769
7770 /*----NotBackfilling------*/
7771 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7772 : my_base(ctx),
7773 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7774 {
7775 context< RecoveryMachine >().log_enter(state_name);
7776 PG *pg = context< RecoveryMachine >().pg;
7777 pg->state_clear(PG_STATE_REPAIR);
7778 pg->publish_stats_to_osd();
7779 }
7780
7781 boost::statechart::result
7782 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7783 {
7784 return discard_event();
7785 }
7786
7787 boost::statechart::result
7788 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
7789 {
7790 return discard_event();
7791 }
7792
7793 void PG::RecoveryState::NotBackfilling::exit()
7794 {
7795 context< RecoveryMachine >().log_exit(state_name, enter_time);
7796 PG *pg = context< RecoveryMachine >().pg;
7797 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7798 utime_t dur = ceph_clock_now() - enter_time;
7799 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7800 }
7801
7802 /*----NotRecovering------*/
7803 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7804 : my_base(ctx),
7805 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7806 {
7807 context< RecoveryMachine >().log_enter(state_name);
7808 PG *pg = context< RecoveryMachine >().pg;
7809 pg->publish_stats_to_osd();
7810 }
7811
7812 void PG::RecoveryState::NotRecovering::exit()
7813 {
7814 context< RecoveryMachine >().log_exit(state_name, enter_time);
7815 PG *pg = context< RecoveryMachine >().pg;
7816 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7817 utime_t dur = ceph_clock_now() - enter_time;
7818 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7819 }
7820
7821 /*---RepNotRecovering----*/
7822 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7823 : my_base(ctx),
7824 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7825 {
7826 context< RecoveryMachine >().log_enter(state_name);
7827 }
7828
7829 boost::statechart::result
7830 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
7831 {
7832 PG *pg = context< RecoveryMachine >().pg;
7833 pg->reject_reservation();
7834 post_event(RemoteReservationRejected());
7835 return discard_event();
7836 }
7837
7838 void PG::RecoveryState::RepNotRecovering::exit()
7839 {
7840 context< RecoveryMachine >().log_exit(state_name, enter_time);
7841 PG *pg = context< RecoveryMachine >().pg;
7842 utime_t dur = ceph_clock_now() - enter_time;
7843 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7844 }
7845
7846 /*---RepWaitRecoveryReserved--*/
7847 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7848 : my_base(ctx),
7849 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7850 {
7851 context< RecoveryMachine >().log_enter(state_name);
7852 }
7853
7854 boost::statechart::result
7855 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7856 {
7857 PG *pg = context< RecoveryMachine >().pg;
7858 pg->osd->send_message_osd_cluster(
7859 pg->primary.osd,
7860 new MRecoveryReserve(
7861 MRecoveryReserve::GRANT,
7862 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7863 pg->get_osdmap_epoch()),
7864 pg->get_osdmap_epoch());
7865 return transit<RepRecovering>();
7866 }
7867
7868 boost::statechart::result
7869 PG::RecoveryState::RepWaitRecoveryReserved::react(
7870 const RemoteReservationCanceled &evt)
7871 {
7872 PG *pg = context< RecoveryMachine >().pg;
7873 pg->clear_reserved_num_bytes();
7874 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7875 return transit<RepNotRecovering>();
7876 }
7877
7878 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7879 {
7880 context< RecoveryMachine >().log_exit(state_name, enter_time);
7881 PG *pg = context< RecoveryMachine >().pg;
7882 utime_t dur = ceph_clock_now() - enter_time;
7883 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7884 }
7885
7886 /*-RepWaitBackfillReserved*/
7887 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7888 : my_base(ctx),
7889 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7890 {
7891 context< RecoveryMachine >().log_enter(state_name);
7892 }
7893
7894 boost::statechart::result
7895 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7896 {
7897 PG *pg = context< RecoveryMachine >().pg;
7898 // Use tentative_backfill_full() to make sure enough
7899 // space is available to handle target bytes from primary.
7900
7901 // TODO: If we passed num_objects from primary we could account for
7902 // an estimate of the metadata overhead.
7903
7904 // TODO: If we had compressed_allocated and compressed_original from primary
7905 // we could compute compression ratio and adjust accordingly.
7906
7907 // XXX: There is no way to get omap overhead, and this would only apply
7908 // to the (possibly different) partition that is storing the database.
7909
7910 // update_osd_stat() from heartbeat will do this on a new
7911 // statfs using pg->primary_num_bytes.
7912 uint64_t pending_adjustment = 0;
7913 int64_t primary_num_bytes = evt.primary_num_bytes;
7914 int64_t local_num_bytes = evt.local_num_bytes;
7915 if (primary_num_bytes) {
7916 // For an erasure coded pool, overestimate by a full stripe per object
7917 // because we don't know how each object rounded to the nearest stripe
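    // Illustrative arithmetic for the adjustment below: with 4 data chunks
    // and a 64 KiB stripe chunk size, 1 GiB of primary bytes becomes
    // 1 GiB / 4 + 64 KiB * num_objects on this shard (the same scaling is
    // applied to local_num_bytes).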
7918 if (pg->pool.info.is_erasure()) {
7919 primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7920 primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7921 local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
7922 local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
7923 }
7924 pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
7925 ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
7926 << " local " << (local_num_bytes >> 10) << "KiB"
7927 << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
7928 << dendl;
7929 }
7930 // This lock protects not only the OSDService stats but also the setting of
7931 // the PG's primary_num_bytes; that's why we don't unlock immediately.
7932 Mutex::Locker l(pg->osd->stat_lock);
7933 osd_stat_t cur_stat = pg->osd->osd_stat;
7934 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7935 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7936 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7937 << dendl;
7938 post_event(RejectRemoteReservation());
7939 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7940 pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
7941 ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
7942 << dendl;
7943 post_event(RejectRemoteReservation());
7944 } else {
7945 Context *preempt = nullptr;
7946 // Don't reserve space if we skipped the reservation check; this is used
7947 // to test the other backfill full check AND in case a corruption
7948 // of num_bytes requires ignoring that value and trying the
7949 // backfill anyway.
7950 if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
7951 pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
7952 else
7953 pg->clear_reserved_num_bytes();
7954 // Use un-ec-adjusted bytes for stats.
7955 pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
7956 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
7957 // older peers will interpret preemption as TOOFULL
7958 preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
7959 pg, pg->get_osdmap_epoch(),
7960 RemoteBackfillPreempted());
7961 }
7962 pg->osd->remote_reserver.request_reservation(
7963 pg->info.pgid,
7964 new QueuePeeringEvt<RemoteBackfillReserved>(
7965 pg, pg->get_osdmap_epoch(),
7966 RemoteBackfillReserved()),
7967 evt.priority,
7968 preempt);
7969 }
7970 return transit<RepWaitBackfillReserved>();
7971 }
7972
7973 boost::statechart::result
7974 PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
7975 {
7976 PG *pg = context< RecoveryMachine >().pg;
7977
7978 // fall back to a local reckoning of priority if the primary doesn't pass one
7979 // (pre-mimic compat)
7980 int prio = evt.priority ? evt.priority : pg->get_recovery_priority();
7981
7982 Context *preempt = nullptr;
7983 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
7984 // older peers can't handle this
7985 preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>(
7986 pg, pg->get_osdmap_epoch(),
7987 RemoteRecoveryPreempted());
7988 }
7989
7990 pg->osd->remote_reserver.request_reservation(
7991 pg->info.pgid,
7992 new QueuePeeringEvt<RemoteRecoveryReserved>(
7993 pg, pg->get_osdmap_epoch(),
7994 RemoteRecoveryReserved()),
7995 prio,
7996 preempt);
7997 return transit<RepWaitRecoveryReserved>();
7998 }
7999
8000 void PG::RecoveryState::RepWaitBackfillReserved::exit()
8001 {
8002 context< RecoveryMachine >().log_exit(state_name, enter_time);
8003 PG *pg = context< RecoveryMachine >().pg;
8004 utime_t dur = ceph_clock_now() - enter_time;
8005 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
8006 }
8007
8008 boost::statechart::result
8009 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
8010 {
8011 PG *pg = context< RecoveryMachine >().pg;
8012
8013 pg->osd->send_message_osd_cluster(
8014 pg->primary.osd,
8015 new MBackfillReserve(
8016 MBackfillReserve::GRANT,
8017 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8018 pg->get_osdmap_epoch()),
8019 pg->get_osdmap_epoch());
8020 return transit<RepRecovering>();
8021 }
8022
8023 boost::statechart::result
8024 PG::RecoveryState::RepWaitBackfillReserved::react(
8025 const RejectRemoteReservation &evt)
8026 {
8027 PG *pg = context< RecoveryMachine >().pg;
8028 pg->reject_reservation();
8029 post_event(RemoteReservationRejected());
8030 return discard_event();
8031 }
8032
8033 boost::statechart::result
8034 PG::RecoveryState::RepWaitBackfillReserved::react(
8035 const RemoteReservationRejected &evt)
8036 {
8037 PG *pg = context< RecoveryMachine >().pg;
8038 pg->clear_reserved_num_bytes();
8039 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8040 return transit<RepNotRecovering>();
8041 }
8042
8043 boost::statechart::result
8044 PG::RecoveryState::RepWaitBackfillReserved::react(
8045 const RemoteReservationCanceled &evt)
8046 {
8047 PG *pg = context< RecoveryMachine >().pg;
8048 pg->clear_reserved_num_bytes();
8049 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8050 return transit<RepNotRecovering>();
8051 }
8052
8053 /*---RepRecovering-------*/
8054 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
8055 : my_base(ctx),
8056 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
8057 {
8058 context< RecoveryMachine >().log_enter(state_name);
8059 }
8060
8061 boost::statechart::result
8062 PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
8063 {
8064 PG *pg = context< RecoveryMachine >().pg;
8065 pg->clear_reserved_num_bytes();
8066 pg->osd->send_message_osd_cluster(
8067 pg->primary.osd,
8068 new MRecoveryReserve(
8069 MRecoveryReserve::REVOKE,
8070 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8071 pg->get_osdmap_epoch()),
8072 pg->get_osdmap_epoch());
8073 return discard_event();
8074 }
8075
8076 boost::statechart::result
8077 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
8078 {
8079 PG *pg = context< RecoveryMachine >().pg;
8080 pg->clear_reserved_num_bytes();
8081 pg->osd->send_message_osd_cluster(
8082 pg->primary.osd,
8083 new MBackfillReserve(
8084 MBackfillReserve::TOOFULL,
8085 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8086 pg->get_osdmap_epoch()),
8087 pg->get_osdmap_epoch());
8088 return discard_event();
8089 }
8090
8091 boost::statechart::result
8092 PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
8093 {
8094 PG *pg = context< RecoveryMachine >().pg;
8095 pg->clear_reserved_num_bytes();
8096 pg->osd->send_message_osd_cluster(
8097 pg->primary.osd,
8098 new MBackfillReserve(
8099 MBackfillReserve::REVOKE,
8100 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8101 pg->get_osdmap_epoch()),
8102 pg->get_osdmap_epoch());
8103 return discard_event();
8104 }
8105
8106 void PG::RecoveryState::RepRecovering::exit()
8107 {
8108 context< RecoveryMachine >().log_exit(state_name, enter_time);
8109 PG *pg = context< RecoveryMachine >().pg;
8110 pg->clear_reserved_num_bytes();
8111 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8112 utime_t dur = ceph_clock_now() - enter_time;
8113 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
8114 }
8115
8116 /*------Activating--------*/
8117 PG::RecoveryState::Activating::Activating(my_context ctx)
8118 : my_base(ctx),
8119 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
8120 {
8121 context< RecoveryMachine >().log_enter(state_name);
8122 }
8123
8124 void PG::RecoveryState::Activating::exit()
8125 {
8126 context< RecoveryMachine >().log_exit(state_name, enter_time);
8127 PG *pg = context< RecoveryMachine >().pg;
8128 utime_t dur = ceph_clock_now() - enter_time;
8129 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
8130 }
8131
8132 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
8133 : my_base(ctx),
8134 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
8135 {
8136 context< RecoveryMachine >().log_enter(state_name);
8137 PG *pg = context< RecoveryMachine >().pg;
8138
8139 // Make sure all nodes that are part of the recovery aren't full
8140 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
8141 pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) {
8142 post_event(RecoveryTooFull());
8143 return;
8144 }
8145
8146 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8147 pg->state_set(PG_STATE_RECOVERY_WAIT);
8148 pg->osd->local_reserver.request_reservation(
8149 pg->info.pgid,
8150 new QueuePeeringEvt<LocalRecoveryReserved>(
8151 pg, pg->get_osdmap_epoch(),
8152 LocalRecoveryReserved()),
8153 pg->get_recovery_priority(),
8154 new QueuePeeringEvt<DeferRecovery>(
8155 pg, pg->get_osdmap_epoch(),
8156 DeferRecovery(0.0)));
8157 pg->publish_stats_to_osd();
8158 }
8159
8160 boost::statechart::result
8161 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
8162 {
8163 PG *pg = context< RecoveryMachine >().pg;
8164 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
8165 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
8166 return transit<NotRecovering>();
8167 }
8168
8169 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
8170 {
8171 context< RecoveryMachine >().log_exit(state_name, enter_time);
8172 PG *pg = context< RecoveryMachine >().pg;
8173 utime_t dur = ceph_clock_now() - enter_time;
8174 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
8175 }
8176
8177 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
8178 : my_base(ctx),
8179 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
8180 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
8181 {
8182 context< RecoveryMachine >().log_enter(state_name);
8183 post_event(RemoteRecoveryReserved());
8184 }
8185
8186 boost::statechart::result
8187 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
8188 PG *pg = context< RecoveryMachine >().pg;
8189
8190 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
8191 ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami);
8192 ConnectionRef con = pg->osd->get_con_osd_cluster(
8193 remote_recovery_reservation_it->osd, pg->get_osdmap_epoch());
8194 if (con) {
8195 pg->osd->send_message_osd_cluster(
8196 new MRecoveryReserve(
8197 MRecoveryReserve::REQUEST,
8198 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
8199 pg->get_osdmap_epoch(),
8200 pg->get_recovery_priority()),
8201 con.get());
8202 }
8203 ++remote_recovery_reservation_it;
8204 } else {
8205 post_event(AllRemotesReserved());
8206 }
8207 return discard_event();
8208 }
8209
8210 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
8211 {
8212 context< RecoveryMachine >().log_exit(state_name, enter_time);
8213 PG *pg = context< RecoveryMachine >().pg;
8214 utime_t dur = ceph_clock_now() - enter_time;
8215 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
8216 }
8217
8218 PG::RecoveryState::Recovering::Recovering(my_context ctx)
8219 : my_base(ctx),
8220 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
8221 {
8222 context< RecoveryMachine >().log_enter(state_name);
8223
8224 PG *pg = context< RecoveryMachine >().pg;
8225 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8226 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8227 pg->state_set(PG_STATE_RECOVERING);
8228 ceph_assert(!pg->state_test(PG_STATE_ACTIVATING));
8229 pg->publish_stats_to_osd();
8230 pg->queue_recovery();
8231 }
8232
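// Drop the remote recovery reservations we hold on every other shard in the
// acting/recovery set.  'cancel' is set when recovery is being interrupted
// (deferred or unfound), in which case objects may still be missing.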
8233 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
8234 {
8235 PG *pg = context< RecoveryMachine >().pg;
8236 ceph_assert(cancel || !pg->pg_log.get_missing().have_missing());
8237
8238 // release remote reservations
8239 for (set<pg_shard_t>::const_iterator i =
8240 context< Active >().remote_shards_to_reserve_recovery.begin();
8241 i != context< Active >().remote_shards_to_reserve_recovery.end();
8242 ++i) {
8243 if (*i == pg->pg_whoami) // skip myself
8244 continue;
8245 ConnectionRef con = pg->osd->get_con_osd_cluster(
8246 i->osd, pg->get_osdmap_epoch());
8247 if (con) {
8248 pg->osd->send_message_osd_cluster(
8249 new MRecoveryReserve(
8250 MRecoveryReserve::RELEASE,
8251 spg_t(pg->info.pgid.pgid, i->shard),
8252 pg->get_osdmap_epoch()),
8253 con.get());
8254 }
8255 }
8256 }
8257
8258 boost::statechart::result
8259 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
8260 {
8261 PG *pg = context< RecoveryMachine >().pg;
8262 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8263 release_reservations();
8264 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8265 return transit<Recovered>();
8266 }
8267
8268 boost::statechart::result
8269 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
8270 {
8271 PG *pg = context< RecoveryMachine >().pg;
8272 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8273 release_reservations();
8274 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8275 // XXX: Is this needed?
8276 pg->publish_stats_to_osd();
8277 return transit<WaitLocalBackfillReserved>();
8278 }
8279
8280 boost::statechart::result
8281 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
8282 {
8283 PG *pg = context< RecoveryMachine >().pg;
8284 if (!pg->state_test(PG_STATE_RECOVERING)) {
8285 // we may have finished recovery and have an AllReplicasRecovered
8286 // event queued to move us to the next state.
8287 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
8288 return discard_event();
8289 }
8290 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
8291 pg->state_set(PG_STATE_RECOVERY_WAIT);
8292 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8293 release_reservations(true);
8294 pg->schedule_recovery_retry(evt.delay);
8295 return transit<NotRecovering>();
8296 }
8297
8298 boost::statechart::result
8299 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
8300 {
8301 PG *pg = context< RecoveryMachine >().pg;
8302 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
8303 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
8304 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8305 release_reservations(true);
8306 return transit<NotRecovering>();
8307 }
8308
8309 void PG::RecoveryState::Recovering::exit()
8310 {
8311 context< RecoveryMachine >().log_exit(state_name, enter_time);
8312 PG *pg = context< RecoveryMachine >().pg;
8313 utime_t dur = ceph_clock_now() - enter_time;
8314 pg->state_clear(PG_STATE_RECOVERING);
8315 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
8316 }
8317
8318 PG::RecoveryState::Recovered::Recovered(my_context ctx)
8319 : my_base(ctx),
8320 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
8321 {
8322 pg_shard_t auth_log_shard;
8323
8324 context< RecoveryMachine >().log_enter(state_name);
8325
8326 PG *pg = context< RecoveryMachine >().pg;
8327
8328 ceph_assert(!pg->needs_recovery());
8329
8330 // if we finished backfill, all acting are active; recheck if
8331 // DEGRADED | UNDERSIZED is appropriate.
8332 ceph_assert(!pg->acting_recovery_backfill.empty());
8333 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
8334 pg->acting_recovery_backfill.size()) {
8335 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
8336 pg->publish_stats_to_osd();
8337 }
8338
8339 // adjust acting set? (e.g. because backfill completed...)
8340 bool history_les_bound = false;
8341 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
8342 true, &history_les_bound)) {
8343 ceph_assert(pg->want_acting.size());
8344 } else if (!pg->async_recovery_targets.empty()) {
8345 pg->choose_acting(auth_log_shard, true, &history_les_bound);
8346 }
8347
8348 if (context< Active >().all_replicas_activated &&
8349 pg->async_recovery_targets.empty())
8350 post_event(GoClean());
8351 }
8352
8353 void PG::RecoveryState::Recovered::exit()
8354 {
8355 context< RecoveryMachine >().log_exit(state_name, enter_time);
8356 PG *pg = context< RecoveryMachine >().pg;
8357 utime_t dur = ceph_clock_now() - enter_time;
8358 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
8359 }
8360
8361 PG::RecoveryState::Clean::Clean(my_context ctx)
8362 : my_base(ctx),
8363 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
8364 {
8365 context< RecoveryMachine >().log_enter(state_name);
8366
8367 PG *pg = context< RecoveryMachine >().pg;
8368
8369 if (pg->info.last_complete != pg->info.last_update) {
8370 ceph_abort();
8371 }
8372 Context *c = pg->finish_recovery();
8373 context< RecoveryMachine >().get_cur_transaction()->register_on_commit(c);
8374
8375 pg->try_mark_clean();
8376 }
8377
8378 void PG::RecoveryState::Clean::exit()
8379 {
8380 context< RecoveryMachine >().log_exit(state_name, enter_time);
8381 PG *pg = context< RecoveryMachine >().pg;
8382 pg->state_clear(PG_STATE_CLEAN);
8383 utime_t dur = ceph_clock_now() - enter_time;
8384 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
8385 }
8386
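// Collapse a set of pg_shard_t to at most one entry per OSD id (skipping
// 'skip', i.e. ourselves), so that recovery/backfill reservations are
// requested only once per remote OSD even if several shards map to it.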
8387 template <typename T>
8388 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
8389 {
8390 set<int> osds_found;
8391 set<pg_shard_t> out;
8392 for (typename T::const_iterator i = in.begin();
8393 i != in.end();
8394 ++i) {
8395 if (*i != skip && !osds_found.count(i->osd)) {
8396 osds_found.insert(i->osd);
8397 out.insert(*i);
8398 }
8399 }
8400 return out;
8401 }
8402
8403 /*---------Active---------*/
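// Entering Active starts the flush, calls activate() with the current
// transaction/query/info maps, and records every other shard in
// acting_recovery_backfill in blocked_by, since all of them must commit/ack
// the activation before the PG is truly active.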
8404 PG::RecoveryState::Active::Active(my_context ctx)
8405 : my_base(ctx),
8406 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
8407 remote_shards_to_reserve_recovery(
8408 unique_osd_shard_set(
8409 context< RecoveryMachine >().pg->pg_whoami,
8410 context< RecoveryMachine >().pg->acting_recovery_backfill)),
8411 remote_shards_to_reserve_backfill(
8412 unique_osd_shard_set(
8413 context< RecoveryMachine >().pg->pg_whoami,
8414 context< RecoveryMachine >().pg->backfill_targets)),
8415 all_replicas_activated(false)
8416 {
8417 context< RecoveryMachine >().log_enter(state_name);
8418
8419 PG *pg = context< RecoveryMachine >().pg;
8420
8421 ceph_assert(!pg->backfill_reserving);
8422 ceph_assert(!pg->backfill_reserved);
8423 ceph_assert(pg->is_primary());
8424 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
8425 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8426 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8427 pg->get_osdmap_epoch(),
8428 *context< RecoveryMachine >().get_query_map(),
8429 context< RecoveryMachine >().get_info_map(),
8430 context< RecoveryMachine >().get_recovery_ctx());
8431
8432 // everyone has to commit/ack before we are truly active
8433 pg->blocked_by.clear();
8434 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
8435 p != pg->acting_recovery_backfill.end();
8436 ++p) {
8437 if (p->shard != pg->pg_whoami.shard) {
8438 pg->blocked_by.insert(p->shard);
8439 }
8440 }
8441 pg->publish_stats_to_osd();
8442 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8443 }
8444
8445 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
8446 {
8447 PG *pg = context< RecoveryMachine >().pg;
8448 if (pg->should_restart_peering(
8449 advmap.up_primary,
8450 advmap.acting_primary,
8451 advmap.newup,
8452 advmap.newacting,
8453 advmap.lastmap,
8454 advmap.osdmap)) {
8455 ldout(pg->cct, 10) << "Active advmap interval change, fast return" << dendl;
8456 return forward_event();
8457 }
8458 ldout(pg->cct, 10) << "Active advmap" << dendl;
8459 bool need_publish = false;
8460
8461 if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) {
8462 const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps();
8463 auto i = new_removed_snaps.find(pg->info.pgid.pool());
8464 if (i != new_removed_snaps.end()) {
8465 bool bad = false;
8466 for (auto j : i->second) {
8467 if (pg->snap_trimq.intersects(j.first, j.second)) {
8468 decltype(pg->snap_trimq) added, overlap;
8469 added.insert(j.first, j.second);
8470 overlap.intersection_of(pg->snap_trimq, added);
8471 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8472 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8473 << overlap << ", but this is the first mimic+ osdmap,"
8474 << " so it's expected" << dendl;
8475 } else {
8476 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8477 << overlap << dendl;
8478 bad = true;
8479 }
8480 pg->snap_trimq.union_of(added);
8481 } else {
8482 pg->snap_trimq.insert(j.first, j.second);
8483 }
8484 }
8485 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8486 // at upgrade, we report *all* previously removed snaps as removed in
8487 // the first mimic epoch. remove the ones we previously divined were
8488 // removed (and subsequently purged) from the trimq.
8489 lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps"
8490 << " from new removed_snaps" << dendl;
8491 pg->snap_trimq.subtract(pg->info.purged_snaps);
8492 }
8493 ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second
8494 << ", snap_trimq now " << pg->snap_trimq << dendl;
8495 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8496 pg->dirty_info = true;
8497 pg->dirty_big_info = true;
8498 }
8499
8500 const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps();
8501 auto j = new_purged_snaps.find(pg->info.pgid.pool());
8502 if (j != new_purged_snaps.end()) {
8503 bool bad = false;
8504 for (auto k : j->second) {
8505 if (!pg->info.purged_snaps.contains(k.first, k.second)) {
8506 decltype(pg->info.purged_snaps) rm, overlap;
8507 rm.insert(k.first, k.second);
8508 overlap.intersection_of(pg->info.purged_snaps, rm);
8509 lderr(pg->cct) << __func__ << " purged_snaps does not contain "
8510 << rm << ", only " << overlap << dendl;
8511 pg->info.purged_snaps.subtract(overlap);
8512 // This can currently happen in the normal (if unlikely) course of
8513 // events. Because adding snaps to purged_snaps does not increase
8514 // the pg version or add a pg log entry, we don't reliably propagate
8515 // purged_snaps additions to other OSDs.
8516 // One example:
8517 // - purge S
8518 // - primary and replicas update purged_snaps
8519 // - no object updates
8520 // - pg mapping changes, new primary on different node
8521 // - new primary pg version == eversion_t(), so info is not
8522 // propagated.
8523 //bad = true;
8524 } else {
8525 pg->info.purged_snaps.erase(k.first, k.second);
8526 }
8527 }
8528 ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second
8529 << ", now " << pg->info.purged_snaps << dendl;
8530 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8531 pg->dirty_info = true;
8532 pg->dirty_big_info = true;
8533 }
8534 if (pg->dirty_big_info) {
8535 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
8536 // purged snaps and (b) perhaps share more snaps that we have purged
8537 // but didn't fit in pg_stat_t.
8538 need_publish = true;
8539 pg->share_pg_info();
8540 }
8541 } else if (!pg->pool.newly_removed_snaps.empty()) {
8542 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
8543 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
8544 pg->dirty_info = true;
8545 pg->dirty_big_info = true;
8546 }
8547
8548 for (size_t i = 0; i < pg->want_acting.size(); i++) {
8549 int osd = pg->want_acting[i];
8550 if (!advmap.osdmap->is_up(osd)) {
8551 pg_shard_t osd_with_shard(osd, shard_id_t(i));
8552 ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
8553 }
8554 }
8555
8556 /* Check for changes in pool size (if the acting set changed as a result,
8557 * this does not matter) */
8558 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
8559 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
8560 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
8561 pg->state_clear(PG_STATE_UNDERSIZED);
8562 } else {
8563 pg->state_set(PG_STATE_UNDERSIZED);
8564 }
8565 // degraded changes will be detected by the call to publish_stats_to_osd()
8566 need_publish = true;
8567 }
8568
8569 // if we haven't reported our PG stats in a long time, do so now.
8570 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
8571 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
8572 << " epochs" << dendl;
8573 need_publish = true;
8574 }
8575
8576 if (need_publish)
8577 pg->publish_stats_to_osd();
8578
8579 return forward_event();
8580 }
8581
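// ActMap is delivered periodically while the pg is Active on the primary.
// The handler below re-probes peers for unfound objects, optionally checks
// the log for corruption, warns about objects that appear permanently lost,
// kicks snap trimming, and re-queues recovery/backfill when the osdmap's
// NOBACKFILL/NOREBALANCE flags allow it.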
8582 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
8583 {
8584 PG *pg = context< RecoveryMachine >().pg;
8585 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
8586 ceph_assert(pg->is_primary());
8587
8588 if (pg->have_unfound()) {
8589 // object may have become unfound
8590 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8591 }
8592
8593 if (pg->cct->_conf->osd_check_for_log_corruption)
8594 pg->check_log_for_corruption(pg->osd->store);
8595
8596 uint64_t unfound = pg->missing_loc.num_unfound();
8597 if (unfound > 0 &&
8598 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
8599 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
8600 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
8601 << " objects unfound and apparently lost, would automatically "
8602 << "mark these objects lost but this feature is not yet implemented "
8603 << "(osd_auto_mark_unfound_lost)";
8604 } else
8605 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
8606 << unfound << " objects unfound and apparently lost";
8607 }
8608
8609 if (pg->is_active()) {
8610 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
8611 pg->kick_snap_trim();
8612 }
8613
8614 if (pg->is_peered() &&
8615 !pg->is_clean() &&
8616 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
8617 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
8618 pg->queue_recovery();
8619 }
8620 return forward_event();
8621 }
8622
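// A replica's notify only matters the first time we hear from that osd:
// if we already have its info, or have already purged it as a stray, the
// message is ignored; otherwise we record its info and, if objects might
// be unfound there, query it for missing-object locations.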
8623 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
8624 {
8625 PG *pg = context< RecoveryMachine >().pg;
8626 ceph_assert(pg->is_primary());
8627 if (pg->peer_info.count(notevt.from)) {
8628 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8629 << ", already have info from that osd, ignoring"
8630 << dendl;
8631 } else if (pg->peer_purged.count(notevt.from)) {
8632 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8633 << ", already purged that peer, ignoring"
8634 << dendl;
8635 } else {
8636 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8637 << ", calling proc_replica_info and discover_all_missing"
8638 << dendl;
8639 pg->proc_replica_info(
8640 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
8641 if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) {
8642 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8643 }
8644 }
8645 return discard_event();
8646 }
8647
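// An MTrim from a replica reports that replica's last_complete_ondisk;
// record it per shard and recompute the minimum so the log can eventually
// be trimmed once every shard has persisted far enough.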
8648 boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim)
8649 {
8650 PG *pg = context< RecoveryMachine >().pg;
8651 ceph_assert(pg->is_primary());
8652
8653 // peer is informing us of their last_complete_ondisk
8654 ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
8655 pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to;
8656
8657 // trim log when the pg is recovered
8658 pg->calc_min_last_complete_ondisk();
8659 return discard_event();
8660 }
8661
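// An MInfoRec from an acting/backfill peer means that peer has activated
// and committed; once every member of acting_recovery_backfill has done so,
// all_activated_and_committed() finishes activation for the whole pg.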
8662 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
8663 {
8664 PG *pg = context< RecoveryMachine >().pg;
8665 ceph_assert(pg->is_primary());
8666
8667 ceph_assert(!pg->acting_recovery_backfill.empty());
8668 // don't update history (yet) if we are active and primary; the replica
8669 // may be telling us they have activated (and committed) but we can't
8670 // share that until _everyone_ does the same.
8671 if (pg->is_acting_recovery_backfill(infoevt.from) &&
8672 pg->peer_activated.count(infoevt.from) == 0) {
8673 ldout(pg->cct, 10) << " peer osd." << infoevt.from
8674 << " activated and committed" << dendl;
8675 pg->peer_activated.insert(infoevt.from);
8676 pg->blocked_by.erase(infoevt.from.shard);
8677 pg->publish_stats_to_osd();
8678 if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) {
8679 pg->all_activated_and_committed();
8680 }
8681 }
8682 return discard_event();
8683 }
8684
8685 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
8686 {
8687 PG *pg = context< RecoveryMachine >().pg;
8688 ldout(pg->cct, 10) << "searching osd." << logevt.from
8689 << " log for unfound items" << dendl;
8690 pg->proc_replica_log(
8691 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8692 bool got_missing = pg->search_for_missing(
8693 pg->peer_info[logevt.from],
8694 pg->peer_missing[logevt.from],
8695 logevt.from,
8696 context< RecoveryMachine >().get_recovery_ctx());
8697 // If there are missing objects AND we are "fully" active, then start recovery now
8698 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
8699 post_event(DoRecovery());
8700 }
8701 return discard_event();
8702 }
8703
8704 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
8705 {
8706 PG *pg = context< RecoveryMachine >().pg;
8707
8708 q.f->open_object_section("state");
8709 q.f->dump_string("name", state_name);
8710 q.f->dump_stream("enter_time") << enter_time;
8711
8712 {
8713 q.f->open_array_section("might_have_unfound");
8714 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
8715 p != pg->might_have_unfound.end();
8716 ++p) {
8717 q.f->open_object_section("osd");
8718 q.f->dump_stream("osd") << *p;
8719 if (pg->peer_missing.count(*p)) {
8720 q.f->dump_string("status", "already probed");
8721 } else if (pg->peer_missing_requested.count(*p)) {
8722 q.f->dump_string("status", "querying");
8723 } else if (!pg->get_osdmap()->is_up(p->osd)) {
8724 q.f->dump_string("status", "osd is down");
8725 } else {
8726 q.f->dump_string("status", "not queried");
8727 }
8728 q.f->close_section();
8729 }
8730 q.f->close_section();
8731 }
8732 {
8733 q.f->open_object_section("recovery_progress");
8734 pg->dump_recovery_info(q.f);
8735 q.f->close_section();
8736 }
8737
8738 {
8739 q.f->open_object_section("scrub");
8740 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
8741 q.f->dump_bool("scrubber.active", pg->scrubber.active);
8742 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
8743 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
8744 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
8745 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
8746 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
8747 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
8748 {
8749 q.f->open_array_section("scrubber.waiting_on_whom");
8750 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
8751 p != pg->scrubber.waiting_on_whom.end();
8752 ++p) {
8753 q.f->dump_stream("shard") << *p;
8754 }
8755 q.f->close_section();
8756 }
8757 q.f->close_section();
8758 }
8759
8760 q.f->close_section();
8761 return forward_event();
8762 }
8763
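// All replicas have activated.  Pick the externally visible state:
// PEERED+PREMERGE if a pg merge is pending (also telling the OSD whether we
// are ready as merge source/target), PEERED if the acting set is below
// min_size, otherwise ACTIVE.  Then persist last_epoch_started /
// last_interval_started, publish stats, and requeue ops that were waiting
// for the pg to peer.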
8764 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
8765 {
8766 PG *pg = context< RecoveryMachine >().pg;
8767 pg_t pgid = pg->info.pgid.pgid;
8768
8769 all_replicas_activated = true;
8770
8771 pg->state_clear(PG_STATE_ACTIVATING);
8772 pg->state_clear(PG_STATE_CREATING);
8773 pg->state_clear(PG_STATE_PREMERGE);
8774
8775 bool merge_target;
8776 if (pg->pool.info.is_pending_merge(pgid, &merge_target)) {
8777 pg->state_set(PG_STATE_PEERED);
8778 pg->state_set(PG_STATE_PREMERGE);
8779
8780 if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) {
8781 if (merge_target) {
8782 pg_t src = pgid;
8783 src.set_ps(pg->pool.info.get_pg_num_pending());
8784 assert(src.get_parent() == pgid);
8785 pg->osd->set_not_ready_to_merge_target(pgid, src);
8786 } else {
8787 pg->osd->set_not_ready_to_merge_source(pgid);
8788 }
8789 }
8790 } else if (pg->acting.size() < pg->pool.info.min_size) {
8791 pg->state_set(PG_STATE_PEERED);
8792 } else {
8793 pg->state_set(PG_STATE_ACTIVE);
8794 }
8795
8796 if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
8797 pg->osd->send_pg_created(pgid);
8798 }
8799
8800 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
8801 pg->info.history.last_interval_started = pg->info.last_interval_started;
8802 pg->dirty_info = true;
8803
8804 pg->share_pg_info();
8805 pg->publish_stats_to_osd();
8806
8807 pg->check_local();
8808
8809 // waiters
8810 if (pg->flushes_in_progress == 0) {
8811 pg->requeue_ops(pg->waiting_for_peered);
8812 } else if (!pg->waiting_for_peered.empty()) {
8813 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
8814 << pg->waiting_for_peered.size()
8815 << " items to waiting_for_flush"
8816 << dendl;
8817 ceph_assert(pg->waiting_for_flush.empty());
8818 pg->waiting_for_flush.swap(pg->waiting_for_peered);
8819 }
8820
8821 pg->on_activate();
8822
8823 return discard_event();
8824 }
8825
8826 void PG::RecoveryState::Active::exit()
8827 {
8828 context< RecoveryMachine >().log_exit(state_name, enter_time);
8829 PG *pg = context< RecoveryMachine >().pg;
8830 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8831
8832 pg->blocked_by.clear();
8833 pg->backfill_reserved = false;
8834 pg->backfill_reserving = false;
8835 pg->state_clear(PG_STATE_ACTIVATING);
8836 pg->state_clear(PG_STATE_DEGRADED);
8837 pg->state_clear(PG_STATE_UNDERSIZED);
8838 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
8839 pg->state_clear(PG_STATE_BACKFILL_WAIT);
8840 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8841 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8842 utime_t dur = ceph_clock_now() - enter_time;
8843 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
8844 pg->agent_stop();
8845 }
8846
8847 /*------ReplicaActive-----*/
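// ReplicaActive: this osd is in the acting set but not the primary.  The
// reactions below apply the primary's Activate, info, log and trim
// messages, answer queries, and send a notify on each ActMap.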
8848 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
8849 : my_base(ctx),
8850 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
8851 {
8852 context< RecoveryMachine >().log_enter(state_name);
8853
8854 PG *pg = context< RecoveryMachine >().pg;
8855 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8856 }
8857
8858
8859 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8860 const Activate& actevt) {
8861 PG *pg = context< RecoveryMachine >().pg;
8862 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
8863 map<int, map<spg_t, pg_query_t> > query_map;
8864 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8865 actevt.activation_epoch,
8866 query_map, NULL, NULL);
8867 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8868 return discard_event();
8869 }
8870
8871 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
8872 {
8873 PG *pg = context< RecoveryMachine >().pg;
8874 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
8875 infoevt.info);
8876 return discard_event();
8877 }
8878
8879 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
8880 {
8881 PG *pg = context< RecoveryMachine >().pg;
8882 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
8883 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8884 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
8885 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8886
8887 return discard_event();
8888 }
8889
8890 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim)
8891 {
8892 PG *pg = context< RecoveryMachine >().pg;
8893 // primary is instructing us to trim
8894 pg->pg_log.trim(trim.trim_to, pg->info);
8895 pg->dirty_info = true;
8896 return discard_event();
8897 }
8898
8899 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
8900 {
8901 PG *pg = context< RecoveryMachine >().pg;
8902 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8903 context< RecoveryMachine >().send_notify(
8904 pg->get_primary(),
8905 pg_notify_t(
8906 pg->get_primary().shard, pg->pg_whoami.shard,
8907 pg->get_osdmap_epoch(),
8908 pg->get_osdmap_epoch(),
8909 pg->info),
8910 pg->past_intervals);
8911 }
8912 pg->take_waiters();
8913 return discard_event();
8914 }
8915
8916 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8917 const MQuery& query)
8918 {
8919 PG *pg = context< RecoveryMachine >().pg;
8920 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8921 return discard_event();
8922 }
8923
8924 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
8925 {
8926 q.f->open_object_section("state");
8927 q.f->dump_string("name", state_name);
8928 q.f->dump_stream("enter_time") << enter_time;
8929 q.f->close_section();
8930 return forward_event();
8931 }
8932
8933 void PG::RecoveryState::ReplicaActive::exit()
8934 {
8935 context< RecoveryMachine >().log_exit(state_name, enter_time);
8936 PG *pg = context< RecoveryMachine >().pg;
8937 pg->clear_reserved_num_bytes();
8938 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8939 utime_t dur = ceph_clock_now() - enter_time;
8940 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
8941 }
8942
8943 /*-------Stray---*/
8944 PG::RecoveryState::Stray::Stray(my_context ctx)
8945 : my_base(ctx),
8946 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
8947 {
8948 context< RecoveryMachine >().log_enter(state_name);
8949
8950 PG *pg = context< RecoveryMachine >().pg;
8951 ceph_assert(!pg->is_peered());
8952 ceph_assert(!pg->is_peering());
8953 ceph_assert(!pg->is_primary());
8954
8955 if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
8956 ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
8957 post_event(DeleteStart());
8958 } else {
8959 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8960 }
8961 }
8962
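// A stray that receives info+log from the primary either restarts backfill
// (an empty last_backfill in the incoming info means this osd is being
// backfilled from scratch, so we adopt the primary's info and claim its
// log) or merges the primary's log into its own, then activates as a
// replica.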
8963 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
8964 {
8965 PG *pg = context< RecoveryMachine >().pg;
8966 MOSDPGLog *msg = logevt.msg.get();
8967 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
8968
8969 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8970 if (msg->info.last_backfill == hobject_t()) {
8971 // restart backfill
8972 pg->unreg_next_scrub();
8973 pg->info = msg->info;
8974 pg->reg_next_scrub();
8975 pg->dirty_info = true;
8976 pg->dirty_big_info = true; // maybe.
8977
8978 PGLogEntryHandler rollbacker{pg, t};
8979 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
8980
8981 pg->pg_log.reset_backfill();
8982 } else {
8983 pg->merge_log(*t, msg->info, msg->log, logevt.from);
8984 }
8985
8986 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8987
8988 post_event(Activate(logevt.msg->info.last_epoch_started));
8989 return transit<ReplicaActive>();
8990 }
8991
8992 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
8993 {
8994 PG *pg = context< RecoveryMachine >().pg;
8995 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
8996
8997 if (pg->info.last_update > infoevt.info.last_update) {
8998 // rewind divergent log entries
8999 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9000 pg->rewind_divergent_log(*t, infoevt.info.last_update);
9001 pg->info.stats = infoevt.info.stats;
9002 pg->info.hit_set = infoevt.info.hit_set;
9003 }
9004
9005 ceph_assert(infoevt.info.last_update == pg->info.last_update);
9006 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9007
9008 post_event(Activate(infoevt.info.last_epoch_started));
9009 return transit<ReplicaActive>();
9010 }
9011
9012 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
9013 {
9014 PG *pg = context< RecoveryMachine >().pg;
9015 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
9016 return discard_event();
9017 }
9018
9019 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
9020 {
9021 PG *pg = context< RecoveryMachine >().pg;
9022 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
9023 context< RecoveryMachine >().send_notify(
9024 pg->get_primary(),
9025 pg_notify_t(
9026 pg->get_primary().shard, pg->pg_whoami.shard,
9027 pg->get_osdmap_epoch(),
9028 pg->get_osdmap_epoch(),
9029 pg->info),
9030 pg->past_intervals);
9031 }
9032 pg->take_waiters();
9033 return discard_event();
9034 }
9035
9036 void PG::RecoveryState::Stray::exit()
9037 {
9038 context< RecoveryMachine >().log_exit(state_name, enter_time);
9039 PG *pg = context< RecoveryMachine >().pg;
9040 utime_t dur = ceph_clock_now() - enter_time;
9041 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
9042 }
9043
9044
9045 /*--------ToDelete----------*/
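// Deletion flow: ToDelete bumps the l_osd_pg_removing counter, then
// WaitDeleteReserved obtains a slot from the local reserver at the current
// delete priority, and Deleting performs the actual removal via
// _delete_some(), re-driven by DeleteSome events queued on each commit
// (see C_DeleteMore).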
9046 PG::RecoveryState::ToDelete::ToDelete(my_context ctx)
9047 : my_base(ctx),
9048 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete")
9049 {
9050 context< RecoveryMachine >().log_enter(state_name);
9051 PG *pg = context< RecoveryMachine >().pg;
9052 pg->osd->logger->inc(l_osd_pg_removing);
9053 }
9054
9055 void PG::RecoveryState::ToDelete::exit()
9056 {
9057 context< RecoveryMachine >().log_exit(state_name, enter_time);
9058 PG *pg = context< RecoveryMachine >().pg;
9059 // note: on a successful removal, this path doesn't execute. see
9060 // _delete_some().
9061 pg->osd->logger->dec(l_osd_pg_removing);
9062 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9063 }
9064
9065 /*----WaitDeleteReserved----*/
9066 PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
9067 : my_base(ctx),
9068 NamedState(context< RecoveryMachine >().pg,
9069 "Started/ToDelete/WaitDeleteReseved")
9070 {
9071 context< RecoveryMachine >().log_enter(state_name);
9072 PG *pg = context< RecoveryMachine >().pg;
9073 context<ToDelete>().priority = pg->get_delete_priority();
9074 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9075 pg->osd->local_reserver.request_reservation(
9076 pg->info.pgid,
9077 new QueuePeeringEvt<DeleteReserved>(
9078 pg, pg->get_osdmap_epoch(),
9079 DeleteReserved()),
9080 context<ToDelete>().priority,
9081 new QueuePeeringEvt<DeleteInterrupted>(
9082 pg, pg->get_osdmap_epoch(),
9083 DeleteInterrupted()));
9084 }
9085
9086 boost::statechart::result PG::RecoveryState::ToDelete::react(
9087 const ActMap& evt)
9088 {
9089 PG *pg = context< RecoveryMachine >().pg;
9090 if (pg->get_delete_priority() != priority) {
9091 ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
9092 << dendl;
9093 return transit<ToDelete>();
9094 }
9095 return discard_event();
9096 }
9097
9098 void PG::RecoveryState::WaitDeleteReserved::exit()
9099 {
9100 context< RecoveryMachine >().log_exit(state_name, enter_time);
9101 }
9102
9103 /*----Deleting-----*/
9104 PG::RecoveryState::Deleting::Deleting(my_context ctx)
9105 : my_base(ctx),
9106 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting")
9107 {
9108 context< RecoveryMachine >().log_enter(state_name);
9109 PG *pg = context< RecoveryMachine >().pg;
9110 pg->deleting = true;
9111 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9112 pg->on_removal(t);
9113 t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch()));
9114 }
9115
9116 boost::statechart::result PG::RecoveryState::Deleting::react(
9117 const DeleteSome& evt)
9118 {
9119 PG *pg = context< RecoveryMachine >().pg;
9120 pg->_delete_some(context<RecoveryMachine>().get_cur_transaction());
9121 return discard_event();
9122 }
9123
9124 void PG::RecoveryState::Deleting::exit()
9125 {
9126 context< RecoveryMachine >().log_exit(state_name, enter_time);
9127 PG *pg = context< RecoveryMachine >().pg;
9128 pg->deleting = false;
9129 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9130 }
9131
9132 /*--------GetInfo---------*/
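// GetInfo: first peering stage on the primary.  Build the prior set, ask
// every probe target for its pg_info_t, and track outstanding requests in
// peer_info_requested (mirrored into blocked_by).  Once all infos are in
// and no prior osds block us, GotInfo is posted; if the prior set says the
// pg is down, IsDown is posted instead.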
9133 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
9134 : my_base(ctx),
9135 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
9136 {
9137 context< RecoveryMachine >().log_enter(state_name);
9138
9139 PG *pg = context< RecoveryMachine >().pg;
9140 pg->check_past_interval_bounds();
9141 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9142
9143 ceph_assert(pg->blocked_by.empty());
9144
9145 prior_set = pg->build_prior();
9146
9147 pg->reset_min_peer_features();
9148 get_infos();
9149 if (prior_set.pg_down) {
9150 post_event(IsDown());
9151 } else if (peer_info_requested.empty()) {
9152 post_event(GotInfo());
9153 }
9154 }
9155
9156 void PG::RecoveryState::GetInfo::get_infos()
9157 {
9158 PG *pg = context< RecoveryMachine >().pg;
9159 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9160
9161 pg->blocked_by.clear();
9162 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
9163 it != prior_set.probe.end();
9164 ++it) {
9165 pg_shard_t peer = *it;
9166 if (peer == pg->pg_whoami) {
9167 continue;
9168 }
9169 if (pg->peer_info.count(peer)) {
9170 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
9171 continue;
9172 }
9173 if (peer_info_requested.count(peer)) {
9174 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
9175 pg->blocked_by.insert(peer.osd);
9176 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
9177 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
9178 } else {
9179 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
9180 context< RecoveryMachine >().send_query(
9181 peer, pg_query_t(pg_query_t::INFO,
9182 it->shard, pg->pg_whoami.shard,
9183 pg->info.history,
9184 pg->get_osdmap_epoch()));
9185 peer_info_requested.insert(peer);
9186 pg->blocked_by.insert(peer.osd);
9187 }
9188 }
9189
9190 pg->publish_stats_to_osd();
9191 }
9192
9193 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
9194 {
9195 PG *pg = context< RecoveryMachine >().pg;
9196
9197 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
9198 if (p != peer_info_requested.end()) {
9199 peer_info_requested.erase(p);
9200 pg->blocked_by.erase(infoevt.from.osd);
9201 }
9202
9203 epoch_t old_start = pg->info.history.last_epoch_started;
9204 if (pg->proc_replica_info(
9205 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
9206 // we got something new ...
9207 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9208 if (old_start < pg->info.history.last_epoch_started) {
9209 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
9210 prior_set = pg->build_prior();
9211
9212 // filter out any osds that got dropped from the probe set from
9213 // peer_info_requested. this is less expensive than restarting
9214 // peering (which would re-probe everyone).
9215 set<pg_shard_t>::iterator p = peer_info_requested.begin();
9216 while (p != peer_info_requested.end()) {
9217 if (prior_set.probe.count(*p) == 0) {
9218 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
9219 peer_info_requested.erase(p++);
9220 } else {
9221 ++p;
9222 }
9223 }
9224 get_infos();
9225 }
9226 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
9227 << hex << infoevt.features << dec << dendl;
9228 pg->apply_peer_features(infoevt.features);
9229
9230 // are we done getting everything?
9231 if (peer_info_requested.empty() && !prior_set.pg_down) {
9232 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
9233 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
9234 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
9235 post_event(GotInfo());
9236 }
9237 }
9238 return discard_event();
9239 }
9240
9241 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
9242 {
9243 PG *pg = context< RecoveryMachine >().pg;
9244 q.f->open_object_section("state");
9245 q.f->dump_string("name", state_name);
9246 q.f->dump_stream("enter_time") << enter_time;
9247
9248 q.f->open_array_section("requested_info_from");
9249 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
9250 p != peer_info_requested.end();
9251 ++p) {
9252 q.f->open_object_section("osd");
9253 q.f->dump_stream("osd") << *p;
9254 if (pg->peer_info.count(*p)) {
9255 q.f->open_object_section("got_info");
9256 pg->peer_info[*p].dump(q.f);
9257 q.f->close_section();
9258 }
9259 q.f->close_section();
9260 }
9261 q.f->close_section();
9262
9263 q.f->close_section();
9264 return forward_event();
9265 }
9266
9267 void PG::RecoveryState::GetInfo::exit()
9268 {
9269 context< RecoveryMachine >().log_exit(state_name, enter_time);
9270 PG *pg = context< RecoveryMachine >().pg;
9271 utime_t dur = ceph_clock_now() - enter_time;
9272 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
9273 pg->blocked_by.clear();
9274 }
9275
9276 /*------GetLog------------*/
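// GetLog: choose_acting() selects auth_log_shard as the authoritative log.
// If that is us we already have it; otherwise request enough log from it
// (roughly, back to the oldest peer last_update still covered by the
// authoritative log) and, when it arrives, process it with
// proc_master_log() before transiting to GetMissing.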
9277 PG::RecoveryState::GetLog::GetLog(my_context ctx)
9278 : my_base(ctx),
9279 NamedState(
9280 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
9281 msg(0)
9282 {
9283 context< RecoveryMachine >().log_enter(state_name);
9284
9285 PG *pg = context< RecoveryMachine >().pg;
9286
9287 // adjust acting?
9288 if (!pg->choose_acting(auth_log_shard, false,
9289 &context< Peering >().history_les_bound)) {
9290 if (!pg->want_acting.empty()) {
9291 post_event(NeedActingChange());
9292 } else {
9293 post_event(IsIncomplete());
9294 }
9295 return;
9296 }
9297
9298 // am i the best?
9299 if (auth_log_shard == pg->pg_whoami) {
9300 post_event(GotLog());
9301 return;
9302 }
9303
9304 const pg_info_t& best = pg->peer_info[auth_log_shard];
9305
9306 // am i broken?
9307 if (pg->info.last_update < best.log_tail) {
9308 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
9309 post_event(IsIncomplete());
9310 return;
9311 }
9312
9313 // how much log to request?
9314 eversion_t request_log_from = pg->info.last_update;
9315 ceph_assert(!pg->acting_recovery_backfill.empty());
9316 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
9317 p != pg->acting_recovery_backfill.end();
9318 ++p) {
9319 if (*p == pg->pg_whoami) continue;
9320 pg_info_t& ri = pg->peer_info[*p];
9321 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
9322 ri.last_update < request_log_from)
9323 request_log_from = ri.last_update;
9324 }
9325
9326 // how much?
9327 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
9328 context<RecoveryMachine>().send_query(
9329 auth_log_shard,
9330 pg_query_t(
9331 pg_query_t::LOG,
9332 auth_log_shard.shard, pg->pg_whoami.shard,
9333 request_log_from, pg->info.history,
9334 pg->get_osdmap_epoch()));
9335
9336 ceph_assert(pg->blocked_by.empty());
9337 pg->blocked_by.insert(auth_log_shard.osd);
9338 pg->publish_stats_to_osd();
9339 }
9340
9341 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
9342 {
9343 PG *pg = context< RecoveryMachine >().pg;
9344 // make sure our log source didn't go down. we need to check
9345 // explicitly because it may not be part of the prior set, which
9346 // means the Peering state check won't catch it going down.
9347 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
9348 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
9349 << auth_log_shard.osd << " went down" << dendl;
9350 post_event(advmap);
9351 return transit< Reset >();
9352 }
9353
9354 // let the Peering state do its checks.
9355 return forward_event();
9356 }
9357
9358 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
9359 {
9360 PG *pg = context< RecoveryMachine >().pg;
9361 ceph_assert(!msg);
9362 if (logevt.from != auth_log_shard) {
9363 ldout(pg->cct, 10) << "GetLog: discarding log from "
9364 << "non-auth_log_shard osd." << logevt.from << dendl;
9365 return discard_event();
9366 }
9367 ldout(pg->cct, 10) << "GetLog: received master log from osd"
9368 << logevt.from << dendl;
9369 msg = logevt.msg;
9370 post_event(GotLog());
9371 return discard_event();
9372 }
9373
9374 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
9375 {
9376 PG *pg = context< RecoveryMachine >().pg;
9377 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
9378 if (msg) {
9379 ldout(pg->cct, 10) << "processing master log" << dendl;
9380 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
9381 msg->info, msg->log, msg->missing,
9382 auth_log_shard);
9383 }
9384 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9385 return transit< GetMissing >();
9386 }
9387
9388 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
9389 {
9390 q.f->open_object_section("state");
9391 q.f->dump_string("name", state_name);
9392 q.f->dump_stream("enter_time") << enter_time;
9393 q.f->dump_stream("auth_log_shard") << auth_log_shard;
9394 q.f->close_section();
9395 return forward_event();
9396 }
9397
9398 void PG::RecoveryState::GetLog::exit()
9399 {
9400 context< RecoveryMachine >().log_exit(state_name, enter_time);
9401 PG *pg = context< RecoveryMachine >().pg;
9402 utime_t dur = ceph_clock_now() - enter_time;
9403 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
9404 pg->blocked_by.clear();
9405 }
9406
9407 /*------WaitActingChange--------*/
9408 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
9409 : my_base(ctx),
9410 NamedState(context< RecoveryMachine >().pg, "Started/Primary/WaitActingChange")
9411 {
9412 context< RecoveryMachine >().log_enter(state_name);
9413 }
9414
9415 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
9416 {
9417 PG *pg = context< RecoveryMachine >().pg;
9418 OSDMapRef osdmap = advmap.osdmap;
9419
9420 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
9421 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
9422 if (!osdmap->is_up(*p)) {
9423 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
9424 post_event(advmap);
9425 return transit< Reset >();
9426 }
9427 }
9428 return forward_event();
9429 }
9430
9431 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
9432 {
9433 PG *pg = context< RecoveryMachine >().pg;
9434 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
9435 return discard_event();
9436 }
9437
9438 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
9439 {
9440 PG *pg = context< RecoveryMachine >().pg;
9441 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
9442 return discard_event();
9443 }
9444
9445 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
9446 {
9447 PG *pg = context< RecoveryMachine >().pg;
9448 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
9449 return discard_event();
9450 }
9451
9452 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
9453 {
9454 q.f->open_object_section("state");
9455 q.f->dump_string("name", state_name);
9456 q.f->dump_stream("enter_time") << enter_time;
9457 q.f->dump_string("comment", "waiting for pg acting set to change");
9458 q.f->close_section();
9459 return forward_event();
9460 }
9461
9462 void PG::RecoveryState::WaitActingChange::exit()
9463 {
9464 context< RecoveryMachine >().log_exit(state_name, enter_time);
9465 PG *pg = context< RecoveryMachine >().pg;
9466 utime_t dur = ceph_clock_now() - enter_time;
9467 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
9468 }
9469
9470 /*------Down--------*/
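// Down: peering cannot proceed because osds from prior intervals that may
// hold needed data are down.  Record them in blocked_by and wait; a notify
// that moves last_epoch_started forward sends us back to GetInfo.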
9471 PG::RecoveryState::Down::Down(my_context ctx)
9472 : my_base(ctx),
9473 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
9474 {
9475 context< RecoveryMachine >().log_enter(state_name);
9476 PG *pg = context< RecoveryMachine >().pg;
9477
9478 pg->state_clear(PG_STATE_PEERING);
9479 pg->state_set(PG_STATE_DOWN);
9480
9481 auto &prior_set = context< Peering >().prior_set;
9482 ceph_assert(pg->blocked_by.empty());
9483 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9484 pg->publish_stats_to_osd();
9485 }
9486
9487 void PG::RecoveryState::Down::exit()
9488 {
9489 context< RecoveryMachine >().log_exit(state_name, enter_time);
9490 PG *pg = context< RecoveryMachine >().pg;
9491
9492 pg->state_clear(PG_STATE_DOWN);
9493 utime_t dur = ceph_clock_now() - enter_time;
9494 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
9495
9496 pg->blocked_by.clear();
9497 }
9498
9499 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
9500 {
9501 q.f->open_object_section("state");
9502 q.f->dump_string("name", state_name);
9503 q.f->dump_stream("enter_time") << enter_time;
9504 q.f->dump_string("comment",
9505 "not enough up instances of this PG to go active");
9506 q.f->close_section();
9507 return forward_event();
9508 }
9509
9510 boost::statechart::result PG::RecoveryState::Down::react(const MNotifyRec& infoevt)
9511 {
9512 PG *pg = context< RecoveryMachine >().pg;
9513
9514 ceph_assert(pg->is_primary());
9515 epoch_t old_start = pg->info.history.last_epoch_started;
9516 if (!pg->peer_info.count(infoevt.from) &&
9517 pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
9518 pg->update_history(infoevt.notify.info.history);
9519 }
9520 // if we got something new to make pg escape down state
9521 if (pg->info.history.last_epoch_started > old_start) {
9522 ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
9523 pg->state_clear(PG_STATE_DOWN);
9524 pg->state_set(PG_STATE_PEERING);
9525 return transit< GetInfo >();
9526 }
9527
9528 return discard_event();
9529 }
9530
9531
9532 /*------Incomplete--------*/
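// Incomplete: GetLog decided we cannot find a usable authoritative log
// (IsIncomplete).  We can leave this state if the pool's min_size drops
// (or the pool goes away), or if a notify brings genuinely new replica
// info, in which case we retry GetLog.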
9533 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
9534 : my_base(ctx),
9535 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
9536 {
9537 context< RecoveryMachine >().log_enter(state_name);
9538 PG *pg = context< RecoveryMachine >().pg;
9539
9540 pg->state_clear(PG_STATE_PEERING);
9541 pg->state_set(PG_STATE_INCOMPLETE);
9542
9543 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9544 ceph_assert(pg->blocked_by.empty());
9545 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9546 pg->publish_stats_to_osd();
9547 }
9548
9549 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
9550 PG *pg = context< RecoveryMachine >().pg;
9551 int64_t poolnum = pg->info.pgid.pool();
9552
9553 // Reset if min_size became smaller than its previous value; the pg might now be able to go active
9554 if (!advmap.osdmap->have_pg_pool(poolnum) ||
9555 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
9556 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
9557 post_event(advmap);
9558 return transit< Reset >();
9559 }
9560
9561 return forward_event();
9562 }
9563
9564 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
9565 PG *pg = context< RecoveryMachine >().pg;
9566 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
9567 if (pg->proc_replica_info(
9568 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
9569 // We got something new, try again!
9570 return transit< GetLog >();
9571 } else {
9572 return discard_event();
9573 }
9574 }
9575
9576 boost::statechart::result PG::RecoveryState::Incomplete::react(
9577 const QueryState& q)
9578 {
9579 q.f->open_object_section("state");
9580 q.f->dump_string("name", state_name);
9581 q.f->dump_stream("enter_time") << enter_time;
9582 q.f->dump_string("comment", "not enough complete instances of this PG");
9583 q.f->close_section();
9584 return forward_event();
9585 }
9586
9587 void PG::RecoveryState::Incomplete::exit()
9588 {
9589 context< RecoveryMachine >().log_exit(state_name, enter_time);
9590 PG *pg = context< RecoveryMachine >().pg;
9591
9592 pg->state_clear(PG_STATE_INCOMPLETE);
9593 utime_t dur = ceph_clock_now() - enter_time;
9594 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
9595
9596 pg->blocked_by.clear();
9597 }
9598
9599 /*------GetMissing--------*/
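// GetMissing: for each acting/backfill peer whose log may diverge from the
// authoritative one, request log+missing back to its last_epoch_started
// (or the full log if its tail is too recent).  Peers that are empty, will
// be fully backfilled, or are provably up to date get an empty missing set
// locally.  With nothing outstanding we either wait for up_thru or post
// Activate.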
9600 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
9601 : my_base(ctx),
9602 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
9603 {
9604 context< RecoveryMachine >().log_enter(state_name);
9605
9606 PG *pg = context< RecoveryMachine >().pg;
9607 ceph_assert(!pg->acting_recovery_backfill.empty());
9608 eversion_t since;
9609 for (set<pg_shard_t>::iterator i = pg->acting_recovery_backfill.begin();
9610 i != pg->acting_recovery_backfill.end();
9611 ++i) {
9612 if (*i == pg->get_primary()) continue;
9613 const pg_info_t& pi = pg->peer_info[*i];
9614 // reset this to make sure the pg_missing_t is initialized and
9615 // has the correct semantics even if we don't need to get a
9616 // missing set from a shard. This way later additions due to
9617 // lost+unfound delete work properly.
9618 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
9619
9620 if (pi.is_empty())
9621 continue; // no pg data, nothing divergent
9622
9623 if (pi.last_update < pg->pg_log.get_tail()) {
9624 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
9625 pg->peer_missing[*i].clear();
9626 continue;
9627 }
9628 if (pi.last_backfill == hobject_t()) {
9629 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
9630 pg->peer_missing[*i].clear();
9631 continue;
9632 }
9633
9634 if (pi.last_update == pi.last_complete && // peer has no missing
9635 pi.last_update == pg->info.last_update) { // peer is up to date
9636 // replica has no missing and an identical log to ours.  no need to
9637 // pull anything.
9638 // FIXME: we can do better here. if last_update==last_complete we
9639 // can infer the rest!
9640 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
9641 pg->peer_missing[*i].clear();
9642 continue;
9643 }
9644
9645 // We pull the log from the peer's last_epoch_started to ensure we
9646 // get enough log to detect divergent updates.
9647 since.epoch = pi.last_epoch_started;
9648 ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
9649 if (pi.log_tail <= since) {
9650 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
9651 context< RecoveryMachine >().send_query(
9652 *i,
9653 pg_query_t(
9654 pg_query_t::LOG,
9655 i->shard, pg->pg_whoami.shard,
9656 since, pg->info.history,
9657 pg->get_osdmap_epoch()));
9658 } else {
9659 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
9660 << " (want since " << since << " < log.tail "
9661 << pi.log_tail << ")" << dendl;
9662 context< RecoveryMachine >().send_query(
9663 *i, pg_query_t(
9664 pg_query_t::FULLLOG,
9665 i->shard, pg->pg_whoami.shard,
9666 pg->info.history, pg->get_osdmap_epoch()));
9667 }
9668 peer_missing_requested.insert(*i);
9669 pg->blocked_by.insert(i->osd);
9670 }
9671
9672 if (peer_missing_requested.empty()) {
9673 if (pg->need_up_thru) {
9674 ldout(pg->cct, 10) << " still need up_thru update before going active"
9675 << dendl;
9676 post_event(NeedUpThru());
9677 return;
9678 }
9679
9680 // all good!
9681 post_event(Activate(pg->get_osdmap_epoch()));
9682 } else {
9683 pg->publish_stats_to_osd();
9684 }
9685 }
9686
9687 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
9688 {
9689 PG *pg = context< RecoveryMachine >().pg;
9690
9691 peer_missing_requested.erase(logevt.from);
9692 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
9693
9694 if (peer_missing_requested.empty()) {
9695 if (pg->need_up_thru) {
9696 ldout(pg->cct, 10) << " still need up_thru update before going active"
9697 << dendl;
9698 post_event(NeedUpThru());
9699 } else {
9700 ldout(pg->cct, 10) << "Got last missing, don't need missing "
9701 << "posting Activate" << dendl;
9702 post_event(Activate(pg->get_osdmap_epoch()));
9703 }
9704 }
9705 return discard_event();
9706 }
9707
9708 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
9709 {
9710 PG *pg = context< RecoveryMachine >().pg;
9711 q.f->open_object_section("state");
9712 q.f->dump_string("name", state_name);
9713 q.f->dump_stream("enter_time") << enter_time;
9714
9715 q.f->open_array_section("peer_missing_requested");
9716 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
9717 p != peer_missing_requested.end();
9718 ++p) {
9719 q.f->open_object_section("osd");
9720 q.f->dump_stream("osd") << *p;
9721 if (pg->peer_missing.count(*p)) {
9722 q.f->open_object_section("got_missing");
9723 pg->peer_missing[*p].dump(q.f);
9724 q.f->close_section();
9725 }
9726 q.f->close_section();
9727 }
9728 q.f->close_section();
9729
9730 q.f->close_section();
9731 return forward_event();
9732 }
9733
9734 void PG::RecoveryState::GetMissing::exit()
9735 {
9736 context< RecoveryMachine >().log_exit(state_name, enter_time);
9737 PG *pg = context< RecoveryMachine >().pg;
9738 utime_t dur = ceph_clock_now() - enter_time;
9739 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
9740 pg->blocked_by.clear();
9741 }
9742
9743 /*------WaitUpThru--------*/
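// WaitUpThru: peering is otherwise complete, but activation is deferred
// until the osdmap records a new up_thru for this osd (need_up_thru);
// each ActMap re-checks the flag and posts Activate once it clears, while
// any late MLogRec still updates peer_missing/peer_info.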
9744 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
9745 : my_base(ctx),
9746 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
9747 {
9748 context< RecoveryMachine >().log_enter(state_name);
9749 }
9750
9751 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
9752 {
9753 PG *pg = context< RecoveryMachine >().pg;
9754 if (!pg->need_up_thru) {
9755 post_event(Activate(pg->get_osdmap_epoch()));
9756 }
9757 return forward_event();
9758 }
9759
9760 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
9761 {
9762 PG *pg = context< RecoveryMachine >().pg;
9763 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
9764 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
9765 pg->peer_info[logevt.from] = logevt.msg->info;
9766 return discard_event();
9767 }
9768
9769 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
9770 {
9771 q.f->open_object_section("state");
9772 q.f->dump_string("name", state_name);
9773 q.f->dump_stream("enter_time") << enter_time;
9774 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
9775 q.f->close_section();
9776 return forward_event();
9777 }
9778
9779 void PG::RecoveryState::WaitUpThru::exit()
9780 {
9781 context< RecoveryMachine >().log_exit(state_name, enter_time);
9782 PG *pg = context< RecoveryMachine >().pg;
9783 utime_t dur = ceph_clock_now() - enter_time;
9784 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
9785 }
9786
9787 /*----RecoveryState::RecoveryMachine Methods-----*/
9788 #undef dout_prefix
9789 #define dout_prefix pg->gen_prefix(*_dout)
9790
9791 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
9792 {
9793 PG *pg = context< RecoveryMachine >().pg;
9794 ldout(pg->cct, 5) << "enter " << state_name << dendl;
9795 pg->osd->pg_recovery_stats.log_enter(state_name);
9796 }
9797
9798 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
9799 {
9800 utime_t dur = ceph_clock_now() - enter_time;
9801 PG *pg = context< RecoveryMachine >().pg;
9802 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
9803 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
9804 event_count, event_time);
9805 event_count = 0;
9806 event_time = utime_t();
9807 }
9808
9809
9810 /*---------------------------------------------------*/
9811 #undef dout_prefix
9812 #define dout_prefix ((debug_pg ? debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
9813
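// start_handle()/end_handle() bracket every event delivered to the state
// machine: they bind the caller's RecoveryCtx (or, while outgoing messages
// are blocked, a copy that writes into messages_pending_flush) to rctx and
// account per-event time.  begin/end_block_outgoing buffer outgoing
// messages so they can be released together after a flush.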
9814 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
9815 ceph_assert(!rctx);
9816 ceph_assert(!orig_ctx);
9817 orig_ctx = new_ctx;
9818 if (new_ctx) {
9819 if (messages_pending_flush) {
9820 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
9821 } else {
9822 rctx = *new_ctx;
9823 }
9824 rctx->start_time = ceph_clock_now();
9825 }
9826 }
9827
9828 void PG::RecoveryState::begin_block_outgoing() {
9829 ceph_assert(!messages_pending_flush);
9830 ceph_assert(orig_ctx);
9831 ceph_assert(rctx);
9832 messages_pending_flush = BufferedRecoveryMessages();
9833 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
9834 }
9835
9836 void PG::RecoveryState::clear_blocked_outgoing() {
9837 ceph_assert(orig_ctx);
9838 ceph_assert(rctx);
9839 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9840 }
9841
9842 void PG::RecoveryState::end_block_outgoing() {
9843 ceph_assert(messages_pending_flush);
9844 ceph_assert(orig_ctx);
9845 ceph_assert(rctx);
9846
9847 rctx = RecoveryCtx(*orig_ctx);
9848 rctx->accept_buffered_messages(*messages_pending_flush);
9849 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9850 }
9851
9852 void PG::RecoveryState::end_handle() {
9853 if (rctx) {
9854 utime_t dur = ceph_clock_now() - rctx->start_time;
9855 machine.event_time += dur;
9856 }
9857
9858 machine.event_count++;
9859 rctx = boost::optional<RecoveryCtx>();
9860 orig_ctx = NULL;
9861 }
9862
9863 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
9864 {
9865 out << "BackfillInfo(" << bi.begin << "-" << bi.end
9866 << " " << bi.objects.size() << " objects";
9867 if (!bi.objects.empty())
9868 out << " " << bi.objects;
9869 out << ")";
9870 return out;
9871 }
9872
9873 void PG::dump_pgstate_history(Formatter *f)
9874 {
9875 lock();
9876 pgstate_history.dump(f);
9877 unlock();
9878 }
9879
9880 void PG::dump_missing(Formatter *f)
9881 {
9882 for (auto& i : pg_log.get_missing().get_items()) {
9883 f->open_object_section("object");
9884 f->dump_object("oid", i.first);
9885 f->dump_object("missing_info", i.second);
9886 if (missing_loc.needs_recovery(i.first)) {
9887 f->dump_bool("unfound", missing_loc.is_unfound(i.first));
9888 f->open_array_section("locations");
9889 for (auto l : missing_loc.get_locations(i.first)) {
9890 f->dump_object("shard", l);
9891 }
9892 f->close_section();
9893 }
9894 f->close_section();
9895 }
9896 }
9897
9898 void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
9899 {
9900 pg_stats_publish_lock.Lock();
9901 if (pg_stats_publish_valid) {
9902 f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
9903 }
9904 pg_stats_publish_lock.Unlock();
9905 }
9906
9907 void PG::with_heartbeat_peers(std::function<void(int)> f)
9908 {
9909 heartbeat_peer_lock.Lock();
9910 for (auto p : heartbeat_peers) {
9911 f(p);
9912 }
9913 for (auto p : probe_targets) {
9914 f(p);
9915 }
9916 heartbeat_peer_lock.Unlock();
9917 }