1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "PG.h"
16// #include "msg/Messenger.h"
17#include "messages/MOSDRepScrub.h"
18// #include "common/cmdparse.h"
19// #include "common/ceph_context.h"
20
21#include "common/errno.h"
22#include "common/config.h"
23#include "OSD.h"
24#include "OpRequest.h"
25#include "ScrubStore.h"
26#include "Session.h"
27
28#include "common/Timer.h"
29#include "common/perf_counters.h"
30
31#include "messages/MOSDOp.h"
32#include "messages/MOSDPGNotify.h"
33// #include "messages/MOSDPGLog.h"
34#include "messages/MOSDPGRemove.h"
35#include "messages/MOSDPGInfo.h"
36#include "messages/MOSDPGTrim.h"
37#include "messages/MOSDPGScan.h"
38#include "messages/MOSDPGBackfill.h"
39#include "messages/MOSDPGBackfillRemove.h"
40#include "messages/MBackfillReserve.h"
41#include "messages/MRecoveryReserve.h"
42#include "messages/MOSDPGPush.h"
43#include "messages/MOSDPGPushReply.h"
44#include "messages/MOSDPGPull.h"
45#include "messages/MOSDECSubOpWrite.h"
46#include "messages/MOSDECSubOpWriteReply.h"
47#include "messages/MOSDECSubOpRead.h"
48#include "messages/MOSDECSubOpReadReply.h"
49#include "messages/MOSDPGUpdateLogMissing.h"
50#include "messages/MOSDPGUpdateLogMissingReply.h"
51#include "messages/MOSDBackoff.h"
52#include "messages/MOSDScrubReserve.h"
53#include "messages/MOSDSubOp.h"
54#include "messages/MOSDRepOp.h"
55#include "messages/MOSDSubOpReply.h"
56#include "messages/MOSDRepOpReply.h"
57#include "messages/MOSDRepScrubMap.h"
58
59#include "common/BackTrace.h"
60#include "common/EventTrace.h"
61
62#ifdef WITH_LTTNG
63#define TRACEPOINT_DEFINE
64#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65#include "tracing/pg.h"
66#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67#undef TRACEPOINT_DEFINE
68#else
69#define tracepoint(...)
70#endif
71
72#include <sstream>
73
74#define dout_context cct
75#define dout_subsys ceph_subsys_osd
76#undef dout_prefix
77#define dout_prefix _prefix(_dout, this)
78
79// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80// easily skip them
81const string infover_key("_infover");
82const string info_key("_info");
83const string biginfo_key("_biginfo");
84const string epoch_key("_epoch");
85const string fastinfo_key("_fastinfo");
86
87template <class T>
88static ostream& _prefix(std::ostream *_dout, T *t)
89{
90 return *_dout << t->gen_prefix();
91}
92
93MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
94
95void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
96{
97 // Ignore trimming state machine for now
98 if (::strstr(state, "Trimming") != NULL) {
99 return;
100 } else if (pi != nullptr) {
101 pi->enter_state(entime, state);
102 } else {
103 // Store current state since we can't reliably take the PG lock here
104 if ( tmppi == nullptr) {
105 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
106 }
107
108 thispg = pg;
109 tmppi->enter_state(entime, state);
110 }
111}
112
113void PGStateHistory::exit(const char* state) {
114 // Ignore trimming state machine for now
115 // Do nothing if PG is being destroyed!
116 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
117 return;
118 } else {
119 bool ilocked = false;
120 if(!thispg->is_locked()) {
121 thispg->lock();
122 ilocked = true;
123 }
124 if (pi == nullptr) {
125 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
126 pi = buffer.back().get();
127 pi->setepoch(thispg->get_osdmap()->get_epoch());
128 }
129
130 pi->exit_state(ceph_clock_now());
131 if (::strcmp(state, "Reset") == 0) {
132 this->reset();
133 }
134 if(ilocked) {
135 thispg->unlock();
136 }
137 }
138}
139
140void PGStateHistory::dump(Formatter* f) const {
141 f->open_array_section("history");
142 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
143 f->open_object_section("states");
144 f->dump_stream("epoch") << (*pi)->this_epoch;
145 for (auto she : (*pi)->state_history) {
146 f->dump_string("state", std::get<2>(she));
147 f->dump_stream("enter") << std::get<0>(she);
148 f->dump_stream("exit") << std::get<1>(she);
149 }
150 f->close_section();
151 }
152 f->close_section();
153}
154
155void PG::get(const char* tag)
156{
157 ref++;
158#ifdef PG_DEBUG_REFS
159 Mutex::Locker l(_ref_id_lock);
160 _tag_counts[tag]++;
161#endif
162}
163
164void PG::put(const char* tag)
165{
166#ifdef PG_DEBUG_REFS
167 {
168 Mutex::Locker l(_ref_id_lock);
169 auto tag_counts_entry = _tag_counts.find(tag);
170    assert(tag_counts_entry != _tag_counts.end());
171 --tag_counts_entry->second;
172 if (tag_counts_entry->second == 0) {
173 _tag_counts.erase(tag_counts_entry);
174 }
175 }
176#endif
177  if (--ref == 0)
178 delete this;
179}
180
181#ifdef PG_DEBUG_REFS
182uint64_t PG::get_with_id()
183{
184 ref++;
185 Mutex::Locker l(_ref_id_lock);
186 uint64_t id = ++_ref_id;
187 BackTrace bt(0);
188 stringstream ss;
189 bt.print(ss);
190  dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
191 assert(!_live_ids.count(id));
192 _live_ids.insert(make_pair(id, ss.str()));
193 return id;
194}
195
196void PG::put_with_id(uint64_t id)
197{
198  dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
199 {
200 Mutex::Locker l(_ref_id_lock);
201 assert(_live_ids.count(id));
202 _live_ids.erase(id);
203 }
204 if (--ref == 0)
205 delete this;
206}
207
208void PG::dump_live_ids()
209{
210 Mutex::Locker l(_ref_id_lock);
211 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
212 for (map<uint64_t, string>::iterator i = _live_ids.begin();
213 i != _live_ids.end();
214 ++i) {
215 dout(0) << "\t\tid: " << *i << dendl;
216 }
217 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
218 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
219 i != _tag_counts.end();
220 ++i) {
221 dout(0) << "\t\tid: " << *i << dendl;
222 }
223}
224#endif
225
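// PGPool::update (below) refreshes the cached pool metadata (info, auid, name)
// from the given OSDMap and recomputes the removed-snaps bookkeeping.  When the
// map is not contiguous with our cache, or the pool's snap epoch changed, the
// delta is derived with interval_set arithmetic.  Illustrative values only:
// with cached_removed_snaps = [1~3] and the map reporting removed snaps
// [1~3,7~2], newly_removed_snaps ends up as [7~2] and the cache grows to
// [1~3,7~2]; if the cache is not a subset of the map's set, it is reset.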
226void PGPool::update(OSDMapRef map)
227{
228 const pg_pool_t *pi = map->get_pg_pool(id);
229 assert(pi);
230 info = *pi;
231 auid = pi->auid;
232 name = map->get_pool_name(id);
233 bool updated = false;
234 if ((map->get_epoch() != cached_epoch + 1) ||
235 (pi->get_snap_epoch() == map->get_epoch())) {
236 updated = true;
237 pi->build_removed_snaps(newly_removed_snaps);
238 interval_set<snapid_t> intersection;
239 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
240 if (intersection == cached_removed_snaps) {
241 newly_removed_snaps.subtract(cached_removed_snaps);
242 cached_removed_snaps.union_of(newly_removed_snaps);
243 } else {
244 lgeneric_subdout(cct, osd, 0) << __func__
245 << " cached_removed_snaps shrank from " << cached_removed_snaps
246 << " to " << newly_removed_snaps << dendl;
247 cached_removed_snaps = newly_removed_snaps;
248 newly_removed_snaps.clear();
249 }
250 snapc = pi->get_snap_context();
251 } else {
252 /* 1) map->get_epoch() == cached_epoch + 1 &&
253 * 2) pi->get_snap_epoch() != map->get_epoch()
254 *
255 * From the if branch, 1 && 2 must be true. From 2, we know that
256 * this map didn't change the set of removed snaps. From 1, we
257 * know that our cached_removed_snaps matches the previous map.
258 * Thus, from 1 && 2, cached_removed snaps matches the current
259 * set of removed snaps and all we have to do is clear
260 * newly_removed_snaps.
261 */
262 newly_removed_snaps.clear();
263 }
264 cached_epoch = map->get_epoch();
265 lgeneric_subdout(cct, osd, 20)
266 << "PGPool::update cached_removed_snaps "
267 << cached_removed_snaps
268 << " newly_removed_snaps "
269 << newly_removed_snaps
270 << " snapc " << snapc
271 << (updated ? " (updated)":" (no change)")
272 << dendl;
273}
274
275PG::PG(OSDService *o, OSDMapRef curmap,
276 const PGPool &_pool, spg_t p) :
277 osd(o),
278 cct(o->cct),
279 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
280 snap_mapper(
281 cct,
282 &osdriver,
283 p.ps(),
284 p.get_split_bits(curmap->get_pg_num(_pool.id)),
285 _pool.id,
286 p.shard),
287 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
288 _lock("PG::_lock"),
289 #ifdef PG_DEBUG_REFS
290 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
291 #endif
292 deleting(false),
293 trace_endpoint("0.0.0.0", 0, "PG"),
294 dirty_info(false), dirty_big_info(false),
295 info(p),
296 info_struct_v(0),
297 coll(p), pg_log(cct),
298 pgmeta_oid(p.make_pgmeta_oid()),
299 missing_loc(this),
300 past_intervals(
301 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
302 *curmap),
303 stat_queue_item(this),
304 scrub_queued(false),
305 recovery_queued(false),
306 recovery_ops_active(0),
307 role(-1),
308 state(0),
309 send_notify(false),
310 pg_whoami(osd->whoami, p.shard),
311 need_up_thru(false),
312 last_peering_reset(0),
313 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
314 backfill_reserved(false),
315 backfill_reserving(false),
316 flushes_in_progress(0),
317 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
318 pg_stats_publish_valid(false),
319 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
320 finish_sync_event(NULL),
321 backoff_lock("PG::backoff_lock"),
322 scrub_after_recovery(false),
323 active_pushes(0),
324 recovery_state(this),
325 pg_id(p),
326 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
327 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
328 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
329  last_epoch(0)
330{
331#ifdef PG_DEBUG_REFS
332 osd->add_pgid(p, this);
333#endif
334#ifdef WITH_BLKIN
335 std::stringstream ss;
336 ss << "PG " << info.pgid;
337 trace_endpoint.copy_name(ss.str());
338#endif
339 osr->shard_hint = p;
340}
341
342PG::~PG()
343{
344 pgstate_history.set_pg_in_destructor();
345#ifdef PG_DEBUG_REFS
346 osd->remove_pgid(info.pgid, this);
347#endif
348}
349
350void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
351{
352 handle.suspend_tp_timeout();
353 lock();
354 handle.reset_tp_timeout();
355}
356
357void PG::lock(bool no_lockdep) const
358{
359 _lock.Lock(no_lockdep);
360 // if we have unrecorded dirty state with the lock dropped, there is a bug
361 assert(!dirty_info);
362 assert(!dirty_big_info);
363
364 dout(30) << "lock" << dendl;
365}
366
367std::string PG::gen_prefix() const
368{
369 stringstream out;
370 OSDMapRef mapref = osdmap_ref;
371 if (_lock.is_locked_by_me()) {
372 out << "osd." << osd->whoami
373 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
374 << " " << *this << " ";
375 } else {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " pg[" << info.pgid << "(unlocked)] ";
379 }
380 return out.str();
381}
382
383/********* PG **********/
384
385void PG::proc_master_log(
386 ObjectStore::Transaction& t, pg_info_t &oinfo,
387 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
388{
389 dout(10) << "proc_master_log for osd." << from << ": "
390 << olog << " " << omissing << dendl;
391 assert(!is_peered() && is_primary());
392
393 // merge log into our own log to build master log. no need to
394 // make any adjustments to their missing map; we are taking their
395  // log to be authoritative (i.e., their entries are definitely
396 // non-divergent).
397 merge_log(t, oinfo, olog, from);
398 peer_info[from] = oinfo;
399 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
400 might_have_unfound.insert(from);
401
402 // See doc/dev/osd_internals/last_epoch_started
403 if (oinfo.last_epoch_started > info.last_epoch_started) {
404 info.last_epoch_started = oinfo.last_epoch_started;
405 dirty_info = true;
406 }
407 if (oinfo.last_interval_started > info.last_interval_started) {
408 info.last_interval_started = oinfo.last_interval_started;
409 dirty_info = true;
410 }
411 update_history(oinfo.history);
412 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
413 info.last_epoch_started >= info.history.last_epoch_started);
414
415 peer_missing[from].claim(omissing);
416}
417
418void PG::proc_replica_log(
419 pg_info_t &oinfo,
420 const pg_log_t &olog,
421 pg_missing_t& omissing,
422 pg_shard_t from)
423{
424 dout(10) << "proc_replica_log for osd." << from << ": "
425 << oinfo << " " << olog << " " << omissing << dendl;
426
427 pg_log.proc_replica_log(oinfo, olog, omissing, from);
428
429 peer_info[from] = oinfo;
430 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
431 might_have_unfound.insert(from);
432
433 for (map<hobject_t, pg_missing_item>::const_iterator i =
434 omissing.get_items().begin();
435 i != omissing.get_items().end();
436 ++i) {
437 dout(20) << " after missing " << i->first << " need " << i->second.need
438 << " have " << i->second.have << dendl;
439 }
440 peer_missing[from].claim(omissing);
441}
442
443bool PG::proc_replica_info(
444 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
445{
446 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
447 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
448 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
449 return false;
450 }
451
452 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
453 dout(10) << " got info " << oinfo << " from down osd." << from
454 << " discarding" << dendl;
455 return false;
456 }
457
458 dout(10) << " got osd." << from << " " << oinfo << dendl;
459 assert(is_primary());
460 peer_info[from] = oinfo;
461 might_have_unfound.insert(from);
462
463 update_history(oinfo.history);
464
465 // stray?
466 if (!is_up(from) && !is_acting(from)) {
467 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
468 stray_set.insert(from);
469 if (is_clean()) {
470 purge_strays();
471 }
472 }
473
474 // was this a new info? if so, update peers!
475 if (p == peer_info.end())
476 update_heartbeat_peers();
477
478 return true;
479}
480
481void PG::remove_snap_mapped_object(
482 ObjectStore::Transaction &t, const hobject_t &soid)
483{
484 t.remove(
485 coll,
486 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
487 clear_object_snap_mapping(&t, soid);
488}
489
490void PG::clear_object_snap_mapping(
491 ObjectStore::Transaction *t, const hobject_t &soid)
492{
493 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
494 if (soid.snap < CEPH_MAXSNAP) {
495 int r = snap_mapper.remove_oid(
496 soid,
497 &_t);
498 if (!(r == 0 || r == -ENOENT)) {
499 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
500 ceph_abort();
501 }
502 }
503}
504
505void PG::update_object_snap_mapping(
506 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
507{
508 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
509 assert(soid.snap < CEPH_MAXSNAP);
510 int r = snap_mapper.remove_oid(
511 soid,
512 &_t);
513 if (!(r == 0 || r == -ENOENT)) {
514 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
515 ceph_abort();
516 }
517 snap_mapper.add_oid(
518 soid,
519 snaps,
520 &_t);
521}
522
523void PG::merge_log(
524 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
525{
526 PGLogEntryHandler rollbacker{this, &t};
527 pg_log.merge_log(
528 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
529}
530
531void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
532{
533 PGLogEntryHandler rollbacker{this, &t};
534 pg_log.rewind_divergent_log(
535 newhead, info, &rollbacker, dirty_info, dirty_big_info);
536}
537
538/*
539 * Process information from a replica to determine if it could have any
540 * objects that I need.
541 *
542 * TODO: if the missing set becomes very large, this could get expensive.
543 * Instead, we probably want to just iterate over our unfound set.
544 */
545bool PG::search_for_missing(
546 const pg_info_t &oinfo, const pg_missing_t &omissing,
547 pg_shard_t from,
548 RecoveryCtx *ctx)
549{
550 uint64_t num_unfound_before = missing_loc.num_unfound();
551 bool found_missing = missing_loc.add_source_info(
552 from, oinfo, omissing, ctx->handle);
553 if (found_missing && num_unfound_before != missing_loc.num_unfound())
554 publish_stats_to_osd();
555 if (found_missing &&
556 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
557 CEPH_FEATURE_OSD_ERASURE_CODES)) {
558 pg_info_t tinfo(oinfo);
559 tinfo.pgid.shard = pg_whoami.shard;
560 (*(ctx->info_map))[from.osd].push_back(
561 make_pair(
562 pg_notify_t(
563 from.shard, pg_whoami.shard,
564 get_osdmap()->get_epoch(),
565 get_osdmap()->get_epoch(),
566 tinfo),
567 past_intervals));
568 }
569 return found_missing;
570}
571
572bool PG::MissingLoc::readable_with_acting(
573 const hobject_t &hoid,
574 const set<pg_shard_t> &acting) const {
575 if (!needs_recovery(hoid)) return true;
576 auto missing_loc_entry = missing_loc.find(hoid);
577 if (missing_loc_entry == missing_loc.end()) return false;
578 const set<pg_shard_t> &locs = missing_loc_entry->second;
579 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
580 set<pg_shard_t> have_acting;
581 for (set<pg_shard_t>::const_iterator i = locs.begin();
582 i != locs.end();
583 ++i) {
584 if (acting.count(*i))
585 have_acting.insert(*i);
586 }
587 return (*is_readable)(have_acting);
588}
589
590void PG::MissingLoc::add_batch_sources_info(
591 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
592{
593 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
594 << sources.size() << dendl;
595 unsigned loop = 0;
596 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
597 i != needs_recovery_map.end();
598 ++i) {
599 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
600 handle->reset_tp_timeout();
601 loop = 0;
602 }
603 missing_loc[i->first].insert(sources.begin(), sources.end());
604 missing_loc_sources.insert(sources.begin(), sources.end());
605 }
606}
607
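// add_source_info: walk the needs_recovery_map and record `fromosd` as a
// location for every object the peer can plausibly serve, i.e. objects the
// peer is not itself missing, that fall inside its backfilled range, and that
// its log is new enough to cover (last_update >= need).  Returns true if at
// least one new location was recorded.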
608bool PG::MissingLoc::add_source_info(
609 pg_shard_t fromosd,
610 const pg_info_t &oinfo,
611 const pg_missing_t &omissing,
612 ThreadPool::TPHandle* handle)
613{
614 bool found_missing = false;
615 unsigned loop = 0;
616 // found items?
617 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
618 p != needs_recovery_map.end();
619 ++p) {
620 const hobject_t &soid(p->first);
621 eversion_t need = p->second.need;
622 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
623 handle->reset_tp_timeout();
624 loop = 0;
625 }
626 if (oinfo.last_update < need) {
627 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
628 << " also missing on osd." << fromosd
629 << " (last_update " << oinfo.last_update
630 << " < needed " << need << ")" << dendl;
631 continue;
632 }
633 if (!oinfo.last_backfill.is_max() &&
634 !oinfo.last_backfill_bitwise) {
635 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
636 << " also missing on osd." << fromosd
637 << " (last_backfill " << oinfo.last_backfill
638 << " but with wrong sort order)"
639 << dendl;
640 continue;
641 }
642 if (p->first >= oinfo.last_backfill) {
643 // FIXME: this is _probably_ true, although it could conceivably
644 // be in the undefined region! Hmm!
645 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
646 << " also missing on osd." << fromosd
647 << " (past last_backfill " << oinfo.last_backfill
648 << ")" << dendl;
649 continue;
650 }
651 if (oinfo.last_complete < need) {
652 if (omissing.is_missing(soid)) {
653 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
654 << " also missing on osd." << fromosd << dendl;
655 continue;
656 }
657 }
658
659 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
660 << " is on osd." << fromosd << dendl;
661
662 missing_loc[soid].insert(fromosd);
663 missing_loc_sources.insert(fromosd);
664 found_missing = true;
665 }
666
667 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
668 << dendl;
669 return found_missing;
670}
671
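// discover_all_missing: ask every live peer in might_have_unfound that we have
// not already queried (and that is not known to be empty) for its full log and
// missing set by queueing a FULLLOG pg_query_t in query_map.  Only called
// while we still have unfound objects.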
672void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
673{
674 auto &missing = pg_log.get_missing();
675 uint64_t unfound = get_num_unfound();
676 assert(unfound > 0);
677
678 dout(10) << __func__ << " "
679 << missing.num_missing() << " missing, "
680 << unfound << " unfound"
681 << dendl;
682
683 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
684 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
685 for (; m != mend; ++m) {
686 pg_shard_t peer(*m);
687
688 if (!get_osdmap()->is_up(peer.osd)) {
689 dout(20) << __func__ << " skipping down osd." << peer << dendl;
690 continue;
691 }
692
693 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
694 if (iter != peer_info.end() &&
695 (iter->second.is_empty() || iter->second.dne())) {
696 // ignore empty peers
697 continue;
698 }
699
700 // If we've requested any of this stuff, the pg_missing_t information
701 // should be on its way.
702    // TODO: coalesce requested_* into a single data structure
703 if (peer_missing.find(peer) != peer_missing.end()) {
704 dout(20) << __func__ << ": osd." << peer
705 << ": we already have pg_missing_t" << dendl;
706 continue;
707 }
708 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
709 dout(20) << __func__ << ": osd." << peer
710 << ": in peer_log_requested" << dendl;
711 continue;
712 }
713 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
714 dout(20) << __func__ << ": osd." << peer
715 << ": in peer_missing_requested" << dendl;
716 continue;
717 }
718
719 // Request missing
720 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
721 << dendl;
722 peer_missing_requested.insert(peer);
723 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
724 pg_query_t(
725 pg_query_t::FULLLOG,
726 peer.shard, pg_whoami.shard,
727 info.history, get_osdmap()->get_epoch());
728 }
729}
730
731/******* PG ***********/
732bool PG::needs_recovery() const
733{
734 assert(is_primary());
735
736 auto &missing = pg_log.get_missing();
737
738 if (missing.num_missing()) {
739 dout(10) << __func__ << " primary has " << missing.num_missing()
740 << " missing" << dendl;
741 return true;
742 }
743
744 assert(!actingbackfill.empty());
745 set<pg_shard_t>::const_iterator end = actingbackfill.end();
746 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
747 for (; a != end; ++a) {
748 if (*a == get_primary()) continue;
749 pg_shard_t peer = *a;
750 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
751 if (pm == peer_missing.end()) {
752 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
753 << dendl;
754 continue;
755 }
756 if (pm->second.num_missing()) {
757 dout(10) << __func__ << " osd." << peer << " has "
758 << pm->second.num_missing() << " missing" << dendl;
759 return true;
760 }
761 }
762
763 dout(10) << __func__ << " is recovered" << dendl;
764 return false;
765}
766
767bool PG::needs_backfill() const
768{
769 assert(is_primary());
770
771  // We can assume that the only OSDs that might need backfill
772  // are those in backfill_targets.
773 set<pg_shard_t>::const_iterator end = backfill_targets.end();
774 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
775 for (; a != end; ++a) {
776 pg_shard_t peer = *a;
777 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
778 if (!pi->second.last_backfill.is_max()) {
779 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
780 return true;
781 }
782 }
783
784 dout(10) << __func__ << " does not need backfill" << dendl;
785 return false;
786}
787
788
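// check_past_interval_bounds: sanity-check the stored past_intervals against
// the bounds required by our info and the oldest map we still have.  An empty
// required range with a non-empty past_intervals is only logged; a non-empty
// required range that past_intervals fails to cover asserts out.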
789void PG::check_past_interval_bounds() const
790{
791 auto rpib = get_required_past_interval_bounds(
792 info,
793 osd->get_superblock().oldest_map);
794 if (rpib.first >= rpib.second) {
795 if (!past_intervals.empty()) {
796 osd->clog->error() << info.pgid << " required past_interval bounds are"
797 << " empty [" << rpib << ") but past_intervals is not: "
798 << past_intervals;
799 derr << info.pgid << " required past_interval bounds are"
800 << " empty [" << rpib << ") but past_intervals is not: "
801 << past_intervals << dendl;
802 }
803 } else {
804 if (past_intervals.empty()) {
805 osd->clog->error() << info.pgid << " required past_interval bounds are"
806 << " not empty [" << rpib << ") but past_intervals "
807 << past_intervals << " is empty";
808 derr << info.pgid << " required past_interval bounds are"
809 << " not empty [" << rpib << ") but past_intervals "
810 << past_intervals << " is empty" << dendl;
811 assert(!past_intervals.empty());
812 }
813
814 auto apib = past_intervals.get_bounds();
815 if (apib.first > rpib.first) {
816 osd->clog->error() << info.pgid << " past_intervals [" << apib
817 << ") start interval does not contain the required"
818 << " bound [" << rpib << ") start";
819 derr << info.pgid << " past_intervals [" << apib
820 << ") start interval does not contain the required"
821 << " bound [" << rpib << ") start" << dendl;
822 assert(0 == "past_interval start interval mismatch");
823 }
824 if (apib.second != rpib.second) {
825      osd->clog->error() << info.pgid << " past_interval bound [" << apib
826 << ") end does not match required [" << rpib
827 << ") end";
828      derr << info.pgid << " past_interval bound [" << apib
829 << ") end does not match required [" << rpib
830 << ") end" << dendl;
831 assert(0 == "past_interval end mismatch");
832 }
833 }
834}
835
836bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
837{
838 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
839 if (need_up_thru &&
840 up_thru >= info.history.same_interval_since) {
841 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
842 need_up_thru = false;
843 return true;
844 }
845 return false;
846}
847
848void PG::remove_down_peer_info(const OSDMapRef osdmap)
849{
850 // Remove any downed osds from peer_info
851 bool removed = false;
852 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
853 while (p != peer_info.end()) {
854 if (!osdmap->is_up(p->first.osd)) {
855 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
856 peer_missing.erase(p->first);
857 peer_log_requested.erase(p->first);
858 peer_missing_requested.erase(p->first);
859 peer_info.erase(p++);
860 removed = true;
861 } else
862 ++p;
863 }
864
865 // if we removed anyone, update peers (which include peer_info)
866 if (removed)
867 update_heartbeat_peers();
868 check_recovery_sources(osdmap);
869}
870
871/*
872 * Returns true unless there is a non-lost OSD in might_have_unfound.
873 */
874bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
875{
876 assert(is_primary());
877
878 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
879 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
880 for (; peer != mend; ++peer) {
881 if (peer_missing.count(*peer))
882 continue;
883 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
884 if (iter != peer_info.end() &&
885 (iter->second.is_empty() || iter->second.dne()))
886 continue;
887 if (!osdmap->exists(peer->osd))
888 continue;
889 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
890 if (osd_info.lost_at <= osd_info.up_from) {
891 // If there is even one OSD in might_have_unfound that isn't lost, we
892 // still might retrieve our unfound.
893 return false;
894 }
895 }
896 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
897 << " have been queried or are marked lost" << dendl;
898 return true;
899}
900
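// build_prior: construct the PriorSet of OSDs we must hear from (or probe)
// before peering can proceed, classifying each OSD from past intervals as UP,
// DNE, LOST, or DOWN via the lambda below.  It also decides whether we must
// publish up_thru to the monitor before this interval can go active.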
901PastIntervals::PriorSet PG::build_prior()
902{
903 if (1) {
904 // sanity check
905 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
906 it != peer_info.end();
907 ++it) {
908 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
909 }
910 }
911
912 const OSDMap &osdmap = *get_osdmap();
913 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
914 pool.info.ec_pool(),
915 info.history.last_epoch_started,
916 get_pgbackend()->get_is_recoverable_predicate(),
917 [&](epoch_t start, int osd, epoch_t *lost_at) {
918 const osd_info_t *pinfo = 0;
919 if (osdmap.exists(osd)) {
920 pinfo = &osdmap.get_info(osd);
921 if (lost_at)
922 *lost_at = pinfo->lost_at;
923 }
924
925 if (osdmap.is_up(osd)) {
926 return PastIntervals::UP;
927 } else if (!pinfo) {
928 return PastIntervals::DNE;
929 } else if (pinfo->lost_at > start) {
930 return PastIntervals::LOST;
931 } else {
932 return PastIntervals::DOWN;
933 }
934 },
935 up,
936 acting,
937 this);
938
939 if (prior.pg_down) {
940 state_set(PG_STATE_DOWN);
941 }
942
943 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
944 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
945 << " < same_since " << info.history.same_interval_since
946 << ", must notify monitor" << dendl;
947 need_up_thru = true;
948 } else {
949 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
950 << " >= same_since " << info.history.same_interval_since
951 << ", all is well" << dendl;
952 need_up_thru = false;
953 }
954 set_probe_targets(prior.probe);
955 return prior;
956}
957
958void PG::clear_primary_state()
959{
960 dout(10) << "clear_primary_state" << dendl;
961
962 // clear peering state
963 stray_set.clear();
964 peer_log_requested.clear();
965 peer_missing_requested.clear();
966 peer_info.clear();
967 peer_missing.clear();
968 need_up_thru = false;
969 peer_last_complete_ondisk.clear();
970 peer_activated.clear();
971 min_last_complete_ondisk = eversion_t();
972 pg_trim_to = eversion_t();
973 might_have_unfound.clear();
974 projected_log = PGLog::IndexedLog();
975
976 last_update_ondisk = eversion_t();
977
978 snap_trimq.clear();
979
980 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
981
982 missing_loc.clear();
983
984 release_pg_backoffs();
985
986 pg_log.reset_recovery_pointers();
987
988 scrubber.reserved_peers.clear();
989 scrub_after_recovery = false;
990
991 agent_clear();
992}
993
994PG::Scrubber::Scrubber()
995 : reserved(false), reserve_failed(false),
996 epoch_start(0),
997    active(false),
998 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
999 must_scrub(false), must_deep_scrub(false), must_repair(false),
1000 auto_repair(false),
1001 num_digest_updates_pending(0),
1002 state(INACTIVE),
1003 deep(false),
1004 seed(0)
1005{}
1006
1007PG::Scrubber::~Scrubber() {}
1008
1009/**
1010 * find_best_info
1011 *
1012 * Returns an iterator to the best info in infos sorted by:
1013 * 1) Prefer newer last_update
1014 * 2) Prefer longer tail if it brings another info into contiguity
1015 * 3) Prefer current primary
1016 */
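// Tie-breaking, as implemented by the loop below: among otherwise qualified
// peers with equal last_update, the one with the older log_tail (the longer
// log) wins; if the tails also tie, the current primary is preferred.  Note
// that for pools requiring rollback (EC pools) the oldest acceptable
// last_update is chosen rather than the newest.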
1017map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1018 const map<pg_shard_t, pg_info_t> &infos,
1019 bool restrict_to_up_acting,
1020 bool *history_les_bound) const
1021{
1022 assert(history_les_bound);
1023 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1024 * to make changes to this process. Also, make sure to update it
1025 * when you find bugs! */
1026 eversion_t min_last_update_acceptable = eversion_t::max();
1027 epoch_t max_last_epoch_started_found = 0;
1028 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1029 i != infos.end();
1030 ++i) {
1031 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1032 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1033 *history_les_bound = true;
1034 max_last_epoch_started_found = i->second.history.last_epoch_started;
1035 }
1036 if (!i->second.is_incomplete() &&
1037 max_last_epoch_started_found < i->second.last_epoch_started) {
1038 max_last_epoch_started_found = i->second.last_epoch_started;
1039 }
1040 }
1041 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1042 i != infos.end();
1043 ++i) {
1044 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1045 if (min_last_update_acceptable > i->second.last_update)
1046 min_last_update_acceptable = i->second.last_update;
1047 }
1048 }
1049 if (min_last_update_acceptable == eversion_t::max())
1050 return infos.end();
1051
1052 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1053 // find osd with newest last_update (oldest for ec_pool).
1054 // if there are multiples, prefer
1055 // - a longer tail, if it brings another peer into log contiguity
1056 // - the current primary
1057 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1058 p != infos.end();
1059 ++p) {
1060 if (restrict_to_up_acting && !is_up(p->first) &&
1061 !is_acting(p->first))
1062 continue;
1063 // Only consider peers with last_update >= min_last_update_acceptable
1064 if (p->second.last_update < min_last_update_acceptable)
1065 continue;
1066 // Disqualify anyone with a too old last_epoch_started
1067 if (p->second.last_epoch_started < max_last_epoch_started_found)
1068 continue;
1069 // Disqualify anyone who is incomplete (not fully backfilled)
1070 if (p->second.is_incomplete())
1071 continue;
1072 if (best == infos.end()) {
1073 best = p;
1074 continue;
1075 }
1076 // Prefer newer last_update
1077 if (pool.info.require_rollback()) {
1078 if (p->second.last_update > best->second.last_update)
1079 continue;
1080 if (p->second.last_update < best->second.last_update) {
1081 best = p;
1082 continue;
1083 }
1084 } else {
1085 if (p->second.last_update < best->second.last_update)
1086 continue;
1087 if (p->second.last_update > best->second.last_update) {
1088 best = p;
1089 continue;
1090 }
1091 }
1092
1093 // Prefer longer tail
1094 if (p->second.log_tail > best->second.log_tail) {
1095 continue;
1096 } else if (p->second.log_tail < best->second.log_tail) {
1097 best = p;
1098 continue;
1099 }
1100
1101 // prefer current primary (usually the caller), all things being equal
1102 if (p->first == pg_whoami) {
1103 dout(10) << "calc_acting prefer osd." << p->first
1104 << " because it is current primary" << dendl;
1105 best = p;
1106 continue;
1107 }
1108 }
1109 return best;
1110}
1111
1112void PG::calc_ec_acting(
1113 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1114 unsigned size,
1115 const vector<int> &acting,
1116 pg_shard_t acting_primary,
1117 const vector<int> &up,
1118 pg_shard_t up_primary,
1119 const map<pg_shard_t, pg_info_t> &all_info,
1120 bool restrict_to_up_acting,
1121 vector<int> *_want,
1122 set<pg_shard_t> *backfill,
1123 set<pg_shard_t> *acting_backfill,
1124 pg_shard_t *want_primary,
1125 ostream &ss)
1126{
1127 vector<int> want(size, CRUSH_ITEM_NONE);
1128 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1129 unsigned usable = 0;
1130 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1131 i != all_info.end();
1132 ++i) {
1133 all_info_by_shard[i->first.shard].insert(i->first);
1134 }
1135 for (uint8_t i = 0; i < want.size(); ++i) {
1136 ss << "For position " << (unsigned)i << ": ";
1137 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1138 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1139 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1140 auth_log_shard->second.log_tail) {
1141 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1142 want[i] = up[i];
1143 ++usable;
1144 continue;
1145 }
1146 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1147 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1148 << " and ";
1149 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1150 }
1151
1152 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1153 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1154 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1155 auth_log_shard->second.log_tail) {
1156 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1157 want[i] = acting[i];
1158 ++usable;
1159 } else if (!restrict_to_up_acting) {
1160 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1161 j != all_info_by_shard[shard_id_t(i)].end();
1162 ++j) {
1163 assert(j->shard == i);
1164 if (!all_info.find(*j)->second.is_incomplete() &&
1165 all_info.find(*j)->second.last_update >=
1166 auth_log_shard->second.log_tail) {
1167 ss << " selecting stray: " << *j << std::endl;
1168 want[i] = j->osd;
1169 ++usable;
1170 break;
1171 }
1172 }
1173 if (want[i] == CRUSH_ITEM_NONE)
1174 ss << " failed to fill position " << (int)i << std::endl;
1175 }
1176 }
1177
1178 bool found_primary = false;
1179 for (uint8_t i = 0; i < want.size(); ++i) {
1180 if (want[i] != CRUSH_ITEM_NONE) {
1181 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1182 if (!found_primary) {
1183 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1184 found_primary = true;
1185 }
1186 }
1187 }
1188 acting_backfill->insert(backfill->begin(), backfill->end());
1189 _want->swap(want);
1190}
1191
1192/**
1193 * calculate the desired acting set.
1194 *
1195 * Choose an appropriate acting set. Prefer up[0], unless it is
1196 * incomplete, or another osd has a longer tail that allows us to
1197 * bring other up nodes up to date.
1198 */
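// In short: the primary is up_primary if it is complete and log-contiguous
// with the authoritative shard, otherwise the authoritative shard itself.
// Replicas are then drawn from up (backfilling any that are not
// log-contiguous), then from acting, then - unless restrict_to_up_acting -
// from any other peer with a usable log, until `size` usable shards are found.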
1199void PG::calc_replicated_acting(
1200 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1201 unsigned size,
1202 const vector<int> &acting,
1203 pg_shard_t acting_primary,
1204 const vector<int> &up,
1205 pg_shard_t up_primary,
1206 const map<pg_shard_t, pg_info_t> &all_info,
1207 bool restrict_to_up_acting,
1208 vector<int> *want,
1209 set<pg_shard_t> *backfill,
1210 set<pg_shard_t> *acting_backfill,
1211 pg_shard_t *want_primary,
1212 ostream &ss)
1213{
1214 ss << "calc_acting newest update on osd." << auth_log_shard->first
1215 << " with " << auth_log_shard->second
1216 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1217 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1218
1219 // select primary
1220 map<pg_shard_t,pg_info_t>::const_iterator primary;
1221 if (up.size() &&
1222 !all_info.find(up_primary)->second.is_incomplete() &&
1223 all_info.find(up_primary)->second.last_update >=
1224 auth_log_shard->second.log_tail) {
1225 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1226 primary = all_info.find(up_primary); // prefer up[0], all thing being equal
1227 } else {
1228 assert(!auth_log_shard->second.is_incomplete());
1229 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1230 << " selected as primary instead" << std::endl;
1231 primary = auth_log_shard;
1232 }
1233
1234 ss << "calc_acting primary is osd." << primary->first
1235 << " with " << primary->second << std::endl;
1236 *want_primary = primary->first;
1237 want->push_back(primary->first.osd);
1238 acting_backfill->insert(primary->first);
1239 unsigned usable = 1;
1240
1241 // select replicas that have log contiguity with primary.
1242 // prefer up, then acting, then any peer_info osds
1243 for (vector<int>::const_iterator i = up.begin();
1244 i != up.end();
1245 ++i) {
1246 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1247 if (up_cand == primary->first)
1248 continue;
1249 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1250 if (cur_info.is_incomplete() ||
1251 cur_info.last_update < MIN(
1252 primary->second.log_tail,
1253 auth_log_shard->second.log_tail)) {
1254 /* We include auth_log_shard->second.log_tail because in GetLog,
1255 * we will request logs back to the min last_update over our
1256 * acting_backfill set, which will result in our log being extended
1257 * as far backwards as necessary to pick up any peers which can
1258 * be log recovered by auth_log_shard's log */
1259 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1260 backfill->insert(up_cand);
1261 acting_backfill->insert(up_cand);
1262 } else {
1263 want->push_back(*i);
1264 acting_backfill->insert(up_cand);
1265 usable++;
1266 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1267 }
1268 }
1269
1270 // This no longer has backfill OSDs, but they are covered above.
1271 for (vector<int>::const_iterator i = acting.begin();
1272 i != acting.end();
1273 ++i) {
1274 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1275 if (usable >= size)
1276 break;
1277
1278 // skip up osds we already considered above
1279 if (acting_cand == primary->first)
1280 continue;
1281 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1282 if (up_it != up.end())
1283 continue;
1284
1285 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1286 if (cur_info.is_incomplete() ||
1287 cur_info.last_update < primary->second.log_tail) {
1288 ss << " shard " << acting_cand << " (stray) REJECTED "
1289 << cur_info << std::endl;
1290 } else {
1291 want->push_back(*i);
1292 acting_backfill->insert(acting_cand);
1293 ss << " shard " << acting_cand << " (stray) accepted "
1294 << cur_info << std::endl;
1295 usable++;
1296 }
1297 }
1298
1299 if (restrict_to_up_acting) {
1300 return;
1301 }
1302 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1303 i != all_info.end();
1304 ++i) {
1305 if (usable >= size)
1306 break;
1307
1308 // skip up osds we already considered above
1309 if (i->first == primary->first)
1310 continue;
1311 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1312 if (up_it != up.end())
1313 continue;
1314 vector<int>::const_iterator acting_it = find(
1315 acting.begin(), acting.end(), i->first.osd);
1316 if (acting_it != acting.end())
1317 continue;
1318
1319 if (i->second.is_incomplete() ||
1320 i->second.last_update < primary->second.log_tail) {
1321 ss << " shard " << i->first << " (stray) REJECTED "
1322 << i->second << std::endl;
1323 } else {
1324 want->push_back(i->first.osd);
1325 acting_backfill->insert(i->first);
1326 ss << " shard " << i->first << " (stray) accepted "
1327 << i->second << std::endl;
1328 usable++;
1329 }
1330 }
1331}
1332
1333/**
1334 * choose acting
1335 *
1336 * calculate the desired acting, and request a change with the monitor
1337 * if it differs from the current acting.
1338 *
1339 * if restrict_to_up_acting=true, we filter out anything that's not in
1340 * up/acting. in order to lift this restriction, we need to
1341 * 1) check whether it's worth switching the acting set any time we get
1342 * a new pg info (not just here, when recovery finishes)
1343 * 2) check whether anything in want_acting went down on each new map
1344 * (and, if so, calculate a new want_acting)
1345 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1346 * TODO!
1347 */
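// Returns false (and possibly queues a pg_temp request) when the computed
// `want` set differs from the current acting set or is not usable (below
// min_size, or not recoverable); returns true when acting already matches
// what we want, in which case actingbackfill and backfill_targets are set.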
1348bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1349 bool restrict_to_up_acting,
1350 bool *history_les_bound)
1351{
1352 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1353 all_info[pg_whoami] = info;
1354
1355 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1356 p != all_info.end();
1357 ++p) {
1358 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1359 }
1360
1361 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1362 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1363
1364 if (auth_log_shard == all_info.end()) {
1365 if (up != acting) {
1366 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1367 << " reverting to up" << dendl;
1368 want_acting = up;
1369 vector<int> empty;
1370 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1371 } else {
1372 dout(10) << "choose_acting failed" << dendl;
1373 assert(want_acting.empty());
1374 }
1375 return false;
1376 }
1377
1378 assert(!auth_log_shard->second.is_incomplete());
1379 auth_log_shard_id = auth_log_shard->first;
1380
1381 set<pg_shard_t> want_backfill, want_acting_backfill;
1382 vector<int> want;
1383 pg_shard_t want_primary;
1384 stringstream ss;
1385 if (!pool.info.ec_pool())
1386 calc_replicated_acting(
1387 auth_log_shard,
1388 get_osdmap()->get_pg_size(info.pgid.pgid),
1389 acting,
1390 primary,
1391 up,
1392 up_primary,
1393 all_info,
1394 restrict_to_up_acting,
1395 &want,
1396 &want_backfill,
1397 &want_acting_backfill,
1398 &want_primary,
1399 ss);
1400 else
1401 calc_ec_acting(
1402 auth_log_shard,
1403 get_osdmap()->get_pg_size(info.pgid.pgid),
1404 acting,
1405 primary,
1406 up,
1407 up_primary,
1408 all_info,
1409 restrict_to_up_acting,
1410 &want,
1411 &want_backfill,
1412 &want_acting_backfill,
1413 &want_primary,
1414 ss);
1415 dout(10) << ss.str() << dendl;
1416
1417 unsigned num_want_acting = 0;
1418 set<pg_shard_t> have;
1419 for (int i = 0; i < (int)want.size(); ++i) {
1420 if (want[i] != CRUSH_ITEM_NONE) {
1421 ++num_want_acting;
1422 have.insert(
1423 pg_shard_t(
1424 want[i],
1425 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1426 }
1427 }
1428
1429 // We go incomplete if below min_size for ec_pools since backfill
1430 // does not currently maintain rollbackability
1431 // Otherwise, we will go "peered", but not "active"
1432 if (num_want_acting < pool.info.min_size &&
1433 (pool.info.ec_pool() ||
1434 !cct->_conf->osd_allow_recovery_below_min_size)) {
1435 want_acting.clear();
1436 dout(10) << "choose_acting failed, below min size" << dendl;
1437 return false;
1438 }
1439
1440 /* Check whether we have enough acting shards to later perform recovery */
1441 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1442 get_pgbackend()->get_is_recoverable_predicate());
1443 if (!(*recoverable_predicate)(have)) {
1444 want_acting.clear();
1445 dout(10) << "choose_acting failed, not recoverable" << dendl;
1446 return false;
1447 }
1448
1449 if (want != acting) {
1450 dout(10) << "choose_acting want " << want << " != acting " << acting
1451 << ", requesting pg_temp change" << dendl;
1452 want_acting = want;
1453
1454 if (want_acting == up) {
1455 // There can't be any pending backfill if
1456 // want is the same as crush map up OSDs.
1457      assert(want_backfill.empty());
1458 vector<int> empty;
1459 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1460 } else
1461 osd->queue_want_pg_temp(info.pgid.pgid, want);
1462 return false;
1463 }
1464 want_acting.clear();
1465 actingbackfill = want_acting_backfill;
1466 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1467 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1468 if (backfill_targets.empty()) {
1469 // Caller is GetInfo
1470 backfill_targets = want_backfill;
1471 }
1472 // Will not change if already set because up would have had to change
1473 // Verify that nothing in backfill is in stray_set
1474 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1475 i != want_backfill.end();
1476 ++i) {
1477 assert(stray_set.find(*i) == stray_set.end());
1478 }
1479 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1480 << want_backfill << dendl;
1481 return true;
1482}
1483
1484/* Build the might_have_unfound set.
1485 *
1486 * This is used by the primary OSD during recovery.
1487 *
1488 * This set tracks the OSDs which might have unfound objects that the primary
1489 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1490 * will remove the OSD from the set.
1491 */
1492void PG::build_might_have_unfound()
1493{
1494 assert(might_have_unfound.empty());
1495 assert(is_primary());
1496
1497 dout(10) << __func__ << dendl;
1498
1499 check_past_interval_bounds();
1500
1501 might_have_unfound = past_intervals.get_might_have_unfound(
1502 pg_whoami,
1503 pool.info.ec_pool());
1504
1505 // include any (stray) peers
1506 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1507 p != peer_info.end();
1508 ++p)
1509 might_have_unfound.insert(p->first);
1510
1511 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1512}
1513
1514struct C_PG_ActivateCommitted : public Context {
1515 PGRef pg;
1516 epoch_t epoch;
1517 epoch_t activation_epoch;
1518 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1519 : pg(p), epoch(e), activation_epoch(ae) {}
1520 void finish(int r) override {
1521 pg->_activate_committed(epoch, activation_epoch);
1522 }
1523};
1524
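// activate: transition the PG out of peering.  The primary updates
// last_epoch_started, queues MOSDPGLog/backfill messages for each replica in
// actingbackfill, rebuilds missing_loc and might_have_unfound, and marks the
// PG degraded/undersized as appropriate; replicas merely adopt the primary's
// last_epoch_started.  Completion is signalled asynchronously through
// C_PG_ActivateCommitted once the transaction commits.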
1525void PG::activate(ObjectStore::Transaction& t,
1526 epoch_t activation_epoch,
1527 list<Context*>& tfin,
1528 map<int, map<spg_t,pg_query_t> >& query_map,
1529 map<int,
1530 vector<
1531 pair<pg_notify_t,
1532 PastIntervals> > > *activator_map,
1533 RecoveryCtx *ctx)
1534{
1535 assert(!is_peered());
1536 assert(scrubber.callbacks.empty());
1537 assert(callbacks_for_degraded_object.empty());
1538
1539 // twiddle pg state
1540 state_clear(PG_STATE_DOWN);
1541
1542 send_notify = false;
1543
1544 if (is_primary()) {
1545 // only update primary last_epoch_started if we will go active
1546 if (acting.size() >= pool.info.min_size) {
1547 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1548 info.last_epoch_started <= activation_epoch);
1549 info.last_epoch_started = activation_epoch;
1550 info.last_interval_started = info.history.same_interval_since;
1551 }
1552 } else if (is_acting(pg_whoami)) {
1553 /* update last_epoch_started on acting replica to whatever the primary sent
1554 * unless it's smaller (could happen if we are going peered rather than
1555 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1556 if (info.last_epoch_started < activation_epoch) {
1557 info.last_epoch_started = activation_epoch;
1558 info.last_interval_started = info.history.same_interval_since;
1559 }
1560 }
1561
1562 auto &missing = pg_log.get_missing();
1563
1564 if (is_primary()) {
1565 last_update_ondisk = info.last_update;
1566 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1567 }
1568 last_update_applied = info.last_update;
1569 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1570
1571 need_up_thru = false;
1572
1573 // write pg info, log
1574 dirty_info = true;
1575 dirty_big_info = true; // maybe
1576
1577 // find out when we commit
1578 t.register_on_complete(
1579 new C_PG_ActivateCommitted(
1580 this,
1581 get_osdmap()->get_epoch(),
1582 activation_epoch));
1583
1584 // initialize snap_trimq
1585 if (is_primary()) {
1586 dout(20) << "activate - purged_snaps " << info.purged_snaps
1587 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1588 snap_trimq = pool.cached_removed_snaps;
1589 interval_set<snapid_t> intersection;
1590 intersection.intersection_of(snap_trimq, info.purged_snaps);
1591 if (intersection == info.purged_snaps) {
1592 snap_trimq.subtract(info.purged_snaps);
1593 } else {
1594 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1595 << ") is not a subset of pool.cached_removed_snaps ("
1596 << pool.cached_removed_snaps << ")" << dendl;
1597 snap_trimq.subtract(intersection);
1598 }
1599 }
1600
1601 // init complete pointer
1602 if (missing.num_missing() == 0) {
1603 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1604 << " -> " << info.last_update << dendl;
1605 info.last_complete = info.last_update;
1606 pg_log.reset_recovery_pointers();
1607 } else {
1608 dout(10) << "activate - not complete, " << missing << dendl;
1609 pg_log.activate_not_complete(info);
1610 }
1611
1612 log_weirdness();
1613
1614 // if primary..
1615 if (is_primary()) {
1616 assert(ctx);
1617 // start up replicas
1618
1619 assert(!actingbackfill.empty());
1620 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1621 i != actingbackfill.end();
1622 ++i) {
1623 if (*i == pg_whoami) continue;
1624 pg_shard_t peer = *i;
1625 assert(peer_info.count(peer));
1626 pg_info_t& pi = peer_info[peer];
1627
1628 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1629
1630 MOSDPGLog *m = 0;
1631 pg_missing_t& pm = peer_missing[peer];
1632
1633 bool needs_past_intervals = pi.dne();
1634
1635 /*
1636 * cover case where peer sort order was different and
1637 * last_backfill cannot be interpreted
1638 */
1639 bool force_restart_backfill =
1640 !pi.last_backfill.is_max() &&
1641 !pi.last_backfill_bitwise;
1642
1643 if (pi.last_update == info.last_update && !force_restart_backfill) {
1644 // empty log
1645 if (!pi.last_backfill.is_max())
1646 osd->clog->info() << info.pgid << " continuing backfill to osd."
1647 << peer
1648 << " from (" << pi.log_tail << "," << pi.last_update
1649 << "] " << pi.last_backfill
1650 << " to " << info.last_update;
1651 if (!pi.is_empty() && activator_map) {
1652 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1653 (*activator_map)[peer.osd].push_back(
1654 make_pair(
1655 pg_notify_t(
1656 peer.shard, pg_whoami.shard,
1657 get_osdmap()->get_epoch(),
1658 get_osdmap()->get_epoch(),
1659 info),
1660 past_intervals));
1661 } else {
1662 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1663 m = new MOSDPGLog(
1664 i->shard, pg_whoami.shard,
1665 get_osdmap()->get_epoch(), info);
1666 }
1667 } else if (
1668 pg_log.get_tail() > pi.last_update ||
1669 pi.last_backfill == hobject_t() ||
1670 force_restart_backfill ||
1671 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1672 /* ^ This last case covers a situation where a replica is not contiguous
1673 * with the auth_log, but is contiguous with this replica. Reshuffling
1674 * the active set to handle this would be tricky, so instead we just go
1675       * ahead and backfill it anyway. This is probably preferable in any
1676 * case since the replica in question would have to be significantly
1677 * behind.
1678 */
1679 // backfill
1680      osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1681 << " from (" << pi.log_tail << "," << pi.last_update
1682 << "] " << pi.last_backfill
1683 << " to " << info.last_update;
1684
1685 pi.last_update = info.last_update;
1686 pi.last_complete = info.last_update;
1687 pi.set_last_backfill(hobject_t());
1688 pi.last_epoch_started = info.last_epoch_started;
1689 pi.last_interval_started = info.last_interval_started;
1690 pi.history = info.history;
1691 pi.hit_set = info.hit_set;
1692 pi.stats.stats.clear();
1693
1694 // initialize peer with our purged_snaps.
1695 pi.purged_snaps = info.purged_snaps;
1696
1697 m = new MOSDPGLog(
1698 i->shard, pg_whoami.shard,
1699 get_osdmap()->get_epoch(), pi);
1700
1701 // send some recent log, so that op dup detection works well.
1702 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1703 m->info.log_tail = m->log.tail;
1704 pi.log_tail = m->log.tail; // sigh...
1705
1706 pm.clear();
1707 } else {
1708 // catch up
1709 assert(pg_log.get_tail() <= pi.last_update);
1710 m = new MOSDPGLog(
1711 i->shard, pg_whoami.shard,
1712 get_osdmap()->get_epoch(), info);
1713 // send new stuff to append to replicas log
1714 m->log.copy_after(pg_log.get_log(), pi.last_update);
1715 }
1716
1717 // share past_intervals if we are creating the pg on the replica
1718 // based on whether our info for that peer was dne() *before*
1719 // updating pi.history in the backfill block above.
1720 if (m && needs_past_intervals)
1721 m->past_intervals = past_intervals;
1722
1723 // update local version of peer's missing list!
1724 if (m && pi.last_backfill != hobject_t()) {
1725 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1726 p != m->log.log.end();
1727 ++p)
1728 if (p->soid <= pi.last_backfill &&
1729 !p->is_error())
1730 pm.add_next_event(*p);
1731 }
1732
1733 if (m) {
1734 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1735 //m->log.print(cout);
1736 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1737 }
1738
1739 // peer now has
1740 pi.last_update = info.last_update;
1741
1742 // update our missing
1743 if (pm.num_missing() == 0) {
1744 pi.last_complete = pi.last_update;
1745 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1746 } else {
1747 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1748 }
1749 }
1750
1751 // Set up missing_loc
1752 set<pg_shard_t> complete_shards;
1753 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1754 i != actingbackfill.end();
1755 ++i) {
1756 if (*i == get_primary()) {
1757 missing_loc.add_active_missing(missing);
1758 if (!missing.have_missing())
1759 complete_shards.insert(*i);
1760 } else {
1761 auto peer_missing_entry = peer_missing.find(*i);
1762 assert(peer_missing_entry != peer_missing.end());
1763 missing_loc.add_active_missing(peer_missing_entry->second);
1764 if (!peer_missing_entry->second.have_missing() &&
1765 peer_info[*i].last_backfill.is_max())
1766 complete_shards.insert(*i);
1767 }
1768 }
1769 // If necessary, create might_have_unfound to help us find our unfound objects.
1770 // NOTE: It's important that we build might_have_unfound before trimming the
1771 // past intervals.
1772 might_have_unfound.clear();
1773 if (needs_recovery()) {
1774 // If only one shard has missing objects, we add all the others as recovery
1775 // sources. This is safe because the PGLogs have been merged locally, and it
1776 // covers the vast majority of use cases, e.g. one OSD/host being down for a
1777 // while for hardware repair.
1778 if (complete_shards.size() + 1 == actingbackfill.size()) {
1779 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1780 } else {
1781 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1782 ctx->handle);
1783 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1784 i != actingbackfill.end();
1785 ++i) {
1786 if (*i == pg_whoami) continue;
1787 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1788 assert(peer_missing.count(*i));
1789 assert(peer_info.count(*i));
1790 missing_loc.add_source_info(
1791 *i,
1792 peer_info[*i],
1793 peer_missing[*i],
1794 ctx->handle);
1795 }
1796 }
1797 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1798 i != peer_missing.end();
1799 ++i) {
1800 if (is_actingbackfill(i->first))
1801 continue;
1802 assert(peer_info.count(i->first));
1803 search_for_missing(
1804 peer_info[i->first],
1805 i->second,
1806 i->first,
1807 ctx);
1808 }
1809
1810 build_might_have_unfound();
1811
1812 state_set(PG_STATE_DEGRADED);
1813 if (have_unfound())
1814 discover_all_missing(query_map);
1815 }
1816
1817 // degraded?
1818 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1819 state_set(PG_STATE_DEGRADED);
1820 state_set(PG_STATE_UNDERSIZED);
1821 }
1822
1823 state_set(PG_STATE_ACTIVATING);
1824 release_pg_backoffs();
1825 projected_last_update = info.last_update;
1826 }
1827 if (acting.size() >= pool.info.min_size) {
1828 PGLogEntryHandler handler{this, &t};
1829 pg_log.roll_forward(&handler);
1830 }
1831}
1832
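// Capability check for client ops. Only CEPH_MSG_OSD_OP (MOSDOp) messages are
// checked; all other message types are implicitly allowed. The check is made
// against the pool name/auid, the object's namespace, and its locator key
// (falling back to the object name when no key is set).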
1833bool PG::op_has_sufficient_caps(OpRequestRef& op)
1834{
1835 // only check MOSDOp
1836 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1837 return true;
1838
1839 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1840
1841 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1842 if (!session) {
1843 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1844 return false;
1845 }
1846 OSDCap& caps = session->caps;
1847 session->put();
1848
1849 const string &key = req->get_hobj().get_key().empty() ?
1850 req->get_oid().name :
1851 req->get_hobj().get_key();
1852
1853 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1854 pool.auid, key,
1855 op->need_read_cap(),
1856 op->need_write_cap(),
1857 op->classes());
1858
1859 dout(20) << "op_has_sufficient_caps pool=" << pool.id << " (" << pool.name
1860 << " " << req->get_hobj().nspace
1861 << ") owner=" << pool.auid
1862 << " need_read_cap=" << op->need_read_cap()
1863 << " need_write_cap=" << op->need_write_cap()
1864 << " classes=" << op->classes()
1865 << " -> " << (cap ? "yes" : "NO")
1866 << dendl;
1867 return cap;
1868}
1869
1870void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1871{
1872 lock();
1873 if (pg_has_reset_since(epoch)) {
1874 dout(10) << "_activate_committed " << epoch
1875 << ", that was an old interval" << dendl;
1876 } else if (is_primary()) {
1877 peer_activated.insert(pg_whoami);
1878 dout(10) << "_activate_committed " << epoch
1879 << " peer_activated now " << peer_activated
1880 << " last_interval_started " << info.history.last_interval_started
1881 << " last_epoch_started " << info.history.last_epoch_started
1882 << " same_interval_since " << info.history.same_interval_since << dendl;
1883 assert(!actingbackfill.empty());
1884 if (peer_activated.size() == actingbackfill.size())
1885 all_activated_and_committed();
1886 } else {
1887 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1888 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1889 pg_notify_t i = pg_notify_t(
1890 get_primary().shard, pg_whoami.shard,
1891 get_osdmap()->get_epoch(),
1892 get_osdmap()->get_epoch(),
1893 info);
1894
1895 i.info.history.last_epoch_started = activation_epoch;
1896 i.info.history.last_interval_started = i.info.history.same_interval_since;
1897 if (acting.size() >= pool.info.min_size) {
1898 state_set(PG_STATE_ACTIVE);
1899 } else {
1900 state_set(PG_STATE_PEERED);
1901 }
1902
1903 m->pg_list.push_back(make_pair(i, PastIntervals()));
1904 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1905
1906 // waiters
1907 if (flushes_in_progress == 0) {
1908 requeue_ops(waiting_for_peered);
1909 }
1910 }
1911
1912 assert(!dirty_info);
1913
1914 unlock();
1915}
1916
1917/*
1918 * update info.history.last_epoch_started ONLY after we and all
1919 * replicas have activated AND committed the activate transaction
1920 * (i.e. the peering results are stable on disk).
1921 */
1922void PG::all_activated_and_committed()
1923{
1924 dout(10) << "all_activated_and_committed" << dendl;
1925 assert(is_primary());
1926 assert(peer_activated.size() == actingbackfill.size());
1927 assert(!actingbackfill.empty());
1928 assert(blocked_by.empty());
1929
1930 queue_peering_event(
1931 CephPeeringEvtRef(
1932 std::make_shared<CephPeeringEvt>(
1933 get_osdmap()->get_epoch(),
1934 get_osdmap()->get_epoch(),
1935 AllReplicasActivated())));
1936}
1937
1938bool PG::requeue_scrub(bool high_priority)
1939{
1940 assert(is_locked());
1941 if (scrub_queued) {
1942 dout(10) << __func__ << ": already queued" << dendl;
1943 return false;
1944 } else {
1945 dout(10) << __func__ << ": queueing" << dendl;
1946 scrub_queued = true;
1947    osd->queue_for_scrub(this, high_priority);
1948 return true;
1949 }
1950}
1951
1952void PG::queue_recovery(bool front)
1953{
1954 if (!is_primary() || !is_peered()) {
1955 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1956 assert(!recovery_queued);
1957 } else if (recovery_queued) {
1958 dout(10) << "queue_recovery -- already queued" << dendl;
1959 } else {
1960 dout(10) << "queue_recovery -- queuing" << dendl;
1961 recovery_queued = true;
1962 osd->queue_for_recovery(this, front);
1963 }
1964}
1965
1966bool PG::queue_scrub()
1967{
1968 assert(is_locked());
1969 if (is_scrubbing()) {
1970 return false;
1971 }
1972 scrubber.priority = scrubber.must_scrub ?
1973 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
1974 scrubber.must_scrub = false;
1975 state_set(PG_STATE_SCRUBBING);
1976 if (scrubber.must_deep_scrub) {
1977 state_set(PG_STATE_DEEP_SCRUB);
1978 scrubber.must_deep_scrub = false;
1979 }
1980 if (scrubber.must_repair || scrubber.auto_repair) {
1981 state_set(PG_STATE_REPAIR);
1982 scrubber.must_repair = false;
1983 }
1984 requeue_scrub();
1985 return true;
1986}
1987
1988unsigned PG::get_scrub_priority()
1989{
1990 // a higher value -> a higher priority
1991 int pool_scrub_priority = 0;
1992 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
1993 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
1994}
1995
1996struct C_PG_FinishRecovery : public Context {
1997 PGRef pg;
1998 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1999 void finish(int r) override {
2000 pg->_finish_recovery(this);
2001 }
2002};
2003
2004void PG::mark_clean()
2005{
2006  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2007    state_set(PG_STATE_CLEAN);
2008 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2009 info.history.last_interval_clean = info.history.same_interval_since;
2010 past_intervals.clear();
2011 dirty_big_info = true;
2012 dirty_info = true;
2013 }
2014
2015  kick_snap_trim();
2016}
2017
2018unsigned PG::get_recovery_priority()
2019{
2020 // a higher value -> a higher priority
2021
2022 int pool_recovery_priority = 0;
2023 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2024
2025 int ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
2026
2027 // Clamp to valid range
2028 if (ret > OSD_RECOVERY_PRIORITY_MAX) {
2029 ret = OSD_RECOVERY_PRIORITY_MAX;
2030 } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
2031 ret = OSD_RECOVERY_PRIORITY_MIN;
2032 }
2033
2034 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2035 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2036
2037 return static_cast<unsigned>(ret);
2038}
2039
2040unsigned PG::get_backfill_priority()
2041{
2042 // a higher value -> a higher priority
2043
2044 int ret = OSD_BACKFILL_PRIORITY_BASE;
2045 if (acting.size() < pool.info.min_size) {
2046 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2047 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2048
2049 } else if (is_undersized()) {
2050 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2051 assert(pool.info.size > actingset.size());
2052 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2053
2054 } else if (is_degraded()) {
2055 // degraded: baseline degraded
2056 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2057 }
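  // Rough ordering example (assuming the conventional relationship
  // INACTIVE > DEGRADED > BASE between the priority bases): a 3-replica pool
  // with min_size 2 and only one surviving replica gets
  // OSD_BACKFILL_INACTIVE_PRIORITY_BASE + 1, which outranks any PG that is
  // merely undersized or degraded.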
2058
2059 // Adjust with pool's recovery priority
2060 int pool_recovery_priority = 0;
2061 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2062 ret += pool_recovery_priority;
2063
2064 // Clamp to valid range
2065 if (ret > OSD_RECOVERY_PRIORITY_MAX) {
2066 ret = OSD_RECOVERY_PRIORITY_MAX;
2067 } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
2068 ret = OSD_RECOVERY_PRIORITY_MIN;
2069 }
2070
2071 return static_cast<unsigned>(ret);
2072}
2073
2074void PG::finish_recovery(list<Context*>& tfin)
2075{
2076 dout(10) << "finish_recovery" << dendl;
2077 assert(info.last_complete == info.last_update);
2078
2079 clear_recovery_state();
2080
2081 /*
2082 * sync all this before purging strays. but don't block!
2083 */
2084 finish_sync_event = new C_PG_FinishRecovery(this);
2085 tfin.push_back(finish_sync_event);
2086}
2087
2088void PG::_finish_recovery(Context *c)
2089{
2090 lock();
2091 if (deleting) {
2092 unlock();
2093 return;
2094 }
2095 if (c == finish_sync_event) {
2096 dout(10) << "_finish_recovery" << dendl;
2097 finish_sync_event = 0;
2098 purge_strays();
2099
2100 publish_stats_to_osd();
2101
2102 if (scrub_after_recovery) {
2103 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2104 scrub_after_recovery = false;
2105 scrubber.must_deep_scrub = true;
2106 queue_scrub();
2107 }
2108 } else {
2109 dout(10) << "_finish_recovery -- stale" << dendl;
2110 }
2111 unlock();
2112}
2113
2114void PG::start_recovery_op(const hobject_t& soid)
2115{
2116 dout(10) << "start_recovery_op " << soid
2117#ifdef DEBUG_RECOVERY_OIDS
2118 << " (" << recovering_oids << ")"
2119#endif
2120 << dendl;
2121 assert(recovery_ops_active >= 0);
2122 recovery_ops_active++;
2123#ifdef DEBUG_RECOVERY_OIDS
2124 assert(recovering_oids.count(soid) == 0);
2125 recovering_oids.insert(soid);
2126#endif
2127 osd->start_recovery_op(this, soid);
2128}
2129
2130void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2131{
2132 dout(10) << "finish_recovery_op " << soid
2133#ifdef DEBUG_RECOVERY_OIDS
2134 << " (" << recovering_oids << ")"
2135#endif
2136 << dendl;
2137 assert(recovery_ops_active > 0);
2138 recovery_ops_active--;
2139#ifdef DEBUG_RECOVERY_OIDS
2140 assert(recovering_oids.count(soid));
2141 recovering_oids.erase(soid);
2142#endif
2143 osd->finish_recovery_op(this, soid, dequeue);
2144
2145 if (!dequeue) {
2146 queue_recovery();
2147 }
2148}
2149
2150void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2151{
2152 child->update_snap_mapper_bits(split_bits);
2153 child->update_osdmap_ref(get_osdmap());
2154
2155 child->pool = pool;
2156
2157 // Log
2158 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2159 child->info.last_complete = info.last_complete;
2160
2161 info.last_update = pg_log.get_head();
2162 child->info.last_update = child->pg_log.get_head();
2163
2164 child->info.last_user_version = info.last_user_version;
2165
2166 info.log_tail = pg_log.get_tail();
2167 child->info.log_tail = child->pg_log.get_tail();
2168
2169 if (info.last_complete < pg_log.get_tail())
2170 info.last_complete = pg_log.get_tail();
2171 if (child->info.last_complete < child->pg_log.get_tail())
2172 child->info.last_complete = child->pg_log.get_tail();
2173
2174 // Info
2175 child->info.history = info.history;
2176 child->info.history.epoch_created = get_osdmap()->get_epoch();
2177 child->info.purged_snaps = info.purged_snaps;
2178
2179 if (info.last_backfill.is_max()) {
2180 child->info.set_last_backfill(hobject_t::get_max());
2181 } else {
2182 // restart backfill on parent and child to be safe. we could
2183 // probably do better in the bitwise sort case, but it's more
2184 // fragile (there may be special work to do on backfill completion
2185 // in the future).
2186 info.set_last_backfill(hobject_t());
2187 child->info.set_last_backfill(hobject_t());
2188 }
2189
2190 child->info.stats = info.stats;
2191 child->info.stats.parent_split_bits = split_bits;
2192 info.stats.stats_invalid = true;
2193 child->info.stats.stats_invalid = true;
2194 child->info.last_epoch_started = info.last_epoch_started;
2195 child->info.last_interval_started = info.last_interval_started;
2196
2197 child->snap_trimq = snap_trimq;
2198
2199 // There can't be recovery/backfill going on now
2200 int primary, up_primary;
2201 vector<int> newup, newacting;
2202 get_osdmap()->pg_to_up_acting_osds(
2203 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2204 child->init_primary_up_acting(
2205 newup,
2206 newacting,
2207 up_primary,
2208 primary);
2209 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2210
2211 // this comparison includes primary rank via pg_shard_t
2212 if (get_primary() != child->get_primary())
2213 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2214
2215 child->info.stats.up = up;
2216 child->info.stats.up_primary = up_primary;
2217 child->info.stats.acting = acting;
2218 child->info.stats.acting_primary = primary;
2219 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2220
2221 // History
2222 child->past_intervals = past_intervals;
2223
2224 _split_into(child_pgid, child, split_bits);
2225
2226 // release all backoffs for simplicity
2227 release_backoffs(hobject_t(), hobject_t::get_max());
2228
2229 child->on_new_interval();
2230
2231 child->dirty_info = true;
2232 child->dirty_big_info = true;
2233 dirty_info = true;
2234 dirty_big_info = true;
2235}
2236
2237void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2238{
2239 ConnectionRef con = s->con;
2240 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2241 return;
2242 BackoffRef b(s->have_backoff(info.pgid, begin));
2243 if (b) {
2244 derr << __func__ << " already have backoff for " << s << " begin " << begin
2245 << " " << *b << dendl;
2246 ceph_abort();
2247 }
2248 Mutex::Locker l(backoff_lock);
2249 {
2250 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2251 backoffs[begin].insert(b);
2252 s->add_backoff(b);
2253 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2254 }
2255 con->send_message(
2256 new MOSDBackoff(
2257 info.pgid,
2258 get_osdmap()->get_epoch(),
2259 CEPH_OSD_BACKOFF_OP_BLOCK,
2260 b->id,
2261 begin,
2262 end));
2263}
2264
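// Release backoffs overlapping [begin, end): a backoff is dropped if it starts
// exactly at 'begin', or if it lies strictly inside the released range. The
// matching backoffs are collected under backoff_lock, and the unblock messages
// are sent to the affected sessions after that lock is released.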
2265void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2266{
2267 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2268 vector<BackoffRef> bv;
2269 {
2270 Mutex::Locker l(backoff_lock);
2271 auto p = backoffs.lower_bound(begin);
2272 while (p != backoffs.end()) {
2273 int r = cmp(p->first, end);
2274 dout(20) << __func__ << " ? " << r << " " << p->first
2275 << " " << p->second << dendl;
2276 // note: must still examine begin=end=p->first case
2277 if (r > 0 || (r == 0 && begin < end)) {
2278 break;
2279 }
2280 dout(20) << __func__ << " checking " << p->first
2281 << " " << p->second << dendl;
2282 auto q = p->second.begin();
2283 while (q != p->second.end()) {
2284 dout(20) << __func__ << " checking " << *q << dendl;
2285 int r = cmp((*q)->begin, begin);
2286 if (r == 0 || (r > 0 && (*q)->end < end)) {
2287 bv.push_back(*q);
2288 q = p->second.erase(q);
2289 } else {
2290 ++q;
2291 }
2292 }
2293 if (p->second.empty()) {
2294 p = backoffs.erase(p);
2295 } else {
2296 ++p;
2297 }
2298 }
2299 }
2300 for (auto b : bv) {
2301 Mutex::Locker l(b->lock);
2302 dout(10) << __func__ << " " << *b << dendl;
2303 if (b->session) {
2304 assert(b->pg == this);
2305 ConnectionRef con = b->session->con;
2306 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2307 con->send_message(
2308 new MOSDBackoff(
2309 info.pgid,
2310 get_osdmap()->get_epoch(),
2311 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2312 b->id,
2313 b->begin,
2314 b->end));
2315 }
2316 if (b->is_new()) {
2317 b->state = Backoff::STATE_DELETING;
2318 } else {
2319 b->session->rm_backoff(b);
2320 b->session.reset();
2321 }
2322 b->pg.reset();
2323 }
2324 }
2325}
2326
2327void PG::clear_backoffs()
2328{
2329 dout(10) << __func__ << " " << dendl;
2330 map<hobject_t,set<BackoffRef>> ls;
2331 {
2332 Mutex::Locker l(backoff_lock);
2333 ls.swap(backoffs);
2334 }
2335 for (auto& p : ls) {
2336 for (auto& b : p.second) {
2337 Mutex::Locker l(b->lock);
2338 dout(10) << __func__ << " " << *b << dendl;
2339 if (b->session) {
2340 assert(b->pg == this);
2341 if (b->is_new()) {
2342 b->state = Backoff::STATE_DELETING;
2343 } else {
2344 b->session->rm_backoff(b);
2345 b->session.reset();
2346 }
2347 b->pg.reset();
2348 }
2349 }
2350 }
2351}
2352
2353// called by Session::clear_backoffs()
2354void PG::rm_backoff(BackoffRef b)
2355{
2356 dout(10) << __func__ << " " << *b << dendl;
2357 Mutex::Locker l(backoff_lock);
2358 assert(b->lock.is_locked_by_me());
2359 assert(b->pg == this);
2360 auto p = backoffs.find(b->begin);
2361 // may race with release_backoffs()
2362 if (p != backoffs.end()) {
2363 auto q = p->second.find(b);
2364 if (q != p->second.end()) {
2365 p->second.erase(q);
2366 if (p->second.empty()) {
2367 backoffs.erase(p);
2368 }
2369 }
2370 }
2371}
2372
2373void PG::clear_recovery_state()
2374{
2375 dout(10) << "clear_recovery_state" << dendl;
2376
2377 pg_log.reset_recovery_pointers();
2378 finish_sync_event = 0;
2379
2380 hobject_t soid;
2381 while (recovery_ops_active > 0) {
2382#ifdef DEBUG_RECOVERY_OIDS
2383 soid = *recovering_oids.begin();
2384#endif
2385 finish_recovery_op(soid, true);
2386 }
2387
2388 backfill_targets.clear();
2389 backfill_info.clear();
2390 peer_backfill_info.clear();
2391 waiting_on_backfill.clear();
2392 _clear_recovery_state(); // pg impl specific hook
2393}
2394
2395void PG::cancel_recovery()
2396{
2397 dout(10) << "cancel_recovery" << dendl;
2398 clear_recovery_state();
2399}
2400
2401
2402void PG::purge_strays()
2403{
2404 dout(10) << "purge_strays " << stray_set << dendl;
2405
2406 bool removed = false;
2407 for (set<pg_shard_t>::iterator p = stray_set.begin();
2408 p != stray_set.end();
2409 ++p) {
2410 assert(!is_actingbackfill(*p));
2411 if (get_osdmap()->is_up(p->osd)) {
2412 dout(10) << "sending PGRemove to osd." << *p << dendl;
2413 vector<spg_t> to_remove;
2414 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2415 MOSDPGRemove *m = new MOSDPGRemove(
2416 get_osdmap()->get_epoch(),
2417 to_remove);
2418 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2419 } else {
2420 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2421 }
2422 peer_missing.erase(*p);
2423 peer_info.erase(*p);
2424 peer_purged.insert(*p);
2425 removed = true;
2426 }
2427
2428 // if we removed anyone, update peers (which include peer_info)
2429 if (removed)
2430 update_heartbeat_peers();
2431
2432 stray_set.clear();
2433
2434 // clear _requested maps; we may have to peer() again if we discover
2435 // (more) stray content
2436 peer_log_requested.clear();
2437 peer_missing_requested.clear();
2438}
2439
2440void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2441{
2442 Mutex::Locker l(heartbeat_peer_lock);
2443 probe_targets.clear();
2444 for (set<pg_shard_t>::iterator i = probe_set.begin();
2445 i != probe_set.end();
2446 ++i) {
2447 probe_targets.insert(i->osd);
2448 }
2449}
2450
2451void PG::clear_probe_targets()
2452{
2453 Mutex::Locker l(heartbeat_peer_lock);
2454 probe_targets.clear();
2455}
2456
2457void PG::update_heartbeat_peers()
2458{
2459 assert(is_locked());
2460
2461 if (!is_primary())
2462 return;
2463
2464 set<int> new_peers;
2465 for (unsigned i=0; i<acting.size(); i++) {
2466 if (acting[i] != CRUSH_ITEM_NONE)
2467 new_peers.insert(acting[i]);
2468 }
2469 for (unsigned i=0; i<up.size(); i++) {
2470 if (up[i] != CRUSH_ITEM_NONE)
2471 new_peers.insert(up[i]);
2472 }
2473 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2474 p != peer_info.end();
2475 ++p)
2476 new_peers.insert(p->first.osd);
2477
2478 bool need_update = false;
2479 heartbeat_peer_lock.Lock();
2480 if (new_peers == heartbeat_peers) {
2481 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2482 } else {
2483 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2484 heartbeat_peers.swap(new_peers);
2485 need_update = true;
2486 }
2487 heartbeat_peer_lock.Unlock();
2488
2489 if (need_update)
2490 osd->need_heartbeat_peer_update();
2491}
2492
2493
2494bool PG::check_in_progress_op(
2495 const osd_reqid_t &r,
2496 eversion_t *version,
2497 version_t *user_version,
2498 int *return_code) const
2499{
2500 return (
2501 projected_log.get_request(r, version, user_version, return_code) ||
2502 pg_log.get_log().get_request(r, version, user_version, return_code));
2503}
2504
2505void PG::_update_calc_stats()
2506{
2507 info.stats.version = info.last_update;
2508 info.stats.created = info.history.epoch_created;
2509 info.stats.last_scrub = info.history.last_scrub;
2510 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2511 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2512 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2513 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2514 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2515
2516 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2517 info.stats.ondisk_log_size = info.stats.log_size;
2518 info.stats.log_start = pg_log.get_tail();
2519 info.stats.ondisk_log_start = pg_log.get_tail();
2520
2521 // If actingset is larger than upset we will have misplaced,
2522 // so we will report based on actingset size.
2523
2524 // If upset is larger than actingset, we will have degraded,
2525 // so we will report based on upset size.
2526
2527 // If target is the largest of them all, it will contribute to
2528 // the degraded count because num_object_copies is
2529 // computed using target and eventually used to get the degraded total.
2530
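  // e.g. a pool with target size 3 that is temporarily mapped to 4 distinct
  // shards across acting+up gives nrep = 4, so num_object_copies is computed
  // from 4 copies per object; the degraded/misplaced counts below are measured
  // against that figure.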
2531 unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
2532 unsigned nrep = MAX(actingset.size(), upset.size());
2533 // calc num_object_copies
2534 info.stats.stats.calc_copies(MAX(target, nrep));
2535 info.stats.stats.sum.num_objects_degraded = 0;
2536 info.stats.stats.sum.num_objects_unfound = 0;
2537 info.stats.stats.sum.num_objects_misplaced = 0;
2538 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2539 // NOTE: we only generate copies, degraded, misplaced and unfound
2540 // values for the summation, not individual stat categories.
2541 int64_t num_objects = info.stats.stats.sum.num_objects;
2542
2543 // Total sum of all missing
2544 int64_t missing = 0;
2545 // Objects that have arrived backfilled to up OSDs (not in acting)
2546 int64_t backfilled = 0;
2547 // A misplaced object is not stored on the correct OSD
2548 int64_t misplaced = 0;
2549 // Total of object copies/shards found
2550 int64_t object_copies = 0;
2551
2552 // num_objects_missing on each peer
2553 for (map<pg_shard_t, pg_info_t>::iterator pi =
2554 peer_info.begin();
2555 pi != peer_info.end();
2556 ++pi) {
2557 map<pg_shard_t, pg_missing_t>::const_iterator pm =
2558 peer_missing.find(pi->first);
2559 if (pm != peer_missing.end()) {
2560 pi->second.stats.stats.sum.num_objects_missing =
2561 pm->second.num_missing();
2562 }
2563 }
2564
2565 assert(!actingbackfill.empty());
2566 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
2567 i != actingbackfill.end();
2568 ++i) {
2569 const pg_shard_t &p = *i;
2570
2571 bool in_up = (upset.find(p) != upset.end());
2572 bool in_acting = (actingset.find(p) != actingset.end());
2573 assert(in_up || in_acting);
2574
2575 // in acting Compute total objects excluding num_missing
2576 // in acting and not in up Compute misplaced objects excluding num_missing
2577 // in up and not in acting Compute total objects already backfilled
2578 if (in_acting) {
2579 unsigned osd_missing;
2580 // primary handling
2581 if (p == pg_whoami) {
2582 osd_missing = pg_log.get_missing().num_missing();
2583 info.stats.stats.sum.num_objects_missing_on_primary =
2584 osd_missing;
2585 object_copies += num_objects; // My local (primary) count
2586 } else {
2587 assert(peer_missing.count(p));
2588 osd_missing = peer_missing[p].num_missing();
2589 object_copies += peer_info[p].stats.stats.sum.num_objects;
2590 }
2591 missing += osd_missing;
2592 // Count non-missing objects not in up as misplaced
2593 if (!in_up && num_objects > osd_missing)
2594 misplaced += num_objects - osd_missing;
2595 } else {
2596 assert(in_up && !in_acting);
2597
2598 // If this peer has more objects than it should, ignore them
2599 backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
2600 }
2601 }
2602
2603 // Any objects that have been backfilled to up OSDs can be deducted from misplaced
2604 misplaced = MAX(0, misplaced - backfilled);
2605
2606 // Deduct computed total missing on acting nodes
2607 object_copies -= missing;
2608 // Include computed backfilled objects on up nodes
2609 object_copies += backfilled;
2610 // a degraded object has fewer replicas or EC shards than the
2611 // pool specifies. num_object_copies will never be smaller than target * num_objects.
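    // Illustrative numbers: 100 objects with num_object_copies = 300; if the
    // acting shards hold 280 found copies and backfilled up shards contribute
    // another 10, then degraded = 300 - 290 = 10.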
2612 int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
2613
2614 info.stats.stats.sum.num_objects_degraded = degraded;
2615 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2616 info.stats.stats.sum.num_objects_misplaced = misplaced;
2617 }
2618}
2619
2620void PG::_update_blocked_by()
2621{
2622 // set a max on the number of blocking peers we report. if we go
2623 // over, report a random subset. keep the result sorted.
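  // The skip/keep test below is selection sampling: each entry is kept with
  // probability keep/(skip+keep) at the moment it is examined, which yields a
  // uniformly random subset of size 'keep' while preserving sorted order.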
2624 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2625 unsigned skip = blocked_by.size() - keep;
2626 info.stats.blocked_by.clear();
2627 info.stats.blocked_by.resize(keep);
2628 unsigned pos = 0;
2629 for (set<int>::iterator p = blocked_by.begin();
2630 p != blocked_by.end() && keep > 0;
2631 ++p) {
2632 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2633 --skip;
2634 } else {
2635 info.stats.blocked_by[pos++] = *p;
2636 --keep;
2637 }
2638 }
2639}
2640
2641void PG::publish_stats_to_osd()
2642{
2643 if (!is_primary())
2644 return;
2645
2646 pg_stats_publish_lock.Lock();
2647
2648 if (info.stats.stats.sum.num_scrub_errors)
2649 state_set(PG_STATE_INCONSISTENT);
2650 else
2651 state_clear(PG_STATE_INCONSISTENT);
2652
2653 utime_t now = ceph_clock_now();
2654 if (info.stats.state != state) {
2655 info.stats.last_change = now;
2656 // Optimistic estimation: if we just found out the PG is inactive,
2657 // assume it was active until now.
2658 if (!(state & PG_STATE_ACTIVE) &&
2659 (info.stats.state & PG_STATE_ACTIVE))
2660 info.stats.last_active = now;
2661
2662 if ((state & PG_STATE_ACTIVE) &&
2663 !(info.stats.state & PG_STATE_ACTIVE))
2664 info.stats.last_became_active = now;
2665 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2666 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2667 info.stats.last_became_peered = now;
2668 if (!(state & PG_STATE_CREATING) &&
2669 (info.stats.state & PG_STATE_CREATING)) {
2670 osd->send_pg_created(get_pgid().pgid);
2671 }
2672 info.stats.state = state;
2673 }
2674
2675 _update_calc_stats();
2676 _update_blocked_by();
2677
2678 bool publish = false;
2679 pg_stat_t pre_publish = info.stats;
2680 pre_publish.stats.add(unstable_stats);
2681 utime_t cutoff = now;
2682 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
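  // i.e. only re-publish if the stats changed, or the last published report
  // is older than osd_pg_stat_report_interval_max seconds.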
2683 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2684 info.stats.last_fresh > cutoff) {
2685 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2686 << ": no change since " << info.stats.last_fresh << dendl;
2687 } else {
2688 // update our stat summary and timestamps
2689 info.stats.reported_epoch = get_osdmap()->get_epoch();
2690 ++info.stats.reported_seq;
2691
2692 info.stats.last_fresh = now;
2693
2694 if (info.stats.state & PG_STATE_CLEAN)
2695 info.stats.last_clean = now;
2696 if (info.stats.state & PG_STATE_ACTIVE)
2697 info.stats.last_active = now;
2698 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2699 info.stats.last_peered = now;
2700 info.stats.last_unstale = now;
2701 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2702 info.stats.last_undegraded = now;
2703 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2704 info.stats.last_fullsized = now;
2705
2706 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2707 // care of this by sending MMonMgrReport to mon.
2708 publish =
2709 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2710 pg_stats_publish_valid = true;
2711 pg_stats_publish = pre_publish;
2712
2713 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2714 << ":" << pg_stats_publish.reported_seq << dendl;
2715 }
2716 pg_stats_publish_lock.Unlock();
2717
2718 if (publish)
2719 osd->pg_stat_queue_enqueue(this);
2720}
2721
2722void PG::clear_publish_stats()
2723{
2724 dout(15) << "clear_stats" << dendl;
2725 pg_stats_publish_lock.Lock();
2726 pg_stats_publish_valid = false;
2727 pg_stats_publish_lock.Unlock();
2728
2729 osd->pg_stat_queue_dequeue(this);
2730}
2731
2732/**
2733 * initialize a newly instantiated pg
2734 *
2735 * Initialize PG state, as when a PG is initially created, or when it
2736 * is first instantiated on the current node.
2737 *
2738 * @param role our role/rank
2739 * @param newup up set
2740 * @param newacting acting set
2741 * @param history pg history
2742 * @param pi past_intervals
2743 * @param backfill true if info should be marked as backfill
2744 * @param t transaction to write out our new state in
2745 */
2746void PG::init(
2747 int role,
2748 const vector<int>& newup, int new_up_primary,
2749 const vector<int>& newacting, int new_acting_primary,
2750 const pg_history_t& history,
2751 const PastIntervals& pi,
2752 bool backfill,
2753 ObjectStore::Transaction *t)
2754{
2755 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2756 << " history " << history
2757 << " past_intervals " << pi
2758 << dendl;
2759
2760 set_role(role);
2761 acting = newacting;
2762 up = newup;
2763 init_primary_up_acting(
2764 newup,
2765 newacting,
2766 new_up_primary,
2767 new_acting_primary);
2768
2769 info.history = history;
2770 past_intervals = pi;
2771
2772 info.stats.up = up;
2773 info.stats.up_primary = new_up_primary;
2774 info.stats.acting = acting;
2775 info.stats.acting_primary = new_acting_primary;
2776 info.stats.mapping_epoch = info.history.same_interval_since;
2777
2778 if (backfill) {
2779 dout(10) << __func__ << ": Setting backfill" << dendl;
2780 info.set_last_backfill(hobject_t());
2781 info.last_complete = info.last_update;
2782 pg_log.mark_log_for_rewrite();
2783 }
2784
2785 on_new_interval();
2786
2787 dirty_info = true;
2788 dirty_big_info = true;
2789 write_if_dirty(*t);
2790}
2791
2792#pragma GCC diagnostic ignored "-Wpragmas"
2793#pragma GCC diagnostic push
2794#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2795
2796void PG::upgrade(ObjectStore *store)
2797{
2798 assert(info_struct_v <= 10);
2799 ObjectStore::Transaction t;
2800
2801 assert(info_struct_v >= 7);
2802
2803 // 7 -> 8
2804 if (info_struct_v <= 7) {
2805 pg_log.mark_log_for_rewrite();
2806 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2807 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2808 t.remove(coll_t::meta(), log_oid);
2809 t.remove(coll_t::meta(), biginfo_oid);
2810 t.touch(coll, pgmeta_oid);
2811 }
2812
2813 // 8 -> 9
2814 if (info_struct_v <= 8) {
2815 // no special action needed.
2816 }
2817
2818 // 9 -> 10
2819 if (info_struct_v <= 9) {
2820 // previous versions weren't (as) aggressively clearing past_intervals
2821 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2822 dout(20) << __func__ << " clearing past_intervals" << dendl;
2823 past_intervals.clear();
2824 }
2825 }
2826
2827 // update infover_key
2828 if (info_struct_v < cur_struct_v) {
2829 map<string,bufferlist> v;
2830 __u8 ver = cur_struct_v;
2831 ::encode(ver, v[infover_key]);
2832 t.omap_setkeys(coll, pgmeta_oid, v);
2833 }
2834
2835 dirty_info = true;
2836 dirty_big_info = true;
2837 write_if_dirty(t);
2838
2839 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2840 ObjectStore::Sequencer>("upgrade"));
2841 int r = store->apply_transaction(osr.get(), std::move(t));
2842 if (r != 0) {
2843 derr << __func__ << ": apply_transaction returned "
2844 << cpp_strerror(r) << dendl;
2845 ceph_abort();
2846 }
2847 assert(r == 0);
2848
2849 C_SaferCond waiter;
2850 if (!osr->flush_commit(&waiter)) {
2851 waiter.wait();
2852 }
2853}
2854
2855#pragma GCC diagnostic pop
2856#pragma GCC diagnostic warning "-Wpragmas"
2857
2858int PG::_prepare_write_info(CephContext* cct,
2859 map<string,bufferlist> *km,
2860 epoch_t epoch,
2861 pg_info_t &info, pg_info_t &last_written_info,
2862 PastIntervals &past_intervals,
2863 bool dirty_big_info,
2864 bool dirty_epoch,
2865 bool try_fast_info,
2866 PerfCounters *logger)
2867{
2868 if (dirty_epoch) {
2869 ::encode(epoch, (*km)[epoch_key]);
2870 }
2871
2872 if (logger)
2873 logger->inc(l_osd_pg_info);
2874
2875 // try to do info efficiently?
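  // 'fastinfo' holds only the small, frequently-changing subset of pg_info_t
  // (roughly the version and stat fields); if applying that delta to the last
  // written info does not reproduce the current info exactly, we fall back to
  // a full info (and possibly biginfo) encode below.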
2876 if (!dirty_big_info && try_fast_info &&
2877 info.last_update > last_written_info.last_update) {
2878 pg_fast_info_t fast;
2879 fast.populate_from(info);
2880 bool did = fast.try_apply_to(&last_written_info);
2881 assert(did); // we verified last_update increased above
2882 if (info == last_written_info) {
2883 ::encode(fast, (*km)[fastinfo_key]);
2884 if (logger)
2885 logger->inc(l_osd_pg_fastinfo);
2886 return 0;
2887 }
2888 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
2889 {
2890 JSONFormatter jf(true);
2891 jf.dump_object("info", info);
2892 jf.flush(*_dout);
2893 }
2894 {
2895 *_dout << "\nlast_written_info:\n";
2896 JSONFormatter jf(true);
2897 jf.dump_object("last_written_info", last_written_info);
2898 jf.flush(*_dout);
2899 }
2900 *_dout << dendl;
2901 }
2902 last_written_info = info;
2903
2904 // info. store purged_snaps separately.
2905 interval_set<snapid_t> purged_snaps;
2906 purged_snaps.swap(info.purged_snaps);
2907 ::encode(info, (*km)[info_key]);
2908 purged_snaps.swap(info.purged_snaps);
2909
2910 if (dirty_big_info) {
2911 // potentially big stuff
2912 bufferlist& bigbl = (*km)[biginfo_key];
2913 ::encode(past_intervals, bigbl);
2914 ::encode(info.purged_snaps, bigbl);
2915 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2916 if (logger)
2917 logger->inc(l_osd_pg_biginfo);
2918 }
2919
2920 return 0;
2921}
2922
2923void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
2924{
2925 coll_t coll(pgid);
2926 t.create_collection(coll, bits);
2927}
2928
2929void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
2930{
2931 coll_t coll(pgid);
2932
2933 if (pool) {
2934 // Give a hint to the PG collection
2935 bufferlist hint;
2936 uint32_t pg_num = pool->get_pg_num();
2937 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
2938 ::encode(pg_num, hint);
2939 ::encode(expected_num_objects_pg, hint);
2940 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
2941 t.collection_hint(coll, hint_type, hint);
2942 }
2943
2944 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2945 t.touch(coll, pgmeta_oid);
2946 map<string,bufferlist> values;
2947 __u8 struct_v = cur_struct_v;
2948 ::encode(struct_v, values[infover_key]);
2949 t.omap_setkeys(coll, pgmeta_oid, values);
2950}
2951
2952void PG::prepare_write_info(map<string,bufferlist> *km)
2953{
2954 info.stats.stats.add(unstable_stats);
2955 unstable_stats.clear();
2956
2957 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
2958 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
2959 info,
2960 last_written_info,
2961 past_intervals,
2962 dirty_big_info, need_update_epoch,
2963 cct->_conf->osd_fast_info,
2964 osd->logger);
2965 assert(ret == 0);
2966 if (need_update_epoch)
2967 last_epoch = get_osdmap()->get_epoch();
2968 last_persisted_osdmap_ref = osdmap_ref;
2969
2970 dirty_info = false;
2971 dirty_big_info = false;
2972}
2973
2974#pragma GCC diagnostic ignored "-Wpragmas"
2975#pragma GCC diagnostic push
2976#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2977
2978bool PG::_has_removal_flag(ObjectStore *store,
2979 spg_t pgid)
2980{
2981 coll_t coll(pgid);
2982 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2983
2984 // first try new way
2985 set<string> keys;
2986 keys.insert("_remove");
2987 map<string,bufferlist> values;
2988 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
2989 values.size() == 1)
2990 return true;
2991
2992 return false;
2993}
2994
2995int PG::peek_map_epoch(ObjectStore *store,
2996 spg_t pgid,
2997 epoch_t *pepoch,
2998 bufferlist *bl)
2999{
3000 coll_t coll(pgid);
3001 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3002 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3003 epoch_t cur_epoch = 0;
3004
3005 assert(bl);
3006 {
3007 // validate collection name
3008 assert(coll.is_pg());
3009 }
3010
3011 // try for v8
3012 set<string> keys;
3013 keys.insert(infover_key);
3014 keys.insert(epoch_key);
3015 map<string,bufferlist> values;
3016 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3017 if (r == 0) {
3018 assert(values.size() == 2);
3019
3020 // sanity check version
3021 bufferlist::iterator bp = values[infover_key].begin();
3022 __u8 struct_v = 0;
3023 ::decode(struct_v, bp);
3024 assert(struct_v >= 8);
3025
3026 // get epoch
3027 bp = values[epoch_key].begin();
3028 ::decode(cur_epoch, bp);
3029 } else {
3030 // probably bug 10617; see OSD::load_pgs()
3031 return -1;
3032 }
3033
3034 *pepoch = cur_epoch;
3035 return 0;
3036}
3037
3038#pragma GCC diagnostic pop
3039#pragma GCC diagnostic warning "-Wpragmas"
3040
3041void PG::write_if_dirty(ObjectStore::Transaction& t)
3042{
3043 map<string,bufferlist> km;
3044 if (dirty_big_info || dirty_info)
3045 prepare_write_info(&km);
3046 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3047 if (!km.empty())
3048 t.omap_setkeys(coll, pgmeta_oid, km);
3049}
3050
3051void PG::trim_log()
3052{
3053 assert(is_primary());
3054 calc_trim_to();
3055 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3056 if (pg_trim_to != eversion_t()) {
3057 // inform peers to trim log
3058 assert(!actingbackfill.empty());
3059 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3060 i != actingbackfill.end();
3061 ++i) {
3062 if (*i == pg_whoami) continue;
3063 osd->send_message_osd_cluster(
3064 i->osd,
3065 new MOSDPGTrim(
3066 get_osdmap()->get_epoch(),
3067 spg_t(info.pgid.pgid, i->shard),
3068 pg_trim_to),
3069 get_osdmap()->get_epoch());
3070 }
3071
3072 // trim primary as well
3073 pg_log.trim(pg_trim_to, info);
3074 dirty_info = true;
3075 }
3076}
3077
3078void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3079{
3080 // raise last_complete only if we were previously up to date
3081 if (info.last_complete == info.last_update)
3082 info.last_complete = e.version;
3083
3084 // raise last_update.
3085 assert(e.version > info.last_update);
3086 info.last_update = e.version;
3087
3088 // raise user_version, if it increased (it may not have been bumped
3089 // by all logged updates)
3090 if (e.user_version > info.last_user_version)
3091 info.last_user_version = e.user_version;
3092
3093 // log mutation
3094 pg_log.add(e, applied);
3095 dout(10) << "add_log_entry " << e << dendl;
3096}
3097
3098
3099void PG::append_log(
3100 const vector<pg_log_entry_t>& logv,
3101 eversion_t trim_to,
3102 eversion_t roll_forward_to,
3103 ObjectStore::Transaction &t,
3104 bool transaction_applied)
3105{
3106 if (transaction_applied)
3107 update_snap_map(logv, t);
3108
3109 /* The primary has sent an info updating the history, but it may not
3110 * have arrived yet. We want to make sure that we cannot remember this
3111 * write without remembering that it happened in an interval which went
3112 * active in epoch history.last_epoch_started.
3113 */
3114 if (info.last_epoch_started != info.history.last_epoch_started) {
3115 info.history.last_epoch_started = info.last_epoch_started;
3116 }
3117 if (info.last_interval_started != info.history.last_interval_started) {
3118 info.history.last_interval_started = info.last_interval_started;
3119 }
3120 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3121
3122 PGLogEntryHandler handler{this, &t};
3123 if (!transaction_applied) {
3124 /* We must be a backfill peer, so it's ok if we apply
3125 * out-of-turn since we won't be considered when
3126 * determining a min possible last_update.
3127 */
3128 pg_log.roll_forward(&handler);
3129 }
3130
3131 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3132 p != logv.end();
3133 ++p) {
3134 add_log_entry(*p, transaction_applied);
3135
3136 /* We don't want to leave the rollforward artifacts around
3137 * here past last_backfill. It's ok for the same reason as
3138 * above */
3139 if (transaction_applied &&
3140 p->soid > info.last_backfill) {
3141 pg_log.roll_forward(&handler);
3142 }
3143 }
3144 auto last = logv.rbegin();
3145 if (is_primary() && last != logv.rend()) {
3146 projected_log.skip_can_rollback_to_to_head();
3147 projected_log.trim(cct, last->version, nullptr);
3148 }
3149
3150 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3151 pg_log.roll_forward_to(
3152 roll_forward_to,
3153 &handler);
3154 t.register_on_applied(
3155 new C_UpdateLastRollbackInfoTrimmedToApplied(
3156 this,
3157 get_osdmap()->get_epoch(),
3158 roll_forward_to));
3159 }
3160
3161 pg_log.trim(trim_to, info);
3162
3163 // update the local pg, pg log
3164 dirty_info = true;
3165 write_if_dirty(t);
3166}
3167
3168bool PG::check_log_for_corruption(ObjectStore *store)
3169{
3170 /// TODO: this method needs to work with the omap log
3171 return true;
3172}
3173
3174//! Get the name we're going to save our corrupt pg log as
3175std::string PG::get_corrupt_pg_log_name() const
3176{
3177 const int MAX_BUF = 512;
3178 char buf[MAX_BUF];
3179 struct tm tm_buf;
3180 time_t my_time(time(NULL));
3181 const struct tm *t = localtime_r(&my_time, &tm_buf);
3182 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3183 if (ret == 0) {
3184 dout(0) << "strftime failed" << dendl;
3185 return "corrupt_log_unknown_time";
3186 }
3187 string out(buf);
3188 out += stringify(info.pgid);
3189 return out;
3190}
3191
3192int PG::read_info(
3193 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3194 pg_info_t &info, PastIntervals &past_intervals,
3195 __u8 &struct_v)
3196{
3197 // try for v8 or later
3198 set<string> keys;
3199 keys.insert(infover_key);
3200 keys.insert(info_key);
3201 keys.insert(biginfo_key);
3202 keys.insert(fastinfo_key);
3203 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3204 map<string,bufferlist> values;
3205 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3206 if (r == 0) {
3207 assert(values.size() == 3 ||
3208 values.size() == 4);
3209
3210 bufferlist::iterator p = values[infover_key].begin();
3211 ::decode(struct_v, p);
3212 assert(struct_v >= 8);
3213
3214 p = values[info_key].begin();
3215 ::decode(info, p);
3216
3217 p = values[biginfo_key].begin();
3218 if (struct_v >= 10) {
3219 ::decode(past_intervals, p);
3220 } else {
3221 past_intervals.decode_classic(p);
3222 }
3223 ::decode(info.purged_snaps, p);
3224
3225 p = values[fastinfo_key].begin();
3226 if (!p.end()) {
3227 pg_fast_info_t fast;
3228 ::decode(fast, p);
3229 fast.try_apply_to(&info);
3230 }
3231 return 0;
3232 }
3233
3234 // legacy (ver < 8)
3235 ghobject_t infos_oid(OSD::make_infos_oid());
3236 bufferlist::iterator p = bl.begin();
3237 ::decode(struct_v, p);
3238 assert(struct_v == 7);
3239
3240 // get info out of leveldb
3241 string k = get_info_key(info.pgid);
3242 string bk = get_biginfo_key(info.pgid);
3243 keys.clear();
3244 keys.insert(k);
3245 keys.insert(bk);
3246 values.clear();
3247 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3248 assert(values.size() == 2);
3249
3250 p = values[k].begin();
3251 ::decode(info, p);
3252
3253 p = values[bk].begin();
3254 ::decode(past_intervals, p);
3255 interval_set<snapid_t> snap_collections; // obsolete
3256 ::decode(snap_collections, p);
3257 ::decode(info.purged_snaps, p);
3258 return 0;
3259}
3260
3261void PG::read_state(ObjectStore *store, bufferlist &bl)
3262{
3263 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3264 info_struct_v);
3265 assert(r >= 0);
3266
3267 last_written_info = info;
3268
3269 ostringstream oss;
3270 pg_log.read_log_and_missing(
3271 store,
3272 coll,
3273 info_struct_v < 8 ? coll_t::meta() : coll,
3274 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3275 info,
3276 oss,
3277 cct->_conf->osd_ignore_stale_divergent_priors,
3278 cct->_conf->osd_debug_verify_missing_on_start);
3279 if (oss.tellp())
3280 osd->clog->error() << oss.rdbuf();
3281
3282 // log any weirdness
3283 log_weirdness();
3284}
3285
3286void PG::log_weirdness()
3287{
3288 if (pg_log.get_tail() != info.log_tail)
3289 osd->clog->error() << info.pgid
3290 << " info mismatch, log.tail " << pg_log.get_tail()
3291 << " != info.log_tail " << info.log_tail;
3292 if (pg_log.get_head() != info.last_update)
3293 osd->clog->error() << info.pgid
3294 << " info mismatch, log.head " << pg_log.get_head()
3295 << " != info.last_update " << info.last_update;
3296
3297 if (!pg_log.get_log().empty()) {
3298 // sloppy check
3299 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3300 osd->clog->error() << info.pgid
3301 << " log bound mismatch, info (" << pg_log.get_tail() << ","
3302 << pg_log.get_head() << "]"
3303 << " actual ["
3304 << pg_log.get_log().log.begin()->version << ","
3305 << pg_log.get_log().log.rbegin()->version << "]";
3306 }
3307
3308 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3309 osd->clog->error() << info.pgid
3310 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3311 << " > log size " << pg_log.get_log().log.size();
3312 }
3313}
3314
3315void PG::update_snap_map(
3316 const vector<pg_log_entry_t> &log_entries,
3317 ObjectStore::Transaction &t)
3318{
3319 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3320 i != log_entries.end();
3321 ++i) {
3322 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3323 if (i->soid.snap < CEPH_MAXSNAP) {
3324 if (i->is_delete()) {
3325 int r = snap_mapper.remove_oid(
3326 i->soid,
3327 &_t);
3328 assert(r == 0);
3329 } else if (i->is_update()) {
3330 assert(i->snaps.length() > 0);
3331 vector<snapid_t> snaps;
3332 bufferlist snapbl = i->snaps;
3333 bufferlist::iterator p = snapbl.begin();
3334 try {
3335 ::decode(snaps, p);
3336 } catch (...) {
3337 snaps.clear();
3338 }
3339 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3340
3341 if (i->is_clone() || i->is_promote()) {
3342 snap_mapper.add_oid(
3343 i->soid,
3344 _snaps,
3345 &_t);
3346 } else if (i->is_modify()) {
3347 assert(i->is_modify());
3348 int r = snap_mapper.update_snaps(
3349 i->soid,
3350 _snaps,
3351 0,
3352 &_t);
3353 assert(r == 0);
3354 } else {
3355 assert(i->is_clean());
3356 }
3357 }
3358 }
3359 }
3360}
3361
3362/**
3363 * filter trimming|trimmed snaps out of snapcontext
3364 */
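// e.g. with snaps = [8, 5, 2] and snap 5 already trimmed or purged, the vector
// is rewritten in place to [8, 2], preserving the original relative order.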
3365void PG::filter_snapc(vector<snapid_t> &snaps)
3366{
3367 //nothing needs to trim, we can return immediately
3368 if(snap_trimq.empty() && info.purged_snaps.empty())
3369 return;
3370
3371 bool filtering = false;
3372 vector<snapid_t> newsnaps;
3373 for (vector<snapid_t>::iterator p = snaps.begin();
3374 p != snaps.end();
3375 ++p) {
3376 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3377 if (!filtering) {
3378 // start building a new vector with what we've seen so far
3379 dout(10) << "filter_snapc filtering " << snaps << dendl;
3380 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3381 filtering = true;
3382 }
3383 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3384 } else {
3385 if (filtering)
3386 newsnaps.push_back(*p); // continue building new vector
3387 }
3388 }
3389 if (filtering) {
3390 snaps.swap(newsnaps);
3391 dout(10) << "filter_snapc result " << snaps << dendl;
3392 }
3393}
3394
3395void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3396{
3397 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3398 it != m.end();
3399 ++it)
3400 requeue_ops(it->second);
3401 m.clear();
3402}
3403
3404void PG::requeue_op(OpRequestRef op)
3405{
3406 auto p = waiting_for_map.find(op->get_source());
3407 if (p != waiting_for_map.end()) {
3408 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3409 << dendl;
3410 p->second.push_front(op);
3411 } else {
3412 dout(20) << __func__ << " " << op << dendl;
3413 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3414 }
3415}
3416
3417void PG::requeue_ops(list<OpRequestRef> &ls)
3418{
3419 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3420 i != ls.rend();
3421 ++i) {
3422 auto p = waiting_for_map.find((*i)->get_source());
3423 if (p != waiting_for_map.end()) {
3424 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3425 << ")" << dendl;
3426 p->second.push_front(*i);
3427 } else {
3428 dout(20) << __func__ << " " << *i << dendl;
3429 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3430 }
3431 }
3432 ls.clear();
3433}
3434
3435void PG::requeue_map_waiters()
3436{
3437 epoch_t epoch = get_osdmap()->get_epoch();
3438 auto p = waiting_for_map.begin();
3439 while (p != waiting_for_map.end()) {
3440 if (epoch < p->second.front()->min_epoch) {
3441 dout(20) << __func__ << " " << p->first << " front op "
3442 << p->second.front() << " must still wait, doing nothing"
3443 << dendl;
3444 ++p;
3445 } else {
3446 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3447 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3448 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3449 }
3450 p = waiting_for_map.erase(p);
3451 }
3452 }
3453}
3454
3455
3456// ==========================================================================================
3457// SCRUB
3458
3459/*
3460 * when holding pg and sched_scrub_lock, then the states are:
3461 * scheduling:
3462 * scrubber.reserved = true
3463 * scrubber.reserved_peers includes whoami
3464 * osd->scrub_pending++
3465 * scheduling, replica declined:
3466 * scrubber.reserved = true
3467 * scrubber.reserved_peers includes -1
3468 * osd->scrub_pending++
3469 * pending:
3470 * scrubber.reserved = true
3471 * scrubber.reserved_peers.size() == acting.size();
3472 * pg on scrub_wq
3473 * osd->scrub_pending++
3474 * scrubbing:
3475 * scrubber.reserved = false;
3476 * scrubber.reserved_peers empty
3477 * osd->scrubber.active++
3478 */
3479
3480// returns true if a scrub has been newly kicked off
3481bool PG::sched_scrub()
3482{
3483 bool nodeep_scrub = false;
3484 assert(is_locked());
3485 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3486 return false;
3487 }
3488
3489 double deep_scrub_interval = 0;
3490 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3491 if (deep_scrub_interval <= 0) {
3492 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3493 }
3494 bool time_for_deep = ceph_clock_now() >=
3495 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3496
3497 bool deep_coin_flip = false;
3498 // Only add random deep scrubs when the scrub is NOT user initiated
3499 if (!scrubber.must_scrub)
3500 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
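    // e.g. with osd_deep_scrub_randomize_ratio = 0.15, roughly 15% of
    // regularly scheduled scrubs are randomly upgraded to deep scrubs.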
3501 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3502
3503 time_for_deep = (time_for_deep || deep_coin_flip);
3504
3505 //NODEEP_SCRUB so ignore time initiated deep-scrub
3506 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3507 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3508 time_for_deep = false;
3509 nodeep_scrub = true;
3510 }
3511
3512 if (!scrubber.must_scrub) {
3513 assert(!scrubber.must_deep_scrub);
3514
3515 //NOSCRUB so skip regular scrubs
3516 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3517 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3518 if (scrubber.reserved) {
3519 // cancel the scrub if it is still being scheduled,
3520 // so PGs from other pools where scrubs are still legal
3521 // get a chance to go ahead with scrubbing.
3522 clear_scrub_reserved();
3523 scrub_unreserve_replicas();
3524 }
3525 return false;
3526 }
3527 }
3528
3529 if (cct->_conf->osd_scrub_auto_repair
3530 && get_pgbackend()->auto_repair_supported()
3531 && time_for_deep
3532 // respect the command from user, and not do auto-repair
3533 && !scrubber.must_repair
3534 && !scrubber.must_scrub
3535 && !scrubber.must_deep_scrub) {
3536 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3537 scrubber.auto_repair = true;
3538 } else {
3539 // this happens when the user issues a scrub/repair command during
3540 // the scheduling of the scrub/repair (e.g. while requesting reservations)
3541 scrubber.auto_repair = false;
3542 }
3543
3544 bool ret = true;
3545 if (!scrubber.reserved) {
3546 assert(scrubber.reserved_peers.empty());
3547 if (osd->inc_scrubs_pending()) {
3548 dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
3549 scrubber.reserved = true;
3550 scrubber.reserved_peers.insert(pg_whoami);
3551 scrub_reserve_replicas();
3552 } else {
3553 dout(20) << "sched_scrub: failed to reserve locally" << dendl;
3554 ret = false;
3555 }
3556 }
3557 if (scrubber.reserved) {
3558 if (scrubber.reserve_failed) {
3559 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3560 clear_scrub_reserved();
3561 scrub_unreserve_replicas();
3562 ret = false;
3563 } else if (scrubber.reserved_peers.size() == acting.size()) {
3564 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3565 if (time_for_deep) {
3566 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3567 state_set(PG_STATE_DEEP_SCRUB);
3568 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3569 if (!nodeep_scrub) {
3570 osd->clog->info() << "osd." << osd->whoami
3571 << " pg " << info.pgid
3572 << " Deep scrub errors, upgrading scrub to deep-scrub";
3573 state_set(PG_STATE_DEEP_SCRUB);
3574 } else if (!scrubber.must_scrub) {
3575 osd->clog->error() << "osd." << osd->whoami
3576 << " pg " << info.pgid
3577 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3578 clear_scrub_reserved();
3579 scrub_unreserve_replicas();
3580 return false;
3581 } else {
3582 osd->clog->error() << "osd." << osd->whoami
3583 << " pg " << info.pgid
3584 << " Regular scrub request, losing deep-scrub details";
3585 }
3586 }
3587 queue_scrub();
3588 } else {
3589 // none declined, since scrubber.reserved is set
3590 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3591 }
3592 }
3593
3594 return ret;
3595}
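/*
 * Illustrative sketch (not part of the original PG.cc): the deep-scrub
 * decision in sched_scrub() above boils down to "enough time has passed
 * since the last deep scrub, or a random coin flip succeeded".  The helper
 * below is a hypothetical, self-contained rendering of that rule; the
 * names last_deep, interval and randomize_ratio are assumptions for the
 * example, not Ceph configuration options.
 */
#if 0
#include <cstdlib>
#include <ctime>

// Returns true if the next scrub should be upgraded to a deep scrub.
static bool should_deep_scrub(std::time_t now,
                              std::time_t last_deep,
                              double interval,          // seconds between deep scrubs
                              double randomize_ratio)   // e.g. 0.15 == 15% chance
{
  bool overdue = std::difftime(now, last_deep) >= interval;
  bool coin_flip = (std::rand() % 100) < randomize_ratio * 100;
  return overdue || coin_flip;
}
#endif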
3596
3597void PG::reg_next_scrub()
3598{
3599 if (!is_primary())
3600 return;
3601
3602 utime_t reg_stamp;
3603 if (scrubber.must_scrub ||
3604 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3605 reg_stamp = ceph_clock_now();
3606 } else {
3607 reg_stamp = info.history.last_scrub_stamp;
3608 }
3609 // note down the sched_time, so we can locate this scrub, and remove it
3610 // later on.
3611 double scrub_min_interval = 0, scrub_max_interval = 0;
3612 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3613 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3614 assert(scrubber.scrub_reg_stamp == utime_t());
3615 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3616 reg_stamp,
3617 scrub_min_interval,
3618 scrub_max_interval,
3619 scrubber.must_scrub);
3620}
3621
3622void PG::unreg_next_scrub()
3623{
3624 if (is_primary()) {
3625 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3626 scrubber.scrub_reg_stamp = utime_t();
3627 }
3628}
3629
3630void PG::do_replica_scrub_map(OpRequestRef op)
3631{
3632 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3633 dout(7) << __func__ << " " << *m << dendl;
3634 if (m->map_epoch < info.history.same_interval_since) {
3635 dout(10) << __func__ << " discarding old from "
3636 << m->map_epoch << " < " << info.history.same_interval_since
3637 << dendl;
3638 return;
3639 }
3640 if (!scrubber.is_chunky_scrub_active()) {
3641 dout(10) << __func__ << " scrub isn't active" << dendl;
3642 return;
3643 }
3644
3645 op->mark_started();
3646
3647 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3648 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3649 dout(10) << "map version is "
3650 << scrubber.received_maps[m->from].valid_through
3651 << dendl;
3652
3653 --scrubber.waiting_on;
3654 scrubber.waiting_on_whom.erase(m->from);
3655 if (scrubber.waiting_on == 0) {
3656 if (ops_blocked_by_scrub()) {
3657 requeue_scrub(true);
3658 } else {
3659 requeue_scrub(false);
3660 }
3661 }
3662}
3663
3664void PG::sub_op_scrub_map(OpRequestRef op)
3665{
3666 // for legacy jewel compatibility only
3667 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3668 assert(m->get_type() == MSG_OSD_SUBOP);
3669 dout(7) << "sub_op_scrub_map" << dendl;
3670
3671 if (m->map_epoch < info.history.same_interval_since) {
3672 dout(10) << "sub_op_scrub discarding old sub_op from "
3673 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3674 return;
3675 }
3676
3677 if (!scrubber.is_chunky_scrub_active()) {
3678 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3679 return;
3680 }
3681
3682 op->mark_started();
3683
3684 dout(10) << " got " << m->from << " scrub map" << dendl;
3685 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3686
3687 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3688 dout(10) << "map version is "
3689 << scrubber.received_maps[m->from].valid_through
3690 << dendl;
3691
3692 --scrubber.waiting_on;
3693 scrubber.waiting_on_whom.erase(m->from);
3694
3695 if (scrubber.waiting_on == 0) {
3696 if (ops_blocked_by_scrub()) {
3697 requeue_scrub(true);
3698 } else {
3699 requeue_scrub(false);
3700 }
3701 }
3702}
3703
3704// send scrub v3 messages (chunky scrub)
3705void PG::_request_scrub_map(
3706 pg_shard_t replica, eversion_t version,
3707 hobject_t start, hobject_t end,
3708 bool deep, uint32_t seed)
3709{
3710 assert(replica != pg_whoami);
3711 dout(10) << "scrub requesting scrubmap from osd." << replica
3712 << " deep " << (int)deep << " seed " << seed << dendl;
3713 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3714 spg_t(info.pgid.pgid, replica.shard), version,
3715 get_osdmap()->get_epoch(),
3716 get_last_peering_reset(),
3717 start, end, deep, seed);
3718 // default priority, we want the rep scrub processed prior to any recovery
3719 // or client io messages (we are holding a lock!)
3720 osd->send_message_osd_cluster(
3721 replica.osd, repscrubop, get_osdmap()->get_epoch());
3722}
3723
3724void PG::handle_scrub_reserve_request(OpRequestRef op)
3725{
3726 dout(7) << __func__ << " " << *op->get_req() << dendl;
3727 op->mark_started();
3728 if (scrubber.reserved) {
3729 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3730 << dendl;
3731 return;
3732 }
3733 scrubber.reserved = osd->inc_scrubs_pending();
3734 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3735 const MOSDScrubReserve *m =
3736 static_cast<const MOSDScrubReserve*>(op->get_req());
3737 Message *reply = new MOSDScrubReserve(
3738 spg_t(info.pgid.pgid, primary.shard),
3739 m->map_epoch,
3740 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3741 pg_whoami);
3742 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3743 } else {
3744 // for jewel compat only
3745 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3746 assert(req->get_type() == MSG_OSD_SUBOP);
3747 MOSDSubOpReply *reply = new MOSDSubOpReply(
3748 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3749 ::encode(scrubber.reserved, reply->get_data());
3750 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3751 }
3752}
3753
3754void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3755{
3756 dout(7) << __func__ << " " << *op->get_req() << dendl;
3757 op->mark_started();
3758 if (!scrubber.reserved) {
3759 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3760 return;
3761 }
3762 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3763 dout(10) << " already had osd." << from << " reserved" << dendl;
3764 } else {
3765 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3766 scrubber.reserved_peers.insert(from);
3767 sched_scrub();
3768 }
3769}
3770
3771void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3772{
3773 dout(7) << __func__ << " " << *op->get_req() << dendl;
3774 op->mark_started();
3775 if (!scrubber.reserved) {
3776 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3777 return;
3778 }
3779 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3780 dout(10) << " already had osd." << from << " reserved" << dendl;
3781 } else {
3782 /* One decline stops this pg from being scheduled for scrubbing. */
3783 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3784 scrubber.reserve_failed = true;
3785 sched_scrub();
3786 }
3787}
3788
3789void PG::handle_scrub_reserve_release(OpRequestRef op)
3790{
3791 dout(7) << __func__ << " " << *op->get_req() << dendl;
3792 op->mark_started();
3793 clear_scrub_reserved();
3794}
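/*
 * Illustrative sketch (not part of the original PG.cc): the reservation
 * handshake handled above can be summarised as "collect GRANTs from every
 * acting shard; a single REJECT fails the whole reservation".  This is a
 * minimal stand-alone model of that bookkeeping with hypothetical names
 * (ReservationTracker, on_grant, on_reject); it is not the OSD messaging
 * code itself.
 */
#if 0
#include <cstddef>
#include <set>

struct ReservationTracker {
  std::set<int> granted;     // shards that replied GRANT (including self)
  bool any_reject = false;   // one REJECT is enough to fail

  void on_grant(int shard)  { granted.insert(shard); }
  void on_reject(int)       { any_reject = true; }

  // Ready to queue the scrub only when every acting shard has granted.
  bool all_reserved(std::size_t acting_size) const {
    return !any_reject && granted.size() == acting_size;
  }
};
#endif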
3795
3796void PG::reject_reservation()
3797{
3798 osd->send_message_osd_cluster(
3799 primary.osd,
3800 new MBackfillReserve(
3801 MBackfillReserve::REJECT,
3802 spg_t(info.pgid.pgid, primary.shard),
3803 get_osdmap()->get_epoch()),
3804 get_osdmap()->get_epoch());
3805}
3806
3807void PG::schedule_backfill_full_retry()
3808{
3809 Mutex::Locker lock(osd->recovery_request_lock);
3810 osd->recovery_request_timer.add_event_after(
3811 cct->_conf->osd_backfill_retry_interval,
3812 new QueuePeeringEvt<RequestBackfill>(
3813 this, get_osdmap()->get_epoch(),
3814 RequestBackfill()));
3815}
3816
3817void PG::schedule_recovery_full_retry()
3818{
3819 Mutex::Locker lock(osd->recovery_request_lock);
3820 osd->recovery_request_timer.add_event_after(
3821 cct->_conf->osd_recovery_retry_interval,
3822 new QueuePeeringEvt<DoRecovery>(
3823 this, get_osdmap()->get_epoch(),
3824 DoRecovery()));
3825}
3826
3827void PG::clear_scrub_reserved()
3828{
3829 scrubber.reserved_peers.clear();
3830 scrubber.reserve_failed = false;
3831
3832 if (scrubber.reserved) {
3833 scrubber.reserved = false;
3834 osd->dec_scrubs_pending();
3835 }
3836}
3837
3838void PG::scrub_reserve_replicas()
3839{
3840 assert(backfill_targets.empty());
3841 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3842 i != actingbackfill.end();
3843 ++i) {
3844 if (*i == pg_whoami) continue;
3845 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3846 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3847 osd->send_message_osd_cluster(
3848 i->osd,
3849 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3850 get_osdmap()->get_epoch(),
3851 MOSDScrubReserve::REQUEST, pg_whoami),
3852 get_osdmap()->get_epoch());
3853 } else {
3854 // for jewel compat only
3855 vector<OSDOp> scrub(1);
3856 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
3857 hobject_t poid;
3858 eversion_t v;
3859 osd_reqid_t reqid;
3860 MOSDSubOp *subop = new MOSDSubOp(
3861 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3862 get_osdmap()->get_epoch(), osd->get_tid(), v);
3863 subop->ops = scrub;
3864 osd->send_message_osd_cluster(
3865 i->osd, subop, get_osdmap()->get_epoch());
3866 }
3867 }
3868}
3869
3870void PG::scrub_unreserve_replicas()
3871{
3872 assert(backfill_targets.empty());
3873 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3874 i != actingbackfill.end();
3875 ++i) {
3876 if (*i == pg_whoami) continue;
3877 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
3878 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3879 osd->send_message_osd_cluster(
3880 i->osd,
3881 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3882 get_osdmap()->get_epoch(),
3883 MOSDScrubReserve::RELEASE, pg_whoami),
3884 get_osdmap()->get_epoch());
3885 } else {
3886 // for jewel compat only
3887 vector<OSDOp> scrub(1);
3888 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
3889 hobject_t poid;
3890 eversion_t v;
3891 osd_reqid_t reqid;
3892 MOSDSubOp *subop = new MOSDSubOp(
3893 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3894 get_osdmap()->get_epoch(), osd->get_tid(), v);
3895 subop->ops = scrub;
3896 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
3897 }
3898 }
3899}
3900
3901void PG::_scan_rollback_obs(
3902 const vector<ghobject_t> &rollback_obs,
3903 ThreadPool::TPHandle &handle)
3904{
3905 ObjectStore::Transaction t;
3906 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
3907 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
3908 i != rollback_obs.end();
3909 ++i) {
3910 if (i->generation < trimmed_to.version) {
3911 osd->clog->error() << "osd." << osd->whoami
3912 << " pg " << info.pgid
3913 << " found obsolete rollback obj "
3914 << *i << " generation < trimmed_to "
3915 << trimmed_to
3916 << "...repaired";
3917 t.remove(coll, *i);
3918 }
3919 }
3920 if (!t.empty()) {
3921 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
3922 << dendl;
3923 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3924 }
3925}
3926
3927void PG::_scan_snaps(ScrubMap &smap)
3928{
3929 hobject_t head;
3930 SnapSet snapset;
3931 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
3932 i != smap.objects.rend();
3933 ++i) {
3934 const hobject_t &hoid = i->first;
3935 ScrubMap::object &o = i->second;
3936
3937 if (hoid.is_head() || hoid.is_snapdir()) {
3938 // parse the SnapSet
3939 bufferlist bl;
3940 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
3941 continue;
3942 }
3943 bl.push_back(o.attrs[SS_ATTR]);
3944 auto p = bl.begin();
3945 try {
3946 ::decode(snapset, p);
3947 } catch(...) {
3948 continue;
3949 }
3950 head = hoid.get_head();
3951 continue;
3952 }
3953 if (hoid.snap < CEPH_MAXSNAP) {
3954 // check and if necessary fix snap_mapper
3955 if (hoid.get_head() != head) {
3956 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
3957 << dendl;
3958 continue;
3959 }
3960 set<snapid_t> obj_snaps;
3961 if (!snapset.is_legacy()) {
3962 auto p = snapset.clone_snaps.find(hoid.snap);
3963 if (p == snapset.clone_snaps.end()) {
3964 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
3965 << dendl;
3966 continue;
3967 }
3968 obj_snaps.insert(p->second.begin(), p->second.end());
3969 } else {
3970 bufferlist bl;
3971 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
3972 continue;
3973 }
3974 bl.push_back(o.attrs[OI_ATTR]);
3975 object_info_t oi;
3976 try {
3977 oi.decode(bl);
3978 } catch(...) {
3979 continue;
3980 }
3981 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
3982 }
3983 set<snapid_t> cur_snaps;
3984 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
3985 if (r != 0 && r != -ENOENT) {
3986 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
3987 ceph_abort();
3988 }
3989 if (r == -ENOENT || cur_snaps != obj_snaps) {
3990 ObjectStore::Transaction t;
3991 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3992 if (r == 0) {
3993 r = snap_mapper.remove_oid(hoid, &_t);
3994 if (r != 0) {
3995 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
3996 << dendl;
3997 ceph_abort();
3998 }
3999 osd->clog->error() << "osd." << osd->whoami
4000 << " found snap mapper error on pg "
4001 << info.pgid
4002 << " oid " << hoid << " snaps in mapper: "
4003 << cur_snaps << ", oi: "
4004 << obj_snaps
4005 << "...repaired";
4006 } else {
4007 osd->clog->error() << "osd." << osd->whoami
4008 << " found snap mapper error on pg "
4009 << info.pgid
4010 << " oid " << hoid << " snaps missing in mapper"
4011 << ", should be: "
4012 << obj_snaps
4013 << "...repaired";
4014 }
4015 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4016 r = osd->store->apply_transaction(osr.get(), std::move(t));
4017 if (r != 0) {
4018 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4019 << dendl;
4020 }
4021 }
4022 }
4023 }
4024}
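/*
 * Illustrative sketch (not part of the original PG.cc): _scan_snaps()
 * above effectively asks "does the snap set recorded on the object match
 * the snap set recorded in the snap mapper, and if not, rewrite the
 * mapper entry from the authoritative object metadata".  The toy helper
 * below shows that compare-and-repair shape with plain std::set; SnapStore
 * is a hypothetical stand-in for SnapMapper, not a Ceph class.
 */
#if 0
#include <cstdint>
#include <set>

using snap_set = std::set<uint64_t>;

struct SnapStore {              // hypothetical key/value view of the mapper
  snap_set stored;
  bool lookup(snap_set *out) const { *out = stored; return !stored.empty(); }
  void replace(const snap_set &s)  { stored = s; }
};

// Returns true if a repair was necessary.
static bool reconcile_snaps(SnapStore &store, const snap_set &expected)
{
  snap_set current;
  bool found = store.lookup(&current);
  if (found && current == expected)
    return false;               // already consistent
  store.replace(expected);      // missing or wrong -> rewrite from authority
  return true;
}
#endif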
4025
4026void PG::_repair_oinfo_oid(ScrubMap &smap)
4027{
4028 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4029 i != smap.objects.rend();
4030 ++i) {
4031 const hobject_t &hoid = i->first;
4032 ScrubMap::object &o = i->second;
4033
4034 bufferlist bl;
4035 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4036 continue;
4037 }
4038 bl.push_back(o.attrs[OI_ATTR]);
4039 object_info_t oi;
4040 try {
4041 oi.decode(bl);
4042 } catch(...) {
4043 continue;
4044 }
4045 if (oi.soid != hoid) {
4046 ObjectStore::Transaction t;
4047 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4048 osd->clog->error() << "osd." << osd->whoami
4049 << " found object info error on pg "
4050 << info.pgid
4051 << " oid " << hoid << " oid in object info: "
4052 << oi.soid
4053 << "...repaired";
4054 // Fix object info
4055 oi.soid = hoid;
4056 bl.clear();
4057 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4058
4059 bufferptr bp(bl.c_str(), bl.length());
4060 o.attrs[OI_ATTR] = bp;
4061
4062 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4063 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4064 if (r != 0) {
4065 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4066 << dendl;
4067 }
4068 }
4069 }
4070}
4071
4072/*
4073 * build a scrub map over a chunk without releasing the lock
4074 * only used by chunky scrub
4075 */
4076int PG::build_scrub_map_chunk(
4077 ScrubMap &map,
4078 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4079 ThreadPool::TPHandle &handle)
4080{
4081 dout(10) << __func__ << " [" << start << "," << end << ") "
4082 << " seed " << seed << dendl;
4083
4084 map.valid_through = info.last_update;
4085
4086 // objects
4087 vector<hobject_t> ls;
4088 vector<ghobject_t> rollback_obs;
4089 int ret = get_pgbackend()->objects_list_range(
4090 start,
4091 end,
4092 0,
4093 &ls,
4094 &rollback_obs);
4095 if (ret < 0) {
4096 dout(5) << "objects_list_range error: " << ret << dendl;
4097 return ret;
4098 }
4099
4100
4101 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4102 _scan_rollback_obs(rollback_obs, handle);
4103 _scan_snaps(map);
4104 _repair_oinfo_oid(map);
4105
4106 dout(20) << __func__ << " done" << dendl;
4107 return 0;
4108}
4109
4110void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4111 if (!store)
4112 return;
4113 struct OnComplete : Context {
4114 std::unique_ptr<Scrub::Store> store;
4115 OnComplete(
4116 std::unique_ptr<Scrub::Store> &&store)
4117 : store(std::move(store)) {}
4118 void finish(int) override {}
4119 };
4120 store->cleanup(t);
4121 t->register_on_complete(new OnComplete(std::move(store)));
4122 assert(!store);
4123}
4124
4125void PG::repair_object(
4126 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4127 pg_shard_t bad_peer)
4128{
4129 list<pg_shard_t> op_shards;
4130 for (auto i : *ok_peers) {
4131 op_shards.push_back(i.second);
4132 }
4133 dout(10) << "repair_object " << soid << " bad_peer osd."
4134 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4135 ScrubMap::object &po = ok_peers->back().first;
4136 eversion_t v;
4137 bufferlist bv;
4138 bv.push_back(po.attrs[OI_ATTR]);
4139 object_info_t oi;
4140 try {
4141 bufferlist::iterator bliter = bv.begin();
4142 ::decode(oi, bliter);
4143 } catch (...) {
4144 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4145 assert(0);
4146 }
4147 if (bad_peer != primary) {
4148 peer_missing[bad_peer].add(soid, oi.version, eversion_t());
4149 } else {
4150 // We should only be scrubbing if the PG is clean.
4151 assert(waiting_for_unreadable_object.empty());
4152
4153 pg_log.missing_add(soid, oi.version, eversion_t());
4154
4155 pg_log.set_last_requested(0);
4156 dout(10) << __func__ << ": primary = " << primary << dendl;
4157 }
4158
4159 if (is_ec_pg() || bad_peer == primary) {
4160 // we'd better collect all shards for an EC pg, and prepare good peers as the
4161 // source of pulls in the case of a replicated pg.
4162 missing_loc.add_missing(soid, oi.version, eversion_t());
4163 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4164 for (i = ok_peers->begin();
4165 i != ok_peers->end();
4166 ++i)
4167 missing_loc.add_location(soid, i->second);
4168 }
4169}
4170
4171/* replica_scrub
4172 *
4173 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4174 * for pushes to complete in case of recent recovery. Build a single
4175 * scrubmap of objects that are in the range [msg->start, msg->end).
4176 */
4177void PG::replica_scrub(
4178 OpRequestRef op,
4179 ThreadPool::TPHandle &handle)
4180{
4181 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4182 assert(!scrubber.active_rep_scrub);
4183 dout(7) << "replica_scrub" << dendl;
4184
4185 if (msg->map_epoch < info.history.same_interval_since) {
4186 dout(10) << "replica_scrub discarding old replica_scrub from "
4187 << msg->map_epoch << " < " << info.history.same_interval_since
4188 << dendl;
4189 return;
4190 }
4191
4192 ScrubMap map;
4193
4194 assert(msg->chunky);
4195 if (last_update_applied < msg->scrub_to) {
4196 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4197 scrubber.active_rep_scrub = op;
4198 return;
4199 }
4200
4201 if (active_pushes > 0) {
4202 dout(10) << "waiting for active pushes to finish" << dendl;
4203 scrubber.active_rep_scrub = op;
4204 return;
4205 }
4206
4207 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4208 hobject_t start = msg->start;
4209 hobject_t end = msg->end;
4210 if (!start.is_max())
4211 start.pool = info.pgid.pool();
4212 if (!end.is_max())
4213 end.pool = info.pgid.pool();
4214
4215 build_scrub_map_chunk(
4216 map, start, end, msg->deep, msg->seed,
4217 handle);
4218
4219 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4220 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4221 spg_t(info.pgid.pgid, get_primary().shard),
4222 msg->map_epoch,
4223 pg_whoami);
4224 ::encode(map, reply->get_data());
4225 osd->send_message_osd_cluster(reply, msg->get_connection());
4226 } else {
4227 // for jewel compatibility
4228 vector<OSDOp> scrub(1);
4229 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4230 hobject_t poid;
4231 eversion_t v;
4232 osd_reqid_t reqid;
4233 MOSDSubOp *subop = new MOSDSubOp(
4234 reqid,
4235 pg_whoami,
4236 spg_t(info.pgid.pgid, get_primary().shard),
4237 poid,
4238 0,
4239 msg->map_epoch,
4240 osd->get_tid(),
4241 v);
4242 ::encode(map, subop->get_data());
4243 subop->ops = scrub;
4244 osd->send_message_osd_cluster(subop, msg->get_connection());
4245 }
4246}
4247
4248/* Scrub:
4249 * PG_STATE_SCRUBBING is set when the scrub is queued
4250 *
4251 * scrub will be chunky if all OSDs in the PG support chunky scrub;
4252 * scrub will fail if any OSDs are too old.
4253 */
4254void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4255{
4256 if (cct->_conf->osd_scrub_sleep > 0 &&
4257 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4258 scrubber.state == PG::Scrubber::INACTIVE) &&
4259 scrubber.needs_sleep) {
4260 ceph_assert(!scrubber.sleeping);
4261 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4262
4263 // Do an async sleep so we don't block the op queue
4264 OSDService *osds = osd;
4265 spg_t pgid = get_pgid();
4266 int state = scrubber.state;
4267 auto scrub_requeue_callback =
4268 new FunctionContext([osds, pgid, state](int r) {
4269 PG *pg = osds->osd->lookup_lock_pg(pgid);
4270 if (pg == nullptr) {
4271 lgeneric_dout(osds->osd->cct, 20)
4272 << "scrub_requeue_callback: Could not find "
4273 << "PG " << pgid << " can't complete scrub requeue after sleep"
4274 << dendl;
4275 return;
4276 }
4277 pg->scrubber.sleeping = false;
4278 pg->scrubber.needs_sleep = false;
4279 lgeneric_dout(pg->cct, 20)
4280 << "scrub_requeue_callback: slept for "
4281 << ceph_clock_now() - pg->scrubber.sleep_start
4282 << ", re-queuing scrub with state " << state << dendl;
4283 pg->scrub_queued = false;
4284 pg->requeue_scrub();
4285 pg->scrubber.sleep_start = utime_t();
4286 pg->unlock();
4287 });
4288 Mutex::Locker l(osd->scrub_sleep_lock);
4289 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4290 scrub_requeue_callback);
4291 scrubber.sleeping = true;
4292 scrubber.sleep_start = ceph_clock_now();
4293 return;
4294 }
4295 if (pg_has_reset_since(queued)) {
4296 return;
4297 }
4298 assert(scrub_queued);
4299 scrub_queued = false;
4300 scrubber.needs_sleep = true;
4301
4302 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4303 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4304 state_clear(PG_STATE_SCRUBBING);
4305 state_clear(PG_STATE_REPAIR);
4306 state_clear(PG_STATE_DEEP_SCRUB);
4307 publish_stats_to_osd();
4308 return;
4309 }
4310
4311 if (!scrubber.active) {
4312 assert(backfill_targets.empty());
4313
4314 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4315
4316 dout(10) << "starting a new chunky scrub" << dendl;
4317 }
4318
4319 chunky_scrub(handle);
4320}
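/*
 * Illustrative sketch (not part of the original PG.cc): the scrub-sleep
 * logic above never blocks the op worker; it registers a timer callback
 * that clears the sleeping/needs_sleep flags and re-queues the scrub once
 * the configured delay has elapsed.  Below is a minimal stand-alone
 * rendering of that "sleep asynchronously, then requeue" idea using a
 * plain detached thread as the timer; the names scrub_sleep and requeue
 * are assumptions for the example.
 */
#if 0
#include <chrono>
#include <functional>
#include <thread>

// Fire `requeue` after `delay` seconds without blocking the caller.
static void scrub_sleep(double delay, std::function<void()> requeue)
{
  std::thread([delay, requeue = std::move(requeue)]() {
    std::this_thread::sleep_for(std::chrono::duration<double>(delay));
    requeue();                  // analogue of requeue_scrub() in PG.cc
  }).detach();
}
#endif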
4321
4322/*
4323 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4324 * chunk.
4325 *
4326 * The object store is partitioned into chunks which end on hash boundaries. For
4327 * each chunk, the following logic is performed:
4328 *
4329 * (1) Block writes on the chunk
4330 * (2) Request maps from replicas
4331 * (3) Wait for pushes to be applied (after recovery)
4332 * (4) Wait for writes to flush on the chunk
4333 * (5) Wait for maps from replicas
4334 * (6) Compare / repair all scrub maps
4335 * (7) Wait for digest updates to apply
4336 *
4337 * This logic is encoded in the mostly linear state machine:
4338 *
4339 * +------------------+
4340 * _________v__________ |
4341 * | | |
4342 * | INACTIVE | |
4343 * |____________________| |
4344 * | |
4345 * | +----------+ |
4346 * _________v___v______ | |
4347 * | | | |
4348 * | NEW_CHUNK | | |
4349 * |____________________| | |
4350 * | | |
4351 * _________v__________ | |
4352 * | | | |
4353 * | WAIT_PUSHES | | |
4354 * |____________________| | |
4355 * | | |
4356 * _________v__________ | |
4357 * | | | |
4358 * | WAIT_LAST_UPDATE | | |
4359 * |____________________| | |
4360 * | | |
4361 * _________v__________ | |
4362 * | | | |
4363 * | BUILD_MAP | | |
4364 * |____________________| | |
4365 * | | |
4366 * _________v__________ | |
4367 * | | | |
4368 * | WAIT_REPLICAS | | |
4369 * |____________________| | |
4370 * | | |
4371 * _________v__________ | |
4372 * | | | |
4373 * | COMPARE_MAPS | | |
4374 * |____________________| | |
4375 * | | |
4376 * | | |
4377 * _________v__________ | |
4378 * | | | |
4379 * |WAIT_DIGEST_UPDATES | | |
4380 * |____________________| | |
4381 * | | | |
4382 * | +----------+ |
4383 * _________v__________ |
4384 * | | |
4385 * | FINISH | |
4386 * |____________________| |
4387 * | |
4388 * +------------------+
4389 *
4390 * The primary determines the last update from the subset by walking the log. If
4391 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4392 * to wait until that update is applied before building a scrub map. Both the
4393 * primary and replicas will wait for any active pushes to be applied.
4394 *
4395 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4396 *
4397 * scrubber.state encodes the current state of the scrub (refer to state diagram
4398 * for details).
4399 */
4400void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4401{
4402 // check for map changes
4403 if (scrubber.is_chunky_scrub_active()) {
4404 if (scrubber.epoch_start != info.history.same_interval_since) {
4405 dout(10) << "scrub pg changed, aborting" << dendl;
4406 scrub_clear_state();
4407 scrub_unreserve_replicas();
4408 return;
4409 }
4410 }
4411
4412 bool done = false;
4413 int ret;
4414
4415 while (!done) {
4416 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4417 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4418
4419 switch (scrubber.state) {
4420 case PG::Scrubber::INACTIVE:
4421 dout(10) << "scrub start" << dendl;
4422
4423 publish_stats_to_osd();
4424 scrubber.epoch_start = info.history.same_interval_since;
4425 scrubber.active = true;
4426
4427 osd->inc_scrubs_active(scrubber.reserved);
4428 if (scrubber.reserved) {
4429 scrubber.reserved = false;
4430 scrubber.reserved_peers.clear();
4431 }
4432
4433 {
4434 ObjectStore::Transaction t;
4435 scrubber.cleanup_store(&t);
4436 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4437 info.pgid, coll));
4438 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4439 }
4440
4441 // Don't include temporary objects when scrubbing
4442 scrubber.start = info.pgid.pgid.get_hobj_start();
4443 scrubber.state = PG::Scrubber::NEW_CHUNK;
4444
4445 {
4446 bool repair = state_test(PG_STATE_REPAIR);
4447 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4448 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4449 stringstream oss;
4450 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4451 osd->clog->info(oss);
4452 }
4453
4454 scrubber.seed = -1;
4455
4456 break;
4457
4458 case PG::Scrubber::NEW_CHUNK:
4459 scrubber.primary_scrubmap = ScrubMap();
4460 scrubber.received_maps.clear();
4461
4462 {
4463 /* get the start and end of our scrub chunk
4464 *
4465 * Our scrub chunk has an important restriction we're going to need to
4466 * respect. We can't let head or snapdir be start or end.
4467 * Using a half-open interval means that if end == head|snapdir,
4468 * we'd scrub/lock head and the clone right next to head in different
4469 * chunks which would allow us to miss clones created between
4470 * scrubbing that chunk and scrubbing the chunk including head.
4471 * This isn't true for any of the other clones since clones can
4472 * only be created "just to the left of" head. There is one exception
4473 * to this: promotion of clones which always happens to the left of the
4474 * left-most clone, but promote_object checks the scrubber in that
4475 * case, so it should be ok. Also, it's ok to "miss" clones at the
4476 * left end of the range if we are a tier because they may legitimately
4477 * not exist (see _scrub).
4478 */
4479 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4480 hobject_t start = scrubber.start;
4481 hobject_t candidate_end;
4482 vector<hobject_t> objects;
4483 ret = get_pgbackend()->objects_list_partial(
4484 start,
4485 min,
4486 MAX(min, cct->_conf->osd_scrub_chunk_max),
4487 &objects,
4488 &candidate_end);
4489 assert(ret >= 0);
4490
4491 if (!objects.empty()) {
4492 hobject_t back = objects.back();
4493 while (candidate_end.has_snapset() &&
4494 candidate_end.get_head() == back.get_head()) {
4495 candidate_end = back;
4496 objects.pop_back();
4497 if (objects.empty()) {
4498 assert(0 ==
4499 "Somehow we got more than 2 objects which "
4500 "have the same head but are not clones");
4501 }
4502 back = objects.back();
4503 }
4504 if (candidate_end.has_snapset()) {
4505 assert(candidate_end.get_head() != back.get_head());
4506 candidate_end = candidate_end.get_object_boundary();
4507 }
4508 } else {
4509 assert(candidate_end.is_max());
4510 }
4511
4512 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4513 // we'll be requeued by whatever made us unavailable for scrub
4514 dout(10) << __func__ << ": scrub blocked somewhere in range "
4515 << "[" << scrubber.start << ", " << candidate_end << ")"
4516 << dendl;
4517 done = true;
4518 break;
4519 }
4520 scrubber.end = candidate_end;
4521 }
4522
4523 // walk the log to find the latest update that affects our chunk
4524 scrubber.subset_last_update = eversion_t();
4525 for (auto p = projected_log.log.rbegin();
4526 p != projected_log.log.rend();
4527 ++p) {
4528 if (p->soid >= scrubber.start &&
4529 p->soid < scrubber.end) {
4530 scrubber.subset_last_update = p->version;
4531 break;
4532 }
4533 }
4534 if (scrubber.subset_last_update == eversion_t()) {
4535 for (list<pg_log_entry_t>::const_reverse_iterator p =
4536 pg_log.get_log().log.rbegin();
4537 p != pg_log.get_log().log.rend();
4538 ++p) {
4539 if (p->soid >= scrubber.start &&
4540 p->soid < scrubber.end) {
4541 scrubber.subset_last_update = p->version;
4542 break;
4543 }
4544 }
4545 }
4546
4547 // ask replicas to wait until
4548 // last_update_applied >= scrubber.subset_last_update and then scan
4549 scrubber.waiting_on_whom.insert(pg_whoami);
4550 ++scrubber.waiting_on;
4551
4552 // request maps from replicas
4553 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4554 i != actingbackfill.end();
4555 ++i) {
4556 if (*i == pg_whoami) continue;
4557 _request_scrub_map(*i, scrubber.subset_last_update,
4558 scrubber.start, scrubber.end, scrubber.deep,
4559 scrubber.seed);
4560 scrubber.waiting_on_whom.insert(*i);
4561 ++scrubber.waiting_on;
4562 }
4563
4564 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4565
4566 break;
4567
4568 case PG::Scrubber::WAIT_PUSHES:
4569 if (active_pushes == 0) {
4570 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4571 } else {
4572 dout(15) << "wait for pushes to apply" << dendl;
4573 done = true;
4574 }
4575 break;
4576
4577 case PG::Scrubber::WAIT_LAST_UPDATE:
4578 if (last_update_applied >= scrubber.subset_last_update) {
4579 scrubber.state = PG::Scrubber::BUILD_MAP;
4580 } else {
4581 // will be requeued by op_applied
4582 dout(15) << "wait for writes to flush" << dendl;
4583 done = true;
4584 }
4585 break;
4586
4587 case PG::Scrubber::BUILD_MAP:
4588 assert(last_update_applied >= scrubber.subset_last_update);
4589
4590 // build my own scrub map
4591 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4592 scrubber.start, scrubber.end,
4593 scrubber.deep, scrubber.seed,
4594 handle);
4595 if (ret < 0) {
4596 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4597 scrub_clear_state();
4598 scrub_unreserve_replicas();
4599 return;
4600 }
4601
4602 --scrubber.waiting_on;
4603 scrubber.waiting_on_whom.erase(pg_whoami);
4604
4605 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4606 break;
4607
4608 case PG::Scrubber::WAIT_REPLICAS:
4609 if (scrubber.waiting_on > 0) {
4610 // will be requeued by sub_op_scrub_map
4611 dout(10) << "wait for replicas to build scrub map" << dendl;
4612 done = true;
4613 } else {
4614 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4615 }
4616 break;
4617
4618 case PG::Scrubber::COMPARE_MAPS:
4619 assert(last_update_applied >= scrubber.subset_last_update);
4620 assert(scrubber.waiting_on == 0);
4621
4622 scrub_compare_maps();
4623 scrubber.start = scrubber.end;
4624 scrubber.run_callbacks();
4625
4626 // requeue the writes from the chunk that just finished
4627 requeue_ops(waiting_for_scrub);
4628
4629 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4630
4631 // fall-thru
4632
4633 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4634 if (scrubber.num_digest_updates_pending) {
4635 dout(10) << __func__ << " waiting on "
4636 << scrubber.num_digest_updates_pending
4637 << " digest updates" << dendl;
4638 done = true;
4639 break;
4640 }
4641
4642 if (!(scrubber.end.is_max())) {
4643 scrubber.state = PG::Scrubber::NEW_CHUNK;
4644 requeue_scrub();
4645 done = true;
4646 } else {
4647 scrubber.state = PG::Scrubber::FINISH;
4648 }
4649
4650 break;
4651
4652 case PG::Scrubber::FINISH:
4653 scrub_finish();
4654 scrubber.state = PG::Scrubber::INACTIVE;
4655 done = true;
4656
4657 if (!snap_trimq.empty()) {
4658 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4659 snap_trimmer_scrub_complete();
4660 }
4661
4662 break;
4663
4664 default:
4665 ceph_abort();
4666 }
4667 }
4668 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4669 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4670}
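/*
 * Illustrative sketch (not part of the original PG.cc): the state diagram
 * documented above chunky_scrub() is a mostly linear progression with a
 * loop back to NEW_CHUNK until the whole hash range has been covered.
 * The skeleton below models only that control flow; chunks_left is a
 * hypothetical placeholder for "scrubber.end has not yet reached the max
 * hobject", not a real PG member.
 */
#if 0
enum class ScrubState {
  INACTIVE, NEW_CHUNK, WAIT_PUSHES, WAIT_LAST_UPDATE, BUILD_MAP,
  WAIT_REPLICAS, COMPARE_MAPS, WAIT_DIGEST_UPDATES, FINISH
};

static ScrubState next_state(ScrubState s, bool chunks_left)
{
  switch (s) {
  case ScrubState::INACTIVE:            return ScrubState::NEW_CHUNK;
  case ScrubState::NEW_CHUNK:           return ScrubState::WAIT_PUSHES;
  case ScrubState::WAIT_PUSHES:         return ScrubState::WAIT_LAST_UPDATE;
  case ScrubState::WAIT_LAST_UPDATE:    return ScrubState::BUILD_MAP;
  case ScrubState::BUILD_MAP:           return ScrubState::WAIT_REPLICAS;
  case ScrubState::WAIT_REPLICAS:       return ScrubState::COMPARE_MAPS;
  case ScrubState::COMPARE_MAPS:        return ScrubState::WAIT_DIGEST_UPDATES;
  case ScrubState::WAIT_DIGEST_UPDATES: return chunks_left
                                               ? ScrubState::NEW_CHUNK
                                               : ScrubState::FINISH;
  case ScrubState::FINISH:              return ScrubState::INACTIVE;
  }
  return ScrubState::INACTIVE;          // unreachable, keeps compilers happy
}
#endif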
4671
4672void PG::scrub_clear_state()
4673{
4674 assert(is_locked());
4675 state_clear(PG_STATE_SCRUBBING);
4676 state_clear(PG_STATE_REPAIR);
4677 state_clear(PG_STATE_DEEP_SCRUB);
4678 publish_stats_to_osd();
4679
4680 // active -> nothing.
4681 if (scrubber.active)
4682 osd->dec_scrubs_active();
4683
4684 requeue_ops(waiting_for_scrub);
4685
4686 scrubber.reset();
4687
4688 // type-specific state clear
4689 _scrub_clear_state();
4690}
4691
4692void PG::scrub_compare_maps()
4693{
4694 dout(10) << __func__ << " has maps, analyzing" << dendl;
4695
4696 // construct authoritative scrub map for type specific scrubbing
4697 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4698 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4699
4700 if (acting.size() > 1) {
4701 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4702
4703 stringstream ss;
4704
4705 // Map from object with errors to good peer
4706 map<hobject_t, list<pg_shard_t>> authoritative;
4707 map<pg_shard_t, ScrubMap *> maps;
4708
4709 dout(2) << __func__ << " osd." << acting[0] << " has "
4710 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4711 maps[pg_whoami] = &scrubber.primary_scrubmap;
4712
4713 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4714 i != actingbackfill.end();
4715 ++i) {
4716 if (*i == pg_whoami) continue;
4717 dout(2) << __func__ << " replica " << *i << " has "
4718 << scrubber.received_maps[*i].objects.size()
4719 << " items" << dendl;
4720 maps[*i] = &scrubber.received_maps[*i];
4721 }
4722
4723 get_pgbackend()->be_compare_scrubmaps(
4724 maps,
4725 state_test(PG_STATE_REPAIR),
4726 scrubber.missing,
4727 scrubber.inconsistent,
4728 authoritative,
4729 missing_digest,
4730 scrubber.shallow_errors,
4731 scrubber.deep_errors,
4732 scrubber.store.get(),
4733 info.pgid, acting,
4734 ss);
4735 dout(2) << ss.str() << dendl;
4736
4737 if (!ss.str().empty()) {
4738 osd->clog->error(ss);
4739 }
4740
4741 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4742 i != authoritative.end();
4743 ++i) {
4744 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4745 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4746 j != i->second.end();
4747 ++j) {
4748 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4749 }
4750 scrubber.authoritative.insert(
4751 make_pair(
4752 i->first,
4753 good_peers));
4754 }
4755
4756 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4757 i != authoritative.end();
4758 ++i) {
4759 scrubber.cleaned_meta_map.objects.erase(i->first);
4760 scrubber.cleaned_meta_map.objects.insert(
4761 *(maps[i->second.back()]->objects.find(i->first))
4762 );
4763 }
4764 }
4765
4766 ScrubMap for_meta_scrub;
4767 if (scrubber.end.is_max() ||
4768 scrubber.cleaned_meta_map.objects.empty()) {
4769 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4770 } else {
4771 auto iter = scrubber.cleaned_meta_map.objects.end();
4772 --iter; // not empty, see the if clause above
4773 auto begin = scrubber.cleaned_meta_map.objects.begin();
4774 while (iter != begin) {
4775 auto next = iter--;
4776 if (next->first.get_head() != iter->first.get_head()) {
4777 ++iter;
4778 break;
4779 }
4780 }
4781 for_meta_scrub.objects.insert(begin, iter);
4782 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4783 }
4784
4785 // ok, do the pg-type specific scrubbing
4786 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4787 if (!scrubber.store->empty()) {
4788 if (state_test(PG_STATE_REPAIR)) {
4789 dout(10) << __func__ << ": discarding scrub results" << dendl;
4790 scrubber.store->flush(nullptr);
4791 } else {
4792 dout(10) << __func__ << ": updating scrub object" << dendl;
4793 ObjectStore::Transaction t;
4794 scrubber.store->flush(&t);
4795 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4796 }
4797 }
4798}
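/*
 * Illustrative sketch (not part of the original PG.cc): after the backend
 * comparison in scrub_compare_maps() above, every object that had errors
 * is overwritten in the cleaned map with the copy taken from an
 * authoritative peer, so the later metadata scrub only ever sees "good"
 * versions.  A toy rendering with plain std::map; Obj and Copy are
 * hypothetical stand-ins for hobject_t and ScrubMap::object.
 */
#if 0
#include <map>
#include <string>

using Obj  = std::string;          // stand-in for hobject_t
using Copy = int;                  // stand-in for ScrubMap::object

static void apply_authoritative(
  std::map<Obj, Copy> &cleaned,                       // map being repaired
  const std::map<Obj, Copy> &authoritative_copies)    // good copy per bad object
{
  for (const auto &p : authoritative_copies) {
    cleaned.erase(p.first);
    cleaned.insert(p);             // replace the bad entry with the good copy
  }
}
#endif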
4799
4800bool PG::scrub_process_inconsistent()
4801{
4802 dout(10) << __func__ << ": checking authoritative" << dendl;
4803 bool repair = state_test(PG_STATE_REPAIR);
4804 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4805 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4806
4807 // the authoritative map only stores objects which are missing or inconsistent.
4808 if (!scrubber.authoritative.empty()) {
4809 stringstream ss;
4810 ss << info.pgid << " " << mode << " "
4811 << scrubber.missing.size() << " missing, "
4812 << scrubber.inconsistent.size() << " inconsistent objects";
4813 dout(2) << ss.str() << dendl;
4814 osd->clog->error(ss);
4815 if (repair) {
4816 state_clear(PG_STATE_CLEAN);
4817 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4818 scrubber.authoritative.begin();
4819 i != scrubber.authoritative.end();
4820 ++i) {
4821 set<pg_shard_t>::iterator j;
4822
4823 auto missing_entry = scrubber.missing.find(i->first);
4824 if (missing_entry != scrubber.missing.end()) {
4825 for (j = missing_entry->second.begin();
4826 j != missing_entry->second.end();
4827 ++j) {
4828 repair_object(
4829 i->first,
4830 &(i->second),
4831 *j);
4832 ++scrubber.fixed;
4833 }
4834 }
4835 if (scrubber.inconsistent.count(i->first)) {
4836 for (j = scrubber.inconsistent[i->first].begin();
4837 j != scrubber.inconsistent[i->first].end();
4838 ++j) {
4839 repair_object(i->first,
4840 &(i->second),
4841 *j);
4842 ++scrubber.fixed;
4843 }
4844 }
4845 }
4846 }
4847 }
4848 return (!scrubber.authoritative.empty() && repair);
4849}
4850
4851bool PG::ops_blocked_by_scrub() const {
4852 return (waiting_for_scrub.size() != 0);
4853}
4854
4855// the part that actually finalizes a scrub
4856void PG::scrub_finish()
4857{
4858 bool repair = state_test(PG_STATE_REPAIR);
4859 // if the repair request comes from auto-repair and there are a large
4860 // number of errors, we would like to cancel the auto-repair
4861 if (repair && scrubber.auto_repair
4862 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
4863 state_clear(PG_STATE_REPAIR);
4864 repair = false;
4865 }
4866 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4867 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4868
4869 // type-specific finish (can tally more errors)
4870 _scrub_finish();
4871
4872 bool has_error = scrub_process_inconsistent();
4873
4874 {
4875 stringstream oss;
4876 oss << info.pgid.pgid << " " << mode << " ";
4877 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
4878 if (total_errors)
4879 oss << total_errors << " errors";
4880 else
4881 oss << "ok";
4882 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
4883 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
4884 << " remaining deep scrub error details lost)";
4885 if (repair)
4886 oss << ", " << scrubber.fixed << " fixed";
4887 if (total_errors)
4888 osd->clog->error(oss);
4889 else
4890 osd->clog->info(oss);
4891 }
4892
4893 // finish up
4894 unreg_next_scrub();
4895 utime_t now = ceph_clock_now();
4896 info.history.last_scrub = info.last_update;
4897 info.history.last_scrub_stamp = now;
4898 if (scrubber.deep) {
4899 info.history.last_deep_scrub = info.last_update;
4900 info.history.last_deep_scrub_stamp = now;
4901 }
4902 // Since we don't know which errors were fixed, we can only clear them
4903 // when every one has been fixed.
4904 if (repair) {
4905 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
4906 assert(deep_scrub);
4907 scrubber.shallow_errors = scrubber.deep_errors = 0;
4908 } else {
4909 // Deep scrub in order to get corrected error counts
4910 scrub_after_recovery = true;
4911 }
4912 }
4913 if (deep_scrub) {
4914 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
4915 info.history.last_clean_scrub_stamp = now;
4916 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4917 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
4918 } else {
4919 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4920 // XXX: setting last_clean_scrub_stamp does not mean the pg is free of
4921 // inconsistencies caused by deep-scrub errors
4922 if (scrubber.shallow_errors == 0)
4923 info.history.last_clean_scrub_stamp = now;
4924 }
4925 info.stats.stats.sum.num_scrub_errors =
4926 info.stats.stats.sum.num_shallow_scrub_errors +
4927 info.stats.stats.sum.num_deep_scrub_errors;
4928 reg_next_scrub();
4929
4930 {
4931 ObjectStore::Transaction t;
4932 dirty_info = true;
4933 write_if_dirty(t);
4934 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4935 assert(tr == 0);
4936 }
4937
4938
4939 if (has_error) {
4940 queue_peering_event(
4941 CephPeeringEvtRef(
4942 std::make_shared<CephPeeringEvt>(
4943 get_osdmap()->get_epoch(),
4944 get_osdmap()->get_epoch(),
4945 DoRecovery())));
4946 }
4947
4948 scrub_clear_state();
4949 scrub_unreserve_replicas();
4950
4951 if (is_active() && is_primary()) {
4952 share_pg_info();
4953 }
4954}
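/*
 * Illustrative sketch (not part of the original PG.cc): the bookkeeping at
 * the end of scrub_finish() is essentially "total errors = shallow + deep;
 * the counters may only be reset when every error was repaired".  Minimal
 * stand-alone form with hypothetical field names:
 */
#if 0
struct ScrubErrorCounters {
  int shallow = 0;
  int deep = 0;
  int fixed = 0;

  int total() const { return shallow + deep; }

  // Counters are cleared only when we know every error has been repaired.
  void maybe_clear_after_repair() {
    if (fixed == total())
      shallow = deep = 0;
  }
};
#endif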
4955
4956void PG::share_pg_info()
4957{
4958 dout(10) << "share_pg_info" << dendl;
4959
4960 // share new pg_info_t with replicas
4961 assert(!actingbackfill.empty());
4962 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4963 i != actingbackfill.end();
4964 ++i) {
4965 if (*i == pg_whoami) continue;
4966 pg_shard_t peer = *i;
4967 if (peer_info.count(peer)) {
4968 peer_info[peer].last_epoch_started = info.last_epoch_started;
4969 peer_info[peer].last_interval_started = info.last_interval_started;
4970 peer_info[peer].history.merge(info.history);
4971 }
4972 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
4973 m->pg_list.push_back(
4974 make_pair(
4975 pg_notify_t(
4976 peer.shard, pg_whoami.shard,
4977 get_osdmap()->get_epoch(),
4978 get_osdmap()->get_epoch(),
4979 info),
4980 PastIntervals()));
4981 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
4982 }
4983}
4984
4985bool PG::append_log_entries_update_missing(
4986 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4987 ObjectStore::Transaction &t)
4988{
4989 assert(!entries.empty());
4990 assert(entries.begin()->version > info.last_update);
4991
4992 PGLogEntryHandler rollbacker{this, &t};
4993 bool invalidate_stats =
4994 pg_log.append_new_log_entries(info.last_backfill,
4995 info.last_backfill_bitwise,
4996 entries,
4997 &rollbacker);
4998 info.last_update = pg_log.get_head();
4999
5000 if (pg_log.get_missing().num_missing() == 0) {
5001 // advance last_complete since nothing else is missing!
5002 info.last_complete = info.last_update;
5003 }
5004
5005 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5006 dirty_info = true;
5007 write_if_dirty(t);
5008 return invalidate_stats;
5009}
5010
5011
5012void PG::merge_new_log_entries(
5013 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5014 ObjectStore::Transaction &t)
5015{
5016 dout(10) << __func__ << " " << entries << dendl;
5017 assert(is_primary());
5018
5019 bool rebuild_missing = append_log_entries_update_missing(entries, t);
5020 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5021 i != actingbackfill.end();
5022 ++i) {
5023 pg_shard_t peer(*i);
5024 if (peer == pg_whoami) continue;
5025 assert(peer_missing.count(peer));
5026 assert(peer_info.count(peer));
5027 pg_missing_t& pmissing(peer_missing[peer]);
5028 pg_info_t& pinfo(peer_info[peer]);
5029 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5030 pinfo.last_backfill,
5031 info.last_backfill_bitwise,
5032 entries,
5033 true,
5034 NULL,
5035 pmissing,
5036 NULL,
5037 this);
5038 pinfo.last_update = info.last_update;
5039 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5040 rebuild_missing = rebuild_missing || invalidate_stats;
5041 }
5042
5043 if (!rebuild_missing) {
5044 return;
5045 }
5046
5047 for (auto &&i: entries) {
5048 missing_loc.rebuild(
5049 i.soid,
5050 pg_whoami,
5051 actingbackfill,
5052 info,
5053 pg_log.get_missing(),
5054 peer_missing,
5055 peer_info);
5056 }
5057}
5058
5059void PG::update_history(const pg_history_t& new_history)
5060{
5061 unreg_next_scrub();
5062 if (info.history.merge(new_history)) {
5063 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5064 dirty_info = true;
5065 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5066 dout(20) << __func__ << " clearing past_intervals" << dendl;
5067 past_intervals.clear();
5068 dirty_big_info = true;
5069 }
5070 }
5071 reg_next_scrub();
5072}
5073
5074void PG::fulfill_info(
5075 pg_shard_t from, const pg_query_t &query,
5076 pair<pg_shard_t, pg_info_t> &notify_info)
5077{
5078 assert(from == primary);
5079 assert(query.type == pg_query_t::INFO);
5080
5081 // info
5082 dout(10) << "sending info" << dendl;
5083 notify_info = make_pair(from, info);
5084}
5085
5086void PG::fulfill_log(
5087 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5088{
5089 dout(10) << "log request from " << from << dendl;
5090 assert(from == primary);
5091 assert(query.type != pg_query_t::INFO);
5092 ConnectionRef con = osd->get_con_osd_cluster(
5093 from.osd, get_osdmap()->get_epoch());
5094 if (!con) return;
5095
5096 MOSDPGLog *mlog = new MOSDPGLog(
5097 from.shard, pg_whoami.shard,
5098 get_osdmap()->get_epoch(),
5099 info, query_epoch);
5100 mlog->missing = pg_log.get_missing();
5101
5102 // primary -> other, when building master log
5103 if (query.type == pg_query_t::LOG) {
5104 dout(10) << " sending info+missing+log since " << query.since
5105 << dendl;
5106 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5107 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5108 << " when my log.tail is " << pg_log.get_tail()
5109 << ", sending full log instead";
5110 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5111 } else
5112 mlog->log.copy_after(pg_log.get_log(), query.since);
5113 }
5114 else if (query.type == pg_query_t::FULLLOG) {
5115 dout(10) << " sending info+missing+full log" << dendl;
5116 mlog->log = pg_log.get_log();
5117 }
5118
5119 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5120
5121 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5122 osd->send_message_osd_cluster(mlog, con.get());
5123}
5124
5125void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5126{
5127 bool changed = false;
5128 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5129 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5130 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5131 changed = true;
5132 }
5133 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5134 assert(pi);
5135 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5136 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5137 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5138 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5139 changed = true;
5140 }
5141 }
5142 if (changed) {
5143 info.history.last_epoch_marked_full = osdmap->get_epoch();
5144 dirty_info = true;
5145 }
5146}
5147
5148bool PG::should_restart_peering(
5149 int newupprimary,
5150 int newactingprimary,
5151 const vector<int>& newup,
5152 const vector<int>& newacting,
5153 OSDMapRef lastmap,
5154 OSDMapRef osdmap)
5155{
5156 if (PastIntervals::is_new_interval(
5157 primary.osd,
5158 newactingprimary,
5159 acting,
5160 newacting,
5161 up_primary.osd,
5162 newupprimary,
5163 up,
5164 newup,
5165 osdmap,
5166 lastmap,
5167 info.pgid.pgid)) {
5168 dout(20) << "new interval newup " << newup
5169 << " newacting " << newacting << dendl;
5170 return true;
5171 } else {
5172 return false;
5173 }
5174}
5175
5176bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5177{
5178 if (last_peering_reset > reply_epoch ||
5179 last_peering_reset > query_epoch) {
5180 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5181 << " last_peering_reset " << last_peering_reset
5182 << dendl;
5183 return true;
5184 }
5185 return false;
5186}
5187
5188void PG::set_last_peering_reset()
5189{
5190 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5191 if (last_peering_reset != get_osdmap()->get_epoch()) {
5192 last_peering_reset = get_osdmap()->get_epoch();
5193 reset_interval_flush();
5194 }
5195}
5196
5197struct FlushState {
5198 PGRef pg;
5199 epoch_t epoch;
5200 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5201 ~FlushState() {
5202 pg->lock();
5203 if (!pg->pg_has_reset_since(epoch))
5204 pg->queue_flushed(epoch);
5205 pg->unlock();
5206 }
5207};
5208typedef ceph::shared_ptr<FlushState> FlushStateRef;
5209
5210void PG::start_flush(ObjectStore::Transaction *t,
5211 list<Context *> *on_applied,
5212 list<Context *> *on_safe)
5213{
5214 // flush in progress ops
5215 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5216 this, get_osdmap()->get_epoch()));
5217 t->nop();
5218 flushes_in_progress++;
5219 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5220 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5221}
5222
5223void PG::reset_interval_flush()
5224{
5225 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5226 recovery_state.clear_blocked_outgoing();
5227
5228 Context *c = new QueuePeeringEvt<IntervalFlush>(
5229 this, get_osdmap()->get_epoch(), IntervalFlush());
5230 if (!osr->flush_commit(c)) {
5231 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5232 recovery_state.begin_block_outgoing();
5233 } else {
5234 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5235 delete c;
5236 }
5237}
5238
5239/* Called before initializing peering during advance_map */
5240void PG::start_peering_interval(
5241 const OSDMapRef lastmap,
5242 const vector<int>& newup, int new_up_primary,
5243 const vector<int>& newacting, int new_acting_primary,
5244 ObjectStore::Transaction *t)
5245{
5246 const OSDMapRef osdmap = get_osdmap();
5247
5248 set_last_peering_reset();
5249
5250 vector<int> oldacting, oldup;
5251 int oldrole = get_role();
5252
5253 unreg_next_scrub();
5254
5255 pg_shard_t old_acting_primary = get_primary();
5256 pg_shard_t old_up_primary = up_primary;
5257 bool was_old_primary = is_primary();
5258
5259 acting.swap(oldacting);
5260 up.swap(oldup);
5261 init_primary_up_acting(
5262 newup,
5263 newacting,
5264 new_up_primary,
5265 new_acting_primary);
5266
5267 if (info.stats.up != up ||
5268 info.stats.acting != acting ||
5269 info.stats.up_primary != new_up_primary ||
5270 info.stats.acting_primary != new_acting_primary) {
5271 info.stats.up = up;
5272 info.stats.up_primary = new_up_primary;
5273 info.stats.acting = acting;
5274 info.stats.acting_primary = new_acting_primary;
5275 info.stats.mapping_epoch = osdmap->get_epoch();
5276 }
5277
5278 pg_stats_publish_lock.Lock();
5279 pg_stats_publish_valid = false;
5280 pg_stats_publish_lock.Unlock();
5281
5282 // This will now be remapped during a backfill in cases
5283 // where it would not have been before.
5284 if (up != acting)
5285 state_set(PG_STATE_REMAPPED);
5286 else
5287 state_clear(PG_STATE_REMAPPED);
5288
5289 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5290 if (pool.info.is_replicated() || role == pg_whoami.shard)
5291 set_role(role);
5292 else
5293 set_role(-1);
5294
5295 // did acting, up, primary|acker change?
5296 if (!lastmap) {
5297 dout(10) << " no lastmap" << dendl;
5298 dirty_info = true;
5299 dirty_big_info = true;
5300 info.history.same_interval_since = osdmap->get_epoch();
5301 } else {
5302 std::stringstream debug;
5303 assert(info.history.same_interval_since != 0);
5304 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5305 get_is_recoverable_predicate());
5306 bool new_interval = PastIntervals::check_new_interval(
5307 old_acting_primary.osd,
5308 new_acting_primary,
5309 oldacting, newacting,
5310 old_up_primary.osd,
5311 new_up_primary,
5312 oldup, newup,
5313 info.history.same_interval_since,
5314 info.history.last_epoch_clean,
5315 osdmap,
5316 lastmap,
5317 info.pgid.pgid,
5318 recoverable.get(),
5319 &past_intervals,
5320 &debug);
5321 dout(10) << __func__ << ": check_new_interval output: "
5322 << debug.str() << dendl;
5323 if (new_interval) {
5324 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5325 info.history.last_epoch_clean < osdmap->get_epoch()) {
5326 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5327 // our information is incomplete and useless; someone else was clean
5328 // after everything we know about, if osdmaps were trimmed.
5329 past_intervals.clear();
5330 } else {
5331 dout(10) << " noting past " << past_intervals << dendl;
5332 }
5333 dirty_info = true;
5334 dirty_big_info = true;
5335 info.history.same_interval_since = osdmap->get_epoch();
5336 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5337 osdmap->get_pg_num(info.pgid.pgid.pool()),
5338 nullptr)) {
5339 info.history.last_epoch_split = osdmap->get_epoch();
5340 }
5341 }
5342 }
5343
5344 if (old_up_primary != up_primary ||
5345 oldup != up) {
5346 info.history.same_up_since = osdmap->get_epoch();
5347 }
5348 // this comparison includes primary rank via pg_shard_t
5349 if (old_acting_primary != get_primary()) {
5350 info.history.same_primary_since = osdmap->get_epoch();
5351 }
5352
5353 on_new_interval();
5354
5355 dout(1) << __func__ << " up " << oldup << " -> " << up
5356 << ", acting " << oldacting << " -> " << acting
5357 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5358 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5359 << ", role " << oldrole << " -> " << role
5360 << ", features acting " << acting_features
5361 << " upacting " << upacting_features
5362 << dendl;
5363
5364 // deactivate.
5365 state_clear(PG_STATE_ACTIVE);
5366 state_clear(PG_STATE_PEERED);
5367 state_clear(PG_STATE_DOWN);
5368 state_clear(PG_STATE_RECOVERY_WAIT);
5369 state_clear(PG_STATE_RECOVERY_TOOFULL);
5370 state_clear(PG_STATE_RECOVERING);
5371
5372 peer_purged.clear();
5373 actingbackfill.clear();
5374 scrub_queued = false;
5375
5376 // reset primary state?
5377 if (was_old_primary || is_primary()) {
5378 osd->remove_want_pg_temp(info.pgid.pgid);
5379 }
5380 clear_primary_state();
5381
5382
5383 // pg->on_*
5384 on_change(t);
5385
5386 projected_last_update = eversion_t();
5387
5388 assert(!deleting);
5389
5390 // should we tell the primary we are here?
5391 send_notify = !is_primary();
5392
5393 if (role != oldrole ||
5394 was_old_primary != is_primary()) {
5395 // did primary change?
5396 if (was_old_primary != is_primary()) {
5397 state_clear(PG_STATE_CLEAN);
5398 clear_publish_stats();
5399 }
5400
5401 on_role_change();
5402
5403 // take active waiters
5404 requeue_ops(waiting_for_peered);
5405
5406 } else {
5407 // no role change.
5408 // did primary change?
5409 if (get_primary() != old_acting_primary) {
5410 dout(10) << *this << " " << oldacting << " -> " << acting
5411 << ", acting primary "
5412 << old_acting_primary << " -> " << get_primary()
5413 << dendl;
5414 } else {
5415 // primary is the same.
5416 if (is_primary()) {
5417 // i am (still) primary. but my replica set changed.
5418 state_clear(PG_STATE_CLEAN);
5419
5420 dout(10) << oldacting << " -> " << acting
5421 << ", replicas changed" << dendl;
5422 }
5423 }
5424 }
5425 cancel_recovery();
5426
5427 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5428 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5429 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5430 }
5431}
5432
5433void PG::on_new_interval()
5434{
5435 const OSDMapRef osdmap = get_osdmap();
5436
5437 reg_next_scrub();
5438
5439 // initialize features
5440 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5441 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5442 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5443 if (*p == CRUSH_ITEM_NONE)
5444 continue;
5445 uint64_t f = osdmap->get_xinfo(*p).features;
5446 acting_features &= f;
5447 upacting_features &= f;
5448 }
5449 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5450 if (*p == CRUSH_ITEM_NONE)
5451 continue;
5452 upacting_features &= osdmap->get_xinfo(*p).features;
5453 }
5454
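  // comment added for clarity: bitwise hobject sort order is assumed from here
  // on; the SORTBITWISE flag must already be set (it is required before the
  // upgrade to luminous), so a map without it is treated as fatal below.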
5455 assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
5456
5457 _on_new_interval();
5458}
5459
5460void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5461{
5462 assert(!is_primary());
5463
5464 update_history(oinfo.history);
5465
5466 if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
5467 // DEBUG: verify that the snaps are empty in snap_mapper
5468 if (cct->_conf->osd_debug_verify_snaps_on_info) {
5469 interval_set<snapid_t> p;
5470 p.union_of(oinfo.purged_snaps, info.purged_snaps);
5471 p.subtract(info.purged_snaps);
5472 if (!p.empty()) {
5473 for (interval_set<snapid_t>::iterator i = p.begin();
5474 i != p.end();
5475 ++i) {
5476 for (snapid_t snap = i.get_start();
5477 snap != i.get_len() + i.get_start();
5478 ++snap) {
5479 vector<hobject_t> hoids;
5480 int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
5481 if (r != 0 && r != -ENOENT) {
5482	    derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5483 << cpp_strerror(r) << dendl;
5484 ceph_abort();
5485 } else if (r != -ENOENT) {
5486 assert(!hoids.empty());
5487	    derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5488 << cpp_strerror(r) << " for object "
5489 << hoids[0] << " on snap " << snap
5490 << " which should have been fully trimmed " << dendl;
5491 ceph_abort();
5492 }
5493 }
5494 }
5495 }
5496 }
5497 info.purged_snaps = oinfo.purged_snaps;
5498 dirty_info = true;
5499 dirty_big_info = true;
5500 }
5501}
5502
5503ostream& operator<<(ostream& out, const PG& pg)
5504{
5505 out << "pg[" << pg.info
5506 << " " << pg.up;
5507 if (pg.acting != pg.up)
5508 out << "/" << pg.acting;
5509 out << " r=" << pg.get_role();
5510 out << " lpr=" << pg.get_last_peering_reset();
5511
5512 if (!pg.past_intervals.empty()) {
5513 out << " pi=[" << pg.past_intervals.get_bounds()
5514 << ")/" << pg.past_intervals.size();
5515 }
5516
5517 if (pg.is_peered()) {
5518 if (pg.last_update_ondisk != pg.info.last_update)
5519 out << " luod=" << pg.last_update_ondisk;
5520 if (pg.last_update_applied != pg.info.last_update)
5521 out << " lua=" << pg.last_update_applied;
5522 }
5523
5524 if (pg.recovery_ops_active)
5525 out << " rops=" << pg.recovery_ops_active;
5526
5527 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5528 pg.pg_log.get_head() != pg.info.last_update)
5529 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5530
5531 if (!pg.pg_log.get_log().empty()) {
5532 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5533 out << " (log bound mismatch, actual=["
5534 << pg.pg_log.get_log().log.begin()->version << ","
5535 << pg.pg_log.get_log().log.rbegin()->version << "]";
5536 out << ")";
5537 }
5538 }
5539
5540 if (!pg.backfill_targets.empty())
5541 out << " bft=" << pg.backfill_targets;
5542 out << " crt=" << pg.pg_log.get_can_rollback_to();
5543
5544 if (pg.last_complete_ondisk != pg.info.last_complete)
5545 out << " lcod " << pg.last_complete_ondisk;
5546
5547 if (pg.is_primary()) {
5548 out << " mlcod " << pg.min_last_complete_ondisk;
5549 }
5550
5551 out << " " << pg_state_string(pg.get_state());
5552 if (pg.should_send_notify())
5553 out << " NOTIFY";
5554
5555 if (pg.scrubber.must_repair)
5556 out << " MUST_REPAIR";
5557 if (pg.scrubber.auto_repair)
5558 out << " AUTO_REPAIR";
5559 if (pg.scrubber.must_deep_scrub)
5560 out << " MUST_DEEP_SCRUB";
5561 if (pg.scrubber.must_scrub)
5562 out << " MUST_SCRUB";
5563
5564 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5565 if (pg.pg_log.get_missing().num_missing()) {
5566 out << " m=" << pg.pg_log.get_missing().num_missing();
5567 if (pg.is_primary()) {
5568 uint64_t unfound = pg.get_num_unfound();
5569 if (unfound)
5570 out << " u=" << unfound;
5571 }
5572 }
5573 if (pg.snap_trimq.size())
5574 out << " snaptrimq=" << pg.snap_trimq;
5575
5576 out << "]";
5577
5578
5579 return out;
5580}
5581
5582bool PG::can_discard_op(OpRequestRef& op)
5583{
5584 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5585 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5586 dout(20) << " discard " << *m << dendl;
5587 return true;
5588 }
5589
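  // clients resend their outstanding ops whenever the primary changes, so
  // anything tagged with an epoch older than the last primary change can be
  // safely dropped here.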
5590 if (m->get_map_epoch() < info.history.same_primary_since) {
5591 dout(7) << " changed after " << m->get_map_epoch()
5592 << ", dropping " << *m << dendl;
5593 return true;
5594 }
5595
5596 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5597 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5598 dout(7) << __func__ << " sent before last_force_op_resend "
5599	      << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5600 return true;
5601 }
5602 if (m->get_map_epoch() < info.history.last_epoch_split) {
5603 dout(7) << __func__ << " pg split in "
5604 << info.history.last_epoch_split << ", dropping" << dendl;
5605 return true;
5606 }
5607 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5608 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5609 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5610 << pool.info.last_force_op_resend_preluminous
5611	      << ", dropping " << *m << dendl;
5612 return true;
5613 }
5614 }
5615
5616 return false;
5617}
5618
5619template<typename T, int MSGTYPE>
5620bool PG::can_discard_replica_op(OpRequestRef& op)
5621{
5622 const T *m = static_cast<const T *>(op->get_req());
5623 assert(m->get_type() == MSGTYPE);
5624
5625 /* Mostly, this overlaps with the old_peering_msg
5626 * condition. An important exception is pushes
5627 * sent by replicas not in the acting set, since
5628 * if such a replica goes down it does not cause
5629 * a new interval. */
5630 int from = m->get_source().num();
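  // if the sender has been marked down at or after the epoch in which it sent
  // this message, the message is stale and can be discarded.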
5631 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5632 return true;
5633
5634 // same pg?
5635 // if pg changes _at all_, we reset and repeer!
5636 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5637 dout(10) << "can_discard_replica_op pg changed " << info.history
5638 << " after " << m->map_epoch
5639 << ", dropping" << dendl;
5640 return true;
5641 }
5642 return false;
5643}
5644
5645bool PG::can_discard_scan(OpRequestRef op)
5646{
5647 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5648 assert(m->get_type() == MSG_OSD_PG_SCAN);
5649
5650 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5651 dout(10) << " got old scan, ignoring" << dendl;
5652 return true;
5653 }
5654 return false;
5655}
5656
5657bool PG::can_discard_backfill(OpRequestRef op)
5658{
5659 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5660 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5661
5662 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5663 dout(10) << " got old backfill, ignoring" << dendl;
5664 return true;
5665 }
5666
5667 return false;
5668
5669}
5670
5671bool PG::can_discard_request(OpRequestRef& op)
5672{
5673 switch (op->get_req()->get_type()) {
5674 case CEPH_MSG_OSD_OP:
5675 return can_discard_op(op);
5676 case CEPH_MSG_OSD_BACKOFF:
5677 return false; // never discard
5678 case MSG_OSD_SUBOP:
5679 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5680 case MSG_OSD_REPOP:
5681 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5682 case MSG_OSD_PG_PUSH:
5683 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5684 case MSG_OSD_PG_PULL:
5685 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5686 case MSG_OSD_PG_PUSH_REPLY:
5687 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5688 case MSG_OSD_SUBOPREPLY:
5689 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5690 case MSG_OSD_REPOPREPLY:
5691 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5692
5693 case MSG_OSD_EC_WRITE:
5694 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5695 case MSG_OSD_EC_WRITE_REPLY:
5696 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5697 case MSG_OSD_EC_READ:
5698 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5699 case MSG_OSD_EC_READ_REPLY:
5700 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5701 case MSG_OSD_REP_SCRUB:
5702 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5703 case MSG_OSD_SCRUB_RESERVE:
5704 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5705 case MSG_OSD_REP_SCRUBMAP:
5706 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5707 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5708 return can_discard_replica_op<
5709 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5710 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5711 return can_discard_replica_op<
5712 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5713
5714 case MSG_OSD_PG_SCAN:
5715 return can_discard_scan(op);
5716 case MSG_OSD_PG_BACKFILL:
5717 return can_discard_backfill(op);
5718 case MSG_OSD_PG_BACKFILL_REMOVE:
5719 return can_discard_replica_op<MOSDPGBackfillRemove,
5720 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5721 }
5722 return true;
5723}
5724
5725void PG::take_waiters()
5726{
5727 dout(10) << "take_waiters" << dendl;
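  // requeue ops that were waiting for a newer map, then move any deferred
  // peering events back onto the active peering queue (one peering wakeup
  // per deferred event).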
5728 requeue_map_waiters();
5729 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5730 i != peering_waiters.end();
5731 ++i) osd->queue_for_peering(this);
5732 peering_queue.splice(peering_queue.begin(), peering_waiters,
5733 peering_waiters.begin(), peering_waiters.end());
5734}
5735
5736void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5737{
5738 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5739 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5740 dout(10) << "deferring event " << evt->get_desc() << dendl;
5741 peering_waiters.push_back(evt);
5742 return;
5743 }
5744 if (old_peering_evt(evt))
5745 return;
5746 recovery_state.handle_event(evt, rctx);
5747}
5748
5749void PG::queue_peering_event(CephPeeringEvtRef evt)
5750{
5751 if (old_peering_evt(evt))
5752 return;
5753 peering_queue.push_back(evt);
5754 osd->queue_for_peering(this);
5755}
5756
5757void PG::queue_null(epoch_t msg_epoch,
5758 epoch_t query_epoch)
5759{
5760 dout(10) << "null" << dendl;
5761 queue_peering_event(
5762 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5763 NullEvt())));
5764}
5765
5766void PG::queue_flushed(epoch_t e)
5767{
5768 dout(10) << "flushed" << dendl;
5769 queue_peering_event(
5770 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5771 FlushedEvt())));
5772}
5773
5774void PG::queue_query(epoch_t msg_epoch,
5775 epoch_t query_epoch,
5776 pg_shard_t from, const pg_query_t& q)
5777{
5778 dout(10) << "handle_query " << q << " from replica " << from << dendl;
5779 queue_peering_event(
5780 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5781 MQuery(from, q, query_epoch))));
5782}
5783
5784void PG::handle_advance_map(
5785 OSDMapRef osdmap, OSDMapRef lastmap,
5786 vector<int>& newup, int up_primary,
5787 vector<int>& newacting, int acting_primary,
5788 RecoveryCtx *rctx)
5789{
5790 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5791 assert(lastmap == osdmap_ref);
5792 dout(10) << "handle_advance_map "
5793 << newup << "/" << newacting
5794 << " -- " << up_primary << "/" << acting_primary
5795 << dendl;
5796 update_osdmap_ref(osdmap);
5797 pool.update(osdmap);
5798 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5799 if (cct->_conf->osd_debug_verify_cached_snaps) {
5800 interval_set<snapid_t> actual_removed_snaps;
5801 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5802 assert(pi);
5803 pi->build_removed_snaps(actual_removed_snaps);
5804 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5805 derr << __func__ << ": mismatch between the actual removed snaps "
5806	   << actual_removed_snaps << " and pool.cached_removed_snaps "
5807	   << pool.cached_removed_snaps
5808 << dendl;
5809 }
5810 assert(actual_removed_snaps == pool.cached_removed_snaps);
5811 }
5812 AdvMap evt(
5813 osdmap, lastmap, newup, up_primary,
5814 newacting, acting_primary);
5815 recovery_state.handle_event(evt, rctx);
5816 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5817 on_pool_change();
5818 update_store_with_options();
5819 }
5820}
5821
5822void PG::handle_activate_map(RecoveryCtx *rctx)
5823{
5824 dout(10) << "handle_activate_map " << dendl;
5825 ActMap evt;
5826 recovery_state.handle_event(evt, rctx);
5827 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
5828 cct->_conf->osd_pg_epoch_persisted_max_stale) {
5829 dout(20) << __func__ << ": Dirtying info: last_persisted is "
5830 << last_persisted_osdmap_ref->get_epoch()
5831 << " while current is " << osdmap_ref->get_epoch() << dendl;
5832 dirty_info = true;
5833 } else {
5834 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
5835 << last_persisted_osdmap_ref->get_epoch()
5836 << " while current is " << osdmap_ref->get_epoch() << dendl;
5837 }
5838 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
5839}
5840
5841void PG::handle_loaded(RecoveryCtx *rctx)
5842{
5843 dout(10) << "handle_loaded" << dendl;
5844 Load evt;
5845 recovery_state.handle_event(evt, rctx);
5846}
5847
5848void PG::handle_create(RecoveryCtx *rctx)
5849{
5850 dout(10) << "handle_create" << dendl;
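  // a newly created PG runs the Initialize event and then immediately reacts
  // to the current map so it can start peering right away.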
5851 rctx->created_pgs.insert(this);
5852 Initialize evt;
5853 recovery_state.handle_event(evt, rctx);
5854 ActMap evt2;
5855 recovery_state.handle_event(evt2, rctx);
5856}
5857
5858void PG::handle_query_state(Formatter *f)
5859{
5860 dout(10) << "handle_query_state" << dendl;
5861 QueryState q(f);
5862 recovery_state.handle_event(q, 0);
5863}
5864
5865void PG::update_store_with_options()
5866{
5867 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
5868	  if (r < 0 && r != -EOPNOTSUPP) {
5869	    derr << __func__ << " set_collection_opts returned error: " << r << dendl;
5870 }
5871}
5872
5873void PG::update_store_on_load()
5874{
5875 if (osd->store->get_type() == "filestore") {
5876 // legacy filestore didn't store collection bit width; fix.
5877 int bits = osd->store->collection_bits(coll);
5878 if (bits < 0) {
5879 if (coll.is_meta())
5880 bits = 0;
5881 else
5882 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
5883 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
5884 ObjectStore::Transaction t;
5885 t.collection_set_bits(coll, bits);
5886 osd->store->apply_transaction(osr.get(), std::move(t));
5887 }
5888 }
5889}
5890
5891/*------------ Recovery State Machine----------------*/
5892#undef dout_prefix
5893#define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5894 << "state<" << get_state_name() << ">: ")
5895
5896/*------Crashed-------*/
5897PG::RecoveryState::Crashed::Crashed(my_context ctx)
5898 : my_base(ctx),
5899 NamedState(context< RecoveryMachine >().pg, "Crashed")
5900{
5901 context< RecoveryMachine >().log_enter(state_name);
5902 assert(0 == "we got a bad state machine event");
5903}
5904
5905
5906/*------Initial-------*/
5907PG::RecoveryState::Initial::Initial(my_context ctx)
5908 : my_base(ctx),
5909 NamedState(context< RecoveryMachine >().pg, "Initial")
5910{
5911 context< RecoveryMachine >().log_enter(state_name);
5912}
5913
5914boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
5915{
5916 PG *pg = context< RecoveryMachine >().pg;
5917
5918 // do we tell someone we're here?
5919 pg->send_notify = (!pg->is_primary());
5920 pg->update_store_with_options();
5921
5922 pg->update_store_on_load();
5923
5924 return transit< Reset >();
5925}
5926
5927boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
5928{
5929 PG *pg = context< RecoveryMachine >().pg;
5930 pg->proc_replica_info(
5931 notify.from, notify.notify.info, notify.notify.epoch_sent);
5932 pg->set_last_peering_reset();
5933 return transit< Primary >();
5934}
5935
5936boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
5937{
5938 PG *pg = context< RecoveryMachine >().pg;
5939 assert(!pg->is_primary());
5940 post_event(i);
5941 return transit< Stray >();
5942}
5943
5944boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
5945{
5946 PG *pg = context< RecoveryMachine >().pg;
5947 assert(!pg->is_primary());
5948 post_event(i);
5949 return transit< Stray >();
5950}
5951
5952void PG::RecoveryState::Initial::exit()
5953{
5954 context< RecoveryMachine >().log_exit(state_name, enter_time);
5955 PG *pg = context< RecoveryMachine >().pg;
5956 utime_t dur = ceph_clock_now() - enter_time;
5957 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
5958}
5959
5960/*------Started-------*/
5961PG::RecoveryState::Started::Started(my_context ctx)
5962 : my_base(ctx),
5963 NamedState(context< RecoveryMachine >().pg, "Started")
5964{
5965 context< RecoveryMachine >().log_enter(state_name);
5966}
5967
5968boost::statechart::result
5969PG::RecoveryState::Started::react(const IntervalFlush&)
5970{
5971 PG *pg = context< RecoveryMachine >().pg;
5972 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
5973 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
5974 return discard_event();
5975}
5976
5977
5978boost::statechart::result
5979PG::RecoveryState::Started::react(const FlushedEvt&)
5980{
5981 PG *pg = context< RecoveryMachine >().pg;
5982 pg->on_flushed();
5983 return discard_event();
5984}
5985
5986
5987boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
5988{
5989 PG *pg = context< RecoveryMachine >().pg;
5990 ldout(pg->cct, 10) << "Started advmap" << dendl;
5991 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
5992 if (pg->should_restart_peering(
5993 advmap.up_primary,
5994 advmap.acting_primary,
5995 advmap.newup,
5996 advmap.newacting,
5997 advmap.lastmap,
5998 advmap.osdmap)) {
5999 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6000 << dendl;
6001 post_event(advmap);
6002 return transit< Reset >();
6003 }
6004 pg->remove_down_peer_info(advmap.osdmap);
6005 return discard_event();
6006}
6007
6008boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6009{
6010 q.f->open_object_section("state");
6011 q.f->dump_string("name", state_name);
6012 q.f->dump_stream("enter_time") << enter_time;
6013 q.f->close_section();
6014 return discard_event();
6015}
6016
6017void PG::RecoveryState::Started::exit()
6018{
6019 context< RecoveryMachine >().log_exit(state_name, enter_time);
6020 PG *pg = context< RecoveryMachine >().pg;
6021 utime_t dur = ceph_clock_now() - enter_time;
6022 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6023}
6024
6025/*--------Reset---------*/
6026PG::RecoveryState::Reset::Reset(my_context ctx)
6027 : my_base(ctx),
6028 NamedState(context< RecoveryMachine >().pg, "Reset")
6029{
6030 context< RecoveryMachine >().log_enter(state_name);
6031 PG *pg = context< RecoveryMachine >().pg;
6032
6033 pg->flushes_in_progress = 0;
6034 pg->set_last_peering_reset();
6035}
6036
6037boost::statechart::result
6038PG::RecoveryState::Reset::react(const FlushedEvt&)
6039{
6040 PG *pg = context< RecoveryMachine >().pg;
6041 pg->on_flushed();
6042 return discard_event();
6043}
6044
6045boost::statechart::result
6046PG::RecoveryState::Reset::react(const IntervalFlush&)
6047{
6048 PG *pg = context< RecoveryMachine >().pg;
6049 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6050 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6051 return discard_event();
6052}
6053
6054boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6055{
6056 PG *pg = context< RecoveryMachine >().pg;
6057 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6058
6059 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6060
6061 if (pg->should_restart_peering(
6062 advmap.up_primary,
6063 advmap.acting_primary,
6064 advmap.newup,
6065 advmap.newacting,
6066 advmap.lastmap,
6067 advmap.osdmap)) {
6068 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6069 << dendl;
6070 pg->start_peering_interval(
6071 advmap.lastmap,
6072 advmap.newup, advmap.up_primary,
6073 advmap.newacting, advmap.acting_primary,
6074 context< RecoveryMachine >().get_cur_transaction());
6075 }
6076 pg->remove_down_peer_info(advmap.osdmap);
6077 pg->check_past_interval_bounds();
6078 return discard_event();
6079}
6080
6081boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6082{
6083 PG *pg = context< RecoveryMachine >().pg;
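  // a non-primary announces itself to the current primary (if there is one)
  // before transitioning to Started.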
6084 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6085 context< RecoveryMachine >().send_notify(
6086 pg->get_primary(),
6087 pg_notify_t(
6088 pg->get_primary().shard, pg->pg_whoami.shard,
6089 pg->get_osdmap()->get_epoch(),
6090 pg->get_osdmap()->get_epoch(),
6091 pg->info),
6092 pg->past_intervals);
6093 }
6094
6095 pg->update_heartbeat_peers();
6096 pg->take_waiters();
6097
6098 return transit< Started >();
6099}
6100
6101boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6102{
6103 q.f->open_object_section("state");
6104 q.f->dump_string("name", state_name);
6105 q.f->dump_stream("enter_time") << enter_time;
6106 q.f->close_section();
6107 return discard_event();
6108}
6109
6110void PG::RecoveryState::Reset::exit()
6111{
6112 context< RecoveryMachine >().log_exit(state_name, enter_time);
6113 PG *pg = context< RecoveryMachine >().pg;
6114 utime_t dur = ceph_clock_now() - enter_time;
6115 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6116}
6117
6118/*-------Start---------*/
6119PG::RecoveryState::Start::Start(my_context ctx)
6120 : my_base(ctx),
6121 NamedState(context< RecoveryMachine >().pg, "Start")
6122{
6123 context< RecoveryMachine >().log_enter(state_name);
6124
6125 PG *pg = context< RecoveryMachine >().pg;
6126 if (pg->is_primary()) {
6127 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6128 post_event(MakePrimary());
6129 } else { //is_stray
6130 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6131 post_event(MakeStray());
6132 }
6133}
6134
6135void PG::RecoveryState::Start::exit()
6136{
6137 context< RecoveryMachine >().log_exit(state_name, enter_time);
6138 PG *pg = context< RecoveryMachine >().pg;
6139 utime_t dur = ceph_clock_now() - enter_time;
6140 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6141}
6142
6143/*---------Primary--------*/
6144PG::RecoveryState::Primary::Primary(my_context ctx)
6145 : my_base(ctx),
6146 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6147{
6148 context< RecoveryMachine >().log_enter(state_name);
6149 PG *pg = context< RecoveryMachine >().pg;
6150 assert(pg->want_acting.empty());
6151
6152 // set CREATING bit until we have peered for the first time.
6153 if (pg->info.history.last_epoch_started == 0) {
6154 pg->state_set(PG_STATE_CREATING);
6155 // use the history timestamp, which ultimately comes from the
6156 // monitor in the create case.
6157 utime_t t = pg->info.history.last_scrub_stamp;
6158 pg->info.stats.last_fresh = t;
6159 pg->info.stats.last_active = t;
6160 pg->info.stats.last_change = t;
6161 pg->info.stats.last_peered = t;
6162 pg->info.stats.last_clean = t;
6163 pg->info.stats.last_unstale = t;
6164 pg->info.stats.last_undegraded = t;
6165 pg->info.stats.last_fullsized = t;
6166 pg->info.stats.last_scrub_stamp = t;
6167 pg->info.stats.last_deep_scrub_stamp = t;
6168 pg->info.stats.last_clean_scrub_stamp = t;
6169 }
6170}
6171
6172boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6173{
6174 PG *pg = context< RecoveryMachine >().pg;
6175 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6176 pg->proc_replica_info(
6177 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6178 return discard_event();
6179}
6180
6181boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6182{
6183 PG *pg = context< RecoveryMachine >().pg;
6184 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6185 pg->publish_stats_to_osd();
6186 pg->take_waiters();
6187 return discard_event();
6188}
6189
6190void PG::RecoveryState::Primary::exit()
6191{
6192 context< RecoveryMachine >().log_exit(state_name, enter_time);
6193 PG *pg = context< RecoveryMachine >().pg;
6194 pg->want_acting.clear();
6195 utime_t dur = ceph_clock_now() - enter_time;
6196 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6197 pg->clear_primary_state();
6198 pg->state_clear(PG_STATE_CREATING);
6199}
6200
6201/*---------Peering--------*/
6202PG::RecoveryState::Peering::Peering(my_context ctx)
6203 : my_base(ctx),
6204 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6205 history_les_bound(false)
6206{
6207 context< RecoveryMachine >().log_enter(state_name);
6208
6209 PG *pg = context< RecoveryMachine >().pg;
6210 assert(!pg->is_peered());
6211 assert(!pg->is_peering());
6212 assert(pg->is_primary());
6213 pg->state_set(PG_STATE_PEERING);
6214}
6215
6216boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6217{
6218 PG *pg = context< RecoveryMachine >().pg;
6219 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6220 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6221 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6222 post_event(advmap);
6223 return transit< Reset >();
6224 }
6225
6226 pg->adjust_need_up_thru(advmap.osdmap);
6227
6228 return forward_event();
6229}
6230
6231boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6232{
6233 PG *pg = context< RecoveryMachine >().pg;
6234
6235 q.f->open_object_section("state");
6236 q.f->dump_string("name", state_name);
6237 q.f->dump_stream("enter_time") << enter_time;
6238
6239 q.f->open_array_section("past_intervals");
6240 pg->past_intervals.dump(q.f);
6241 q.f->close_section();
6242
6243 q.f->open_array_section("probing_osds");
6244 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6245 p != prior_set.probe.end();
6246 ++p)
6247 q.f->dump_stream("osd") << *p;
6248 q.f->close_section();
6249
6250 if (prior_set.pg_down)
6251 q.f->dump_string("blocked", "peering is blocked due to down osds");
6252
6253 q.f->open_array_section("down_osds_we_would_probe");
6254 for (set<int>::iterator p = prior_set.down.begin();
6255 p != prior_set.down.end();
6256 ++p)
6257 q.f->dump_int("osd", *p);
6258 q.f->close_section();
6259
6260 q.f->open_array_section("peering_blocked_by");
6261 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6262 p != prior_set.blocked_by.end();
6263 ++p) {
6264 q.f->open_object_section("osd");
6265 q.f->dump_int("osd", p->first);
6266 q.f->dump_int("current_lost_at", p->second);
6267 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6268 q.f->close_section();
6269 }
6270 q.f->close_section();
6271
6272 if (history_les_bound) {
6273 q.f->open_array_section("peering_blocked_by_detail");
6274 q.f->open_object_section("item");
6275 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6276 q.f->close_section();
6277 q.f->close_section();
6278 }
6279
6280 q.f->close_section();
6281 return forward_event();
6282}
6283
6284void PG::RecoveryState::Peering::exit()
6285{
6286 PG *pg = context< RecoveryMachine >().pg;
6287 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6288 context< RecoveryMachine >().log_exit(state_name, enter_time);
6289 pg->state_clear(PG_STATE_PEERING);
6290 pg->clear_probe_targets();
6291
6292 utime_t dur = ceph_clock_now() - enter_time;
6293 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6294}
6295
6296
6297/*------Backfilling-------*/
6298PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6299 : my_base(ctx),
6300 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6301{
6302 context< RecoveryMachine >().log_enter(state_name);
6303 PG *pg = context< RecoveryMachine >().pg;
6304 pg->backfill_reserved = true;
6305 pg->queue_recovery();
6306 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6307 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6308 pg->state_set(PG_STATE_BACKFILL);
6309 pg->publish_stats_to_osd();
6310}
6311
224ce89b
WB
6312boost::statechart::result
6313PG::RecoveryState::Backfilling::react(const CancelBackfill &)
6314{
6315 PG *pg = context< RecoveryMachine >().pg;
6316 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6317 // XXX: Add a new pg state so user can see why backfill isn't proceeding
6318 // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
6319 //pg->state_set(PG_STATE_BACKFILL_STALLED????);
6320
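  // notify each backfill target that the backfill is being aborted so it can
  // release the remote reservation it is holding for us.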
6321 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6322 it != pg->backfill_targets.end();
6323 ++it) {
6324 assert(*it != pg->pg_whoami);
6325 ConnectionRef con = pg->osd->get_con_osd_cluster(
6326 it->osd, pg->get_osdmap()->get_epoch());
6327 if (con) {
6328 pg->osd->send_message_osd_cluster(
6329 new MBackfillReserve(
6330 MBackfillReserve::REJECT,
6331 spg_t(pg->info.pgid.pgid, it->shard),
6332 pg->get_osdmap()->get_epoch()),
6333 con.get());
6334 }
6335 }
6336
6337 pg->waiting_on_backfill.clear();
6338
6339 pg->schedule_backfill_full_retry();
6340 return transit<NotBackfilling>();
6341}
6342
7c673cae
FG
6343boost::statechart::result
6344PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6345{
6346 PG *pg = context< RecoveryMachine >().pg;
6347 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6348 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6349
6350 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6351 it != pg->backfill_targets.end();
6352 ++it) {
6353 assert(*it != pg->pg_whoami);
6354 ConnectionRef con = pg->osd->get_con_osd_cluster(
6355 it->osd, pg->get_osdmap()->get_epoch());
6356 if (con) {
6357 pg->osd->send_message_osd_cluster(
6358 new MBackfillReserve(
6359 MBackfillReserve::REJECT,
6360 spg_t(pg->info.pgid.pgid, it->shard),
6361 pg->get_osdmap()->get_epoch()),
6362 con.get());
6363 }
6364 }
6365
6366 pg->waiting_on_backfill.clear();
6367 pg->finish_recovery_op(hobject_t::get_max());
6368
6369 pg->schedule_backfill_full_retry();
6370 return transit<NotBackfilling>();
6371}
6372
6373void PG::RecoveryState::Backfilling::exit()
6374{
6375 context< RecoveryMachine >().log_exit(state_name, enter_time);
6376 PG *pg = context< RecoveryMachine >().pg;
6377 pg->backfill_reserved = false;
6378 pg->backfill_reserving = false;
6379 pg->state_clear(PG_STATE_BACKFILL);
6380 utime_t dur = ceph_clock_now() - enter_time;
6381 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6382}
6383
6384/*--WaitRemoteBackfillReserved--*/
6385
6386PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6387 : my_base(ctx),
6388 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6389 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6390{
6391 context< RecoveryMachine >().log_enter(state_name);
6392 PG *pg = context< RecoveryMachine >().pg;
6393 pg->state_set(PG_STATE_BACKFILL_WAIT);
6394 pg->publish_stats_to_osd();
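  // prime the reservation loop below: each RemoteBackfillReserved grant
  // requests the next backfill target, until all targets are reserved.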
6395 post_event(RemoteBackfillReserved());
6396}
6397
6398boost::statechart::result
6399PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6400{
6401 PG *pg = context< RecoveryMachine >().pg;
6402
6403 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6404 //The primary never backfills itself
6405 assert(*backfill_osd_it != pg->pg_whoami);
6406 ConnectionRef con = pg->osd->get_con_osd_cluster(
6407 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6408 if (con) {
6409 pg->osd->send_message_osd_cluster(
6410 new MBackfillReserve(
6411 MBackfillReserve::REQUEST,
6412 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6413 pg->get_osdmap()->get_epoch(),
6414 pg->get_backfill_priority()),
6415 con.get());
6416 }
6417 ++backfill_osd_it;
6418 } else {
6419 post_event(AllBackfillsReserved());
6420 }
6421 return discard_event();
6422}
6423
6424void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6425{
6426 context< RecoveryMachine >().log_exit(state_name, enter_time);
6427 PG *pg = context< RecoveryMachine >().pg;
6428 utime_t dur = ceph_clock_now() - enter_time;
6429 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6430}
6431
6432boost::statechart::result
6433PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6434{
6435 PG *pg = context< RecoveryMachine >().pg;
6436 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6437
6438 // Send REJECT to all previously acquired reservations
6439 set<pg_shard_t>::const_iterator it, begin, end, next;
6440 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6441 end = context< Active >().remote_shards_to_reserve_backfill.end();
6442 assert(begin != end);
6443 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6444 //The primary never backfills itself
6445 assert(*it != pg->pg_whoami);
6446 ConnectionRef con = pg->osd->get_con_osd_cluster(
6447 it->osd, pg->get_osdmap()->get_epoch());
6448 if (con) {
6449 pg->osd->send_message_osd_cluster(
6450 new MBackfillReserve(
6451 MBackfillReserve::REJECT,
6452 spg_t(pg->info.pgid.pgid, it->shard),
6453 pg->get_osdmap()->get_epoch()),
6454 con.get());
6455 }
6456 }
6457
6458 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6459 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6460 pg->publish_stats_to_osd();
6461
6462 pg->schedule_backfill_full_retry();
6463
6464 return transit<NotBackfilling>();
6465}
6466
6467/*--WaitLocalBackfillReserved--*/
6468PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6469 : my_base(ctx),
6470 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6471{
6472 context< RecoveryMachine >().log_enter(state_name);
6473 PG *pg = context< RecoveryMachine >().pg;
6474 pg->state_set(PG_STATE_BACKFILL_WAIT);
6475 pg->osd->local_reserver.request_reservation(
6476 pg->info.pgid,
6477 new QueuePeeringEvt<LocalBackfillReserved>(
6478 pg, pg->get_osdmap()->get_epoch(),
6479 LocalBackfillReserved()),
6480 pg->get_backfill_priority());
6481 pg->publish_stats_to_osd();
6482}
6483
6484void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6485{
6486 context< RecoveryMachine >().log_exit(state_name, enter_time);
6487 PG *pg = context< RecoveryMachine >().pg;
6488 utime_t dur = ceph_clock_now() - enter_time;
6489 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6490}
6491
6492/*----NotBackfilling------*/
6493PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6494 : my_base(ctx),
6495 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6496{
6497 context< RecoveryMachine >().log_enter(state_name);
6498 PG *pg = context< RecoveryMachine >().pg;
6499 pg->publish_stats_to_osd();
6500}
6501
6502boost::statechart::result
6503PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6504{
6505 return discard_event();
6506}
6507
6508boost::statechart::result
6509PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6510{
6511 return discard_event();
6512}
6513
6514void PG::RecoveryState::NotBackfilling::exit()
6515{
6516 context< RecoveryMachine >().log_exit(state_name, enter_time);
6517 PG *pg = context< RecoveryMachine >().pg;
6518 utime_t dur = ceph_clock_now() - enter_time;
6519 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6520}
6521
6522/*----NotRecovering------*/
6523PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6524 : my_base(ctx),
6525 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6526{
6527 context< RecoveryMachine >().log_enter(state_name);
6528 PG *pg = context< RecoveryMachine >().pg;
6529 pg->publish_stats_to_osd();
6530}
6531
6532void PG::RecoveryState::NotRecovering::exit()
6533{
6534 context< RecoveryMachine >().log_exit(state_name, enter_time);
6535 PG *pg = context< RecoveryMachine >().pg;
6536 utime_t dur = ceph_clock_now() - enter_time;
6537 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6538}
6539
6540/*---RepNotRecovering----*/
6541PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6542 : my_base(ctx),
6543 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6544{
6545 context< RecoveryMachine >().log_enter(state_name);
6546}
6547
6548void PG::RecoveryState::RepNotRecovering::exit()
6549{
6550 context< RecoveryMachine >().log_exit(state_name, enter_time);
6551 PG *pg = context< RecoveryMachine >().pg;
6552 utime_t dur = ceph_clock_now() - enter_time;
6553 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6554}
6555
6556/*---RepWaitRecoveryReserved--*/
6557PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6558 : my_base(ctx),
6559 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6560{
6561 context< RecoveryMachine >().log_enter(state_name);
6562 PG *pg = context< RecoveryMachine >().pg;
6563
6564 pg->osd->remote_reserver.request_reservation(
6565 pg->info.pgid,
6566 new QueuePeeringEvt<RemoteRecoveryReserved>(
6567 pg, pg->get_osdmap()->get_epoch(),
6568 RemoteRecoveryReserved()),
6569 pg->get_recovery_priority());
6570}
6571
6572boost::statechart::result
6573PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6574{
6575 PG *pg = context< RecoveryMachine >().pg;
6576 pg->osd->send_message_osd_cluster(
6577 pg->primary.osd,
6578 new MRecoveryReserve(
6579 MRecoveryReserve::GRANT,
6580 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6581 pg->get_osdmap()->get_epoch()),
6582 pg->get_osdmap()->get_epoch());
6583 return transit<RepRecovering>();
6584}
6585
6586void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6587{
6588 context< RecoveryMachine >().log_exit(state_name, enter_time);
6589 PG *pg = context< RecoveryMachine >().pg;
6590 utime_t dur = ceph_clock_now() - enter_time;
6591 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6592}
6593
6594/*-RepWaitBackfillReserved*/
6595PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6596 : my_base(ctx),
6597 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6598{
6599 context< RecoveryMachine >().log_enter(state_name);
6600}
6601
6602boost::statechart::result
6603PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6604{
6605 PG *pg = context< RecoveryMachine >().pg;
6606 ostringstream ss;
6607
6608 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6609 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6610 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6611 << dendl;
6612 post_event(RemoteReservationRejected());
6613 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6614 pg->osd->check_backfill_full(ss)) {
6615 ldout(pg->cct, 10) << "backfill reservation rejected: "
6616 << ss.str() << dendl;
6617 post_event(RemoteReservationRejected());
6618 } else {
6619 pg->osd->remote_reserver.request_reservation(
6620 pg->info.pgid,
6621 new QueuePeeringEvt<RemoteBackfillReserved>(
6622 pg, pg->get_osdmap()->get_epoch(),
6623 RemoteBackfillReserved()), evt.priority);
6624 }
6625 return transit<RepWaitBackfillReserved>();
6626}
6627
6628void PG::RecoveryState::RepWaitBackfillReserved::exit()
6629{
6630 context< RecoveryMachine >().log_exit(state_name, enter_time);
6631 PG *pg = context< RecoveryMachine >().pg;
6632 utime_t dur = ceph_clock_now() - enter_time;
6633 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6634}
6635
6636boost::statechart::result
6637PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6638{
6639 PG *pg = context< RecoveryMachine >().pg;
6640
6641 ostringstream ss;
6642 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6643 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6644 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6645 << "failure injection" << dendl;
6646 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6647 post_event(RemoteReservationRejected());
6648 return discard_event();
6649 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6650 pg->osd->check_backfill_full(ss)) {
6651 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6652 << ss.str() << dendl;
6653 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6654 post_event(RemoteReservationRejected());
6655 return discard_event();
6656 } else {
6657 pg->osd->send_message_osd_cluster(
6658 pg->primary.osd,
6659 new MBackfillReserve(
6660 MBackfillReserve::GRANT,
6661 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6662 pg->get_osdmap()->get_epoch()),
6663 pg->get_osdmap()->get_epoch());
6664 return transit<RepRecovering>();
6665 }
6666}
6667
6668boost::statechart::result
6669PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
6670{
6671 PG *pg = context< RecoveryMachine >().pg;
6672 pg->reject_reservation();
6673 return transit<RepNotRecovering>();
6674}
6675
6676/*---RepRecovering-------*/
6677PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6678 : my_base(ctx),
6679 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6680{
6681 context< RecoveryMachine >().log_enter(state_name);
6682}
6683
6684boost::statechart::result
6685PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6686{
6687 PG *pg = context< RecoveryMachine >().pg;
6688 pg->reject_reservation();
6689 return discard_event();
6690}
6691
6692void PG::RecoveryState::RepRecovering::exit()
6693{
6694 context< RecoveryMachine >().log_exit(state_name, enter_time);
6695 PG *pg = context< RecoveryMachine >().pg;
6696 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6697 utime_t dur = ceph_clock_now() - enter_time;
6698 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6699}
6700
6701/*------Activating--------*/
6702PG::RecoveryState::Activating::Activating(my_context ctx)
6703 : my_base(ctx),
6704 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6705{
6706 context< RecoveryMachine >().log_enter(state_name);
6707}
6708
6709void PG::RecoveryState::Activating::exit()
6710{
6711 context< RecoveryMachine >().log_exit(state_name, enter_time);
6712 PG *pg = context< RecoveryMachine >().pg;
6713 utime_t dur = ceph_clock_now() - enter_time;
6714 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6715}
6716
6717PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6718 : my_base(ctx),
6719 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6720{
6721 context< RecoveryMachine >().log_enter(state_name);
6722 PG *pg = context< RecoveryMachine >().pg;
6723
6724	  // Make sure all nodes that are part of the recovery aren't full
6725 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6726 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6727 post_event(RecoveryTooFull());
6728 return;
6729 }
6730
6731 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6732 pg->state_set(PG_STATE_RECOVERY_WAIT);
6733 pg->osd->local_reserver.request_reservation(
6734 pg->info.pgid,
6735 new QueuePeeringEvt<LocalRecoveryReserved>(
6736 pg, pg->get_osdmap()->get_epoch(),
6737 LocalRecoveryReserved()),
6738 pg->get_recovery_priority());
6739 pg->publish_stats_to_osd();
6740}
6741
6742boost::statechart::result
6743PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
6744{
6745 PG *pg = context< RecoveryMachine >().pg;
6746 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
6747 pg->schedule_recovery_full_retry();
6748 return transit<NotRecovering>();
6749}
6750
6751void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6752{
6753 context< RecoveryMachine >().log_exit(state_name, enter_time);
6754 PG *pg = context< RecoveryMachine >().pg;
6755 utime_t dur = ceph_clock_now() - enter_time;
6756 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
6757}
6758
6759PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
6760 : my_base(ctx),
6761 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6762 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
6763{
6764 context< RecoveryMachine >().log_enter(state_name);
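  // prime the reservation loop: each RemoteRecoveryReserved grant requests the
  // next remote shard, until all remote shards are reserved.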
6765 post_event(RemoteRecoveryReserved());
6766}
6767
6768boost::statechart::result
6769PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
6770 PG *pg = context< RecoveryMachine >().pg;
6771
6772 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
6773 assert(*remote_recovery_reservation_it != pg->pg_whoami);
6774 ConnectionRef con = pg->osd->get_con_osd_cluster(
6775 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
6776 if (con) {
6777 pg->osd->send_message_osd_cluster(
6778 new MRecoveryReserve(
6779 MRecoveryReserve::REQUEST,
6780 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
6781 pg->get_osdmap()->get_epoch()),
6782 con.get());
6783 }
6784 ++remote_recovery_reservation_it;
6785 } else {
6786 post_event(AllRemotesReserved());
6787 }
6788 return discard_event();
6789}
6790
6791void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6792{
6793 context< RecoveryMachine >().log_exit(state_name, enter_time);
6794 PG *pg = context< RecoveryMachine >().pg;
6795 utime_t dur = ceph_clock_now() - enter_time;
6796 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
6797}
6798
6799PG::RecoveryState::Recovering::Recovering(my_context ctx)
6800 : my_base(ctx),
6801 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
6802{
6803 context< RecoveryMachine >().log_enter(state_name);
6804
6805 PG *pg = context< RecoveryMachine >().pg;
6806 pg->state_clear(PG_STATE_RECOVERY_WAIT);
6807 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6808 pg->state_set(PG_STATE_RECOVERING);
6809 pg->publish_stats_to_osd();
6810 pg->queue_recovery();
6811}
6812
224ce89b 6813void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7c673cae
FG
6814{
6815 PG *pg = context< RecoveryMachine >().pg;
224ce89b 6816 assert(cancel || !pg->pg_log.get_missing().have_missing());
7c673cae
FG
6817
6818 // release remote reservations
6819 for (set<pg_shard_t>::const_iterator i =
6820 context< Active >().remote_shards_to_reserve_recovery.begin();
6821 i != context< Active >().remote_shards_to_reserve_recovery.end();
6822 ++i) {
6823 if (*i == pg->pg_whoami) // skip myself
6824 continue;
6825 ConnectionRef con = pg->osd->get_con_osd_cluster(
6826 i->osd, pg->get_osdmap()->get_epoch());
6827 if (con) {
6828 pg->osd->send_message_osd_cluster(
6829 new MRecoveryReserve(
6830 MRecoveryReserve::RELEASE,
6831 spg_t(pg->info.pgid.pgid, i->shard),
6832 pg->get_osdmap()->get_epoch()),
6833 con.get());
6834 }
6835 }
6836}
6837
6838boost::statechart::result
6839PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
6840{
6841 PG *pg = context< RecoveryMachine >().pg;
6842 pg->state_clear(PG_STATE_RECOVERING);
6843 release_reservations();
6844 return transit<Recovered>();
6845}
6846
6847boost::statechart::result
6848PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
6849{
6850 PG *pg = context< RecoveryMachine >().pg;
6851 pg->state_clear(PG_STATE_RECOVERING);
6852 release_reservations();
6853 return transit<WaitRemoteBackfillReserved>();
6854}
6855
224ce89b
WB
6856boost::statechart::result
6857PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
6858{
6859 PG *pg = context< RecoveryMachine >().pg;
6860 pg->state_clear(PG_STATE_RECOVERING);
6861 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6862 release_reservations(true);
6863 pg->schedule_recovery_full_retry();
6864 return transit<NotRecovering>();
6865}
6866
7c673cae
FG
6867void PG::RecoveryState::Recovering::exit()
6868{
6869 context< RecoveryMachine >().log_exit(state_name, enter_time);
6870 PG *pg = context< RecoveryMachine >().pg;
6871 utime_t dur = ceph_clock_now() - enter_time;
6872 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
6873}
6874
6875PG::RecoveryState::Recovered::Recovered(my_context ctx)
6876 : my_base(ctx),
6877 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
6878{
6879 pg_shard_t auth_log_shard;
6880
6881 context< RecoveryMachine >().log_enter(state_name);
6882
6883 PG *pg = context< RecoveryMachine >().pg;
6884 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6885
6886 assert(!pg->needs_recovery());
6887
6888 // if we finished backfill, all acting are active; recheck if
6889 // DEGRADED | UNDERSIZED is appropriate.
6890 assert(!pg->actingbackfill.empty());
6891 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
6892 pg->actingbackfill.size()) {
6893 pg->state_clear(PG_STATE_DEGRADED);
6894 pg->publish_stats_to_osd();
6895 }
6896
6897 // trim pglog on recovered
6898 pg->trim_log();
6899
6900 // adjust acting set? (e.g. because backfill completed...)
6901 bool history_les_bound = false;
6902 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
6903 true, &history_les_bound))
6904 assert(pg->want_acting.size());
6905
6906 if (context< Active >().all_replicas_activated)
6907 post_event(GoClean());
6908}
6909
6910void PG::RecoveryState::Recovered::exit()
6911{
6912 context< RecoveryMachine >().log_exit(state_name, enter_time);
6913 PG *pg = context< RecoveryMachine >().pg;
6914 utime_t dur = ceph_clock_now() - enter_time;
6915 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
6916}
6917
6918PG::RecoveryState::Clean::Clean(my_context ctx)
6919 : my_base(ctx),
6920 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
6921{
6922 context< RecoveryMachine >().log_enter(state_name);
6923
6924 PG *pg = context< RecoveryMachine >().pg;
6925
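  // by the time we reach Clean, last_complete must have caught up to
  // last_update; anything else indicates a bug.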
6926 if (pg->info.last_complete != pg->info.last_update) {
6927 ceph_abort();
6928 }
6929 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
224ce89b
WB
6930
6931 if (pg->is_active()) {
6932 pg->mark_clean();
6933 }
7c673cae
FG
6934
6935 pg->share_pg_info();
6936 pg->publish_stats_to_osd();
224ce89b 6937 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7c673cae
FG
6938}
6939
6940void PG::RecoveryState::Clean::exit()
6941{
6942 context< RecoveryMachine >().log_exit(state_name, enter_time);
6943 PG *pg = context< RecoveryMachine >().pg;
6944 pg->state_clear(PG_STATE_CLEAN);
6945 utime_t dur = ceph_clock_now() - enter_time;
6946 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
6947}
6948
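// Build a set containing at most one shard per distinct OSD (skipping 'skip'),
// so that a given OSD is asked for only a single remote reservation.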
6949template <typename T>
6950set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
6951{
6952 set<int> osds_found;
6953 set<pg_shard_t> out;
6954 for (typename T::const_iterator i = in.begin();
6955 i != in.end();
6956 ++i) {
6957 if (*i != skip && !osds_found.count(i->osd)) {
6958 osds_found.insert(i->osd);
6959 out.insert(*i);
6960 }
6961 }
6962 return out;
6963}
6964
6965/*---------Active---------*/
6966PG::RecoveryState::Active::Active(my_context ctx)
6967 : my_base(ctx),
6968 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
6969 remote_shards_to_reserve_recovery(
6970 unique_osd_shard_set(
6971 context< RecoveryMachine >().pg->pg_whoami,
6972 context< RecoveryMachine >().pg->actingbackfill)),
6973 remote_shards_to_reserve_backfill(
6974 unique_osd_shard_set(
6975 context< RecoveryMachine >().pg->pg_whoami,
6976 context< RecoveryMachine >().pg->backfill_targets)),
6977 all_replicas_activated(false)
6978{
6979 context< RecoveryMachine >().log_enter(state_name);
6980
6981 PG *pg = context< RecoveryMachine >().pg;
6982
6983 assert(!pg->backfill_reserving);
6984 assert(!pg->backfill_reserved);
6985 assert(pg->is_primary());
6986 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
6987 pg->start_flush(
6988 context< RecoveryMachine >().get_cur_transaction(),
6989 context< RecoveryMachine >().get_on_applied_context_list(),
6990 context< RecoveryMachine >().get_on_safe_context_list());
6991 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
6992 pg->get_osdmap()->get_epoch(),
6993 *context< RecoveryMachine >().get_on_safe_context_list(),
6994 *context< RecoveryMachine >().get_query_map(),
6995 context< RecoveryMachine >().get_info_map(),
6996 context< RecoveryMachine >().get_recovery_ctx());
6997
6998 // everyone has to commit/ack before we are truly active
6999 pg->blocked_by.clear();
7000 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7001 p != pg->actingbackfill.end();
7002 ++p) {
7003 if (p->shard != pg->pg_whoami.shard) {
7004 pg->blocked_by.insert(p->shard);
7005 }
7006 }
7007 pg->publish_stats_to_osd();
7008 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7009}
7010
7011boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7012{
7013 PG *pg = context< RecoveryMachine >().pg;
7014 ldout(pg->cct, 10) << "Active advmap" << dendl;
7015 if (!pg->pool.newly_removed_snaps.empty()) {
7016 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7017 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7018 pg->dirty_info = true;
7019 pg->dirty_big_info = true;
7020 }
7021
7022 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7023 int osd = pg->want_acting[i];
7024 if (!advmap.osdmap->is_up(osd)) {
7025 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7026 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7027 }
7028 }
7029
7030 bool need_publish = false;
7031 /* Check for changes in pool size (if the acting set changed as a result,
7032 * this does not matter) */
7033 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7034 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7035 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7036 pg->state_clear(PG_STATE_UNDERSIZED);
7037 if (pg->needs_recovery()) {
7038 pg->state_set(PG_STATE_DEGRADED);
7039 } else {
7040 pg->state_clear(PG_STATE_DEGRADED);
7041 }
7042 } else {
7043 pg->state_set(PG_STATE_UNDERSIZED);
7044 pg->state_set(PG_STATE_DEGRADED);
7045 }
7046 need_publish = true; // degraded may have changed
7047 }
7048
7049 // if we haven't reported our PG stats in a long time, do so now.
7050 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7051 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7052 << " epochs" << dendl;
7053 need_publish = true;
7054 }
7055
7056 if (need_publish)
7057 pg->publish_stats_to_osd();
7058
7059 return forward_event();
7060}
7061
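// Periodic ActMap work on the primary: re-probe peers for unfound objects,
// warn about objects that appear permanently lost, kick snap trimming, and
// queue recovery/backfill unless the nobackfill/norebalance flags forbid it.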
7062boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7063{
7064 PG *pg = context< RecoveryMachine >().pg;
7065 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7066 assert(pg->is_primary());
7067
7068 if (pg->have_unfound()) {
7069 // object may have become unfound
7070 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7071 }
7072
7073 if (pg->cct->_conf->osd_check_for_log_corruption)
7074 pg->check_log_for_corruption(pg->osd->store);
7075
7076 uint64_t unfound = pg->missing_loc.num_unfound();
7077 if (unfound > 0 &&
7078 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7079 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7080 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7081 << " objects unfound and apparently lost, would automatically mark lost but NOT IMPLEMENTED";
7082 } else
7083 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost";
7084 }
7085
7086 if (pg->is_active()) {
7087 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7088 pg->kick_snap_trim();
7089 }
7090
7091 if (pg->is_peered() &&
7092 !pg->is_clean() &&
7093 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7094 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7095 pg->queue_recovery();
7096 }
7097 return forward_event();
7098}
7099
7100boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7101{
7102 PG *pg = context< RecoveryMachine >().pg;
7103 assert(pg->is_primary());
7104 if (pg->peer_info.count(notevt.from)) {
7105 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7106 << ", already have info from that osd, ignoring"
7107 << dendl;
7108 } else if (pg->peer_purged.count(notevt.from)) {
7109 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7110 << ", already purged that peer, ignoring"
7111 << dendl;
7112 } else {
7113 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7114 << ", calling proc_replica_info and discover_all_missing"
7115 << dendl;
7116 pg->proc_replica_info(
7117 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7118 if (pg->have_unfound()) {
7119 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7120 }
7121 }
7122 return discard_event();
7123}
7124
7125boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7126{
7127 PG *pg = context< RecoveryMachine >().pg;
7128 assert(pg->is_primary());
7129
7130 assert(!pg->actingbackfill.empty());
7131 // don't update history (yet) if we are active and primary; the replica
7132 // may be telling us they have activated (and committed) but we can't
7133 // share that until _everyone_ does the same.
7134 if (pg->is_actingbackfill(infoevt.from)) {
7135 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7136 << " activated and committed" << dendl;
7137 pg->peer_activated.insert(infoevt.from);
7138 pg->blocked_by.erase(infoevt.from.shard);
7139 pg->publish_stats_to_osd();
7140 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7141 pg->all_activated_and_committed();
7142 }
7143 }
7144 return discard_event();
7145}
7146
7147boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7148{
7149 PG *pg = context< RecoveryMachine >().pg;
7150 ldout(pg->cct, 10) << "searching osd." << logevt.from
7151 << " log for unfound items" << dendl;
7152 pg->proc_replica_log(
7153 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7154 bool got_missing = pg->search_for_missing(
7155 pg->peer_info[logevt.from],
7156 pg->peer_missing[logevt.from],
7157 logevt.from,
7158 context< RecoveryMachine >().get_recovery_ctx());
7159 if (pg->is_peered() &&
7160 got_missing)
7161 pg->queue_recovery();
7162 return discard_event();
7163}
7164
7165boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7166{
7167 PG *pg = context< RecoveryMachine >().pg;
7168
7169 q.f->open_object_section("state");
7170 q.f->dump_string("name", state_name);
7171 q.f->dump_stream("enter_time") << enter_time;
7172
7173 {
7174 q.f->open_array_section("might_have_unfound");
7175 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7176 p != pg->might_have_unfound.end();
7177 ++p) {
7178 q.f->open_object_section("osd");
7179 q.f->dump_stream("osd") << *p;
7180 if (pg->peer_missing.count(*p)) {
7181 q.f->dump_string("status", "already probed");
7182 } else if (pg->peer_missing_requested.count(*p)) {
7183 q.f->dump_string("status", "querying");
7184 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7185 q.f->dump_string("status", "osd is down");
7186 } else {
7187 q.f->dump_string("status", "not queried");
7188 }
7189 q.f->close_section();
7190 }
7191 q.f->close_section();
7192 }
7193 {
7194 q.f->open_object_section("recovery_progress");
7195 pg->dump_recovery_info(q.f);
7196 q.f->close_section();
7197 }
7198
7199 {
7200 q.f->open_object_section("scrub");
7201 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7202 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7203 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7204 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7205 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7206 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7207 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7208 q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7209 q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7210 {
7211 q.f->open_array_section("scrubber.waiting_on_whom");
7212 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7213 p != pg->scrubber.waiting_on_whom.end();
7214 ++p) {
7215 q.f->dump_stream("shard") << *p;
7216 }
7217 q.f->close_section();
7218 }
7219 q.f->close_section();
7220 }
7221
7222 q.f->close_section();
7223 return forward_event();
7224}
7225
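// Once every shard in actingbackfill has activated and committed, the PG
// becomes ACTIVE if the acting set meets min_size (otherwise merely PEERED),
// records last_epoch_started/last_interval_started in its history, and
// shares the updated info with its peers.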
7226boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7227{
7228 PG *pg = context< RecoveryMachine >().pg;
7229 all_replicas_activated = true;
7230
7231 pg->state_clear(PG_STATE_ACTIVATING);
7232 pg->state_clear(PG_STATE_CREATING);
7233 if (pg->acting.size() >= pg->pool.info.min_size) {
7234 pg->state_set(PG_STATE_ACTIVE);
7235 } else {
7236 pg->state_set(PG_STATE_PEERED);
7237 }
7238
7239 // info.last_epoch_started is set during activate()
7240 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7241 pg->info.history.last_interval_started = pg->info.last_interval_started;
7242 pg->dirty_info = true;
7243
7244 pg->share_pg_info();
7245 pg->publish_stats_to_osd();
7246
7247 pg->check_local();
7248
7249 // waiters
7250 if (pg->flushes_in_progress == 0) {
7251 pg->requeue_ops(pg->waiting_for_peered);
7252 }
7253
7254 pg->on_activate();
7255
7256 return discard_event();
7257}
7258
7259void PG::RecoveryState::Active::exit()
7260{
7261 context< RecoveryMachine >().log_exit(state_name, enter_time);
7262 PG *pg = context< RecoveryMachine >().pg;
7263 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7264
7265 pg->blocked_by.clear();
7266 pg->backfill_reserved = false;
7267 pg->backfill_reserving = false;
7268 pg->state_clear(PG_STATE_ACTIVATING);
7269 pg->state_clear(PG_STATE_DEGRADED);
7270 pg->state_clear(PG_STATE_UNDERSIZED);
7271 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7272 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7273 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7274 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7275 utime_t dur = ceph_clock_now() - enter_time;
7276 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7277 pg->agent_stop();
7278}
7279
7280/*------ReplicaActive-----*/
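// Replica-side counterpart of Active.  Entry is driven by the Activate event
// posted from Stray once the primary's info/log has been applied; afterwards
// the replica keeps merging the primary's info and log messages and
// re-notifies the primary whenever an ActMap arrives.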
7281PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7282 : my_base(ctx),
7283 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7284{
7285 context< RecoveryMachine >().log_enter(state_name);
7286
7287 PG *pg = context< RecoveryMachine >().pg;
7288 pg->start_flush(
7289 context< RecoveryMachine >().get_cur_transaction(),
7290 context< RecoveryMachine >().get_on_applied_context_list(),
7291 context< RecoveryMachine >().get_on_safe_context_list());
7292}
7293
7294
7295boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7296 const Activate& actevt) {
7297 PG *pg = context< RecoveryMachine >().pg;
7298 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7299 map<int, map<spg_t, pg_query_t> > query_map;
7300 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7301 actevt.activation_epoch,
7302 *context< RecoveryMachine >().get_on_safe_context_list(),
7303 query_map, NULL, NULL);
7304 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7305 return discard_event();
7306}
7307
7308boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7309{
7310 PG *pg = context< RecoveryMachine >().pg;
7311 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7312 infoevt.info);
7313 return discard_event();
7314}
7315
7316boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7317{
7318 PG *pg = context< RecoveryMachine >().pg;
7319 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7320 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7321 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7322 assert(pg->pg_log.get_head() == pg->info.last_update);
7323
7324 return discard_event();
7325}
7326
7327boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7328{
7329 PG *pg = context< RecoveryMachine >().pg;
7330 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7331 context< RecoveryMachine >().send_notify(
7332 pg->get_primary(),
7333 pg_notify_t(
7334 pg->get_primary().shard, pg->pg_whoami.shard,
7335 pg->get_osdmap()->get_epoch(),
7336 pg->get_osdmap()->get_epoch(),
7337 pg->info),
7338 pg->past_intervals);
7339 }
7340 pg->take_waiters();
7341 return discard_event();
7342}
7343
7344boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7345{
7346 PG *pg = context< RecoveryMachine >().pg;
7347 if (query.query.type == pg_query_t::MISSING) {
7348 pg->update_history(query.query.history);
7349 pg->fulfill_log(query.from, query.query, query.query_epoch);
7350 } // else: from prior to activation, safe to ignore
7351 return discard_event();
7352}
7353
7354boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7355{
7356 q.f->open_object_section("state");
7357 q.f->dump_string("name", state_name);
7358 q.f->dump_stream("enter_time") << enter_time;
7359 q.f->close_section();
7360 return forward_event();
7361}
7362
7363void PG::RecoveryState::ReplicaActive::exit()
7364{
7365 context< RecoveryMachine >().log_exit(state_name, enter_time);
7366 PG *pg = context< RecoveryMachine >().pg;
7367 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7368 utime_t dur = ceph_clock_now() - enter_time;
7369 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7370}
7371
7372/*-------Stray---*/
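// A PG replica that is not (or not yet) part of the acting set.  It answers
// the primary's INFO/LOG queries and, on receiving an authoritative info or
// log, brings its local state in line and moves to ReplicaActive via an
// Activate event.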
7373PG::RecoveryState::Stray::Stray(my_context ctx)
7374 : my_base(ctx),
7375 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7376{
7377 context< RecoveryMachine >().log_enter(state_name);
7378
7379 PG *pg = context< RecoveryMachine >().pg;
7380 assert(!pg->is_peered());
7381 assert(!pg->is_peering());
7382 assert(!pg->is_primary());
7383 pg->start_flush(
7384 context< RecoveryMachine >().get_cur_transaction(),
7385 context< RecoveryMachine >().get_on_applied_context_list(),
7386 context< RecoveryMachine >().get_on_safe_context_list());
7387}
7388
7389boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7390{
7391 PG *pg = context< RecoveryMachine >().pg;
7392 MOSDPGLog *msg = logevt.msg.get();
7393 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7394
7395 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7396 if (msg->info.last_backfill == hobject_t()) {
7397 // restart backfill
7398 pg->unreg_next_scrub();
7399 pg->info = msg->info;
7400 pg->reg_next_scrub();
7401 pg->dirty_info = true;
7402 pg->dirty_big_info = true; // maybe.
7403
7404 PGLogEntryHandler rollbacker{pg, t};
7405 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7406
7407 pg->pg_log.reset_backfill();
7408 } else {
7409 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7410 }
7411
7412 assert(pg->pg_log.get_head() == pg->info.last_update);
7413
7414 post_event(Activate(logevt.msg->info.last_epoch_started));
7415 return transit<ReplicaActive>();
7416}
7417
7418boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7419{
7420 PG *pg = context< RecoveryMachine >().pg;
7421 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7422
7423 if (pg->info.last_update > infoevt.info.last_update) {
7424 // rewind divergent log entries
7425 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7426 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7427 pg->info.stats = infoevt.info.stats;
7428 pg->info.hit_set = infoevt.info.hit_set;
7429 }
7430
7431 assert(infoevt.info.last_update == pg->info.last_update);
7432 assert(pg->pg_log.get_head() == pg->info.last_update);
7433
7434 post_event(Activate(infoevt.info.last_epoch_started));
7435 return transit<ReplicaActive>();
7436}
7437
7438boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7439{
7440 PG *pg = context< RecoveryMachine >().pg;
7441 if (query.query.type == pg_query_t::INFO) {
7442 pair<pg_shard_t, pg_info_t> notify_info;
7443 pg->update_history(query.query.history);
7444 pg->fulfill_info(query.from, query.query, notify_info);
7445 context< RecoveryMachine >().send_notify(
7446 notify_info.first,
7447 pg_notify_t(
7448 notify_info.first.shard, pg->pg_whoami.shard,
7449 query.query_epoch,
7450 pg->get_osdmap()->get_epoch(),
7451 notify_info.second),
7452 pg->past_intervals);
7453 } else {
7454 pg->fulfill_log(query.from, query.query, query.query_epoch);
7455 }
7456 return discard_event();
7457}
7458
7459boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7460{
7461 PG *pg = context< RecoveryMachine >().pg;
7462 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7463 context< RecoveryMachine >().send_notify(
7464 pg->get_primary(),
7465 pg_notify_t(
7466 pg->get_primary().shard, pg->pg_whoami.shard,
7467 pg->get_osdmap()->get_epoch(),
7468 pg->get_osdmap()->get_epoch(),
7469 pg->info),
7470 pg->past_intervals);
7471 }
7472 pg->take_waiters();
7473 return discard_event();
7474}
7475
7476void PG::RecoveryState::Stray::exit()
7477{
7478 context< RecoveryMachine >().log_exit(state_name, enter_time);
7479 PG *pg = context< RecoveryMachine >().pg;
7480 utime_t dur = ceph_clock_now() - enter_time;
7481 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7482}
7483
7484/*--------GetInfo---------*/
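// First peering stage on the primary: build the prior set and query every
// probe target for its pg_info_t.  Outstanding queries are mirrored in
// blocked_by; once all replies are in and the prior set is not pg_down, a
// GotInfo event advances peering.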
7485PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7486 : my_base(ctx),
7487 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7488{
7489 context< RecoveryMachine >().log_enter(state_name);
7490
7491 PG *pg = context< RecoveryMachine >().pg;
7492 pg->check_past_interval_bounds();
7493 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7494
7495 assert(pg->blocked_by.empty());
7496
7497 prior_set = pg->build_prior();
7498
7499 pg->reset_min_peer_features();
7500 get_infos();
7501 if (prior_set.pg_down) {
7502 post_event(IsDown());
7503 } else if (peer_info_requested.empty()) {
7504 post_event(GotInfo());
7505 }
7506}
7507
7508void PG::RecoveryState::GetInfo::get_infos()
7509{
7510 PG *pg = context< RecoveryMachine >().pg;
7511 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7512
7513 pg->blocked_by.clear();
7514 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7515 it != prior_set.probe.end();
7516 ++it) {
7517 pg_shard_t peer = *it;
7518 if (peer == pg->pg_whoami) {
7519 continue;
7520 }
7521 if (pg->peer_info.count(peer)) {
7522 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7523 continue;
7524 }
7525 if (peer_info_requested.count(peer)) {
7526 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7527 pg->blocked_by.insert(peer.osd);
7528 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7529 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7530 } else {
7531 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7532 context< RecoveryMachine >().send_query(
7533 peer, pg_query_t(pg_query_t::INFO,
7534 it->shard, pg->pg_whoami.shard,
7535 pg->info.history,
7536 pg->get_osdmap()->get_epoch()));
7537 peer_info_requested.insert(peer);
7538 pg->blocked_by.insert(peer.osd);
7539 }
7540 }
7541
7542 pg->publish_stats_to_osd();
7543}
7544
7545boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
7546{
7547 PG *pg = context< RecoveryMachine >().pg;
7548
7549 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7550 if (p != peer_info_requested.end()) {
7551 peer_info_requested.erase(p);
7552 pg->blocked_by.erase(infoevt.from.osd);
7553 }
7554
7555 epoch_t old_start = pg->info.history.last_epoch_started;
7556 if (pg->proc_replica_info(
7557 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7558 // we got something new ...
7559 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7560 if (old_start < pg->info.history.last_epoch_started) {
7561 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7562 prior_set = pg->build_prior();
7563
7564 // filter out any osds that got dropped from the probe set from
7565 // peer_info_requested. this is less expensive than restarting
7566 // peering (which would re-probe everyone).
7567 set<pg_shard_t>::iterator p = peer_info_requested.begin();
7568 while (p != peer_info_requested.end()) {
7569 if (prior_set.probe.count(*p) == 0) {
7570 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7571 peer_info_requested.erase(p++);
7572 } else {
7573 ++p;
7574 }
7575 }
7576 get_infos();
7577 }
7578 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7579 << hex << infoevt.features << dec << dendl;
7580 pg->apply_peer_features(infoevt.features);
7581
7582 // are we done getting everything?
7583 if (peer_info_requested.empty() && !prior_set.pg_down) {
7584 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7585 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7586 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7587 post_event(GotInfo());
7588 }
7589 }
7590 return discard_event();
7591}
7592
7593boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7594{
7595 PG *pg = context< RecoveryMachine >().pg;
7596 q.f->open_object_section("state");
7597 q.f->dump_string("name", state_name);
7598 q.f->dump_stream("enter_time") << enter_time;
7599
7600 q.f->open_array_section("requested_info_from");
7601 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7602 p != peer_info_requested.end();
7603 ++p) {
7604 q.f->open_object_section("osd");
7605 q.f->dump_stream("osd") << *p;
7606 if (pg->peer_info.count(*p)) {
7607 q.f->open_object_section("got_info");
7608 pg->peer_info[*p].dump(q.f);
7609 q.f->close_section();
7610 }
7611 q.f->close_section();
7612 }
7613 q.f->close_section();
7614
7615 q.f->close_section();
7616 return forward_event();
7617}
7618
7619void PG::RecoveryState::GetInfo::exit()
7620{
7621 context< RecoveryMachine >().log_exit(state_name, enter_time);
7622 PG *pg = context< RecoveryMachine >().pg;
7623 utime_t dur = ceph_clock_now() - enter_time;
7624 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7625 pg->blocked_by.clear();
7626 pg->publish_stats_to_osd();
7627}
7628
7629/*------GetLog------------*/
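// Second peering stage: choose_acting() picks the acting set and the
// authoritative log shard.  If another osd holds the authoritative log we
// request it (back to the oldest peer last_update that still overlaps that
// log) and, once it arrives, merge it via proc_master_log() before moving on
// to GetMissing.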
7630PG::RecoveryState::GetLog::GetLog(my_context ctx)
7631 : my_base(ctx),
7632 NamedState(
7633 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7634 msg(0)
7635{
7636 context< RecoveryMachine >().log_enter(state_name);
7637
7638 PG *pg = context< RecoveryMachine >().pg;
7639
7640 // adjust acting?
7641 if (!pg->choose_acting(auth_log_shard, false,
7642 &context< Peering >().history_les_bound)) {
7643 if (!pg->want_acting.empty()) {
7644 post_event(NeedActingChange());
7645 } else {
7646 post_event(IsIncomplete());
7647 }
7648 return;
7649 }
7650
7651 // am i the best?
7652 if (auth_log_shard == pg->pg_whoami) {
7653 post_event(GotLog());
7654 return;
7655 }
7656
7657 const pg_info_t& best = pg->peer_info[auth_log_shard];
7658
7659 // am i broken?
7660 if (pg->info.last_update < best.log_tail) {
7661 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7662 post_event(IsIncomplete());
7663 return;
7664 }
7665
7666 // how much log to request?
7667 eversion_t request_log_from = pg->info.last_update;
7668 assert(!pg->actingbackfill.empty());
7669 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7670 p != pg->actingbackfill.end();
7671 ++p) {
7672 if (*p == pg->pg_whoami) continue;
7673 pg_info_t& ri = pg->peer_info[*p];
7674 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7675 ri.last_update < request_log_from)
7676 request_log_from = ri.last_update;
7677 }
7678
7679 // request it from the auth log shard
7680 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7681 context<RecoveryMachine>().send_query(
7682 auth_log_shard,
7683 pg_query_t(
7684 pg_query_t::LOG,
7685 auth_log_shard.shard, pg->pg_whoami.shard,
7686 request_log_from, pg->info.history,
7687 pg->get_osdmap()->get_epoch()));
7688
7689 assert(pg->blocked_by.empty());
7690 pg->blocked_by.insert(auth_log_shard.osd);
7691 pg->publish_stats_to_osd();
7692}
7693
7694boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7695{
7696 PG *pg = context< RecoveryMachine >().pg;
7697 // make sure our log source didn't go down. we need to check
7698 // explicitly because it may not be part of the prior set, which
7699 // means the Peering state check won't catch it going down.
7700 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7701 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7702 << auth_log_shard.osd << " went down" << dendl;
7703 post_event(advmap);
7704 return transit< Reset >();
7705 }
7706
7707 // let the Peering state do its checks.
7708 return forward_event();
7709}
7710
7711boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7712{
7713 PG *pg = context< RecoveryMachine >().pg;
7714 assert(!msg);
7715 if (logevt.from != auth_log_shard) {
7716 ldout(pg->cct, 10) << "GetLog: discarding log from "
7717 << "non-auth_log_shard osd." << logevt.from << dendl;
7718 return discard_event();
7719 }
7720 ldout(pg->cct, 10) << "GetLog: received master log from osd."
7721 << logevt.from << dendl;
7722 msg = logevt.msg;
7723 post_event(GotLog());
7724 return discard_event();
7725}
7726
7727boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
7728{
7729 PG *pg = context< RecoveryMachine >().pg;
7730 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
7731 if (msg) {
7732 ldout(pg->cct, 10) << "processing master log" << dendl;
7733 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
7734 msg->info, msg->log, msg->missing,
7735 auth_log_shard);
7736 }
7737 pg->start_flush(
7738 context< RecoveryMachine >().get_cur_transaction(),
7739 context< RecoveryMachine >().get_on_applied_context_list(),
7740 context< RecoveryMachine >().get_on_safe_context_list());
7741 return transit< GetMissing >();
7742}
7743
7744boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
7745{
7746 q.f->open_object_section("state");
7747 q.f->dump_string("name", state_name);
7748 q.f->dump_stream("enter_time") << enter_time;
7749 q.f->dump_stream("auth_log_shard") << auth_log_shard;
7750 q.f->close_section();
7751 return forward_event();
7752}
7753
7754void PG::RecoveryState::GetLog::exit()
7755{
7756 context< RecoveryMachine >().log_exit(state_name, enter_time);
7757 PG *pg = context< RecoveryMachine >().pg;
7758 utime_t dur = ceph_clock_now() - enter_time;
7759 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
7760 pg->blocked_by.clear();
7761 pg->publish_stats_to_osd();
7762}
7763
7764/*------WaitActingChange--------*/
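// Entered when choose_acting() wants an acting set different from the
// current one.  Peering messages are discarded here; we reset if one of the
// wanted osds goes down, and otherwise simply wait for an osdmap in which
// the acting set has changed.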
7765PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
7766 : my_base(ctx),
7767 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
7768{
7769 context< RecoveryMachine >().log_enter(state_name);
7770}
7771
7772boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
7773{
7774 PG *pg = context< RecoveryMachine >().pg;
7775 OSDMapRef osdmap = advmap.osdmap;
7776
7777 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets went down" << dendl;
7778 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
7779 if (!osdmap->is_up(*p)) {
7780 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7781 post_event(advmap);
7782 return transit< Reset >();
7783 }
7784 }
7785 return forward_event();
7786}
7787
7788boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
7789{
7790 PG *pg = context< RecoveryMachine >().pg;
7791 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
7792 return discard_event();
7793}
7794
7795boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
7796{
7797 PG *pg = context< RecoveryMachine >().pg;
7798 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7799 return discard_event();
7800}
7801
7802boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
7803{
7804 PG *pg = context< RecoveryMachine >().pg;
7805 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7806 return discard_event();
7807}
7808
7809boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
7810{
7811 q.f->open_object_section("state");
7812 q.f->dump_string("name", state_name);
7813 q.f->dump_stream("enter_time") << enter_time;
7814 q.f->dump_string("comment", "waiting for pg acting set to change");
7815 q.f->close_section();
7816 return forward_event();
7817}
7818
7819void PG::RecoveryState::WaitActingChange::exit()
7820{
7821 context< RecoveryMachine >().log_exit(state_name, enter_time);
7822 PG *pg = context< RecoveryMachine >().pg;
7823 utime_t dur = ceph_clock_now() - enter_time;
7824 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
7825}
7826
7827/*------Down--------*/
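// Peering cannot proceed: osds from prior intervals that we need are down.
// PG_STATE_DOWN is set and the down osds from the prior set are published
// via blocked_by.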
7828PG::RecoveryState::Down::Down(my_context ctx)
7829 : my_base(ctx),
7830 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
7831{
7832 context< RecoveryMachine >().log_enter(state_name);
7833 PG *pg = context< RecoveryMachine >().pg;
7834
7835 pg->state_clear(PG_STATE_PEERING);
7836 pg->state_set(PG_STATE_DOWN);
7837
7838 auto &prior_set = context< Peering >().prior_set;
7839 assert(pg->blocked_by.empty());
7840 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7841 pg->publish_stats_to_osd();
7842}
7843
7844void PG::RecoveryState::Down::exit()
7845{
7846 context< RecoveryMachine >().log_exit(state_name, enter_time);
7847 PG *pg = context< RecoveryMachine >().pg;
7848
7849 pg->state_clear(PG_STATE_DOWN);
7850 utime_t dur = ceph_clock_now() - enter_time;
7851 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
7852
7853 pg->blocked_by.clear();
7854 pg->publish_stats_to_osd();
7855}
7856
7857boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
7858{
7859 q.f->open_object_section("state");
7860 q.f->dump_string("name", state_name);
7861 q.f->dump_stream("enter_time") << enter_time;
7862 q.f->dump_string("comment",
7863 "not enough up instances of this PG to go active");
7864 q.f->close_section();
7865 return forward_event();
7866}
7867
7868/*------Incomplete--------*/
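// Peering found no usable, complete source for the PG's contents.
// PG_STATE_INCOMPLETE is set and the down osds from the prior set are
// published via blocked_by; new replica info (MNotifyRec) sends us back to
// GetLog, and a lowered pool min_size triggers a full Reset.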
7869PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
7870 : my_base(ctx),
7871 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
7872{
7873 context< RecoveryMachine >().log_enter(state_name);
7874 PG *pg = context< RecoveryMachine >().pg;
7875
7876 pg->state_clear(PG_STATE_PEERING);
7877 pg->state_set(PG_STATE_INCOMPLETE);
7878
7879 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7880 assert(pg->blocked_by.empty());
7881 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7882 pg->publish_stats_to_osd();
7883}
7884
7885boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
7886 PG *pg = context< RecoveryMachine >().pg;
7887 int64_t poolnum = pg->info.pgid.pool();
7888
7889 // Reset if min_size became smaller than the previous value; the pg might now be able to go active
7890 if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7891 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7892 post_event(advmap);
7893 return transit< Reset >();
7894 }
7895
7896 return forward_event();
7897}
7898
7899boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
7900 PG *pg = context< RecoveryMachine >().pg;
7901 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7902 if (pg->proc_replica_info(
7903 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7904 // We got something new, try again!
7905 return transit< GetLog >();
7906 } else {
7907 return discard_event();
7908 }
7909}
7910
7911boost::statechart::result PG::RecoveryState::Incomplete::react(
7912 const QueryState& q)
7913{
7914 q.f->open_object_section("state");
7915 q.f->dump_string("name", state_name);
7916 q.f->dump_stream("enter_time") << enter_time;
7917 q.f->dump_string("comment", "not enough complete instances of this PG");
7918 q.f->close_section();
7919 return forward_event();
7920}
7921
7922void PG::RecoveryState::Incomplete::exit()
7923{
7924 context< RecoveryMachine >().log_exit(state_name, enter_time);
7925 PG *pg = context< RecoveryMachine >().pg;
7926
7927 pg->state_clear(PG_STATE_INCOMPLETE);
7928 utime_t dur = ceph_clock_now() - enter_time;
7929 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
7930
7931 pg->blocked_by.clear();
7932 pg->publish_stats_to_osd();
7933}
7934
7935/*------GetMissing--------*/
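// Third peering stage: for each actingbackfill peer whose missing set cannot
// be inferred locally, request log+missing since its last_epoch_started (or
// the peer's full log if its log does not reach back that far).  When all
// replies are in we either wait for up_thru (NeedUpThru) or post Activate.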
7936PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
7937 : my_base(ctx),
7938 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
7939{
7940 context< RecoveryMachine >().log_enter(state_name);
7941
7942 PG *pg = context< RecoveryMachine >().pg;
7943 assert(!pg->actingbackfill.empty());
7944 eversion_t since;
7945 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
7946 i != pg->actingbackfill.end();
7947 ++i) {
7948 if (*i == pg->get_primary()) continue;
7949 const pg_info_t& pi = pg->peer_info[*i];
7950
7951 if (pi.is_empty())
7952 continue; // no pg data, nothing divergent
7953
7954 if (pi.last_update < pg->pg_log.get_tail()) {
7955 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
7956 pg->peer_missing[*i];
7957 continue;
7958 }
7959 if (pi.last_backfill == hobject_t()) {
7960 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
7961 pg->peer_missing[*i];
7962 continue;
7963 }
7964
7965 if (pi.last_update == pi.last_complete && // peer has no missing
7966 pi.last_update == pg->info.last_update) { // peer is up to date
7967 // replica has no missing and identical log as us. no need to
7968 // pull anything.
7969 // FIXME: we can do better here. if last_update==last_complete we
7970 // can infer the rest!
7971 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
7972 pg->peer_missing[*i];
7973 continue;
7974 }
7975
7976 // We pull the log from the peer's last_epoch_started to ensure we
7977 // get enough log to detect divergent updates.
7978 since.epoch = pi.last_epoch_started;
7979 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
7980 if (pi.log_tail <= since) {
7981 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
7982 context< RecoveryMachine >().send_query(
7983 *i,
7984 pg_query_t(
7985 pg_query_t::LOG,
7986 i->shard, pg->pg_whoami.shard,
7987 since, pg->info.history,
7988 pg->get_osdmap()->get_epoch()));
7989 } else {
7990 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
7991 << " (want since " << since << " < log.tail "
7992 << pi.log_tail << ")" << dendl;
7993 context< RecoveryMachine >().send_query(
7994 *i, pg_query_t(
7995 pg_query_t::FULLLOG,
7996 i->shard, pg->pg_whoami.shard,
7997 pg->info.history, pg->get_osdmap()->get_epoch()));
7998 }
7999 peer_missing_requested.insert(*i);
8000 pg->blocked_by.insert(i->osd);
8001 }
8002
8003 if (peer_missing_requested.empty()) {
8004 if (pg->need_up_thru) {
8005 ldout(pg->cct, 10) << " still need up_thru update before going active"
8006 << dendl;
8007 post_event(NeedUpThru());
8008 return;
8009 }
8010
8011 // all good!
8012 post_event(Activate(pg->get_osdmap()->get_epoch()));
8013 } else {
8014 pg->publish_stats_to_osd();
8015 }
8016}
8017
8018boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8019{
8020 PG *pg = context< RecoveryMachine >().pg;
8021
8022 peer_missing_requested.erase(logevt.from);
8023 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8024
8025 if (peer_missing_requested.empty()) {
8026 if (pg->need_up_thru) {
8027 ldout(pg->cct, 10) << " still need up_thru update before going active"
8028 << dendl;
8029 post_event(NeedUpThru());
8030 } else {
8031 ldout(pg->cct, 10) << "Got last missing, don't need missing; "
8032 << "posting Activate" << dendl;
8033 post_event(Activate(pg->get_osdmap()->get_epoch()));
8034 }
8035 }
8036 return discard_event();
8037}
8038
8039boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8040{
8041 PG *pg = context< RecoveryMachine >().pg;
8042 q.f->open_object_section("state");
8043 q.f->dump_string("name", state_name);
8044 q.f->dump_stream("enter_time") << enter_time;
8045
8046 q.f->open_array_section("peer_missing_requested");
8047 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8048 p != peer_missing_requested.end();
8049 ++p) {
8050 q.f->open_object_section("osd");
8051 q.f->dump_stream("osd") << *p;
8052 if (pg->peer_missing.count(*p)) {
8053 q.f->open_object_section("got_missing");
8054 pg->peer_missing[*p].dump(q.f);
8055 q.f->close_section();
8056 }
8057 q.f->close_section();
8058 }
8059 q.f->close_section();
8060
8061 q.f->close_section();
8062 return forward_event();
8063}
8064
8065void PG::RecoveryState::GetMissing::exit()
8066{
8067 context< RecoveryMachine >().log_exit(state_name, enter_time);
8068 PG *pg = context< RecoveryMachine >().pg;
8069 utime_t dur = ceph_clock_now() - enter_time;
8070 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8071 pg->blocked_by.clear();
8072 pg->publish_stats_to_osd();
8073}
8074
8075/*------WaitUpThru--------*/
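// We cannot finish activating until the osdmap records a new up_thru for
// this osd; ActMap events re-check need_up_thru and post Activate once it
// clears, while any log/missing that arrives in the meantime is stashed in
// peer_missing/peer_info.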
8076PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8077 : my_base(ctx),
8078 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8079{
8080 context< RecoveryMachine >().log_enter(state_name);
8081}
8082
8083boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8084{
8085 PG *pg = context< RecoveryMachine >().pg;
8086 if (!pg->need_up_thru) {
8087 post_event(Activate(pg->get_osdmap()->get_epoch()));
8088 }
8089 return forward_event();
8090}
8091
8092boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8093{
8094 PG *pg = context< RecoveryMachine >().pg;
8095 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8096 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8097 pg->peer_info[logevt.from] = logevt.msg->info;
8098 return discard_event();
8099}
8100
8101boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8102{
8103 q.f->open_object_section("state");
8104 q.f->dump_string("name", state_name);
8105 q.f->dump_stream("enter_time") << enter_time;
8106 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8107 q.f->close_section();
8108 return forward_event();
8109}
8110
8111void PG::RecoveryState::WaitUpThru::exit()
8112{
8113 context< RecoveryMachine >().log_exit(state_name, enter_time);
8114 PG *pg = context< RecoveryMachine >().pg;
8115 utime_t dur = ceph_clock_now() - enter_time;
8116 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8117}
8118
8119/*----RecoveryState::RecoveryMachine Methods-----*/
8120#undef dout_prefix
8121#define dout_prefix *_dout << pg->gen_prefix()
8122
8123void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8124{
8125 PG *pg = context< RecoveryMachine >().pg;
8126 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8127 pg->osd->pg_recovery_stats.log_enter(state_name);
8128}
8129
8130void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8131{
8132 utime_t dur = ceph_clock_now() - enter_time;
8133 PG *pg = context< RecoveryMachine >().pg;
8134 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8135 pg->osd->pg_recovery_stats.log_exit(state_name, dur, event_count, event_time);
8137 event_count = 0;
8138 event_time = utime_t();
8139}
8140
8141
8142/*---------------------------------------------------*/
8143#undef dout_prefix
8144#define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8145
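// RecoveryCtx plumbing: start_handle()/end_handle() bracket each event
// delivered to the state machine and accumulate per-event timing.  While
// begin_block_outgoing() is in effect, outgoing messages are buffered in
// messages_pending_flush instead of the caller's context; they are later
// either dropped (clear_blocked_outgoing) or folded back into the original
// context (end_block_outgoing).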
8146void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8147 assert(!rctx);
8148 assert(!orig_ctx);
8149 orig_ctx = new_ctx;
8150 if (new_ctx) {
8151 if (messages_pending_flush) {
8152 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8153 } else {
8154 rctx = *new_ctx;
8155 }
8156 rctx->start_time = ceph_clock_now();
8157 }
8158}
8159
8160void PG::RecoveryState::begin_block_outgoing() {
8161 assert(!messages_pending_flush);
8162 assert(orig_ctx);
8163 assert(rctx);
8164 messages_pending_flush = BufferedRecoveryMessages();
8165 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8166}
8167
8168void PG::RecoveryState::clear_blocked_outgoing() {
8169 assert(orig_ctx);
8170 assert(rctx);
8171 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8172}
8173
8174void PG::RecoveryState::end_block_outgoing() {
8175 assert(messages_pending_flush);
8176 assert(orig_ctx);
8177 assert(rctx);
8178
8179 rctx = RecoveryCtx(*orig_ctx);
8180 rctx->accept_buffered_messages(*messages_pending_flush);
8181 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8182}
8183
8184void PG::RecoveryState::end_handle() {
8185 if (rctx) {
8186 utime_t dur = ceph_clock_now() - rctx->start_time;
8187 machine.event_time += dur;
8188 }
8189
8190 machine.event_count++;
8191 rctx = boost::optional<RecoveryCtx>();
8192 orig_ctx = NULL;
8193}
8194
8195ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8196{
8197 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8198 << " " << bi.objects.size() << " objects";
8199 if (!bi.objects.empty())
8200 out << " " << bi.objects;
8201 out << ")";
8202 return out;
8203}
8204
8205void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8206void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8207
8208#ifdef PG_DEBUG_REFS
8209 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8210 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8211#endif