// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "PG.h"
// #include "msg/Messenger.h"
#include "messages/MOSDRepScrub.h"
// #include "common/cmdparse.h"
// #include "common/ceph_context.h"

#include "common/errno.h"
#include "common/config.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"

#include "common/Timer.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDPGNotify.h"
// #include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDRepScrubMap.h"
#include "messages/MOSDPGRecoveryDelete.h"
#include "messages/MOSDPGRecoveryDeleteReply.h"

#include "common/BackTrace.h"
#include "common/EventTrace.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/pg.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#include <sstream>

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)

// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
// easily skip them
const string infover_key("_infover");
const string info_key("_info");
const string biginfo_key("_biginfo");
const string epoch_key("_epoch");
const string fastinfo_key("_fastinfo");

template <class T>
static ostream& _prefix(std::ostream *_dout, T *t)
{
  return *_dout << t->gen_prefix();
}

MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);

void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
{
  // Ignore trimming state machine for now
  if (::strstr(state, "Trimming") != NULL) {
    return;
  } else if (pi != nullptr) {
    pi->enter_state(entime, state);
  } else {
    // Store current state since we can't reliably take the PG lock here
    if (tmppi == nullptr) {
      tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
    }

    thispg = pg;
    tmppi->enter_state(entime, state);
  }
}

void PGStateHistory::exit(const char* state) {
  // Ignore trimming state machine for now
  // Do nothing if PG is being destroyed!
  if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
    return;
  } else {
    bool ilocked = false;
    if (!thispg->is_locked()) {
      thispg->lock();
      ilocked = true;
    }
    if (pi == nullptr) {
      buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
      pi = buffer.back().get();
      pi->setepoch(thispg->get_osdmap()->get_epoch());
    }

    pi->exit_state(ceph_clock_now());
    if (::strcmp(state, "Reset") == 0) {
      this->reset();
    }
    if (ilocked) {
      thispg->unlock();
    }
  }
}

void PGStateHistory::dump(Formatter* f) const {
  f->open_array_section("history");
  for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
    f->open_object_section("states");
    f->dump_stream("epoch") << (*pi)->this_epoch;
    for (auto she : (*pi)->state_history) {
      f->dump_string("state", std::get<2>(she));
      f->dump_stream("enter") << std::get<0>(she);
      f->dump_stream("exit") << std::get<1>(she);
    }
    f->close_section();
  }
  f->close_section();
}
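
/* Note (added for clarity, not in upstream): with a JSON formatter the
 * dump above produces one "states" object per recorded PGStateInstance,
 * e.g. roughly
 *   {"history": [{"epoch": 123, "state": "Started/Primary/Active",
 *                 "enter": "<utime>", "exit": "<utime>"}, ...]}
 * with one state/enter/exit triple per entry in state_history.
 */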

void PG::get(const char* tag)
{
  ref++;
#ifdef PG_DEBUG_REFS
  Mutex::Locker l(_ref_id_lock);
  _tag_counts[tag]++;
#endif
}

void PG::put(const char* tag)
{
#ifdef PG_DEBUG_REFS
  {
    Mutex::Locker l(_ref_id_lock);
    auto tag_counts_entry = _tag_counts.find(tag);
    assert(tag_counts_entry != _tag_counts.end());
    --tag_counts_entry->second;
    if (tag_counts_entry->second == 0) {
      _tag_counts.erase(tag_counts_entry);
    }
  }
#endif
  if (--ref == 0)
    delete this;
}

#ifdef PG_DEBUG_REFS
uint64_t PG::get_with_id()
{
  ref++;
  Mutex::Locker l(_ref_id_lock);
  uint64_t id = ++_ref_id;
  BackTrace bt(0);
  stringstream ss;
  bt.print(ss);
  dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
  assert(!_live_ids.count(id));
  _live_ids.insert(make_pair(id, ss.str()));
  return id;
}

void PG::put_with_id(uint64_t id)
{
  dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
  {
    Mutex::Locker l(_ref_id_lock);
    assert(_live_ids.count(id));
    _live_ids.erase(id);
  }
  if (--ref == 0)
    delete this;
}

void PG::dump_live_ids()
{
  Mutex::Locker l(_ref_id_lock);
  dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
  for (map<uint64_t, string>::iterator i = _live_ids.begin();
       i != _live_ids.end();
       ++i) {
    dout(0) << "\t\tid: " << *i << dendl;
  }
  dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
  for (map<string, uint64_t>::iterator i = _tag_counts.begin();
       i != _tag_counts.end();
       ++i) {
    dout(0) << "\t\tid: " << *i << dendl;
  }
}
#endif


void PGPool::update(OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  assert(pi);
  info = *pi;
  auid = pi->auid;
  name = map->get_pool_name(id);
  bool updated = false;
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
    pi->build_removed_snaps(newly_removed_snaps);
    interval_set<snapid_t> intersection;
    intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
    if (intersection == cached_removed_snaps) {
      newly_removed_snaps.subtract(cached_removed_snaps);
      cached_removed_snaps.union_of(newly_removed_snaps);
    } else {
      lgeneric_subdout(cct, osd, 0) << __func__
        << " cached_removed_snaps shrank from " << cached_removed_snaps
        << " to " << newly_removed_snaps << dendl;
      cached_removed_snaps = newly_removed_snaps;
      newly_removed_snaps.clear();
    }
    snapc = pi->get_snap_context();
  } else {
    /* Since the if branch was not taken, both of the following hold:
     *   1) map->get_epoch() == cached_epoch + 1
     *   2) pi->get_snap_epoch() != map->get_epoch()
     *
     * From 2, we know that this map didn't change the set of removed
     * snaps.  From 1, we know that our cached_removed_snaps matches the
     * previous map.  Thus cached_removed_snaps matches the current set
     * of removed snaps, and all we have to do is clear
     * newly_removed_snaps.
     */
    newly_removed_snaps.clear();
  }
  cached_epoch = map->get_epoch();
  lgeneric_subdout(cct, osd, 20)
    << "PGPool::update cached_removed_snaps "
    << cached_removed_snaps
    << " newly_removed_snaps "
    << newly_removed_snaps
    << " snapc " << snapc
    << (updated ? " (updated)" : " (no change)")
    << dendl;
}
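
/* Illustrative example (added comment, not upstream): suppose
 * cached_removed_snaps = [1~2] (snaps 1 and 2) and the new map's pool
 * has additionally removed snap 5, so build_removed_snaps() yields
 * [1~2,5~1].  The intersection with the cache is [1~2], which equals
 * cached_removed_snaps, so we subtract it, leaving newly_removed_snaps
 * = [5~1], and union that back into the cache.  If instead the
 * intersection were smaller than the cache, the removed-snap set
 * "shrank", which is unexpected; we log it at level 0 and rebuild the
 * cache from scratch.
 */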

PG::PG(OSDService *o, OSDMapRef curmap,
       const PGPool &_pool, spg_t p) :
  osd(o),
  cct(o->cct),
  osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
  snap_mapper(
    cct,
    &osdriver,
    p.ps(),
    p.get_split_bits(curmap->get_pg_num(_pool.id)),
    _pool.id,
    p.shard),
  osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
  _lock("PG::_lock"),
#ifdef PG_DEBUG_REFS
  _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
#endif
  deleting(false),
  trace_endpoint("0.0.0.0", 0, "PG"),
  dirty_info(false), dirty_big_info(false),
  info(p),
  info_struct_v(0),
  coll(p),
  pg_log(cct),
  pgmeta_oid(p.make_pgmeta_oid()),
  missing_loc(this),
  past_intervals(
    curmap->get_pools().at(p.pgid.pool()).ec_pool(),
    *curmap),
  stat_queue_item(this),
  scrub_queued(false),
  recovery_queued(false),
  recovery_ops_active(0),
  role(-1),
  state(0),
  send_notify(false),
  pg_whoami(osd->whoami, p.shard),
  need_up_thru(false),
  last_peering_reset(0),
  heartbeat_peer_lock("PG::heartbeat_peer_lock"),
  backfill_reserved(false),
  backfill_reserving(false),
  flushes_in_progress(0),
  pg_stats_publish_lock("PG::pg_stats_publish_lock"),
  pg_stats_publish_valid(false),
  osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
  finish_sync_event(NULL),
  backoff_lock("PG::backoff_lock"),
  scrub_after_recovery(false),
  active_pushes(0),
  recovery_state(this),
  pg_id(p),
  peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
  acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
  upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
  last_epoch(0)
{
#ifdef PG_DEBUG_REFS
  osd->add_pgid(p, this);
#endif
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "PG " << info.pgid;
  trace_endpoint.copy_name(ss.str());
#endif
  osr->shard_hint = p;
}

PG::~PG()
{
  pgstate_history.set_pg_in_destructor();
#ifdef PG_DEBUG_REFS
  osd->remove_pgid(info.pgid, this);
#endif
}

void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
{
  handle.suspend_tp_timeout();
  lock();
  handle.reset_tp_timeout();
}

void PG::lock(bool no_lockdep) const
{
  _lock.Lock(no_lockdep);
  // if we have unrecorded dirty state with the lock dropped, there is a bug
  assert(!dirty_info);
  assert(!dirty_big_info);

  dout(30) << "lock" << dendl;
}

std::string PG::gen_prefix() const
{
  stringstream out;
  OSDMapRef mapref = osdmap_ref;
  if (_lock.is_locked_by_me()) {
    out << "osd." << osd->whoami
        << " pg_epoch: " << (mapref ? mapref->get_epoch() : 0)
        << " " << *this << " ";
  } else {
    out << "osd." << osd->whoami
        << " pg_epoch: " << (mapref ? mapref->get_epoch() : 0)
        << " pg[" << info.pgid << "(unlocked)] ";
  }
  return out.str();
}

/********* PG **********/

void PG::proc_master_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo,
  pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
{
  dout(10) << "proc_master_log for osd." << from << ": "
           << olog << " " << omissing << dendl;
  assert(!is_peered() && is_primary());

  // merge log into our own log to build master log.  no need to
  // make any adjustments to their missing map; we are taking their
  // log to be authoritative (i.e., their entries are by definition
  // non-divergent).
  merge_log(t, oinfo, olog, from);
  peer_info[from] = oinfo;
  dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  // See doc/dev/osd_internals/last_epoch_started
  if (oinfo.last_epoch_started > info.last_epoch_started) {
    info.last_epoch_started = oinfo.last_epoch_started;
    dirty_info = true;
  }
  if (oinfo.last_interval_started > info.last_interval_started) {
    info.last_interval_started = oinfo.last_interval_started;
    dirty_info = true;
  }
  update_history(oinfo.history);
  assert(cct->_conf->osd_find_best_info_ignore_history_les ||
         info.last_epoch_started >= info.history.last_epoch_started);

  peer_missing[from].claim(omissing);
}

void PG::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from)
{
  dout(10) << "proc_replica_log for osd." << from << ": "
           << oinfo << " " << olog << " " << omissing << dendl;

  pg_log.proc_replica_log(oinfo, olog, omissing, from);

  peer_info[from] = oinfo;
  dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  for (map<hobject_t, pg_missing_item>::const_iterator i =
         omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " after missing " << i->first << " need " << i->second.need
             << " have " << i->second.have << dendl;
  }
  peer_missing[from].claim(omissing);
}

bool PG::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    dout(10) << " got info " << oinfo << " from down osd." << from
             << " discarding" << dendl;
    return false;
  }

  dout(10) << " got osd." << from << " " << oinfo << dendl;
  assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray?
  if (!is_up(from) && !is_acting(from)) {
    dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}

void PG::remove_snap_mapped_object(
  ObjectStore::Transaction &t, const hobject_t &soid)
{
  t.remove(
    coll,
    ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
  clear_object_snap_mapping(&t, soid);
}

void PG::clear_object_snap_mapping(
  ObjectStore::Transaction *t, const hobject_t &soid)
{
  OSDriver::OSTransaction _t(osdriver.get_transaction(t));
  if (soid.snap < CEPH_MAXSNAP) {
    int r = snap_mapper.remove_oid(
      soid,
      &_t);
    if (!(r == 0 || r == -ENOENT)) {
      derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
      ceph_abort();
    }
  }
}

void PG::update_object_snap_mapping(
  ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
{
  OSDriver::OSTransaction _t(osdriver.get_transaction(t));
  assert(soid.snap < CEPH_MAXSNAP);
  int r = snap_mapper.remove_oid(
    soid,
    &_t);
  if (!(r == 0 || r == -ENOENT)) {
    derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  snap_mapper.add_oid(
    soid,
    snaps,
    &_t);
}

void PG::merge_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
{
  PGLogEntryHandler rollbacker{this, &t};
  pg_log.merge_log(
    oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
}

void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
{
  PGLogEntryHandler rollbacker{this, &t};
  pg_log.rewind_divergent_log(
    newhead, info, &rollbacker, dirty_info, dirty_big_info);
}

/*
 * Process information from a replica to determine if it could have any
 * objects that I need.
 *
 * TODO: if the missing set becomes very large, this could get expensive.
 * Instead, we probably want to just iterate over our unfound set.
 */
bool PG::search_for_missing(
  const pg_info_t &oinfo, const pg_missing_t &omissing,
  pg_shard_t from,
  RecoveryCtx *ctx)
{
  uint64_t num_unfound_before = missing_loc.num_unfound();
  bool found_missing = missing_loc.add_source_info(
    from, oinfo, omissing, ctx->handle);
  if (found_missing && num_unfound_before != missing_loc.num_unfound())
    publish_stats_to_osd();
  if (found_missing &&
      (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES)) {
    pg_info_t tinfo(oinfo);
    tinfo.pgid.shard = pg_whoami.shard;
    (*(ctx->info_map))[from.osd].push_back(
      make_pair(
        pg_notify_t(
          from.shard, pg_whoami.shard,
          get_osdmap()->get_epoch(),
          get_osdmap()->get_epoch(),
          tinfo),
        past_intervals));
  }
  return found_missing;
}

bool PG::MissingLoc::readable_with_acting(
  const hobject_t &hoid,
  const set<pg_shard_t> &acting) const {
  if (!needs_recovery(hoid))
    return true;
  if (is_deleted(hoid))
    return false;
  auto missing_loc_entry = missing_loc.find(hoid);
  if (missing_loc_entry == missing_loc.end())
    return false;
  const set<pg_shard_t> &locs = missing_loc_entry->second;
  ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
  set<pg_shard_t> have_acting;
  for (set<pg_shard_t>::const_iterator i = locs.begin();
       i != locs.end();
       ++i) {
    if (acting.count(*i))
      have_acting.insert(*i);
  }
  return (*is_readable)(have_acting);
}
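
/* Note (added for clarity, not upstream): is_readable is the
 * IsPGReadablePredicate supplied by the PG backend.  Roughly: for a
 * replicated pool the predicate checks that the primary itself can
 * serve the object (reads go through the primary), while for an
 * erasure-coded pool it checks that enough distinct shards remain in
 * have_acting to reconstruct the object.
 */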

void PG::MissingLoc::add_batch_sources_info(
  const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
{
  ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
                     << sources.size() << dendl;
  unsigned loop = 0;
  for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
       i != needs_recovery_map.end();
       ++i) {
    if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
      handle->reset_tp_timeout();
      loop = 0;
    }
    if (i->second.is_delete())
      continue;
    missing_loc[i->first].insert(sources.begin(), sources.end());
    missing_loc_sources.insert(sources.begin(), sources.end());
  }
}

bool PG::MissingLoc::add_source_info(
  pg_shard_t fromosd,
  const pg_info_t &oinfo,
  const pg_missing_t &omissing,
  ThreadPool::TPHandle* handle)
{
  bool found_missing = false;
  unsigned loop = 0;
  // found items?
  for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
       p != needs_recovery_map.end();
       ++p) {
    const hobject_t &soid(p->first);
    eversion_t need = p->second.need;
    if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
      handle->reset_tp_timeout();
      loop = 0;
    }
    if (p->second.is_delete()) {
      ldout(pg->cct, 10) << __func__ << " " << soid
                         << " delete, ignoring source" << dendl;
      found_missing = true;
      continue;
    }
    if (oinfo.last_update < need) {
      ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
                         << " also missing on osd." << fromosd
                         << " (last_update " << oinfo.last_update
                         << " < needed " << need << ")" << dendl;
      continue;
    }
    if (!oinfo.last_backfill.is_max() &&
        !oinfo.last_backfill_bitwise) {
      ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
                         << " also missing on osd." << fromosd
                         << " (last_backfill " << oinfo.last_backfill
                         << " but with wrong sort order)"
                         << dendl;
      continue;
    }
    if (p->first >= oinfo.last_backfill) {
      // FIXME: this is _probably_ true, although it could conceivably
      // be in the undefined region!  Hmm!
      ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
                         << " also missing on osd." << fromosd
                         << " (past last_backfill " << oinfo.last_backfill
                         << ")" << dendl;
      continue;
    }
    if (oinfo.last_complete < need) {
      if (omissing.is_missing(soid)) {
        ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
                           << " also missing on osd." << fromosd << dendl;
        continue;
      }
    }

    ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
                       << " is on osd." << fromosd << dendl;

    missing_loc[soid].insert(fromosd);
    missing_loc_sources.insert(fromosd);
    found_missing = true;
  }

  ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
                     << dendl;
  return found_missing;
}

void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
{
  auto &missing = pg_log.get_missing();
  uint64_t unfound = get_num_unfound();
  assert(unfound > 0);

  dout(10) << __func__ << " "
           << missing.num_missing() << " missing, "
           << unfound << " unfound"
           << dendl;

  std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
  std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; m != mend; ++m) {
    pg_shard_t peer(*m);

    if (!get_osdmap()->is_up(peer.osd)) {
      dout(20) << __func__ << " skipping down osd." << peer << dendl;
      continue;
    }

    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne())) {
      // ignore empty peers
      continue;
    }

    // If we've requested any of this stuff, the pg_missing_t information
    // should be on its way.
    // TODO: coalesce requested_* into a single data structure
    if (peer_missing.find(peer) != peer_missing.end()) {
      dout(20) << __func__ << ": osd." << peer
               << ": we already have pg_missing_t" << dendl;
      continue;
    }
    if (peer_log_requested.find(peer) != peer_log_requested.end()) {
      dout(20) << __func__ << ": osd." << peer
               << ": in peer_log_requested" << dendl;
      continue;
    }
    if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
      dout(20) << __func__ << ": osd." << peer
               << ": in peer_missing_requested" << dendl;
      continue;
    }

    // Request missing
    dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
             << dendl;
    peer_missing_requested.insert(peer);
    query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
      pg_query_t(
        pg_query_t::FULLLOG,
        peer.shard, pg_whoami.shard,
        info.history, get_osdmap()->get_epoch());
  }
}
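
/* Note (added for clarity, not upstream): the FULLLOG queries collected
 * in query_map above are sent out by the peering machinery; each reply
 * carries the peer's full log and missing set, which lets the primary
 * register that peer as a potential recovery source (via
 * search_for_missing()) and thereby shrink the unfound count.
 */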

/******* PG ***********/
bool PG::needs_recovery() const
{
  assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    dout(10) << __func__ << " primary has " << missing.num_missing()
             << " missing" << dendl;
    return true;
  }

  assert(!actingbackfill.empty());
  set<pg_shard_t>::const_iterator end = actingbackfill.end();
  set<pg_shard_t>::const_iterator a = actingbackfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
               << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      dout(10) << __func__ << " osd." << peer << " has "
               << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  dout(10) << __func__ << " is recovered" << dendl;
  return false;
}

bool PG::needs_backfill() const
{
  assert(is_primary());

  // The only osds that can possibly need backfill are the ones in
  // backfill_targets.
  set<pg_shard_t>::const_iterator end = backfill_targets.end();
  set<pg_shard_t>::const_iterator a = backfill_targets.begin();
  for (; a != end; ++a) {
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
    if (!pi->second.last_backfill.is_max()) {
      dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
      return true;
    }
  }

  dout(10) << __func__ << " does not need backfill" << dendl;
  return false;
}


void PG::check_past_interval_bounds() const
{
  auto rpib = get_required_past_interval_bounds(
    info,
    osd->get_superblock().oldest_map);
  if (rpib.first >= rpib.second) {
    if (!past_intervals.empty()) {
      osd->clog->error() << info.pgid << " required past_interval bounds are"
                         << " empty [" << rpib << ") but past_intervals is not: "
                         << past_intervals;
      derr << info.pgid << " required past_interval bounds are"
           << " empty [" << rpib << ") but past_intervals is not: "
           << past_intervals << dendl;
    }
  } else {
    if (past_intervals.empty()) {
      osd->clog->error() << info.pgid << " required past_interval bounds are"
                         << " not empty [" << rpib << ") but past_intervals "
                         << past_intervals << " is empty";
      derr << info.pgid << " required past_interval bounds are"
           << " not empty [" << rpib << ") but past_intervals "
           << past_intervals << " is empty" << dendl;
      assert(!past_intervals.empty());
    }

    auto apib = past_intervals.get_bounds();
    if (apib.first > rpib.first) {
      osd->clog->error() << info.pgid << " past_intervals [" << apib
                         << ") start interval does not contain the required"
                         << " bound [" << rpib << ") start";
      derr << info.pgid << " past_intervals [" << apib
           << ") start interval does not contain the required"
           << " bound [" << rpib << ") start" << dendl;
      assert(0 == "past_interval start interval mismatch");
    }
    if (apib.second != rpib.second) {
      osd->clog->error() << info.pgid << " past_interval bound [" << apib
                         << ") end does not match required [" << rpib
                         << ") end";
      derr << info.pgid << " past_interval bound [" << apib
           << ") end does not match required [" << rpib
           << ") end" << dendl;
      assert(0 == "past_interval end mismatch");
    }
  }
}

bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}

void PG::remove_down_peer_info(const OSDMapRef osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();
  check_recovery_sources(osdmap);
}

/*
 * Returns true if every OSD in might_have_unfound has either been
 * queried (we have its pg_missing_t) or can no longer help (empty,
 * gone from the map, or marked lost); returns false as soon as we find
 * a non-lost OSD we might still retrieve our unfound objects from.
 */
bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
{
  assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
           << " have been queried or are marked lost" << dendl;
  return true;
}

PastIntervals::PriorSet PG::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
         it != peer_info.end();
         ++it) {
      assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.ec_pool(),
    info.history.last_epoch_started,
    get_pgbackend()->get_is_recoverable_predicate(),
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
        pinfo = &osdmap.get_info(osd);
        if (lost_at)
          *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
        return PastIntervals::UP;
      } else if (!pinfo) {
        return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
        return PastIntervals::LOST;
      } else {
        return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    this);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
    dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
             << " < same_since " << info.history.same_interval_since
             << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
             << " >= same_since " << info.history.same_interval_since
             << ", all is well" << dendl;
    need_up_thru = false;
  }
  set_probe_targets(prior.probe);
  return prior;
}
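
/* Note (added for clarity, not upstream): the lambda above classifies
 * each prior-interval osd: still up -> UP; absent from the osdmap ->
 * DNE; marked lost at an epoch after the interval started (lost_at >
 * start) -> LOST; otherwise merely down -> DOWN.  Roughly, only DOWN
 * osds (which may come back holding needed data) can leave the prior
 * set unrecoverable and set PG_STATE_DOWN above; LOST and DNE osds are
 * permanently written off.
 */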

void PG::clear_primary_state()
{
  dout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_missing.clear();
  need_up_thru = false;
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  projected_log = PGLog::IndexedLog();

  last_update_ondisk = eversion_t();

  snap_trimq.clear();

  finish_sync_event = 0;  // so that _finish_recovery doesn't go off in another thread

  missing_loc.clear();

  release_pg_backoffs();

  pg_log.reset_recovery_pointers();

  scrubber.reserved_peers.clear();
  scrub_after_recovery = false;

  agent_clear();
}

PG::Scrubber::Scrubber()
 : reserved(false), reserve_failed(false),
   epoch_start(0),
   active(false),
   waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
   must_scrub(false), must_deep_scrub(false), must_repair(false),
   auto_repair(false),
   num_digest_updates_pending(0),
   state(INACTIVE),
   deep(false),
   seed(0)
{}

PG::Scrubber::~Scrubber() {}

/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
        max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
        max_last_epoch_started_found < i->second.last_epoch_started) {
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
        min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
        !is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      if (p->second.last_update > best->second.last_update)
        continue;
      if (p->second.last_update < best->second.last_update) {
        best = p;
        continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
        continue;
      if (p->second.last_update > best->second.last_update) {
        best = p;
        continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      dout(10) << "calc_acting prefer osd." << p->first
               << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}
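
/* Illustrative example (added comment, not upstream): with three
 * replicas reporting last_update 5'10, 5'12 and 5'12, the two at 5'12
 * tie on criterion 1; whichever has the older log_tail wins criterion 2
 * (a longer log can log-recover more peers), and if that also ties the
 * current primary is kept.  For pools requiring rollback (EC), the
 * last_update comparison is inverted (oldest wins), since divergent
 * entries can only be rolled back, not replayed, on shards missing
 * them.
 */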

void PG::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  pg_shard_t acting_primary,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  pg_shard_t *want_primary,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  unsigned usable = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      ++usable;
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
         << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
        !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
        all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
        auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
      ++usable;
    } else if (!restrict_to_up_acting) {
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
           j != all_info_by_shard[shard_id_t(i)].end();
           ++j) {
        assert(j->shard == i);
        if (!all_info.find(*j)->second.is_incomplete() &&
            all_info.find(*j)->second.last_update >=
            auth_log_shard->second.log_tail) {
          ss << " selecting stray: " << *j << std::endl;
          want[i] = j->osd;
          ++usable;
          break;
        }
      }
      if (want[i] == CRUSH_ITEM_NONE)
        ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  bool found_primary = false;
  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
      if (!found_primary) {
        *want_primary = pg_shard_t(want[i], shard_id_t(i));
        found_primary = true;
      }
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}

/**
 * calculate the desired acting set.
 *
 * Choose an appropriate acting set.  Prefer up[0], unless it is
 * incomplete, or another osd has a longer tail that allows us to
 * bring other up nodes up to date.
 */
void PG::calc_replicated_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  pg_shard_t acting_primary,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  pg_shard_t *want_primary,
  ostream &ss)
{
  ss << "calc_acting newest update on osd." << auth_log_shard->first
     << " with " << auth_log_shard->second
     << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
  pg_shard_t auth_log_shard_id = auth_log_shard->first;

  // select primary
  map<pg_shard_t,pg_info_t>::const_iterator primary;
  if (up.size() &&
      !all_info.find(up_primary)->second.is_incomplete() &&
      all_info.find(up_primary)->second.last_update >=
      auth_log_shard->second.log_tail) {
    ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
    primary = all_info.find(up_primary); // prefer up[0], all things being equal
  } else {
    assert(!auth_log_shard->second.is_incomplete());
    ss << "up[0] needs backfill, osd." << auth_log_shard_id
       << " selected as primary instead" << std::endl;
    primary = auth_log_shard;
  }

  ss << "calc_acting primary is osd." << primary->first
     << " with " << primary->second << std::endl;
  *want_primary = primary->first;
  want->push_back(primary->first.osd);
  acting_backfill->insert(primary->first);
  unsigned usable = 1;

  // select replicas that have log contiguity with primary.
  // prefer up, then acting, then any peer_info osds
  for (vector<int>::const_iterator i = up.begin();
       i != up.end();
       ++i) {
    pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
    if (up_cand == primary->first)
      continue;
    const pg_info_t &cur_info = all_info.find(up_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < MIN(
          primary->second.log_tail,
          auth_log_shard->second.log_tail)) {
      /* We include auth_log_shard->second.log_tail because in GetLog,
       * we will request logs back to the min last_update over our
       * acting_backfill set, which will result in our log being extended
       * as far backwards as necessary to pick up any peers which can
       * be log recovered by auth_log_shard's log */
      ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
      backfill->insert(up_cand);
      acting_backfill->insert(up_cand);
    } else {
      want->push_back(*i);
      acting_backfill->insert(up_cand);
      usable++;
      ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
    }
  }

  // This no longer has backfill OSDs, but they are covered above.
  for (vector<int>::const_iterator i = acting.begin();
       i != acting.end();
       ++i) {
    pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
    if (usable >= size)
      break;

    // skip up osds we already considered above
    if (acting_cand == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
    if (up_it != up.end())
      continue;

    const pg_info_t &cur_info = all_info.find(acting_cand)->second;
    if (cur_info.is_incomplete() ||
        cur_info.last_update < primary->second.log_tail) {
      ss << " shard " << acting_cand << " (stray) REJECTED "
         << cur_info << std::endl;
    } else {
      want->push_back(*i);
      acting_backfill->insert(acting_cand);
      ss << " shard " << acting_cand << " (stray) accepted "
         << cur_info << std::endl;
      usable++;
    }
  }

  if (restrict_to_up_acting) {
    return;
  }
  for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    if (usable >= size)
      break;

    // skip up osds we already considered above
    if (i->first == primary->first)
      continue;
    vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
    if (up_it != up.end())
      continue;
    vector<int>::const_iterator acting_it = find(
      acting.begin(), acting.end(), i->first.osd);
    if (acting_it != acting.end())
      continue;

    if (i->second.is_incomplete() ||
        i->second.last_update < primary->second.log_tail) {
      ss << " shard " << i->first << " (stray) REJECTED "
         << i->second << std::endl;
    } else {
      want->push_back(i->first.osd);
      acting_backfill->insert(i->first);
      ss << " shard " << i->first << " (stray) accepted "
         << i->second << std::endl;
      usable++;
    }
  }
}
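
/* Illustrative example (added comment, not upstream): say size=3,
 * up=[0,1,2] and the old acting set was [3,4,5].  If osd.0 (up_primary)
 * is complete and log-contiguous it becomes the primary; each of
 * osd.1/osd.2 whose last_update falls inside the primary's (or auth
 * shard's) log range is accepted, otherwise it is queued for backfill.
 * If fewer than 3 usable shards result, the old acting osds 3/4/5 and
 * finally arbitrary peer_info strays are tried, in that order, to fill
 * the remaining slots.
 */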

/**
 * choose acting
 *
 * calculate the desired acting, and request a change with the monitor
 * if it differs from the current acting.
 *
 * if restrict_to_up_acting=true, we filter out anything that's not in
 * up/acting.  in order to lift this restriction, we need to
 *  1) check whether it's worth switching the acting set any time we get
 *     a new pg info (not just here, when recovery finishes)
 *  2) check whether anything in want_acting went down on each new map
 *     (and, if so, calculate a new want_acting)
 *  3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
 * TODO!
 */
bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
                       bool restrict_to_up_acting,
                       bool *history_les_bound)
{
  map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
  all_info[pg_whoami] = info;

  for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
       p != all_info.end();
       ++p) {
    dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
  }

  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
    find_best_info(all_info, restrict_to_up_acting, history_les_bound);

  if (auth_log_shard == all_info.end()) {
    if (up != acting) {
      dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
               << " reverting to up" << dendl;
      want_acting = up;
      vector<int> empty;
      osd->queue_want_pg_temp(info.pgid.pgid, empty);
    } else {
      dout(10) << "choose_acting failed" << dendl;
      assert(want_acting.empty());
    }
    return false;
  }

  assert(!auth_log_shard->second.is_incomplete());
  auth_log_shard_id = auth_log_shard->first;

  set<pg_shard_t> want_backfill, want_acting_backfill;
  vector<int> want;
  pg_shard_t want_primary;
  stringstream ss;
  if (!pool.info.ec_pool())
    calc_replicated_acting(
      auth_log_shard,
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      primary,
      up,
      up_primary,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      &want_primary,
      ss);
  else
    calc_ec_acting(
      auth_log_shard,
      get_osdmap()->get_pg_size(info.pgid.pgid),
      acting,
      primary,
      up,
      up_primary,
      all_info,
      restrict_to_up_acting,
      &want,
      &want_backfill,
      &want_acting_backfill,
      &want_primary,
      ss);
  dout(10) << ss.str() << dendl;

  unsigned num_want_acting = 0;
  set<pg_shard_t> have;
  for (int i = 0; i < (int)want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      ++num_want_acting;
      have.insert(
        pg_shard_t(
          want[i],
          pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
  }

  // We go incomplete if below min_size for ec_pools since backfill
  // does not currently maintain rollbackability
  // Otherwise, we will go "peered", but not "active"
  if (num_want_acting < pool.info.min_size &&
      (pool.info.ec_pool() ||
       !cct->_conf->osd_allow_recovery_below_min_size)) {
    want_acting.clear();
    dout(10) << "choose_acting failed, below min size" << dendl;
    return false;
  }

  /* Check whether we have enough acting shards to later perform recovery */
  boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
    get_pgbackend()->get_is_recoverable_predicate());
  if (!(*recoverable_predicate)(have)) {
    want_acting.clear();
    dout(10) << "choose_acting failed, not recoverable" << dendl;
    return false;
  }

  if (want != acting) {
    dout(10) << "choose_acting want " << want << " != acting " << acting
             << ", requesting pg_temp change" << dendl;
    want_acting = want;

    if (want_acting == up) {
      // There can't be any pending backfill if
      // want is the same as crush map up OSDs.
      assert(want_backfill.empty());
      vector<int> empty;
      osd->queue_want_pg_temp(info.pgid.pgid, empty);
    } else
      osd->queue_want_pg_temp(info.pgid.pgid, want);
    return false;
  }
  want_acting.clear();
  actingbackfill = want_acting_backfill;
  dout(10) << "actingbackfill is " << actingbackfill << dendl;
  assert(backfill_targets.empty() || backfill_targets == want_backfill);
  if (backfill_targets.empty()) {
    // Caller is GetInfo
    backfill_targets = want_backfill;
  }
  // Will not change if already set because up would have had to change
  // Verify that nothing in backfill is in stray_set
  for (set<pg_shard_t>::iterator i = want_backfill.begin();
       i != want_backfill.end();
       ++i) {
    assert(stray_set.find(*i) == stray_set.end());
  }
  dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
           << want_backfill << dendl;
  return true;
}
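
/* Note (added for clarity, not upstream): when the computed want differs
 * from the current acting set, choose_acting does not change membership
 * itself; it records want_acting, asks the monitor for a pg_temp mapping
 * via queue_want_pg_temp(), and returns false.  Peering restarts when
 * the new osdmap carrying that pg_temp arrives, and on the next pass
 * this function returns true with want == acting.
 */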

/* Build the might_have_unfound set.
 *
 * This is used by the primary OSD during recovery.
 *
 * This set tracks the OSDs which might have unfound objects that the primary
 * OSD needs.  As we receive pg_missing_t from each OSD in might_have_unfound,
 * we will remove the OSD from the set.
 */
void PG::build_might_have_unfound()
{
  assert(might_have_unfound.empty());
  assert(is_primary());

  dout(10) << __func__ << dendl;

  check_past_interval_bounds();

  might_have_unfound = past_intervals.get_might_have_unfound(
    pg_whoami,
    pool.info.ec_pool());

  // include any (stray) peers
  for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
       ++p)
    might_have_unfound.insert(p->first);

  dout(15) << __func__ << ": built " << might_have_unfound << dendl;
}

struct C_PG_ActivateCommitted : public Context {
  PGRef pg;
  epoch_t epoch;
  epoch_t activation_epoch;
  C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
    : pg(p), epoch(e), activation_epoch(ae) {}
  void finish(int r) override {
    pg->_activate_committed(epoch, activation_epoch);
  }
};
1540
1541void PG::activate(ObjectStore::Transaction& t,
1542 epoch_t activation_epoch,
1543 list<Context*>& tfin,
1544 map<int, map<spg_t,pg_query_t> >& query_map,
1545 map<int,
1546 vector<
1547 pair<pg_notify_t,
1548 PastIntervals> > > *activator_map,
1549 RecoveryCtx *ctx)
1550{
1551 assert(!is_peered());
1552 assert(scrubber.callbacks.empty());
1553 assert(callbacks_for_degraded_object.empty());
1554
1555 // twiddle pg state
1556 state_clear(PG_STATE_DOWN);
1557
1558 send_notify = false;
1559
1560 if (is_primary()) {
1561 // only update primary last_epoch_started if we will go active
1562 if (acting.size() >= pool.info.min_size) {
1563 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1564 info.last_epoch_started <= activation_epoch);
1565 info.last_epoch_started = activation_epoch;
1566 info.last_interval_started = info.history.same_interval_since;
1567 }
1568 } else if (is_acting(pg_whoami)) {
1569 /* update last_epoch_started on acting replica to whatever the primary sent
1570 * unless it's smaller (could happen if we are going peered rather than
1571 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1572 if (info.last_epoch_started < activation_epoch) {
1573 info.last_epoch_started = activation_epoch;
1574 info.last_interval_started = info.history.same_interval_since;
1575 }
1576 }
1577
1578 auto &missing = pg_log.get_missing();
1579
1580 if (is_primary()) {
1581 last_update_ondisk = info.last_update;
1582 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1583 }
1584 last_update_applied = info.last_update;
1585 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1586
1587 need_up_thru = false;
1588
1589 // write pg info, log
1590 dirty_info = true;
1591 dirty_big_info = true; // maybe
1592
1593 // find out when we commit
1594 t.register_on_complete(
1595 new C_PG_ActivateCommitted(
1596 this,
1597 get_osdmap()->get_epoch(),
1598 activation_epoch));
1599
1600 // initialize snap_trimq
1601 if (is_primary()) {
1602 dout(20) << "activate - purged_snaps " << info.purged_snaps
1603 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1604 snap_trimq = pool.cached_removed_snaps;
1605 interval_set<snapid_t> intersection;
1606 intersection.intersection_of(snap_trimq, info.purged_snaps);
1607 if (intersection == info.purged_snaps) {
1608 snap_trimq.subtract(info.purged_snaps);
1609 } else {
1610 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1611 << ") is not a subset of pool.cached_removed_snaps ("
1612 << pool.cached_removed_snaps << ")" << dendl;
1613 snap_trimq.subtract(intersection);
1614 }
1615 }
1616
1617 // init complete pointer
1618 if (missing.num_missing() == 0) {
1619 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1620 << " -> " << info.last_update << dendl;
1621 info.last_complete = info.last_update;
1622 pg_log.reset_recovery_pointers();
1623 } else {
1624 dout(10) << "activate - not complete, " << missing << dendl;
1625 pg_log.activate_not_complete(info);
1626 }
1627
1628 log_weirdness();
1629
1630 // if primary..
1631 if (is_primary()) {
1632 assert(ctx);
1633 // start up replicas
1634
1635 assert(!actingbackfill.empty());
1636 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1637 i != actingbackfill.end();
1638 ++i) {
1639 if (*i == pg_whoami) continue;
1640 pg_shard_t peer = *i;
1641 assert(peer_info.count(peer));
1642 pg_info_t& pi = peer_info[peer];
1643
1644 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1645
1646 MOSDPGLog *m = 0;
d2e6a577 1647 assert(peer_missing.count(peer));
7c673cae
FG
1648 pg_missing_t& pm = peer_missing[peer];
1649
1650 bool needs_past_intervals = pi.dne();
1651
1652 /*
1653 * cover case where peer sort order was different and
1654 * last_backfill cannot be interpreted
1655 */
1656 bool force_restart_backfill =
1657 !pi.last_backfill.is_max() &&
1658 !pi.last_backfill_bitwise;
1659
1660 if (pi.last_update == info.last_update && !force_restart_backfill) {
1661 // empty log
1662 if (!pi.last_backfill.is_max())
1663 osd->clog->info() << info.pgid << " continuing backfill to osd."
1664 << peer
1665 << " from (" << pi.log_tail << "," << pi.last_update
1666 << "] " << pi.last_backfill
1667 << " to " << info.last_update;
1668 if (!pi.is_empty() && activator_map) {
1669 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1670 (*activator_map)[peer.osd].push_back(
1671 make_pair(
1672 pg_notify_t(
1673 peer.shard, pg_whoami.shard,
1674 get_osdmap()->get_epoch(),
1675 get_osdmap()->get_epoch(),
1676 info),
1677 past_intervals));
1678 } else {
1679 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1680 m = new MOSDPGLog(
1681 i->shard, pg_whoami.shard,
1682 get_osdmap()->get_epoch(), info);
1683 }
1684 } else if (
1685 pg_log.get_tail() > pi.last_update ||
1686 pi.last_backfill == hobject_t() ||
1687 force_restart_backfill ||
1688 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1689 /* ^ This last case covers a situation where a replica is not contiguous
1690 * with the auth_log, but is contiguous with this replica. Reshuffling
1691 * the active set to handle this would be tricky, so instead we just go
1692 * ahead and backfill it anyway. This is probably preferrable in any
1693 * case since the replica in question would have to be significantly
1694 * behind.
1695 */
1696 // backfill
224ce89b 1697 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
7c673cae
FG
1698 << " from (" << pi.log_tail << "," << pi.last_update
1699 << "] " << pi.last_backfill
1700 << " to " << info.last_update;
1701
1702 pi.last_update = info.last_update;
1703 pi.last_complete = info.last_update;
1704 pi.set_last_backfill(hobject_t());
1705 pi.last_epoch_started = info.last_epoch_started;
1706 pi.last_interval_started = info.last_interval_started;
1707 pi.history = info.history;
1708 pi.hit_set = info.hit_set;
1709 pi.stats.stats.clear();
1710
1711 // initialize peer with our purged_snaps.
1712 pi.purged_snaps = info.purged_snaps;
1713
1714 m = new MOSDPGLog(
1715 i->shard, pg_whoami.shard,
1716 get_osdmap()->get_epoch(), pi);
1717
1718 // send some recent log, so that op dup detection works well.
1719 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1720 m->info.log_tail = m->log.tail;
1721 pi.log_tail = m->log.tail; // sigh...
1722
1723 pm.clear();
1724 } else {
1725 // catch up
1726 assert(pg_log.get_tail() <= pi.last_update);
1727 m = new MOSDPGLog(
1728 i->shard, pg_whoami.shard,
1729 get_osdmap()->get_epoch(), info);
1730 // send new stuff to append to replicas log
1731 m->log.copy_after(pg_log.get_log(), pi.last_update);
1732 }
1733
1734 // share past_intervals if we are creating the pg on the replica
1735 // based on whether our info for that peer was dne() *before*
1736 // updating pi.history in the backfill block above.
1737 if (m && needs_past_intervals)
1738 m->past_intervals = past_intervals;
1739
1740 // update local version of peer's missing list!
1741 if (m && pi.last_backfill != hobject_t()) {
1742 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1743 p != m->log.log.end();
c07f9fc5 1744 ++p) {
31f18b77 1745 if (p->soid <= pi.last_backfill &&
c07f9fc5
FG
1746 !p->is_error()) {
1747 if (perform_deletes_during_peering() && p->is_delete()) {
1748 pm.rm(p->soid, p->version);
1749 } else {
1750 pm.add_next_event(*p);
1751 }
1752 }
1753 }
1754 }
1755
1756 if (m) {
1757 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1758 //m->log.print(cout);
1759 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1760 }
1761
1762 // peer now has
1763 pi.last_update = info.last_update;
1764
1765 // update our missing
1766 if (pm.num_missing() == 0) {
1767 pi.last_complete = pi.last_update;
1768 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1769 } else {
1770 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1771 }
1772 }
1773
1774 // Set up missing_loc
1775 set<pg_shard_t> complete_shards;
1776 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1777 i != actingbackfill.end();
1778 ++i) {
1779 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1780 if (*i == get_primary()) {
1781 missing_loc.add_active_missing(missing);
1782 if (!missing.have_missing())
1783 complete_shards.insert(*i);
1784 } else {
1785 auto peer_missing_entry = peer_missing.find(*i);
1786 assert(peer_missing_entry != peer_missing.end());
1787 missing_loc.add_active_missing(peer_missing_entry->second);
1788 if (!peer_missing_entry->second.have_missing() &&
1789 peer_info[*i].last_backfill.is_max())
1790 complete_shards.insert(*i);
1791 }
1792 }
1793 // If necessary, create might_have_unfound to help us find our unfound objects.
1794 // NOTE: It's important that we build might_have_unfound before trimming the
1795 // past intervals.
1796 might_have_unfound.clear();
1797 if (needs_recovery()) {
1798 // If only one shard has missing objects, we can add all of the others as
1799 // recovery sources; this is safe since the PGLogs have been merged locally,
1800 // and it covers the vast majority of use cases, e.g. one OSD/host being
1801 // down for a while for hardware repair
1802 if (complete_shards.size() + 1 == actingbackfill.size()) {
1803 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1804 } else {
1805 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1806 ctx->handle);
1807 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1808 i != actingbackfill.end();
1809 ++i) {
1810 if (*i == pg_whoami) continue;
1811 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1812 assert(peer_missing.count(*i));
1813 assert(peer_info.count(*i));
1814 missing_loc.add_source_info(
1815 *i,
1816 peer_info[*i],
1817 peer_missing[*i],
1818 ctx->handle);
1819 }
1820 }
1821 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1822 i != peer_missing.end();
1823 ++i) {
1824 if (is_actingbackfill(i->first))
1825 continue;
1826 assert(peer_info.count(i->first));
1827 search_for_missing(
1828 peer_info[i->first],
1829 i->second,
1830 i->first,
1831 ctx);
1832 }
1833
1834 build_might_have_unfound();
1835
1836 state_set(PG_STATE_DEGRADED);
1837 if (have_unfound())
1838 discover_all_missing(query_map);
1839 }
1840
1841 // degraded?
1842 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1843 state_set(PG_STATE_DEGRADED);
1844 state_set(PG_STATE_UNDERSIZED);
1845 }
1846
1847 state_set(PG_STATE_ACTIVATING);
1848 release_pg_backoffs();
1849 projected_last_update = info.last_update;
1850 }
1851 if (acting.size() >= pool.info.min_size) {
1852 PGLogEntryHandler handler{this, &t};
1853 pg_log.roll_forward(&handler);
1854 }
1855}
1856
1857bool PG::op_has_sufficient_caps(OpRequestRef& op)
1858{
1859 // only check MOSDOp
1860 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1861 return true;
1862
1863 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1864
1865 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1866 if (!session) {
1867 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1868 return false;
1869 }
1870 OSDCap& caps = session->caps;
1871 session->put();
1872
1873 const string &key = req->get_hobj().get_key().empty() ?
1874 req->get_oid().name :
1875 req->get_hobj().get_key();
1876
1877 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1878 pool.auid, key,
1879 op->need_read_cap(),
1880 op->need_write_cap(),
1881 op->classes());
1882
1883 dout(20) << "op_has_sufficient_caps "
1884 << "session=" << session
1885 << " pool=" << pool.id << " (" << pool.name
1886 << " " << req->get_hobj().nspace
1887 << ") owner=" << pool.auid
1888 << " need_read_cap=" << op->need_read_cap()
1889 << " need_write_cap=" << op->need_write_cap()
1890 << " classes=" << op->classes()
1891 << " -> " << (cap ? "yes" : "NO")
1892 << dendl;
1893 return cap;
1894}
1895
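// Called when the transaction recording activation has committed.
// The primary counts itself in peer_activated; replicas instead send
// an MOSDPGInfo notify so the primary can declare the PG active once
// every shard in actingbackfill has committed.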
1896void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1897{
1898 lock();
1899 if (pg_has_reset_since(epoch)) {
1900 dout(10) << "_activate_committed " << epoch
1901 << ", that was an old interval" << dendl;
1902 } else if (is_primary()) {
1903 peer_activated.insert(pg_whoami);
1904 dout(10) << "_activate_committed " << epoch
1905 << " peer_activated now " << peer_activated
1906 << " last_interval_started " << info.history.last_interval_started
1907 << " last_epoch_started " << info.history.last_epoch_started
1908 << " same_interval_since " << info.history.same_interval_since << dendl;
1909 assert(!actingbackfill.empty());
1910 if (peer_activated.size() == actingbackfill.size())
1911 all_activated_and_committed();
1912 } else {
1913 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1914 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1915 pg_notify_t i = pg_notify_t(
1916 get_primary().shard, pg_whoami.shard,
1917 get_osdmap()->get_epoch(),
1918 get_osdmap()->get_epoch(),
1919 info);
1920
1921 i.info.history.last_epoch_started = activation_epoch;
1922 i.info.history.last_interval_started = i.info.history.same_interval_since;
1923 if (acting.size() >= pool.info.min_size) {
1924 state_set(PG_STATE_ACTIVE);
1925 } else {
1926 state_set(PG_STATE_PEERED);
1927 }
1928
1929 m->pg_list.push_back(make_pair(i, PastIntervals()));
1930 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1931
1932 // waiters
1933 if (flushes_in_progress == 0) {
1934 requeue_ops(waiting_for_peered);
1935 }
1936 }
1937
1938 assert(!dirty_info);
1939
1940 unlock();
1941}
1942
1943/*
1944 * update info.history.last_epoch_started ONLY after we and all
1945 * replicas have activated AND committed the activate transaction
1946 * (i.e. the peering results are stable on disk).
1947 */
1948void PG::all_activated_and_committed()
1949{
1950 dout(10) << "all_activated_and_committed" << dendl;
1951 assert(is_primary());
1952 assert(peer_activated.size() == actingbackfill.size());
1953 assert(!actingbackfill.empty());
1954 assert(blocked_by.empty());
1955
1956 queue_peering_event(
1957 CephPeeringEvtRef(
1958 std::make_shared<CephPeeringEvt>(
1959 get_osdmap()->get_epoch(),
1960 get_osdmap()->get_epoch(),
1961 AllReplicasActivated())));
1962}
1963
1964bool PG::requeue_scrub(bool high_priority)
1965{
1966 assert(is_locked());
1967 if (scrub_queued) {
1968 dout(10) << __func__ << ": already queued" << dendl;
1969 return false;
1970 } else {
1971 dout(10) << __func__ << ": queueing" << dendl;
1972 scrub_queued = true;
1973 osd->queue_for_scrub(this, high_priority);
1974 return true;
1975 }
1976}
1977
1978void PG::queue_recovery()
1979{
1980 if (!is_primary() || !is_peered()) {
1981 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1982 assert(!recovery_queued);
1983 } else if (recovery_queued) {
1984 dout(10) << "queue_recovery -- already queued" << dendl;
1985 } else {
1986 dout(10) << "queue_recovery -- queuing" << dendl;
1987 recovery_queued = true;
1988 osd->queue_for_recovery(this);
1989 }
1990}
1991
1992bool PG::queue_scrub()
1993{
1994 assert(is_locked());
1995 if (is_scrubbing()) {
1996 return false;
1997 }
1998 scrubber.priority = scrubber.must_scrub ?
1999 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2000 scrubber.must_scrub = false;
2001 state_set(PG_STATE_SCRUBBING);
2002 if (scrubber.must_deep_scrub) {
2003 state_set(PG_STATE_DEEP_SCRUB);
2004 scrubber.must_deep_scrub = false;
2005 }
2006 if (scrubber.must_repair || scrubber.auto_repair) {
2007 state_set(PG_STATE_REPAIR);
2008 scrubber.must_repair = false;
2009 }
2010 requeue_scrub();
2011 return true;
2012}
2013
2014unsigned PG::get_scrub_priority()
2015{
2016 // a higher value -> a higher priority
2017 int pool_scrub_priority = 0;
2018 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2019 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2020}
2021
2022struct C_PG_FinishRecovery : public Context {
2023 PGRef pg;
2024 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2025 void finish(int r) override {
2026 pg->_finish_recovery(this);
2027 }
2028};
2029
2030void PG::mark_clean()
2031{
2032 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2033 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2034 state_set(PG_STATE_CLEAN);
2035 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2036 info.history.last_interval_clean = info.history.same_interval_since;
2037 past_intervals.clear();
2038 dirty_big_info = true;
2039 dirty_info = true;
2040 }
2041
2042 kick_snap_trim();
2043}
2044
2045void PG::_change_recovery_force_mode(int new_mode, bool clear)
2046{
2047 if (!deleting) {
2048 // we can't and shouldn't do anything if the PG is being deleted locally
2049 if (clear) {
2050 state_clear(new_mode);
2051 } else {
2052 state_set(new_mode);
2053 }
2054 publish_stats_to_osd();
2055 }
2056}
2057
2058inline int PG::clamp_recovery_priority(int priority)
2059{
2060 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2061 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2062
2063 // Clamp to valid range
2064 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2065 return OSD_RECOVERY_PRIORITY_MAX;
2066 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2067 return OSD_RECOVERY_PRIORITY_MIN;
2068 } else {
2069 return priority;
2070 }
2071}
2072
2073unsigned PG::get_recovery_priority()
2074{
2075 // a higher value -> a higher priority
2076 int ret = 0;
2077
2078 if (state & PG_STATE_FORCED_RECOVERY) {
2079 ret = OSD_RECOVERY_PRIORITY_FORCED;
2080 } else {
2081 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2082 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2083 }
2084 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2085 return static_cast<unsigned>(ret);
2086}
2087
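// Backfill priority bands, highest first: forced backfill, then PGs
// below min_size (inactive, so blocking client IO), then
// undersized/degraded PGs, then the baseline; the pool's
// recovery_priority option shifts the result within the clamped range.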
2088unsigned PG::get_backfill_priority()
2089{
2090 // a higher value -> a higher priority
2091 int ret = OSD_BACKFILL_PRIORITY_BASE;
2092 if (state & PG_STATE_FORCED_BACKFILL) {
2093 ret = OSD_RECOVERY_PRIORITY_FORCED;
2094 } else {
2095 if (acting.size() < pool.info.min_size) {
2096 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2097 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2098
2099 } else if (is_undersized()) {
2100 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2101 assert(pool.info.size > actingset.size());
2102 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2103
2104 } else if (is_degraded()) {
2105 // degraded: baseline degraded
2106 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2107 }
2108
2109 // Adjust with pool's recovery priority
2110 int pool_recovery_priority = 0;
2111 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2112
2113 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2114 }
2115
2116 return static_cast<unsigned>(ret);
2117}
2118
2119void PG::finish_recovery(list<Context*>& tfin)
2120{
2121 dout(10) << "finish_recovery" << dendl;
2122 assert(info.last_complete == info.last_update);
2123
2124 clear_recovery_state();
2125
2126 /*
2127 * sync all this before purging strays. but don't block!
2128 */
2129 finish_sync_event = new C_PG_FinishRecovery(this);
2130 tfin.push_back(finish_sync_event);
2131}
2132
2133void PG::_finish_recovery(Context *c)
2134{
2135 lock();
2136 if (deleting) {
2137 unlock();
2138 return;
2139 }
2140 if (c == finish_sync_event) {
2141 dout(10) << "_finish_recovery" << dendl;
2142 finish_sync_event = 0;
2143 purge_strays();
2144
2145 publish_stats_to_osd();
2146
2147 if (scrub_after_recovery) {
2148 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2149 scrub_after_recovery = false;
2150 scrubber.must_deep_scrub = true;
2151 queue_scrub();
2152 }
2153 } else {
2154 dout(10) << "_finish_recovery -- stale" << dendl;
2155 }
2156 unlock();
2157}
2158
2159void PG::start_recovery_op(const hobject_t& soid)
2160{
2161 dout(10) << "start_recovery_op " << soid
2162#ifdef DEBUG_RECOVERY_OIDS
2163 << " (" << recovering_oids << ")"
2164#endif
2165 << dendl;
2166 assert(recovery_ops_active >= 0);
2167 recovery_ops_active++;
2168#ifdef DEBUG_RECOVERY_OIDS
2169 assert(recovering_oids.count(soid) == 0);
2170 recovering_oids.insert(soid);
2171#endif
2172 osd->start_recovery_op(this, soid);
2173}
2174
2175void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2176{
2177 dout(10) << "finish_recovery_op " << soid
2178#ifdef DEBUG_RECOVERY_OIDS
2179 << " (" << recovering_oids << ")"
2180#endif
2181 << dendl;
2182 assert(recovery_ops_active > 0);
2183 recovery_ops_active--;
2184#ifdef DEBUG_RECOVERY_OIDS
2185 assert(recovering_oids.count(soid));
2186 recovering_oids.erase(soid);
2187#endif
2188 osd->finish_recovery_op(this, soid, dequeue);
2189
2190 if (!dequeue) {
2191 queue_recovery();
2192 }
2193}
2194
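// Split state into a child PG: the pg log is partitioned by the new
// hash bits, info/stats are copied (and marked invalid so they get
// recalculated), and backfill is restarted on both parent and child
// unless it had already completed.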
2195void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2196{
2197 child->update_snap_mapper_bits(split_bits);
2198 child->update_osdmap_ref(get_osdmap());
2199
2200 child->pool = pool;
2201
2202 // Log
2203 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2204 child->info.last_complete = info.last_complete;
2205
2206 info.last_update = pg_log.get_head();
2207 child->info.last_update = child->pg_log.get_head();
2208
2209 child->info.last_user_version = info.last_user_version;
2210
2211 info.log_tail = pg_log.get_tail();
2212 child->info.log_tail = child->pg_log.get_tail();
2213
2214 if (info.last_complete < pg_log.get_tail())
2215 info.last_complete = pg_log.get_tail();
2216 if (child->info.last_complete < child->pg_log.get_tail())
2217 child->info.last_complete = child->pg_log.get_tail();
2218
2219 // Info
2220 child->info.history = info.history;
2221 child->info.history.epoch_created = get_osdmap()->get_epoch();
2222 child->info.purged_snaps = info.purged_snaps;
2223
2224 if (info.last_backfill.is_max()) {
2225 child->info.set_last_backfill(hobject_t::get_max());
2226 } else {
2227 // restart backfill on parent and child to be safe. we could
2228 // probably do better in the bitwise sort case, but it's more
2229 // fragile (there may be special work to do on backfill completion
2230 // in the future).
2231 info.set_last_backfill(hobject_t());
2232 child->info.set_last_backfill(hobject_t());
2233 // restarting backfill implies that the missing set is empty,
2234 // since it is only used for objects prior to last_backfill
2235 pg_log.reset_backfill();
2236 child->pg_log.reset_backfill();
2237 }
2238
2239 child->info.stats = info.stats;
2240 child->info.stats.parent_split_bits = split_bits;
2241 info.stats.stats_invalid = true;
2242 child->info.stats.stats_invalid = true;
2243 child->info.last_epoch_started = info.last_epoch_started;
2244 child->info.last_interval_started = info.last_interval_started;
2245
2246 child->snap_trimq = snap_trimq;
2247
2248 // There can't be recovery/backfill going on now
2249 int primary, up_primary;
2250 vector<int> newup, newacting;
2251 get_osdmap()->pg_to_up_acting_osds(
2252 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2253 child->init_primary_up_acting(
2254 newup,
2255 newacting,
2256 up_primary,
2257 primary);
2258 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2259
2260 // this comparison includes primary rank via pg_shard_t
2261 if (get_primary() != child->get_primary())
2262 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2263
2264 child->info.stats.up = up;
2265 child->info.stats.up_primary = up_primary;
2266 child->info.stats.acting = acting;
2267 child->info.stats.acting_primary = primary;
2268 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2269
2270 // History
2271 child->past_intervals = past_intervals;
2272
2273 _split_into(child_pgid, child, split_bits);
2274
2275 // release all backoffs for simplicity
2276 release_backoffs(hobject_t(), hobject_t::get_max());
2277
2278 child->on_new_interval();
2279
2280 child->dirty_info = true;
2281 child->dirty_big_info = true;
2282 dirty_info = true;
2283 dirty_big_info = true;
2284}
2285
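// Install a backoff for [begin, end) on the client session and tell
// the client to stop sending ops for that range; release_backoffs()
// sends the matching UNBLOCK later.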
2286void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2287{
2288 ConnectionRef con = s->con;
2289 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2290 return;
2291 BackoffRef b(s->have_backoff(info.pgid, begin));
2292 if (b) {
2293 derr << __func__ << " already have backoff for " << s << " begin " << begin
2294 << " " << *b << dendl;
2295 ceph_abort();
2296 }
2297 Mutex::Locker l(backoff_lock);
2298 {
2299 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2300 backoffs[begin].insert(b);
2301 s->add_backoff(b);
2302 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2303 }
2304 con->send_message(
2305 new MOSDBackoff(
2306 info.pgid,
2307 get_osdmap()->get_epoch(),
2308 CEPH_OSD_BACKOFF_OP_BLOCK,
2309 b->id,
2310 begin,
2311 end));
2312}
2313
2314void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2315{
2316 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2317 vector<BackoffRef> bv;
2318 {
2319 Mutex::Locker l(backoff_lock);
2320 auto p = backoffs.lower_bound(begin);
2321 while (p != backoffs.end()) {
2322 int r = cmp(p->first, end);
2323 dout(20) << __func__ << " ? " << r << " " << p->first
2324 << " " << p->second << dendl;
2325 // note: must still examine begin=end=p->first case
2326 if (r > 0 || (r == 0 && begin < end)) {
2327 break;
2328 }
2329 dout(20) << __func__ << " checking " << p->first
2330 << " " << p->second << dendl;
2331 auto q = p->second.begin();
2332 while (q != p->second.end()) {
2333 dout(20) << __func__ << " checking " << *q << dendl;
2334 int r = cmp((*q)->begin, begin);
2335 if (r == 0 || (r > 0 && (*q)->end < end)) {
2336 bv.push_back(*q);
2337 q = p->second.erase(q);
2338 } else {
2339 ++q;
2340 }
2341 }
2342 if (p->second.empty()) {
2343 p = backoffs.erase(p);
2344 } else {
2345 ++p;
2346 }
2347 }
2348 }
2349 for (auto b : bv) {
2350 Mutex::Locker l(b->lock);
2351 dout(10) << __func__ << " " << *b << dendl;
2352 if (b->session) {
2353 assert(b->pg == this);
2354 ConnectionRef con = b->session->con;
2355 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2356 con->send_message(
2357 new MOSDBackoff(
2358 info.pgid,
2359 get_osdmap()->get_epoch(),
2360 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2361 b->id,
2362 b->begin,
2363 b->end));
2364 }
2365 if (b->is_new()) {
2366 b->state = Backoff::STATE_DELETING;
2367 } else {
2368 b->session->rm_backoff(b);
2369 b->session.reset();
2370 }
2371 b->pg.reset();
2372 }
2373 }
2374}
2375
2376void PG::clear_backoffs()
2377{
2378 dout(10) << __func__ << " " << dendl;
2379 map<hobject_t,set<BackoffRef>> ls;
2380 {
2381 Mutex::Locker l(backoff_lock);
2382 ls.swap(backoffs);
2383 }
2384 for (auto& p : ls) {
2385 for (auto& b : p.second) {
2386 Mutex::Locker l(b->lock);
2387 dout(10) << __func__ << " " << *b << dendl;
2388 if (b->session) {
2389 assert(b->pg == this);
2390 if (b->is_new()) {
2391 b->state = Backoff::STATE_DELETING;
2392 } else {
2393 b->session->rm_backoff(b);
2394 b->session.reset();
2395 }
2396 b->pg.reset();
2397 }
2398 }
2399 }
2400}
2401
2402// called by Session::clear_backoffs()
2403void PG::rm_backoff(BackoffRef b)
2404{
2405 dout(10) << __func__ << " " << *b << dendl;
2406 Mutex::Locker l(backoff_lock);
2407 assert(b->lock.is_locked_by_me());
2408 assert(b->pg == this);
2409 auto p = backoffs.find(b->begin);
2410 // may race with release_backoffs()
2411 if (p != backoffs.end()) {
2412 auto q = p->second.find(b);
2413 if (q != p->second.end()) {
2414 p->second.erase(q);
2415 if (p->second.empty()) {
2416 backoffs.erase(p);
2417 }
2418 }
2419 }
2420}
2421
2422void PG::clear_recovery_state()
2423{
2424 dout(10) << "clear_recovery_state" << dendl;
2425
2426 pg_log.reset_recovery_pointers();
2427 finish_sync_event = 0;
2428
2429 hobject_t soid;
2430 while (recovery_ops_active > 0) {
2431#ifdef DEBUG_RECOVERY_OIDS
2432 soid = *recovering_oids.begin();
2433#endif
2434 finish_recovery_op(soid, true);
2435 }
2436
2437 backfill_targets.clear();
2438 backfill_info.clear();
2439 peer_backfill_info.clear();
2440 waiting_on_backfill.clear();
2441 _clear_recovery_state(); // pg impl specific hook
2442}
2443
2444void PG::cancel_recovery()
2445{
2446 dout(10) << "cancel_recovery" << dendl;
2447 clear_recovery_state();
2448}
2449
2450
2451void PG::purge_strays()
2452{
2453 dout(10) << "purge_strays " << stray_set << dendl;
2454
2455 bool removed = false;
2456 for (set<pg_shard_t>::iterator p = stray_set.begin();
2457 p != stray_set.end();
2458 ++p) {
2459 assert(!is_actingbackfill(*p));
2460 if (get_osdmap()->is_up(p->osd)) {
2461 dout(10) << "sending PGRemove to osd." << *p << dendl;
2462 vector<spg_t> to_remove;
2463 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2464 MOSDPGRemove *m = new MOSDPGRemove(
2465 get_osdmap()->get_epoch(),
2466 to_remove);
2467 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2468 } else {
2469 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2470 }
2471 peer_missing.erase(*p);
2472 peer_info.erase(*p);
2473 peer_purged.insert(*p);
2474 removed = true;
2475 }
2476
2477 // if we removed anyone, update peers (which include peer_info)
2478 if (removed)
2479 update_heartbeat_peers();
2480
2481 stray_set.clear();
2482
2483 // clear _requested maps; we may have to peer() again if we discover
2484 // (more) stray content
2485 peer_log_requested.clear();
2486 peer_missing_requested.clear();
2487}
2488
2489void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2490{
2491 Mutex::Locker l(heartbeat_peer_lock);
2492 probe_targets.clear();
2493 for (set<pg_shard_t>::iterator i = probe_set.begin();
2494 i != probe_set.end();
2495 ++i) {
2496 probe_targets.insert(i->osd);
2497 }
2498}
2499
2500void PG::clear_probe_targets()
2501{
2502 Mutex::Locker l(heartbeat_peer_lock);
2503 probe_targets.clear();
2504}
2505
2506void PG::update_heartbeat_peers()
2507{
2508 assert(is_locked());
2509
2510 if (!is_primary())
2511 return;
2512
2513 set<int> new_peers;
2514 for (unsigned i=0; i<acting.size(); i++) {
2515 if (acting[i] != CRUSH_ITEM_NONE)
2516 new_peers.insert(acting[i]);
2517 }
2518 for (unsigned i=0; i<up.size(); i++) {
2519 if (up[i] != CRUSH_ITEM_NONE)
2520 new_peers.insert(up[i]);
2521 }
2522 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2523 p != peer_info.end();
2524 ++p)
2525 new_peers.insert(p->first.osd);
2526
2527 bool need_update = false;
2528 heartbeat_peer_lock.Lock();
2529 if (new_peers == heartbeat_peers) {
2530 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2531 } else {
2532 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2533 heartbeat_peers.swap(new_peers);
2534 need_update = true;
2535 }
2536 heartbeat_peer_lock.Unlock();
2537
2538 if (need_update)
2539 osd->need_heartbeat_peer_update();
2540}
2541
2542
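// Duplicate-op detection: look the reqid up in the in-memory projected
// log first, then in the persisted pg log.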
2543bool PG::check_in_progress_op(
2544 const osd_reqid_t &r,
2545 eversion_t *version,
2546 version_t *user_version,
2547 int *return_code) const
2548{
2549 return (
2550 projected_log.get_request(r, version, user_version, return_code) ||
2551 pg_log.get_log().get_request(r, version, user_version, return_code));
2552}
2553
2554void PG::_update_calc_stats()
2555{
2556 info.stats.version = info.last_update;
2557 info.stats.created = info.history.epoch_created;
2558 info.stats.last_scrub = info.history.last_scrub;
2559 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2560 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2561 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2562 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2563 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2564
2565 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2566 info.stats.ondisk_log_size = info.stats.log_size;
2567 info.stats.log_start = pg_log.get_tail();
2568 info.stats.ondisk_log_start = pg_log.get_tail();
2569
2570 // If actingset is larger than upset we will have misplaced objects,
2571 // so we will report based on actingset size.
2572
2573 // If upset is larger we will have degraded objects,
2574 // so we will report based on upset size.
2575
2576 // If target is the largest of them all, it will contribute to
2577 // the degraded count because num_object_copies is
2578 // computed using target and eventually used to get the degraded total.
2579
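 // Illustrative example (hypothetical numbers): a size-3 pool with
 // 100 objects, all three acting OSDs up and one of them missing 10
 // objects, gives num_object_copies = 300 and object_copies =
 // 3*100 - 10 = 290, hence 10 degraded objects.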
2580 unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
2581 unsigned nrep = MAX(actingset.size(), upset.size());
2582 // calc num_object_copies
2583 info.stats.stats.calc_copies(MAX(target, nrep));
2584 info.stats.stats.sum.num_objects_degraded = 0;
2585 info.stats.stats.sum.num_objects_unfound = 0;
2586 info.stats.stats.sum.num_objects_misplaced = 0;
2587 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2588 // NOTE: we only generate copies, degraded, misplaced and unfound
2589 // values for the summation, not individual stat categories.
2590 int64_t num_objects = info.stats.stats.sum.num_objects;
2591
2592 // Total sum of all missing
2593 int64_t missing = 0;
2594 // Objects that have been backfilled to up OSDs (not in acting)
2595 int64_t backfilled = 0;
2596 // A misplaced object is not stored on the correct OSD
2597 int64_t misplaced = 0;
2598 // Total of object copies/shards found
2599 int64_t object_copies = 0;
2600
2601 // num_objects_missing on each peer
2602 for (map<pg_shard_t, pg_info_t>::iterator pi =
2603 peer_info.begin();
2604 pi != peer_info.end();
2605 ++pi) {
2606 map<pg_shard_t, pg_missing_t>::const_iterator pm =
2607 peer_missing.find(pi->first);
2608 if (pm != peer_missing.end()) {
2609 pi->second.stats.stats.sum.num_objects_missing =
2610 pm->second.num_missing();
2611 }
2612 }
2613
2614 assert(!actingbackfill.empty());
2615 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
2616 i != actingbackfill.end();
2617 ++i) {
2618 const pg_shard_t &p = *i;
2619
2620 bool in_up = (upset.find(p) != upset.end());
2621 bool in_acting = (actingset.find(p) != actingset.end());
2622 assert(in_up || in_acting);
2623
2624 // in acting: compute total objects excluding num_missing
2625 // in acting and not in up: compute misplaced objects excluding num_missing
2626 // in up and not in acting: compute total objects already backfilled
2627 if (in_acting) {
2628 unsigned osd_missing;
2629 // primary handling
2630 if (p == pg_whoami) {
2631 osd_missing = pg_log.get_missing().num_missing();
2632 info.stats.stats.sum.num_objects_missing_on_primary =
2633 osd_missing;
2634 object_copies += num_objects; // My local (primary) count
2635 } else {
2636 assert(peer_missing.count(p));
2637 osd_missing = peer_missing[p].num_missing();
2638 object_copies += peer_info[p].stats.stats.sum.num_objects;
2639 }
2640 missing += osd_missing;
2641 // Count non-missing objects not in up as misplaced
2642 if (!in_up && num_objects > osd_missing)
2643 misplaced += num_objects - osd_missing;
2644 } else {
2645 assert(in_up && !in_acting);
2646
2647 // If this peer has more objects than it should, ignore them
2648 backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
2649 }
2650 }
2651
2652 // Any objects that have been backfilled to up OSDs can be deducted from misplaced
2653 misplaced = MAX(0, misplaced - backfilled);
2654
2655 // Deduct computed total missing on acting nodes
2656 object_copies -= missing;
2657 // Include computed backfilled objects on up nodes
2658 object_copies += backfilled;
2659 // a degraded object has fewer replicas or EC shards than the
2660 // pool specifies. num_object_copies will never be smaller than target * num_copies.
2661 int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
2662
2663 info.stats.stats.sum.num_objects_degraded = degraded;
2664 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2665 info.stats.stats.sum.num_objects_misplaced = misplaced;
2666 }
2667}
2668
2669void PG::_update_blocked_by()
2670{
2671 // set a max on the number of blocking peers we report. if we go
2672 // over, report a random subset. keep the result sorted.
2673 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2674 unsigned skip = blocked_by.size() - keep;
2675 info.stats.blocked_by.clear();
2676 info.stats.blocked_by.resize(keep);
2677 unsigned pos = 0;
2678 for (set<int>::iterator p = blocked_by.begin();
2679 p != blocked_by.end() && keep > 0;
2680 ++p) {
2681 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2682 --skip;
2683 } else {
2684 info.stats.blocked_by[pos++] = *p;
2685 --keep;
2686 }
2687 }
2688}
2689
2690void PG::publish_stats_to_osd()
2691{
2692 if (!is_primary())
2693 return;
2694
2695 pg_stats_publish_lock.Lock();
2696
2697 if (info.stats.stats.sum.num_scrub_errors)
2698 state_set(PG_STATE_INCONSISTENT);
2699 else
2700 state_clear(PG_STATE_INCONSISTENT);
2701
2702 utime_t now = ceph_clock_now();
2703 if (info.stats.state != state) {
2704 info.stats.last_change = now;
2705 // Optimistic estimate: if we just found out the PG is inactive,
2706 // assume it was active until now.
2707 if (!(state & PG_STATE_ACTIVE) &&
2708 (info.stats.state & PG_STATE_ACTIVE))
2709 info.stats.last_active = now;
2710
2711 if ((state & PG_STATE_ACTIVE) &&
2712 !(info.stats.state & PG_STATE_ACTIVE))
2713 info.stats.last_became_active = now;
2714 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2715 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2716 info.stats.last_became_peered = now;
2717 if (!(state & PG_STATE_CREATING) &&
2718 (info.stats.state & PG_STATE_CREATING)) {
2719 osd->send_pg_created(get_pgid().pgid);
2720 }
2721 info.stats.state = state;
2722 }
2723
2724 _update_calc_stats();
2725 _update_blocked_by();
2726
2727 bool publish = false;
2728 pg_stat_t pre_publish = info.stats;
2729 pre_publish.stats.add(unstable_stats);
2730 utime_t cutoff = now;
2731 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2732 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2733 info.stats.last_fresh > cutoff) {
2734 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2735 << ": no change since " << info.stats.last_fresh << dendl;
2736 } else {
2737 // update our stat summary and timestamps
2738 info.stats.reported_epoch = get_osdmap()->get_epoch();
2739 ++info.stats.reported_seq;
2740
2741 info.stats.last_fresh = now;
2742
2743 if (info.stats.state & PG_STATE_CLEAN)
2744 info.stats.last_clean = now;
2745 if (info.stats.state & PG_STATE_ACTIVE)
2746 info.stats.last_active = now;
2747 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2748 info.stats.last_peered = now;
2749 info.stats.last_unstale = now;
2750 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2751 info.stats.last_undegraded = now;
2752 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2753 info.stats.last_fullsized = now;
2754
2755 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2756 // care of this by sending MMonMgrReport to mon.
2757 publish =
2758 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2759 pg_stats_publish_valid = true;
2760 pg_stats_publish = pre_publish;
2761
2762 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2763 << ":" << pg_stats_publish.reported_seq << dendl;
2764 }
2765 pg_stats_publish_lock.Unlock();
2766
2767 if (publish)
2768 osd->pg_stat_queue_enqueue(this);
2769}
2770
2771void PG::clear_publish_stats()
2772{
2773 dout(15) << "clear_stats" << dendl;
2774 pg_stats_publish_lock.Lock();
2775 pg_stats_publish_valid = false;
2776 pg_stats_publish_lock.Unlock();
2777
2778 osd->pg_stat_queue_dequeue(this);
2779}
2780
2781/**
2782 * initialize a newly instantiated pg
2783 *
2784 * Initialize PG state, as when a PG is initially created, or when it
2785 * is first instantiated on the current node.
2786 *
2787 * @param role our role/rank
2788 * @param newup up set
2789 * @param newacting acting set
2790 * @param history pg history
2791 * @param pi past_intervals
2792 * @param backfill true if info should be marked as backfill
2793 * @param t transaction to write out our new state in
2794 */
2795void PG::init(
2796 int role,
2797 const vector<int>& newup, int new_up_primary,
2798 const vector<int>& newacting, int new_acting_primary,
2799 const pg_history_t& history,
2800 const PastIntervals& pi,
2801 bool backfill,
2802 ObjectStore::Transaction *t)
2803{
2804 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2805 << " history " << history
2806 << " past_intervals " << pi
2807 << dendl;
2808
2809 set_role(role);
2810 acting = newacting;
2811 up = newup;
2812 init_primary_up_acting(
2813 newup,
2814 newacting,
2815 new_up_primary,
2816 new_acting_primary);
2817
2818 info.history = history;
2819 past_intervals = pi;
2820
2821 info.stats.up = up;
2822 info.stats.up_primary = new_up_primary;
2823 info.stats.acting = acting;
2824 info.stats.acting_primary = new_acting_primary;
2825 info.stats.mapping_epoch = info.history.same_interval_since;
2826
2827 if (backfill) {
2828 dout(10) << __func__ << ": Setting backfill" << dendl;
2829 info.set_last_backfill(hobject_t());
2830 info.last_complete = info.last_update;
2831 pg_log.mark_log_for_rewrite();
2832 }
2833
2834 on_new_interval();
2835
2836 dirty_info = true;
2837 dirty_big_info = true;
2838 write_if_dirty(*t);
2839}
2840
2841#pragma GCC diagnostic ignored "-Wpragmas"
2842#pragma GCC diagnostic push
2843#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2844
2845void PG::upgrade(ObjectStore *store)
2846{
2847 assert(info_struct_v <= 10);
2848 ObjectStore::Transaction t;
2849
2850 assert(info_struct_v >= 7);
2851
2852 // 7 -> 8
2853 if (info_struct_v <= 7) {
2854 pg_log.mark_log_for_rewrite();
2855 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2856 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2857 t.remove(coll_t::meta(), log_oid);
2858 t.remove(coll_t::meta(), biginfo_oid);
2859 t.touch(coll, pgmeta_oid);
2860 }
2861
2862 // 8 -> 9
2863 if (info_struct_v <= 8) {
2864 // no special action needed.
2865 }
2866
2867 // 9 -> 10
2868 if (info_struct_v <= 9) {
2869 // previous versions weren't (as) aggressively clearing past_intervals
2870 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2871 dout(20) << __func__ << " clearing past_intervals" << dendl;
2872 past_intervals.clear();
2873 }
2874 }
2875
2876 // update infover_key
2877 if (info_struct_v < cur_struct_v) {
2878 map<string,bufferlist> v;
2879 __u8 ver = cur_struct_v;
2880 ::encode(ver, v[infover_key]);
2881 t.omap_setkeys(coll, pgmeta_oid, v);
2882 }
2883
2884 dirty_info = true;
2885 dirty_big_info = true;
2886 write_if_dirty(t);
2887
2888 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2889 ObjectStore::Sequencer>("upgrade"));
2890 int r = store->apply_transaction(osr.get(), std::move(t));
2891 if (r != 0) {
2892 derr << __func__ << ": apply_transaction returned "
2893 << cpp_strerror(r) << dendl;
2894 ceph_abort();
2895 }
2896 assert(r == 0);
2897
2898 C_SaferCond waiter;
2899 if (!osr->flush_commit(&waiter)) {
2900 waiter.wait();
2901 }
2902}
2903
2904#pragma GCC diagnostic pop
2905#pragma GCC diagnostic warning "-Wpragmas"
2906
2907int PG::_prepare_write_info(CephContext* cct,
2908 map<string,bufferlist> *km,
2909 epoch_t epoch,
2910 pg_info_t &info, pg_info_t &last_written_info,
2911 PastIntervals &past_intervals,
2912 bool dirty_big_info,
2913 bool dirty_epoch,
2914 bool try_fast_info,
2915 PerfCounters *logger)
2916{
2917 if (dirty_epoch) {
2918 ::encode(epoch, (*km)[epoch_key]);
2919 }
2920
2921 if (logger)
2922 logger->inc(l_osd_pg_info);
2923
2924 // try to do info efficiently?
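 // If none of the "big" state changed and last_update advanced, we can
 // persist a compact pg_fast_info_t delta instead of re-encoding the
 // whole pg_info_t; fall back to the full encode if the delta does not
 // reproduce the current info exactly.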
2925 if (!dirty_big_info && try_fast_info &&
2926 info.last_update > last_written_info.last_update) {
2927 pg_fast_info_t fast;
2928 fast.populate_from(info);
2929 bool did = fast.try_apply_to(&last_written_info);
2930 assert(did); // we verified last_update increased above
2931 if (info == last_written_info) {
2932 ::encode(fast, (*km)[fastinfo_key]);
2933 if (logger)
2934 logger->inc(l_osd_pg_fastinfo);
2935 return 0;
2936 }
2937 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
2938 {
2939 JSONFormatter jf(true);
2940 jf.dump_object("info", info);
2941 jf.flush(*_dout);
2942 }
2943 {
2944 *_dout << "\nlast_written_info:\n";
2945 JSONFormatter jf(true);
2946 jf.dump_object("last_written_info", last_written_info);
2947 jf.flush(*_dout);
2948 }
2949 *_dout << dendl;
2950 }
2951 last_written_info = info;
2952
2953 // info. store purged_snaps separately.
2954 interval_set<snapid_t> purged_snaps;
2955 purged_snaps.swap(info.purged_snaps);
2956 ::encode(info, (*km)[info_key]);
2957 purged_snaps.swap(info.purged_snaps);
2958
2959 if (dirty_big_info) {
2960 // potentially big stuff
2961 bufferlist& bigbl = (*km)[biginfo_key];
2962 ::encode(past_intervals, bigbl);
2963 ::encode(info.purged_snaps, bigbl);
2964 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2965 if (logger)
2966 logger->inc(l_osd_pg_biginfo);
2967 }
2968
2969 return 0;
2970}
2971
2972void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
2973{
2974 coll_t coll(pgid);
2975 t.create_collection(coll, bits);
2976}
2977
2978void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
2979{
2980 coll_t coll(pgid);
2981
2982 if (pool) {
2983 // Give a hint to the PG collection
2984 bufferlist hint;
2985 uint32_t pg_num = pool->get_pg_num();
2986 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
2987 ::encode(pg_num, hint);
2988 ::encode(expected_num_objects_pg, hint);
2989 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
2990 t.collection_hint(coll, hint_type, hint);
2991 }
2992
2993 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2994 t.touch(coll, pgmeta_oid);
2995 map<string,bufferlist> values;
2996 __u8 struct_v = cur_struct_v;
2997 ::encode(struct_v, values[infover_key]);
2998 t.omap_setkeys(coll, pgmeta_oid, values);
2999}
3000
3001void PG::prepare_write_info(map<string,bufferlist> *km)
3002{
3003 info.stats.stats.add(unstable_stats);
3004 unstable_stats.clear();
3005
3006 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3007 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3008 info,
3009 last_written_info,
3010 past_intervals,
3011 dirty_big_info, need_update_epoch,
3012 cct->_conf->osd_fast_info,
3013 osd->logger);
3014 assert(ret == 0);
3015 if (need_update_epoch)
3016 last_epoch = get_osdmap()->get_epoch();
3017 last_persisted_osdmap_ref = osdmap_ref;
3018
3019 dirty_info = false;
3020 dirty_big_info = false;
3021}
3022
3023#pragma GCC diagnostic ignored "-Wpragmas"
3024#pragma GCC diagnostic push
3025#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3026
3027bool PG::_has_removal_flag(ObjectStore *store,
3028 spg_t pgid)
3029{
3030 coll_t coll(pgid);
3031 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3032
3033 // first try new way
3034 set<string> keys;
3035 keys.insert("_remove");
3036 map<string,bufferlist> values;
3037 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3038 values.size() == 1)
3039 return true;
3040
3041 return false;
3042}
3043
3044int PG::peek_map_epoch(ObjectStore *store,
3045 spg_t pgid,
3046 epoch_t *pepoch,
3047 bufferlist *bl)
3048{
3049 coll_t coll(pgid);
3050 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3051 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3052 epoch_t cur_epoch = 0;
3053
3054 assert(bl);
3055 {
3056 // validate collection name
3057 assert(coll.is_pg());
3058 }
3059
3060 // try for v8
3061 set<string> keys;
3062 keys.insert(infover_key);
3063 keys.insert(epoch_key);
3064 map<string,bufferlist> values;
3065 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3066 if (r == 0) {
3067 assert(values.size() == 2);
3068
3069 // sanity check version
3070 bufferlist::iterator bp = values[infover_key].begin();
3071 __u8 struct_v = 0;
3072 ::decode(struct_v, bp);
3073 assert(struct_v >= 8);
3074
3075 // get epoch
3076 bp = values[epoch_key].begin();
3077 ::decode(cur_epoch, bp);
3078 } else {
3079 // probably bug 10617; see OSD::load_pgs()
3080 return -1;
3081 }
3082
3083 *pepoch = cur_epoch;
3084 return 0;
3085}
3086
3087#pragma GCC diagnostic pop
3088#pragma GCC diagnostic warning "-Wpragmas"
3089
3090void PG::write_if_dirty(ObjectStore::Transaction& t)
3091{
3092 map<string,bufferlist> km;
3093 if (dirty_big_info || dirty_info)
3094 prepare_write_info(&km);
3095 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3096 if (!km.empty())
3097 t.omap_setkeys(coll, pgmeta_oid, km);
3098}
3099
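// Primary-only: compute a new trim point and ask every other shard in
// actingbackfill to trim its log to it, then trim our own copy.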
3100void PG::trim_log()
3101{
3102 assert(is_primary());
3103 calc_trim_to();
3104 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3105 if (pg_trim_to != eversion_t()) {
3106 // inform peers to trim log
3107 assert(!actingbackfill.empty());
3108 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3109 i != actingbackfill.end();
3110 ++i) {
3111 if (*i == pg_whoami) continue;
3112 osd->send_message_osd_cluster(
3113 i->osd,
3114 new MOSDPGTrim(
3115 get_osdmap()->get_epoch(),
3116 spg_t(info.pgid.pgid, i->shard),
3117 pg_trim_to),
3118 get_osdmap()->get_epoch());
3119 }
3120
3121 // trim primary as well
3122 pg_log.trim(pg_trim_to, info);
3123 dirty_info = true;
3124 }
3125}
3126
3127void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3128{
3129 // raise last_complete only if we were previously up to date
3130 if (info.last_complete == info.last_update)
3131 info.last_complete = e.version;
3132
3133 // raise last_update.
3134 assert(e.version > info.last_update);
3135 info.last_update = e.version;
3136
3137 // raise user_version, if it increased (it may not have been bumped
3138 // by all logged updates)
3139 if (e.user_version > info.last_user_version)
3140 info.last_user_version = e.user_version;
3141
3142 // log mutation
3143 pg_log.add(e, applied);
3144 dout(10) << "add_log_entry " << e << dendl;
3145}
3146
3147
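// Append freshly logged entries to the local pg log, keeping the snap
// mapper, rollforward machinery, and on-disk log in sync; trim_to and
// roll_forward_to come from the primary's calculations.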
3148void PG::append_log(
3149 const vector<pg_log_entry_t>& logv,
3150 eversion_t trim_to,
3151 eversion_t roll_forward_to,
3152 ObjectStore::Transaction &t,
3153 bool transaction_applied)
3154{
3155 if (transaction_applied)
3156 update_snap_map(logv, t);
3157
3158 /* The primary has sent an info updating the history, but it may not
3159 * have arrived yet. We want to make sure that we cannot remember this
3160 * write without remembering that it happened in an interval which went
3161 * active in epoch history.last_epoch_started.
3162 */
3163 if (info.last_epoch_started != info.history.last_epoch_started) {
3164 info.history.last_epoch_started = info.last_epoch_started;
3165 }
3166 if (info.last_interval_started != info.history.last_interval_started) {
3167 info.history.last_interval_started = info.last_interval_started;
3168 }
3169 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3170
3171 PGLogEntryHandler handler{this, &t};
3172 if (!transaction_applied) {
3173 /* We must be a backfill peer, so it's ok if we apply
3174 * out-of-turn since we won't be considered when
3175 * determining a min possible last_update.
3176 */
3177 pg_log.roll_forward(&handler);
3178 }
3179
3180 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3181 p != logv.end();
3182 ++p) {
3183 add_log_entry(*p, transaction_applied);
3184
3185 /* We don't want to leave the rollforward artifacts around
3186 * here past last_backfill. It's ok for the same reason as
3187 * above */
3188 if (transaction_applied &&
3189 p->soid > info.last_backfill) {
3190 pg_log.roll_forward(&handler);
3191 }
3192 }
3193 auto last = logv.rbegin();
3194 if (is_primary() && last != logv.rend()) {
3195 projected_log.skip_can_rollback_to_to_head();
3196 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3197 }
3198
3199 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3200 pg_log.roll_forward_to(
3201 roll_forward_to,
3202 &handler);
3203 t.register_on_applied(
3204 new C_UpdateLastRollbackInfoTrimmedToApplied(
3205 this,
3206 get_osdmap()->get_epoch(),
3207 roll_forward_to));
3208 }
3209
3210 pg_log.trim(trim_to, info);
3211
3212 // update the local pg, pg log
3213 dirty_info = true;
3214 write_if_dirty(t);
3215}
3216
3217bool PG::check_log_for_corruption(ObjectStore *store)
3218{
3219 /// TODO: this method needs to work with the omap log
3220 return true;
3221}
3222
3223//! Get the name we're going to save our corrupt pg log as
3224std::string PG::get_corrupt_pg_log_name() const
3225{
3226 const int MAX_BUF = 512;
3227 char buf[MAX_BUF];
3228 struct tm tm_buf;
3229 time_t my_time(time(NULL));
3230 const struct tm *t = localtime_r(&my_time, &tm_buf);
3231 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3232 if (ret == 0) {
3233 dout(0) << "strftime failed" << dendl;
3234 return "corrupt_log_unknown_time";
3235 }
3236 string out(buf);
3237 out += stringify(info.pgid);
3238 return out;
3239}
3240
3241int PG::read_info(
3242 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3243 pg_info_t &info, PastIntervals &past_intervals,
3244 __u8 &struct_v)
3245{
3246 // try for v8 or later
3247 set<string> keys;
3248 keys.insert(infover_key);
3249 keys.insert(info_key);
3250 keys.insert(biginfo_key);
3251 keys.insert(fastinfo_key);
3252 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3253 map<string,bufferlist> values;
3254 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3255 if (r == 0) {
3256 assert(values.size() == 3 ||
3257 values.size() == 4);
3258
3259 bufferlist::iterator p = values[infover_key].begin();
3260 ::decode(struct_v, p);
3261 assert(struct_v >= 8);
3262
3263 p = values[info_key].begin();
3264 ::decode(info, p);
3265
3266 p = values[biginfo_key].begin();
3267 if (struct_v >= 10) {
3268 ::decode(past_intervals, p);
3269 } else {
3270 past_intervals.decode_classic(p);
3271 }
3272 ::decode(info.purged_snaps, p);
3273
3274 p = values[fastinfo_key].begin();
3275 if (!p.end()) {
3276 pg_fast_info_t fast;
3277 ::decode(fast, p);
3278 fast.try_apply_to(&info);
3279 }
3280 return 0;
3281 }
3282
3283 // legacy (ver < 8)
3284 ghobject_t infos_oid(OSD::make_infos_oid());
3285 bufferlist::iterator p = bl.begin();
3286 ::decode(struct_v, p);
3287 assert(struct_v == 7);
3288
3289 // get info out of leveldb
3290 string k = get_info_key(info.pgid);
3291 string bk = get_biginfo_key(info.pgid);
3292 keys.clear();
3293 keys.insert(k);
3294 keys.insert(bk);
3295 values.clear();
3296 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3297 assert(values.size() == 2);
3298
3299 p = values[k].begin();
3300 ::decode(info, p);
3301
3302 p = values[bk].begin();
3303 ::decode(past_intervals, p);
3304 interval_set<snapid_t> snap_collections; // obsolete
3305 ::decode(snap_collections, p);
3306 ::decode(info.purged_snaps, p);
3307 return 0;
3308}
3309
3310void PG::read_state(ObjectStore *store, bufferlist &bl)
3311{
3312 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3313 info_struct_v);
3314 assert(r >= 0);
3315
3316 last_written_info = info;
3317
3318 // if we are upgrading from jewel, we need to force rebuild of
3319 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3320 // (before kraken). persisted missing set was circa
3321 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3322 // v8 was pre-jewel (per-pg meta object).
3323 bool force_rebuild_missing = info_struct_v < 9;
3324 if (force_rebuild_missing) {
3325 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3326 << dendl;
3327 }
3328
3329 ostringstream oss;
3330 pg_log.read_log_and_missing(
3331 store,
3332 coll,
3333 info_struct_v < 8 ? coll_t::meta() : coll,
3334 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3335 info,
3336 force_rebuild_missing,
3337 oss,
3338 cct->_conf->osd_ignore_stale_divergent_priors,
3339 cct->_conf->osd_debug_verify_missing_on_start);
3340 if (oss.tellp())
3341 osd->clog->error() << oss.rdbuf();
3342
3343 if (force_rebuild_missing) {
3344 dout(10) << __func__ << " forced rebuild of missing got "
3345 << pg_log.get_missing()
3346 << dendl;
3347 }
3348
3349 // log any weirdness
3350 log_weirdness();
3351}
3352
3353void PG::log_weirdness()
3354{
3355 if (pg_log.get_tail() != info.log_tail)
3356 osd->clog->error() << info.pgid
3357 << " info mismatch, log.tail " << pg_log.get_tail()
3358 << " != info.log_tail " << info.log_tail;
3359 if (pg_log.get_head() != info.last_update)
3360 osd->clog->error() << info.pgid
3361 << " info mismatch, log.head " << pg_log.get_head()
3362 << " != info.last_update " << info.last_update;
3363
3364 if (!pg_log.get_log().empty()) {
3365 // sloppy check
3366 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3367 osd->clog->error() << info.pgid
3368 << " log bound mismatch, info (tail,head] ("
3369 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3370 << " actual ["
3371 << pg_log.get_log().log.begin()->version << ","
3372 << pg_log.get_log().log.rbegin()->version << "]";
3373 }
3374
3375 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3376 osd->clog->error() << info.pgid
3377 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3378 << " > log size " << pg_log.get_log().log.size();
3379 }
3380}
3381
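// Keep the SnapMapper consistent with the incoming log entries:
// deletes drop the object's snap mappings, clones/promotes add them,
// and modifies update them in place.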
3382void PG::update_snap_map(
3383 const vector<pg_log_entry_t> &log_entries,
3384 ObjectStore::Transaction &t)
3385{
3386 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3387 i != log_entries.end();
3388 ++i) {
3389 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3390 if (i->soid.snap < CEPH_MAXSNAP) {
3391 if (i->is_delete()) {
3392 int r = snap_mapper.remove_oid(
3393 i->soid,
3394 &_t);
3395 assert(r == 0);
3396 } else if (i->is_update()) {
3397 assert(i->snaps.length() > 0);
3398 vector<snapid_t> snaps;
3399 bufferlist snapbl = i->snaps;
3400 bufferlist::iterator p = snapbl.begin();
3401 try {
3402 ::decode(snaps, p);
3403 } catch (...) {
3404 snaps.clear();
3405 }
3406 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3407
3408 if (i->is_clone() || i->is_promote()) {
3409 snap_mapper.add_oid(
3410 i->soid,
3411 _snaps,
3412 &_t);
3413 } else if (i->is_modify()) {
3414 assert(i->is_modify());
3415 int r = snap_mapper.update_snaps(
3416 i->soid,
3417 _snaps,
3418 0,
3419 &_t);
3420 assert(r == 0);
3421 } else {
3422 assert(i->is_clean());
3423 }
3424 }
3425 }
3426 }
3427}
3428
3429/**
3430 * filter trimming|trimmed snaps out of snapcontext
3431 */
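// Illustrative example (hypothetical values): with snaps = [8,4,2]
// and snap 4 in snap_trimq, the filtered result is [8,2].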
3432void PG::filter_snapc(vector<snapid_t> &snaps)
3433{
3434 // nothing needs trimming; we can return immediately
3435 if(snap_trimq.empty() && info.purged_snaps.empty())
3436 return;
3437
3438 bool filtering = false;
3439 vector<snapid_t> newsnaps;
3440 for (vector<snapid_t>::iterator p = snaps.begin();
3441 p != snaps.end();
3442 ++p) {
3443 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3444 if (!filtering) {
3445 // start building a new vector with what we've seen so far
3446 dout(10) << "filter_snapc filtering " << snaps << dendl;
3447 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3448 filtering = true;
3449 }
3450 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3451 } else {
3452 if (filtering)
3453 newsnaps.push_back(*p); // continue building new vector
3454 }
3455 }
3456 if (filtering) {
3457 snaps.swap(newsnaps);
3458 dout(10) << "filter_snapc result " << snaps << dendl;
3459 }
3460}
3461
3462void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3463{
3464 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3465 it != m.end();
3466 ++it)
3467 requeue_ops(it->second);
3468 m.clear();
3469}
3470
3471void PG::requeue_op(OpRequestRef op)
3472{
3473 auto p = waiting_for_map.find(op->get_source());
3474 if (p != waiting_for_map.end()) {
3475 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3476 << dendl;
3477 p->second.push_front(op);
3478 } else {
3479 dout(20) << __func__ << " " << op << dendl;
3480 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3481 }
3482}
3483
3484void PG::requeue_ops(list<OpRequestRef> &ls)
3485{
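 // walk the list back to front so that push_front()/enqueue_front()
 // preserve the original op order at the head of the target queue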
3486 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3487 i != ls.rend();
3488 ++i) {
3489 auto p = waiting_for_map.find((*i)->get_source());
3490 if (p != waiting_for_map.end()) {
3491 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3492 << ")" << dendl;
3493 p->second.push_front(*i);
3494 } else {
3495 dout(20) << __func__ << " " << *i << dendl;
3496 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3497 }
3498 }
3499 ls.clear();
3500}
3501
3502void PG::requeue_map_waiters()
3503{
3504 epoch_t epoch = get_osdmap()->get_epoch();
3505 auto p = waiting_for_map.begin();
3506 while (p != waiting_for_map.end()) {
3507 if (epoch < p->second.front()->min_epoch) {
3508 dout(20) << __func__ << " " << p->first << " front op "
3509 << p->second.front() << " must still wait, doing nothing"
3510 << dendl;
3511 ++p;
3512 } else {
3513 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3514 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3515 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3516 }
3517 p = waiting_for_map.erase(p);
3518 }
3519 }
3520}
3521
3522
3523// ==========================================================================================
3524// SCRUB
3525
3526/*
3527 * when holding pg and sched_scrub_lock, then the states are:
3528 * scheduling:
3529 * scrubber.reserved = true
3530 * scrubber.reserved_peers includes whoami
3531 * osd->scrub_pending++
3532 * scheduling, replica declined:
3533 * scrubber.reserved = true
3534 * scrubber.reserved_peers includes -1
3535 * osd->scrub_pending++
3536 * pending:
3537 * scrubber.reserved = true
3538 * scrubber.reserved_peers.size() == acting.size();
3539 * pg on scrub_wq
3540 * osd->scrub_pending++
3541 * scrubbing:
3542 * scrubber.reserved = false;
3543 * scrubber.reserved_peers empty
3544 * osd->scrubber.active++
3545 */
3546
3547// returns true if a scrub has been newly kicked off
3548bool PG::sched_scrub()
3549{
3550 bool nodeep_scrub = false;
3551 assert(is_locked());
3552 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3553 return false;
3554 }
3555
3556 double deep_scrub_interval = 0;
3557 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3558 if (deep_scrub_interval <= 0) {
3559 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3560 }
3561 bool time_for_deep = ceph_clock_now() >=
3562 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3563
3564 bool deep_coin_flip = false;
3565 // Only add random deep scrubs when the scrub was NOT user initiated
3566 if (!scrubber.must_scrub)
3567 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3568 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3569
3570 time_for_deep = (time_for_deep || deep_coin_flip);
3571
3572 // NODEEP_SCRUB is set, so ignore time-initiated deep-scrub
3573 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3574 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3575 time_for_deep = false;
3576 nodeep_scrub = true;
3577 }
3578
3579 if (!scrubber.must_scrub) {
3580 assert(!scrubber.must_deep_scrub);
3581
3582 // NOSCRUB is set, so skip regular scrubs
3583 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3584 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3585 if (scrubber.reserved) {
3586 // cancel the scrub if it is still being scheduled, so that
3587 // pgs from other pools where scrubs are still legal
3588 // have a chance to go ahead with scrubbing.
3589 clear_scrub_reserved();
3590 scrub_unreserve_replicas();
3591 }
3592 return false;
3593 }
3594 }
3595
3596 if (cct->_conf->osd_scrub_auto_repair
3597 && get_pgbackend()->auto_repair_supported()
3598 && time_for_deep
3599 // respect the command from user, and not do auto-repair
3600 && !scrubber.must_repair
3601 && !scrubber.must_scrub
3602 && !scrubber.must_deep_scrub) {
3603 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3604 scrubber.auto_repair = true;
3605 } else {
3606 // this happens when the user issues the scrub/repair command during
3607 // the scheduling of the scrub/repair (e.g. while requesting reservations)
3608 scrubber.auto_repair = false;
3609 }
3610
3611 bool ret = true;
3612 if (!scrubber.reserved) {
3613 assert(scrubber.reserved_peers.empty());
3614 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3615 osd->inc_scrubs_pending()) {
3616 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3617 scrubber.reserved = true;
3618 scrubber.reserved_peers.insert(pg_whoami);
3619 scrub_reserve_replicas();
3620 } else {
3621 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3622 ret = false;
3623 }
3624 }
3625 if (scrubber.reserved) {
3626 if (scrubber.reserve_failed) {
3627 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3628 clear_scrub_reserved();
3629 scrub_unreserve_replicas();
3630 ret = false;
3631 } else if (scrubber.reserved_peers.size() == acting.size()) {
3632 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3633 if (time_for_deep) {
3634 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3635 state_set(PG_STATE_DEEP_SCRUB);
3636 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3637 if (!nodeep_scrub) {
3638 osd->clog->info() << "osd." << osd->whoami
3639 << " pg " << info.pgid
3640 << " Deep scrub errors, upgrading scrub to deep-scrub";
3641 state_set(PG_STATE_DEEP_SCRUB);
3642 } else if (!scrubber.must_scrub) {
3643 osd->clog->error() << "osd." << osd->whoami
3644 << " pg " << info.pgid
3645 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3646 clear_scrub_reserved();
3647 scrub_unreserve_replicas();
3648 return false;
3649 } else {
3650 osd->clog->error() << "osd." << osd->whoami
3651 << " pg " << info.pgid
3652 << " Regular scrub request, deep-scrub details will be lost";
3653 }
3654 }
3655 queue_scrub();
3656 } else {
3657 // none declined, since scrubber.reserved is set
3658 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3659 }
3660 }
3661
3662 return ret;
3663}
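// Illustrative sketch, not from the original PG.cc: sched_scrub() above
// decides a deep scrub is due when the configured interval has elapsed
// since last_deep_scrub_stamp, or when a randomized coin flip fires for a
// non-user-initiated scrub (the NODEEP_SCRUB override is ignored here).
// The core of that decision, restated stand-alone with hypothetical
// parameters (compiled separately):
#if 0
#include <cstdlib>

bool deep_scrub_due(double now, double last_deep_stamp, double interval,
                    double randomize_ratio, bool user_initiated) {
  bool time_for_deep = now >= last_deep_stamp + interval;
  bool coin_flip =
      !user_initiated && (std::rand() % 100) < randomize_ratio * 100;
  return time_for_deep || coin_flip;
}
#endif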
3664
3665void PG::reg_next_scrub()
3666{
3667 if (!is_primary())
3668 return;
3669
3670 utime_t reg_stamp;
3671 if (scrubber.must_scrub ||
3672 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3673 reg_stamp = ceph_clock_now();
3674 } else {
3675 reg_stamp = info.history.last_scrub_stamp;
3676 }
3677 // note down the sched_time, so we can locate this scrub, and remove it
3678 // later on.
3679 double scrub_min_interval = 0, scrub_max_interval = 0;
3680 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3681 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3682 assert(scrubber.scrub_reg_stamp == utime_t());
3683 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3684 reg_stamp,
3685 scrub_min_interval,
3686 scrub_max_interval,
3687 scrubber.must_scrub);
3688}
3689
3690void PG::unreg_next_scrub()
3691{
3692 if (is_primary()) {
3693 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3694 scrubber.scrub_reg_stamp = utime_t();
3695 }
3696}
3697
3698void PG::do_replica_scrub_map(OpRequestRef op)
3699{
3700 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3701 dout(7) << __func__ << " " << *m << dendl;
3702 if (m->map_epoch < info.history.same_interval_since) {
3703 dout(10) << __func__ << " discarding old from "
3704 << m->map_epoch << " < " << info.history.same_interval_since
3705 << dendl;
3706 return;
3707 }
3708 if (!scrubber.is_chunky_scrub_active()) {
3709 dout(10) << __func__ << " scrub isn't active" << dendl;
3710 return;
3711 }
3712
3713 op->mark_started();
3714
3715 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3716 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3717 dout(10) << "map version is "
3718 << scrubber.received_maps[m->from].valid_through
3719 << dendl;
3720
3721 --scrubber.waiting_on;
3722 scrubber.waiting_on_whom.erase(m->from);
3723 if (scrubber.waiting_on == 0) {
3724 if (ops_blocked_by_scrub()) {
3725 requeue_scrub(true);
3726 } else {
3727 requeue_scrub(false);
3728 }
3729 }
3730}
3731
3732void PG::sub_op_scrub_map(OpRequestRef op)
3733{
3734 // for legacy jewel compatibility only
3735 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3736 assert(m->get_type() == MSG_OSD_SUBOP);
3737 dout(7) << "sub_op_scrub_map" << dendl;
3738
3739 if (m->map_epoch < info.history.same_interval_since) {
3740 dout(10) << "sub_op_scrub discarding old sub_op from "
3741 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3742 return;
3743 }
3744
3745 if (!scrubber.is_chunky_scrub_active()) {
3746 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3747 return;
3748 }
3749
3750 op->mark_started();
3751
3752 dout(10) << " got " << m->from << " scrub map" << dendl;
3753 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3754
3755 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3756 dout(10) << "map version is "
3757 << scrubber.received_maps[m->from].valid_through
3758 << dendl;
3759
3760 --scrubber.waiting_on;
3761 scrubber.waiting_on_whom.erase(m->from);
3762
3763 if (scrubber.waiting_on == 0) {
3764 if (ops_blocked_by_scrub()) {
3765 requeue_scrub(true);
3766 } else {
3767 requeue_scrub(false);
3768 }
3769 }
3770}
3771
3772// send scrub v3 messages (chunky scrub)
3773void PG::_request_scrub_map(
3774 pg_shard_t replica, eversion_t version,
3775 hobject_t start, hobject_t end,
3776 bool deep, uint32_t seed)
3777{
3778 assert(replica != pg_whoami);
3779 dout(10) << "scrub requesting scrubmap from osd." << replica
3780 << " deep " << (int)deep << " seed " << seed << dendl;
3781 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3782 spg_t(info.pgid.pgid, replica.shard), version,
3783 get_osdmap()->get_epoch(),
3784 get_last_peering_reset(),
3785 start, end, deep, seed);
3786 // default priority, we want the rep scrub processed prior to any recovery
3787 // or client io messages (we are holding a lock!)
3788 osd->send_message_osd_cluster(
3789 replica.osd, repscrubop, get_osdmap()->get_epoch());
3790}
3791
3792void PG::handle_scrub_reserve_request(OpRequestRef op)
3793{
3794 dout(7) << __func__ << " " << *op->get_req() << dendl;
3795 op->mark_started();
3796 if (scrubber.reserved) {
3797 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3798 << dendl;
3799 return;
3800 }
3801 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3802 osd->inc_scrubs_pending()) {
3803 scrubber.reserved = true;
3804 } else {
3805 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
3806 scrubber.reserved = false;
3807 }
3808 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3809 const MOSDScrubReserve *m =
3810 static_cast<const MOSDScrubReserve*>(op->get_req());
3811 Message *reply = new MOSDScrubReserve(
3812 spg_t(info.pgid.pgid, primary.shard),
3813 m->map_epoch,
3814 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3815 pg_whoami);
3816 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3817 } else {
3818 // for jewel compat only
3819 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3820 assert(req->get_type() == MSG_OSD_SUBOP);
3821 MOSDSubOpReply *reply = new MOSDSubOpReply(
3822 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3823 ::encode(scrubber.reserved, reply->get_data());
3824 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3825 }
3826}
3827
3828void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3829{
3830 dout(7) << __func__ << " " << *op->get_req() << dendl;
3831 op->mark_started();
3832 if (!scrubber.reserved) {
3833 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3834 return;
3835 }
3836 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3837 dout(10) << " already had osd." << from << " reserved" << dendl;
3838 } else {
3839 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3840 scrubber.reserved_peers.insert(from);
3841 sched_scrub();
3842 }
3843}
3844
3845void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3846{
3847 dout(7) << __func__ << " " << *op->get_req() << dendl;
3848 op->mark_started();
3849 if (!scrubber.reserved) {
3850 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3851 return;
3852 }
3853 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3854 dout(10) << " already had osd." << from << " reserved" << dendl;
3855 } else {
3856 /* One decline stops this pg from being scheduled for scrubbing. */
3857 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3858 scrubber.reserve_failed = true;
3859 sched_scrub();
3860 }
3861}
3862
3863void PG::handle_scrub_reserve_release(OpRequestRef op)
3864{
3865 dout(7) << __func__ << " " << *op->get_req() << dendl;
3866 op->mark_started();
3867 clear_scrub_reserved();
3868}
3869
3870void PG::reject_reservation()
3871{
3872 osd->send_message_osd_cluster(
3873 primary.osd,
3874 new MBackfillReserve(
3875 MBackfillReserve::REJECT,
3876 spg_t(info.pgid.pgid, primary.shard),
3877 get_osdmap()->get_epoch()),
3878 get_osdmap()->get_epoch());
3879}
3880
3881void PG::schedule_backfill_full_retry()
3882{
3883 Mutex::Locker lock(osd->recovery_request_lock);
3884 osd->recovery_request_timer.add_event_after(
3885 cct->_conf->osd_backfill_retry_interval,
3886 new QueuePeeringEvt<RequestBackfill>(
3887 this, get_osdmap()->get_epoch(),
3888 RequestBackfill()));
3889}
3890
3891void PG::schedule_recovery_full_retry()
3892{
3893 Mutex::Locker lock(osd->recovery_request_lock);
3894 osd->recovery_request_timer.add_event_after(
3895 cct->_conf->osd_recovery_retry_interval,
3896 new QueuePeeringEvt<DoRecovery>(
3897 this, get_osdmap()->get_epoch(),
3898 DoRecovery()));
3899}
3900
3901void PG::clear_scrub_reserved()
3902{
3903 scrubber.reserved_peers.clear();
3904 scrubber.reserve_failed = false;
3905
3906 if (scrubber.reserved) {
3907 scrubber.reserved = false;
3908 osd->dec_scrubs_pending();
3909 }
3910}
3911
3912void PG::scrub_reserve_replicas()
3913{
3914 assert(backfill_targets.empty());
3915 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3916 i != actingbackfill.end();
3917 ++i) {
3918 if (*i == pg_whoami) continue;
3919 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3920 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3921 osd->send_message_osd_cluster(
3922 i->osd,
3923 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3924 get_osdmap()->get_epoch(),
3925 MOSDScrubReserve::REQUEST, pg_whoami),
3926 get_osdmap()->get_epoch());
3927 } else {
3928 // for jewel compat only
3929 vector<OSDOp> scrub(1);
3930 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
3931 hobject_t poid;
3932 eversion_t v;
3933 osd_reqid_t reqid;
3934 MOSDSubOp *subop = new MOSDSubOp(
3935 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3936 get_osdmap()->get_epoch(), osd->get_tid(), v);
3937 subop->ops = scrub;
3938 osd->send_message_osd_cluster(
3939 i->osd, subop, get_osdmap()->get_epoch());
3940 }
3941 }
3942}
3943
3944void PG::scrub_unreserve_replicas()
3945{
3946 assert(backfill_targets.empty());
3947 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3948 i != actingbackfill.end();
3949 ++i) {
3950 if (*i == pg_whoami) continue;
3951 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
3952 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3953 osd->send_message_osd_cluster(
3954 i->osd,
3955 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3956 get_osdmap()->get_epoch(),
3957 MOSDScrubReserve::RELEASE, pg_whoami),
3958 get_osdmap()->get_epoch());
3959 } else {
3960 // for jewel compat only
3961 vector<OSDOp> scrub(1);
3962 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
3963 hobject_t poid;
3964 eversion_t v;
3965 osd_reqid_t reqid;
3966 MOSDSubOp *subop = new MOSDSubOp(
3967 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3968 get_osdmap()->get_epoch(), osd->get_tid(), v);
3969 subop->ops = scrub;
3970 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
3971 }
3972 }
3973}
3974
3975void PG::_scan_rollback_obs(
3976 const vector<ghobject_t> &rollback_obs,
3977 ThreadPool::TPHandle &handle)
3978{
3979 ObjectStore::Transaction t;
3980 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
3981 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
3982 i != rollback_obs.end();
3983 ++i) {
3984 if (i->generation < trimmed_to.version) {
3985 osd->clog->error() << "osd." << osd->whoami
3986 << " pg " << info.pgid
3987 << " found obsolete rollback obj "
3988 << *i << " generation < trimmed_to "
3989 << trimmed_to
3990 << "...repaired";
3991 t.remove(coll, *i);
3992 }
3993 }
3994 if (!t.empty()) {
3995 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
3996 << dendl;
3997 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3998 }
3999}
4000
4001void PG::_scan_snaps(ScrubMap &smap)
4002{
4003 hobject_t head;
4004 SnapSet snapset;
4005 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4006 i != smap.objects.rend();
4007 ++i) {
4008 const hobject_t &hoid = i->first;
4009 ScrubMap::object &o = i->second;
4010
4011 if (hoid.is_head() || hoid.is_snapdir()) {
4012 // parse the SnapSet
4013 bufferlist bl;
4014 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4015 continue;
4016 }
4017 bl.push_back(o.attrs[SS_ATTR]);
4018 auto p = bl.begin();
4019 try {
4020 ::decode(snapset, p);
4021 } catch(...) {
4022 continue;
4023 }
4024 head = hoid.get_head();
4025 // Make sure head_exists is correct for is_legacy() check
4026 if (hoid.is_head())
4027 snapset.head_exists = true;
4028 continue;
4029 }
4030 if (hoid.snap < CEPH_MAXSNAP) {
4031 // check and if necessary fix snap_mapper
4032 if (hoid.get_head() != head) {
4033 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4034 << dendl;
4035 continue;
4036 }
4037 set<snapid_t> obj_snaps;
4038 if (!snapset.is_legacy()) {
4039 auto p = snapset.clone_snaps.find(hoid.snap);
4040 if (p == snapset.clone_snaps.end()) {
4041 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4042 << dendl;
4043 continue;
4044 }
4045 obj_snaps.insert(p->second.begin(), p->second.end());
4046 } else {
4047 bufferlist bl;
4048 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4049 continue;
4050 }
4051 bl.push_back(o.attrs[OI_ATTR]);
4052 object_info_t oi;
4053 try {
4054 oi.decode(bl);
4055 } catch(...) {
4056 continue;
4057 }
4058 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4059 }
4060 set<snapid_t> cur_snaps;
4061 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4062 if (r != 0 && r != -ENOENT) {
4063 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4064 ceph_abort();
4065 }
4066 if (r == -ENOENT || cur_snaps != obj_snaps) {
4067 ObjectStore::Transaction t;
4068 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4069 if (r == 0) {
4070 r = snap_mapper.remove_oid(hoid, &_t);
4071 if (r != 0) {
4072 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4073 << dendl;
4074 ceph_abort();
4075 }
4076 osd->clog->error() << "osd." << osd->whoami
4077 << " found snap mapper error on pg "
4078 << info.pgid
4079 << " oid " << hoid << " snaps in mapper: "
4080 << cur_snaps << ", oi: "
4081 << obj_snaps
4082 << "...repaired";
4083 } else {
4084 osd->clog->error() << "osd." << osd->whoami
4085 << " found snap mapper error on pg "
4086 << info.pgid
4087 << " oid " << hoid << " snaps missing in mapper"
4088 << ", should be: "
4089 << obj_snaps
4090 << "...repaired";
4091 }
4092 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4093 r = osd->store->apply_transaction(osr.get(), std::move(t));
4094 if (r != 0) {
4095 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4096 << dendl;
4097 }
4098 }
4099 }
4100 }
4101}
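// Illustrative sketch, not from the original PG.cc: the snap-mapper repair
// above boils down to: compute the authoritative snap set for a clone (from
// SnapSet::clone_snaps, or object_info_t's legacy_snaps for legacy
// objects), compare it with what the mapper holds, and rewrite the mapper
// entry on any mismatch. The same shape over plain containers (hypothetical
// types, compiled separately):
#if 0
#include <map>
#include <set>
#include <string>

using Snaps = std::set<unsigned>;

void repair_mapper(std::map<std::string, Snaps>& mapper,
                   const std::string& oid, const Snaps& expected) {
  auto it = mapper.find(oid);
  if (it == mapper.end() || it->second != expected) {
    mapper.erase(oid);              // drop the stale or missing record...
    mapper.emplace(oid, expected);  // ...and re-add the correct snaps
  }
}
#endif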
4102
4103void PG::_repair_oinfo_oid(ScrubMap &smap)
4104{
4105 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4106 i != smap.objects.rend();
4107 ++i) {
4108 const hobject_t &hoid = i->first;
4109 ScrubMap::object &o = i->second;
4110
4111 bufferlist bl;
4112 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4113 continue;
4114 }
4115 bl.push_back(o.attrs[OI_ATTR]);
4116 object_info_t oi;
4117 try {
4118 oi.decode(bl);
4119 } catch(...) {
4120 continue;
4121 }
4122 if (oi.soid != hoid) {
4123 ObjectStore::Transaction t;
4124 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4125 osd->clog->error() << "osd." << osd->whoami
4126 << " found object info error on pg "
4127 << info.pgid
4128 << " oid " << hoid << " oid in object info: "
4129 << oi.soid
4130 << "...repaired";
4131 // Fix object info
4132 oi.soid = hoid;
4133 bl.clear();
4134 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4135
4136 bufferptr bp(bl.c_str(), bl.length());
4137 o.attrs[OI_ATTR] = bp;
4138
4139 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4140 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4141 if (r != 0) {
4142 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4143 << dendl;
4144 }
4145 }
4146 }
4147}
4148
4149/*
4150 * build a scrub map over a chunk without releasing the lock
4151 * only used by chunky scrub
4152 */
4153int PG::build_scrub_map_chunk(
4154 ScrubMap &map,
4155 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4156 ThreadPool::TPHandle &handle)
4157{
4158 dout(10) << __func__ << " [" << start << "," << end << ") "
4159 << " seed " << seed << dendl;
4160
4161 map.valid_through = info.last_update;
4162
4163 // objects
4164 vector<hobject_t> ls;
4165 vector<ghobject_t> rollback_obs;
4166 int ret = get_pgbackend()->objects_list_range(
4167 start,
4168 end,
4169 0,
4170 &ls,
4171 &rollback_obs);
4172 if (ret < 0) {
4173 dout(5) << "objects_list_range error: " << ret << dendl;
4174 return ret;
4175 }
4176
4177
4178 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4179 _scan_rollback_obs(rollback_obs, handle);
4180 _scan_snaps(map);
4181 _repair_oinfo_oid(map);
4182
4183 dout(20) << __func__ << " done" << dendl;
4184 return 0;
4185}
4186
4187void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4188 if (!store)
4189 return;
4190 struct OnComplete : Context {
4191 std::unique_ptr<Scrub::Store> store;
4192 OnComplete(
4193 std::unique_ptr<Scrub::Store> &&store)
4194 : store(std::move(store)) {}
4195 void finish(int) override {}
4196 };
4197 store->cleanup(t);
4198 t->register_on_complete(new OnComplete(std::move(store)));
4199 assert(!store);
4200}
4201
4202void PG::repair_object(
4203 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4204 pg_shard_t bad_peer)
4205{
4206 list<pg_shard_t> op_shards;
4207 for (auto i : *ok_peers) {
4208 op_shards.push_back(i.second);
4209 }
4210 dout(10) << "repair_object " << soid << " bad_peer osd."
4211 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4212 ScrubMap::object &po = ok_peers->back().first;
4213 eversion_t v;
4214 bufferlist bv;
4215 bv.push_back(po.attrs[OI_ATTR]);
4216 object_info_t oi;
4217 try {
4218 bufferlist::iterator bliter = bv.begin();
4219 ::decode(oi, bliter);
4220 } catch (...) {
4221 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4222 assert(0);
4223 }
4224 if (bad_peer != primary) {
4225 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4226 } else {
4227 // We should only be scrubbing if the PG is clean.
4228 assert(waiting_for_unreadable_object.empty());
4229
4230 pg_log.missing_add(soid, oi.version, eversion_t());
4231
4232 pg_log.set_last_requested(0);
4233 dout(10) << __func__ << ": primary = " << primary << dendl;
4234 }
4235
4236 if (is_ec_pg() || bad_peer == primary) {
4237 // we'd better collect all shards for an EC pg, and prepare good peers
4238 // as the source of the pull in the case of a replicated pg.
4239 missing_loc.add_missing(soid, oi.version, eversion_t());
4240 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4241 for (i = ok_peers->begin();
4242 i != ok_peers->end();
4243 ++i)
4244 missing_loc.add_location(soid, i->second);
4245 }
4246}
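// Illustrative sketch, not from the original PG.cc: repair_object() above
// only does bookkeeping: the bad shard is marked missing at the object's
// version, and each good shard is recorded as a recovery location; the
// actual copy happens later during recovery. That bookkeeping shape,
// stand-alone (hypothetical types, compiled separately):
#if 0
#include <map>
#include <set>
#include <string>
#include <vector>

struct RepairBook {
  std::map<int, std::set<std::string>> missing;    // shard -> missing oids
  std::map<std::string, std::set<int>> locations;  // oid -> good shards
};

void note_repair(RepairBook& book, const std::string& oid, int bad_shard,
                 const std::vector<int>& ok_shards) {
  book.missing[bad_shard].insert(oid);  // pull target
  for (int s : ok_shards)
    book.locations[oid].insert(s);      // pull sources
}
#endif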
4247
4248/* replica_scrub
4249 *
4250 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4251 * for pushes to complete in case of recent recovery. Build a single
4252 * scrubmap of objects that are in the range [msg->start, msg->end).
4253 */
4254void PG::replica_scrub(
4255 OpRequestRef op,
4256 ThreadPool::TPHandle &handle)
4257{
4258 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4259 assert(!scrubber.active_rep_scrub);
4260 dout(7) << "replica_scrub" << dendl;
4261
4262 if (msg->map_epoch < info.history.same_interval_since) {
4263 dout(10) << "replica_scrub discarding old replica_scrub from "
4264 << msg->map_epoch << " < " << info.history.same_interval_since
4265 << dendl;
4266 return;
4267 }
4268
4269 ScrubMap map;
4270
4271 assert(msg->chunky);
4272 if (last_update_applied < msg->scrub_to) {
4273 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4274 scrubber.active_rep_scrub = op;
4275 return;
4276 }
4277
4278 if (active_pushes > 0) {
4279 dout(10) << "waiting for active pushes to finish" << dendl;
4280 scrubber.active_rep_scrub = op;
4281 return;
4282 }
4283
4284 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4285 hobject_t start = msg->start;
4286 hobject_t end = msg->end;
4287 if (!start.is_max())
4288 start.pool = info.pgid.pool();
4289 if (!end.is_max())
4290 end.pool = info.pgid.pool();
4291
4292 build_scrub_map_chunk(
4293 map, start, end, msg->deep, msg->seed,
4294 handle);
4295
4296 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4297 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4298 spg_t(info.pgid.pgid, get_primary().shard),
4299 msg->map_epoch,
4300 pg_whoami);
4301 ::encode(map, reply->get_data());
4302 osd->send_message_osd_cluster(reply, msg->get_connection());
4303 } else {
4304 // for jewel compatibility
4305 vector<OSDOp> scrub(1);
4306 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4307 hobject_t poid;
4308 eversion_t v;
4309 osd_reqid_t reqid;
4310 MOSDSubOp *subop = new MOSDSubOp(
4311 reqid,
4312 pg_whoami,
4313 spg_t(info.pgid.pgid, get_primary().shard),
4314 poid,
4315 0,
4316 msg->map_epoch,
4317 osd->get_tid(),
4318 v);
4319 ::encode(map, subop->get_data());
4320 subop->ops = scrub;
4321 osd->send_message_osd_cluster(subop, msg->get_connection());
4322 }
4323}
4324
4325/* Scrub:
4326 * PG_STATE_SCRUBBING is set when the scrub is queued
4327 *
4328 * scrub will be chunky if all OSDs in PG support chunky scrub
4329 * scrub will fail if OSDs are too old.
4330 */
4331void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4332{
4333 if (cct->_conf->osd_scrub_sleep > 0 &&
4334 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4335 scrubber.state == PG::Scrubber::INACTIVE) &&
4336 scrubber.needs_sleep) {
4337 ceph_assert(!scrubber.sleeping);
4338 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4339
4340 // Do an async sleep so we don't block the op queue
4341 OSDService *osds = osd;
4342 spg_t pgid = get_pgid();
4343 int state = scrubber.state;
4344 auto scrub_requeue_callback =
4345 new FunctionContext([osds, pgid, state](int r) {
4346 PG *pg = osds->osd->lookup_lock_pg(pgid);
4347 if (pg == nullptr) {
4348 lgeneric_dout(osds->osd->cct, 20)
4349 << "scrub_requeue_callback: Could not find "
4350 << "PG " << pgid << " can't complete scrub requeue after sleep"
4351 << dendl;
4352 return;
4353 }
4354 pg->scrubber.sleeping = false;
4355 pg->scrubber.needs_sleep = false;
4356 lgeneric_dout(pg->cct, 20)
4357 << "scrub_requeue_callback: slept for "
4358 << ceph_clock_now() - pg->scrubber.sleep_start
4359 << ", re-queuing scrub with state " << state << dendl;
4360 pg->scrub_queued = false;
4361 pg->requeue_scrub();
4362 pg->scrubber.sleep_start = utime_t();
4363 pg->unlock();
4364 });
4365 Mutex::Locker l(osd->scrub_sleep_lock);
4366 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4367 scrub_requeue_callback);
4368 scrubber.sleeping = true;
4369 scrubber.sleep_start = ceph_clock_now();
4370 return;
4371 }
4372 if (pg_has_reset_since(queued)) {
4373 return;
4374 }
4375 assert(scrub_queued);
4376 scrub_queued = false;
4377 scrubber.needs_sleep = true;
4378
4379 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4380 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4381 state_clear(PG_STATE_SCRUBBING);
4382 state_clear(PG_STATE_REPAIR);
4383 state_clear(PG_STATE_DEEP_SCRUB);
4384 publish_stats_to_osd();
4385 return;
4386 }
4387
4388 if (!scrubber.active) {
4389 assert(backfill_targets.empty());
4390
4391 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4392
4393 dout(10) << "starting a new chunky scrub" << dendl;
4394 }
4395
4396 chunky_scrub(handle);
4397}
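// Illustrative sketch, not from the original PG.cc: the scrub-sleep logic
// above never sleeps in the op thread; it schedules a timer callback that
// re-queues the scrub after the delay. The shape of that pattern with
// std::thread standing in for the timer (hypothetical names, compiled
// separately):
#if 0
#include <chrono>
#include <functional>
#include <thread>

// run `fn` after `delay` without blocking the calling thread
void add_event_after(std::chrono::milliseconds delay,
                     std::function<void()> fn) {
  std::thread([delay, fn = std::move(fn)]() {
    std::this_thread::sleep_for(delay);  // the "sleep" happens off-thread
    fn();                                // e.g. requeue the scrub
  }).detach();
}
#endif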
4398
4399/*
4400 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4401 * chunk.
4402 *
4403 * The object store is partitioned into chunks which end on hash boundaries. For
4404 * each chunk, the following logic is performed:
4405 *
4406 * (1) Block writes on the chunk
4407 * (2) Request maps from replicas
4408 * (3) Wait for pushes to be applied (after recovery)
4409 * (4) Wait for writes to flush on the chunk
4410 * (5) Wait for maps from replicas
4411 * (6) Compare / repair all scrub maps
4412 * (7) Wait for digest updates to apply
4413 *
4414 * This logic is encoded in the mostly linear state machine:
4415 *
4416 * +------------------+
4417 * _________v__________ |
4418 * | | |
4419 * | INACTIVE | |
4420 * |____________________| |
4421 * | |
4422 * | +----------+ |
4423 * _________v___v______ | |
4424 * | | | |
4425 * | NEW_CHUNK | | |
4426 * |____________________| | |
4427 * | | |
4428 * _________v__________ | |
4429 * | | | |
4430 * | WAIT_PUSHES | | |
4431 * |____________________| | |
4432 * | | |
4433 * _________v__________ | |
4434 * | | | |
4435 * | WAIT_LAST_UPDATE | | |
4436 * |____________________| | |
4437 * | | |
4438 * _________v__________ | |
4439 * | | | |
4440 * | BUILD_MAP | | |
4441 * |____________________| | |
4442 * | | |
4443 * _________v__________ | |
4444 * | | | |
4445 * | WAIT_REPLICAS | | |
4446 * |____________________| | |
4447 * | | |
4448 * _________v__________ | |
4449 * | | | |
4450 * | COMPARE_MAPS | | |
4451 * |____________________| | |
4452 * | | |
4453 * | | |
4454 * _________v__________ | |
4455 * | | | |
4456 * |WAIT_DIGEST_UPDATES | | |
4457 * |____________________| | |
4458 * | | | |
4459 * | +----------+ |
4460 * _________v__________ |
4461 * | | |
4462 * | FINISH | |
4463 * |____________________| |
4464 * | |
4465 * +------------------+
4466 *
4467 * The primary determines the last update from the subset by walking the log. If
4468 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4469 * to wait until that update is applied before building a scrub map. Both the
4470 * primary and replicas will wait for any active pushes to be applied.
4471 *
4472 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4473 *
4474 * scrubber.state encodes the current state of the scrub (refer to state diagram
4475 * for details).
4476 */
4477void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4478{
4479 // check for map changes
4480 if (scrubber.is_chunky_scrub_active()) {
4481 if (scrubber.epoch_start != info.history.same_interval_since) {
4482 dout(10) << "scrub pg changed, aborting" << dendl;
4483 scrub_clear_state();
4484 scrub_unreserve_replicas();
4485 return;
4486 }
4487 }
4488
4489 bool done = false;
4490 int ret;
4491
4492 while (!done) {
4493 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4494 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4495
4496 switch (scrubber.state) {
4497 case PG::Scrubber::INACTIVE:
4498 dout(10) << "scrub start" << dendl;
4499
4500 publish_stats_to_osd();
4501 scrubber.epoch_start = info.history.same_interval_since;
4502 scrubber.active = true;
4503
4504 osd->inc_scrubs_active(scrubber.reserved);
4505 if (scrubber.reserved) {
4506 scrubber.reserved = false;
4507 scrubber.reserved_peers.clear();
4508 }
4509
4510 {
4511 ObjectStore::Transaction t;
4512 scrubber.cleanup_store(&t);
4513 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4514 info.pgid, coll));
4515 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4516 }
4517
4518 // Don't include temporary objects when scrubbing
4519 scrubber.start = info.pgid.pgid.get_hobj_start();
4520 scrubber.state = PG::Scrubber::NEW_CHUNK;
4521
4522 {
4523 bool repair = state_test(PG_STATE_REPAIR);
4524 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4525 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4526 stringstream oss;
4527 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4528 osd->clog->debug(oss);
4529 }
4530
4531 scrubber.seed = -1;
4532
4533 break;
4534
4535 case PG::Scrubber::NEW_CHUNK:
4536 scrubber.primary_scrubmap = ScrubMap();
4537 scrubber.received_maps.clear();
4538
4539 {
4540 /* get the start and end of our scrub chunk
4541 *
4542 * Our scrub chunk has an important restriction we're going to need to
4543 * respect. We can't let head or snapdir be start or end.
4544 * Using a half-open interval means that if end == head|snapdir,
4545 * we'd scrub/lock head and the clone right next to head in different
4546 * chunks which would allow us to miss clones created between
4547 * scrubbing that chunk and scrubbing the chunk including head.
4548 * This isn't true for any of the other clones since clones can
4549 * only be created "just to the left of" head. There is one exception
4550 * to this: promotion of clones which always happens to the left of the
4551 * left-most clone, but promote_object checks the scrubber in that
4552 * case, so it should be ok. Also, it's ok to "miss" clones at the
4553 * left end of the range if we are a tier because they may legitimately
4554 * not exist (see _scrub).
4555 */
4556 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4557 hobject_t start = scrubber.start;
4558 hobject_t candidate_end;
4559 vector<hobject_t> objects;
4560 ret = get_pgbackend()->objects_list_partial(
4561 start,
4562 min,
4563 MAX(min, cct->_conf->osd_scrub_chunk_max),
4564 &objects,
4565 &candidate_end);
4566 assert(ret >= 0);
4567
4568 if (!objects.empty()) {
4569 hobject_t back = objects.back();
4570 while (candidate_end.has_snapset() &&
4571 candidate_end.get_head() == back.get_head()) {
4572 candidate_end = back;
4573 objects.pop_back();
4574 if (objects.empty()) {
4575 assert(0 ==
4576 "Somehow we got more than 2 objects which"
4577 "have the same head but are not clones");
4578 }
4579 back = objects.back();
4580 }
4581 if (candidate_end.has_snapset()) {
4582 assert(candidate_end.get_head() != back.get_head());
4583 candidate_end = candidate_end.get_object_boundary();
4584 }
4585 } else {
4586 assert(candidate_end.is_max());
4587 }
4588
4589 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4590 // we'll be requeued by whatever made us unavailable for scrub
4591 dout(10) << __func__ << ": scrub blocked somewhere in range "
4592 << "[" << scrubber.start << ", " << candidate_end << ")"
4593 << dendl;
4594 done = true;
4595 break;
4596 }
4597 scrubber.end = candidate_end;
4598 }
4599
4600 // walk the log to find the latest update that affects our chunk
4601 scrubber.subset_last_update = eversion_t();
4602 for (auto p = projected_log.log.rbegin();
4603 p != projected_log.log.rend();
4604 ++p) {
4605 if (p->soid >= scrubber.start &&
4606 p->soid < scrubber.end) {
4607 scrubber.subset_last_update = p->version;
4608 break;
4609 }
4610 }
4611 if (scrubber.subset_last_update == eversion_t()) {
4612 for (list<pg_log_entry_t>::const_reverse_iterator p =
4613 pg_log.get_log().log.rbegin();
4614 p != pg_log.get_log().log.rend();
4615 ++p) {
4616 if (p->soid >= scrubber.start &&
4617 p->soid < scrubber.end) {
4618 scrubber.subset_last_update = p->version;
4619 break;
4620 }
4621 }
4622 }
4623
4624 // ask replicas to wait until
4625 // last_update_applied >= scrubber.subset_last_update and then scan
4626 scrubber.waiting_on_whom.insert(pg_whoami);
4627 ++scrubber.waiting_on;
4628
4629 // request maps from replicas
4630 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4631 i != actingbackfill.end();
4632 ++i) {
4633 if (*i == pg_whoami) continue;
4634 _request_scrub_map(*i, scrubber.subset_last_update,
4635 scrubber.start, scrubber.end, scrubber.deep,
4636 scrubber.seed);
4637 scrubber.waiting_on_whom.insert(*i);
4638 ++scrubber.waiting_on;
4639 }
4640
4641 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4642
4643 break;
4644
4645 case PG::Scrubber::WAIT_PUSHES:
4646 if (active_pushes == 0) {
4647 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4648 } else {
4649 dout(15) << "wait for pushes to apply" << dendl;
4650 done = true;
4651 }
4652 break;
4653
4654 case PG::Scrubber::WAIT_LAST_UPDATE:
4655 if (last_update_applied >= scrubber.subset_last_update) {
4656 scrubber.state = PG::Scrubber::BUILD_MAP;
4657 } else {
4658 // will be requeued by op_applied
4659 dout(15) << "wait for writes to flush" << dendl;
4660 done = true;
4661 }
4662 break;
4663
4664 case PG::Scrubber::BUILD_MAP:
4665 assert(last_update_applied >= scrubber.subset_last_update);
4666
4667 // build my own scrub map
4668 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4669 scrubber.start, scrubber.end,
4670 scrubber.deep, scrubber.seed,
4671 handle);
4672 if (ret < 0) {
4673 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4674 scrub_clear_state();
4675 scrub_unreserve_replicas();
4676 return;
4677 }
4678
4679 --scrubber.waiting_on;
4680 scrubber.waiting_on_whom.erase(pg_whoami);
4681
4682 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4683 break;
4684
4685 case PG::Scrubber::WAIT_REPLICAS:
4686 if (scrubber.waiting_on > 0) {
4687 // will be requeued by sub_op_scrub_map
4688 dout(10) << "wait for replicas to build scrub map" << dendl;
4689 done = true;
4690 } else {
4691 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4692 }
4693 break;
4694
4695 case PG::Scrubber::COMPARE_MAPS:
4696 assert(last_update_applied >= scrubber.subset_last_update);
4697 assert(scrubber.waiting_on == 0);
4698
4699 scrub_compare_maps();
4700 scrubber.start = scrubber.end;
4701 scrubber.run_callbacks();
4702
4703 // requeue the writes from the chunk that just finished
4704 requeue_ops(waiting_for_scrub);
4705
4706 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4707
4708 // fall-thru
4709
4710 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4711 if (scrubber.num_digest_updates_pending) {
4712 dout(10) << __func__ << " waiting on "
4713 << scrubber.num_digest_updates_pending
4714 << " digest updates" << dendl;
4715 done = true;
4716 break;
4717 }
4718
4719 if (!(scrubber.end.is_max())) {
4720 scrubber.state = PG::Scrubber::NEW_CHUNK;
4721 requeue_scrub();
4722 done = true;
4723 } else {
4724 scrubber.state = PG::Scrubber::FINISH;
4725 }
4726
4727 break;
4728
4729 case PG::Scrubber::FINISH:
4730 scrub_finish();
4731 scrubber.state = PG::Scrubber::INACTIVE;
4732 done = true;
4733
4734 if (!snap_trimq.empty()) {
4735 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4736 snap_trimmer_scrub_complete();
4737 }
4738
4739 break;
4740
4741 default:
4742 ceph_abort();
4743 }
4744 }
4745 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4746 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4747}
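// Illustrative sketch, not from the original PG.cc: chunky_scrub() is
// driven by the mostly linear state machine diagrammed above; in the real
// code each wait state sets `done` and resumes when an external event
// requeues the scrub. A stripped-down rendering of the state progression,
// with the waits elided (hypothetical names, compiled separately):
#if 0
enum class ScrubState { Inactive, NewChunk, WaitPushes, WaitLastUpdate,
                        BuildMap, WaitReplicas, CompareMaps,
                        WaitDigestUpdates, Finish };

// drive one scrub over `chunks` chunks, looping back to NewChunk per chunk
void drive(ScrubState& s, int chunks) {
  s = ScrubState::NewChunk;
  while (s != ScrubState::Inactive) {
    switch (s) {
    case ScrubState::NewChunk:       s = ScrubState::WaitPushes;        break;
    case ScrubState::WaitPushes:     s = ScrubState::WaitLastUpdate;    break;
    case ScrubState::WaitLastUpdate: s = ScrubState::BuildMap;          break;
    case ScrubState::BuildMap:       s = ScrubState::WaitReplicas;      break;
    case ScrubState::WaitReplicas:   s = ScrubState::CompareMaps;       break;
    case ScrubState::CompareMaps:    s = ScrubState::WaitDigestUpdates; break;
    case ScrubState::WaitDigestUpdates:
      s = (--chunks > 0) ? ScrubState::NewChunk : ScrubState::Finish;
      break;
    case ScrubState::Finish:         s = ScrubState::Inactive;          break;
    case ScrubState::Inactive:                                          break;
    }
  }
}
#endif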
4748
4749void PG::scrub_clear_state()
4750{
4751 assert(is_locked());
4752 state_clear(PG_STATE_SCRUBBING);
4753 state_clear(PG_STATE_REPAIR);
4754 state_clear(PG_STATE_DEEP_SCRUB);
4755 publish_stats_to_osd();
4756
4757 // active -> nothing.
4758 if (scrubber.active)
4759 osd->dec_scrubs_active();
4760
4761 requeue_ops(waiting_for_scrub);
4762
4763 scrubber.reset();
4764
4765 // type-specific state clear
4766 _scrub_clear_state();
4767}
4768
4769void PG::scrub_compare_maps()
4770{
4771 dout(10) << __func__ << " has maps, analyzing" << dendl;
4772
4773 // construct authoritative scrub map for type specific scrubbing
4774 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4775 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4776
4777 if (acting.size() > 1) {
4778 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4779
4780 stringstream ss;
4781
4782 // Map from object with errors to good peer
4783 map<hobject_t, list<pg_shard_t>> authoritative;
4784 map<pg_shard_t, ScrubMap *> maps;
4785
4786 dout(2) << __func__ << " osd." << acting[0] << " has "
4787 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4788 maps[pg_whoami] = &scrubber.primary_scrubmap;
4789
4790 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4791 i != actingbackfill.end();
4792 ++i) {
4793 if (*i == pg_whoami) continue;
4794 dout(2) << __func__ << " replica " << *i << " has "
4795 << scrubber.received_maps[*i].objects.size()
4796 << " items" << dendl;
4797 maps[*i] = &scrubber.received_maps[*i];
4798 }
4799
4800 get_pgbackend()->be_compare_scrubmaps(
4801 maps,
4802 state_test(PG_STATE_REPAIR),
4803 scrubber.missing,
4804 scrubber.inconsistent,
4805 authoritative,
4806 missing_digest,
4807 scrubber.shallow_errors,
4808 scrubber.deep_errors,
4809 scrubber.store.get(),
4810 info.pgid, acting,
4811 ss);
4812 dout(2) << ss.str() << dendl;
4813
4814 if (!ss.str().empty()) {
4815 osd->clog->error(ss);
4816 }
4817
4818 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4819 i != authoritative.end();
4820 ++i) {
4821 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4822 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4823 j != i->second.end();
4824 ++j) {
4825 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4826 }
4827 scrubber.authoritative.insert(
4828 make_pair(
4829 i->first,
4830 good_peers));
4831 }
4832
4833 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4834 i != authoritative.end();
4835 ++i) {
4836 scrubber.cleaned_meta_map.objects.erase(i->first);
4837 scrubber.cleaned_meta_map.objects.insert(
4838 *(maps[i->second.back()]->objects.find(i->first))
4839 );
4840 }
4841 }
4842
4843 ScrubMap for_meta_scrub;
4844 if (scrubber.end.is_max() ||
4845 scrubber.cleaned_meta_map.objects.empty()) {
4846 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4847 } else {
4848 auto iter = scrubber.cleaned_meta_map.objects.end();
4849 --iter; // not empty, see the if clause above
4850 auto begin = scrubber.cleaned_meta_map.objects.begin();
4851 while (iter != begin) {
4852 auto next = iter--;
4853 if (next->first.get_head() != iter->first.get_head()) {
4854 ++iter;
4855 break;
4856 }
4857 }
4858 for_meta_scrub.objects.insert(begin, iter);
4859 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4860 }
4861
4862 // ok, do the pg-type specific scrubbing
4863 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4864 if (!scrubber.store->empty()) {
4865 if (state_test(PG_STATE_REPAIR)) {
4866 dout(10) << __func__ << ": discarding scrub results" << dendl;
4867 scrubber.store->flush(nullptr);
4868 } else {
4869 dout(10) << __func__ << ": updating scrub object" << dendl;
4870 ObjectStore::Transaction t;
4871 scrubber.store->flush(&t);
4872 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4873 }
4874 }
4875}
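// Illustrative sketch, not from the original PG.cc: the tail-trimming in
// scrub_compare_maps() keeps back any run of objects at the end of
// cleaned_meta_map that share a head with the last entry, so a head and its
// clones are never metadata-scrubbed across a chunk boundary. The same
// split over a sorted vector (hypothetical key type, compiled separately):
#if 0
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// each entry is (head, clone id); input is sorted so clones group by head
using Obj = std::pair<std::string, int>;

// returns the index where the trailing same-head run begins
std::size_t split_point(const std::vector<Obj>& objs) {
  if (objs.empty())
    return 0;
  std::size_t i = objs.size() - 1;
  while (i > 0 && objs[i - 1].first == objs.back().first)
    --i;
  return i;  // [0, i) is scrubbed now; [i, end) waits for the next chunk
}
#endif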
4876
4877bool PG::scrub_process_inconsistent()
4878{
4879 dout(10) << __func__ << ": checking authoritative" << dendl;
4880 bool repair = state_test(PG_STATE_REPAIR);
4881 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4882 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4883
4884 // scrubber.authoritative only stores objects which are missing or inconsistent.
4885 if (!scrubber.authoritative.empty()) {
4886 stringstream ss;
4887 ss << info.pgid << " " << mode << " "
4888 << scrubber.missing.size() << " missing, "
4889 << scrubber.inconsistent.size() << " inconsistent objects";
4890 dout(2) << ss.str() << dendl;
4891 osd->clog->error(ss);
4892 if (repair) {
4893 state_clear(PG_STATE_CLEAN);
4894 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4895 scrubber.authoritative.begin();
4896 i != scrubber.authoritative.end();
4897 ++i) {
4898 set<pg_shard_t>::iterator j;
4899
4900 auto missing_entry = scrubber.missing.find(i->first);
4901 if (missing_entry != scrubber.missing.end()) {
4902 for (j = missing_entry->second.begin();
4903 j != missing_entry->second.end();
4904 ++j) {
4905 repair_object(
4906 i->first,
4907 &(i->second),
4908 *j);
4909 ++scrubber.fixed;
4910 }
4911 }
4912 if (scrubber.inconsistent.count(i->first)) {
4913 for (j = scrubber.inconsistent[i->first].begin();
4914 j != scrubber.inconsistent[i->first].end();
4915 ++j) {
4916 repair_object(i->first,
4917 &(i->second),
4918 *j);
4919 ++scrubber.fixed;
4920 }
4921 }
4922 }
4923 }
4924 }
4925 return (!scrubber.authoritative.empty() && repair);
4926}
4927
4928bool PG::ops_blocked_by_scrub() const {
4929 return (waiting_for_scrub.size() != 0);
4930}
4931
4932// the part that actually finalizes a scrub
4933void PG::scrub_finish()
4934{
4935 bool repair = state_test(PG_STATE_REPAIR);
4936 // if the repair request comes from auto-repair and there are a large
4937 // number of errors, we would like to cancel the auto-repair
4938 if (repair && scrubber.auto_repair
4939 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
4940 state_clear(PG_STATE_REPAIR);
4941 repair = false;
4942 }
4943 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4944 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4945
4946 // type-specific finish (can tally more errors)
4947 _scrub_finish();
4948
4949 bool has_error = scrub_process_inconsistent();
4950
4951 {
4952 stringstream oss;
4953 oss << info.pgid.pgid << " " << mode << " ";
4954 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
4955 if (total_errors)
4956 oss << total_errors << " errors";
4957 else
4958 oss << "ok";
4959 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
4960 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
4961 << " remaining deep scrub error details lost)";
4962 if (repair)
4963 oss << ", " << scrubber.fixed << " fixed";
4964 if (total_errors)
4965 osd->clog->error(oss);
4966 else
4967 osd->clog->debug(oss);
4968 }
4969
4970 // finish up
4971 unreg_next_scrub();
4972 utime_t now = ceph_clock_now();
4973 info.history.last_scrub = info.last_update;
4974 info.history.last_scrub_stamp = now;
4975 if (scrubber.deep) {
4976 info.history.last_deep_scrub = info.last_update;
4977 info.history.last_deep_scrub_stamp = now;
4978 }
4979 // Since we don't know which errors were fixed, we can only clear them
4980 // when every one has been fixed.
4981 if (repair) {
4982 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
4983 assert(deep_scrub);
4984 scrubber.shallow_errors = scrubber.deep_errors = 0;
4985 } else {
4986 // Deep scrub in order to get corrected error counts
4987 scrub_after_recovery = true;
4988 }
4989 }
4990 if (deep_scrub) {
4991 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
4992 info.history.last_clean_scrub_stamp = now;
4993 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4994 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
4995 } else {
4996 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4997 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
4998 // because of deep-scrub errors
4999 if (scrubber.shallow_errors == 0)
5000 info.history.last_clean_scrub_stamp = now;
5001 }
5002 info.stats.stats.sum.num_scrub_errors =
5003 info.stats.stats.sum.num_shallow_scrub_errors +
5004 info.stats.stats.sum.num_deep_scrub_errors;
5005 reg_next_scrub();
5006
5007 {
5008 ObjectStore::Transaction t;
5009 dirty_info = true;
5010 write_if_dirty(t);
5011 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5012 assert(tr == 0);
5013 }
5014
5015
5016 if (has_error) {
5017 queue_peering_event(
5018 CephPeeringEvtRef(
5019 std::make_shared<CephPeeringEvt>(
5020 get_osdmap()->get_epoch(),
5021 get_osdmap()->get_epoch(),
5022 DoRecovery())));
5023 }
5024
5025 scrub_clear_state();
5026 scrub_unreserve_replicas();
5027
5028 if (is_active() && is_primary()) {
5029 share_pg_info();
5030 }
5031}
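// Illustrative sketch, not from the original PG.cc: the error accounting at
// the end of a repair, in isolation: counters are cleared only when every
// known error was fixed; otherwise a follow-up deep scrub is scheduled to
// recount (hypothetical names, compiled separately):
#if 0
struct ScrubCounts {
  int shallow = 0, deep = 0, fixed = 0;
  bool rescrub_after_recovery = false;
};

void finish_repair(ScrubCounts& c) {
  if (c.fixed == c.shallow + c.deep) {
    c.shallow = c.deep = 0;            // all known errors repaired
  } else {
    c.rescrub_after_recovery = true;   // recount via another deep scrub
  }
}
#endif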
5032
5033void PG::share_pg_info()
5034{
5035 dout(10) << "share_pg_info" << dendl;
5036
5037 // share new pg_info_t with replicas
5038 assert(!actingbackfill.empty());
5039 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5040 i != actingbackfill.end();
5041 ++i) {
5042 if (*i == pg_whoami) continue;
5043 pg_shard_t peer = *i;
5044 if (peer_info.count(peer)) {
5045 peer_info[peer].last_epoch_started = info.last_epoch_started;
5046 peer_info[peer].last_interval_started = info.last_interval_started;
5047 peer_info[peer].history.merge(info.history);
5048 }
5049 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5050 m->pg_list.push_back(
5051 make_pair(
5052 pg_notify_t(
5053 peer.shard, pg_whoami.shard,
5054 get_osdmap()->get_epoch(),
5055 get_osdmap()->get_epoch(),
5056 info),
5057 PastIntervals()));
5058 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5059 }
5060}
5061
5062bool PG::append_log_entries_update_missing(
5063 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5064 ObjectStore::Transaction &t)
5065{
5066 assert(!entries.empty());
5067 assert(entries.begin()->version > info.last_update);
5068
5069 PGLogEntryHandler rollbacker{this, &t};
5070 bool invalidate_stats =
5071 pg_log.append_new_log_entries(info.last_backfill,
5072 info.last_backfill_bitwise,
5073 entries,
5074 &rollbacker);
5075 info.last_update = pg_log.get_head();
5076
5077 if (pg_log.get_missing().num_missing() == 0) {
5078 // advance last_complete since nothing else is missing!
5079 info.last_complete = info.last_update;
5080 }
5081
5082 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5083 dirty_info = true;
5084 write_if_dirty(t);
5085 return invalidate_stats;
5086}
5087
5088
5089void PG::merge_new_log_entries(
5090 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5091 ObjectStore::Transaction &t)
5092{
5093 dout(10) << __func__ << " " << entries << dendl;
5094 assert(is_primary());
5095
5096 bool rebuild_missing = append_log_entries_update_missing(entries, t);
5097 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5098 i != actingbackfill.end();
5099 ++i) {
5100 pg_shard_t peer(*i);
5101 if (peer == pg_whoami) continue;
5102 assert(peer_missing.count(peer));
5103 assert(peer_info.count(peer));
5104 pg_missing_t& pmissing(peer_missing[peer]);
5105 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5106 pg_info_t& pinfo(peer_info[peer]);
5107 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5108 pinfo.last_backfill,
5109 info.last_backfill_bitwise,
5110 entries,
5111 true,
5112 NULL,
5113 pmissing,
5114 NULL,
5115 this);
5116 pinfo.last_update = info.last_update;
5117 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5118 rebuild_missing = rebuild_missing || invalidate_stats;
5119 }
5120
5121 if (!rebuild_missing) {
5122 return;
5123 }
5124
5125 for (auto &&i: entries) {
5126 missing_loc.rebuild(
5127 i.soid,
5128 pg_whoami,
5129 actingbackfill,
5130 info,
5131 pg_log.get_missing(),
5132 peer_missing,
5133 peer_info);
5134 }
5135}
5136
5137void PG::update_history(const pg_history_t& new_history)
5138{
5139 unreg_next_scrub();
5140 if (info.history.merge(new_history)) {
5141 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5142 dirty_info = true;
5143 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5144 dout(20) << __func__ << " clearing past_intervals" << dendl;
5145 past_intervals.clear();
5146 dirty_big_info = true;
5147 }
5148 }
5149 reg_next_scrub();
5150}
5151
5152void PG::fulfill_info(
5153 pg_shard_t from, const pg_query_t &query,
5154 pair<pg_shard_t, pg_info_t> &notify_info)
5155{
5156 assert(from == primary);
5157 assert(query.type == pg_query_t::INFO);
5158
5159 // info
5160 dout(10) << "sending info" << dendl;
5161 notify_info = make_pair(from, info);
5162}
5163
5164void PG::fulfill_log(
5165 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5166{
5167 dout(10) << "log request from " << from << dendl;
5168 assert(from == primary);
5169 assert(query.type != pg_query_t::INFO);
5170 ConnectionRef con = osd->get_con_osd_cluster(
5171 from.osd, get_osdmap()->get_epoch());
5172 if (!con) return;
5173
5174 MOSDPGLog *mlog = new MOSDPGLog(
5175 from.shard, pg_whoami.shard,
5176 get_osdmap()->get_epoch(),
5177 info, query_epoch);
5178 mlog->missing = pg_log.get_missing();
5179
5180 // primary -> other, when building master log
5181 if (query.type == pg_query_t::LOG) {
5182 dout(10) << " sending info+missing+log since " << query.since
5183 << dendl;
5184 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5185 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5186 << " when my log.tail is " << pg_log.get_tail()
5187 << ", sending full log instead";
5188 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5189 } else
5190 mlog->log.copy_after(pg_log.get_log(), query.since);
5191 }
5192 else if (query.type == pg_query_t::FULLLOG) {
5193 dout(10) << " sending info+missing+full log" << dendl;
5194 mlog->log = pg_log.get_log();
5195 }
5196
5197 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5198
5199 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5200 osd->send_message_osd_cluster(mlog, con.get());
5201}
5202
5203void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5204{
5205 bool changed = false;
5206 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5207 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5208 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5209 changed = true;
5210 }
5211 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5212 assert(pi);
5213 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5214 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5215 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5216 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5217 changed = true;
5218 }
5219 }
5220 if (changed) {
5221 info.history.last_epoch_marked_full = osdmap->get_epoch();
5222 dirty_info = true;
5223 }
5224}
5225
5226bool PG::should_restart_peering(
5227 int newupprimary,
5228 int newactingprimary,
5229 const vector<int>& newup,
5230 const vector<int>& newacting,
5231 OSDMapRef lastmap,
5232 OSDMapRef osdmap)
5233{
5234 if (PastIntervals::is_new_interval(
5235 primary.osd,
5236 newactingprimary,
5237 acting,
5238 newacting,
5239 up_primary.osd,
5240 newupprimary,
5241 up,
5242 newup,
5243 osdmap,
5244 lastmap,
5245 info.pgid.pgid)) {
5246 dout(20) << "new interval newup " << newup
5247 << " newacting " << newacting << dendl;
5248 return true;
5249 } else {
5250 return false;
5251 }
5252}
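// Illustrative sketch, not from the original PG.cc: should_restart_peering()
// reduces to "did anything that defines the interval change?". A toy
// version of that predicate, ignoring the osdmap-level checks (pg_num
// splits etc.) that the real PastIntervals::is_new_interval also performs
// (hypothetical fields, compiled separately):
#if 0
#include <vector>

struct Interval {
  int up_primary, acting_primary;
  std::vector<int> up, acting;
};

bool is_new_interval(const Interval& a, const Interval& b) {
  // any membership or primary change restarts peering
  return a.up_primary != b.up_primary ||
         a.acting_primary != b.acting_primary ||
         a.up != b.up || a.acting != b.acting;
}
#endif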
5253
5254bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5255{
5256 if (last_peering_reset > reply_epoch ||
5257 last_peering_reset > query_epoch) {
5258 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5259 << " last_peering_reset " << last_peering_reset
5260 << dendl;
5261 return true;
5262 }
5263 return false;
5264}
5265
5266void PG::set_last_peering_reset()
5267{
5268 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5269 if (last_peering_reset != get_osdmap()->get_epoch()) {
5270 last_peering_reset = get_osdmap()->get_epoch();
5271 reset_interval_flush();
5272 }
5273}
5274
5275struct FlushState {
5276 PGRef pg;
5277 epoch_t epoch;
5278 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5279 ~FlushState() {
5280 pg->lock();
5281 if (!pg->pg_has_reset_since(epoch))
5282 pg->queue_flushed(epoch);
5283 pg->unlock();
5284 }
5285};
5286typedef ceph::shared_ptr<FlushState> FlushStateRef;
5287
5288void PG::start_flush(ObjectStore::Transaction *t,
5289 list<Context *> *on_applied,
5290 list<Context *> *on_safe)
5291{
5292 // flush in progress ops
5293 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5294 this, get_osdmap()->get_epoch()));
5295 t->nop();
5296 flushes_in_progress++;
5297 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5298 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5299}
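// Illustrative sketch, not from the original PG.cc: FlushState/start_flush
// use a shared_ptr as a latch: the same trigger is handed to both the
// on_applied and on_safe callback lists, and only when the last reference
// is dropped does ~FlushState queue the "flushed" event. The idiom in
// isolation (hypothetical names, compiled separately):
#if 0
#include <functional>
#include <memory>
#include <vector>

struct Trigger {
  std::function<void()> on_last_ref;
  ~Trigger() { if (on_last_ref) on_last_ref(); }  // fires exactly once
};

void start(std::vector<std::shared_ptr<Trigger>>& applied,
           std::vector<std::shared_ptr<Trigger>>& safe,
           std::function<void()> flushed) {
  auto t = std::make_shared<Trigger>();
  t->on_last_ref = std::move(flushed);
  applied.push_back(t);  // both lists share ownership; `flushed` runs only
  safe.push_back(t);     // after both callbacks have released the trigger
}
#endif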
5300
5301void PG::reset_interval_flush()
5302{
5303 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5304 recovery_state.clear_blocked_outgoing();
5305
5306 Context *c = new QueuePeeringEvt<IntervalFlush>(
5307 this, get_osdmap()->get_epoch(), IntervalFlush());
5308 if (!osr->flush_commit(c)) {
5309 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5310 recovery_state.begin_block_outgoing();
5311 } else {
5312 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5313 delete c;
5314 }
5315}
5316
5317/* Called before initializing peering during advance_map */
5318void PG::start_peering_interval(
5319 const OSDMapRef lastmap,
5320 const vector<int>& newup, int new_up_primary,
5321 const vector<int>& newacting, int new_acting_primary,
5322 ObjectStore::Transaction *t)
5323{
5324 const OSDMapRef osdmap = get_osdmap();
5325
5326 set_last_peering_reset();
5327
5328 vector<int> oldacting, oldup;
5329 int oldrole = get_role();
5330
5331 unreg_next_scrub();
5332
5333 pg_shard_t old_acting_primary = get_primary();
5334 pg_shard_t old_up_primary = up_primary;
5335 bool was_old_primary = is_primary();
5336
5337 acting.swap(oldacting);
5338 up.swap(oldup);
5339 init_primary_up_acting(
5340 newup,
5341 newacting,
5342 new_up_primary,
5343 new_acting_primary);
5344
5345 if (info.stats.up != up ||
5346 info.stats.acting != acting ||
5347 info.stats.up_primary != new_up_primary ||
5348 info.stats.acting_primary != new_acting_primary) {
5349 info.stats.up = up;
5350 info.stats.up_primary = new_up_primary;
5351 info.stats.acting = acting;
5352 info.stats.acting_primary = new_acting_primary;
5353 info.stats.mapping_epoch = osdmap->get_epoch();
5354 }
5355
5356 pg_stats_publish_lock.Lock();
5357 pg_stats_publish_valid = false;
5358 pg_stats_publish_lock.Unlock();
5359
5360 // This will now be remapped during a backfill in cases
5361 // where it would not have been before.
5362 if (up != acting)
5363 state_set(PG_STATE_REMAPPED);
5364 else
5365 state_clear(PG_STATE_REMAPPED);
5366
5367 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5368 if (pool.info.is_replicated() || role == pg_whoami.shard)
5369 set_role(role);
5370 else
5371 set_role(-1);
5372
5373 // did acting, up, primary|acker change?
5374 if (!lastmap) {
5375 dout(10) << " no lastmap" << dendl;
5376 dirty_info = true;
5377 dirty_big_info = true;
5378 info.history.same_interval_since = osdmap->get_epoch();
5379 } else {
5380 std::stringstream debug;
5381 assert(info.history.same_interval_since != 0);
5382 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5383 get_is_recoverable_predicate());
5384 bool new_interval = PastIntervals::check_new_interval(
5385 old_acting_primary.osd,
5386 new_acting_primary,
5387 oldacting, newacting,
5388 old_up_primary.osd,
5389 new_up_primary,
5390 oldup, newup,
5391 info.history.same_interval_since,
5392 info.history.last_epoch_clean,
5393 osdmap,
5394 lastmap,
5395 info.pgid.pgid,
5396 recoverable.get(),
5397 &past_intervals,
5398 &debug);
5399 dout(10) << __func__ << ": check_new_interval output: "
5400 << debug.str() << dendl;
5401 if (new_interval) {
5402 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5403 info.history.last_epoch_clean < osdmap->get_epoch()) {
5404 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5405 // our information is incomplete and useless; if osdmaps were trimmed,
5406 // someone else was clean after everything we know about.
5407 past_intervals.clear();
5408 } else {
5409 dout(10) << " noting past " << past_intervals << dendl;
5410 }
5411 dirty_info = true;
5412 dirty_big_info = true;
5413 info.history.same_interval_since = osdmap->get_epoch();
5414 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5415 osdmap->get_pg_num(info.pgid.pgid.pool()),
5416 nullptr)) {
5417 info.history.last_epoch_split = osdmap->get_epoch();
5418 }
5419 }
5420 }
5421
5422 if (old_up_primary != up_primary ||
5423 oldup != up) {
5424 info.history.same_up_since = osdmap->get_epoch();
5425 }
5426 // this comparison includes primary rank via pg_shard_t
5427 if (old_acting_primary != get_primary()) {
5428 info.history.same_primary_since = osdmap->get_epoch();
5429 }
5430
5431 on_new_interval();
5432
5433 dout(1) << __func__ << " up " << oldup << " -> " << up
5434 << ", acting " << oldacting << " -> " << acting
5435 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5436 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5437 << ", role " << oldrole << " -> " << role
5438 << ", features acting " << acting_features
5439 << " upacting " << upacting_features
5440 << dendl;
5441
5442 // deactivate.
5443 state_clear(PG_STATE_ACTIVE);
5444 state_clear(PG_STATE_PEERED);
5445 state_clear(PG_STATE_DOWN);
5446 state_clear(PG_STATE_RECOVERY_WAIT);
5447 state_clear(PG_STATE_RECOVERY_TOOFULL);
5448 state_clear(PG_STATE_RECOVERING);
5449
5450 peer_purged.clear();
5451 actingbackfill.clear();
5452 scrub_queued = false;
5453
5454 // reset primary state?
5455 if (was_old_primary || is_primary()) {
5456 osd->remove_want_pg_temp(info.pgid.pgid);
5457 }
5458 clear_primary_state();
5459
5460
5461 // pg->on_*
5462 on_change(t);
5463
5464 projected_last_update = eversion_t();
5465
5466 assert(!deleting);
5467
5468 // should we tell the primary we are here?
5469 send_notify = !is_primary();
5470
5471 if (role != oldrole ||
5472 was_old_primary != is_primary()) {
5473 // did primary change?
5474 if (was_old_primary != is_primary()) {
5475 state_clear(PG_STATE_CLEAN);
5476 clear_publish_stats();
5477 }
5478
5479 on_role_change();
5480
5481 // take active waiters
5482 requeue_ops(waiting_for_peered);
5483
5484 } else {
5485 // no role change.
5486 // did primary change?
5487 if (get_primary() != old_acting_primary) {
5488 dout(10) << *this << " " << oldacting << " -> " << acting
5489 << ", acting primary "
5490 << old_acting_primary << " -> " << get_primary()
5491 << dendl;
5492 } else {
5493 // primary is the same.
5494 if (is_primary()) {
5495 // i am (still) primary. but my replica set changed.
5496 state_clear(PG_STATE_CLEAN);
5497
5498 dout(10) << oldacting << " -> " << acting
5499 << ", replicas changed" << dendl;
5500 }
5501 }
5502 }
5503 cancel_recovery();
5504
5505 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5506 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5507 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5508 }
5509}
5510
5511void PG::on_new_interval()
5512{
5513 const OSDMapRef osdmap = get_osdmap();
5514
5515 reg_next_scrub();
5516
5517 // initialize features
5518 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5519 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5520 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5521 if (*p == CRUSH_ITEM_NONE)
5522 continue;
5523 uint64_t f = osdmap->get_xinfo(*p).features;
5524 acting_features &= f;
5525 upacting_features &= f;
5526 }
5527 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5528 if (*p == CRUSH_ITEM_NONE)
5529 continue;
5530 upacting_features &= osdmap->get_xinfo(*p).features;
5531 }
5532
5533 assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
5534
5535 _on_new_interval();
5536}
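// Both feature sets are computed as the bitwise AND across every member,
// so a single old OSD in the set masks the newer feature bits for the
// whole PG. Illustrative values only:
//
//   uint64_t a = 0xf, b = 0xb;   // per-OSD feature masks
//   uint64_t common = a & b;     // 0xb: only features all OSDs support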
5537
5538void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5539{
5540 assert(!is_primary());
5541
5542 update_history(oinfo.history);
5543
5544 if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
5545 // DEBUG: verify that the snaps are empty in snap_mapper
5546 if (cct->_conf->osd_debug_verify_snaps_on_info) {
5547 interval_set<snapid_t> p;
5548 p.union_of(oinfo.purged_snaps, info.purged_snaps);
5549 p.subtract(info.purged_snaps);
5550 if (!p.empty()) {
5551 for (interval_set<snapid_t>::iterator i = p.begin();
5552 i != p.end();
5553 ++i) {
5554 for (snapid_t snap = i.get_start();
5555 snap != i.get_len() + i.get_start();
5556 ++snap) {
5557 vector<hobject_t> hoids;
5558 int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
5559 if (r != 0 && r != -ENOENT) {
5560 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5561 << cpp_strerror(r) << dendl;
5562 ceph_abort();
5563 } else if (r != -ENOENT) {
5564 assert(!hoids.empty());
5565 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5566 << cpp_strerror(r) << " for object "
5567 << hoids[0] << " on snap " << snap
5568 << " which should have been fully trimmed" << dendl;
5569 ceph_abort();
5570 }
5571 }
5572 }
5573 }
5574 }
5575 info.purged_snaps = oinfo.purged_snaps;
5576 dirty_info = true;
5577 dirty_big_info = true;
5578 }
5579}
5580
5581ostream& operator<<(ostream& out, const PG& pg)
5582{
5583 out << "pg[" << pg.info
5584 << " " << pg.up;
5585 if (pg.acting != pg.up)
5586 out << "/" << pg.acting;
5587 out << " r=" << pg.get_role();
5588 out << " lpr=" << pg.get_last_peering_reset();
5589
5590 if (!pg.past_intervals.empty()) {
5591 out << " pi=[" << pg.past_intervals.get_bounds()
5592 << ")/" << pg.past_intervals.size();
5593 }
5594
5595 if (pg.is_peered()) {
5596 if (pg.last_update_ondisk != pg.info.last_update)
5597 out << " luod=" << pg.last_update_ondisk;
5598 if (pg.last_update_applied != pg.info.last_update)
5599 out << " lua=" << pg.last_update_applied;
5600 }
5601
5602 if (pg.recovery_ops_active)
5603 out << " rops=" << pg.recovery_ops_active;
5604
5605 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5606 pg.pg_log.get_head() != pg.info.last_update)
5607 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5608
5609 if (!pg.pg_log.get_log().empty()) {
5610 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5611 out << " (log bound mismatch, actual=["
5612 << pg.pg_log.get_log().log.begin()->version << ","
5613 << pg.pg_log.get_log().log.rbegin()->version << "]";
5614 out << ")";
5615 }
5616 }
5617
5618 if (!pg.backfill_targets.empty())
5619 out << " bft=" << pg.backfill_targets;
5620 out << " crt=" << pg.pg_log.get_can_rollback_to();
5621
5622 if (pg.last_complete_ondisk != pg.info.last_complete)
5623 out << " lcod " << pg.last_complete_ondisk;
5624
5625 if (pg.is_primary()) {
5626 out << " mlcod " << pg.min_last_complete_ondisk;
5627 }
5628
5629 out << " " << pg_state_string(pg.get_state());
5630 if (pg.should_send_notify())
5631 out << " NOTIFY";
5632
5633 if (pg.scrubber.must_repair)
5634 out << " MUST_REPAIR";
5635 if (pg.scrubber.auto_repair)
5636 out << " AUTO_REPAIR";
5637 if (pg.scrubber.must_deep_scrub)
5638 out << " MUST_DEEP_SCRUB";
5639 if (pg.scrubber.must_scrub)
5640 out << " MUST_SCRUB";
5641
5642 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5643 if (pg.pg_log.get_missing().num_missing()) {
5644 out << " m=" << pg.pg_log.get_missing().num_missing();
5645 if (pg.is_primary()) {
5646 uint64_t unfound = pg.get_num_unfound();
5647 if (unfound)
5648 out << " u=" << unfound;
5649 }
5650 }
5651 if (pg.snap_trimq.size())
5652 out << " snaptrimq=" << pg.snap_trimq;
5653
5654 out << "]";
5655
5656
5657 return out;
5658}
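// This renders the one-line PG summary seen in OSD logs; a representative
// (purely illustrative) example:
//
//   pg[3.7 [2,0,1] r=0 lpr=118 crt=120'34 mlcod 120'34 active+clean]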
5659
5660bool PG::can_discard_op(OpRequestRef& op)
5661{
5662 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5663 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5664 dout(20) << " discard " << *m << dendl;
5665 return true;
5666 }
5667
5668 if (m->get_map_epoch() < info.history.same_primary_since) {
5669 dout(7) << " changed after " << m->get_map_epoch()
5670 << ", dropping " << *m << dendl;
5671 return true;
5672 }
5673
5674 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5675 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5676 dout(7) << __func__ << " sent before last_force_op_resend "
5677 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5678 return true;
5679 }
5680 if (m->get_map_epoch() < info.history.last_epoch_split) {
5681 dout(7) << __func__ << " pg split in "
5682 << info.history.last_epoch_split << ", dropping" << dendl;
5683 return true;
5684 }
5685 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5686 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5687 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5688 << pool.info.last_force_op_resend_preluminous
5689 << ", dropping" << *m << dendl;
5690 return true;
5691 }
5692 }
5693
5694 return false;
5695}
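// Summary: an op is dropped whenever the client is known to resend it.
// RESEND_ON_SPLIT clients resend across pool force-resend epochs and PG
// splits; older POOLRESEND clients only resend across the pre-luminous
// force-resend epoch, so only that check applies to them.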
5696
5697template<typename T, int MSGTYPE>
5698bool PG::can_discard_replica_op(OpRequestRef& op)
5699{
5700 const T *m = static_cast<const T *>(op->get_req());
5701 assert(m->get_type() == MSGTYPE);
5702
5703 int from = m->get_source().num();
5704
5705 // if a repop is replied to after a replica goes down in a new osdmap, and
5706 // before the pg advances to this new osdmap, the replies to earlier repops
5707 // may be discarded by that replica OSD, because the primary resets the
5708 // connection to it when handling the new osdmap marking it down, and also
5709 // resets the messenger session when the replica reconnects. to avoid
5710 // out-of-order replies, the messages from that replica should be discarded.
5711 if (osd->get_osdmap()->is_down(from))
5712 return true;
5713 /* Mostly, this overlaps with the old_peering_msg
5714 * condition. An important exception is pushes
5715 * sent by replicas not in the acting set, since
5716 * if such a replica goes down it does not cause
5717 * a new interval. */
5718 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5719 return true;
5720
5721 // same pg?
5722 // if pg changes _at all_, we reset and repeer!
5723 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5724 dout(10) << "can_discard_replica_op pg changed " << info.history
5725 << " after " << m->map_epoch
5726 << ", dropping" << dendl;
5727 return true;
5728 }
5729 return false;
5730}
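// This template is instantiated once per replica message type by
// can_discard_request() below, e.g.:
//
//   can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);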
5731
5732bool PG::can_discard_scan(OpRequestRef op)
5733{
5734 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5735 assert(m->get_type() == MSG_OSD_PG_SCAN);
5736
5737 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5738 dout(10) << " got old scan, ignoring" << dendl;
5739 return true;
5740 }
5741 return false;
5742}
5743
5744bool PG::can_discard_backfill(OpRequestRef op)
5745{
5746 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5747 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5748
5749 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5750 dout(10) << " got old backfill, ignoring" << dendl;
5751 return true;
5752 }
5753
5754 return false;
5755
5756}
5757
5758bool PG::can_discard_request(OpRequestRef& op)
5759{
5760 switch (op->get_req()->get_type()) {
5761 case CEPH_MSG_OSD_OP:
5762 return can_discard_op(op);
5763 case CEPH_MSG_OSD_BACKOFF:
5764 return false; // never discard
5765 case MSG_OSD_SUBOP:
5766 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5767 case MSG_OSD_REPOP:
5768 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5769 case MSG_OSD_PG_PUSH:
5770 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5771 case MSG_OSD_PG_PULL:
5772 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5773 case MSG_OSD_PG_PUSH_REPLY:
5774 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5775 case MSG_OSD_SUBOPREPLY:
5776 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5777 case MSG_OSD_REPOPREPLY:
5778 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5779 case MSG_OSD_PG_RECOVERY_DELETE:
5780 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
5781
5782 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
5783 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
5784
5785 case MSG_OSD_EC_WRITE:
5786 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5787 case MSG_OSD_EC_WRITE_REPLY:
5788 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5789 case MSG_OSD_EC_READ:
5790 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5791 case MSG_OSD_EC_READ_REPLY:
5792 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5793 case MSG_OSD_REP_SCRUB:
5794 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5795 case MSG_OSD_SCRUB_RESERVE:
5796 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5797 case MSG_OSD_REP_SCRUBMAP:
5798 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5799 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5800 return can_discard_replica_op<
5801 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5802 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5803 return can_discard_replica_op<
5804 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5805
5806 case MSG_OSD_PG_SCAN:
5807 return can_discard_scan(op);
5808 case MSG_OSD_PG_BACKFILL:
5809 return can_discard_backfill(op);
5810 case MSG_OSD_PG_BACKFILL_REMOVE:
5811 return can_discard_replica_op<MOSDPGBackfillRemove,
5812 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5813 }
5814 return true;
5815}
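// Note the default: any message type not handled above falls through to
// `return true` and is discarded, so new PG message types must be added
// to this switch to be delivered.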
5816
5817void PG::take_waiters()
5818{
5819 dout(10) << "take_waiters" << dendl;
5820 requeue_map_waiters();
5821 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5822 i != peering_waiters.end();
5823 ++i) osd->queue_for_peering(this);
5824 peering_queue.splice(peering_queue.begin(), peering_waiters,
5825 peering_waiters.begin(), peering_waiters.end());
5826}
5827
5828void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5829{
5830 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5831 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5832 dout(10) << "deferring event " << evt->get_desc() << dendl;
5833 peering_waiters.push_back(evt);
5834 return;
5835 }
5836 if (old_peering_evt(evt))
5837 return;
5838 recovery_state.handle_event(evt, rctx);
5839}
5840
5841void PG::queue_peering_event(CephPeeringEvtRef evt)
5842{
5843 if (old_peering_evt(evt))
5844 return;
5845 peering_queue.push_back(evt);
5846 osd->queue_for_peering(this);
5847}
5848
5849void PG::queue_null(epoch_t msg_epoch,
5850 epoch_t query_epoch)
5851{
5852 dout(10) << "null" << dendl;
5853 queue_peering_event(
5854 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5855 NullEvt())));
5856}
5857
5858void PG::queue_flushed(epoch_t e)
5859{
5860 dout(10) << "flushed" << dendl;
5861 queue_peering_event(
5862 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5863 FlushedEvt())));
5864}
5865
5866void PG::queue_query(epoch_t msg_epoch,
5867 epoch_t query_epoch,
5868 pg_shard_t from, const pg_query_t& q)
5869{
5870 dout(10) << "handle_query " << q << " from replica " << from << dendl;
5871 queue_peering_event(
5872 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5873 MQuery(from, q, query_epoch))));
5874}
5875
5876void PG::handle_advance_map(
5877 OSDMapRef osdmap, OSDMapRef lastmap,
5878 vector<int>& newup, int up_primary,
5879 vector<int>& newacting, int acting_primary,
5880 RecoveryCtx *rctx)
5881{
5882 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5883 assert(lastmap == osdmap_ref);
5884 dout(10) << "handle_advance_map "
5885 << newup << "/" << newacting
5886 << " -- " << up_primary << "/" << acting_primary
5887 << dendl;
5888 update_osdmap_ref(osdmap);
5889 pool.update(osdmap);
5890 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5891 if (cct->_conf->osd_debug_verify_cached_snaps) {
5892 interval_set<snapid_t> actual_removed_snaps;
5893 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5894 assert(pi);
5895 pi->build_removed_snaps(actual_removed_snaps);
5896 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5897 derr << __func__ << ": mismatch between the actual removed snaps "
5898 << actual_removed_snaps
5899 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
5900 << dendl;
5901 }
5902 assert(actual_removed_snaps == pool.cached_removed_snaps);
5903 }
5904 AdvMap evt(
5905 osdmap, lastmap, newup, up_primary,
5906 newacting, acting_primary);
5907 recovery_state.handle_event(evt, rctx);
5908 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5909 on_pool_change();
5910 update_store_with_options();
5911 }
5912}
5913
5914void PG::handle_activate_map(RecoveryCtx *rctx)
5915{
5916 dout(10) << "handle_activate_map" << dendl;
5917 ActMap evt;
5918 recovery_state.handle_event(evt, rctx);
5919 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
5920 cct->_conf->osd_pg_epoch_persisted_max_stale) {
5921 dout(20) << __func__ << ": Dirtying info: last_persisted is "
5922 << last_persisted_osdmap_ref->get_epoch()
5923 << " while current is " << osdmap_ref->get_epoch() << dendl;
5924 dirty_info = true;
5925 } else {
5926 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
5927 << last_persisted_osdmap_ref->get_epoch()
5928 << " while current is " << osdmap_ref->get_epoch() << dendl;
5929 }
5930 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
5931}
5932
5933void PG::handle_loaded(RecoveryCtx *rctx)
5934{
5935 dout(10) << "handle_loaded" << dendl;
5936 Load evt;
5937 recovery_state.handle_event(evt, rctx);
5938}
5939
5940void PG::handle_create(RecoveryCtx *rctx)
5941{
5942 dout(10) << "handle_create" << dendl;
5943 rctx->created_pgs.insert(this);
5944 Initialize evt;
5945 recovery_state.handle_event(evt, rctx);
5946 ActMap evt2;
5947 recovery_state.handle_event(evt2, rctx);
5948}
5949
5950void PG::handle_query_state(Formatter *f)
5951{
5952 dout(10) << "handle_query_state" << dendl;
5953 QueryState q(f);
5954 recovery_state.handle_event(q, 0);
5955}
5956
5957void PG::update_store_with_options()
5958{
5959 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
5960 if (r < 0 && r != -EOPNOTSUPP) {
5961 derr << __func__ << " set_collection_opts returned error: " << r << dendl;
5962 }
5963}
5964
5965void PG::update_store_on_load()
5966{
5967 if (osd->store->get_type() == "filestore") {
5968 // legacy filestore didn't store collection bit width; fix.
5969 int bits = osd->store->collection_bits(coll);
5970 if (bits < 0) {
5971 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
5972 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
5973 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
5974 ObjectStore::Transaction t;
5975 t.collection_set_bits(coll, bits);
5976 osd->store->apply_transaction(osr.get(), std::move(t));
5977 }
5978 }
5979}
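// The collection bit width is the number of pgid hash bits that select
// this PG within its pool (e.g., assuming a power-of-two pg_num of 8,
// three bits). Legacy filestore collections predate this metadata, so it
// is filled in here on load.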
5980
5981/*------------ Recovery State Machine----------------*/
5982#undef dout_prefix
5983#define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5984 << "state<" << get_state_name() << ">: ")
5985
5986/*------Crashed-------*/
5987PG::RecoveryState::Crashed::Crashed(my_context ctx)
5988 : my_base(ctx),
5989 NamedState(context< RecoveryMachine >().pg, "Crashed")
5990{
5991 context< RecoveryMachine >().log_enter(state_name);
5992 assert(0 == "we got a bad state machine event");
5993}
5994
5995
5996/*------Initial-------*/
5997PG::RecoveryState::Initial::Initial(my_context ctx)
5998 : my_base(ctx),
5999 NamedState(context< RecoveryMachine >().pg, "Initial")
6000{
6001 context< RecoveryMachine >().log_enter(state_name);
6002}
6003
6004boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6005{
6006 PG *pg = context< RecoveryMachine >().pg;
6007
6008 // do we tell someone we're here?
6009 pg->send_notify = (!pg->is_primary());
6010 pg->update_store_with_options();
6011
6012 pg->update_store_on_load();
6013
6014 return transit< Reset >();
6015}
6016
6017boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6018{
6019 PG *pg = context< RecoveryMachine >().pg;
6020 pg->proc_replica_info(
6021 notify.from, notify.notify.info, notify.notify.epoch_sent);
6022 pg->set_last_peering_reset();
6023 return transit< Primary >();
6024}
6025
6026boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6027{
6028 PG *pg = context< RecoveryMachine >().pg;
6029 assert(!pg->is_primary());
6030 post_event(i);
6031 return transit< Stray >();
6032}
6033
6034boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6035{
6036 PG *pg = context< RecoveryMachine >().pg;
6037 assert(!pg->is_primary());
6038 post_event(i);
6039 return transit< Stray >();
6040}
6041
6042void PG::RecoveryState::Initial::exit()
6043{
6044 context< RecoveryMachine >().log_exit(state_name, enter_time);
6045 PG *pg = context< RecoveryMachine >().pg;
6046 utime_t dur = ceph_clock_now() - enter_time;
6047 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6048}
6049
6050/*------Started-------*/
6051PG::RecoveryState::Started::Started(my_context ctx)
6052 : my_base(ctx),
6053 NamedState(context< RecoveryMachine >().pg, "Started")
6054{
6055 context< RecoveryMachine >().log_enter(state_name);
6056}
6057
6058boost::statechart::result
6059PG::RecoveryState::Started::react(const IntervalFlush&)
6060{
6061 PG *pg = context< RecoveryMachine >().pg;
6062 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6063 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6064 return discard_event();
6065}
6066
6067
6068boost::statechart::result
6069PG::RecoveryState::Started::react(const FlushedEvt&)
6070{
6071 PG *pg = context< RecoveryMachine >().pg;
6072 pg->on_flushed();
6073 return discard_event();
6074}
6075
6076
6077boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6078{
6079 PG *pg = context< RecoveryMachine >().pg;
6080 ldout(pg->cct, 10) << "Started advmap" << dendl;
6081 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6082 if (pg->should_restart_peering(
6083 advmap.up_primary,
6084 advmap.acting_primary,
6085 advmap.newup,
6086 advmap.newacting,
6087 advmap.lastmap,
6088 advmap.osdmap)) {
6089 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6090 << dendl;
6091 post_event(advmap);
6092 return transit< Reset >();
6093 }
6094 pg->remove_down_peer_info(advmap.osdmap);
6095 return discard_event();
6096}
6097
6098boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6099{
6100 q.f->open_object_section("state");
6101 q.f->dump_string("name", state_name);
6102 q.f->dump_stream("enter_time") << enter_time;
6103 q.f->close_section();
6104 return discard_event();
6105}
6106
6107void PG::RecoveryState::Started::exit()
6108{
6109 context< RecoveryMachine >().log_exit(state_name, enter_time);
6110 PG *pg = context< RecoveryMachine >().pg;
6111 utime_t dur = ceph_clock_now() - enter_time;
6112 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6113}
6114
6115/*--------Reset---------*/
6116PG::RecoveryState::Reset::Reset(my_context ctx)
6117 : my_base(ctx),
6118 NamedState(context< RecoveryMachine >().pg, "Reset")
6119{
6120 context< RecoveryMachine >().log_enter(state_name);
6121 PG *pg = context< RecoveryMachine >().pg;
6122
6123 pg->flushes_in_progress = 0;
6124 pg->set_last_peering_reset();
6125}
6126
6127boost::statechart::result
6128PG::RecoveryState::Reset::react(const FlushedEvt&)
6129{
6130 PG *pg = context< RecoveryMachine >().pg;
6131 pg->on_flushed();
6132 return discard_event();
6133}
6134
6135boost::statechart::result
6136PG::RecoveryState::Reset::react(const IntervalFlush&)
6137{
6138 PG *pg = context< RecoveryMachine >().pg;
6139 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6140 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6141 return discard_event();
6142}
6143
6144boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6145{
6146 PG *pg = context< RecoveryMachine >().pg;
6147 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6148
6149 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6150
6151 if (pg->should_restart_peering(
6152 advmap.up_primary,
6153 advmap.acting_primary,
6154 advmap.newup,
6155 advmap.newacting,
6156 advmap.lastmap,
6157 advmap.osdmap)) {
6158 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6159 << dendl;
6160 pg->start_peering_interval(
6161 advmap.lastmap,
6162 advmap.newup, advmap.up_primary,
6163 advmap.newacting, advmap.acting_primary,
6164 context< RecoveryMachine >().get_cur_transaction());
6165 }
6166 pg->remove_down_peer_info(advmap.osdmap);
6167 pg->check_past_interval_bounds();
6168 return discard_event();
6169}
6170
6171boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6172{
6173 PG *pg = context< RecoveryMachine >().pg;
6174 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6175 context< RecoveryMachine >().send_notify(
6176 pg->get_primary(),
6177 pg_notify_t(
6178 pg->get_primary().shard, pg->pg_whoami.shard,
6179 pg->get_osdmap()->get_epoch(),
6180 pg->get_osdmap()->get_epoch(),
6181 pg->info),
6182 pg->past_intervals);
6183 }
6184
6185 pg->update_heartbeat_peers();
6186 pg->take_waiters();
6187
6188 return transit< Started >();
6189}
6190
6191boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6192{
6193 q.f->open_object_section("state");
6194 q.f->dump_string("name", state_name);
6195 q.f->dump_stream("enter_time") << enter_time;
6196 q.f->close_section();
6197 return discard_event();
6198}
6199
6200void PG::RecoveryState::Reset::exit()
6201{
6202 context< RecoveryMachine >().log_exit(state_name, enter_time);
6203 PG *pg = context< RecoveryMachine >().pg;
6204 utime_t dur = ceph_clock_now() - enter_time;
6205 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6206}
6207
6208/*-------Start---------*/
6209PG::RecoveryState::Start::Start(my_context ctx)
6210 : my_base(ctx),
6211 NamedState(context< RecoveryMachine >().pg, "Start")
6212{
6213 context< RecoveryMachine >().log_enter(state_name);
6214
6215 PG *pg = context< RecoveryMachine >().pg;
6216 if (pg->is_primary()) {
6217 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6218 post_event(MakePrimary());
6219 } else { //is_stray
6220 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6221 post_event(MakeStray());
6222 }
6223}
6224
6225void PG::RecoveryState::Start::exit()
6226{
6227 context< RecoveryMachine >().log_exit(state_name, enter_time);
6228 PG *pg = context< RecoveryMachine >().pg;
6229 utime_t dur = ceph_clock_now() - enter_time;
6230 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6231}
6232
6233/*---------Primary--------*/
6234PG::RecoveryState::Primary::Primary(my_context ctx)
6235 : my_base(ctx),
6236 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6237{
6238 context< RecoveryMachine >().log_enter(state_name);
6239 PG *pg = context< RecoveryMachine >().pg;
6240 assert(pg->want_acting.empty());
6241
6242 // set CREATING bit until we have peered for the first time.
6243 if (pg->info.history.last_epoch_started == 0) {
6244 pg->state_set(PG_STATE_CREATING);
6245 // use the history timestamp, which ultimately comes from the
6246 // monitor in the create case.
6247 utime_t t = pg->info.history.last_scrub_stamp;
6248 pg->info.stats.last_fresh = t;
6249 pg->info.stats.last_active = t;
6250 pg->info.stats.last_change = t;
6251 pg->info.stats.last_peered = t;
6252 pg->info.stats.last_clean = t;
6253 pg->info.stats.last_unstale = t;
6254 pg->info.stats.last_undegraded = t;
6255 pg->info.stats.last_fullsized = t;
6256 pg->info.stats.last_scrub_stamp = t;
6257 pg->info.stats.last_deep_scrub_stamp = t;
6258 pg->info.stats.last_clean_scrub_stamp = t;
6259 }
6260}
6261
6262boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6263{
6264 PG *pg = context< RecoveryMachine >().pg;
6265 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6266 pg->proc_replica_info(
6267 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6268 return discard_event();
6269}
6270
6271boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6272{
6273 PG *pg = context< RecoveryMachine >().pg;
6274 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6275 pg->publish_stats_to_osd();
6276 pg->take_waiters();
6277 return discard_event();
6278}
6279
6280void PG::RecoveryState::Primary::exit()
6281{
6282 context< RecoveryMachine >().log_exit(state_name, enter_time);
6283 PG *pg = context< RecoveryMachine >().pg;
6284 pg->want_acting.clear();
6285 utime_t dur = ceph_clock_now() - enter_time;
6286 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6287 pg->clear_primary_state();
6288 pg->state_clear(PG_STATE_CREATING);
6289}
6290
6291/*---------Peering--------*/
6292PG::RecoveryState::Peering::Peering(my_context ctx)
6293 : my_base(ctx),
6294 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6295 history_les_bound(false)
6296{
6297 context< RecoveryMachine >().log_enter(state_name);
6298
6299 PG *pg = context< RecoveryMachine >().pg;
6300 assert(!pg->is_peered());
6301 assert(!pg->is_peering());
6302 assert(pg->is_primary());
6303 pg->state_set(PG_STATE_PEERING);
6304}
6305
6306boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6307{
6308 PG *pg = context< RecoveryMachine >().pg;
6309 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6310 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6311 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6312 post_event(advmap);
6313 return transit< Reset >();
6314 }
6315
6316 pg->adjust_need_up_thru(advmap.osdmap);
6317
6318 return forward_event();
6319}
6320
6321boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6322{
6323 PG *pg = context< RecoveryMachine >().pg;
6324
6325 q.f->open_object_section("state");
6326 q.f->dump_string("name", state_name);
6327 q.f->dump_stream("enter_time") << enter_time;
6328
6329 q.f->open_array_section("past_intervals");
6330 pg->past_intervals.dump(q.f);
6331 q.f->close_section();
6332
6333 q.f->open_array_section("probing_osds");
6334 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6335 p != prior_set.probe.end();
6336 ++p)
6337 q.f->dump_stream("osd") << *p;
6338 q.f->close_section();
6339
6340 if (prior_set.pg_down)
6341 q.f->dump_string("blocked", "peering is blocked due to down osds");
6342
6343 q.f->open_array_section("down_osds_we_would_probe");
6344 for (set<int>::iterator p = prior_set.down.begin();
6345 p != prior_set.down.end();
6346 ++p)
6347 q.f->dump_int("osd", *p);
6348 q.f->close_section();
6349
6350 q.f->open_array_section("peering_blocked_by");
6351 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6352 p != prior_set.blocked_by.end();
6353 ++p) {
6354 q.f->open_object_section("osd");
6355 q.f->dump_int("osd", p->first);
6356 q.f->dump_int("current_lost_at", p->second);
6357 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6358 q.f->close_section();
6359 }
6360 q.f->close_section();
6361
6362 if (history_les_bound) {
6363 q.f->open_array_section("peering_blocked_by_detail");
6364 q.f->open_object_section("item");
6365 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6366 q.f->close_section();
6367 q.f->close_section();
6368 }
6369
6370 q.f->close_section();
6371 return forward_event();
6372}
6373
6374void PG::RecoveryState::Peering::exit()
6375{
6376 PG *pg = context< RecoveryMachine >().pg;
6377 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6378 context< RecoveryMachine >().log_exit(state_name, enter_time);
6379 pg->state_clear(PG_STATE_PEERING);
6380 pg->clear_probe_targets();
6381
6382 utime_t dur = ceph_clock_now() - enter_time;
6383 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6384}
6385
6386
6387/*------Backfilling-------*/
6388PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6389 : my_base(ctx),
6390 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6391{
6392 context< RecoveryMachine >().log_enter(state_name);
6393 PG *pg = context< RecoveryMachine >().pg;
6394 pg->backfill_reserved = true;
6395 pg->queue_recovery();
6396 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6397 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6398 pg->state_set(PG_STATE_BACKFILL);
6399 pg->publish_stats_to_osd();
6400}
6401
6402boost::statechart::result
6403PG::RecoveryState::Backfilling::react(const CancelBackfill &)
6404{
6405 PG *pg = context< RecoveryMachine >().pg;
6406 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6407 // XXX: Add a new pg state so user can see why backfill isn't proceeding
6408 // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
6409 //pg->state_set(PG_STATE_BACKFILL_STALLED????);
6410
6411 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6412 it != pg->backfill_targets.end();
6413 ++it) {
6414 assert(*it != pg->pg_whoami);
6415 ConnectionRef con = pg->osd->get_con_osd_cluster(
6416 it->osd, pg->get_osdmap()->get_epoch());
6417 if (con) {
6418 pg->osd->send_message_osd_cluster(
6419 new MBackfillReserve(
6420 MBackfillReserve::REJECT,
6421 spg_t(pg->info.pgid.pgid, it->shard),
6422 pg->get_osdmap()->get_epoch()),
6423 con.get());
6424 }
6425 }
6426
6427 pg->waiting_on_backfill.clear();
6428
6429 pg->schedule_backfill_full_retry();
6430 return transit<NotBackfilling>();
6431}
6432
6433boost::statechart::result
6434PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6435{
6436 PG *pg = context< RecoveryMachine >().pg;
6437 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6438 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6439
6440 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6441 it != pg->backfill_targets.end();
6442 ++it) {
6443 assert(*it != pg->pg_whoami);
6444 ConnectionRef con = pg->osd->get_con_osd_cluster(
6445 it->osd, pg->get_osdmap()->get_epoch());
6446 if (con) {
6447 pg->osd->send_message_osd_cluster(
6448 new MBackfillReserve(
6449 MBackfillReserve::REJECT,
6450 spg_t(pg->info.pgid.pgid, it->shard),
6451 pg->get_osdmap()->get_epoch()),
6452 con.get());
6453 }
6454 }
6455
6456 pg->waiting_on_backfill.clear();
6457 pg->finish_recovery_op(hobject_t::get_max());
6458
6459 pg->schedule_backfill_full_retry();
6460 return transit<NotBackfilling>();
6461}
6462
6463void PG::RecoveryState::Backfilling::exit()
6464{
6465 context< RecoveryMachine >().log_exit(state_name, enter_time);
6466 PG *pg = context< RecoveryMachine >().pg;
6467 pg->backfill_reserved = false;
6468 pg->backfill_reserving = false;
6469 pg->state_clear(PG_STATE_BACKFILL);
6470 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6471 utime_t dur = ceph_clock_now() - enter_time;
6472 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6473}
6474
6475/*--WaitRemoteBackfillReserved--*/
6476
6477PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6478 : my_base(ctx),
6479 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6480 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6481{
6482 context< RecoveryMachine >().log_enter(state_name);
6483 PG *pg = context< RecoveryMachine >().pg;
6484 pg->state_set(PG_STATE_BACKFILL_WAIT);
6485 pg->publish_stats_to_osd();
6486 post_event(RemoteBackfillReserved());
6487}
6488
6489boost::statechart::result
6490PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6491{
6492 PG *pg = context< RecoveryMachine >().pg;
6493
6494 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6495 //The primary never backfills itself
6496 assert(*backfill_osd_it != pg->pg_whoami);
6497 ConnectionRef con = pg->osd->get_con_osd_cluster(
6498 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6499 if (con) {
6500 pg->osd->send_message_osd_cluster(
6501 new MBackfillReserve(
6502 MBackfillReserve::REQUEST,
6503 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6504 pg->get_osdmap()->get_epoch(),
6505 pg->get_backfill_priority()),
6506 con.get());
6507 }
6508 ++backfill_osd_it;
6509 } else {
6510 post_event(AllBackfillsReserved());
6511 }
6512 return discard_event();
6513}
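// Remote backfill reservations are acquired strictly one at a time: each
// grant advances backfill_osd_it and requests the next shard, so at most
// one REQUEST is in flight and AllBackfillsReserved fires only once every
// remote shard has granted.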
6514
6515void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6516{
6517 context< RecoveryMachine >().log_exit(state_name, enter_time);
6518 PG *pg = context< RecoveryMachine >().pg;
6519 utime_t dur = ceph_clock_now() - enter_time;
6520 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6521}
6522
6523boost::statechart::result
6524PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6525{
6526 PG *pg = context< RecoveryMachine >().pg;
6527 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6528
6529 // Send REJECT to all previously acquired reservations
6530 set<pg_shard_t>::const_iterator it, begin, end, next;
6531 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6532 end = context< Active >().remote_shards_to_reserve_backfill.end();
6533 assert(begin != end);
6534 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6535 //The primary never backfills itself
6536 assert(*it != pg->pg_whoami);
6537 ConnectionRef con = pg->osd->get_con_osd_cluster(
6538 it->osd, pg->get_osdmap()->get_epoch());
6539 if (con) {
6540 pg->osd->send_message_osd_cluster(
6541 new MBackfillReserve(
6542 MBackfillReserve::REJECT,
6543 spg_t(pg->info.pgid.pgid, it->shard),
6544 pg->get_osdmap()->get_epoch()),
6545 con.get());
6546 }
6547 }
6548
6549 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6550 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6551 pg->publish_stats_to_osd();
6552
6553 pg->schedule_backfill_full_retry();
6554
6555 return transit<NotBackfilling>();
6556}
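// Note the loop bound above: [begin, backfill_osd_it) are the shards
// already asked, the last of which is the rejecting shard itself, so the
// REJECT release is sent to every previously granted shard except it.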
6557
6558/*--WaitLocalBackfillReserved--*/
6559PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6560 : my_base(ctx),
6561 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6562{
6563 context< RecoveryMachine >().log_enter(state_name);
6564 PG *pg = context< RecoveryMachine >().pg;
6565 pg->state_set(PG_STATE_BACKFILL_WAIT);
6566 pg->osd->local_reserver.request_reservation(
6567 pg->info.pgid,
6568 new QueuePeeringEvt<LocalBackfillReserved>(
6569 pg, pg->get_osdmap()->get_epoch(),
6570 LocalBackfillReserved()),
6571 pg->get_backfill_priority());
6572 pg->publish_stats_to_osd();
6573}
6574
6575void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6576{
6577 context< RecoveryMachine >().log_exit(state_name, enter_time);
6578 PG *pg = context< RecoveryMachine >().pg;
6579 utime_t dur = ceph_clock_now() - enter_time;
6580 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6581}
6582
6583/*----NotBackfilling------*/
6584PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6585 : my_base(ctx),
6586 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6587{
6588 context< RecoveryMachine >().log_enter(state_name);
6589 PG *pg = context< RecoveryMachine >().pg;
6590 pg->publish_stats_to_osd();
6591}
6592
6593boost::statechart::result
6594PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6595{
6596 return discard_event();
6597}
6598
6599boost::statechart::result
6600PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6601{
6602 return discard_event();
6603}
6604
6605void PG::RecoveryState::NotBackfilling::exit()
6606{
6607 context< RecoveryMachine >().log_exit(state_name, enter_time);
6608 PG *pg = context< RecoveryMachine >().pg;
6609 utime_t dur = ceph_clock_now() - enter_time;
6610 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6611}
6612
6613/*----NotRecovering------*/
6614PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6615 : my_base(ctx),
6616 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6617{
6618 context< RecoveryMachine >().log_enter(state_name);
6619 PG *pg = context< RecoveryMachine >().pg;
6620 pg->publish_stats_to_osd();
6621}
6622
6623void PG::RecoveryState::NotRecovering::exit()
6624{
6625 context< RecoveryMachine >().log_exit(state_name, enter_time);
6626 PG *pg = context< RecoveryMachine >().pg;
6627 utime_t dur = ceph_clock_now() - enter_time;
6628 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6629}
6630
6631/*---RepNotRecovering----*/
6632PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6633 : my_base(ctx),
6634 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6635{
6636 context< RecoveryMachine >().log_enter(state_name);
6637}
6638
6639void PG::RecoveryState::RepNotRecovering::exit()
6640{
6641 context< RecoveryMachine >().log_exit(state_name, enter_time);
6642 PG *pg = context< RecoveryMachine >().pg;
6643 utime_t dur = ceph_clock_now() - enter_time;
6644 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6645}
6646
6647/*---RepWaitRecoveryReserved--*/
6648PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6649 : my_base(ctx),
6650 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6651{
6652 context< RecoveryMachine >().log_enter(state_name);
6653 PG *pg = context< RecoveryMachine >().pg;
6654
6655 pg->osd->remote_reserver.request_reservation(
6656 pg->info.pgid,
6657 new QueuePeeringEvt<RemoteRecoveryReserved>(
6658 pg, pg->get_osdmap()->get_epoch(),
6659 RemoteRecoveryReserved()),
6660 pg->get_recovery_priority());
6661}
6662
6663boost::statechart::result
6664PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6665{
6666 PG *pg = context< RecoveryMachine >().pg;
6667 pg->osd->send_message_osd_cluster(
6668 pg->primary.osd,
6669 new MRecoveryReserve(
6670 MRecoveryReserve::GRANT,
6671 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6672 pg->get_osdmap()->get_epoch()),
6673 pg->get_osdmap()->get_epoch());
6674 return transit<RepRecovering>();
6675}
6676
6677void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6678{
6679 context< RecoveryMachine >().log_exit(state_name, enter_time);
6680 PG *pg = context< RecoveryMachine >().pg;
6681 utime_t dur = ceph_clock_now() - enter_time;
6682 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6683}
6684
6685/*-RepWaitBackfillReserved*/
6686PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6687 : my_base(ctx),
6688 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6689{
6690 context< RecoveryMachine >().log_enter(state_name);
6691}
6692
6693boost::statechart::result
6694PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6695{
6696 PG *pg = context< RecoveryMachine >().pg;
6697 ostringstream ss;
6698
6699 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6700 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6701 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6702 << dendl;
6703 post_event(RemoteReservationRejected());
6704 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6705 pg->osd->check_backfill_full(ss)) {
6706 ldout(pg->cct, 10) << "backfill reservation rejected: "
6707 << ss.str() << dendl;
6708 post_event(RemoteReservationRejected());
6709 } else {
6710 pg->osd->remote_reserver.request_reservation(
6711 pg->info.pgid,
6712 new QueuePeeringEvt<RemoteBackfillReserved>(
6713 pg, pg->get_osdmap()->get_epoch(),
6714 RemoteBackfillReserved()), evt.priority);
6715 }
6716 return transit<RepWaitBackfillReserved>();
6717}
6718
6719void PG::RecoveryState::RepWaitBackfillReserved::exit()
6720{
6721 context< RecoveryMachine >().log_exit(state_name, enter_time);
6722 PG *pg = context< RecoveryMachine >().pg;
6723 utime_t dur = ceph_clock_now() - enter_time;
6724 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6725}
6726
6727boost::statechart::result
6728PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6729{
6730 PG *pg = context< RecoveryMachine >().pg;
6731
6732 ostringstream ss;
6733 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6734 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6735 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6736 << "failure injection" << dendl;
6737 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6738 post_event(RemoteReservationRejected());
6739 return discard_event();
6740 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6741 pg->osd->check_backfill_full(ss)) {
6742 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6743 << ss.str() << dendl;
6744 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6745 post_event(RemoteReservationRejected());
6746 return discard_event();
6747 } else {
6748 pg->osd->send_message_osd_cluster(
6749 pg->primary.osd,
6750 new MBackfillReserve(
6751 MBackfillReserve::GRANT,
6752 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6753 pg->get_osdmap()->get_epoch()),
6754 pg->get_osdmap()->get_epoch());
6755 return transit<RepRecovering>();
6756 }
6757}
6758
6759boost::statechart::result
6760PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
6761{
6762 PG *pg = context< RecoveryMachine >().pg;
6763 pg->reject_reservation();
6764 return transit<RepNotRecovering>();
6765}
6766
6767/*---RepRecovering-------*/
6768PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6769 : my_base(ctx),
6770 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6771{
6772 context< RecoveryMachine >().log_enter(state_name);
6773}
6774
6775boost::statechart::result
6776PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6777{
6778 PG *pg = context< RecoveryMachine >().pg;
6779 pg->reject_reservation();
6780 return discard_event();
6781}
6782
6783void PG::RecoveryState::RepRecovering::exit()
6784{
6785 context< RecoveryMachine >().log_exit(state_name, enter_time);
6786 PG *pg = context< RecoveryMachine >().pg;
6787 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6788 utime_t dur = ceph_clock_now() - enter_time;
6789 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6790}
6791
6792/*------Activating--------*/
6793PG::RecoveryState::Activating::Activating(my_context ctx)
6794 : my_base(ctx),
6795 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6796{
6797 context< RecoveryMachine >().log_enter(state_name);
6798}
6799
6800void PG::RecoveryState::Activating::exit()
6801{
6802 context< RecoveryMachine >().log_exit(state_name, enter_time);
6803 PG *pg = context< RecoveryMachine >().pg;
6804 utime_t dur = ceph_clock_now() - enter_time;
6805 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6806}
6807
6808PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6809 : my_base(ctx),
6810 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6811{
6812 context< RecoveryMachine >().log_enter(state_name);
6813 PG *pg = context< RecoveryMachine >().pg;
6814
6815 // Make sure all nodes that are part of the recovery aren't full
6816 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6817 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6818 post_event(RecoveryTooFull());
6819 return;
6820 }
6821
6822 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6823 pg->state_set(PG_STATE_RECOVERY_WAIT);
6824 pg->osd->local_reserver.request_reservation(
6825 pg->info.pgid,
6826 new QueuePeeringEvt<LocalRecoveryReserved>(
6827 pg, pg->get_osdmap()->get_epoch(),
6828 LocalRecoveryReserved()),
6829 pg->get_recovery_priority());
6830 pg->publish_stats_to_osd();
6831}
6832
6833boost::statechart::result
6834PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
6835{
6836 PG *pg = context< RecoveryMachine >().pg;
6837 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
6838 pg->schedule_recovery_full_retry();
6839 return transit<NotRecovering>();
6840}
6841
6842void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6843{
6844 context< RecoveryMachine >().log_exit(state_name, enter_time);
6845 PG *pg = context< RecoveryMachine >().pg;
6846 utime_t dur = ceph_clock_now() - enter_time;
6847 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
6848}
6849
6850PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
6851 : my_base(ctx),
6852 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6853 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
6854{
6855 context< RecoveryMachine >().log_enter(state_name);
6856 post_event(RemoteRecoveryReserved());
6857}
6858
6859boost::statechart::result
6860PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
6861 PG *pg = context< RecoveryMachine >().pg;
6862
6863 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
6864 assert(*remote_recovery_reservation_it != pg->pg_whoami);
6865 ConnectionRef con = pg->osd->get_con_osd_cluster(
6866 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
6867 if (con) {
6868 pg->osd->send_message_osd_cluster(
6869 new MRecoveryReserve(
6870 MRecoveryReserve::REQUEST,
6871 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
6872 pg->get_osdmap()->get_epoch()),
6873 con.get());
6874 }
6875 ++remote_recovery_reservation_it;
6876 } else {
6877 post_event(AllRemotesReserved());
6878 }
6879 return discard_event();
6880}
6881
6882void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6883{
6884 context< RecoveryMachine >().log_exit(state_name, enter_time);
6885 PG *pg = context< RecoveryMachine >().pg;
6886 utime_t dur = ceph_clock_now() - enter_time;
6887 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
6888}
6889
6890PG::RecoveryState::Recovering::Recovering(my_context ctx)
6891 : my_base(ctx),
6892 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
6893{
6894 context< RecoveryMachine >().log_enter(state_name);
6895
6896 PG *pg = context< RecoveryMachine >().pg;
6897 pg->state_clear(PG_STATE_RECOVERY_WAIT);
6898 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6899 pg->state_set(PG_STATE_RECOVERING);
6900 pg->publish_stats_to_osd();
6901 pg->queue_recovery();
6902}
6903
6904void PG::RecoveryState::Recovering::release_reservations(bool cancel)
6905{
6906 PG *pg = context< RecoveryMachine >().pg;
6907 assert(cancel || !pg->pg_log.get_missing().have_missing());
6908
6909 // release remote reservations
6910 for (set<pg_shard_t>::const_iterator i =
6911 context< Active >().remote_shards_to_reserve_recovery.begin();
6912 i != context< Active >().remote_shards_to_reserve_recovery.end();
6913 ++i) {
6914 if (*i == pg->pg_whoami) // skip myself
6915 continue;
6916 ConnectionRef con = pg->osd->get_con_osd_cluster(
6917 i->osd, pg->get_osdmap()->get_epoch());
6918 if (con) {
6919 pg->osd->send_message_osd_cluster(
6920 new MRecoveryReserve(
6921 MRecoveryReserve::RELEASE,
6922 spg_t(pg->info.pgid.pgid, i->shard),
6923 pg->get_osdmap()->get_epoch()),
6924 con.get());
6925 }
6926 }
6927}
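// Caller contract, as exercised by the reacts below: on successful
// completion release_reservations() asserts nothing is still missing,
// while CancelRecovery passes cancel=true to skip that assert and release
// the remote reservations even though objects remain missing.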
6928
6929boost::statechart::result
6930PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
6931{
6932 PG *pg = context< RecoveryMachine >().pg;
6933 pg->state_clear(PG_STATE_RECOVERING);
6934 pg->state_clear(PG_STATE_FORCED_RECOVERY);
6935 release_reservations();
6936 return transit<Recovered>();
6937}
6938
6939boost::statechart::result
6940PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
6941{
6942 PG *pg = context< RecoveryMachine >().pg;
6943 pg->state_clear(PG_STATE_RECOVERING);
6944 pg->state_clear(PG_STATE_FORCED_RECOVERY);
6945 release_reservations();
6946 return transit<WaitRemoteBackfillReserved>();
6947}
6948
6949boost::statechart::result
6950PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
6951{
6952 PG *pg = context< RecoveryMachine >().pg;
6953 pg->state_clear(PG_STATE_RECOVERING);
6954 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6955 release_reservations(true);
6956 pg->schedule_recovery_full_retry();
6957 return transit<NotRecovering>();
6958}
6959
6960void PG::RecoveryState::Recovering::exit()
6961{
6962 context< RecoveryMachine >().log_exit(state_name, enter_time);
6963 PG *pg = context< RecoveryMachine >().pg;
6964 utime_t dur = ceph_clock_now() - enter_time;
6965 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
6966}
6967
6968PG::RecoveryState::Recovered::Recovered(my_context ctx)
6969 : my_base(ctx),
6970 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
6971{
6972 pg_shard_t auth_log_shard;
6973
6974 context< RecoveryMachine >().log_enter(state_name);
6975
6976 PG *pg = context< RecoveryMachine >().pg;
6977 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6978
6979 assert(!pg->needs_recovery());
6980
6981 // if we finished backfill, all acting are active; recheck if
6982 // DEGRADED | UNDERSIZED is appropriate.
6983 assert(!pg->actingbackfill.empty());
6984 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
6985 pg->actingbackfill.size()) {
6986 pg->state_clear(PG_STATE_DEGRADED);
6987 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6988 pg->publish_stats_to_osd();
6989 }
6990
6991 // trim pglog on recovered
6992 pg->trim_log();
6993
6994 // adjust acting set? (e.g. because backfill completed...)
6995 bool history_les_bound = false;
6996 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
6997 true, &history_les_bound))
6998 assert(pg->want_acting.size());
6999
7000 if (context< Active >().all_replicas_activated)
7001 post_event(GoClean());
7002}
7003
7004void PG::RecoveryState::Recovered::exit()
7005{
7006 context< RecoveryMachine >().log_exit(state_name, enter_time);
7007 PG *pg = context< RecoveryMachine >().pg;
7008 utime_t dur = ceph_clock_now() - enter_time;
7009 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7010}
7011
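/*---------Clean---------*/
/* The steady state: last_complete must have caught up with last_update,
 * finish_recovery() flushes the on-safe callbacks, and mark_clean()
 * persists the CLEAN flag.  Ops parked on
 * waiting_for_clean_to_primary_repair are requeued. */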
PG::RecoveryState::Clean::Clean(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;

  if (pg->info.last_complete != pg->info.last_update) {
    ceph_abort();
  }
  pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());

  if (pg->is_active()) {
    pg->mark_clean();
  }

  pg->share_pg_info();
  pg->publish_stats_to_osd();
  pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
}

void PG::RecoveryState::Clean::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->state_clear(PG_STATE_CLEAN);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
}

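/* Collapse a collection of pg_shard_t down to at most one shard per OSD,
 * skipping `skip` (the caller passes pg_whoami).  Active uses this so a
 * recovery or backfill reservation is requested only once per remote OSD,
 * even when an EC pool places several shards of this PG on the same OSD.
 * Note that only the exact shard `skip` is excluded; a different shard
 * living on the same OSD would still be returned. */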
template <typename T>
set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
{
  set<int> osds_found;
  set<pg_shard_t> out;
  for (typename T::const_iterator i = in.begin();
       i != in.end();
       ++i) {
    if (*i != skip && !osds_found.count(i->osd)) {
      osds_found.insert(i->osd);
      out.insert(*i);
    }
  }
  return out;
}

/*---------Active---------*/
PG::RecoveryState::Active::Active(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
    remote_shards_to_reserve_recovery(
      unique_osd_shard_set(
        context< RecoveryMachine >().pg->pg_whoami,
        context< RecoveryMachine >().pg->actingbackfill)),
    remote_shards_to_reserve_backfill(
      unique_osd_shard_set(
        context< RecoveryMachine >().pg->pg_whoami,
        context< RecoveryMachine >().pg->backfill_targets)),
    all_replicas_activated(false)
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;

  assert(!pg->backfill_reserving);
  assert(!pg->backfill_reserved);
  assert(pg->is_primary());
  ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
  pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
               pg->get_osdmap()->get_epoch(),
               *context< RecoveryMachine >().get_on_safe_context_list(),
               *context< RecoveryMachine >().get_query_map(),
               context< RecoveryMachine >().get_info_map(),
               context< RecoveryMachine >().get_recovery_ctx());

  // everyone has to commit/ack before we are truly active
  pg->blocked_by.clear();
  for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
       p != pg->actingbackfill.end();
       ++p) {
    if (p->shard != pg->pg_whoami.shard) {
      pg->blocked_by.insert(p->shard);
    }
  }
  pg->publish_stats_to_osd();
  ldout(pg->cct, 10) << "Activate Finished" << dendl;
}

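/* On each new osdmap: fold newly removed snaps into snap_trimq, recompute
 * UNDERSIZED/DEGRADED when the pool's replica count changed, and force a
 * stats report if none has been published for
 * osd_pg_stat_report_interval_max epochs. */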
boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "Active advmap" << dendl;
  if (!pg->pool.newly_removed_snaps.empty()) {
    pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
    ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
    pg->dirty_info = true;
    pg->dirty_big_info = true;
  }

  for (size_t i = 0; i < pg->want_acting.size(); i++) {
    int osd = pg->want_acting[i];
    if (!advmap.osdmap->is_up(osd)) {
      pg_shard_t osd_with_shard(osd, shard_id_t(i));
      assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
    }
  }

  bool need_publish = false;
  /* Check for changes in pool size (if the acting set changed as a result,
   * this does not matter) */
  if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
      pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
    if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
      pg->state_clear(PG_STATE_UNDERSIZED);
      if (pg->needs_recovery()) {
        pg->state_set(PG_STATE_DEGRADED);
      } else {
        pg->state_clear(PG_STATE_DEGRADED);
      }
    } else {
      pg->state_set(PG_STATE_UNDERSIZED);
      pg->state_set(PG_STATE_DEGRADED);
    }
    need_publish = true; // degraded may have changed
  }

  // if we haven't reported our PG stats in a long time, do so now.
  if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
    ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
                       << " epochs" << dendl;
    need_publish = true;
  }

  if (need_publish)
    pg->publish_stats_to_osd();

  return forward_event();
}

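/* Map-driven housekeeping: re-query peers for unfound objects, kick the
 * snap trimmer, and requeue recovery unless the nobackfill flag (or
 * norebalance on a non-degraded PG) forbids it. */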
boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
  assert(pg->is_primary());

  if (pg->have_unfound()) {
    // object may have become unfound
    pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
  }

  if (pg->cct->_conf->osd_check_for_log_corruption)
    pg->check_log_for_corruption(pg->osd->store);

  uint64_t unfound = pg->missing_loc.num_unfound();
  if (unfound > 0 &&
      pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
    if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
      pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
                             << " objects unfound and apparently lost, would automatically "
                             << "mark these objects lost but this feature is not yet implemented "
                             << "(osd_auto_mark_unfound_lost)";
    } else
      pg->osd->clog->error() << pg->info.pgid.pgid << " has "
                             << unfound << " objects unfound and apparently lost";
  }

  if (pg->is_active()) {
    ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
    pg->kick_snap_trim();
  }

  if (pg->is_peered() &&
      !pg->is_clean() &&
      !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
      (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
    pg->queue_recovery();
  }
  return forward_event();
}

boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  assert(pg->is_primary());
  if (pg->peer_info.count(notevt.from)) {
    ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
                       << ", already have info from that osd, ignoring"
                       << dendl;
  } else if (pg->peer_purged.count(notevt.from)) {
    ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
                       << ", already purged that peer, ignoring"
                       << dendl;
  } else {
    ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
                       << ", calling proc_replica_info and discover_all_missing"
                       << dendl;
    pg->proc_replica_info(
      notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
    if (pg->have_unfound()) {
      pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
    }
  }
  return discard_event();
}

boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  assert(pg->is_primary());

  assert(!pg->actingbackfill.empty());
  // don't update history (yet) if we are active and primary; the replica
  // may be telling us they have activated (and committed) but we can't
  // share that until _everyone_ does the same.
  if (pg->is_actingbackfill(infoevt.from)) {
    ldout(pg->cct, 10) << " peer osd." << infoevt.from
                       << " activated and committed" << dendl;
    pg->peer_activated.insert(infoevt.from);
    pg->blocked_by.erase(infoevt.from.shard);
    pg->publish_stats_to_osd();
    if (pg->peer_activated.size() == pg->actingbackfill.size()) {
      pg->all_activated_and_committed();
    }
  }
  return discard_event();
}

boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "searching osd." << logevt.from
                     << " log for unfound items" << dendl;
  pg->proc_replica_log(
    logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
  bool got_missing = pg->search_for_missing(
    pg->peer_info[logevt.from],
    pg->peer_missing[logevt.from],
    logevt.from,
    context< RecoveryMachine >().get_recovery_ctx());
  if (pg->is_peered() &&
      got_missing)
    pg->queue_recovery();
  return discard_event();
}

boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
{
  PG *pg = context< RecoveryMachine >().pg;

  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  {
    q.f->open_array_section("might_have_unfound");
    for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
         p != pg->might_have_unfound.end();
         ++p) {
      q.f->open_object_section("osd");
      q.f->dump_stream("osd") << *p;
      if (pg->peer_missing.count(*p)) {
        q.f->dump_string("status", "already probed");
      } else if (pg->peer_missing_requested.count(*p)) {
        q.f->dump_string("status", "querying");
      } else if (!pg->get_osdmap()->is_up(p->osd)) {
        q.f->dump_string("status", "osd is down");
      } else {
        q.f->dump_string("status", "not queried");
      }
      q.f->close_section();
    }
    q.f->close_section();
  }
  {
    q.f->open_object_section("recovery_progress");
    pg->dump_recovery_info(q.f);
    q.f->close_section();
  }

  {
    q.f->open_object_section("scrub");
    q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
    q.f->dump_bool("scrubber.active", pg->scrubber.active);
    q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
    q.f->dump_stream("scrubber.start") << pg->scrubber.start;
    q.f->dump_stream("scrubber.end") << pg->scrubber.end;
    q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
    q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
    q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
    q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
    {
      q.f->open_array_section("scrubber.waiting_on_whom");
      for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
           p != pg->scrubber.waiting_on_whom.end();
           ++p) {
        q.f->dump_stream("shard") << *p;
      }
      q.f->close_section();
    }
    q.f->close_section();
  }

  q.f->close_section();
  return forward_event();
}

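/* Every shard in actingbackfill has activated and committed.  The PG
 * becomes ACTIVE only if the acting set reaches the pool's min_size;
 * otherwise it stays PEERED and cannot serve client I/O until more
 * replicas come up. */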
boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  all_replicas_activated = true;

  pg->state_clear(PG_STATE_ACTIVATING);
  pg->state_clear(PG_STATE_CREATING);
  if (pg->acting.size() >= pg->pool.info.min_size) {
    pg->state_set(PG_STATE_ACTIVE);
  } else {
    pg->state_set(PG_STATE_PEERED);
  }

  // info.last_epoch_started is set during activate()
  pg->info.history.last_epoch_started = pg->info.last_epoch_started;
  pg->info.history.last_interval_started = pg->info.last_interval_started;
  pg->dirty_info = true;

  pg->share_pg_info();
  pg->publish_stats_to_osd();

  pg->check_local();

  // waiters
  if (pg->flushes_in_progress == 0) {
    pg->requeue_ops(pg->waiting_for_peered);
  }

  pg->on_activate();

  return discard_event();
}

void PG::RecoveryState::Active::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);

  pg->blocked_by.clear();
  pg->backfill_reserved = false;
  pg->backfill_reserving = false;
  pg->state_clear(PG_STATE_ACTIVATING);
  pg->state_clear(PG_STATE_DEGRADED);
  pg->state_clear(PG_STATE_UNDERSIZED);
  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
  pg->state_clear(PG_STATE_BACKFILL_WAIT);
  pg->state_clear(PG_STATE_RECOVERY_WAIT);
  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
  pg->agent_stop();
}

/*------ReplicaActive-----*/
PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
}

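/* Replica-side activation: the primary dictates the activation epoch, and
 * the query/info maps only matter on the primary, so an empty query_map
 * and NULL info map and recovery context are passed to activate(). */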
boost::statechart::result PG::RecoveryState::ReplicaActive::react(
  const Activate& actevt) {
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
  map<int, map<spg_t, pg_query_t> > query_map;
  pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
               actevt.activation_epoch,
               *context< RecoveryMachine >().get_on_safe_context_list(),
               query_map, NULL, NULL);
  ldout(pg->cct, 10) << "Activate Finished" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
                        infoevt.info);
  return discard_event();
}

boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
  pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
  assert(pg->pg_log.get_head() == pg->info.last_update);

  return discard_event();
}

boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
    context< RecoveryMachine >().send_notify(
      pg->get_primary(),
      pg_notify_t(
        pg->get_primary().shard, pg->pg_whoami.shard,
        pg->get_osdmap()->get_epoch(),
        pg->get_osdmap()->get_epoch(),
        pg->info),
      pg->past_intervals);
  }
  pg->take_waiters();
  return discard_event();
}

boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (query.query.type == pg_query_t::MISSING) {
    pg->update_history(query.query.history);
    pg->fulfill_log(query.from, query.query, query.query_epoch);
  } // else: from prior to activation, safe to ignore
  return discard_event();
}

boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::ReplicaActive::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
}

/*-------Stray---*/
PG::RecoveryState::Stray::Stray(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Stray")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  assert(!pg->is_peered());
  assert(!pg->is_peering());
  assert(!pg->is_primary());
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
}

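/* A stray that receives the primary's info+log becomes a replica.  A
 * last_backfill of hobject_t() is the "backfill from scratch" sentinel:
 * in that case the local log is discarded in favor of the primary's and
 * backfill state is reset; otherwise the two logs are merged normally. */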
boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  MOSDPGLog *msg = logevt.msg.get();
  ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;

  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
  if (msg->info.last_backfill == hobject_t()) {
    // restart backfill
    pg->unreg_next_scrub();
    pg->info = msg->info;
    pg->reg_next_scrub();
    pg->dirty_info = true;
    pg->dirty_big_info = true;  // maybe.

    PGLogEntryHandler rollbacker{pg, t};
    pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);

    pg->pg_log.reset_backfill();
  } else {
    pg->merge_log(*t, msg->info, msg->log, logevt.from);
  }

  assert(pg->pg_log.get_head() == pg->info.last_update);

  post_event(Activate(logevt.msg->info.last_epoch_started));
  return transit<ReplicaActive>();
}

boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;

  if (pg->info.last_update > infoevt.info.last_update) {
    // rewind divergent log entries
    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
    pg->rewind_divergent_log(*t, infoevt.info.last_update);
    pg->info.stats = infoevt.info.stats;
    pg->info.hit_set = infoevt.info.hit_set;
  }

  assert(infoevt.info.last_update == pg->info.last_update);
  assert(pg->pg_log.get_head() == pg->info.last_update);

  post_event(Activate(infoevt.info.last_epoch_started));
  return transit<ReplicaActive>();
}

boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (query.query.type == pg_query_t::INFO) {
    pair<pg_shard_t, pg_info_t> notify_info;
    pg->update_history(query.query.history);
    pg->fulfill_info(query.from, query.query, notify_info);
    context< RecoveryMachine >().send_notify(
      notify_info.first,
      pg_notify_t(
        notify_info.first.shard, pg->pg_whoami.shard,
        query.query_epoch,
        pg->get_osdmap()->get_epoch(),
        notify_info.second),
      pg->past_intervals);
  } else {
    pg->fulfill_log(query.from, query.query, query.query_epoch);
  }
  return discard_event();
}

boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
    context< RecoveryMachine >().send_notify(
      pg->get_primary(),
      pg_notify_t(
        pg->get_primary().shard, pg->pg_whoami.shard,
        pg->get_osdmap()->get_epoch(),
        pg->get_osdmap()->get_epoch(),
        pg->info),
      pg->past_intervals);
  }
  pg->take_waiters();
  return discard_event();
}

void PG::RecoveryState::Stray::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
}

/*--------GetInfo---------*/
PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  pg->check_past_interval_bounds();
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  assert(pg->blocked_by.empty());

  prior_set = pg->build_prior();

  pg->reset_min_peer_features();
  get_infos();
  if (prior_set.pg_down) {
    post_event(IsDown());
  } else if (peer_info_requested.empty()) {
    post_event(GotInfo());
  }
}

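/* Send an INFO query to every shard in the prior set we have not yet
 * heard from.  Outstanding probes are mirrored into blocked_by, which is
 * published with the PG stats so operators can see what peering is
 * waiting on. */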
void PG::RecoveryState::GetInfo::get_infos()
{
  PG *pg = context< RecoveryMachine >().pg;
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  pg->blocked_by.clear();
  for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
       it != prior_set.probe.end();
       ++it) {
    pg_shard_t peer = *it;
    if (peer == pg->pg_whoami) {
      continue;
    }
    if (pg->peer_info.count(peer)) {
      ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
      continue;
    }
    if (peer_info_requested.count(peer)) {
      ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
      pg->blocked_by.insert(peer.osd);
    } else if (!pg->get_osdmap()->is_up(peer.osd)) {
      ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
    } else {
      ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
      context< RecoveryMachine >().send_query(
        peer, pg_query_t(pg_query_t::INFO,
                         it->shard, pg->pg_whoami.shard,
                         pg->info.history,
                         pg->get_osdmap()->get_epoch()));
      peer_info_requested.insert(peer);
      pg->blocked_by.insert(peer.osd);
    }
  }

  pg->publish_stats_to_osd();
}

boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;

  set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
  if (p != peer_info_requested.end()) {
    peer_info_requested.erase(p);
    pg->blocked_by.erase(infoevt.from.osd);
  }

  epoch_t old_start = pg->info.history.last_epoch_started;
  if (pg->proc_replica_info(
        infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
    // we got something new ...
    PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
    if (old_start < pg->info.history.last_epoch_started) {
      ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
      prior_set = pg->build_prior();

      // filter out any osds that got dropped from the probe set from
      // peer_info_requested.  this is less expensive than restarting
      // peering (which would re-probe everyone).
      set<pg_shard_t>::iterator p = peer_info_requested.begin();
      while (p != peer_info_requested.end()) {
        if (prior_set.probe.count(*p) == 0) {
          ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
          peer_info_requested.erase(p++);
        } else {
          ++p;
        }
      }
      get_infos();
    }
    ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
                       << hex << infoevt.features << dec << dendl;
    pg->apply_peer_features(infoevt.features);

    // are we done getting everything?
    if (peer_info_requested.empty() && !prior_set.pg_down) {
      ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
      ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
      ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
      post_event(GotInfo());
    }
  }
  return discard_event();
}

boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
{
  PG *pg = context< RecoveryMachine >().pg;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("requested_info_from");
  for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
       p != peer_info_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (pg->peer_info.count(*p)) {
      q.f->open_object_section("got_info");
      pg->peer_info[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::GetInfo::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}

/*------GetLog------------*/
PG::RecoveryState::GetLog::GetLog(my_context ctx)
  : my_base(ctx),
    NamedState(
      context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
    msg(0)
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;

  // adjust acting?
  if (!pg->choose_acting(auth_log_shard, false,
                         &context< Peering >().history_les_bound)) {
    if (!pg->want_acting.empty()) {
      post_event(NeedActingChange());
    } else {
      post_event(IsIncomplete());
    }
    return;
  }

  // am i the best?
  if (auth_log_shard == pg->pg_whoami) {
    post_event(GotLog());
    return;
  }

  const pg_info_t& best = pg->peer_info[auth_log_shard];

  // am i broken?
  if (pg->info.last_update < best.log_tail) {
    ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
    post_event(IsIncomplete());
    return;
  }

  // how much log to request?
  eversion_t request_log_from = pg->info.last_update;
  assert(!pg->actingbackfill.empty());
  for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
       p != pg->actingbackfill.end();
       ++p) {
    if (*p == pg->pg_whoami) continue;
    pg_info_t& ri = pg->peer_info[*p];
    if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
        ri.last_update < request_log_from)
      request_log_from = ri.last_update;
  }

  // request the log
  ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
  context<RecoveryMachine>().send_query(
    auth_log_shard,
    pg_query_t(
      pg_query_t::LOG,
      auth_log_shard.shard, pg->pg_whoami.shard,
      request_log_from, pg->info.history,
      pg->get_osdmap()->get_epoch()));

  assert(pg->blocked_by.empty());
  pg->blocked_by.insert(auth_log_shard.osd);
  pg->publish_stats_to_osd();
}

boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
{
  PG *pg = context< RecoveryMachine >().pg;
  // make sure our log source didn't go down.  we need to check
  // explicitly because it may not be part of the prior set, which
  // means the Peering state check won't catch it going down.
  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
    ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
                       << auth_log_shard.osd << " went down" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  // let the Peering state do its checks.
  return forward_event();
}

boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  assert(!msg);
  if (logevt.from != auth_log_shard) {
    ldout(pg->cct, 10) << "GetLog: discarding log from "
                       << "non-auth_log_shard osd." << logevt.from << dendl;
    return discard_event();
  }
  ldout(pg->cct, 10) << "GetLog: received master log from osd."
                     << logevt.from << dendl;
  msg = logevt.msg;
  post_event(GotLog());
  return discard_event();
}

boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "leaving GetLog" << dendl;
  if (msg) {
    ldout(pg->cct, 10) << "processing master log" << dendl;
    pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
                        msg->info, msg->log, msg->missing,
                        auth_log_shard);
  }
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
  return transit< GetMissing >();
}

boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_stream("auth_log_shard") << auth_log_shard;
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::GetLog::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}

/*------WaitActingChange--------*/
PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
{
  context< RecoveryMachine >().log_enter(state_name);
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
{
  PG *pg = context< RecoveryMachine >().pg;
  OSDMapRef osdmap = advmap.osdmap;

  ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets went down" << dendl;
  for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
    if (!osdmap->is_up(*p)) {
      ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
      post_event(advmap);
      return transit< Reset >();
    }
  }
  return forward_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for pg acting set to change");
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::WaitActingChange::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
}

/*------Down--------*/
PG::RecoveryState::Down::Down(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
{
  context< RecoveryMachine >().log_enter(state_name);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_PEERING);
  pg->state_set(PG_STATE_DOWN);

  auto &prior_set = context< Peering >().prior_set;
  assert(pg->blocked_by.empty());
  pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pg->publish_stats_to_osd();
}

void PG::RecoveryState::Down::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_DOWN);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);

  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}

boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment",
                   "not enough up instances of this PG to go active");
  q.f->close_section();
  return forward_event();
}

/*------Incomplete--------*/
PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
{
  context< RecoveryMachine >().log_enter(state_name);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_PEERING);
  pg->state_set(PG_STATE_INCOMPLETE);

  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
  assert(pg->blocked_by.empty());
  pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pg->publish_stats_to_osd();
}

boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
  PG *pg = context< RecoveryMachine >().pg;
  int64_t poolnum = pg->info.pgid.pool();

  // Reset if min_size was lowered; the pg might now be able to go active.
  if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
    post_event(advmap);
    return transit< Reset >();
  }

  return forward_event();
}

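/* A fresh replica notify may supply the info/log we were missing, in
 * which case peering is retried from GetLog. */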
boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
  if (pg->proc_replica_info(
        notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
    // We got something new, try again!
    return transit< GetLog >();
  } else {
    return discard_event();
  }
}

boost::statechart::result PG::RecoveryState::Incomplete::react(
  const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "not enough complete instances of this PG");
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::Incomplete::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_INCOMPLETE);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);

  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}

/*------GetMissing--------*/
PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  assert(!pg->actingbackfill.empty());
  eversion_t since;
  for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
       i != pg->actingbackfill.end();
       ++i) {
    if (*i == pg->get_primary()) continue;
    const pg_info_t& pi = pg->peer_info[*i];
    // reset this to make sure the pg_missing_t is initialized and
    // has the correct semantics even if we don't need to get a
    // missing set from a shard.  This way later additions due to
    // lost+unfound delete work properly.
    pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();

    if (pi.is_empty())
      continue;  // no pg data, nothing divergent

    if (pi.last_update < pg->pg_log.get_tail()) {
      ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
      pg->peer_missing[*i].clear();
      continue;
    }
    if (pi.last_backfill == hobject_t()) {
      ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
      pg->peer_missing[*i].clear();
      continue;
    }

    if (pi.last_update == pi.last_complete &&  // peer has no missing
        pi.last_update == pg->info.last_update) {  // peer is up to date
      // replica has no missing and identical log as us.  no need to
      // pull anything.
      // FIXME: we can do better here.  if last_update==last_complete we
      // can infer the rest!
      ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
      pg->peer_missing[*i].clear();
      continue;
    }

    // We pull the log from the peer's last_epoch_started to ensure we
    // get enough log to detect divergent updates.
    since.epoch = pi.last_epoch_started;
    assert(pi.last_update >= pg->info.log_tail);  // or else choose_acting() did a bad thing
    if (pi.log_tail <= since) {
      ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
      context< RecoveryMachine >().send_query(
        *i,
        pg_query_t(
          pg_query_t::LOG,
          i->shard, pg->pg_whoami.shard,
          since, pg->info.history,
          pg->get_osdmap()->get_epoch()));
    } else {
      ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
                         << " (want since " << since << " < log.tail "
                         << pi.log_tail << ")" << dendl;
      context< RecoveryMachine >().send_query(
        *i, pg_query_t(
          pg_query_t::FULLLOG,
          i->shard, pg->pg_whoami.shard,
          pg->info.history, pg->get_osdmap()->get_epoch()));
    }
    peer_missing_requested.insert(*i);
    pg->blocked_by.insert(i->osd);
  }

  if (peer_missing_requested.empty()) {
    if (pg->need_up_thru) {
      ldout(pg->cct, 10) << " still need up_thru update before going active"
                         << dendl;
      post_event(NeedUpThru());
      return;
    }

    // all good!
    post_event(Activate(pg->get_osdmap()->get_epoch()));
  } else {
    pg->publish_stats_to_osd();
  }
}

boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;

  peer_missing_requested.erase(logevt.from);
  pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);

  if (peer_missing_requested.empty()) {
    if (pg->need_up_thru) {
      ldout(pg->cct, 10) << " still need up_thru update before going active"
                         << dendl;
      post_event(NeedUpThru());
    } else {
      ldout(pg->cct, 10) << "Got last missing, don't need missing "
                         << "posting Activate" << dendl;
      post_event(Activate(pg->get_osdmap()->get_epoch()));
    }
  }
  return discard_event();
}

boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
{
  PG *pg = context< RecoveryMachine >().pg;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("peer_missing_requested");
  for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
       p != peer_missing_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (pg->peer_missing.count(*p)) {
      q.f->open_object_section("got_missing");
      pg->peer_missing[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::GetMissing::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}

/*------WaitUpThru--------*/
PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
{
  context< RecoveryMachine >().log_enter(state_name);
}

boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (!pg->need_up_thru) {
    post_event(Activate(pg->get_osdmap()->get_epoch()));
  }
  return forward_event();
}

boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
  pg->peer_missing[logevt.from].claim(logevt.msg->missing);
  pg->peer_info[logevt.from] = logevt.msg->info;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::WaitUpThru::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
}

/*----RecoveryState::RecoveryMachine Methods-----*/
#undef dout_prefix
#define dout_prefix *_dout << pg->gen_prefix()

void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 5) << "enter " << state_name << dendl;
  pg->osd->pg_recovery_stats.log_enter(state_name);
}

void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
{
  utime_t dur = ceph_clock_now() - enter_time;
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
  pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
                                      event_count, event_time);
  event_count = 0;
  event_time = utime_t();
}


/*---------------------------------------------------*/
#undef dout_prefix
#define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")

void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
  assert(!rctx);
  assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
    } else {
      rctx = *new_ctx;
    }
    rctx->start_time = ceph_clock_now();
  }
}

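/* Message buffering for the peering flush barrier: while a flush of the
 * previous interval's transactions is outstanding, begin_block_outgoing()
 * redirects outgoing peering messages into a BufferedRecoveryMessages
 * staging area, clear_blocked_outgoing() discards whatever was staged,
 * and end_block_outgoing() folds the staged messages back into the real
 * RecoveryCtx once it is safe to send them. */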
void PG::RecoveryState::begin_block_outgoing() {
  assert(!messages_pending_flush);
  assert(orig_ctx);
  assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages();
  rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
}

void PG::RecoveryState::clear_blocked_outgoing() {
  assert(orig_ctx);
  assert(rctx);
  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
}

void PG::RecoveryState::end_block_outgoing() {
  assert(messages_pending_flush);
  assert(orig_ctx);
  assert(rctx);

  rctx = RecoveryCtx(*orig_ctx);
  rctx->accept_buffered_messages(*messages_pending_flush);
  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
}

void PG::RecoveryState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = boost::optional<RecoveryCtx>();
  orig_ctx = NULL;
}

ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
{
  out << "BackfillInfo(" << bi.begin << "-" << bi.end
      << " " << bi.objects.size() << " objects";
  if (!bi.objects.empty())
    out << " " << bi.objects;
  out << ")";
  return out;
}

void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }

#ifdef PG_DEBUG_REFS
uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif