ceph.git / ceph / src / osd / PG.cc (import ceph 12.2.12)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106     if (tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122     if (!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136     if (ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179   if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223     dout(0) << "\t\ttag: " << *i << dendl;
224 }
225 }
226 #endif
227
228
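// Refresh cached pool metadata (info, auid, name) from the given OSDMap and
// recompute newly_removed_snaps, i.e. the snaps removed since the last map
// this PGPool processed.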
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
241 pi->build_removed_snaps(newly_removed_snaps);
242 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
243 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.swap(removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps.swap(newly_removed_snaps);
251 newly_removed_snaps.clear();
252 }
253 } else
254 newly_removed_snaps.clear();
255 snapc = pi->get_snap_context();
256 } else {
257 /* 1) map->get_epoch() == cached_epoch + 1 &&
258 * 2) pi->get_snap_epoch() != map->get_epoch()
259 *
260      * Since the if condition was false, 1 && 2 must both hold.  From 2, we know that
261 * this map didn't change the set of removed snaps. From 1, we
262 * know that our cached_removed_snaps matches the previous map.
263 * Thus, from 1 && 2, cached_removed snaps matches the current
264 * set of removed snaps and all we have to do is clear
265 * newly_removed_snaps.
266 */
267 newly_removed_snaps.clear();
268 }
269 cached_epoch = map->get_epoch();
270 lgeneric_subdout(cct, osd, 20)
271 << "PGPool::update cached_removed_snaps "
272 << cached_removed_snaps
273 << " newly_removed_snaps "
274 << newly_removed_snaps
275 << " snapc " << snapc
276 << (updated ? " (updated)":" (no change)")
277 << dendl;
278 }
279
280 PG::PG(OSDService *o, OSDMapRef curmap,
281 const PGPool &_pool, spg_t p) :
282 osd(o),
283 cct(o->cct),
284 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
285 snap_mapper(
286 cct,
287 &osdriver,
288 p.ps(),
289 p.get_split_bits(curmap->get_pg_num(_pool.id)),
290 _pool.id,
291 p.shard),
292 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
293 _lock("PG::_lock"),
294 #ifdef PG_DEBUG_REFS
295 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
296 #endif
297 deleting(false),
298 trace_endpoint("0.0.0.0", 0, "PG"),
299 dirty_info(false), dirty_big_info(false),
300 info(p),
301 info_struct_v(0),
302 coll(p),
303 pg_log(cct),
304 pgmeta_oid(p.make_pgmeta_oid()),
305 missing_loc(this),
306 past_intervals(
307 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
308 *curmap),
309 stat_queue_item(this),
310 scrub_queued(false),
311 recovery_queued(false),
312 recovery_ops_active(0),
313 role(-1),
314 state(0),
315 send_notify(false),
316 pg_whoami(osd->whoami, p.shard),
317 need_up_thru(false),
318 last_peering_reset(0),
319 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
320 backfill_reserved(false),
321 backfill_reserving(false),
322 flushes_in_progress(0),
323 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
324 pg_stats_publish_valid(false),
325 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
326 finish_sync_event(NULL),
327 backoff_lock("PG::backoff_lock"),
328 scrub_after_recovery(false),
329 active_pushes(0),
330 recovery_state(this),
331 pg_id(p),
332 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
334 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
335 last_epoch(0)
336 {
337 #ifdef PG_DEBUG_REFS
338 osd->add_pgid(p, this);
339 #endif
340 #ifdef WITH_BLKIN
341 std::stringstream ss;
342 ss << "PG " << info.pgid;
343 trace_endpoint.copy_name(ss.str());
344 #endif
345 osr->shard_hint = p;
346 }
347
348 PG::~PG()
349 {
350 pgstate_history.set_pg_in_destructor();
351 #ifdef PG_DEBUG_REFS
352 osd->remove_pgid(info.pgid, this);
353 #endif
354 }
355
356 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
357 {
358 handle.suspend_tp_timeout();
359 lock();
360 handle.reset_tp_timeout();
361 }
362
363 void PG::lock(bool no_lockdep) const
364 {
365 _lock.Lock(no_lockdep);
366 // if we have unrecorded dirty state with the lock dropped, there is a bug
367 assert(!dirty_info);
368 assert(!dirty_big_info);
369
370 dout(30) << "lock" << dendl;
371 }
372
373 std::string PG::gen_prefix() const
374 {
375 stringstream out;
376 OSDMapRef mapref = osdmap_ref;
377 if (_lock.is_locked_by_me()) {
378 out << "osd." << osd->whoami
379 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
380 << " " << *this << " ";
381 } else {
382 out << "osd." << osd->whoami
383 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
384 << " pg[" << info.pgid << "(unlocked)] ";
385 }
386 return out.str();
387 }
388
389 /********* PG **********/
390
391 void PG::proc_master_log(
392 ObjectStore::Transaction& t, pg_info_t &oinfo,
393 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
394 {
395 dout(10) << "proc_master_log for osd." << from << ": "
396 << olog << " " << omissing << dendl;
397 assert(!is_peered() && is_primary());
398
399 // merge log into our own log to build master log. no need to
400 // make any adjustments to their missing map; we are taking their
401   // log to be authoritative (i.e., their entries are definitely
402 // non-divergent).
403 merge_log(t, oinfo, olog, from);
404 peer_info[from] = oinfo;
405 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
406 might_have_unfound.insert(from);
407
408 // See doc/dev/osd_internals/last_epoch_started
409 if (oinfo.last_epoch_started > info.last_epoch_started) {
410 info.last_epoch_started = oinfo.last_epoch_started;
411 dirty_info = true;
412 }
413 if (oinfo.last_interval_started > info.last_interval_started) {
414 info.last_interval_started = oinfo.last_interval_started;
415 dirty_info = true;
416 }
417 update_history(oinfo.history);
418 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
419 info.last_epoch_started >= info.history.last_epoch_started);
420
421 peer_missing[from].claim(omissing);
422 }
423
424 void PG::proc_replica_log(
425 pg_info_t &oinfo,
426 const pg_log_t &olog,
427 pg_missing_t& omissing,
428 pg_shard_t from)
429 {
430 dout(10) << "proc_replica_log for osd." << from << ": "
431 << oinfo << " " << olog << " " << omissing << dendl;
432
433 pg_log.proc_replica_log(oinfo, olog, omissing, from);
434
435 peer_info[from] = oinfo;
436 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
437 might_have_unfound.insert(from);
438
439 for (map<hobject_t, pg_missing_item>::const_iterator i =
440 omissing.get_items().begin();
441 i != omissing.get_items().end();
442 ++i) {
443 dout(20) << " after missing " << i->first << " need " << i->second.need
444 << " have " << i->second.have << dendl;
445 }
446 peer_missing[from].claim(omissing);
447 }
448
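// Handle a pg_info_t received from a peer: drop duplicates and info from
// OSDs that have not been up since send_epoch, record the info, fold its
// history into ours, and note strays; returns true if the info was accepted.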
449 bool PG::proc_replica_info(
450 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
451 {
452 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
453 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
454 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
455 return false;
456 }
457
458 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
459 dout(10) << " got info " << oinfo << " from down osd." << from
460 << " discarding" << dendl;
461 return false;
462 }
463
464 dout(10) << " got osd." << from << " " << oinfo << dendl;
465 assert(is_primary());
466 peer_info[from] = oinfo;
467 might_have_unfound.insert(from);
468
469 update_history(oinfo.history);
470
471 // stray?
472 if (!is_up(from) && !is_acting(from)) {
473 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
474 stray_set.insert(from);
475 if (is_clean()) {
476 purge_strays();
477 }
478 }
479
480 // was this a new info? if so, update peers!
481 if (p == peer_info.end())
482 update_heartbeat_peers();
483
484 return true;
485 }
486
487 void PG::remove_snap_mapped_object(
488 ObjectStore::Transaction &t, const hobject_t &soid)
489 {
490 t.remove(
491 coll,
492 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
493 clear_object_snap_mapping(&t, soid);
494 }
495
496 void PG::clear_object_snap_mapping(
497 ObjectStore::Transaction *t, const hobject_t &soid)
498 {
499 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
500 if (soid.snap < CEPH_MAXSNAP) {
501 int r = snap_mapper.remove_oid(
502 soid,
503 &_t);
504 if (!(r == 0 || r == -ENOENT)) {
505 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
506 ceph_abort();
507 }
508 }
509 }
510
511 void PG::update_object_snap_mapping(
512 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
513 {
514 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
515 assert(soid.snap < CEPH_MAXSNAP);
516 int r = snap_mapper.remove_oid(
517 soid,
518 &_t);
519 if (!(r == 0 || r == -ENOENT)) {
520 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
521 ceph_abort();
522 }
523 snap_mapper.add_oid(
524 soid,
525 snaps,
526 &_t);
527 }
528
529 void PG::merge_log(
530 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
531 {
532 PGLogEntryHandler rollbacker{this, &t};
533 pg_log.merge_log(
534 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
535 }
536
537 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
538 {
539 PGLogEntryHandler rollbacker{this, &t};
540 pg_log.rewind_divergent_log(
541 newhead, info, &rollbacker, dirty_info, dirty_big_info);
542 }
543
544 /*
545 * Process information from a replica to determine if it could have any
546  * objects that I need.
547 *
548 * TODO: if the missing set becomes very large, this could get expensive.
549 * Instead, we probably want to just iterate over our unfound set.
550 */
551 bool PG::search_for_missing(
552 const pg_info_t &oinfo, const pg_missing_t &omissing,
553 pg_shard_t from,
554 RecoveryCtx *ctx)
555 {
556 uint64_t num_unfound_before = missing_loc.num_unfound();
557 bool found_missing = missing_loc.add_source_info(
558 from, oinfo, omissing, ctx->handle);
559 if (found_missing && num_unfound_before != missing_loc.num_unfound())
560 publish_stats_to_osd();
561   // avoid doing this if the peer is empty.  This is a bit of paranoia
562 // to avoid doing something rash if add_source_info() above
563 // incorrectly decided we found something new. (if the peer has
564 // last_update=0'0 that's impossible.)
565 if (found_missing &&
566 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
567 CEPH_FEATURE_OSD_ERASURE_CODES) &&
568 oinfo.last_update != eversion_t()) {
569 pg_info_t tinfo(oinfo);
570 tinfo.pgid.shard = pg_whoami.shard;
571 (*(ctx->info_map))[from.osd].push_back(
572 make_pair(
573 pg_notify_t(
574 from.shard, pg_whoami.shard,
575 get_osdmap()->get_epoch(),
576 get_osdmap()->get_epoch(),
577 tinfo),
578 past_intervals));
579 }
580 return found_missing;
581 }
582
583
584 // MissingLoc
585
586 bool PG::MissingLoc::readable_with_acting(
587 const hobject_t &hoid,
588 const set<pg_shard_t> &acting) const {
589 if (!needs_recovery(hoid))
590 return true;
591 if (is_deleted(hoid))
592 return false;
593 auto missing_loc_entry = missing_loc.find(hoid);
594 if (missing_loc_entry == missing_loc.end())
595 return false;
596 const set<pg_shard_t> &locs = missing_loc_entry->second;
597 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
598 set<pg_shard_t> have_acting;
599 for (set<pg_shard_t>::const_iterator i = locs.begin();
600 i != locs.end();
601 ++i) {
602 if (acting.count(*i))
603 have_acting.insert(*i);
604 }
605 return (*is_readable)(have_acting);
606 }
607
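// Register every shard in 'sources' as a candidate location for every
// object in needs_recovery_map (deletes excluded), keeping the
// missing-location counts up to date.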
608 void PG::MissingLoc::add_batch_sources_info(
609 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
610 {
611 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
612 << sources.size() << dendl;
613 unsigned loop = 0;
614 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
615 i != needs_recovery_map.end();
616 ++i) {
617 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
618 handle->reset_tp_timeout();
619 loop = 0;
620 }
621 if (i->second.is_delete())
622 continue;
623
624 auto p = missing_loc.find(i->first);
625 if (p == missing_loc.end()) {
626 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
627 } else {
628 _dec_count(p->second);
629 }
630 missing_loc[i->first].insert(sources.begin(), sources.end());
631 missing_loc_sources.insert(sources.begin(), sources.end());
632 _inc_count(p->second);
633
634 }
635 }
636
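// Scan our needs_recovery_map against one peer's info and missing set and
// record which missing objects that peer can provide; returns true if any
// new location was found.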
637 bool PG::MissingLoc::add_source_info(
638 pg_shard_t fromosd,
639 const pg_info_t &oinfo,
640 const pg_missing_t &omissing,
641 ThreadPool::TPHandle* handle)
642 {
643 bool found_missing = false;
644 unsigned loop = 0;
645 // found items?
646 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
647 p != needs_recovery_map.end();
648 ++p) {
649 const hobject_t &soid(p->first);
650 eversion_t need = p->second.need;
651 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
652 handle->reset_tp_timeout();
653 loop = 0;
654 }
655 if (p->second.is_delete()) {
656 ldout(pg->cct, 10) << __func__ << " " << soid
657 << " delete, ignoring source" << dendl;
658 continue;
659 }
660 if (oinfo.last_update < need) {
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (last_update " << oinfo.last_update
664 << " < needed " << need << ")" << dendl;
665 continue;
666 }
667 if (!oinfo.last_backfill.is_max() &&
668 !oinfo.last_backfill_bitwise) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd
671 << " (last_backfill " << oinfo.last_backfill
672 << " but with wrong sort order)"
673 << dendl;
674 continue;
675 }
676 if (p->first >= oinfo.last_backfill) {
677 // FIXME: this is _probably_ true, although it could conceivably
678 // be in the undefined region! Hmm!
679 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
680 << " also missing on osd." << fromosd
681 << " (past last_backfill " << oinfo.last_backfill
682 << ")" << dendl;
683 continue;
684 }
685 if (omissing.is_missing(soid)) {
686 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
687 << " also missing on osd." << fromosd << dendl;
688 continue;
689 }
690
691 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
692 << " is on osd." << fromosd << dendl;
693
694 missing_loc_sources.insert(fromosd);
695 {
696 auto p = missing_loc.find(soid);
697 if (p == missing_loc.end()) {
698 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
699 } else {
700 _dec_count(p->second);
701 }
702 p->second.insert(fromosd);
703 _inc_count(p->second);
704 }
705
706 found_missing = true;
707 }
708
709 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
710 << dendl;
711 return found_missing;
712 }
713
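// Drop missing-object sources that are no longer up in the given map and
// prune them from every missing_loc entry.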
714 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
715 {
716 set<pg_shard_t> now_down;
717 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
718 p != missing_loc_sources.end();
719 ) {
720 if (osdmap->is_up(p->osd)) {
721 ++p;
722 continue;
723 }
724 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
725 now_down.insert(*p);
726 missing_loc_sources.erase(p++);
727 }
728
729 if (now_down.empty()) {
730 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
731 } else {
732 ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
733 << missing_loc_sources << dendl;
734
735 // filter missing_loc
736 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
737 while (p != missing_loc.end()) {
738 set<pg_shard_t>::iterator q = p->second.begin();
739 bool changed = false;
740 while (q != p->second.end()) {
741 if (now_down.count(*q)) {
742 if (!changed) {
743 changed = true;
744 _dec_count(p->second);
745 }
746 p->second.erase(q++);
747 } else {
748 ++q;
749 }
750 }
751 if (p->second.empty()) {
752 missing_loc.erase(p++);
753 } else {
754 if (changed) {
755 _inc_count(p->second);
756 }
757 ++p;
758 }
759 }
760 }
761 }
762
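// Query every up, non-empty, not-yet-queried peer in might_have_unfound
// for its full log (and therefore its missing set) so unfound objects can
// be located.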
763 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
764 {
765 auto &missing = pg_log.get_missing();
766 uint64_t unfound = get_num_unfound();
767
768 dout(10) << __func__ << " "
769 << missing.num_missing() << " missing, "
770 << unfound << " unfound"
771 << dendl;
772
773 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
774 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
775 for (; m != mend; ++m) {
776 pg_shard_t peer(*m);
777
778 if (!get_osdmap()->is_up(peer.osd)) {
779 dout(20) << __func__ << " skipping down osd." << peer << dendl;
780 continue;
781 }
782
783 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
784 if (iter != peer_info.end() &&
785 (iter->second.is_empty() || iter->second.dne())) {
786 // ignore empty peers
787 continue;
788 }
789
790 // If we've requested any of this stuff, the pg_missing_t information
791 // should be on its way.
792     // TODO: coalesce requested_* into a single data structure
793 if (peer_missing.find(peer) != peer_missing.end()) {
794 dout(20) << __func__ << ": osd." << peer
795 << ": we already have pg_missing_t" << dendl;
796 continue;
797 }
798 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
799 dout(20) << __func__ << ": osd." << peer
800 << ": in peer_log_requested" << dendl;
801 continue;
802 }
803 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
804 dout(20) << __func__ << ": osd." << peer
805 << ": in peer_missing_requested" << dendl;
806 continue;
807 }
808
809 // Request missing
810 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
811 << dendl;
812 peer_missing_requested.insert(peer);
813 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
814 pg_query_t(
815 pg_query_t::FULLLOG,
816 peer.shard, pg_whoami.shard,
817 info.history, get_osdmap()->get_epoch());
818 }
819 }
820
821 /******* PG ***********/
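// True if the primary or any other acting/backfill shard still has entries
// in its missing set.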
822 bool PG::needs_recovery() const
823 {
824 assert(is_primary());
825
826 auto &missing = pg_log.get_missing();
827
828 if (missing.num_missing()) {
829 dout(10) << __func__ << " primary has " << missing.num_missing()
830 << " missing" << dendl;
831 return true;
832 }
833
834 assert(!actingbackfill.empty());
835 set<pg_shard_t>::const_iterator end = actingbackfill.end();
836 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
837 for (; a != end; ++a) {
838 if (*a == get_primary()) continue;
839 pg_shard_t peer = *a;
840 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
841 if (pm == peer_missing.end()) {
842 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
843 << dendl;
844 continue;
845 }
846 if (pm->second.num_missing()) {
847 dout(10) << __func__ << " osd." << peer << " has "
848 << pm->second.num_missing() << " missing" << dendl;
849 return true;
850 }
851 }
852
853 dout(10) << __func__ << " is recovered" << dendl;
854 return false;
855 }
856
857 bool PG::needs_backfill() const
858 {
859 assert(is_primary());
860
861   // We can assume that the only OSDs that might need backfill
862   // are those listed in backfill_targets.
863 set<pg_shard_t>::const_iterator end = backfill_targets.end();
864 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
865 for (; a != end; ++a) {
866 pg_shard_t peer = *a;
867 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
868 if (!pi->second.last_backfill.is_max()) {
869 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
870 return true;
871 }
872 }
873
874 dout(10) << __func__ << " does not need backfill" << dendl;
875 return false;
876 }
877
878
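// Verify that past_intervals is consistent with the epoch bounds required
// by our info and the oldest map we still hold.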
879 void PG::check_past_interval_bounds() const
880 {
881 auto rpib = get_required_past_interval_bounds(
882 info,
883 osd->get_superblock().oldest_map);
884 if (rpib.first >= rpib.second) {
885 if (!past_intervals.empty()) {
886 osd->clog->error() << info.pgid << " required past_interval bounds are"
887 << " empty [" << rpib << ") but past_intervals is not: "
888 << past_intervals;
889 derr << info.pgid << " required past_interval bounds are"
890 << " empty [" << rpib << ") but past_intervals is not: "
891 << past_intervals << dendl;
892 }
893 } else {
894 if (past_intervals.empty()) {
895 osd->clog->error() << info.pgid << " required past_interval bounds are"
896 << " not empty [" << rpib << ") but past_intervals "
897 << past_intervals << " is empty";
898 derr << info.pgid << " required past_interval bounds are"
899 << " not empty [" << rpib << ") but past_intervals "
900 << past_intervals << " is empty" << dendl;
901 assert(!past_intervals.empty());
902 }
903
904 auto apib = past_intervals.get_bounds();
905 if (apib.first > rpib.first) {
906 osd->clog->error() << info.pgid << " past_intervals [" << apib
907 << ") start interval does not contain the required"
908 << " bound [" << rpib << ") start";
909 derr << info.pgid << " past_intervals [" << apib
910 << ") start interval does not contain the required"
911 << " bound [" << rpib << ") start" << dendl;
912 assert(0 == "past_interval start interval mismatch");
913 }
914 if (apib.second != rpib.second) {
915 osd->clog->error() << info.pgid << " past_interal bound [" << apib
916 << ") end does not match required [" << rpib
917 << ") end";
918 derr << info.pgid << " past_interal bound [" << apib
919 << ") end does not match required [" << rpib
920 << ") end" << dendl;
921 assert(0 == "past_interval end mismatch");
922 }
923 }
924 }
925
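// Clear need_up_thru once the map records an up_thru for us at or beyond
// this interval; returns true if the flag was cleared.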
926 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
927 {
928 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
929 if (need_up_thru &&
930 up_thru >= info.history.same_interval_since) {
931 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
932 need_up_thru = false;
933 return true;
934 }
935 return false;
936 }
937
938 void PG::remove_down_peer_info(const OSDMapRef osdmap)
939 {
940 // Remove any downed osds from peer_info
941 bool removed = false;
942 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
943 while (p != peer_info.end()) {
944 if (!osdmap->is_up(p->first.osd)) {
945 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
946 peer_missing.erase(p->first);
947 peer_log_requested.erase(p->first);
948 peer_missing_requested.erase(p->first);
949 peer_info.erase(p++);
950 removed = true;
951 } else
952 ++p;
953 }
954
955 // if we removed anyone, update peers (which include peer_info)
956 if (removed)
957 update_heartbeat_peers();
958 check_recovery_sources(osdmap);
959 }
960
961 /*
962  * Returns true unless there is an unqueried, non-lost OSD in might_have_unfound.
963 */
964 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
965 {
966 assert(is_primary());
967
968 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
969 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
970 for (; peer != mend; ++peer) {
971 if (peer_missing.count(*peer))
972 continue;
973 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
974 if (iter != peer_info.end() &&
975 (iter->second.is_empty() || iter->second.dne()))
976 continue;
977 if (!osdmap->exists(peer->osd))
978 continue;
979 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
980 if (osd_info.lost_at <= osd_info.up_from) {
981 // If there is even one OSD in might_have_unfound that isn't lost, we
982 // still might retrieve our unfound.
983 return false;
984 }
985 }
986 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
987 << " have been queried or are marked lost" << dendl;
988 return true;
989 }
990
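// Build the PriorSet: the OSDs from past intervals that we must probe (or
// establish as down/lost) before peering can complete; also decides whether
// we need to notify the monitor of our up_thru.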
991 PastIntervals::PriorSet PG::build_prior()
992 {
993 if (1) {
994 // sanity check
995 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
996 it != peer_info.end();
997 ++it) {
998 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
999 }
1000 }
1001
1002 const OSDMap &osdmap = *get_osdmap();
1003 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1004 pool.info.ec_pool(),
1005 info.history.last_epoch_started,
1006 get_pgbackend()->get_is_recoverable_predicate(),
1007 [&](epoch_t start, int osd, epoch_t *lost_at) {
1008 const osd_info_t *pinfo = 0;
1009 if (osdmap.exists(osd)) {
1010 pinfo = &osdmap.get_info(osd);
1011 if (lost_at)
1012 *lost_at = pinfo->lost_at;
1013 }
1014
1015 if (osdmap.is_up(osd)) {
1016 return PastIntervals::UP;
1017 } else if (!pinfo) {
1018 return PastIntervals::DNE;
1019 } else if (pinfo->lost_at > start) {
1020 return PastIntervals::LOST;
1021 } else {
1022 return PastIntervals::DOWN;
1023 }
1024 },
1025 up,
1026 acting,
1027 this);
1028
1029 if (prior.pg_down) {
1030 state_set(PG_STATE_DOWN);
1031 }
1032
1033 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1034 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1035 << " < same_since " << info.history.same_interval_since
1036 << ", must notify monitor" << dendl;
1037 need_up_thru = true;
1038 } else {
1039 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1040 << " >= same_since " << info.history.same_interval_since
1041 << ", all is well" << dendl;
1042 need_up_thru = false;
1043 }
1044 set_probe_targets(prior.probe);
1045 return prior;
1046 }
1047
1048 void PG::clear_primary_state()
1049 {
1050 dout(10) << "clear_primary_state" << dendl;
1051
1052 // clear peering state
1053 stray_set.clear();
1054 peer_log_requested.clear();
1055 peer_missing_requested.clear();
1056 peer_info.clear();
1057 peer_missing.clear();
1058 need_up_thru = false;
1059 peer_last_complete_ondisk.clear();
1060 peer_activated.clear();
1061 min_last_complete_ondisk = eversion_t();
1062 pg_trim_to = eversion_t();
1063 might_have_unfound.clear();
1064 projected_log = PGLog::IndexedLog();
1065
1066 last_update_ondisk = eversion_t();
1067
1068 snap_trimq.clear();
1069
1070 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1071
1072 missing_loc.clear();
1073
1074 release_pg_backoffs();
1075
1076 pg_log.reset_recovery_pointers();
1077
1078 scrubber.reserved_peers.clear();
1079 scrub_after_recovery = false;
1080
1081 agent_clear();
1082 }
1083
1084 PG::Scrubber::Scrubber()
1085 : reserved(false), reserve_failed(false),
1086 epoch_start(0),
1087 active(false),
1088 shallow_errors(0), deep_errors(0), fixed(0),
1089 must_scrub(false), must_deep_scrub(false), must_repair(false),
1090 auto_repair(false),
1091 num_digest_updates_pending(0),
1092 state(INACTIVE),
1093 deep(false)
1094 {}
1095
1096 PG::Scrubber::~Scrubber() {}
1097
1098 /**
1099 * find_best_info
1100 *
1101  * Returns an iterator to the best info in infos, chosen by:
1102  *  1) Prefer newer last_update (prefer older last_update for ec pools)
1103 * 2) Prefer longer tail if it brings another info into contiguity
1104 * 3) Prefer current primary
1105 */
1106 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1107 const map<pg_shard_t, pg_info_t> &infos,
1108 bool restrict_to_up_acting,
1109 bool *history_les_bound) const
1110 {
1111 assert(history_les_bound);
1112 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1113 * to make changes to this process. Also, make sure to update it
1114 * when you find bugs! */
1115 eversion_t min_last_update_acceptable = eversion_t::max();
1116 epoch_t max_last_epoch_started_found = 0;
1117 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1118 i != infos.end();
1119 ++i) {
1120 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1121 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1122 *history_les_bound = true;
1123 max_last_epoch_started_found = i->second.history.last_epoch_started;
1124 }
1125 if (!i->second.is_incomplete() &&
1126 max_last_epoch_started_found < i->second.last_epoch_started) {
1127 max_last_epoch_started_found = i->second.last_epoch_started;
1128 }
1129 }
1130 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1131 i != infos.end();
1132 ++i) {
1133 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1134 if (min_last_update_acceptable > i->second.last_update)
1135 min_last_update_acceptable = i->second.last_update;
1136 }
1137 }
1138 if (min_last_update_acceptable == eversion_t::max())
1139 return infos.end();
1140
1141 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1142 // find osd with newest last_update (oldest for ec_pool).
1143 // if there are multiples, prefer
1144 // - a longer tail, if it brings another peer into log contiguity
1145 // - the current primary
1146 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1147 p != infos.end();
1148 ++p) {
1149 if (restrict_to_up_acting && !is_up(p->first) &&
1150 !is_acting(p->first))
1151 continue;
1152 // Only consider peers with last_update >= min_last_update_acceptable
1153 if (p->second.last_update < min_last_update_acceptable)
1154 continue;
1155 // Disqualify anyone with a too old last_epoch_started
1156 if (p->second.last_epoch_started < max_last_epoch_started_found)
1157 continue;
1158 // Disqualify anyone who is incomplete (not fully backfilled)
1159 if (p->second.is_incomplete())
1160 continue;
1161 if (best == infos.end()) {
1162 best = p;
1163 continue;
1164 }
1165 // Prefer newer last_update
1166 if (pool.info.require_rollback()) {
1167 if (p->second.last_update > best->second.last_update)
1168 continue;
1169 if (p->second.last_update < best->second.last_update) {
1170 best = p;
1171 continue;
1172 }
1173 } else {
1174 if (p->second.last_update < best->second.last_update)
1175 continue;
1176 if (p->second.last_update > best->second.last_update) {
1177 best = p;
1178 continue;
1179 }
1180 }
1181
1182 // Prefer longer tail
1183 if (p->second.log_tail > best->second.log_tail) {
1184 continue;
1185 } else if (p->second.log_tail < best->second.log_tail) {
1186 best = p;
1187 continue;
1188 }
1189
1190 // prefer current primary (usually the caller), all things being equal
1191 if (p->first == pg_whoami) {
1192 dout(10) << "calc_acting prefer osd." << p->first
1193 << " because it is current primary" << dendl;
1194 best = p;
1195 continue;
1196 }
1197 }
1198 return best;
1199 }
1200
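// Choose the acting set for an erasure-coded pool.  Each shard position is
// filled independently: prefer the up shard, then the acting shard, then
// (unless restricted to up/acting) any usable stray; up shards that cannot
// serve the position are queued for backfill.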
1201 void PG::calc_ec_acting(
1202 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1203 unsigned size,
1204 const vector<int> &acting,
1205 pg_shard_t acting_primary,
1206 const vector<int> &up,
1207 pg_shard_t up_primary,
1208 const map<pg_shard_t, pg_info_t> &all_info,
1209 bool restrict_to_up_acting,
1210 vector<int> *_want,
1211 set<pg_shard_t> *backfill,
1212 set<pg_shard_t> *acting_backfill,
1213 pg_shard_t *want_primary,
1214 ostream &ss)
1215 {
1216 vector<int> want(size, CRUSH_ITEM_NONE);
1217 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1218 unsigned usable = 0;
1219 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1220 i != all_info.end();
1221 ++i) {
1222 all_info_by_shard[i->first.shard].insert(i->first);
1223 }
1224 for (uint8_t i = 0; i < want.size(); ++i) {
1225 ss << "For position " << (unsigned)i << ": ";
1226 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1227 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1228 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1229 auth_log_shard->second.log_tail) {
1230 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1231 want[i] = up[i];
1232 ++usable;
1233 continue;
1234 }
1235 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1236 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1237 << " and ";
1238 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1239 }
1240
1241 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1242 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1243 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1244 auth_log_shard->second.log_tail) {
1245 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1246 want[i] = acting[i];
1247 ++usable;
1248 } else if (!restrict_to_up_acting) {
1249 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1250 j != all_info_by_shard[shard_id_t(i)].end();
1251 ++j) {
1252 assert(j->shard == i);
1253 if (!all_info.find(*j)->second.is_incomplete() &&
1254 all_info.find(*j)->second.last_update >=
1255 auth_log_shard->second.log_tail) {
1256 ss << " selecting stray: " << *j << std::endl;
1257 want[i] = j->osd;
1258 ++usable;
1259 break;
1260 }
1261 }
1262 if (want[i] == CRUSH_ITEM_NONE)
1263 ss << " failed to fill position " << (int)i << std::endl;
1264 }
1265 }
1266
1267 bool found_primary = false;
1268 for (uint8_t i = 0; i < want.size(); ++i) {
1269 if (want[i] != CRUSH_ITEM_NONE) {
1270 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1271 if (!found_primary) {
1272 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1273 found_primary = true;
1274 }
1275 }
1276 }
1277 acting_backfill->insert(backfill->begin(), backfill->end());
1278 _want->swap(want);
1279 }
1280
1281 /**
1282 * calculate the desired acting set.
1283 *
1284 * Choose an appropriate acting set. Prefer up[0], unless it is
1285 * incomplete, or another osd has a longer tail that allows us to
1286 * bring other up nodes up to date.
1287 */
1288 void PG::calc_replicated_acting(
1289 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1290 unsigned size,
1291 const vector<int> &acting,
1292 pg_shard_t acting_primary,
1293 const vector<int> &up,
1294 pg_shard_t up_primary,
1295 const map<pg_shard_t, pg_info_t> &all_info,
1296 bool restrict_to_up_acting,
1297 vector<int> *want,
1298 set<pg_shard_t> *backfill,
1299 set<pg_shard_t> *acting_backfill,
1300 pg_shard_t *want_primary,
1301 ostream &ss)
1302 {
1303 ss << "calc_acting newest update on osd." << auth_log_shard->first
1304 << " with " << auth_log_shard->second
1305 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1306 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1307
1308 // select primary
1309 map<pg_shard_t,pg_info_t>::const_iterator primary;
1310 if (up.size() &&
1311 !all_info.find(up_primary)->second.is_incomplete() &&
1312 all_info.find(up_primary)->second.last_update >=
1313 auth_log_shard->second.log_tail) {
1314 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1315 primary = all_info.find(up_primary); // prefer up[0], all thing being equal
1316 } else {
1317 assert(!auth_log_shard->second.is_incomplete());
1318 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1319 << " selected as primary instead" << std::endl;
1320 primary = auth_log_shard;
1321 }
1322
1323 ss << "calc_acting primary is osd." << primary->first
1324 << " with " << primary->second << std::endl;
1325 *want_primary = primary->first;
1326 want->push_back(primary->first.osd);
1327 acting_backfill->insert(primary->first);
1328 unsigned usable = 1;
1329
1330 // select replicas that have log contiguity with primary.
1331 // prefer up, then acting, then any peer_info osds
1332 for (vector<int>::const_iterator i = up.begin();
1333 i != up.end();
1334 ++i) {
1335 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1336 if (up_cand == primary->first)
1337 continue;
1338 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1339 if (cur_info.is_incomplete() ||
1340 cur_info.last_update < MIN(
1341 primary->second.log_tail,
1342 auth_log_shard->second.log_tail)) {
1343 /* We include auth_log_shard->second.log_tail because in GetLog,
1344 * we will request logs back to the min last_update over our
1345 * acting_backfill set, which will result in our log being extended
1346 * as far backwards as necessary to pick up any peers which can
1347 * be log recovered by auth_log_shard's log */
1348 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1349 backfill->insert(up_cand);
1350 acting_backfill->insert(up_cand);
1351 } else {
1352 want->push_back(*i);
1353 acting_backfill->insert(up_cand);
1354 usable++;
1355 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1356 }
1357 if (want->size() >= size) {
1358 break;
1359 }
1360 }
1361
1362   // This loop no longer adds backfill OSDs; those were handled in the up loop above.
1363 for (vector<int>::const_iterator i = acting.begin();
1364 i != acting.end();
1365 ++i) {
1366 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1367 if (usable >= size)
1368 break;
1369
1370 // skip up osds we already considered above
1371 if (acting_cand == primary->first)
1372 continue;
1373 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1374 if (up_it != up.end())
1375 continue;
1376
1377 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1378 if (cur_info.is_incomplete() ||
1379 cur_info.last_update < primary->second.log_tail) {
1380 ss << " shard " << acting_cand << " (stray) REJECTED "
1381 << cur_info << std::endl;
1382 } else {
1383 want->push_back(*i);
1384 acting_backfill->insert(acting_cand);
1385 ss << " shard " << acting_cand << " (stray) accepted "
1386 << cur_info << std::endl;
1387 usable++;
1388 }
1389 }
1390
1391 if (restrict_to_up_acting) {
1392 return;
1393 }
1394 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1395 i != all_info.end();
1396 ++i) {
1397 if (usable >= size)
1398 break;
1399
1400 // skip up osds we already considered above
1401 if (i->first == primary->first)
1402 continue;
1403 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1404 if (up_it != up.end())
1405 continue;
1406 vector<int>::const_iterator acting_it = find(
1407 acting.begin(), acting.end(), i->first.osd);
1408 if (acting_it != acting.end())
1409 continue;
1410
1411 if (i->second.is_incomplete() ||
1412 i->second.last_update < primary->second.log_tail) {
1413 ss << " shard " << i->first << " (stray) REJECTED "
1414 << i->second << std::endl;
1415 } else {
1416 want->push_back(i->first.osd);
1417 acting_backfill->insert(i->first);
1418 ss << " shard " << i->first << " (stray) accepted "
1419 << i->second << std::endl;
1420 usable++;
1421 }
1422 }
1423 }
1424
1425 /**
1426 * choose acting
1427 *
1428 * calculate the desired acting, and request a change with the monitor
1429 * if it differs from the current acting.
1430 *
1431 * if restrict_to_up_acting=true, we filter out anything that's not in
1432 * up/acting. in order to lift this restriction, we need to
1433 * 1) check whether it's worth switching the acting set any time we get
1434 * a new pg info (not just here, when recovery finishes)
1435 * 2) check whether anything in want_acting went down on each new map
1436 * (and, if so, calculate a new want_acting)
1437 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1438 * TODO!
1439 */
1440 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1441 bool restrict_to_up_acting,
1442 bool *history_les_bound)
1443 {
1444 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1445 all_info[pg_whoami] = info;
1446
1447 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1448 p != all_info.end();
1449 ++p) {
1450 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1451 }
1452
1453 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1454 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1455
1456 if (auth_log_shard == all_info.end()) {
1457 if (up != acting) {
1458 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1459 << " reverting to up" << dendl;
1460 want_acting = up;
1461 vector<int> empty;
1462 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1463 } else {
1464 dout(10) << "choose_acting failed" << dendl;
1465 assert(want_acting.empty());
1466 }
1467 return false;
1468 }
1469
1470 assert(!auth_log_shard->second.is_incomplete());
1471 auth_log_shard_id = auth_log_shard->first;
1472
1473 set<pg_shard_t> want_backfill, want_acting_backfill;
1474 vector<int> want;
1475 pg_shard_t want_primary;
1476 stringstream ss;
1477 if (!pool.info.ec_pool())
1478 calc_replicated_acting(
1479 auth_log_shard,
1480 get_osdmap()->get_pg_size(info.pgid.pgid),
1481 acting,
1482 primary,
1483 up,
1484 up_primary,
1485 all_info,
1486 restrict_to_up_acting,
1487 &want,
1488 &want_backfill,
1489 &want_acting_backfill,
1490 &want_primary,
1491 ss);
1492 else
1493 calc_ec_acting(
1494 auth_log_shard,
1495 get_osdmap()->get_pg_size(info.pgid.pgid),
1496 acting,
1497 primary,
1498 up,
1499 up_primary,
1500 all_info,
1501 restrict_to_up_acting,
1502 &want,
1503 &want_backfill,
1504 &want_acting_backfill,
1505 &want_primary,
1506 ss);
1507 dout(10) << ss.str() << dendl;
1508
1509 unsigned num_want_acting = 0;
1510 set<pg_shard_t> have;
1511 for (int i = 0; i < (int)want.size(); ++i) {
1512 if (want[i] != CRUSH_ITEM_NONE) {
1513 ++num_want_acting;
1514 have.insert(
1515 pg_shard_t(
1516 want[i],
1517 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1518 }
1519 }
1520
1521 // We go incomplete if below min_size for ec_pools since backfill
1522 // does not currently maintain rollbackability
1523 // Otherwise, we will go "peered", but not "active"
1524 if (num_want_acting < pool.info.min_size &&
1525 (pool.info.ec_pool() ||
1526 !cct->_conf->osd_allow_recovery_below_min_size)) {
1527 want_acting.clear();
1528 dout(10) << "choose_acting failed, below min size" << dendl;
1529 return false;
1530 }
1531
1532 /* Check whether we have enough acting shards to later perform recovery */
1533 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1534 get_pgbackend()->get_is_recoverable_predicate());
1535 if (!(*recoverable_predicate)(have)) {
1536 want_acting.clear();
1537 dout(10) << "choose_acting failed, not recoverable" << dendl;
1538 return false;
1539 }
1540
1541 if (want != acting) {
1542 dout(10) << "choose_acting want " << want << " != acting " << acting
1543 << ", requesting pg_temp change" << dendl;
1544 want_acting = want;
1545
1546 if (want_acting == up) {
1547 // There can't be any pending backfill if
1548 // want is the same as crush map up OSDs.
1549 assert(want_backfill.empty());
1550 vector<int> empty;
1551 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1552 } else
1553 osd->queue_want_pg_temp(info.pgid.pgid, want);
1554 return false;
1555 }
1556 want_acting.clear();
1557 actingbackfill = want_acting_backfill;
1558 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1559 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1560 if (backfill_targets.empty()) {
1561 // Caller is GetInfo
1562 backfill_targets = want_backfill;
1563 }
1564 // Will not change if already set because up would have had to change
1565 // Verify that nothing in backfill is in stray_set
1566 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1567 i != want_backfill.end();
1568 ++i) {
1569 assert(stray_set.find(*i) == stray_set.end());
1570 }
1571 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1572 << want_backfill << dendl;
1573 return true;
1574 }
1575
1576 /* Build the might_have_unfound set.
1577 *
1578 * This is used by the primary OSD during recovery.
1579 *
1580 * This set tracks the OSDs which might have unfound objects that the primary
1581 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1582 * will remove the OSD from the set.
1583 */
1584 void PG::build_might_have_unfound()
1585 {
1586 assert(might_have_unfound.empty());
1587 assert(is_primary());
1588
1589 dout(10) << __func__ << dendl;
1590
1591 check_past_interval_bounds();
1592
1593 might_have_unfound = past_intervals.get_might_have_unfound(
1594 pg_whoami,
1595 pool.info.ec_pool());
1596
1597 // include any (stray) peers
1598 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1599 p != peer_info.end();
1600 ++p)
1601 might_have_unfound.insert(p->first);
1602
1603 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1604 }
1605
1606 struct C_PG_ActivateCommitted : public Context {
1607 PGRef pg;
1608 epoch_t epoch;
1609 epoch_t activation_epoch;
1610 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1611 : pg(p), epoch(e), activation_epoch(ae) {}
1612 void finish(int r) override {
1613 pg->_activate_committed(epoch, activation_epoch);
1614 }
1615 };
1616
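// Drive the PG to active (or peered): update last_epoch_started, mark
// info/log dirty, register the on-commit callback, and, on the primary,
// bring each acting/backfill peer up to date via pg_notify, an incremental
// MOSDPGLog, or a backfill restart, while (re)building missing_loc.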
1617 void PG::activate(ObjectStore::Transaction& t,
1618 epoch_t activation_epoch,
1619 list<Context*>& tfin,
1620 map<int, map<spg_t,pg_query_t> >& query_map,
1621 map<int,
1622 vector<
1623 pair<pg_notify_t,
1624 PastIntervals> > > *activator_map,
1625 RecoveryCtx *ctx)
1626 {
1627 assert(!is_peered());
1628 assert(scrubber.callbacks.empty());
1629 assert(callbacks_for_degraded_object.empty());
1630
1631 // twiddle pg state
1632 state_clear(PG_STATE_DOWN);
1633
1634 send_notify = false;
1635
1636 if (is_primary()) {
1637 // only update primary last_epoch_started if we will go active
1638 if (acting.size() >= pool.info.min_size) {
1639 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1640 info.last_epoch_started <= activation_epoch);
1641 info.last_epoch_started = activation_epoch;
1642 info.last_interval_started = info.history.same_interval_since;
1643 }
1644 } else if (is_acting(pg_whoami)) {
1645 /* update last_epoch_started on acting replica to whatever the primary sent
1646 * unless it's smaller (could happen if we are going peered rather than
1647 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1648 if (info.last_epoch_started < activation_epoch) {
1649 info.last_epoch_started = activation_epoch;
1650 info.last_interval_started = info.history.same_interval_since;
1651 }
1652 }
1653
1654 auto &missing = pg_log.get_missing();
1655
1656 if (is_primary()) {
1657 last_update_ondisk = info.last_update;
1658 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1659 }
1660 last_update_applied = info.last_update;
1661 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1662
1663 need_up_thru = false;
1664
1665 // write pg info, log
1666 dirty_info = true;
1667 dirty_big_info = true; // maybe
1668
1669 // find out when we commit
1670 t.register_on_complete(
1671 new C_PG_ActivateCommitted(
1672 this,
1673 get_osdmap()->get_epoch(),
1674 activation_epoch));
1675
1676 // initialize snap_trimq
1677 if (is_primary()) {
1678 dout(20) << "activate - purged_snaps " << info.purged_snaps
1679 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1680 snap_trimq = pool.cached_removed_snaps;
1681 interval_set<snapid_t> intersection;
1682 intersection.intersection_of(snap_trimq, info.purged_snaps);
1683 if (intersection == info.purged_snaps) {
1684 snap_trimq.subtract(info.purged_snaps);
1685 } else {
1686 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1687 << ") is not a subset of pool.cached_removed_snaps ("
1688 << pool.cached_removed_snaps << ")" << dendl;
1689 snap_trimq.subtract(intersection);
1690 }
1691 }
1692
1693 // init complete pointer
1694 if (missing.num_missing() == 0) {
1695 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1696 << " -> " << info.last_update << dendl;
1697 info.last_complete = info.last_update;
1698 pg_log.reset_recovery_pointers();
1699 } else {
1700 dout(10) << "activate - not complete, " << missing << dendl;
1701 pg_log.activate_not_complete(info);
1702 }
1703
1704 log_weirdness();
1705
1706 // if primary..
1707 if (is_primary()) {
1708 assert(ctx);
1709 // start up replicas
1710
1711 assert(!actingbackfill.empty());
1712 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1713 i != actingbackfill.end();
1714 ++i) {
1715 if (*i == pg_whoami) continue;
1716 pg_shard_t peer = *i;
1717 assert(peer_info.count(peer));
1718 pg_info_t& pi = peer_info[peer];
1719
1720 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1721
1722 MOSDPGLog *m = 0;
1723 assert(peer_missing.count(peer));
1724 pg_missing_t& pm = peer_missing[peer];
1725
1726 bool needs_past_intervals = pi.dne();
1727
1728 /*
1729 * cover case where peer sort order was different and
1730 * last_backfill cannot be interpreted
1731 */
1732 bool force_restart_backfill =
1733 !pi.last_backfill.is_max() &&
1734 !pi.last_backfill_bitwise;
1735
1736 if (pi.last_update == info.last_update && !force_restart_backfill) {
1737 // empty log
1738 if (!pi.last_backfill.is_max())
1739 osd->clog->info() << info.pgid << " continuing backfill to osd."
1740 << peer
1741 << " from (" << pi.log_tail << "," << pi.last_update
1742 << "] " << pi.last_backfill
1743 << " to " << info.last_update;
1744 if (!pi.is_empty() && activator_map) {
1745 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1746 (*activator_map)[peer.osd].push_back(
1747 make_pair(
1748 pg_notify_t(
1749 peer.shard, pg_whoami.shard,
1750 get_osdmap()->get_epoch(),
1751 get_osdmap()->get_epoch(),
1752 info),
1753 past_intervals));
1754 } else {
1755 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1756 m = new MOSDPGLog(
1757 i->shard, pg_whoami.shard,
1758 get_osdmap()->get_epoch(), info);
1759 }
1760 } else if (
1761 pg_log.get_tail() > pi.last_update ||
1762 pi.last_backfill == hobject_t() ||
1763 force_restart_backfill ||
1764 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1765 /* ^ This last case covers a situation where a replica is not contiguous
1766 * with the auth_log, but is contiguous with this replica. Reshuffling
1767 * the active set to handle this would be tricky, so instead we just go
1768        * ahead and backfill it anyway. This is probably preferable in any
1769 * case since the replica in question would have to be significantly
1770 * behind.
1771 */
1772 // backfill
1773 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1774 << " from (" << pi.log_tail << "," << pi.last_update
1775 << "] " << pi.last_backfill
1776 << " to " << info.last_update;
1777
1778 pi.last_update = info.last_update;
1779 pi.last_complete = info.last_update;
1780 pi.set_last_backfill(hobject_t());
1781 pi.last_epoch_started = info.last_epoch_started;
1782 pi.last_interval_started = info.last_interval_started;
1783 pi.history = info.history;
1784 pi.hit_set = info.hit_set;
1785 pi.stats.stats.clear();
1786
1787 // initialize peer with our purged_snaps.
1788 pi.purged_snaps = info.purged_snaps;
1789
1790 m = new MOSDPGLog(
1791 i->shard, pg_whoami.shard,
1792 get_osdmap()->get_epoch(), pi);
1793
1794 // send some recent log, so that op dup detection works well.
1795 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1796 m->info.log_tail = m->log.tail;
1797 pi.log_tail = m->log.tail; // sigh...
1798
1799 pm.clear();
1800 } else {
1801 // catch up
1802 assert(pg_log.get_tail() <= pi.last_update);
1803 m = new MOSDPGLog(
1804 i->shard, pg_whoami.shard,
1805 get_osdmap()->get_epoch(), info);
1806 // send new stuff to append to replicas log
1807 m->log.copy_after(pg_log.get_log(), pi.last_update);
1808 }
1809
1810 // share past_intervals if we are creating the pg on the replica
1811 // based on whether our info for that peer was dne() *before*
1812 // updating pi.history in the backfill block above.
1813 if (m && needs_past_intervals)
1814 m->past_intervals = past_intervals;
1815
1816 // update local version of peer's missing list!
1817 if (m && pi.last_backfill != hobject_t()) {
1818 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1819 p != m->log.log.end();
1820 ++p) {
1821 if (p->soid <= pi.last_backfill &&
1822 !p->is_error()) {
1823 if (perform_deletes_during_peering() && p->is_delete()) {
1824 pm.rm(p->soid, p->version);
1825 } else {
1826 pm.add_next_event(*p);
1827 }
1828 }
1829 }
1830 }
1831
1832 if (m) {
1833 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1834 //m->log.print(cout);
1835 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1836 }
1837
1838 // peer now has
1839 pi.last_update = info.last_update;
1840
1841 // update our missing
1842 if (pm.num_missing() == 0) {
1843 pi.last_complete = pi.last_update;
1844 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1845 } else {
1846 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1847 }
1848 }
1849
1850 // Set up missing_loc
1851 set<pg_shard_t> complete_shards;
1852 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1853 i != actingbackfill.end();
1854 ++i) {
1855 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1856 if (*i == get_primary()) {
1857 missing_loc.add_active_missing(missing);
1858 if (!missing.have_missing())
1859 complete_shards.insert(*i);
1860 } else {
1861 auto peer_missing_entry = peer_missing.find(*i);
1862 assert(peer_missing_entry != peer_missing.end());
1863 missing_loc.add_active_missing(peer_missing_entry->second);
1864 if (!peer_missing_entry->second.have_missing() &&
1865 peer_info[*i].last_backfill.is_max())
1866 complete_shards.insert(*i);
1867 }
1868 }
1869
1870 // If necessary, create might_have_unfound to help us find our unfound objects.
1871 // NOTE: It's important that we build might_have_unfound before trimming the
1872 // past intervals.
1873 might_have_unfound.clear();
1874 if (needs_recovery()) {
1875 // If only one shard has missing objects, we can add all the others as
1876 // recovery sources. This is considered safe since the PGLogs have already
1877 // been merged locally, and it covers the vast majority of use cases, e.g.
1878 // one OSD/host being down for a while for hardware repair.
1879 if (complete_shards.size() + 1 == actingbackfill.size()) {
1880 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1881 } else {
1882 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1883 ctx->handle);
1884 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1885 i != actingbackfill.end();
1886 ++i) {
1887 if (*i == pg_whoami) continue;
1888 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1889 assert(peer_missing.count(*i));
1890 assert(peer_info.count(*i));
1891 missing_loc.add_source_info(
1892 *i,
1893 peer_info[*i],
1894 peer_missing[*i],
1895 ctx->handle);
1896 }
1897 }
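      // Illustrative example (not part of the build) of the branch above:
      // with actingbackfill = {0,1,2} and only osd.1 having missing objects,
      // complete_shards = {0,2} and complete_shards.size() + 1 ==
      // actingbackfill.size(), so both complete shards are registered as
      // recovery sources in a single batch call; otherwise we fall back to
      // add_source_info(), which consults each peer's missing set and
      // last_backfill per object.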
1898 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1899 i != peer_missing.end();
1900 ++i) {
1901 if (is_actingbackfill(i->first))
1902 continue;
1903 assert(peer_info.count(i->first));
1904 search_for_missing(
1905 peer_info[i->first],
1906 i->second,
1907 i->first,
1908 ctx);
1909 }
1910
1911 build_might_have_unfound();
1912
1913 // Always call now so _update_calc_stats() will be accurate
1914 discover_all_missing(query_map);
1915 }
1916
1917 // num_objects_degraded, if calculated, should reflect this too, unless
1918 // nothing is missing and we are about to go clean.
1919 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1920 state_set(PG_STATE_UNDERSIZED);
1921 }
1922
1923 state_set(PG_STATE_ACTIVATING);
1924 release_pg_backoffs();
1925 projected_last_update = info.last_update;
1926 }
1927 if (acting.size() >= pool.info.min_size) {
1928 PGLogEntryHandler handler{this, &t};
1929 pg_log.roll_forward(&handler);
1930 }
1931 }
1932
1933 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1934 {
1935 // only check MOSDOp
1936 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1937 return true;
1938
1939 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1940
1941 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1942 if (!session) {
1943 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1944 return false;
1945 }
1946 OSDCap& caps = session->caps;
1947 session->put();
1948
1949 const string &key = req->get_hobj().get_key().empty() ?
1950 req->get_oid().name :
1951 req->get_hobj().get_key();
1952
1953 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1954 pool.auid, key,
1955 op->need_read_cap(),
1956 op->need_write_cap(),
1957 op->classes());
1958
1959 dout(20) << "op_has_sufficient_caps "
1960 << "session=" << session
1961 << " pool=" << pool.id << " (" << pool.name
1962 << " " << req->get_hobj().nspace
1963 << ") owner=" << pool.auid
1964 << " need_read_cap=" << op->need_read_cap()
1965 << " need_write_cap=" << op->need_write_cap()
1966 << " classes=" << op->classes()
1967 << " -> " << (cap ? "yes" : "NO")
1968 << dendl;
1969 return cap;
1970 }
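// Illustrative example (not part of the build): how a client cap string maps
// onto the is_capable() call above.  A client authorized with, say,
// "allow rw pool=rbd" would roughly behave as
//
//   caps.is_capable("rbd", "" /*nspace*/, 0 /*pool auid*/, "myobject",
//                   true /*read*/, true /*write*/, {} /*classes*/)  -> true
//   caps.is_capable("cephfs_data", "", 0, "myobject",
//                   true, true, {})                                 -> false
//
// The namespace, the object (locator) key and any invoked rados classes all
// participate in the match, which is why the key is derived from the hobject
// above.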
1971
1972 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1973 {
1974 lock();
1975 if (pg_has_reset_since(epoch)) {
1976 dout(10) << "_activate_committed " << epoch
1977 << ", that was an old interval" << dendl;
1978 } else if (is_primary()) {
1979 peer_activated.insert(pg_whoami);
1980 dout(10) << "_activate_committed " << epoch
1981 << " peer_activated now " << peer_activated
1982 << " last_interval_started " << info.history.last_interval_started
1983 << " last_epoch_started " << info.history.last_epoch_started
1984 << " same_interval_since " << info.history.same_interval_since << dendl;
1985 assert(!actingbackfill.empty());
1986 if (peer_activated.size() == actingbackfill.size())
1987 all_activated_and_committed();
1988 } else {
1989 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1990 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1991 pg_notify_t i = pg_notify_t(
1992 get_primary().shard, pg_whoami.shard,
1993 get_osdmap()->get_epoch(),
1994 get_osdmap()->get_epoch(),
1995 info);
1996
1997 i.info.history.last_epoch_started = activation_epoch;
1998 i.info.history.last_interval_started = i.info.history.same_interval_since;
1999 if (acting.size() >= pool.info.min_size) {
2000 state_set(PG_STATE_ACTIVE);
2001 } else {
2002 state_set(PG_STATE_PEERED);
2003 }
2004
2005 m->pg_list.push_back(make_pair(i, PastIntervals()));
2006 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
2007
2008 // waiters
2009 if (flushes_in_progress == 0) {
2010 requeue_ops(waiting_for_peered);
2011 } else if (!waiting_for_peered.empty()) {
2012 dout(10) << __func__ << " flushes in progress, moving "
2013 << waiting_for_peered.size() << " items to waiting_for_flush"
2014 << dendl;
2015 assert(waiting_for_flush.empty());
2016 waiting_for_flush.swap(waiting_for_peered);
2017 }
2018 }
2019
2020 assert(!dirty_info);
2021
2022 unlock();
2023 }
2024
2025 /*
2026 * update info.history.last_epoch_started ONLY after we and all
2027 * replicas have activated AND committed the activate transaction
2028 * (i.e. the peering results are stable on disk).
2029 */
2030 void PG::all_activated_and_committed()
2031 {
2032 dout(10) << "all_activated_and_committed" << dendl;
2033 assert(is_primary());
2034 assert(peer_activated.size() == actingbackfill.size());
2035 assert(!actingbackfill.empty());
2036 assert(blocked_by.empty());
2037
2038 // Degraded?
2039 _update_calc_stats();
2040 if (info.stats.stats.sum.num_objects_degraded) {
2041 state_set(PG_STATE_DEGRADED);
2042 } else {
2043 state_clear(PG_STATE_DEGRADED);
2044 }
2045
2046 queue_peering_event(
2047 CephPeeringEvtRef(
2048 std::make_shared<CephPeeringEvt>(
2049 get_osdmap()->get_epoch(),
2050 get_osdmap()->get_epoch(),
2051 AllReplicasActivated())));
2052 }
2053
2054 bool PG::requeue_scrub(bool high_priority)
2055 {
2056 assert(is_locked());
2057 if (scrub_queued) {
2058 dout(10) << __func__ << ": already queued" << dendl;
2059 return false;
2060 } else {
2061 dout(10) << __func__ << ": queueing" << dendl;
2062 scrub_queued = true;
2063 osd->queue_for_scrub(this, high_priority);
2064 return true;
2065 }
2066 }
2067
2068 void PG::queue_recovery()
2069 {
2070 if (!is_primary() || !is_peered()) {
2071 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2072 assert(!recovery_queued);
2073 } else if (recovery_queued) {
2074 dout(10) << "queue_recovery -- already queued" << dendl;
2075 } else {
2076 dout(10) << "queue_recovery -- queuing" << dendl;
2077 recovery_queued = true;
2078 osd->queue_for_recovery(this);
2079 }
2080 }
2081
2082 bool PG::queue_scrub()
2083 {
2084 assert(is_locked());
2085 if (is_scrubbing()) {
2086 return false;
2087 }
2088 scrubber.priority = scrubber.must_scrub ?
2089 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2090 scrubber.must_scrub = false;
2091 state_set(PG_STATE_SCRUBBING);
2092 if (scrubber.must_deep_scrub) {
2093 state_set(PG_STATE_DEEP_SCRUB);
2094 scrubber.must_deep_scrub = false;
2095 }
2096 if (scrubber.must_repair || scrubber.auto_repair) {
2097 state_set(PG_STATE_REPAIR);
2098 scrubber.must_repair = false;
2099 }
2100 requeue_scrub();
2101 return true;
2102 }
2103
2104 unsigned PG::get_scrub_priority()
2105 {
2106 // a higher value -> a higher priority
2107 int pool_scrub_priority = 0;
2108 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2109 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2110 }
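// Illustrative note (not part of the build): precedence of the two knobs read
// above.  A positive per-pool SCRUB_PRIORITY option wins; otherwise the
// cluster-wide osd_scrub_priority setting applies.  With hypothetical values:
//
//   pool scrub_priority = 7,       osd_scrub_priority = 5  -> returns 7
//   pool scrub_priority unset (0), osd_scrub_priority = 5  -> returns 5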
2111
2112 struct C_PG_FinishRecovery : public Context {
2113 PGRef pg;
2114 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2115 void finish(int r) override {
2116 pg->_finish_recovery(this);
2117 }
2118 };
2119
2120 void PG::mark_clean()
2121 {
2122 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2123 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2124 state_set(PG_STATE_CLEAN);
2125 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2126 info.history.last_interval_clean = info.history.same_interval_since;
2127 past_intervals.clear();
2128 dirty_big_info = true;
2129 dirty_info = true;
2130 }
2131
2132 kick_snap_trim();
2133 }
2134
2135 bool PG::set_force_recovery(bool b)
2136 {
2137 bool did = false;
2138 lock();
2139 if (!deleting) {
2140 if (b) {
2141 if (!(state & PG_STATE_FORCED_RECOVERY) &&
2142 (state & (PG_STATE_DEGRADED |
2143 PG_STATE_RECOVERY_WAIT |
2144 PG_STATE_RECOVERING))) {
2145 dout(20) << __func__ << " set" << dendl;
2146 state_set(PG_STATE_FORCED_RECOVERY);
2147 publish_stats_to_osd();
2148 did = true;
2149 }
2150 } else if (state & PG_STATE_FORCED_RECOVERY) {
2151 dout(20) << __func__ << " clear" << dendl;
2152 state_clear(PG_STATE_FORCED_RECOVERY);
2153 publish_stats_to_osd();
2154 did = true;
2155 }
2156 }
2157 unlock();
2158 if (did) {
2159 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2160 osd->local_reserver.update_priority(info.pgid, get_recovery_priority());
2161 }
2162 return did;
2163 }
2164
2165 bool PG::set_force_backfill(bool b)
2166 {
2167 bool did = false;
2168 lock();
2169 if (!deleting) {
2170 if (b) {
2171 if (!(state & PG_STATE_FORCED_BACKFILL) &&
2172 (state & (PG_STATE_DEGRADED |
2173 PG_STATE_BACKFILL_WAIT |
2174 PG_STATE_BACKFILLING))) {
2175 dout(10) << __func__ << " set" << dendl;
2176 state_set(PG_STATE_FORCED_BACKFILL);
2177 publish_stats_to_osd();
2178 did = true;
2179 }
2180 } else if (state & PG_STATE_FORCED_BACKFILL) {
2181 dout(10) << __func__ << " clear" << dendl;
2182 state_clear(PG_STATE_FORCED_BACKFILL);
2183 publish_stats_to_osd();
2184 did = true;
2185 }
2186 }
2187 unlock();
2188 if (did) {
2189 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2190 osd->local_reserver.update_priority(info.pgid, get_backfill_priority());
2191 }
2192 return did;
2193 }
2194
2195 inline int PG::clamp_recovery_priority(int priority)
2196 {
2197 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2198 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2199
2200 // Clamp to valid range
2201 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2202 return OSD_RECOVERY_PRIORITY_MAX;
2203 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2204 return OSD_RECOVERY_PRIORITY_MIN;
2205 } else {
2206 return priority;
2207 }
2208 }
2209
2210 unsigned PG::get_recovery_priority()
2211 {
2212 // a higher value -> a higher priority
2213 int ret = 0;
2214
2215 if (state & PG_STATE_FORCED_RECOVERY) {
2216 ret = OSD_RECOVERY_PRIORITY_FORCED;
2217 } else {
2218 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2219 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2220 }
2221 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2222 return static_cast<unsigned>(ret);
2223 }
2224
2225 unsigned PG::get_backfill_priority()
2226 {
2227 // a higher value -> a higher priority
2228 int ret = OSD_BACKFILL_PRIORITY_BASE;
2229 if (state & PG_STATE_FORCED_BACKFILL) {
2230 ret = OSD_BACKFILL_PRIORITY_FORCED;
2231 } else {
2232 if (acting.size() < pool.info.min_size) {
2233 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2234 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2235
2236 } else if (is_undersized()) {
2237 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2238 assert(pool.info.size > actingset.size());
2239 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2240
2241 } else if (is_degraded()) {
2242 // degraded: baseline degraded
2243 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2244 }
2245
2246 // Adjust with pool's recovery priority
2247 int pool_recovery_priority = 0;
2248 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2249
2250 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2251 }
2252
2253 return static_cast<unsigned>(ret);
2254 }
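// Illustrative sketch (not part of the build) of the tiers computed above,
// written with the symbolic bases since their numeric values are defined
// elsewhere.  For a hypothetical replicated pool with size=3, min_size=2:
//
//   acting={0} (below min_size, IO blocked):
//       OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (2 - 1)
//   actingset={0,1} with the pool wanting 3 copies (undersized):
//       OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (3 - 2)
//   degraded but not undersized:
//       OSD_BACKFILL_DEGRADED_PRIORITY_BASE
//   otherwise:
//       OSD_BACKFILL_PRIORITY_BASE
//
// The pool's RECOVERY_PRIORITY option is then added and the result clamped to
// [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX]; a forced backfill
// skips all of this and returns OSD_BACKFILL_PRIORITY_FORCED directly.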
2255
2256 void PG::finish_recovery(list<Context*>& tfin)
2257 {
2258 dout(10) << "finish_recovery" << dendl;
2259 assert(info.last_complete == info.last_update);
2260
2261 clear_recovery_state();
2262
2263 /*
2264 * sync all this before purging strays. but don't block!
2265 */
2266 finish_sync_event = new C_PG_FinishRecovery(this);
2267 tfin.push_back(finish_sync_event);
2268 }
2269
2270 void PG::_finish_recovery(Context *c)
2271 {
2272 lock();
2273 if (deleting) {
2274 unlock();
2275 return;
2276 }
2277 if (c == finish_sync_event) {
2278 dout(10) << "_finish_recovery" << dendl;
2279 finish_sync_event = 0;
2280 purge_strays();
2281
2282 publish_stats_to_osd();
2283
2284 if (scrub_after_recovery) {
2285 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2286 scrub_after_recovery = false;
2287 scrubber.must_deep_scrub = true;
2288 queue_scrub();
2289 }
2290 } else {
2291 dout(10) << "_finish_recovery -- stale" << dendl;
2292 }
2293 unlock();
2294 }
2295
2296 void PG::start_recovery_op(const hobject_t& soid)
2297 {
2298 dout(10) << "start_recovery_op " << soid
2299 #ifdef DEBUG_RECOVERY_OIDS
2300 << " (" << recovering_oids << ")"
2301 #endif
2302 << dendl;
2303 assert(recovery_ops_active >= 0);
2304 recovery_ops_active++;
2305 #ifdef DEBUG_RECOVERY_OIDS
2306 assert(recovering_oids.count(soid) == 0);
2307 recovering_oids.insert(soid);
2308 #endif
2309 osd->start_recovery_op(this, soid);
2310 }
2311
2312 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2313 {
2314 dout(10) << "finish_recovery_op " << soid
2315 #ifdef DEBUG_RECOVERY_OIDS
2316 << " (" << recovering_oids << ")"
2317 #endif
2318 << dendl;
2319 assert(recovery_ops_active > 0);
2320 recovery_ops_active--;
2321 #ifdef DEBUG_RECOVERY_OIDS
2322 assert(recovering_oids.count(soid));
2323 recovering_oids.erase(soid);
2324 #endif
2325 osd->finish_recovery_op(this, soid, dequeue);
2326
2327 if (!dequeue) {
2328 queue_recovery();
2329 }
2330 }
2331
2332 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2333 {
2334 child->update_snap_mapper_bits(split_bits);
2335 child->update_osdmap_ref(get_osdmap());
2336
2337 child->pool = pool;
2338
2339 // Log
2340 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2341 child->info.last_complete = info.last_complete;
2342
2343 info.last_update = pg_log.get_head();
2344 child->info.last_update = child->pg_log.get_head();
2345
2346 child->info.last_user_version = info.last_user_version;
2347
2348 info.log_tail = pg_log.get_tail();
2349 child->info.log_tail = child->pg_log.get_tail();
2350
2351 if (info.last_complete < pg_log.get_tail())
2352 info.last_complete = pg_log.get_tail();
2353 if (child->info.last_complete < child->pg_log.get_tail())
2354 child->info.last_complete = child->pg_log.get_tail();
2355
2356 // Info
2357 child->info.history = info.history;
2358 child->info.history.epoch_created = get_osdmap()->get_epoch();
2359 child->info.purged_snaps = info.purged_snaps;
2360
2361 if (info.last_backfill.is_max()) {
2362 child->info.set_last_backfill(hobject_t::get_max());
2363 } else {
2364 // restart backfill on parent and child to be safe. we could
2365 // probably do better in the bitwise sort case, but it's more
2366 // fragile (there may be special work to do on backfill completion
2367 // in the future).
2368 info.set_last_backfill(hobject_t());
2369 child->info.set_last_backfill(hobject_t());
2370 // restarting backfill implies that the missing set is empty,
2371 // since it is only used for objects prior to last_backfill
2372 pg_log.reset_backfill();
2373 child->pg_log.reset_backfill();
2374 }
2375
2376 child->info.stats = info.stats;
2377 child->info.stats.parent_split_bits = split_bits;
2378 info.stats.stats_invalid = true;
2379 child->info.stats.stats_invalid = true;
2380 child->info.last_epoch_started = info.last_epoch_started;
2381 child->info.last_interval_started = info.last_interval_started;
2382
2383 child->snap_trimq = snap_trimq;
2384
2385 // There can't be recovery/backfill going on now
2386 int primary, up_primary;
2387 vector<int> newup, newacting;
2388 get_osdmap()->pg_to_up_acting_osds(
2389 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2390 child->init_primary_up_acting(
2391 newup,
2392 newacting,
2393 up_primary,
2394 primary);
2395 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2396
2397 // this comparison includes primary rank via pg_shard_t
2398 if (get_primary() != child->get_primary())
2399 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2400
2401 child->info.stats.up = up;
2402 child->info.stats.up_primary = up_primary;
2403 child->info.stats.acting = acting;
2404 child->info.stats.acting_primary = primary;
2405 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2406
2407 // History
2408 child->past_intervals = past_intervals;
2409
2410 _split_into(child_pgid, child, split_bits);
2411
2412 // release all backoffs for simplicity
2413 release_backoffs(hobject_t(), hobject_t::get_max());
2414
2415 child->on_new_interval();
2416
2417 child->dirty_info = true;
2418 child->dirty_big_info = true;
2419 dirty_info = true;
2420 dirty_big_info = true;
2421 }
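// Illustrative note (not part of the build): split_bits is the number of low
// placement-hash bits that distinguish the PGs after the split.  As a
// hypothetical example, when a pool grows from pg_num=4 to pg_num=8, pg 1.2
// splits off child 1.6 and split_bits becomes 3: objects (and log entries,
// via pg_log.split_into() above) whose hash keeps the third-lowest bit clear
// stay in 1.2, while those with it set move to 1.6.  Stats are marked invalid
// on both sides because the exact per-PG object counts are no longer known.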
2422
2423 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2424 {
2425 ConnectionRef con = s->con;
2426 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2427 return;
2428 BackoffRef b(s->have_backoff(info.pgid, begin));
2429 if (b) {
2430 derr << __func__ << " already have backoff for " << s << " begin " << begin
2431 << " " << *b << dendl;
2432 ceph_abort();
2433 }
2434 Mutex::Locker l(backoff_lock);
2435 {
2436 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2437 backoffs[begin].insert(b);
2438 s->add_backoff(b);
2439 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2440 }
2441 con->send_message(
2442 new MOSDBackoff(
2443 info.pgid,
2444 get_osdmap()->get_epoch(),
2445 CEPH_OSD_BACKOFF_OP_BLOCK,
2446 b->id,
2447 begin,
2448 end));
2449 }
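// Illustrative sketch (not part of the build) of the backoff exchange started
// above; the client is expected to ack the block with the same backoff id:
//
//   osd    -> client  MOSDBackoff(CEPH_OSD_BACKOFF_OP_BLOCK,     id, begin, end)
//   client -> osd     MOSDBackoff(CEPH_OSD_BACKOFF_OP_ACK_BLOCK, id, ...)
//   ...later, once the range is released (see release_backoffs() below)...
//   osd    -> client  MOSDBackoff(CEPH_OSD_BACKOFF_OP_UNBLOCK,   id, begin, end)
//
// Until the ack arrives the Backoff remains "new", which is why
// release_backoffs() and clear_backoffs() below only flip such entries to
// STATE_DELETING instead of detaching them from the session immediately.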
2450
2451 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2452 {
2453 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2454 vector<BackoffRef> bv;
2455 {
2456 Mutex::Locker l(backoff_lock);
2457 auto p = backoffs.lower_bound(begin);
2458 while (p != backoffs.end()) {
2459 int r = cmp(p->first, end);
2460 dout(20) << __func__ << " ? " << r << " " << p->first
2461 << " " << p->second << dendl;
2462 // note: must still examine begin=end=p->first case
2463 if (r > 0 || (r == 0 && begin < end)) {
2464 break;
2465 }
2466 dout(20) << __func__ << " checking " << p->first
2467 << " " << p->second << dendl;
2468 auto q = p->second.begin();
2469 while (q != p->second.end()) {
2470 dout(20) << __func__ << " checking " << *q << dendl;
2471 int r = cmp((*q)->begin, begin);
2472 if (r == 0 || (r > 0 && (*q)->end < end)) {
2473 bv.push_back(*q);
2474 q = p->second.erase(q);
2475 } else {
2476 ++q;
2477 }
2478 }
2479 if (p->second.empty()) {
2480 p = backoffs.erase(p);
2481 } else {
2482 ++p;
2483 }
2484 }
2485 }
2486 for (auto b : bv) {
2487 Mutex::Locker l(b->lock);
2488 dout(10) << __func__ << " " << *b << dendl;
2489 if (b->session) {
2490 assert(b->pg == this);
2491 ConnectionRef con = b->session->con;
2492 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2493 con->send_message(
2494 new MOSDBackoff(
2495 info.pgid,
2496 get_osdmap()->get_epoch(),
2497 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2498 b->id,
2499 b->begin,
2500 b->end));
2501 }
2502 if (b->is_new()) {
2503 b->state = Backoff::STATE_DELETING;
2504 } else {
2505 b->session->rm_backoff(b);
2506 b->session.reset();
2507 }
2508 b->pg.reset();
2509 }
2510 }
2511 }
2512
2513 void PG::clear_backoffs()
2514 {
2515 dout(10) << __func__ << " " << dendl;
2516 map<hobject_t,set<BackoffRef>> ls;
2517 {
2518 Mutex::Locker l(backoff_lock);
2519 ls.swap(backoffs);
2520 }
2521 for (auto& p : ls) {
2522 for (auto& b : p.second) {
2523 Mutex::Locker l(b->lock);
2524 dout(10) << __func__ << " " << *b << dendl;
2525 if (b->session) {
2526 assert(b->pg == this);
2527 if (b->is_new()) {
2528 b->state = Backoff::STATE_DELETING;
2529 } else {
2530 b->session->rm_backoff(b);
2531 b->session.reset();
2532 }
2533 b->pg.reset();
2534 }
2535 }
2536 }
2537 }
2538
2539 // called by Session::clear_backoffs()
2540 void PG::rm_backoff(BackoffRef b)
2541 {
2542 dout(10) << __func__ << " " << *b << dendl;
2543 Mutex::Locker l(backoff_lock);
2544 assert(b->lock.is_locked_by_me());
2545 assert(b->pg == this);
2546 auto p = backoffs.find(b->begin);
2547 // may race with release_backoffs()
2548 if (p != backoffs.end()) {
2549 auto q = p->second.find(b);
2550 if (q != p->second.end()) {
2551 p->second.erase(q);
2552 if (p->second.empty()) {
2553 backoffs.erase(p);
2554 }
2555 }
2556 }
2557 }
2558
2559 void PG::clear_recovery_state()
2560 {
2561 dout(10) << "clear_recovery_state" << dendl;
2562
2563 pg_log.reset_recovery_pointers();
2564 finish_sync_event = 0;
2565
2566 hobject_t soid;
2567 while (recovery_ops_active > 0) {
2568 #ifdef DEBUG_RECOVERY_OIDS
2569 soid = *recovering_oids.begin();
2570 #endif
2571 finish_recovery_op(soid, true);
2572 }
2573
2574 backfill_targets.clear();
2575 backfill_info.clear();
2576 peer_backfill_info.clear();
2577 waiting_on_backfill.clear();
2578 _clear_recovery_state(); // pg impl specific hook
2579 }
2580
2581 void PG::cancel_recovery()
2582 {
2583 dout(10) << "cancel_recovery" << dendl;
2584 clear_recovery_state();
2585 }
2586
2587
2588 void PG::purge_strays()
2589 {
2590 dout(10) << "purge_strays " << stray_set << dendl;
2591
2592 bool removed = false;
2593 for (set<pg_shard_t>::iterator p = stray_set.begin();
2594 p != stray_set.end();
2595 ++p) {
2596 assert(!is_actingbackfill(*p));
2597 if (get_osdmap()->is_up(p->osd)) {
2598 dout(10) << "sending PGRemove to osd." << *p << dendl;
2599 vector<spg_t> to_remove;
2600 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2601 MOSDPGRemove *m = new MOSDPGRemove(
2602 get_osdmap()->get_epoch(),
2603 to_remove);
2604 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2605 } else {
2606 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2607 }
2608 peer_missing.erase(*p);
2609 peer_info.erase(*p);
2610 peer_purged.insert(*p);
2611 removed = true;
2612 }
2613
2614 // if we removed anyone, update peers (which include peer_info)
2615 if (removed)
2616 update_heartbeat_peers();
2617
2618 stray_set.clear();
2619
2620 // clear _requested maps; we may have to peer() again if we discover
2621 // (more) stray content
2622 peer_log_requested.clear();
2623 peer_missing_requested.clear();
2624 }
2625
2626 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2627 {
2628 Mutex::Locker l(heartbeat_peer_lock);
2629 probe_targets.clear();
2630 for (set<pg_shard_t>::iterator i = probe_set.begin();
2631 i != probe_set.end();
2632 ++i) {
2633 probe_targets.insert(i->osd);
2634 }
2635 }
2636
2637 void PG::clear_probe_targets()
2638 {
2639 Mutex::Locker l(heartbeat_peer_lock);
2640 probe_targets.clear();
2641 }
2642
2643 void PG::update_heartbeat_peers()
2644 {
2645 assert(is_locked());
2646
2647 if (!is_primary())
2648 return;
2649
2650 set<int> new_peers;
2651 for (unsigned i=0; i<acting.size(); i++) {
2652 if (acting[i] != CRUSH_ITEM_NONE)
2653 new_peers.insert(acting[i]);
2654 }
2655 for (unsigned i=0; i<up.size(); i++) {
2656 if (up[i] != CRUSH_ITEM_NONE)
2657 new_peers.insert(up[i]);
2658 }
2659 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2660 p != peer_info.end();
2661 ++p)
2662 new_peers.insert(p->first.osd);
2663
2664 bool need_update = false;
2665 heartbeat_peer_lock.Lock();
2666 if (new_peers == heartbeat_peers) {
2667 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2668 } else {
2669 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2670 heartbeat_peers.swap(new_peers);
2671 need_update = true;
2672 }
2673 heartbeat_peer_lock.Unlock();
2674
2675 if (need_update)
2676 osd->need_heartbeat_peer_update();
2677 }
2678
2679
2680 bool PG::check_in_progress_op(
2681 const osd_reqid_t &r,
2682 eversion_t *version,
2683 version_t *user_version,
2684 int *return_code) const
2685 {
2686 return (
2687 projected_log.get_request(r, version, user_version, return_code) ||
2688 pg_log.get_log().get_request(r, version, user_version, return_code));
2689 }
2690
2691 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
2692 {
2693 for (auto&p : pgs)
2694 if (p.shard == shard)
2695 return true;
2696 return false;
2697 }
2698
2699 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
2700 {
2701 for (auto&p : pgs) {
2702 if (p == skip)
2703 continue;
2704 if (p.shard == shard)
2705 return p;
2706 }
2707 return pg_shard_t();
2708 }
2709
2710 void PG::_update_calc_stats()
2711 {
2712 info.stats.version = info.last_update;
2713 info.stats.created = info.history.epoch_created;
2714 info.stats.last_scrub = info.history.last_scrub;
2715 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2716 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2717 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2718 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2719 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2720
2721 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2722 info.stats.ondisk_log_size = info.stats.log_size;
2723 info.stats.log_start = pg_log.get_tail();
2724 info.stats.ondisk_log_start = pg_log.get_tail();
2725 info.stats.snaptrimq_len = snap_trimq.size();
2726
2727 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
2728
2729 // In the rare case that upset is too large (usually transient), use it as
2730 // the target for the calculations below.
2731 unsigned target = std::max(num_shards, (unsigned)upset.size());
2732 // For an undersized PG, actingset may be larger than upset when OSDs are out
2733 unsigned nrep = std::max(actingset.size(), upset.size());
2734 // calc num_object_copies
2735 info.stats.stats.calc_copies(MAX(target, nrep));
2736 info.stats.stats.sum.num_objects_degraded = 0;
2737 info.stats.stats.sum.num_objects_unfound = 0;
2738 info.stats.stats.sum.num_objects_misplaced = 0;
2739
2740 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2741 dout(20) << __func__ << " actingset " << actingset << " upset "
2742 << upset << " actingbackfill " << actingbackfill << dendl;
2743 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
2744
2745 assert(!actingbackfill.empty());
2746
2747 bool estimate = false;
2748
2749 // NOTE: we only generate degraded, misplaced and unfound
2750 // values for the summation, not individual stat categories.
2751 int64_t num_objects = info.stats.stats.sum.num_objects;
2752
2753 // Objects missing from up nodes, sorted by # objects.
2754 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
2755 // Objects missing from nodes not in up, sorted by # objects
2756 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
2757
2758 // Fill missing_target_objects/acting_source_objects
2759
2760 {
2761 int64_t missing;
2762
2763 // Primary first
2764 missing = pg_log.get_missing().num_missing();
2765 assert(actingbackfill.count(pg_whoami));
2766 if (upset.count(pg_whoami)) {
2767 missing_target_objects.insert(make_pair(missing, pg_whoami));
2768 } else {
2769 acting_source_objects.insert(make_pair(missing, pg_whoami));
2770 }
2771 info.stats.stats.sum.num_objects_missing_on_primary = missing;
2772 dout(20) << __func__ << " shard " << pg_whoami
2773 << " primary objects " << num_objects
2774 << " missing " << missing
2775 << dendl;
2776
2777 }
2778
2779 // All other peers
2780 for (auto& peer : peer_info) {
2781 // Primary should not be in the peer_info, skip if it is.
2782 if (peer.first == pg_whoami) continue;
2783 int64_t missing = 0;
2784 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
2785 // Backfill targets always track num_objects accurately;
2786 // all other peers track missing accurately.
2787 if (is_backfill_targets(peer.first)) {
2788 missing = std::max((int64_t)0, num_objects - peer_num_objects);
2789 } else {
2790 if (peer_missing.count(peer.first)) {
2791 missing = peer_missing[peer.first].num_missing();
2792 } else {
2793 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
2794 if (is_recovering()) {
2795 estimate = true;
2796 }
2797 missing = std::max((int64_t)0, num_objects - peer_num_objects);
2798 }
2799 }
2800 if (upset.count(peer.first)) {
2801 missing_target_objects.insert(make_pair(missing, peer.first));
2802 } else if (actingset.count(peer.first)) {
2803 acting_source_objects.insert(make_pair(missing, peer.first));
2804 }
2805 peer.second.stats.stats.sum.num_objects_missing = missing;
2806 dout(20) << __func__ << " shard " << peer.first
2807 << " objects " << peer_num_objects
2808 << " missing " << missing
2809 << dendl;
2810 }
2811
2812 // A misplaced object is not stored on the correct OSD
2813 int64_t misplaced = 0;
2814 // a degraded object has fewer replicas or EC shards than the pool specifies.
2815 int64_t degraded = 0;
2816
2817 if (is_recovering()) {
2818 for (auto& sml: missing_loc.get_missing_by_count()) {
2819 for (auto& ml: sml.second) {
2820 int missing_shards;
2821 if (sml.first == shard_id_t::NO_SHARD) {
2822 dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
2823 missing_shards = (int)upset.size() - ml.first.up;
2824 } else {
2825 // Handle shards not even in upset below
2826 if (!find_shard(upset, sml.first))
2827 continue;
2828 missing_shards = std::max(0, 1 - ml.first.up);
2829 dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
2830 }
2831 int odegraded = ml.second * missing_shards;
2832 // Copies on other OSDs, limited to the number that could be degraded
2833 int more_osds = std::min(missing_shards, ml.first.other);
2834 int omisplaced = ml.second * more_osds;
2835 assert(omisplaced <= odegraded);
2836 odegraded -= omisplaced;
2837
2838 misplaced += omisplaced;
2839 degraded += odegraded;
2840 }
2841 }
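      // Worked example (illustrative, not part of the build) of the
      // accounting in the loop above, for a replicated pool with
      // upset.size() == 3:
      //
      //   10 objects with 2 copies on up OSDs and 1 elsewhere (up=2, other=1):
      //     missing_shards = 3 - 2 = 1
      //     omisplaced     = 10 * min(1, 1) = 10   -> misplaced += 10
      //     odegraded      = 10 * 1 - 10    = 0    -> degraded  += 0
      //
      //   5 objects with 1 copy on an up OSD and none elsewhere (up=1, other=0):
      //     missing_shards = 3 - 1 = 2
      //     omisplaced     = 5 * min(2, 0)  = 0
      //     odegraded      = 5 * 2 - 0      = 10   -> degraded  += 10
      //
      // i.e. a copy that exists but sits on the wrong OSD counts as misplaced,
      // not degraded.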
2842
2843 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
2844 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
2845
2846 // Handle undersized case
2847 if (pool.info.is_replicated()) {
2848 // Add degraded for missing targets (num_objects missing)
2849 assert(target >= upset.size());
2850 unsigned needed = target - upset.size();
2851 degraded += num_objects * needed;
2852 } else {
2853 for (unsigned i = 0 ; i < num_shards; ++i) {
2854 shard_id_t shard(i);
2855
2856 if (!find_shard(upset, shard)) {
2857 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
2858
2859 if (pgs != pg_shard_t()) {
2860 int64_t missing;
2861
2862 if (pgs == pg_whoami)
2863 missing = info.stats.stats.sum.num_objects_missing_on_primary;
2864 else
2865 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
2866
2867 degraded += missing;
2868 misplaced += std::max((int64_t)0, num_objects - missing);
2869 } else {
2870 // No shard anywhere
2871 degraded += num_objects;
2872 }
2873 }
2874 }
2875 }
2876 goto out;
2877 }
2878
2879 // Handle undersized case
2880 if (pool.info.is_replicated()) {
2881 // Add to missing_target_objects
2882 assert(target >= missing_target_objects.size());
2883 unsigned needed = target - missing_target_objects.size();
2884 if (needed)
2885 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
2886 } else {
2887 for (unsigned i = 0 ; i < num_shards; ++i) {
2888 shard_id_t shard(i);
2889 bool found = false;
2890 for (const auto& t : missing_target_objects) {
2891 if (std::get<1>(t).shard == shard) {
2892 found = true;
2893 break;
2894 }
2895 }
2896 if (!found)
2897 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
2898 }
2899 }
2900
2901 for (const auto& item : missing_target_objects)
2902 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2903 for (const auto& item : acting_source_objects)
2904 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2905
2906 // Handle all objects not in missing for remapped
2907 // or backfill
2908 for (auto m = missing_target_objects.rbegin();
2909 m != missing_target_objects.rend(); ++m) {
2910
2911 int64_t extra_missing = -1;
2912
2913 if (pool.info.is_replicated()) {
2914 if (!acting_source_objects.empty()) {
2915 auto extra_copy = acting_source_objects.begin();
2916 extra_missing = std::get<0>(*extra_copy);
2917 acting_source_objects.erase(extra_copy);
2918 }
2919 } else { // Erasure coded
2920 // Use corresponding shard
2921 for (const auto& a : acting_source_objects) {
2922 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
2923 extra_missing = std::get<0>(a);
2924 acting_source_objects.erase(a);
2925 break;
2926 }
2927 }
2928 }
2929
2930 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
2931 // We don't know which of the objects on the target
2932 // are part of extra_missing, so assume they are all degraded.
2933 misplaced += std::get<0>(*m) - extra_missing;
2934 degraded += extra_missing;
2935 } else {
2936 // 1. extra_missing == -1: more targets than sources, so the objects are degraded
2937 // 2. extra_missing > std::get<0>(*m): some of the extra_missing objects that
2938 //    were previously degraded are now present on the target.
2939 degraded += std::get<0>(*m);
2940 }
2941 }
2942 // If there are still acting that haven't been accounted for
2943 // then they are misplaced
2944 for (const auto& a : acting_source_objects) {
2945 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
2946 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
2947 misplaced += extra_misplaced;
2948 }
2949 out:
2950 // NOTE: Tests use these messages to verify this code
2951 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
2952 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl;
2953
2954 info.stats.stats.sum.num_objects_degraded = degraded;
2955 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2956 info.stats.stats.sum.num_objects_misplaced = misplaced;
2957 }
2958 }
2959
2960 void PG::_update_blocked_by()
2961 {
2962 // set a max on the number of blocking peers we report. if we go
2963 // over, report a random subset. keep the result sorted.
2964 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2965 unsigned skip = blocked_by.size() - keep;
2966 info.stats.blocked_by.clear();
2967 info.stats.blocked_by.resize(keep);
2968 unsigned pos = 0;
2969 for (set<int>::iterator p = blocked_by.begin();
2970 p != blocked_by.end() && keep > 0;
2971 ++p) {
2972 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2973 --skip;
2974 } else {
2975 info.stats.blocked_by[pos++] = *p;
2976 --keep;
2977 }
2978 }
2979 }
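// Illustrative note (not part of the build): the loop above is a pass of
// selection sampling -- each remaining element is skipped with probability
// skip/(skip+keep), which yields a uniformly random, still-sorted subset of
// size `keep` in a single pass.  A hypothetical run with blocked_by={1,3,5,7}
// and osd_max_pg_blocked_by=2 (keep=2, skip=2) might report {3,7} or {1,5},
// but always two entries in ascending order.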
2980
2981 void PG::publish_stats_to_osd()
2982 {
2983 if (!is_primary())
2984 return;
2985
2986 pg_stats_publish_lock.Lock();
2987
2988 if (info.stats.stats.sum.num_scrub_errors)
2989 state_set(PG_STATE_INCONSISTENT);
2990 else
2991 state_clear(PG_STATE_INCONSISTENT);
2992
2993 utime_t now = ceph_clock_now();
2994 if (info.stats.state != state) {
2995 info.stats.last_change = now;
2996 // Optimistic estimate: if we have just found the PG to be inactive,
2997 // assume it was active until now.
2998 if (!(state & PG_STATE_ACTIVE) &&
2999 (info.stats.state & PG_STATE_ACTIVE))
3000 info.stats.last_active = now;
3001
3002 if ((state & PG_STATE_ACTIVE) &&
3003 !(info.stats.state & PG_STATE_ACTIVE))
3004 info.stats.last_became_active = now;
3005 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3006 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3007 info.stats.last_became_peered = now;
3008 if (!(state & PG_STATE_CREATING) &&
3009 (info.stats.state & PG_STATE_CREATING)) {
3010 osd->send_pg_created(get_pgid().pgid);
3011 }
3012 info.stats.state = state;
3013 }
3014
3015 _update_calc_stats();
3016 if (info.stats.stats.sum.num_objects_degraded) {
3017 state_set(PG_STATE_DEGRADED);
3018 } else {
3019 state_clear(PG_STATE_DEGRADED);
3020 }
3021 _update_blocked_by();
3022
3023 bool publish = false;
3024 pg_stat_t pre_publish = info.stats;
3025 pre_publish.stats.add(unstable_stats);
3026 utime_t cutoff = now;
3027 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
3028 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3029 info.stats.last_fresh > cutoff) {
3030 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3031 << ": no change since " << info.stats.last_fresh << dendl;
3032 } else {
3033 // update our stat summary and timestamps
3034 info.stats.reported_epoch = get_osdmap()->get_epoch();
3035 ++info.stats.reported_seq;
3036
3037 info.stats.last_fresh = now;
3038
3039 if (info.stats.state & PG_STATE_CLEAN)
3040 info.stats.last_clean = now;
3041 if (info.stats.state & PG_STATE_ACTIVE)
3042 info.stats.last_active = now;
3043 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3044 info.stats.last_peered = now;
3045 info.stats.last_unstale = now;
3046 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3047 info.stats.last_undegraded = now;
3048 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3049 info.stats.last_fullsized = now;
3050
3051 // do not send pgstat to mon anymore once we are luminous, since mgr takes
3052 // care of this by sending MMonMgrReport to mon.
3053 publish =
3054 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3055 pg_stats_publish_valid = true;
3056 pg_stats_publish = pre_publish;
3057
3058 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3059 << ":" << pg_stats_publish.reported_seq << dendl;
3060 }
3061 pg_stats_publish_lock.Unlock();
3062
3063 if (publish)
3064 osd->pg_stat_queue_enqueue(this);
3065 }
3066
3067 void PG::clear_publish_stats()
3068 {
3069 dout(15) << "clear_stats" << dendl;
3070 pg_stats_publish_lock.Lock();
3071 pg_stats_publish_valid = false;
3072 pg_stats_publish_lock.Unlock();
3073
3074 osd->pg_stat_queue_dequeue(this);
3075 }
3076
3077 /**
3078 * initialize a newly instantiated pg
3079 *
3080 * Initialize PG state, as when a PG is initially created, or when it
3081 * is first instantiated on the current node.
3082 *
3083 * @param role our role/rank
3084 * @param newup up set
3085 * @param newacting acting set
3086 * @param history pg history
3087 * @param pi past_intervals
3088 * @param backfill true if info should be marked as backfill
3089 * @param t transaction to write out our new state in
3090 */
3091 void PG::init(
3092 int role,
3093 const vector<int>& newup, int new_up_primary,
3094 const vector<int>& newacting, int new_acting_primary,
3095 const pg_history_t& history,
3096 const PastIntervals& pi,
3097 bool backfill,
3098 ObjectStore::Transaction *t)
3099 {
3100 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3101 << " history " << history
3102 << " past_intervals " << pi
3103 << dendl;
3104
3105 set_role(role);
3106 acting = newacting;
3107 up = newup;
3108 init_primary_up_acting(
3109 newup,
3110 newacting,
3111 new_up_primary,
3112 new_acting_primary);
3113
3114 info.history = history;
3115 past_intervals = pi;
3116
3117 info.stats.up = up;
3118 info.stats.up_primary = new_up_primary;
3119 info.stats.acting = acting;
3120 info.stats.acting_primary = new_acting_primary;
3121 info.stats.mapping_epoch = info.history.same_interval_since;
3122
3123 if (backfill) {
3124 dout(10) << __func__ << ": Setting backfill" << dendl;
3125 info.set_last_backfill(hobject_t());
3126 info.last_complete = info.last_update;
3127 pg_log.mark_log_for_rewrite();
3128 }
3129
3130 on_new_interval();
3131
3132 dirty_info = true;
3133 dirty_big_info = true;
3134 write_if_dirty(*t);
3135 }
3136
3137 #pragma GCC diagnostic ignored "-Wpragmas"
3138 #pragma GCC diagnostic push
3139 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3140
3141 void PG::upgrade(ObjectStore *store)
3142 {
3143 assert(info_struct_v <= 10);
3144 ObjectStore::Transaction t;
3145
3146 assert(info_struct_v >= 7);
3147
3148 // 7 -> 8
3149 if (info_struct_v <= 7) {
3150 pg_log.mark_log_for_rewrite();
3151 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
3152 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
3153 t.remove(coll_t::meta(), log_oid);
3154 t.remove(coll_t::meta(), biginfo_oid);
3155 t.touch(coll, pgmeta_oid);
3156 }
3157
3158 // 8 -> 9
3159 if (info_struct_v <= 8) {
3160 // no special action needed.
3161 }
3162
3163 // 9 -> 10
3164 if (info_struct_v <= 9) {
3165 // previous versions weren't (as) aggressively clearing past_intervals
3166 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
3167 dout(20) << __func__ << " clearing past_intervals" << dendl;
3168 past_intervals.clear();
3169 }
3170 }
3171
3172 // update infover_key
3173 if (info_struct_v < cur_struct_v) {
3174 map<string,bufferlist> v;
3175 __u8 ver = cur_struct_v;
3176 ::encode(ver, v[infover_key]);
3177 t.omap_setkeys(coll, pgmeta_oid, v);
3178 }
3179
3180 dirty_info = true;
3181 dirty_big_info = true;
3182 write_if_dirty(t);
3183
3184 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3185 ObjectStore::Sequencer>("upgrade"));
3186 int r = store->apply_transaction(osr.get(), std::move(t));
3187 if (r != 0) {
3188 derr << __func__ << ": apply_transaction returned "
3189 << cpp_strerror(r) << dendl;
3190 ceph_abort();
3191 }
3192 assert(r == 0);
3193
3194 C_SaferCond waiter;
3195 if (!osr->flush_commit(&waiter)) {
3196 waiter.wait();
3197 }
3198 }
3199
3200 #pragma GCC diagnostic pop
3201 #pragma GCC diagnostic warning "-Wpragmas"
3202
3203 int PG::_prepare_write_info(CephContext* cct,
3204 map<string,bufferlist> *km,
3205 epoch_t epoch,
3206 pg_info_t &info, pg_info_t &last_written_info,
3207 PastIntervals &past_intervals,
3208 bool dirty_big_info,
3209 bool dirty_epoch,
3210 bool try_fast_info,
3211 PerfCounters *logger)
3212 {
3213 if (dirty_epoch) {
3214 ::encode(epoch, (*km)[epoch_key]);
3215 }
3216
3217 if (logger)
3218 logger->inc(l_osd_pg_info);
3219
3220 // try to do info efficiently?
3221 if (!dirty_big_info && try_fast_info &&
3222 info.last_update > last_written_info.last_update) {
3223 pg_fast_info_t fast;
3224 fast.populate_from(info);
3225 bool did = fast.try_apply_to(&last_written_info);
3226 assert(did); // we verified last_update increased above
3227 if (info == last_written_info) {
3228 ::encode(fast, (*km)[fastinfo_key]);
3229 if (logger)
3230 logger->inc(l_osd_pg_fastinfo);
3231 return 0;
3232 }
3233 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3234 {
3235 JSONFormatter jf(true);
3236 jf.dump_object("info", info);
3237 jf.flush(*_dout);
3238 }
3239 {
3240 *_dout << "\nlast_written_info:\n";
3241 JSONFormatter jf(true);
3242 jf.dump_object("last_written_info", last_written_info);
3243 jf.flush(*_dout);
3244 }
3245 *_dout << dendl;
3246 }
3247 last_written_info = info;
3248
3249 // info. store purged_snaps separately.
3250 interval_set<snapid_t> purged_snaps;
3251 purged_snaps.swap(info.purged_snaps);
3252 ::encode(info, (*km)[info_key]);
3253 purged_snaps.swap(info.purged_snaps);
3254
3255 if (dirty_big_info) {
3256 // potentially big stuff
3257 bufferlist& bigbl = (*km)[biginfo_key];
3258 ::encode(past_intervals, bigbl);
3259 ::encode(info.purged_snaps, bigbl);
3260 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3261 if (logger)
3262 logger->inc(l_osd_pg_biginfo);
3263 }
3264
3265 return 0;
3266 }
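// Illustrative note (not part of the build): what this function leaves in the
// pgmeta omap.  On the fast path only one small value is rewritten:
//
//   epoch_key    -> current epoch                       (when dirty_epoch)
//   fastinfo_key -> pg_fast_info_t delta of pg_info_t   (fast path above)
//   info_key     -> full pg_info_t minus purged_snaps   (slow path)
//   biginfo_key  -> past_intervals + purged_snaps       (only when dirty_big_info)
//
// so a steady stream of writes normally costs a single small fastinfo value
// per transaction instead of a full info/biginfo re-encode.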
3267
3268 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3269 {
3270 coll_t coll(pgid);
3271 t.create_collection(coll, bits);
3272 }
3273
3274 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3275 {
3276 coll_t coll(pgid);
3277
3278 if (pool) {
3279 // Give a hint to the PG collection
3280 bufferlist hint;
3281 uint32_t pg_num = pool->get_pg_num();
3282 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3283 ::encode(pg_num, hint);
3284 ::encode(expected_num_objects_pg, hint);
3285 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3286 t.collection_hint(coll, hint_type, hint);
3287 }
3288
3289 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3290 t.touch(coll, pgmeta_oid);
3291 map<string,bufferlist> values;
3292 __u8 struct_v = cur_struct_v;
3293 ::encode(struct_v, values[infover_key]);
3294 t.omap_setkeys(coll, pgmeta_oid, values);
3295 }
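// Illustrative arithmetic (not part of the build) for the hint above: a pool
// created with expected_num_objects = 1,000,000 and pg_num = 128 hands each
// new PG collection a hint of 1000000 / 128 = 7812 expected objects, which a
// backend such as FileStore can use to pre-split collection directories
// instead of splitting them later under load.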
3296
3297 void PG::prepare_write_info(map<string,bufferlist> *km)
3298 {
3299 info.stats.stats.add(unstable_stats);
3300 unstable_stats.clear();
3301
3302 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3303 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3304 info,
3305 last_written_info,
3306 past_intervals,
3307 dirty_big_info, need_update_epoch,
3308 cct->_conf->osd_fast_info,
3309 osd->logger);
3310 assert(ret == 0);
3311 if (need_update_epoch)
3312 last_epoch = get_osdmap()->get_epoch();
3313 last_persisted_osdmap_ref = osdmap_ref;
3314
3315 dirty_info = false;
3316 dirty_big_info = false;
3317 }
3318
3319 #pragma GCC diagnostic ignored "-Wpragmas"
3320 #pragma GCC diagnostic push
3321 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3322
3323 bool PG::_has_removal_flag(ObjectStore *store,
3324 spg_t pgid)
3325 {
3326 coll_t coll(pgid);
3327 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3328
3329 // first try new way
3330 set<string> keys;
3331 keys.insert("_remove");
3332 map<string,bufferlist> values;
3333 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3334 values.size() == 1)
3335 return true;
3336
3337 return false;
3338 }
3339
3340 int PG::peek_map_epoch(ObjectStore *store,
3341 spg_t pgid,
3342 epoch_t *pepoch,
3343 bufferlist *bl)
3344 {
3345 coll_t coll(pgid);
3346 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3347 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3348 epoch_t cur_epoch = 0;
3349
3350 assert(bl);
3351 {
3352 // validate collection name
3353 assert(coll.is_pg());
3354 }
3355
3356 // try for v8
3357 set<string> keys;
3358 keys.insert(infover_key);
3359 keys.insert(epoch_key);
3360 map<string,bufferlist> values;
3361 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3362 if (r == 0) {
3363 assert(values.size() == 2);
3364
3365 // sanity check version
3366 bufferlist::iterator bp = values[infover_key].begin();
3367 __u8 struct_v = 0;
3368 ::decode(struct_v, bp);
3369 assert(struct_v >= 8);
3370
3371 // get epoch
3372 bp = values[epoch_key].begin();
3373 ::decode(cur_epoch, bp);
3374 } else {
3375 // probably bug 10617; see OSD::load_pgs()
3376 return -1;
3377 }
3378
3379 *pepoch = cur_epoch;
3380 return 0;
3381 }
3382
3383 #pragma GCC diagnostic pop
3384 #pragma GCC diagnostic warning "-Wpragmas"
3385
3386 void PG::write_if_dirty(ObjectStore::Transaction& t)
3387 {
3388 map<string,bufferlist> km;
3389 if (dirty_big_info || dirty_info)
3390 prepare_write_info(&km);
3391 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3392 if (!km.empty())
3393 t.omap_setkeys(coll, pgmeta_oid, km);
3394 }
3395
3396 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3397 {
3398 // raise last_complete only if we were previously up to date
3399 if (info.last_complete == info.last_update)
3400 info.last_complete = e.version;
3401
3402 // raise last_update.
3403 assert(e.version > info.last_update);
3404 info.last_update = e.version;
3405
3406 // raise user_version, if it increased (it may not have been bumped
3407 // by all logged updates)
3408 if (e.user_version > info.last_user_version)
3409 info.last_user_version = e.user_version;
3410
3411 // log mutation
3412 pg_log.add(e, applied);
3413 dout(10) << "add_log_entry " << e << dendl;
3414 }
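// Illustrative example (not part of the build), in the usual epoch'version
// notation: if info.last_complete == info.last_update == 12'40 and an entry
// with e.version == 12'41 is appended, both advance to 12'41.  If the PG
// still has missing objects (last_complete < last_update), only last_update
// moves forward -- that is the "previously up to date" guard above.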
3415
3416
3417 void PG::append_log(
3418 const vector<pg_log_entry_t>& logv,
3419 eversion_t trim_to,
3420 eversion_t roll_forward_to,
3421 ObjectStore::Transaction &t,
3422 bool transaction_applied)
3423 {
3424 if (transaction_applied)
3425 update_snap_map(logv, t);
3426
3427 /* The primary has sent an info updating the history, but it may not
3428 * have arrived yet. We want to make sure that we cannot remember this
3429 * write without remembering that it happened in an interval which went
3430 * active in epoch history.last_epoch_started.
3431 */
3432 if (info.last_epoch_started != info.history.last_epoch_started) {
3433 info.history.last_epoch_started = info.last_epoch_started;
3434 }
3435 if (info.last_interval_started != info.history.last_interval_started) {
3436 info.history.last_interval_started = info.last_interval_started;
3437 }
3438 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3439
3440 PGLogEntryHandler handler{this, &t};
3441 if (!transaction_applied) {
3442 /* We must be a backfill peer, so it's ok if we apply
3443 * out-of-turn since we won't be considered when
3444 * determining a min possible last_update.
3445 */
3446 pg_log.roll_forward(&handler);
3447 }
3448
3449 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3450 p != logv.end();
3451 ++p) {
3452 add_log_entry(*p, transaction_applied);
3453
3454 /* We don't want to leave the rollforward artifacts around
3455 * here past last_backfill. It's ok for the same reason as
3456 * above */
3457 if (transaction_applied &&
3458 p->soid > info.last_backfill) {
3459 pg_log.roll_forward(&handler);
3460 }
3461 }
3462 auto last = logv.rbegin();
3463 if (is_primary() && last != logv.rend()) {
3464 projected_log.skip_can_rollback_to_to_head();
3465 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3466 }
3467
3468 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3469 pg_log.roll_forward_to(
3470 roll_forward_to,
3471 &handler);
3472 t.register_on_applied(
3473 new C_UpdateLastRollbackInfoTrimmedToApplied(
3474 this,
3475 get_osdmap()->get_epoch(),
3476 roll_forward_to));
3477 }
3478
3479 dout(10) << __func__ << " approx pg log length = "
3480 << pg_log.get_log().approx_size() << dendl;
3481 dout(10) << __func__ << " transaction_applied = "
3482 << transaction_applied << dendl;
3483 if (!transaction_applied)
3484 dout(10) << __func__ << " " << pg_whoami
3485 << " is backfill target" << dendl;
3486 pg_log.trim(trim_to, info, transaction_applied);
3487
3488 // update the local pg, pg log
3489 dirty_info = true;
3490 write_if_dirty(t);
3491 }
3492
3493 bool PG::check_log_for_corruption(ObjectStore *store)
3494 {
3495 /// TODO: this method needs to work with the omap log
3496 return true;
3497 }
3498
3499 //! Get the name we're going to save our corrupt pg log as
3500 std::string PG::get_corrupt_pg_log_name() const
3501 {
3502 const int MAX_BUF = 512;
3503 char buf[MAX_BUF];
3504 struct tm tm_buf;
3505 time_t my_time(time(NULL));
3506 const struct tm *t = localtime_r(&my_time, &tm_buf);
3507 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3508 if (ret == 0) {
3509 dout(0) << "strftime failed" << dendl;
3510 return "corrupt_log_unknown_time";
3511 }
3512 string out(buf);
3513 out += stringify(info.pgid);
3514 return out;
3515 }
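// Illustrative example (not part of the build): for PG 1.7 at local time
// 2019-03-14 13:37 the format above yields "corrupt_log_2019-03-14_13:37_1.7";
// note %k is space-padded, so single-digit hours come out as e.g. " 9:05".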
3516
3517 int PG::read_info(
3518 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3519 pg_info_t &info, PastIntervals &past_intervals,
3520 __u8 &struct_v)
3521 {
3522 // try for v8 or later
3523 set<string> keys;
3524 keys.insert(infover_key);
3525 keys.insert(info_key);
3526 keys.insert(biginfo_key);
3527 keys.insert(fastinfo_key);
3528 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3529 map<string,bufferlist> values;
3530 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3531 if (r == 0) {
3532 assert(values.size() == 3 ||
3533 values.size() == 4);
3534
3535 bufferlist::iterator p = values[infover_key].begin();
3536 ::decode(struct_v, p);
3537 assert(struct_v >= 8);
3538
3539 p = values[info_key].begin();
3540 ::decode(info, p);
3541
3542 p = values[biginfo_key].begin();
3543 if (struct_v >= 10) {
3544 ::decode(past_intervals, p);
3545 } else {
3546 past_intervals.decode_classic(p);
3547 }
3548 ::decode(info.purged_snaps, p);
3549
3550 p = values[fastinfo_key].begin();
3551 if (!p.end()) {
3552 pg_fast_info_t fast;
3553 ::decode(fast, p);
3554 fast.try_apply_to(&info);
3555 }
3556 return 0;
3557 }
3558
3559 // legacy (ver < 8)
3560 ghobject_t infos_oid(OSD::make_infos_oid());
3561 bufferlist::iterator p = bl.begin();
3562 ::decode(struct_v, p);
3563 assert(struct_v == 7);
3564
3565 // get info out of leveldb
3566 string k = get_info_key(info.pgid);
3567 string bk = get_biginfo_key(info.pgid);
3568 keys.clear();
3569 keys.insert(k);
3570 keys.insert(bk);
3571 values.clear();
3572 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3573 assert(values.size() == 2);
3574
3575 p = values[k].begin();
3576 ::decode(info, p);
3577
3578 p = values[bk].begin();
3579 ::decode(past_intervals, p);
3580 interval_set<snapid_t> snap_collections; // obsolete
3581 ::decode(snap_collections, p);
3582 ::decode(info.purged_snaps, p);
3583 return 0;
3584 }
3585
3586 void PG::read_state(ObjectStore *store, bufferlist &bl)
3587 {
3588 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3589 info_struct_v);
3590 assert(r >= 0);
3591
3592 last_written_info = info;
3593
3594 // if we are upgrading from jewel, we need to force rebuild of
3595 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3596 // (before kraken). persisted missing set was circa
3597 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3598 // v8 was pre-jewel (per-pg meta object).
3599 bool force_rebuild_missing = info_struct_v < 9;
3600 if (force_rebuild_missing) {
3601 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3602 << dendl;
3603 }
3604
3605 ostringstream oss;
3606 pg_log.read_log_and_missing(
3607 store,
3608 coll,
3609 info_struct_v < 8 ? coll_t::meta() : coll,
3610 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3611 info,
3612 force_rebuild_missing,
3613 oss,
3614 cct->_conf->osd_ignore_stale_divergent_priors,
3615 cct->_conf->osd_debug_verify_missing_on_start);
3616 if (oss.tellp())
3617 osd->clog->error() << oss.str();
3618
3619 if (force_rebuild_missing) {
3620 dout(10) << __func__ << " forced rebuild of missing got "
3621 << pg_log.get_missing()
3622 << dendl;
3623 }
3624
3625 // log any weirdness
3626 log_weirdness();
3627 }
3628
3629 void PG::log_weirdness()
3630 {
3631 if (pg_log.get_tail() != info.log_tail)
3632 osd->clog->error() << info.pgid
3633 << " info mismatch, log.tail " << pg_log.get_tail()
3634 << " != info.log_tail " << info.log_tail;
3635 if (pg_log.get_head() != info.last_update)
3636 osd->clog->error() << info.pgid
3637 << " info mismatch, log.head " << pg_log.get_head()
3638 << " != info.last_update " << info.last_update;
3639
3640 if (!pg_log.get_log().empty()) {
3641 // sloppy check
3642 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3643 osd->clog->error() << info.pgid
3644 << " log bound mismatch, info (tail,head] ("
3645 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3646 << " actual ["
3647 << pg_log.get_log().log.begin()->version << ","
3648 << pg_log.get_log().log.rbegin()->version << "]";
3649 }
3650
3651 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3652 osd->clog->error() << info.pgid
3653 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3654 << " > log size " << pg_log.get_log().log.size();
3655 }
3656 }
3657
3658 void PG::update_snap_map(
3659 const vector<pg_log_entry_t> &log_entries,
3660 ObjectStore::Transaction &t)
3661 {
3662 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3663 i != log_entries.end();
3664 ++i) {
3665 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3666 if (i->soid.snap < CEPH_MAXSNAP) {
3667 if (i->is_delete()) {
3668 int r = snap_mapper.remove_oid(
3669 i->soid,
3670 &_t);
3671 assert(r == 0);
3672 } else if (i->is_update()) {
3673 assert(i->snaps.length() > 0);
3674 vector<snapid_t> snaps;
3675 bufferlist snapbl = i->snaps;
3676 bufferlist::iterator p = snapbl.begin();
3677 try {
3678 ::decode(snaps, p);
3679 } catch (...) {
3680 derr << __func__ << " decode snaps failure on " << *i << dendl;
3681 snaps.clear();
3682 }
3683 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3684
3685 if (i->is_clone() || i->is_promote()) {
3686 snap_mapper.add_oid(
3687 i->soid,
3688 _snaps,
3689 &_t);
3690 } else if (i->is_modify()) {
3691 assert(i->is_modify());
3692 int r = snap_mapper.update_snaps(
3693 i->soid,
3694 _snaps,
3695 0,
3696 &_t);
3697 assert(r == 0);
3698 } else {
3699 assert(i->is_clean());
3700 }
3701 }
3702 }
3703 }
3704 }
3705
3706 /**
3707 * filter trimming|trimmed snaps out of snapcontext
3708 */
3709 void PG::filter_snapc(vector<snapid_t> &snaps)
3710 {
3711 // nothing needs trimming, we can return immediately
3712 if(snap_trimq.empty() && info.purged_snaps.empty())
3713 return;
3714
3715 bool filtering = false;
3716 vector<snapid_t> newsnaps;
3717 for (vector<snapid_t>::iterator p = snaps.begin();
3718 p != snaps.end();
3719 ++p) {
3720 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3721 if (!filtering) {
3722 // start building a new vector with what we've seen so far
3723 dout(10) << "filter_snapc filtering " << snaps << dendl;
3724 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3725 filtering = true;
3726 }
3727 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3728 } else {
3729 if (filtering)
3730 newsnaps.push_back(*p); // continue building new vector
3731 }
3732 }
3733 if (filtering) {
3734 snaps.swap(newsnaps);
3735 dout(10) << "filter_snapc result " << snaps << dendl;
3736 }
3737 }
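// ---------------------------------------------------------------------------
// [editor's note] A minimal, hypothetical sketch (not part of the original
// PG.cc) of the copy-on-first-removal pattern that filter_snapc() above uses:
// no replacement vector is built unless at least one snap actually has to be
// dropped. Names here are illustrative and only the standard library is used.
#include <functional>
#include <vector>

static void filter_in_place(std::vector<int> &v,
                            const std::function<bool(int)> &should_drop)
{
  bool filtering = false;
  std::vector<int> kept;
  for (auto p = v.begin(); p != v.end(); ++p) {
    if (should_drop(*p)) {
      if (!filtering) {
        // first removal: copy the prefix we have already accepted
        kept.insert(kept.begin(), v.begin(), p);
        filtering = true;
      }
      // drop *p by simply not copying it
    } else if (filtering) {
      kept.push_back(*p);  // keep building the replacement vector
    }
  }
  if (filtering)
    v.swap(kept);  // only pay for the copy/swap when something was removed
}
// ---------------------------------------------------------------------------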
3738
3739 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3740 {
3741 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3742 it != m.end();
3743 ++it)
3744 requeue_ops(it->second);
3745 m.clear();
3746 }
3747
3748 void PG::requeue_op(OpRequestRef op)
3749 {
3750 auto p = waiting_for_map.find(op->get_source());
3751 if (p != waiting_for_map.end()) {
3752 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3753 << dendl;
3754 p->second.push_front(op);
3755 } else {
3756 dout(20) << __func__ << " " << op << dendl;
3757 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3758 }
3759 }
3760
3761 void PG::requeue_ops(list<OpRequestRef> &ls)
3762 {
3763 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3764 i != ls.rend();
3765 ++i) {
3766 auto p = waiting_for_map.find((*i)->get_source());
3767 if (p != waiting_for_map.end()) {
3768 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3769 << ")" << dendl;
3770 p->second.push_front(*i);
3771 } else {
3772 dout(20) << __func__ << " " << *i << dendl;
3773 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3774 }
3775 }
3776 ls.clear();
3777 }
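// ---------------------------------------------------------------------------
// [editor's note] Hypothetical standalone sketch (not part of the original
// PG.cc) of why requeue_op()/requeue_ops() above push to the *front* while
// walking the source list in *reverse*: the combination preserves the ops'
// original relative order once they land at the head of the target queue.
#include <cassert>
#include <list>

static void requeue_preserving_order(std::list<int> &src, std::list<int> &queue)
{
  for (auto i = src.rbegin(); i != src.rend(); ++i)
    queue.push_front(*i);  // last source element ends up deepest in the queue
  src.clear();
}

static void requeue_order_demo()
{
  std::list<int> src{1, 2, 3};
  std::list<int> queue;
  requeue_preserving_order(src, queue);
  assert((queue == std::list<int>{1, 2, 3}));  // original order preserved
}
// ---------------------------------------------------------------------------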
3778
3779 void PG::requeue_map_waiters()
3780 {
3781 epoch_t epoch = get_osdmap()->get_epoch();
3782 auto p = waiting_for_map.begin();
3783 while (p != waiting_for_map.end()) {
3784 if (epoch < p->second.front()->min_epoch) {
3785 dout(20) << __func__ << " " << p->first << " front op "
3786 << p->second.front() << " must still wait, doing nothing"
3787 << dendl;
3788 ++p;
3789 } else {
3790 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3791 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3792 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3793 }
3794 p = waiting_for_map.erase(p);
3795 }
3796 }
3797 }
3798
3799
3800 // ==========================================================================================
3801 // SCRUB
3802
3803 /*
3804 * when holding pg and sched_scrub_lock, then the states are:
3805 * scheduling:
3806 * scrubber.reserved = true
3807 * scrubber.reserved_peers includes whoami
3808 * osd->scrub_pending++
3809 * scheduling, replica declined:
3810 * scrubber.reserved = true
3811 * scrubber.reserved_peers includes -1
3812 * osd->scrub_pending++
3813 * pending:
3814 * scrubber.reserved = true
3815 * scrubber.reserved_peers.size() == acting.size();
3816 * pg on scrub_wq
3817 * osd->scrub_pending++
3818 * scrubbing:
3819 * scrubber.reserved = false;
3820 * scrubber.reserved_peers empty
3821 * osd->scrubber.active++
3822 */
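// ---------------------------------------------------------------------------
// [editor's note] The reservation states listed in the comment above, written
// out as a hypothetical classifier. This is an illustrative sketch only, not
// part of the original PG.cc; the field names mirror the comment and the
// scrubber members used by sched_scrub()/chunky_scrub() below.
#include <cstddef>

enum class ScrubSchedState {
  IDLE,             // no reservation held, not scrubbing
  SCHEDULING,       // reserved locally, still collecting replica grants
  REPLICA_DECLINED, // reserved, but at least one replica rejected
  PENDING,          // every member of acting reserved; pg queued for scrub
  SCRUBBING         // reservations consumed, scrub is running
};

static ScrubSchedState classify_scrub_sched(bool reserved,
                                            bool reserve_failed,
                                            std::size_t reserved_peers,
                                            std::size_t acting_size,
                                            bool active)
{
  if (!reserved)
    return active ? ScrubSchedState::SCRUBBING : ScrubSchedState::IDLE;
  if (reserve_failed)
    return ScrubSchedState::REPLICA_DECLINED;
  if (reserved_peers == acting_size)
    return ScrubSchedState::PENDING;
  return ScrubSchedState::SCHEDULING;
}
// ---------------------------------------------------------------------------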
3823
3824 // returns true if a scrub has been newly kicked off
3825 bool PG::sched_scrub()
3826 {
3827 bool nodeep_scrub = false;
3828 assert(is_locked());
3829 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3830 return false;
3831 }
3832
3833 double deep_scrub_interval = 0;
3834 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3835 if (deep_scrub_interval <= 0) {
3836 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3837 }
3838 bool time_for_deep = ceph_clock_now() >=
3839 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3840
3841 bool deep_coin_flip = false;
3842 // Only add random deep scrubs when the scrub is NOT user initiated
3843 if (!scrubber.must_scrub)
3844 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3845 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3846
3847 time_for_deep = (time_for_deep || deep_coin_flip);
3848
3849 // NODEEP_SCRUB, so ignore time-initiated deep-scrub
3850 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3851 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3852 time_for_deep = false;
3853 nodeep_scrub = true;
3854 }
3855
3856 if (!scrubber.must_scrub) {
3857 assert(!scrubber.must_deep_scrub);
3858
3859 // NOSCRUB, so skip regular scrubs
3860 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3861 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3862 if (scrubber.reserved) {
3863 // cancel the scrub if it is still being scheduled, so pgs from
3864 // other pools where scrubs are still legal have a chance to go
3865 // ahead with scrubbing.
3866 clear_scrub_reserved();
3867 scrub_unreserve_replicas();
3868 }
3869 return false;
3870 }
3871 }
3872
3873 if (cct->_conf->osd_scrub_auto_repair
3874 && get_pgbackend()->auto_repair_supported()
3875 && time_for_deep
3876 // respect the user's command and do not auto-repair
3877 && !scrubber.must_repair
3878 && !scrubber.must_scrub
3879 && !scrubber.must_deep_scrub) {
3880 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3881 scrubber.auto_repair = true;
3882 } else {
3883 // this happens when the user issues a scrub/repair command during
3884 // the scheduling of the scrub/repair (e.g. while requesting reservations)
3885 scrubber.auto_repair = false;
3886 }
3887
3888 bool ret = true;
3889 if (!scrubber.reserved) {
3890 assert(scrubber.reserved_peers.empty());
3891 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3892 osd->inc_scrubs_pending()) {
3893 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3894 scrubber.reserved = true;
3895 scrubber.reserved_peers.insert(pg_whoami);
3896 scrub_reserve_replicas();
3897 } else {
3898 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3899 ret = false;
3900 }
3901 }
3902 if (scrubber.reserved) {
3903 if (scrubber.reserve_failed) {
3904 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3905 clear_scrub_reserved();
3906 scrub_unreserve_replicas();
3907 ret = false;
3908 } else if (scrubber.reserved_peers.size() == acting.size()) {
3909 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3910 if (time_for_deep) {
3911 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3912 state_set(PG_STATE_DEEP_SCRUB);
3913 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3914 if (!nodeep_scrub) {
3915 osd->clog->info() << "osd." << osd->whoami
3916 << " pg " << info.pgid
3917 << " Deep scrub errors, upgrading scrub to deep-scrub";
3918 state_set(PG_STATE_DEEP_SCRUB);
3919 } else if (!scrubber.must_scrub) {
3920 osd->clog->error() << "osd." << osd->whoami
3921 << " pg " << info.pgid
3922 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3923 clear_scrub_reserved();
3924 scrub_unreserve_replicas();
3925 return false;
3926 } else {
3927 osd->clog->error() << "osd." << osd->whoami
3928 << " pg " << info.pgid
3929 << " Regular scrub request, deep-scrub details will be lost";
3930 }
3931 }
3932 queue_scrub();
3933 } else {
3934 // none declined, since scrubber.reserved is set
3935 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3936 }
3937 }
3938
3939 return ret;
3940 }
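// ---------------------------------------------------------------------------
// [editor's note] A hypothetical pure-function sketch (not part of the
// original PG.cc) of the "should this scrub be deep?" policy applied in
// sched_scrub() above: deep if the deep-scrub interval has elapsed or a
// random coin flip fires (only for non-user-initiated scrubs), unless
// nodeep-scrub is set on the OSDMap or the pool. Parameter names are
// illustrative.
#include <random>

static bool scrub_should_be_deep(double now,
                                 double last_deep_scrub_stamp,
                                 double deep_scrub_interval,
                                 double randomize_ratio,  // e.g. 0.15 == 15%
                                 bool user_initiated,
                                 bool nodeep_scrub_set,
                                 std::mt19937 &rng)
{
  bool time_for_deep =
    now >= last_deep_scrub_stamp + deep_scrub_interval;

  bool deep_coin_flip = false;
  if (!user_initiated) {
    std::uniform_real_distribution<double> dist(0.0, 1.0);
    deep_coin_flip = dist(rng) < randomize_ratio;
  }

  if (nodeep_scrub_set)
    return false;  // nodeep-scrub overrides any time- or randomness-based choice
  return time_for_deep || deep_coin_flip;
}
// ---------------------------------------------------------------------------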
3941
3942 void PG::reg_next_scrub()
3943 {
3944 if (!is_primary())
3945 return;
3946
3947 utime_t reg_stamp;
3948 bool must = false;
3949 if (scrubber.must_scrub) {
3950 // Set the smallest time that isn't utime_t()
3951 reg_stamp = utime_t(0,1);
3952 must = true;
3953 } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
3954 reg_stamp = ceph_clock_now();
3955 must = true;
3956 } else {
3957 reg_stamp = info.history.last_scrub_stamp;
3958 }
3959 // note down the sched_time, so we can locate this scrub, and remove it
3960 // later on.
3961 double scrub_min_interval = 0, scrub_max_interval = 0;
3962 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3963 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3964 assert(scrubber.scrub_reg_stamp == utime_t());
3965 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3966 reg_stamp,
3967 scrub_min_interval,
3968 scrub_max_interval,
3969 must);
3970 }
3971
3972 void PG::unreg_next_scrub()
3973 {
3974 if (is_primary()) {
3975 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3976 scrubber.scrub_reg_stamp = utime_t();
3977 }
3978 }
3979
3980 void PG::do_replica_scrub_map(OpRequestRef op)
3981 {
3982 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3983 dout(7) << __func__ << " " << *m << dendl;
3984 if (m->map_epoch < info.history.same_interval_since) {
3985 dout(10) << __func__ << " discarding old from "
3986 << m->map_epoch << " < " << info.history.same_interval_since
3987 << dendl;
3988 return;
3989 }
3990 if (!scrubber.is_chunky_scrub_active()) {
3991 dout(10) << __func__ << " scrub isn't active" << dendl;
3992 return;
3993 }
3994
3995 op->mark_started();
3996
3997 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3998 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3999 dout(10) << "map version is "
4000 << scrubber.received_maps[m->from].valid_through
4001 << dendl;
4002
4003 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
4004 << dendl;
4005 assert(scrubber.waiting_on_whom.count(m->from));
4006 scrubber.waiting_on_whom.erase(m->from);
4007 if (m->preempted) {
4008 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
4009 scrub_preempted = true;
4010 }
4011 if (scrubber.waiting_on_whom.empty()) {
4012 if (ops_blocked_by_scrub()) {
4013 requeue_scrub(true);
4014 } else {
4015 requeue_scrub(false);
4016 }
4017 }
4018 }
4019
4020 void PG::sub_op_scrub_map(OpRequestRef op)
4021 {
4022 // for legacy jewel compatibility only
4023 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
4024 assert(m->get_type() == MSG_OSD_SUBOP);
4025 dout(7) << "sub_op_scrub_map" << dendl;
4026
4027 if (m->map_epoch < info.history.same_interval_since) {
4028 dout(10) << "sub_op_scrub discarding old sub_op from "
4029 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
4030 return;
4031 }
4032
4033 if (!scrubber.is_chunky_scrub_active()) {
4034 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
4035 return;
4036 }
4037
4038 op->mark_started();
4039
4040 dout(10) << " got " << m->from << " scrub map" << dendl;
4041 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
4042
4043 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4044 dout(10) << "map version is "
4045 << scrubber.received_maps[m->from].valid_through
4046 << dendl;
4047
4048 scrubber.waiting_on_whom.erase(m->from);
4049
4050 if (scrubber.waiting_on_whom.empty()) {
4051 if (ops_blocked_by_scrub()) {
4052 requeue_scrub(true);
4053 } else {
4054 requeue_scrub(false);
4055 }
4056 }
4057 }
4058
4059 // send scrub v3 messages (chunky scrub)
4060 void PG::_request_scrub_map(
4061 pg_shard_t replica, eversion_t version,
4062 hobject_t start, hobject_t end,
4063 bool deep,
4064 bool allow_preemption)
4065 {
4066 assert(replica != pg_whoami);
4067 dout(10) << "scrub requesting scrubmap from osd." << replica
4068 << " deep " << (int)deep << dendl;
4069 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4070 spg_t(info.pgid.pgid, replica.shard), version,
4071 get_osdmap()->get_epoch(),
4072 get_last_peering_reset(),
4073 start, end, deep,
4074 allow_preemption,
4075 scrubber.priority,
4076 ops_blocked_by_scrub());
4077 // default priority, we want the rep scrub processed prior to any recovery
4078 // or client io messages (we are holding a lock!)
4079 osd->send_message_osd_cluster(
4080 replica.osd, repscrubop, get_osdmap()->get_epoch());
4081 }
4082
4083 void PG::handle_scrub_reserve_request(OpRequestRef op)
4084 {
4085 dout(7) << __func__ << " " << *op->get_req() << dendl;
4086 op->mark_started();
4087 if (scrubber.reserved) {
4088 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4089 << dendl;
4090 return;
4091 }
4092 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4093 osd->inc_scrubs_pending()) {
4094 scrubber.reserved = true;
4095 } else {
4096 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4097 scrubber.reserved = false;
4098 }
4099 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
4100 const MOSDScrubReserve *m =
4101 static_cast<const MOSDScrubReserve*>(op->get_req());
4102 Message *reply = new MOSDScrubReserve(
4103 spg_t(info.pgid.pgid, primary.shard),
4104 m->map_epoch,
4105 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4106 pg_whoami);
4107 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4108 } else {
4109 // for jewel compat only
4110 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
4111 assert(req->get_type() == MSG_OSD_SUBOP);
4112 MOSDSubOpReply *reply = new MOSDSubOpReply(
4113 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
4114 ::encode(scrubber.reserved, reply->get_data());
4115 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4116 }
4117 }
4118
4119 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4120 {
4121 dout(7) << __func__ << " " << *op->get_req() << dendl;
4122 op->mark_started();
4123 if (!scrubber.reserved) {
4124 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4125 return;
4126 }
4127 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4128 dout(10) << " already had osd." << from << " reserved" << dendl;
4129 } else {
4130 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4131 scrubber.reserved_peers.insert(from);
4132 sched_scrub();
4133 }
4134 }
4135
4136 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4137 {
4138 dout(7) << __func__ << " " << *op->get_req() << dendl;
4139 op->mark_started();
4140 if (!scrubber.reserved) {
4141 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4142 return;
4143 }
4144 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4145 dout(10) << " already had osd." << from << " reserved" << dendl;
4146 } else {
4147 /* One decline stops this pg from being scheduled for scrubbing. */
4148 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4149 scrubber.reserve_failed = true;
4150 sched_scrub();
4151 }
4152 }
4153
4154 void PG::handle_scrub_reserve_release(OpRequestRef op)
4155 {
4156 dout(7) << __func__ << " " << *op->get_req() << dendl;
4157 op->mark_started();
4158 clear_scrub_reserved();
4159 }
4160
4161 void PG::reject_reservation()
4162 {
4163 osd->send_message_osd_cluster(
4164 primary.osd,
4165 new MBackfillReserve(
4166 MBackfillReserve::REJECT,
4167 spg_t(info.pgid.pgid, primary.shard),
4168 get_osdmap()->get_epoch()),
4169 get_osdmap()->get_epoch());
4170 }
4171
4172 void PG::schedule_backfill_retry(float delay)
4173 {
4174 Mutex::Locker lock(osd->recovery_request_lock);
4175 osd->recovery_request_timer.add_event_after(
4176 delay,
4177 new QueuePeeringEvt<RequestBackfill>(
4178 this, get_osdmap()->get_epoch(),
4179 RequestBackfill()));
4180 }
4181
4182 void PG::schedule_recovery_retry(float delay)
4183 {
4184 Mutex::Locker lock(osd->recovery_request_lock);
4185 osd->recovery_request_timer.add_event_after(
4186 delay,
4187 new QueuePeeringEvt<DoRecovery>(
4188 this, get_osdmap()->get_epoch(),
4189 DoRecovery()));
4190 }
4191
4192 void PG::clear_scrub_reserved()
4193 {
4194 scrubber.reserved_peers.clear();
4195 scrubber.reserve_failed = false;
4196
4197 if (scrubber.reserved) {
4198 scrubber.reserved = false;
4199 osd->dec_scrubs_pending();
4200 }
4201 }
4202
4203 void PG::scrub_reserve_replicas()
4204 {
4205 assert(backfill_targets.empty());
4206 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4207 i != actingbackfill.end();
4208 ++i) {
4209 if (*i == pg_whoami) continue;
4210 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4211 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4212 osd->send_message_osd_cluster(
4213 i->osd,
4214 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4215 get_osdmap()->get_epoch(),
4216 MOSDScrubReserve::REQUEST, pg_whoami),
4217 get_osdmap()->get_epoch());
4218 } else {
4219 // for jewel compat only
4220 vector<OSDOp> scrub(1);
4221 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
4222 hobject_t poid;
4223 eversion_t v;
4224 osd_reqid_t reqid;
4225 MOSDSubOp *subop = new MOSDSubOp(
4226 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4227 get_osdmap()->get_epoch(), osd->get_tid(), v);
4228 subop->ops = scrub;
4229 osd->send_message_osd_cluster(
4230 i->osd, subop, get_osdmap()->get_epoch());
4231 }
4232 }
4233 }
4234
4235 void PG::scrub_unreserve_replicas()
4236 {
4237 assert(backfill_targets.empty());
4238 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4239 i != actingbackfill.end();
4240 ++i) {
4241 if (*i == pg_whoami) continue;
4242 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4243 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4244 osd->send_message_osd_cluster(
4245 i->osd,
4246 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4247 get_osdmap()->get_epoch(),
4248 MOSDScrubReserve::RELEASE, pg_whoami),
4249 get_osdmap()->get_epoch());
4250 } else {
4251 // for jewel compat only
4252 vector<OSDOp> scrub(1);
4253 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
4254 hobject_t poid;
4255 eversion_t v;
4256 osd_reqid_t reqid;
4257 MOSDSubOp *subop = new MOSDSubOp(
4258 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4259 get_osdmap()->get_epoch(), osd->get_tid(), v);
4260 subop->ops = scrub;
4261 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
4262 }
4263 }
4264 }
4265
4266 void PG::_scan_rollback_obs(
4267 const vector<ghobject_t> &rollback_obs,
4268 ThreadPool::TPHandle &handle)
4269 {
4270 ObjectStore::Transaction t;
4271 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4272 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4273 i != rollback_obs.end();
4274 ++i) {
4275 if (i->generation < trimmed_to.version) {
4276 osd->clog->error() << "osd." << osd->whoami
4277 << " pg " << info.pgid
4278 << " found obsolete rollback obj "
4279 << *i << " generation < trimmed_to "
4280 << trimmed_to
4281 << "...repaired";
4282 t.remove(coll, *i);
4283 }
4284 }
4285 if (!t.empty()) {
4286 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4287 << dendl;
4288 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4289 }
4290 }
4291
4292 void PG::_scan_snaps(ScrubMap &smap)
4293 {
4294 hobject_t head;
4295 SnapSet snapset;
4296
4297 // The test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4298 // that the caller is using clean_meta_map() and that it works properly.
4299 dout(20) << __func__ << " start" << dendl;
4300
4301 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4302 i != smap.objects.rend();
4303 ++i) {
4304 const hobject_t &hoid = i->first;
4305 ScrubMap::object &o = i->second;
4306
4307 dout(20) << __func__ << " " << hoid << dendl;
4308
4309 if (hoid.is_head() || hoid.is_snapdir()) {
4310 // parse the SnapSet
4311 bufferlist bl;
4312 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4313 continue;
4314 }
4315 bl.push_back(o.attrs[SS_ATTR]);
4316 auto p = bl.begin();
4317 try {
4318 ::decode(snapset, p);
4319 } catch(...) {
4320 continue;
4321 }
4322 head = hoid.get_head();
4323 // Make sure head_exists is correct for is_legacy() check
4324 if (hoid.is_head())
4325 snapset.head_exists = true;
4326 continue;
4327 }
4328 if (hoid.snap < CEPH_MAXSNAP) {
4329 // check and if necessary fix snap_mapper
4330 if (hoid.get_head() != head) {
4331 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4332 << dendl;
4333 continue;
4334 }
4335 set<snapid_t> obj_snaps;
4336 if (!snapset.is_legacy()) {
4337 auto p = snapset.clone_snaps.find(hoid.snap);
4338 if (p == snapset.clone_snaps.end()) {
4339 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4340 << dendl;
4341 continue;
4342 }
4343 obj_snaps.insert(p->second.begin(), p->second.end());
4344 } else {
4345 bufferlist bl;
4346 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4347 continue;
4348 }
4349 bl.push_back(o.attrs[OI_ATTR]);
4350 object_info_t oi;
4351 try {
4352 oi.decode(bl);
4353 } catch(...) {
4354 continue;
4355 }
4356 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4357 }
4358 set<snapid_t> cur_snaps;
4359 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4360 if (r != 0 && r != -ENOENT) {
4361 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4362 ceph_abort();
4363 }
4364 if (r == -ENOENT || cur_snaps != obj_snaps) {
4365 ObjectStore::Transaction t;
4366 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4367 if (r == 0) {
4368 r = snap_mapper.remove_oid(hoid, &_t);
4369 if (r != 0) {
4370 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4371 << dendl;
4372 ceph_abort();
4373 }
4374 osd->clog->error() << "osd." << osd->whoami
4375 << " found snap mapper error on pg "
4376 << info.pgid
4377 << " oid " << hoid << " snaps in mapper: "
4378 << cur_snaps << ", oi: "
4379 << obj_snaps
4380 << "...repaired";
4381 } else {
4382 osd->clog->error() << "osd." << osd->whoami
4383 << " found snap mapper error on pg "
4384 << info.pgid
4385 << " oid " << hoid << " snaps missing in mapper"
4386 << ", should be: "
4387 << obj_snaps
4388 << "...repaired";
4389 }
4390 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4391
4392 // wait for repair to apply to avoid confusing other bits of the system.
4393 {
4394 Cond my_cond;
4395 Mutex my_lock("PG::_scan_snaps my_lock");
4396 int r = 0;
4397 bool done;
4398 t.register_on_applied_sync(
4399 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4400 r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4401 if (r != 0) {
4402 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4403 << dendl;
4404 } else {
4405 my_lock.Lock();
4406 while (!done)
4407 my_cond.Wait(my_lock);
4408 my_lock.Unlock();
4409 }
4410 }
4411 }
4412 }
4413 }
4414 }
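// ---------------------------------------------------------------------------
// [editor's note] Hypothetical sketch (not part of the original PG.cc) of the
// synchronous wait-for-apply pattern at the end of _scan_snaps(): a repair
// transaction is queued with an "on applied" callback and the caller blocks
// until it fires. Shown here with std::mutex/std::condition_variable;
// submit_async() is an assumed stand-in for queueing a transaction with a
// completion callback.
#include <condition_variable>
#include <functional>
#include <mutex>

static void wait_for_completion(
  const std::function<void(std::function<void()>)> &submit_async)
{
  std::mutex m;
  std::condition_variable cv;
  bool done = false;

  // hand the completion callback to the asynchronous submitter
  submit_async([&] {
    std::lock_guard<std::mutex> l(m);
    done = true;
    cv.notify_one();
  });

  // block until the callback fires, like the my_cond.Wait(my_lock) loop above
  std::unique_lock<std::mutex> l(m);
  cv.wait(l, [&] { return done; });
}
// ---------------------------------------------------------------------------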
4415
4416 void PG::_repair_oinfo_oid(ScrubMap &smap)
4417 {
4418 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4419 i != smap.objects.rend();
4420 ++i) {
4421 const hobject_t &hoid = i->first;
4422 ScrubMap::object &o = i->second;
4423
4424 bufferlist bl;
4425 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4426 continue;
4427 }
4428 bl.push_back(o.attrs[OI_ATTR]);
4429 object_info_t oi;
4430 try {
4431 oi.decode(bl);
4432 } catch(...) {
4433 continue;
4434 }
4435 if (oi.soid != hoid) {
4436 ObjectStore::Transaction t;
4437 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4438 osd->clog->error() << "osd." << osd->whoami
4439 << " found object info error on pg "
4440 << info.pgid
4441 << " oid " << hoid << " oid in object info: "
4442 << oi.soid
4443 << "...repaired";
4444 // Fix object info
4445 oi.soid = hoid;
4446 bl.clear();
4447 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4448
4449 bufferptr bp(bl.c_str(), bl.length());
4450 o.attrs[OI_ATTR] = bp;
4451
4452 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4453 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4454 if (r != 0) {
4455 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4456 << dendl;
4457 }
4458 }
4459 }
4460 }
4461 int PG::build_scrub_map_chunk(
4462 ScrubMap &map,
4463 ScrubMapBuilder &pos,
4464 hobject_t start,
4465 hobject_t end,
4466 bool deep,
4467 ThreadPool::TPHandle &handle)
4468 {
4469 dout(10) << __func__ << " [" << start << "," << end << ") "
4470 << " pos " << pos
4471 << dendl;
4472
4473 // start
4474 while (pos.empty()) {
4475 pos.deep = deep;
4476 map.valid_through = info.last_update;
4477 osr->flush();
4478
4479 // objects
4480 vector<ghobject_t> rollback_obs;
4481 pos.ret = get_pgbackend()->objects_list_range(
4482 start,
4483 end,
4484 0,
4485 &pos.ls,
4486 &rollback_obs);
4487 if (pos.ret < 0) {
4488 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4489 return pos.ret;
4490 }
4491 if (pos.ls.empty()) {
4492 break;
4493 }
4494 _scan_rollback_obs(rollback_obs, handle);
4495 pos.pos = 0;
4496 return -EINPROGRESS;
4497 }
4498
4499 // scan objects
4500 while (!pos.done()) {
4501 int r = get_pgbackend()->be_scan_list(map, pos);
4502 if (r == -EINPROGRESS) {
4503 return r;
4504 }
4505 }
4506
4507 // finish
4508 dout(20) << __func__ << " finishing" << dendl;
4509 assert(pos.done());
4510 _repair_oinfo_oid(map);
4511 if (!is_primary()) {
4512 ScrubMap for_meta_scrub;
4513 // In case we restarted a smaller chunk, clear old data
4514 scrubber.cleaned_meta_map.clear_from(scrubber.start);
4515 scrubber.cleaned_meta_map.insert(map);
4516 scrubber.clean_meta_map(for_meta_scrub);
4517 _scan_snaps(for_meta_scrub);
4518 }
4519
4520 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
4521 << dendl;
4522 return 0;
4523 }
4524
4525 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4526 if (!store)
4527 return;
4528 struct OnComplete : Context {
4529 std::unique_ptr<Scrub::Store> store;
4530 OnComplete(
4531 std::unique_ptr<Scrub::Store> &&store)
4532 : store(std::move(store)) {}
4533 void finish(int) override {}
4534 };
4535 store->cleanup(t);
4536 t->register_on_complete(new OnComplete(std::move(store)));
4537 assert(!store);
4538 }
4539
4540 void PG::repair_object(
4541 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4542 pg_shard_t bad_peer)
4543 {
4544 list<pg_shard_t> op_shards;
4545 for (auto i : *ok_peers) {
4546 op_shards.push_back(i.second);
4547 }
4548 dout(10) << "repair_object " << soid << " bad_peer osd."
4549 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4550 ScrubMap::object &po = ok_peers->back().first;
4551 eversion_t v;
4552 bufferlist bv;
4553 bv.push_back(po.attrs[OI_ATTR]);
4554 object_info_t oi;
4555 try {
4556 bufferlist::iterator bliter = bv.begin();
4557 ::decode(oi, bliter);
4558 } catch (...) {
4559 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4560 assert(0);
4561 }
4562 if (bad_peer != primary) {
4563 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4564 } else {
4565 // We should only be scrubbing if the PG is clean.
4566 assert(waiting_for_unreadable_object.empty());
4567
4568 pg_log.missing_add(soid, oi.version, eversion_t());
4569
4570 pg_log.set_last_requested(0);
4571 dout(10) << __func__ << ": primary = " << primary << dendl;
4572 }
4573
4574 if (is_ec_pg() || bad_peer == primary) {
4575 // we'd better collect all shards for an EC pg, and prepare good peers as the
4576 // source of the pull in the case of a replicated pg.
4577 missing_loc.add_missing(soid, oi.version, eversion_t());
4578 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4579 for (i = ok_peers->begin();
4580 i != ok_peers->end();
4581 ++i)
4582 missing_loc.add_location(soid, i->second);
4583 }
4584 }
4585
4586 /* replica_scrub
4587 *
4588 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4589 * for pushes to complete in case of recent recovery. Build a single
4590 * scrubmap of objects that are in the range [msg->start, msg->end).
4591 */
4592 void PG::replica_scrub(
4593 OpRequestRef op,
4594 ThreadPool::TPHandle &handle)
4595 {
4596 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4597 assert(!scrubber.active_rep_scrub);
4598 dout(7) << "replica_scrub" << dendl;
4599
4600 if (msg->map_epoch < info.history.same_interval_since) {
4601 dout(10) << "replica_scrub discarding old replica_scrub from "
4602 << msg->map_epoch << " < " << info.history.same_interval_since
4603 << dendl;
4604 return;
4605 }
4606
4607 assert(msg->chunky);
4608 if (last_update_applied < msg->scrub_to) {
4609 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4610 scrubber.active_rep_scrub = op;
4611 return;
4612 }
4613
4614 if (active_pushes > 0) {
4615 dout(10) << "waiting for active pushes to finish" << dendl;
4616 scrubber.active_rep_scrub = op;
4617 return;
4618 }
4619
4620 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
4621 scrubber.replica_scrub_start = msg->min_epoch;
4622 scrubber.start = msg->start;
4623 scrubber.end = msg->end;
4624 scrubber.max_end = msg->end;
4625 scrubber.deep = msg->deep;
4626 scrubber.epoch_start = info.history.same_interval_since;
4627 if (msg->priority) {
4628 scrubber.priority = msg->priority;
4629 } else {
4630 scrubber.priority = get_scrub_priority();
4631 }
4632
4633 scrub_can_preempt = msg->allow_preemption;
4634 scrub_preempted = false;
4635 scrubber.replica_scrubmap_pos.reset();
4636
4637 requeue_scrub(msg->high_priority);
4638 }
4639
4640 /* Scrub:
4641 * PG_STATE_SCRUBBING is set when the scrub is queued
4642 *
4643 * scrub will be chunky if all OSDs in PG support chunky scrub
4644 * scrub will fail if OSDs are too old.
4645 */
4646 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4647 {
4648 if (cct->_conf->osd_scrub_sleep > 0 &&
4649 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4650 scrubber.state == PG::Scrubber::INACTIVE) &&
4651 scrubber.needs_sleep) {
4652 ceph_assert(!scrubber.sleeping);
4653 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4654
4655 // Do an async sleep so we don't block the op queue
4656 OSDService *osds = osd;
4657 spg_t pgid = get_pgid();
4658 int state = scrubber.state;
4659 auto scrub_requeue_callback =
4660 new FunctionContext([osds, pgid, state](int r) {
4661 PG *pg = osds->osd->lookup_lock_pg(pgid);
4662 if (pg == nullptr) {
4663 lgeneric_dout(osds->osd->cct, 20)
4664 << "scrub_requeue_callback: Could not find "
4665 << "PG " << pgid << " can't complete scrub requeue after sleep"
4666 << dendl;
4667 return;
4668 }
4669 pg->scrubber.sleeping = false;
4670 pg->scrubber.needs_sleep = false;
4671 lgeneric_dout(pg->cct, 20)
4672 << "scrub_requeue_callback: slept for "
4673 << ceph_clock_now() - pg->scrubber.sleep_start
4674 << ", re-queuing scrub with state " << state << dendl;
4675 pg->scrub_queued = false;
4676 pg->requeue_scrub();
4677 pg->scrubber.sleep_start = utime_t();
4678 pg->unlock();
4679 });
4680 Mutex::Locker l(osd->scrub_sleep_lock);
4681 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4682 scrub_requeue_callback);
4683 scrubber.sleeping = true;
4684 scrubber.sleep_start = ceph_clock_now();
4685 return;
4686 }
4687 if (pg_has_reset_since(queued)) {
4688 return;
4689 }
4690 assert(scrub_queued);
4691 scrub_queued = false;
4692 scrubber.needs_sleep = true;
4693
4694 // for the replica
4695 if (!is_primary() &&
4696 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
4697 chunky_scrub(handle);
4698 return;
4699 }
4700
4701 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4702 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4703 state_clear(PG_STATE_SCRUBBING);
4704 state_clear(PG_STATE_REPAIR);
4705 state_clear(PG_STATE_DEEP_SCRUB);
4706 publish_stats_to_osd();
4707 return;
4708 }
4709
4710 if (!scrubber.active) {
4711 assert(backfill_targets.empty());
4712
4713 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4714
4715 dout(10) << "starting a new chunky scrub" << dendl;
4716 }
4717
4718 chunky_scrub(handle);
4719 }
4720
4721 /*
4722 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4723 * chunk.
4724 *
4725 * The object store is partitioned into chunks which end on hash boundaries. For
4726 * each chunk, the following logic is performed:
4727 *
4728 * (1) Block writes on the chunk
4729 * (2) Request maps from replicas
4730 * (3) Wait for pushes to be applied (after recovery)
4731 * (4) Wait for writes to flush on the chunk
4732 * (5) Wait for maps from replicas
4733 * (6) Compare / repair all scrub maps
4734 * (7) Wait for digest updates to apply
4735 *
4736 * This logic is encoded in the mostly linear state machine:
4737 *
4738 * +------------------+
4739 * _________v__________ |
4740 * | | |
4741 * | INACTIVE | |
4742 * |____________________| |
4743 * | |
4744 * | +----------+ |
4745 * _________v___v______ | |
4746 * | | | |
4747 * | NEW_CHUNK | | |
4748 * |____________________| | |
4749 * | | |
4750 * _________v__________ | |
4751 * | | | |
4752 * | WAIT_PUSHES | | |
4753 * |____________________| | |
4754 * | | |
4755 * _________v__________ | |
4756 * | | | |
4757 * | WAIT_LAST_UPDATE | | |
4758 * |____________________| | |
4759 * | | |
4760 * _________v__________ | |
4761 * | | | |
4762 * | BUILD_MAP | | |
4763 * |____________________| | |
4764 * | | |
4765 * _________v__________ | |
4766 * | | | |
4767 * | WAIT_REPLICAS | | |
4768 * |____________________| | |
4769 * | | |
4770 * _________v__________ | |
4771 * | | | |
4772 * | COMPARE_MAPS | | |
4773 * |____________________| | |
4774 * | | |
4775 * | | |
4776 * _________v__________ | |
4777 * | | | |
4778 * |WAIT_DIGEST_UPDATES | | |
4779 * |____________________| | |
4780 * | | | |
4781 * | +----------+ |
4782 * _________v__________ |
4783 * | | |
4784 * | FINISH | |
4785 * |____________________| |
4786 * | |
4787 * +------------------+
4788 *
4789 * The primary determines the last update from the subset by walking the log. If
4790 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4791 * to wait until that update is applied before building a scrub map. Both the
4792 * primary and replicas will wait for any active pushes to be applied.
4793 *
4794 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4795 *
4796 * scrubber.state encodes the current state of the scrub (refer to state diagram
4797 * for details).
4798 */
4799 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4800 {
4801 // check for map changes
4802 if (scrubber.is_chunky_scrub_active()) {
4803 if (scrubber.epoch_start != info.history.same_interval_since) {
4804 dout(10) << "scrub pg changed, aborting" << dendl;
4805 scrub_clear_state();
4806 scrub_unreserve_replicas();
4807 return;
4808 }
4809 }
4810
4811 bool done = false;
4812 int ret;
4813
4814 while (!done) {
4815 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4816 << " [" << scrubber.start << "," << scrubber.end << ")"
4817 << " max_end " << scrubber.max_end << dendl;
4818
4819 switch (scrubber.state) {
4820 case PG::Scrubber::INACTIVE:
4821 dout(10) << "scrub start" << dendl;
4822 assert(is_primary());
4823
4824 publish_stats_to_osd();
4825 scrubber.epoch_start = info.history.same_interval_since;
4826 scrubber.active = true;
4827
4828 osd->inc_scrubs_active(scrubber.reserved);
4829 if (scrubber.reserved) {
4830 scrubber.reserved = false;
4831 scrubber.reserved_peers.clear();
4832 }
4833
4834 {
4835 ObjectStore::Transaction t;
4836 scrubber.cleanup_store(&t);
4837 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4838 info.pgid, coll));
4839 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4840 }
4841
4842 // Don't include temporary objects when scrubbing
4843 scrubber.start = info.pgid.pgid.get_hobj_start();
4844 scrubber.state = PG::Scrubber::NEW_CHUNK;
4845
4846 {
4847 bool repair = state_test(PG_STATE_REPAIR);
4848 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4849 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4850 stringstream oss;
4851 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4852 osd->clog->debug(oss);
4853 }
4854
4855 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
4856 "osd_scrub_max_preemptions");
4857 scrubber.preempt_divisor = 1;
4858 break;
4859
4860 case PG::Scrubber::NEW_CHUNK:
4861 scrubber.primary_scrubmap = ScrubMap();
4862 scrubber.received_maps.clear();
4863
4864 // begin (possible) preemption window
4865 if (scrub_preempted) {
4866 scrubber.preempt_left--;
4867 scrubber.preempt_divisor *= 2;
4868 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
4869 << " left" << dendl;
4870 scrub_preempted = false;
4871 }
4872 scrub_can_preempt = scrubber.preempt_left > 0;
4873
4874 {
4875 /* get the start and end of our scrub chunk
4876 *
4877 * Our scrub chunk has an important restriction we're going to need to
4878 * respect. We can't let head or snapdir be start or end.
4879 * Using a half-open interval means that if end == head|snapdir,
4880 * we'd scrub/lock head and the clone right next to head in different
4881 * chunks which would allow us to miss clones created between
4882 * scrubbing that chunk and scrubbing the chunk including head.
4883 * This isn't true for any of the other clones since clones can
4884 * only be created "just to the left of" head. There is one exception
4885 * to this: promotion of clones which always happens to the left of the
4886 * left-most clone, but promote_object checks the scrubber in that
4887 * case, so it should be ok. Also, it's ok to "miss" clones at the
4888 * left end of the range if we are a tier because they may legitimately
4889 * not exist (see _scrub).
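 * (An illustrative sketch of this boundary-trimming rule appears after chunky_scrub(), below.)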
4890 */
4891 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
4892 scrubber.preempt_divisor);
4893 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
4894 scrubber.preempt_divisor);
4895 hobject_t start = scrubber.start;
4896 hobject_t candidate_end;
4897 vector<hobject_t> objects;
4898 osr->flush();
4899 ret = get_pgbackend()->objects_list_partial(
4900 start,
4901 min,
4902 max,
4903 &objects,
4904 &candidate_end);
4905 assert(ret >= 0);
4906
4907 if (!objects.empty()) {
4908 hobject_t back = objects.back();
4909 while (candidate_end.has_snapset() &&
4910 candidate_end.get_head() == back.get_head()) {
4911 candidate_end = back;
4912 objects.pop_back();
4913 if (objects.empty()) {
4914 assert(0 ==
4915 "Somehow we got more than 2 objects which "
4916 "have the same head but are not clones");
4917 }
4918 back = objects.back();
4919 }
4920 if (candidate_end.has_snapset()) {
4921 assert(candidate_end.get_head() != back.get_head());
4922 candidate_end = candidate_end.get_object_boundary();
4923 }
4924 } else {
4925 assert(candidate_end.is_max());
4926 }
4927
4928 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4929 // we'll be requeued by whatever made us unavailable for scrub
4930 dout(10) << __func__ << ": scrub blocked somewhere in range "
4931 << "[" << scrubber.start << ", " << candidate_end << ")"
4932 << dendl;
4933 done = true;
4934 break;
4935 }
4936 scrubber.end = candidate_end;
4937 if (scrubber.end > scrubber.max_end)
4938 scrubber.max_end = scrubber.end;
4939 }
4940
4941 // walk the log to find the latest update that affects our chunk
4942 scrubber.subset_last_update = eversion_t();
4943 for (auto p = projected_log.log.rbegin();
4944 p != projected_log.log.rend();
4945 ++p) {
4946 if (p->soid >= scrubber.start &&
4947 p->soid < scrubber.end) {
4948 scrubber.subset_last_update = p->version;
4949 break;
4950 }
4951 }
4952 if (scrubber.subset_last_update == eversion_t()) {
4953 for (list<pg_log_entry_t>::const_reverse_iterator p =
4954 pg_log.get_log().log.rbegin();
4955 p != pg_log.get_log().log.rend();
4956 ++p) {
4957 if (p->soid >= scrubber.start &&
4958 p->soid < scrubber.end) {
4959 scrubber.subset_last_update = p->version;
4960 break;
4961 }
4962 }
4963 }
4964
4965 // ask replicas to wait until
4966 // last_update_applied >= scrubber.subset_last_update and then scan
4967 scrubber.waiting_on_whom.insert(pg_whoami);
4968
4969 // request maps from replicas
4970 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4971 i != actingbackfill.end();
4972 ++i) {
4973 if (*i == pg_whoami) continue;
4974 _request_scrub_map(*i, scrubber.subset_last_update,
4975 scrubber.start, scrubber.end, scrubber.deep,
4976 scrubber.preempt_left > 0);
4977 scrubber.waiting_on_whom.insert(*i);
4978 }
4979 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
4980 << dendl;
4981
4982 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4983 break;
4984
4985 case PG::Scrubber::WAIT_PUSHES:
4986 if (active_pushes == 0) {
4987 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4988 } else {
4989 dout(15) << "wait for pushes to apply" << dendl;
4990 done = true;
4991 }
4992 break;
4993
4994 case PG::Scrubber::WAIT_LAST_UPDATE:
4995 if (last_update_applied < scrubber.subset_last_update) {
4996 // will be requeued by op_applied
4997 dout(15) << "wait for writes to flush" << dendl;
4998 done = true;
4999 break;
5000 }
5001
5002 scrubber.state = PG::Scrubber::BUILD_MAP;
5003 scrubber.primary_scrubmap_pos.reset();
5004 break;
5005
5006 case PG::Scrubber::BUILD_MAP:
5007 assert(last_update_applied >= scrubber.subset_last_update);
5008
5009 // build my own scrub map
5010 if (scrub_preempted) {
5011 dout(10) << __func__ << " preempted" << dendl;
5012 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5013 break;
5014 }
5015 ret = build_scrub_map_chunk(
5016 scrubber.primary_scrubmap,
5017 scrubber.primary_scrubmap_pos,
5018 scrubber.start, scrubber.end,
5019 scrubber.deep,
5020 handle);
5021 if (ret == -EINPROGRESS) {
5022 requeue_scrub();
5023 done = true;
5024 break;
5025 }
5026 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5027 break;
5028
5029 case PG::Scrubber::BUILD_MAP_DONE:
5030 if (scrubber.primary_scrubmap_pos.ret < 0) {
5031 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5032 << ", aborting" << dendl;
5033 scrub_clear_state();
5034 scrub_unreserve_replicas();
5035 return;
5036 }
5037 dout(10) << __func__ << " waiting_on_whom was "
5038 << scrubber.waiting_on_whom << dendl;
5039 assert(scrubber.waiting_on_whom.count(pg_whoami));
5040 scrubber.waiting_on_whom.erase(pg_whoami);
5041
5042 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5043 break;
5044
5045 case PG::Scrubber::WAIT_REPLICAS:
5046 if (!scrubber.waiting_on_whom.empty()) {
5047 // will be requeued by sub_op_scrub_map
5048 dout(10) << "wait for replicas to build scrub map" << dendl;
5049 done = true;
5050 break;
5051 }
5052 // end (possible) preemption window
5053 scrub_can_preempt = false;
5054 if (scrub_preempted) {
5055 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5056 scrubber.state = PG::Scrubber::NEW_CHUNK;
5057 } else {
5058 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5059 }
5060 break;
5061
5062 case PG::Scrubber::COMPARE_MAPS:
5063 assert(last_update_applied >= scrubber.subset_last_update);
5064 assert(scrubber.waiting_on_whom.empty());
5065
5066 scrub_compare_maps();
5067 scrubber.start = scrubber.end;
5068 scrubber.run_callbacks();
5069
5070 // requeue the writes from the chunk that just finished
5071 requeue_ops(waiting_for_scrub);
5072
5073 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5074
5075 // fall-thru
5076
5077 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5078 if (scrubber.num_digest_updates_pending) {
5079 dout(10) << __func__ << " waiting on "
5080 << scrubber.num_digest_updates_pending
5081 << " digest updates" << dendl;
5082 done = true;
5083 break;
5084 }
5085
5086 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
5087 "osd_scrub_max_preemptions");
5088 scrubber.preempt_divisor = 1;
5089
5090 if (!(scrubber.end.is_max())) {
5091 scrubber.state = PG::Scrubber::NEW_CHUNK;
5092 requeue_scrub();
5093 done = true;
5094 } else {
5095 scrubber.state = PG::Scrubber::FINISH;
5096 }
5097
5098 break;
5099
5100 case PG::Scrubber::FINISH:
5101 scrub_finish();
5102 scrubber.state = PG::Scrubber::INACTIVE;
5103 done = true;
5104
5105 if (!snap_trimq.empty()) {
5106 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5107 snap_trimmer_scrub_complete();
5108 }
5109
5110 break;
5111
5112 case PG::Scrubber::BUILD_MAP_REPLICA:
5113 // build my own scrub map
5114 if (scrub_preempted) {
5115 dout(10) << __func__ << " preempted" << dendl;
5116 ret = 0;
5117 } else {
5118 ret = build_scrub_map_chunk(
5119 scrubber.replica_scrubmap,
5120 scrubber.replica_scrubmap_pos,
5121 scrubber.start, scrubber.end,
5122 scrubber.deep,
5123 handle);
5124 }
5125 if (ret == -EINPROGRESS) {
5126 requeue_scrub();
5127 done = true;
5128 break;
5129 }
5130 // reply
5131 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
5132 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5133 spg_t(info.pgid.pgid, get_primary().shard),
5134 scrubber.replica_scrub_start,
5135 pg_whoami);
5136 reply->preempted = scrub_preempted;
5137 ::encode(scrubber.replica_scrubmap, reply->get_data());
5138 osd->send_message_osd_cluster(
5139 get_primary().osd, reply,
5140 scrubber.replica_scrub_start);
5141 } else {
5142 // for jewel compatibility
5143 vector<OSDOp> scrub(1);
5144 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
5145 hobject_t poid;
5146 eversion_t v;
5147 osd_reqid_t reqid;
5148 MOSDSubOp *subop = new MOSDSubOp(
5149 reqid,
5150 pg_whoami,
5151 spg_t(info.pgid.pgid, get_primary().shard),
5152 poid,
5153 0,
5154 scrubber.replica_scrub_start,
5155 osd->get_tid(),
5156 v);
5157 ::encode(scrubber.replica_scrubmap, subop->get_data());
5158 subop->ops = scrub;
5159 osd->send_message_osd_cluster(
5160 get_primary().osd, subop,
5161 scrubber.replica_scrub_start);
5162 }
5163 scrub_preempted = false;
5164 scrub_can_preempt = false;
5165 scrubber.state = PG::Scrubber::INACTIVE;
5166 scrubber.replica_scrubmap = ScrubMap();
5167 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5168 scrubber.start = hobject_t();
5169 scrubber.end = hobject_t();
5170 scrubber.max_end = hobject_t();
5171 done = true;
5172 break;
5173
5174 default:
5175 ceph_abort();
5176 }
5177 }
5178 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5179 << " [" << scrubber.start << "," << scrubber.end << ")"
5180 << " max_end " << scrubber.max_end << dendl;
5181 }
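// ---------------------------------------------------------------------------
// [editor's note] Hypothetical sketch (not part of the original PG.cc) of the
// chunk-boundary rule described in the NEW_CHUNK comment inside chunky_scrub()
// above: the candidate end of a chunk is walked back until it no longer shares
// a head with the last listed object, so a head and its clones always fall in
// the same chunk. A simplified object model (head id plus a head flag) stands
// in for hobject_t; the real code additionally asserts that the list never
// runs dry and then snaps the boundary to an object boundary.
#include <string>
#include <vector>

struct MiniObj {
  std::string head;  // identifies the head this object belongs to
  bool is_head;      // true for head/snapdir, false for a clone
};

static MiniObj pick_chunk_end(std::vector<MiniObj> objects, MiniObj candidate_end)
{
  // While the candidate boundary would separate a head from the clones that
  // were just listed, pull the boundary back to the previous object.
  while (candidate_end.is_head &&
         !objects.empty() &&
         candidate_end.head == objects.back().head) {
    candidate_end = objects.back();
    objects.pop_back();
  }
  return candidate_end;
}
// ---------------------------------------------------------------------------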
5182
5183 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5184 {
5185 if (soid < scrubber.start || soid >= scrubber.end) {
5186 return false;
5187 }
5188 if (scrub_can_preempt) {
5189 if (!scrub_preempted) {
5190 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5191 scrub_preempted = true;
5192 } else {
5193 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5194 }
5195 return false;
5196 }
5197 return true;
5198 }
5199
5200 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5201 {
5202 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5203 return (start < scrubber.max_end &&
5204 end >= scrubber.start);
5205 }
5206
5207 void PG::scrub_clear_state()
5208 {
5209 assert(is_locked());
5210 state_clear(PG_STATE_SCRUBBING);
5211 state_clear(PG_STATE_REPAIR);
5212 state_clear(PG_STATE_DEEP_SCRUB);
5213 publish_stats_to_osd();
5214
5215 // active -> nothing.
5216 if (scrubber.active)
5217 osd->dec_scrubs_active();
5218
5219 requeue_ops(waiting_for_scrub);
5220
5221 scrubber.reset();
5222
5223 // type-specific state clear
5224 _scrub_clear_state();
5225 }
5226
5227 void PG::scrub_compare_maps()
5228 {
5229 dout(10) << __func__ << " has maps, analyzing" << dendl;
5230
5231 // construct authoritative scrub map for type specific scrubbing
5232 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5233 map<hobject_t,
5234 pair<boost::optional<uint32_t>,
5235 boost::optional<uint32_t>>> missing_digest;
5236
5237 map<pg_shard_t, ScrubMap *> maps;
5238 maps[pg_whoami] = &scrubber.primary_scrubmap;
5239
5240 for (const auto& i : actingbackfill) {
5241 if (i == pg_whoami) continue;
5242 dout(2) << __func__ << " replica " << i << " has "
5243 << scrubber.received_maps[i].objects.size()
5244 << " items" << dendl;
5245 maps[i] = &scrubber.received_maps[i];
5246 }
5247
5248 set<hobject_t> master_set;
5249
5250 // Construct master set
5251 for (const auto map : maps) {
5252 for (const auto i : map.second->objects) {
5253 master_set.insert(i.first);
5254 }
5255 }
5256
5257 stringstream ss;
5258 get_pgbackend()->be_large_omap_check(maps, master_set,
5259 scrubber.large_omap_objects, ss);
5260 if (!ss.str().empty()) {
5261 osd->clog->warn(ss);
5262 }
5263
5264 if (acting.size() > 1) {
5265 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5266
5267 // Map from object with errors to good peer
5268 map<hobject_t, list<pg_shard_t>> authoritative;
5269
5270 dout(2) << __func__ << " osd." << acting[0] << " has "
5271 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5272
5273 ss.str("");
5274 ss.clear();
5275
5276 get_pgbackend()->be_compare_scrubmaps(
5277 maps,
5278 master_set,
5279 state_test(PG_STATE_REPAIR),
5280 scrubber.missing,
5281 scrubber.inconsistent,
5282 authoritative,
5283 missing_digest,
5284 scrubber.shallow_errors,
5285 scrubber.deep_errors,
5286 scrubber.store.get(),
5287 info.pgid, acting,
5288 ss);
5289 dout(2) << ss.str() << dendl;
5290
5291 if (!ss.str().empty()) {
5292 osd->clog->error(ss);
5293 }
5294
5295 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5296 i != authoritative.end();
5297 ++i) {
5298 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5299 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5300 j != i->second.end();
5301 ++j) {
5302 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5303 }
5304 scrubber.authoritative.insert(
5305 make_pair(
5306 i->first,
5307 good_peers));
5308 }
5309
5310 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5311 i != authoritative.end();
5312 ++i) {
5313 scrubber.cleaned_meta_map.objects.erase(i->first);
5314 scrubber.cleaned_meta_map.objects.insert(
5315 *(maps[i->second.back()]->objects.find(i->first))
5316 );
5317 }
5318 }
5319
5320 ScrubMap for_meta_scrub;
5321 scrubber.clean_meta_map(for_meta_scrub);
5322
5323 // ok, do the pg-type specific scrubbing
5324 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5325 // Called here on the primary so it can use the authoritative map; a replica calls _scan_snaps() from build_scrub_map_chunk() with its own map instead
5326 _scan_snaps(for_meta_scrub);
5327 if (!scrubber.store->empty()) {
5328 if (state_test(PG_STATE_REPAIR)) {
5329 dout(10) << __func__ << ": discarding scrub results" << dendl;
5330 scrubber.store->flush(nullptr);
5331 } else {
5332 dout(10) << __func__ << ": updating scrub object" << dendl;
5333 ObjectStore::Transaction t;
5334 scrubber.store->flush(&t);
5335 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
5336 }
5337 }
5338 }
5339
5340 bool PG::scrub_process_inconsistent()
5341 {
5342 dout(10) << __func__ << ": checking authoritative" << dendl;
5343 bool repair = state_test(PG_STATE_REPAIR);
5344 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5345 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5346
5347 // scrubber.authoritative only stores objects which are missing or inconsistent.
5348 if (!scrubber.authoritative.empty()) {
5349 stringstream ss;
5350 ss << info.pgid << " " << mode << " "
5351 << scrubber.missing.size() << " missing, "
5352 << scrubber.inconsistent.size() << " inconsistent objects";
5353 dout(2) << ss.str() << dendl;
5354 osd->clog->error(ss);
5355 if (repair) {
5356 state_clear(PG_STATE_CLEAN);
5357 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5358 scrubber.authoritative.begin();
5359 i != scrubber.authoritative.end();
5360 ++i) {
5361 set<pg_shard_t>::iterator j;
5362
5363 auto missing_entry = scrubber.missing.find(i->first);
5364 if (missing_entry != scrubber.missing.end()) {
5365 for (j = missing_entry->second.begin();
5366 j != missing_entry->second.end();
5367 ++j) {
5368 repair_object(
5369 i->first,
5370 &(i->second),
5371 *j);
5372 ++scrubber.fixed;
5373 }
5374 }
5375 if (scrubber.inconsistent.count(i->first)) {
5376 for (j = scrubber.inconsistent[i->first].begin();
5377 j != scrubber.inconsistent[i->first].end();
5378 ++j) {
5379 repair_object(i->first,
5380 &(i->second),
5381 *j);
5382 ++scrubber.fixed;
5383 }
5384 }
5385 }
5386 }
5387 }
5388 return (!scrubber.authoritative.empty() && repair);
5389 }
5390
5391 bool PG::ops_blocked_by_scrub() const {
5392 return (waiting_for_scrub.size() != 0);
5393 }
5394
5395 // the part that actually finalizes a scrub
5396 void PG::scrub_finish()
5397 {
5398 bool repair = state_test(PG_STATE_REPAIR);
5399 // if the repair request came from auto-repair and a large number of errors
5400 // were found, cancel the auto-repair
5401 if (repair && scrubber.auto_repair
5402 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5403 state_clear(PG_STATE_REPAIR);
5404 repair = false;
5405 }
5406 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5407 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5408
5409 // type-specific finish (can tally more errors)
5410 _scrub_finish();
5411
5412 bool has_error = scrub_process_inconsistent();
5413
5414 {
5415 stringstream oss;
5416 oss << info.pgid.pgid << " " << mode << " ";
5417 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5418 if (total_errors)
5419 oss << total_errors << " errors";
5420 else
5421 oss << "ok";
5422 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5423 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5424 << " remaining deep scrub error details lost)";
5425 if (repair)
5426 oss << ", " << scrubber.fixed << " fixed";
5427 if (total_errors)
5428 osd->clog->error(oss);
5429 else
5430 osd->clog->debug(oss);
5431 }
5432
5433 // finish up
5434 unreg_next_scrub();
5435 utime_t now = ceph_clock_now();
5436 info.history.last_scrub = info.last_update;
5437 info.history.last_scrub_stamp = now;
5438 if (scrubber.deep) {
5439 info.history.last_deep_scrub = info.last_update;
5440 info.history.last_deep_scrub_stamp = now;
5441 }
5442 // Since we don't know which errors were fixed, we can only clear them
5443 // when every one has been fixed.
5444 if (repair) {
5445 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5446 assert(deep_scrub);
5447 scrubber.shallow_errors = scrubber.deep_errors = 0;
5448 } else {
5449 // Deep scrub in order to get corrected error counts
5450 scrub_after_recovery = true;
5451 }
5452 }
5453 if (deep_scrub) {
5454 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5455 info.history.last_clean_scrub_stamp = now;
5456 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5457 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5458 info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
5459 } else {
5460 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5461 // XXX: setting last_clean_scrub_stamp here does not mean the pg is consistent;
5462 // it may still have deep-scrub errors
5463 if (scrubber.shallow_errors == 0)
5464 info.history.last_clean_scrub_stamp = now;
5465 }
5466 info.stats.stats.sum.num_scrub_errors =
5467 info.stats.stats.sum.num_shallow_scrub_errors +
5468 info.stats.stats.sum.num_deep_scrub_errors;
5469 reg_next_scrub();
5470
5471 {
5472 ObjectStore::Transaction t;
5473 dirty_info = true;
5474 write_if_dirty(t);
5475 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5476 assert(tr == 0);
5477 }
5478
5479
5480 if (has_error) {
5481 queue_peering_event(
5482 CephPeeringEvtRef(
5483 std::make_shared<CephPeeringEvt>(
5484 get_osdmap()->get_epoch(),
5485 get_osdmap()->get_epoch(),
5486 DoRecovery())));
5487 }
5488
5489 scrub_clear_state();
5490 scrub_unreserve_replicas();
5491
5492 if (is_active() && is_primary()) {
5493 share_pg_info();
5494 }
5495 }
5496
5497 void PG::share_pg_info()
5498 {
5499 dout(10) << "share_pg_info" << dendl;
5500
5501 // share new pg_info_t with replicas
5502 assert(!actingbackfill.empty());
5503 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5504 i != actingbackfill.end();
5505 ++i) {
5506 if (*i == pg_whoami) continue;
5507 pg_shard_t peer = *i;
5508 if (peer_info.count(peer)) {
5509 peer_info[peer].last_epoch_started = info.last_epoch_started;
5510 peer_info[peer].last_interval_started = info.last_interval_started;
5511 peer_info[peer].history.merge(info.history);
5512 }
5513 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5514 m->pg_list.push_back(
5515 make_pair(
5516 pg_notify_t(
5517 peer.shard, pg_whoami.shard,
5518 get_osdmap()->get_epoch(),
5519 get_osdmap()->get_epoch(),
5520 info),
5521 PastIntervals()));
5522 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5523 }
5524 }
5525
5526 bool PG::append_log_entries_update_missing(
5527 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5528 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5529 boost::optional<eversion_t> roll_forward_to)
5530 {
5531 assert(!entries.empty());
5532 assert(entries.begin()->version > info.last_update);
5533
5534 PGLogEntryHandler rollbacker{this, &t};
5535 bool invalidate_stats =
5536 pg_log.append_new_log_entries(info.last_backfill,
5537 info.last_backfill_bitwise,
5538 entries,
5539 &rollbacker);
5540
5541 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5542 pg_log.roll_forward(&rollbacker);
5543 }
5544 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5545 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5546 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5547 }
5548
5549 info.last_update = pg_log.get_head();
5550
5551 if (pg_log.get_missing().num_missing() == 0) {
5552 // advance last_complete since nothing else is missing!
5553 info.last_complete = info.last_update;
5554 }
5555 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5556
5557 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5558 if (trim_to)
5559 pg_log.trim(*trim_to, info);
5560 dirty_info = true;
5561 write_if_dirty(t);
5562 return invalidate_stats;
5563 }
5564
5565
5566 void PG::merge_new_log_entries(
5567 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5568 ObjectStore::Transaction &t,
5569 boost::optional<eversion_t> trim_to,
5570 boost::optional<eversion_t> roll_forward_to)
5571 {
5572 dout(10) << __func__ << " " << entries << dendl;
5573 assert(is_primary());
5574
5575 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5576 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5577 i != actingbackfill.end();
5578 ++i) {
5579 pg_shard_t peer(*i);
5580 if (peer == pg_whoami) continue;
5581 assert(peer_missing.count(peer));
5582 assert(peer_info.count(peer));
5583 pg_missing_t& pmissing(peer_missing[peer]);
5584 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5585 pg_info_t& pinfo(peer_info[peer]);
5586 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5587 pinfo.last_backfill,
5588 info.last_backfill_bitwise,
5589 entries,
5590 true,
5591 NULL,
5592 pmissing,
5593 NULL,
5594 this);
5595 pinfo.last_update = info.last_update;
5596 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5597 rebuild_missing = rebuild_missing || invalidate_stats;
5598 }
5599
5600 if (!rebuild_missing) {
5601 return;
5602 }
5603
5604 for (auto &&i: entries) {
5605 missing_loc.rebuild(
5606 i.soid,
5607 pg_whoami,
5608 actingbackfill,
5609 info,
5610 pg_log.get_missing(),
5611 peer_missing,
5612 peer_info);
5613 }
5614 }
5615
5616 void PG::update_history(const pg_history_t& new_history)
5617 {
5618 unreg_next_scrub();
5619 if (info.history.merge(new_history)) {
5620 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5621 dirty_info = true;
5622 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5623 dout(20) << __func__ << " clearing past_intervals" << dendl;
5624 past_intervals.clear();
5625 dirty_big_info = true;
5626 }
5627 }
5628 reg_next_scrub();
5629 }
5630
5631 void PG::fulfill_info(
5632 pg_shard_t from, const pg_query_t &query,
5633 pair<pg_shard_t, pg_info_t> &notify_info)
5634 {
5635 assert(from == primary);
5636 assert(query.type == pg_query_t::INFO);
5637
5638 // info
5639 dout(10) << "sending info" << dendl;
5640 notify_info = make_pair(from, info);
5641 }
5642
5643 void PG::fulfill_log(
5644 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5645 {
5646 dout(10) << "log request from " << from << dendl;
5647 assert(from == primary);
5648 assert(query.type != pg_query_t::INFO);
5649 ConnectionRef con = osd->get_con_osd_cluster(
5650 from.osd, get_osdmap()->get_epoch());
5651 if (!con) return;
5652
5653 MOSDPGLog *mlog = new MOSDPGLog(
5654 from.shard, pg_whoami.shard,
5655 get_osdmap()->get_epoch(),
5656 info, query_epoch);
5657 mlog->missing = pg_log.get_missing();
5658
5659 // primary -> other, when building master log
5660 if (query.type == pg_query_t::LOG) {
5661 dout(10) << " sending info+missing+log since " << query.since
5662 << dendl;
5663 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5664 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5665 << " when my log.tail is " << pg_log.get_tail()
5666 << ", sending full log instead";
5667 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5668 } else
5669 mlog->log.copy_after(pg_log.get_log(), query.since);
5670 }
5671 else if (query.type == pg_query_t::FULLLOG) {
5672 dout(10) << " sending info+missing+full log" << dendl;
5673 mlog->log = pg_log.get_log();
5674 }
5675
5676 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5677
5678 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5679 osd->send_message_osd_cluster(mlog, con.get());
5680 }
5681
5682 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
5683 {
5684 if (query.query.type == pg_query_t::INFO) {
5685 pair<pg_shard_t, pg_info_t> notify_info;
5686 update_history(query.query.history);
5687 fulfill_info(query.from, query.query, notify_info);
5688 rctx->send_notify(
5689 notify_info.first,
5690 pg_notify_t(
5691 notify_info.first.shard, pg_whoami.shard,
5692 query.query_epoch,
5693 get_osdmap()->get_epoch(),
5694 notify_info.second),
5695 past_intervals);
5696 } else {
5697 update_history(query.query.history);
5698 fulfill_log(query.from, query.query, query.query_epoch);
5699 }
5700 }
5701
5702 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5703 {
5704 bool changed = false;
5705 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5706 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5707 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5708 changed = true;
5709 }
5710 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5711 assert(pi);
5712 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5713 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5714 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5715 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5716 changed = true;
5717 }
5718 }
5719 if (changed) {
5720 info.history.last_epoch_marked_full = osdmap->get_epoch();
5721 dirty_info = true;
5722 }
5723 }
5724
5725 bool PG::should_restart_peering(
5726 int newupprimary,
5727 int newactingprimary,
5728 const vector<int>& newup,
5729 const vector<int>& newacting,
5730 OSDMapRef lastmap,
5731 OSDMapRef osdmap)
5732 {
5733 if (PastIntervals::is_new_interval(
5734 primary.osd,
5735 newactingprimary,
5736 acting,
5737 newacting,
5738 up_primary.osd,
5739 newupprimary,
5740 up,
5741 newup,
5742 osdmap,
5743 lastmap,
5744 info.pgid.pgid)) {
5745 dout(20) << "new interval newup " << newup
5746 << " newacting " << newacting << dendl;
5747 return true;
5748 }
5749 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
5750 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
5751 return true;
5752 }
5753 return false;
5754 }
5755
5756 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5757 {
5758 if (last_peering_reset > reply_epoch ||
5759 last_peering_reset > query_epoch) {
5760 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5761 << " last_peering_reset " << last_peering_reset
5762 << dendl;
5763 return true;
5764 }
5765 return false;
5766 }
5767
5768 void PG::set_last_peering_reset()
5769 {
5770 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5771 if (last_peering_reset != get_osdmap()->get_epoch()) {
5772 last_peering_reset = get_osdmap()->get_epoch();
5773 reset_interval_flush();
5774 }
5775 }
5776
5777 struct FlushState {
5778 PGRef pg;
5779 epoch_t epoch;
5780 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5781 ~FlushState() {
5782 pg->lock();
5783 if (!pg->pg_has_reset_since(epoch))
5784 pg->queue_flushed(epoch);
5785 pg->unlock();
5786 }
5787 };
5788 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5789
5790 void PG::start_flush(ObjectStore::Transaction *t,
5791 list<Context *> *on_applied,
5792 list<Context *> *on_safe)
5793 {
5794 // flush in progress ops
5795 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5796 this, get_osdmap()->get_epoch()));
5797 t->nop();
5798 flushes_in_progress++;
5799 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5800 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5801 }
5802
5803 void PG::reset_interval_flush()
5804 {
5805 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5806 recovery_state.clear_blocked_outgoing();
5807
5808 Context *c = new QueuePeeringEvt<IntervalFlush>(
5809 this, get_osdmap()->get_epoch(), IntervalFlush());
5810 if (!osr->flush_commit(c)) {
5811 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5812 recovery_state.begin_block_outgoing();
5813 } else {
5814 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5815 delete c;
5816 }
5817 }
5818
5819 /* Called before initializing peering during advance_map */
5820 void PG::start_peering_interval(
5821 const OSDMapRef lastmap,
5822 const vector<int>& newup, int new_up_primary,
5823 const vector<int>& newacting, int new_acting_primary,
5824 ObjectStore::Transaction *t)
5825 {
5826 const OSDMapRef osdmap = get_osdmap();
5827
5828 set_last_peering_reset();
5829
5830 vector<int> oldacting, oldup;
5831 int oldrole = get_role();
5832
5833 unreg_next_scrub();
5834
5835 pg_shard_t old_acting_primary = get_primary();
5836 pg_shard_t old_up_primary = up_primary;
5837 bool was_old_primary = is_primary();
5838 bool was_old_replica = is_replica();
5839
5840 acting.swap(oldacting);
5841 up.swap(oldup);
5842 init_primary_up_acting(
5843 newup,
5844 newacting,
5845 new_up_primary,
5846 new_acting_primary);
5847
5848 if (info.stats.up != up ||
5849 info.stats.acting != acting ||
5850 info.stats.up_primary != new_up_primary ||
5851 info.stats.acting_primary != new_acting_primary) {
5852 info.stats.up = up;
5853 info.stats.up_primary = new_up_primary;
5854 info.stats.acting = acting;
5855 info.stats.acting_primary = new_acting_primary;
5856 info.stats.mapping_epoch = osdmap->get_epoch();
5857 }
5858
5859 pg_stats_publish_lock.Lock();
5860 pg_stats_publish_valid = false;
5861 pg_stats_publish_lock.Unlock();
5862
5863 // This will now be remapped during a backfill in cases
5864 // where it would not have been before.
5865 if (up != acting)
5866 state_set(PG_STATE_REMAPPED);
5867 else
5868 state_clear(PG_STATE_REMAPPED);
5869
5870 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5871 if (pool.info.is_replicated() || role == pg_whoami.shard)
5872 set_role(role);
5873 else
5874 set_role(-1);
5875
5876 // did acting, up, primary|acker change?
5877 if (!lastmap) {
5878 dout(10) << " no lastmap" << dendl;
5879 dirty_info = true;
5880 dirty_big_info = true;
5881 info.history.same_interval_since = osdmap->get_epoch();
5882 } else {
5883 std::stringstream debug;
5884 assert(info.history.same_interval_since != 0);
5885 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5886 get_is_recoverable_predicate());
5887 bool new_interval = PastIntervals::check_new_interval(
5888 old_acting_primary.osd,
5889 new_acting_primary,
5890 oldacting, newacting,
5891 old_up_primary.osd,
5892 new_up_primary,
5893 oldup, newup,
5894 info.history.same_interval_since,
5895 info.history.last_epoch_clean,
5896 osdmap,
5897 lastmap,
5898 info.pgid.pgid,
5899 recoverable.get(),
5900 &past_intervals,
5901 &debug);
5902 dout(10) << __func__ << ": check_new_interval output: "
5903 << debug.str() << dendl;
5904 if (new_interval) {
5905 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5906 info.history.last_epoch_clean < osdmap->get_epoch()) {
5907 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5908 // our information is incomplete and useless; someone else must have been
5909 // clean after everything we know about, if osdmaps were trimmed.
5910 past_intervals.clear();
5911 } else {
5912 dout(10) << " noting past " << past_intervals << dendl;
5913 }
5914 dirty_info = true;
5915 dirty_big_info = true;
5916 info.history.same_interval_since = osdmap->get_epoch();
5917 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5918 osdmap->get_pg_num(info.pgid.pgid.pool()),
5919 nullptr)) {
5920 info.history.last_epoch_split = osdmap->get_epoch();
5921 }
5922 }
5923 }
5924
5925 if (old_up_primary != up_primary ||
5926 oldup != up) {
5927 info.history.same_up_since = osdmap->get_epoch();
5928 }
5929 // this comparison includes primary rank via pg_shard_t
5930 if (old_acting_primary != get_primary()) {
5931 info.history.same_primary_since = osdmap->get_epoch();
5932 }
5933
5934 on_new_interval();
5935
5936 dout(1) << __func__ << " up " << oldup << " -> " << up
5937 << ", acting " << oldacting << " -> " << acting
5938 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5939 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5940 << ", role " << oldrole << " -> " << role
5941 << ", features acting " << acting_features
5942 << " upacting " << upacting_features
5943 << dendl;
5944
5945 // deactivate.
5946 state_clear(PG_STATE_ACTIVE);
5947 state_clear(PG_STATE_PEERED);
5948 state_clear(PG_STATE_DOWN);
5949 state_clear(PG_STATE_RECOVERY_WAIT);
5950 state_clear(PG_STATE_RECOVERY_TOOFULL);
5951 state_clear(PG_STATE_RECOVERING);
5952
5953 peer_purged.clear();
5954 actingbackfill.clear();
5955 scrub_queued = false;
5956
5957 // reset primary/replica state?
5958 if (was_old_primary || is_primary()) {
5959 osd->remove_want_pg_temp(info.pgid.pgid);
5960 } else if (was_old_replica || is_replica()) {
5961 osd->remove_want_pg_temp(info.pgid.pgid);
5962 }
5963 clear_primary_state();
5964
5965
5966 // pg->on_*
5967 on_change(t);
5968
5969 projected_last_update = eversion_t();
5970
5971 assert(!deleting);
5972
5973 // should we tell the primary we are here?
5974 send_notify = !is_primary();
5975
5976 if (role != oldrole ||
5977 was_old_primary != is_primary()) {
5978 // did primary change?
5979 if (was_old_primary != is_primary()) {
5980 state_clear(PG_STATE_CLEAN);
5981 clear_publish_stats();
5982 }
5983
5984 on_role_change();
5985
5986 // take active waiters
5987 requeue_ops(waiting_for_peered);
5988
5989 } else {
5990 // no role change.
5991 // did primary change?
5992 if (get_primary() != old_acting_primary) {
5993 dout(10) << *this << " " << oldacting << " -> " << acting
5994 << ", acting primary "
5995 << old_acting_primary << " -> " << get_primary()
5996 << dendl;
5997 } else {
5998 // primary is the same.
5999 if (is_primary()) {
6000 // i am (still) primary. but my replica set changed.
6001 state_clear(PG_STATE_CLEAN);
6002
6003 dout(10) << oldacting << " -> " << acting
6004 << ", replicas changed" << dendl;
6005 }
6006 }
6007 }
6008 cancel_recovery();
6009
6010 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
6011 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
6012 osd->queue_want_pg_temp(info.pgid.pgid, acting);
6013 }
6014 }
6015
6016 void PG::on_new_interval()
6017 {
6018 const OSDMapRef osdmap = get_osdmap();
6019
6020 reg_next_scrub();
6021
6022 // initialize features
6023 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6024 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6025 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
6026 if (*p == CRUSH_ITEM_NONE)
6027 continue;
6028 uint64_t f = osdmap->get_xinfo(*p).features;
6029 acting_features &= f;
6030 upacting_features &= f;
6031 }
6032 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6033 if (*p == CRUSH_ITEM_NONE)
6034 continue;
6035 upacting_features &= osdmap->get_xinfo(*p).features;
6036 }
6037
6038 _on_new_interval();
6039 }
6040
6041 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6042 {
6043 assert(!is_primary());
6044
6045 update_history(oinfo.history);
6046 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6047 info.stats.stats.sum.num_scrub_errors = 0;
6048 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6049 info.stats.stats.sum.num_deep_scrub_errors = 0;
6050 dirty_info = true;
6051 }
6052
6053 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6054 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6055 << dendl;
6056 info.purged_snaps = oinfo.purged_snaps;
6057 dirty_info = true;
6058 dirty_big_info = true;
6059 }
6060 }
6061
6062 ostream& operator<<(ostream& out, const PG& pg)
6063 {
6064 out << "pg[" << pg.info
6065 << " " << pg.up;
6066 if (pg.acting != pg.up)
6067 out << "/" << pg.acting;
6068 if (pg.is_ec_pg())
6069 out << "p" << pg.get_primary();
6070 out << " r=" << pg.get_role();
6071 out << " lpr=" << pg.get_last_peering_reset();
6072
6073 if (!pg.past_intervals.empty()) {
6074 out << " pi=[" << pg.past_intervals.get_bounds()
6075 << ")/" << pg.past_intervals.size();
6076 }
6077
6078 if (pg.is_peered()) {
6079 if (pg.last_update_ondisk != pg.info.last_update)
6080 out << " luod=" << pg.last_update_ondisk;
6081 if (pg.last_update_applied != pg.info.last_update)
6082 out << " lua=" << pg.last_update_applied;
6083 }
6084
6085 if (pg.recovery_ops_active)
6086 out << " rops=" << pg.recovery_ops_active;
6087
6088 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6089 pg.pg_log.get_head() != pg.info.last_update)
6090 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6091
6092 if (!pg.pg_log.get_log().empty()) {
6093 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6094 out << " (log bound mismatch, actual=["
6095 << pg.pg_log.get_log().log.begin()->version << ","
6096 << pg.pg_log.get_log().log.rbegin()->version << "]";
6097 out << ")";
6098 }
6099 }
6100
6101 if (!pg.backfill_targets.empty())
6102 out << " bft=" << pg.backfill_targets;
6103 out << " crt=" << pg.pg_log.get_can_rollback_to();
6104
6105 if (pg.last_complete_ondisk != pg.info.last_complete)
6106 out << " lcod " << pg.last_complete_ondisk;
6107
6108 if (pg.is_primary()) {
6109 out << " mlcod " << pg.min_last_complete_ondisk;
6110 }
6111
6112 out << " " << pg_state_string(pg.get_state());
6113 if (pg.should_send_notify())
6114 out << " NOTIFY";
6115
6116 if (pg.scrubber.must_repair)
6117 out << " MUST_REPAIR";
6118 if (pg.scrubber.auto_repair)
6119 out << " AUTO_REPAIR";
6120 if (pg.scrubber.must_deep_scrub)
6121 out << " MUST_DEEP_SCRUB";
6122 if (pg.scrubber.must_scrub)
6123 out << " MUST_SCRUB";
6124
6125 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6126 if (pg.pg_log.get_missing().num_missing()) {
6127 out << " m=" << pg.pg_log.get_missing().num_missing();
6128 if (pg.is_primary()) {
6129 uint64_t unfound = pg.get_num_unfound();
6130 if (unfound)
6131 out << " u=" << unfound;
6132 }
6133 }
6134 if (pg.snap_trimq.size())
6135 out << " snaptrimq=" << pg.snap_trimq;
6136 if (!pg.is_clean()) {
6137 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6138 }
6139
6140 out << "]";
6141
6142
6143 return out;
6144 }
6145
6146 bool PG::can_discard_op(OpRequestRef& op)
6147 {
6148 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6149 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6150 dout(20) << " discard " << *m << dendl;
6151 return true;
6152 }
6153
6154 if (m->get_map_epoch() < info.history.same_primary_since) {
6155 dout(7) << " changed after " << m->get_map_epoch()
6156 << ", dropping " << *m << dendl;
6157 return true;
6158 }
6159
6160 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6161 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6162 dout(7) << __func__ << " sent before last_force_op_resend "
6163 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
6164 return true;
6165 }
6166 if (m->get_map_epoch() < info.history.last_epoch_split) {
6167 dout(7) << __func__ << " pg split in "
6168 << info.history.last_epoch_split << ", dropping" << dendl;
6169 return true;
6170 }
6171 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6172 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6173 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6174 << pool.info.last_force_op_resend_preluminous
6175 << ", dropping" << *m << dendl;
6176 return true;
6177 }
6178 }
6179
6180 return false;
6181 }
6182
6183 template<typename T, int MSGTYPE>
6184 bool PG::can_discard_replica_op(OpRequestRef& op)
6185 {
6186 const T *m = static_cast<const T *>(op->get_req());
6187 assert(m->get_type() == MSGTYPE);
6188
6189 int from = m->get_source().num();
6190
6191 // if a repop is replied after a replica goes down in a new osdmap, and
6192 // before the pg advances to this new osdmap, the repop replies before this
6193 // repop can be discarded by that replica OSD, because the primary resets the
6194 // connection to it when handling the new osdmap marking it down, and also
6195 // resets the messenger session when the replica reconnects. to avoid the
6196 // out-of-order replies, the messages from that replica should be discarded.
6197 if (osd->get_osdmap()->is_down(from))
6198 return true;
6199 /* Mostly, this overlaps with the old_peering_msg
6200 * condition. An important exception is pushes
6201 * sent by replicas not in the acting set, since
6202 * if such a replica goes down it does not cause
6203 * a new interval. */
6204 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
6205 return true;
6206
6207 // same pg?
6208 // if pg changes _at all_, we reset and repeer!
6209 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6210 dout(10) << "can_discard_replica_op pg changed " << info.history
6211 << " after " << m->map_epoch
6212 << ", dropping" << dendl;
6213 return true;
6214 }
6215 return false;
6216 }
6217
6218 bool PG::can_discard_scan(OpRequestRef op)
6219 {
6220 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6221 assert(m->get_type() == MSG_OSD_PG_SCAN);
6222
6223 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6224 dout(10) << " got old scan, ignoring" << dendl;
6225 return true;
6226 }
6227 return false;
6228 }
6229
6230 bool PG::can_discard_backfill(OpRequestRef op)
6231 {
6232 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6233 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6234
6235 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6236 dout(10) << " got old backfill, ignoring" << dendl;
6237 return true;
6238 }
6239
6240 return false;
6241
6242 }
6243
6244 bool PG::can_discard_request(OpRequestRef& op)
6245 {
6246 switch (op->get_req()->get_type()) {
6247 case CEPH_MSG_OSD_OP:
6248 return can_discard_op(op);
6249 case CEPH_MSG_OSD_BACKOFF:
6250 return false; // never discard
6251 case MSG_OSD_SUBOP:
6252 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
6253 case MSG_OSD_REPOP:
6254 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6255 case MSG_OSD_PG_PUSH:
6256 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6257 case MSG_OSD_PG_PULL:
6258 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6259 case MSG_OSD_PG_PUSH_REPLY:
6260 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6261 case MSG_OSD_SUBOPREPLY:
6262 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
6263 case MSG_OSD_REPOPREPLY:
6264 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6265 case MSG_OSD_PG_RECOVERY_DELETE:
6266 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6267
6268 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6269 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6270
6271 case MSG_OSD_EC_WRITE:
6272 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6273 case MSG_OSD_EC_WRITE_REPLY:
6274 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6275 case MSG_OSD_EC_READ:
6276 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6277 case MSG_OSD_EC_READ_REPLY:
6278 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6279 case MSG_OSD_REP_SCRUB:
6280 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6281 case MSG_OSD_SCRUB_RESERVE:
6282 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6283 case MSG_OSD_REP_SCRUBMAP:
6284 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6285 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6286 return can_discard_replica_op<
6287 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6288 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6289 return can_discard_replica_op<
6290 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6291
6292 case MSG_OSD_PG_SCAN:
6293 return can_discard_scan(op);
6294 case MSG_OSD_PG_BACKFILL:
6295 return can_discard_backfill(op);
6296 case MSG_OSD_PG_BACKFILL_REMOVE:
6297 return can_discard_replica_op<MOSDPGBackfillRemove,
6298 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6299 }
6300 return true;
6301 }
6302
6303 void PG::take_waiters()
6304 {
6305 dout(10) << "take_waiters" << dendl;
6306 requeue_map_waiters();
6307 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
6308 i != peering_waiters.end();
6309 ++i) osd->queue_for_peering(this);
6310 peering_queue.splice(peering_queue.begin(), peering_waiters,
6311 peering_waiters.begin(), peering_waiters.end());
6312 }
6313
6314 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
6315 {
6316 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
6317 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
6318 dout(10) << "deferring event " << evt->get_desc() << dendl;
6319 peering_waiters.push_back(evt);
6320 return;
6321 }
6322 if (old_peering_evt(evt))
6323 return;
6324 recovery_state.handle_event(evt, rctx);
6325 }
6326
6327 void PG::queue_peering_event(CephPeeringEvtRef evt)
6328 {
6329 if (old_peering_evt(evt))
6330 return;
6331 peering_queue.push_back(evt);
6332 osd->queue_for_peering(this);
6333 }
6334
6335 void PG::queue_null(epoch_t msg_epoch,
6336 epoch_t query_epoch)
6337 {
6338 dout(10) << "null" << dendl;
6339 queue_peering_event(
6340 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6341 NullEvt())));
6342 }
6343
6344 void PG::queue_flushed(epoch_t e)
6345 {
6346 dout(10) << "flushed" << dendl;
6347 queue_peering_event(
6348 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
6349 FlushedEvt())));
6350 }
6351
6352 void PG::queue_query(epoch_t msg_epoch,
6353 epoch_t query_epoch,
6354 pg_shard_t from, const pg_query_t& q)
6355 {
6356 dout(10) << "handle_query " << q << " from replica " << from << dendl;
6357 queue_peering_event(
6358 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6359 MQuery(from, q, query_epoch))));
6360 }
6361
6362 void PG::handle_advance_map(
6363 OSDMapRef osdmap, OSDMapRef lastmap,
6364 vector<int>& newup, int up_primary,
6365 vector<int>& newacting, int acting_primary,
6366 RecoveryCtx *rctx)
6367 {
6368 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6369 assert(lastmap == osdmap_ref);
6370 dout(10) << "handle_advance_map "
6371 << newup << "/" << newacting
6372 << " -- " << up_primary << "/" << acting_primary
6373 << dendl;
6374 update_osdmap_ref(osdmap);
6375 pool.update(osdmap);
6376 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
6377 if (cct->_conf->osd_debug_verify_cached_snaps) {
6378 interval_set<snapid_t> actual_removed_snaps;
6379 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6380 assert(pi);
6381 pi->build_removed_snaps(actual_removed_snaps);
6382 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
6383 derr << __func__ << ": mismatch between the actual removed snaps "
6384 << actual_removed_snaps << " and pool.cached_removed_snaps "
6385 << " pool.cached_removed_snaps " << pool.cached_removed_snaps
6386 << dendl;
6387 }
6388 assert(actual_removed_snaps == pool.cached_removed_snaps);
6389 }
6390 AdvMap evt(
6391 osdmap, lastmap, newup, up_primary,
6392 newacting, acting_primary);
6393 recovery_state.handle_event(evt, rctx);
6394 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6395 on_pool_change();
6396 update_store_with_options();
6397 }
6398 }
6399
6400 void PG::handle_activate_map(RecoveryCtx *rctx)
6401 {
6402 dout(10) << "handle_activate_map " << dendl;
6403 ActMap evt;
6404 recovery_state.handle_event(evt, rctx);
6405 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
6406 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6407 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6408 << last_persisted_osdmap_ref->get_epoch()
6409 << " while current is " << osdmap_ref->get_epoch() << dendl;
6410 dirty_info = true;
6411 } else {
6412 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6413 << last_persisted_osdmap_ref->get_epoch()
6414 << " while current is " << osdmap_ref->get_epoch() << dendl;
6415 }
6416 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
6417 }
6418
6419 void PG::handle_loaded(RecoveryCtx *rctx)
6420 {
6421 dout(10) << "handle_loaded" << dendl;
6422 Load evt;
6423 recovery_state.handle_event(evt, rctx);
6424 }
6425
6426 void PG::handle_create(RecoveryCtx *rctx)
6427 {
6428 dout(10) << "handle_create" << dendl;
6429 rctx->created_pgs.insert(this);
6430 Initialize evt;
6431 recovery_state.handle_event(evt, rctx);
6432 ActMap evt2;
6433 recovery_state.handle_event(evt2, rctx);
6434
6435 rctx->on_applied->add(make_lambda_context([this]() {
6436 update_store_with_options();
6437 }));
6438 }
6439
6440 void PG::handle_query_state(Formatter *f)
6441 {
6442 dout(10) << "handle_query_state" << dendl;
6443 QueryState q(f);
6444 recovery_state.handle_event(q, 0);
6445 }
6446
6447 void PG::update_store_with_options()
6448 {
6449 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
6450 if(r < 0 && r != -EOPNOTSUPP) {
6451 derr << __func__ << " set_collection_opts returns error:" << r << dendl;
6452 }
6453 }
6454
6455 void PG::update_store_on_load()
6456 {
6457 if (osd->store->get_type() == "filestore") {
6458 // legacy filestore didn't store collection bit width; fix.
6459 int bits = osd->store->collection_bits(coll);
6460 if (bits < 0) {
6461 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6462 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
6463 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
6464 ObjectStore::Transaction t;
6465 t.collection_set_bits(coll, bits);
6466 osd->store->apply_transaction(osr.get(), std::move(t));
6467 }
6468 }
6469 }
6470
6471 /*------------ Recovery State Machine----------------*/
6472 #undef dout_prefix
6473 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6474 << "state<" << get_state_name() << ">: ")
6475
6476 /*------Crashed-------*/
6477 PG::RecoveryState::Crashed::Crashed(my_context ctx)
6478 : my_base(ctx),
6479 NamedState(context< RecoveryMachine >().pg, "Crashed")
6480 {
6481 context< RecoveryMachine >().log_enter(state_name);
6482 assert(0 == "we got a bad state machine event");
6483 }
6484
6485
6486 /*------Initial-------*/
6487 PG::RecoveryState::Initial::Initial(my_context ctx)
6488 : my_base(ctx),
6489 NamedState(context< RecoveryMachine >().pg, "Initial")
6490 {
6491 context< RecoveryMachine >().log_enter(state_name);
6492 }
6493
6494 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6495 {
6496 PG *pg = context< RecoveryMachine >().pg;
6497
6498 // do we tell someone we're here?
6499 pg->send_notify = (!pg->is_primary());
6500 pg->update_store_with_options();
6501
6502 pg->update_store_on_load();
6503
6504 return transit< Reset >();
6505 }
6506
6507 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6508 {
6509 PG *pg = context< RecoveryMachine >().pg;
6510 pg->proc_replica_info(
6511 notify.from, notify.notify.info, notify.notify.epoch_sent);
6512 pg->set_last_peering_reset();
6513 return transit< Primary >();
6514 }
6515
6516 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6517 {
6518 PG *pg = context< RecoveryMachine >().pg;
6519 assert(!pg->is_primary());
6520 post_event(i);
6521 return transit< Stray >();
6522 }
6523
6524 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6525 {
6526 PG *pg = context< RecoveryMachine >().pg;
6527 assert(!pg->is_primary());
6528 post_event(i);
6529 return transit< Stray >();
6530 }
6531
6532 void PG::RecoveryState::Initial::exit()
6533 {
6534 context< RecoveryMachine >().log_exit(state_name, enter_time);
6535 PG *pg = context< RecoveryMachine >().pg;
6536 utime_t dur = ceph_clock_now() - enter_time;
6537 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6538 }
6539
6540 /*------Started-------*/
6541 PG::RecoveryState::Started::Started(my_context ctx)
6542 : my_base(ctx),
6543 NamedState(context< RecoveryMachine >().pg, "Started")
6544 {
6545 context< RecoveryMachine >().log_enter(state_name);
6546 }
6547
6548 boost::statechart::result
6549 PG::RecoveryState::Started::react(const IntervalFlush&)
6550 {
6551 PG *pg = context< RecoveryMachine >().pg;
6552 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6553 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6554 return discard_event();
6555 }
6556
6557
6558 boost::statechart::result
6559 PG::RecoveryState::Started::react(const FlushedEvt&)
6560 {
6561 PG *pg = context< RecoveryMachine >().pg;
6562 pg->on_flushed();
6563 return discard_event();
6564 }
6565
6566
6567 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6568 {
6569 PG *pg = context< RecoveryMachine >().pg;
6570 ldout(pg->cct, 10) << "Started advmap" << dendl;
6571 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6572 if (pg->should_restart_peering(
6573 advmap.up_primary,
6574 advmap.acting_primary,
6575 advmap.newup,
6576 advmap.newacting,
6577 advmap.lastmap,
6578 advmap.osdmap)) {
6579 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6580 << dendl;
6581 post_event(advmap);
6582 return transit< Reset >();
6583 }
6584 pg->remove_down_peer_info(advmap.osdmap);
6585 return discard_event();
6586 }
6587
6588 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6589 {
6590 q.f->open_object_section("state");
6591 q.f->dump_string("name", state_name);
6592 q.f->dump_stream("enter_time") << enter_time;
6593 q.f->close_section();
6594 return discard_event();
6595 }
6596
6597 void PG::RecoveryState::Started::exit()
6598 {
6599 context< RecoveryMachine >().log_exit(state_name, enter_time);
6600 PG *pg = context< RecoveryMachine >().pg;
6601 utime_t dur = ceph_clock_now() - enter_time;
6602 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6603 }
6604
6605 /*--------Reset---------*/
6606 PG::RecoveryState::Reset::Reset(my_context ctx)
6607 : my_base(ctx),
6608 NamedState(context< RecoveryMachine >().pg, "Reset")
6609 {
6610 context< RecoveryMachine >().log_enter(state_name);
6611 PG *pg = context< RecoveryMachine >().pg;
6612
6613 pg->flushes_in_progress = 0;
6614 pg->set_last_peering_reset();
6615 }
6616
6617 boost::statechart::result
6618 PG::RecoveryState::Reset::react(const FlushedEvt&)
6619 {
6620 PG *pg = context< RecoveryMachine >().pg;
6621 pg->on_flushed();
6622 return discard_event();
6623 }
6624
6625 boost::statechart::result
6626 PG::RecoveryState::Reset::react(const IntervalFlush&)
6627 {
6628 PG *pg = context< RecoveryMachine >().pg;
6629 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6630 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6631 return discard_event();
6632 }
6633
6634 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6635 {
6636 PG *pg = context< RecoveryMachine >().pg;
6637 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6638
6639 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6640
6641 if (pg->should_restart_peering(
6642 advmap.up_primary,
6643 advmap.acting_primary,
6644 advmap.newup,
6645 advmap.newacting,
6646 advmap.lastmap,
6647 advmap.osdmap)) {
6648 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6649 << dendl;
6650 pg->start_peering_interval(
6651 advmap.lastmap,
6652 advmap.newup, advmap.up_primary,
6653 advmap.newacting, advmap.acting_primary,
6654 context< RecoveryMachine >().get_cur_transaction());
6655 }
6656 pg->remove_down_peer_info(advmap.osdmap);
6657 pg->check_past_interval_bounds();
6658 return discard_event();
6659 }
6660
6661 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6662 {
6663 PG *pg = context< RecoveryMachine >().pg;
6664 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6665 context< RecoveryMachine >().send_notify(
6666 pg->get_primary(),
6667 pg_notify_t(
6668 pg->get_primary().shard, pg->pg_whoami.shard,
6669 pg->get_osdmap()->get_epoch(),
6670 pg->get_osdmap()->get_epoch(),
6671 pg->info),
6672 pg->past_intervals);
6673 }
6674
6675 pg->update_heartbeat_peers();
6676 pg->take_waiters();
6677
6678 return transit< Started >();
6679 }
6680
6681 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6682 {
6683 q.f->open_object_section("state");
6684 q.f->dump_string("name", state_name);
6685 q.f->dump_stream("enter_time") << enter_time;
6686 q.f->close_section();
6687 return discard_event();
6688 }
6689
6690 void PG::RecoveryState::Reset::exit()
6691 {
6692 context< RecoveryMachine >().log_exit(state_name, enter_time);
6693 PG *pg = context< RecoveryMachine >().pg;
6694 utime_t dur = ceph_clock_now() - enter_time;
6695 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6696 }
6697
6698 /*-------Start---------*/
6699 PG::RecoveryState::Start::Start(my_context ctx)
6700 : my_base(ctx),
6701 NamedState(context< RecoveryMachine >().pg, "Start")
6702 {
6703 context< RecoveryMachine >().log_enter(state_name);
6704
6705 PG *pg = context< RecoveryMachine >().pg;
6706 if (pg->is_primary()) {
6707 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6708 post_event(MakePrimary());
6709 } else { //is_stray
6710 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6711 post_event(MakeStray());
6712 }
6713 }
6714
6715 void PG::RecoveryState::Start::exit()
6716 {
6717 context< RecoveryMachine >().log_exit(state_name, enter_time);
6718 PG *pg = context< RecoveryMachine >().pg;
6719 utime_t dur = ceph_clock_now() - enter_time;
6720 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6721 }
6722
6723 /*---------Primary--------*/
6724 PG::RecoveryState::Primary::Primary(my_context ctx)
6725 : my_base(ctx),
6726 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6727 {
6728 context< RecoveryMachine >().log_enter(state_name);
6729 PG *pg = context< RecoveryMachine >().pg;
6730 assert(pg->want_acting.empty());
6731
6732 // set CREATING bit until we have peered for the first time.
6733 if (pg->info.history.last_epoch_started == 0) {
6734 pg->state_set(PG_STATE_CREATING);
6735 // use the history timestamp, which ultimately comes from the
6736 // monitor in the create case.
6737 utime_t t = pg->info.history.last_scrub_stamp;
6738 pg->info.stats.last_fresh = t;
6739 pg->info.stats.last_active = t;
6740 pg->info.stats.last_change = t;
6741 pg->info.stats.last_peered = t;
6742 pg->info.stats.last_clean = t;
6743 pg->info.stats.last_unstale = t;
6744 pg->info.stats.last_undegraded = t;
6745 pg->info.stats.last_fullsized = t;
6746 pg->info.stats.last_scrub_stamp = t;
6747 pg->info.stats.last_deep_scrub_stamp = t;
6748 pg->info.stats.last_clean_scrub_stamp = t;
6749 }
6750 }
6751
6752 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6753 {
6754 PG *pg = context< RecoveryMachine >().pg;
6755 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6756 pg->proc_replica_info(
6757 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6758 return discard_event();
6759 }
6760
6761 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6762 {
6763 PG *pg = context< RecoveryMachine >().pg;
6764 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6765 pg->publish_stats_to_osd();
6766 pg->take_waiters();
6767 return discard_event();
6768 }
6769
6770 void PG::RecoveryState::Primary::exit()
6771 {
6772 context< RecoveryMachine >().log_exit(state_name, enter_time);
6773 PG *pg = context< RecoveryMachine >().pg;
6774 pg->want_acting.clear();
6775 utime_t dur = ceph_clock_now() - enter_time;
6776 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6777 pg->clear_primary_state();
6778 pg->state_clear(PG_STATE_CREATING);
6779 }
6780
6781 /*---------Peering--------*/
6782 PG::RecoveryState::Peering::Peering(my_context ctx)
6783 : my_base(ctx),
6784 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6785 history_les_bound(false)
6786 {
6787 context< RecoveryMachine >().log_enter(state_name);
6788
6789 PG *pg = context< RecoveryMachine >().pg;
6790 assert(!pg->is_peered());
6791 assert(!pg->is_peering());
6792 assert(pg->is_primary());
6793 pg->state_set(PG_STATE_PEERING);
6794 }
6795
6796 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6797 {
6798 PG *pg = context< RecoveryMachine >().pg;
6799 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6800 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6801 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6802 post_event(advmap);
6803 return transit< Reset >();
6804 }
6805
6806 pg->adjust_need_up_thru(advmap.osdmap);
6807
6808 return forward_event();
6809 }
6810
6811 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6812 {
6813 PG *pg = context< RecoveryMachine >().pg;
6814
6815 q.f->open_object_section("state");
6816 q.f->dump_string("name", state_name);
6817 q.f->dump_stream("enter_time") << enter_time;
6818
6819 q.f->open_array_section("past_intervals");
6820 pg->past_intervals.dump(q.f);
6821 q.f->close_section();
6822
6823 q.f->open_array_section("probing_osds");
6824 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6825 p != prior_set.probe.end();
6826 ++p)
6827 q.f->dump_stream("osd") << *p;
6828 q.f->close_section();
6829
6830 if (prior_set.pg_down)
6831 q.f->dump_string("blocked", "peering is blocked due to down osds");
6832
6833 q.f->open_array_section("down_osds_we_would_probe");
6834 for (set<int>::iterator p = prior_set.down.begin();
6835 p != prior_set.down.end();
6836 ++p)
6837 q.f->dump_int("osd", *p);
6838 q.f->close_section();
6839
6840 q.f->open_array_section("peering_blocked_by");
6841 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6842 p != prior_set.blocked_by.end();
6843 ++p) {
6844 q.f->open_object_section("osd");
6845 q.f->dump_int("osd", p->first);
6846 q.f->dump_int("current_lost_at", p->second);
6847 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6848 q.f->close_section();
6849 }
6850 q.f->close_section();
6851
6852 if (history_les_bound) {
6853 q.f->open_array_section("peering_blocked_by_detail");
6854 q.f->open_object_section("item");
6855 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6856 q.f->close_section();
6857 q.f->close_section();
6858 }
6859
6860 q.f->close_section();
6861 return forward_event();
6862 }
6863
6864 void PG::RecoveryState::Peering::exit()
6865 {
6866 PG *pg = context< RecoveryMachine >().pg;
6867 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6868 context< RecoveryMachine >().log_exit(state_name, enter_time);
6869 pg->state_clear(PG_STATE_PEERING);
6870 pg->clear_probe_targets();
6871
6872 utime_t dur = ceph_clock_now() - enter_time;
6873 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6874 }
6875
6876
6877 /*------Backfilling-------*/
6878 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6879 : my_base(ctx),
6880 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6881 {
6882 context< RecoveryMachine >().log_enter(state_name);
6883 PG *pg = context< RecoveryMachine >().pg;
6884 pg->backfill_reserved = true;
6885 pg->queue_recovery();
6886 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6887 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6888 pg->state_set(PG_STATE_BACKFILLING);
6889 pg->publish_stats_to_osd();
6890 }
6891
6892 boost::statechart::result
6893 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6894 {
6895 PG *pg = context< RecoveryMachine >().pg;
6896 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6897 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6898
6899 pg->state_set(PG_STATE_BACKFILL_WAIT);
6900 pg->state_clear(PG_STATE_BACKFILLING);
6901
6902 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6903 it != pg->backfill_targets.end();
6904 ++it) {
6905 assert(*it != pg->pg_whoami);
6906 ConnectionRef con = pg->osd->get_con_osd_cluster(
6907 it->osd, pg->get_osdmap()->get_epoch());
6908 if (con) {
6909 pg->osd->send_message_osd_cluster(
6910 new MBackfillReserve(
6911 MBackfillReserve::REJECT,
6912 spg_t(pg->info.pgid.pgid, it->shard),
6913 pg->get_osdmap()->get_epoch()),
6914 con.get());
6915 }
6916 }
6917
6918
6919 if (!pg->waiting_on_backfill.empty()) {
6920 pg->waiting_on_backfill.clear();
6921 pg->finish_recovery_op(hobject_t::get_max());
6922 }
6923
6924 pg->schedule_backfill_retry(c.delay);
6925 return transit<NotBackfilling>();
6926 }
6927
6928 boost::statechart::result
6929 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
6930 {
6931 PG *pg = context< RecoveryMachine >().pg;
6932 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
6933 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6934
6935 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
6936 pg->state_clear(PG_STATE_BACKFILLING);
6937
6938 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6939 it != pg->backfill_targets.end();
6940 ++it) {
6941 assert(*it != pg->pg_whoami);
6942 ConnectionRef con = pg->osd->get_con_osd_cluster(
6943 it->osd, pg->get_osdmap()->get_epoch());
6944 if (con) {
6945 pg->osd->send_message_osd_cluster(
6946 new MBackfillReserve(
6947 MBackfillReserve::REJECT,
6948 spg_t(pg->info.pgid.pgid, it->shard),
6949 pg->get_osdmap()->get_epoch()),
6950 con.get());
6951 }
6952 }
6953
6954 pg->waiting_on_backfill.clear();
6955
6956 return transit<NotBackfilling>();
6957 }
6958
6959 boost::statechart::result
6960 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6961 {
6962 PG *pg = context< RecoveryMachine >().pg;
6963 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6964 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6965
6966 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6967 it != pg->backfill_targets.end();
6968 ++it) {
6969 assert(*it != pg->pg_whoami);
6970 ConnectionRef con = pg->osd->get_con_osd_cluster(
6971 it->osd, pg->get_osdmap()->get_epoch());
6972 if (con) {
6973 pg->osd->send_message_osd_cluster(
6974 new MBackfillReserve(
6975 MBackfillReserve::REJECT,
6976 spg_t(pg->info.pgid.pgid, it->shard),
6977 pg->get_osdmap()->get_epoch()),
6978 con.get());
6979 }
6980 }
6981
6982 if (!pg->waiting_on_backfill.empty()) {
6983 pg->waiting_on_backfill.clear();
6984 pg->finish_recovery_op(hobject_t::get_max());
6985 }
6986
6987 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6988 return transit<NotBackfilling>();
6989 }
6990
6991 void PG::RecoveryState::Backfilling::exit()
6992 {
6993 context< RecoveryMachine >().log_exit(state_name, enter_time);
6994 PG *pg = context< RecoveryMachine >().pg;
6995 pg->backfill_reserved = false;
6996 pg->backfill_reserving = false;
6997 pg->state_clear(PG_STATE_BACKFILLING);
6998 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6999 utime_t dur = ceph_clock_now() - enter_time;
7000 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
7001 }
7002
7003 /*--WaitRemoteBackfillReserved--*/
7004
7005 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
7006 : my_base(ctx),
7007 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7008 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
7009 {
7010 context< RecoveryMachine >().log_enter(state_name);
7011 PG *pg = context< RecoveryMachine >().pg;
7012 pg->state_set(PG_STATE_BACKFILL_WAIT);
7013 pg->publish_stats_to_osd();
7014 post_event(RemoteBackfillReserved());
7015 }
7016
7017 boost::statechart::result
7018 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
7019 {
7020 PG *pg = context< RecoveryMachine >().pg;
7021
7022 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
7023 //The primary never backfills itself
7024 assert(*backfill_osd_it != pg->pg_whoami);
7025 ConnectionRef con = pg->osd->get_con_osd_cluster(
7026 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
7027 if (con) {
7028 pg->osd->send_message_osd_cluster(
7029 new MBackfillReserve(
7030 MBackfillReserve::REQUEST,
7031 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7032 pg->get_osdmap()->get_epoch(),
7033 pg->get_backfill_priority()),
7034 con.get());
7035 }
7036 ++backfill_osd_it;
7037 } else {
7038 post_event(AllBackfillsReserved());
7039 }
7040 return discard_event();
7041 }
7042
7043 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7044 {
7045 context< RecoveryMachine >().log_exit(state_name, enter_time);
7046 PG *pg = context< RecoveryMachine >().pg;
7047 utime_t dur = ceph_clock_now() - enter_time;
7048 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7049 }
7050
7051 boost::statechart::result
7052 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
7053 {
7054 PG *pg = context< RecoveryMachine >().pg;
7055 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7056
7057 // Send REJECT to all previously acquired reservations
7058 set<pg_shard_t>::const_iterator it, begin, end, next;
7059 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7060 end = context< Active >().remote_shards_to_reserve_backfill.end();
7061 assert(begin != end);
7062 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
7063 //The primary never backfills itself
7064 assert(*it != pg->pg_whoami);
7065 ConnectionRef con = pg->osd->get_con_osd_cluster(
7066 it->osd, pg->get_osdmap()->get_epoch());
7067 if (con) {
7068 pg->osd->send_message_osd_cluster(
7069 new MBackfillReserve(
7070 MBackfillReserve::REJECT,
7071 spg_t(pg->info.pgid.pgid, it->shard),
7072 pg->get_osdmap()->get_epoch()),
7073 con.get());
7074 }
7075 }
7076
7077 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7078 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7079 pg->publish_stats_to_osd();
7080
7081 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
7082
7083 return transit<NotBackfilling>();
7084 }
7085
7086 /*--WaitLocalBackfillReserved--*/
7087 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7088 : my_base(ctx),
7089 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7090 {
7091 context< RecoveryMachine >().log_enter(state_name);
7092 PG *pg = context< RecoveryMachine >().pg;
7093 pg->state_set(PG_STATE_BACKFILL_WAIT);
7094 pg->osd->local_reserver.request_reservation(
7095 pg->info.pgid,
7096 new QueuePeeringEvt<LocalBackfillReserved>(
7097 pg, pg->get_osdmap()->get_epoch(),
7098 LocalBackfillReserved()),
7099 pg->get_backfill_priority(),
7100 new QueuePeeringEvt<DeferBackfill>(
7101 pg, pg->get_osdmap()->get_epoch(),
7102 DeferBackfill(0.0)));
7103 pg->publish_stats_to_osd();
7104 }
7105
7106 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7107 {
7108 context< RecoveryMachine >().log_exit(state_name, enter_time);
7109 PG *pg = context< RecoveryMachine >().pg;
7110 utime_t dur = ceph_clock_now() - enter_time;
7111 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7112 }
7113
7114 /*----NotBackfilling------*/
7115 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7116 : my_base(ctx),
7117 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7118 {
7119 context< RecoveryMachine >().log_enter(state_name);
7120 PG *pg = context< RecoveryMachine >().pg;
7121 pg->publish_stats_to_osd();
7122 }
7123
7124 boost::statechart::result
7125 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7126 {
7127 return discard_event();
7128 }
7129
7130 boost::statechart::result
7131 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
7132 {
7133 return discard_event();
7134 }
7135
7136 void PG::RecoveryState::NotBackfilling::exit()
7137 {
7138 context< RecoveryMachine >().log_exit(state_name, enter_time);
7139 PG *pg = context< RecoveryMachine >().pg;
7140 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7141 utime_t dur = ceph_clock_now() - enter_time;
7142 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7143 }
7144
7145 /*----NotRecovering------*/
7146 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7147 : my_base(ctx),
7148 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7149 {
7150 context< RecoveryMachine >().log_enter(state_name);
7151 PG *pg = context< RecoveryMachine >().pg;
7152 pg->publish_stats_to_osd();
7153 }
7154
7155 void PG::RecoveryState::NotRecovering::exit()
7156 {
7157 context< RecoveryMachine >().log_exit(state_name, enter_time);
7158 PG *pg = context< RecoveryMachine >().pg;
7159 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7160 utime_t dur = ceph_clock_now() - enter_time;
7161 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7162 }
7163
7164 /*---RepNotRecovering----*/
7165 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7166 : my_base(ctx),
7167 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7168 {
7169 context< RecoveryMachine >().log_enter(state_name);
7170 }
7171
7172 boost::statechart::result
7173 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
7174 {
7175 PG *pg = context< RecoveryMachine >().pg;
7176 pg->reject_reservation();
7177 post_event(RemoteReservationRejected());
7178 return discard_event();
7179 }
7180
7181 void PG::RecoveryState::RepNotRecovering::exit()
7182 {
7183 context< RecoveryMachine >().log_exit(state_name, enter_time);
7184 PG *pg = context< RecoveryMachine >().pg;
7185 utime_t dur = ceph_clock_now() - enter_time;
7186 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7187 }
7188
7189 /*---RepWaitRecoveryReserved--*/
7190 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7191 : my_base(ctx),
7192 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7193 {
7194 context< RecoveryMachine >().log_enter(state_name);
7195 PG *pg = context< RecoveryMachine >().pg;
7196
7197 pg->osd->remote_reserver.request_reservation(
7198 pg->info.pgid,
7199 new QueuePeeringEvt<RemoteRecoveryReserved>(
7200 pg, pg->get_osdmap()->get_epoch(),
7201 RemoteRecoveryReserved()),
7202 pg->get_recovery_priority());
7203 }
7204
7205 boost::statechart::result
7206 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7207 {
7208 PG *pg = context< RecoveryMachine >().pg;
7209 pg->osd->send_message_osd_cluster(
7210 pg->primary.osd,
7211 new MRecoveryReserve(
7212 MRecoveryReserve::GRANT,
7213 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7214 pg->get_osdmap()->get_epoch()),
7215 pg->get_osdmap()->get_epoch());
7216 return transit<RepRecovering>();
7217 }
7218
7219 boost::statechart::result
7220 PG::RecoveryState::RepWaitRecoveryReserved::react(
7221 const RemoteReservationCanceled &evt)
7222 {
7223 PG *pg = context< RecoveryMachine >().pg;
7224 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7225 return transit<RepNotRecovering>();
7226 }
7227
7228 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7229 {
7230 context< RecoveryMachine >().log_exit(state_name, enter_time);
7231 PG *pg = context< RecoveryMachine >().pg;
7232 utime_t dur = ceph_clock_now() - enter_time;
7233 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7234 }
7235
7236 /*-RepWaitBackfillReserved*/
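// RepWaitBackfillReserved: the replica has asked its remote_reserver for a
// backfill slot (see RepNotRecovering::react(RequestBackfillPrio) just
// below).  When the slot is granted, the fullness and failure-injection
// checks are repeated before GRANT is sent back to the primary; a
// rejection or cancellation drops back to RepNotRecovering.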
7237 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7238 : my_base(ctx),
7239 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7240 {
7241 context< RecoveryMachine >().log_enter(state_name);
7242 }
7243
7244 boost::statechart::result
7245 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7246 {
7247 PG *pg = context< RecoveryMachine >().pg;
7248 ostringstream ss;
7249
7250 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7251 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7252 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7253 << dendl;
7254 post_event(RejectRemoteReservation());
7255 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7256 pg->osd->check_backfill_full(ss)) {
7257 ldout(pg->cct, 10) << "backfill reservation rejected: "
7258 << ss.str() << dendl;
7259 post_event(RejectRemoteReservation());
7260 } else {
7261 pg->osd->remote_reserver.request_reservation(
7262 pg->info.pgid,
7263 new QueuePeeringEvt<RemoteBackfillReserved>(
7264 pg, pg->get_osdmap()->get_epoch(),
7265 RemoteBackfillReserved()), evt.priority);
7266 }
7267 return transit<RepWaitBackfillReserved>();
7268 }
7269
7270 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7271 {
7272 context< RecoveryMachine >().log_exit(state_name, enter_time);
7273 PG *pg = context< RecoveryMachine >().pg;
7274 utime_t dur = ceph_clock_now() - enter_time;
7275 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
7276 }
7277
7278 boost::statechart::result
7279 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
7280 {
7281 PG *pg = context< RecoveryMachine >().pg;
7282
7283 ostringstream ss;
7284 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7285 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7286 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7287 << "failure injection" << dendl;
7288 post_event(RejectRemoteReservation());
7289 return discard_event();
7290 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7291 pg->osd->check_backfill_full(ss)) {
7292 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7293 << ss.str() << dendl;
7294 post_event(RejectRemoteReservation());
7295 return discard_event();
7296 } else {
7297 pg->osd->send_message_osd_cluster(
7298 pg->primary.osd,
7299 new MBackfillReserve(
7300 MBackfillReserve::GRANT,
7301 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7302 pg->get_osdmap()->get_epoch()),
7303 pg->get_osdmap()->get_epoch());
7304 return transit<RepRecovering>();
7305 }
7306 }
7307
7308 boost::statechart::result
7309 PG::RecoveryState::RepWaitBackfillReserved::react(
7310 const RejectRemoteReservation &evt)
7311 {
7312 PG *pg = context< RecoveryMachine >().pg;
7313 pg->reject_reservation();
7314 post_event(RemoteReservationRejected());
7315 return discard_event();
7316 }
7317
7318 boost::statechart::result
7319 PG::RecoveryState::RepWaitBackfillReserved::react(
7320 const RemoteReservationRejected &evt)
7321 {
7322 PG *pg = context< RecoveryMachine >().pg;
7323 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7324 return transit<RepNotRecovering>();
7325 }
7326
7327 boost::statechart::result
7328 PG::RecoveryState::RepWaitBackfillReserved::react(
7329 const RemoteReservationCanceled &evt)
7330 {
7331 PG *pg = context< RecoveryMachine >().pg;
7332 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7333 return transit<RepNotRecovering>();
7334 }
7335
7336 /*---RepRecovering-------*/
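// RepRecovering: the replica holds its remote reservation while the
// primary drives recovery/backfill.  BackfillTooFull makes it reject the
// reservation; the reservation itself is released in exit().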
7337 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
7338 : my_base(ctx),
7339 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
7340 {
7341 context< RecoveryMachine >().log_enter(state_name);
7342 }
7343
7344 boost::statechart::result
7345 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
7346 {
7347 PG *pg = context< RecoveryMachine >().pg;
7348 pg->reject_reservation();
7349 return discard_event();
7350 }
7351
7352 void PG::RecoveryState::RepRecovering::exit()
7353 {
7354 context< RecoveryMachine >().log_exit(state_name, enter_time);
7355 PG *pg = context< RecoveryMachine >().pg;
7356 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7357 utime_t dur = ceph_clock_now() - enter_time;
7358 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
7359 }
7360
7361 /*------Activating--------*/
7362 PG::RecoveryState::Activating::Activating(my_context ctx)
7363 : my_base(ctx),
7364 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
7365 {
7366 context< RecoveryMachine >().log_enter(state_name);
7367 }
7368
7369 void PG::RecoveryState::Activating::exit()
7370 {
7371 context< RecoveryMachine >().log_exit(state_name, enter_time);
7372 PG *pg = context< RecoveryMachine >().pg;
7373 utime_t dur = ceph_clock_now() - enter_time;
7374 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
7375 }
7376
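// WaitLocalRecoveryReserved: bail out with RecoveryTooFull if any
// acting/backfill OSD is full, otherwise request a local recovery slot.
// As with the backfill path above, the DeferRecovery callback presumably
// fires if the reservation is later preempted.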
7377 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
7378 : my_base(ctx),
7379 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
7380 {
7381 context< RecoveryMachine >().log_enter(state_name);
7382 PG *pg = context< RecoveryMachine >().pg;
7383
7384   // Make sure all nodes that are part of the recovery aren't full
7385 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
7386 pg->osd->check_osdmap_full(pg->actingbackfill)) {
7387 post_event(RecoveryTooFull());
7388 return;
7389 }
7390
7391 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7392 pg->state_set(PG_STATE_RECOVERY_WAIT);
7393 pg->osd->local_reserver.request_reservation(
7394 pg->info.pgid,
7395 new QueuePeeringEvt<LocalRecoveryReserved>(
7396 pg, pg->get_osdmap()->get_epoch(),
7397 LocalRecoveryReserved()),
7398 pg->get_recovery_priority(),
7399 new QueuePeeringEvt<DeferRecovery>(
7400 pg, pg->get_osdmap()->get_epoch(),
7401 DeferRecovery(0.0)));
7402 pg->publish_stats_to_osd();
7403 }
7404
7405 boost::statechart::result
7406 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
7407 {
7408 PG *pg = context< RecoveryMachine >().pg;
7409 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
7410 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
7411 return transit<NotRecovering>();
7412 }
7413
7414 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7415 {
7416 context< RecoveryMachine >().log_exit(state_name, enter_time);
7417 PG *pg = context< RecoveryMachine >().pg;
7418 utime_t dur = ceph_clock_now() - enter_time;
7419 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
7420 }
7421
7422 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
7423 : my_base(ctx),
7424 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7425 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
7426 {
7427 context< RecoveryMachine >().log_enter(state_name);
7428 post_event(RemoteRecoveryReserved());
7429 }
7430
7431 boost::statechart::result
7432 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
7433 PG *pg = context< RecoveryMachine >().pg;
7434
7435 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
7436 assert(*remote_recovery_reservation_it != pg->pg_whoami);
7437 ConnectionRef con = pg->osd->get_con_osd_cluster(
7438 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
7439 if (con) {
7440 pg->osd->send_message_osd_cluster(
7441 new MRecoveryReserve(
7442 MRecoveryReserve::REQUEST,
7443 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
7444 pg->get_osdmap()->get_epoch()),
7445 con.get());
7446 }
7447 ++remote_recovery_reservation_it;
7448 } else {
7449 post_event(AllRemotesReserved());
7450 }
7451 return discard_event();
7452 }
7453
7454 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7455 {
7456 context< RecoveryMachine >().log_exit(state_name, enter_time);
7457 PG *pg = context< RecoveryMachine >().pg;
7458 utime_t dur = ceph_clock_now() - enter_time;
7459 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
7460 }
7461
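// Recovering: recovery proper has been queued (queue_recovery()).  The
// reactions below release the local and remote reservations when all
// replicas are recovered, when we switch to backfill, or when recovery is
// deferred or blocked on unfound objects.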
7462 PG::RecoveryState::Recovering::Recovering(my_context ctx)
7463 : my_base(ctx),
7464 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
7465 {
7466 context< RecoveryMachine >().log_enter(state_name);
7467
7468 PG *pg = context< RecoveryMachine >().pg;
7469 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7470 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7471 pg->state_set(PG_STATE_RECOVERING);
7472 assert(!pg->state_test(PG_STATE_ACTIVATING));
7473 pg->publish_stats_to_osd();
7474 pg->queue_recovery();
7475 }
7476
7477 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7478 {
7479 PG *pg = context< RecoveryMachine >().pg;
7480 assert(cancel || !pg->pg_log.get_missing().have_missing());
7481
7482 // release remote reservations
7483 for (set<pg_shard_t>::const_iterator i =
7484 context< Active >().remote_shards_to_reserve_recovery.begin();
7485 i != context< Active >().remote_shards_to_reserve_recovery.end();
7486 ++i) {
7487 if (*i == pg->pg_whoami) // skip myself
7488 continue;
7489 ConnectionRef con = pg->osd->get_con_osd_cluster(
7490 i->osd, pg->get_osdmap()->get_epoch());
7491 if (con) {
7492 pg->osd->send_message_osd_cluster(
7493 new MRecoveryReserve(
7494 MRecoveryReserve::RELEASE,
7495 spg_t(pg->info.pgid.pgid, i->shard),
7496 pg->get_osdmap()->get_epoch()),
7497 con.get());
7498 }
7499 }
7500 }
7501
7502 boost::statechart::result
7503 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
7504 {
7505 PG *pg = context< RecoveryMachine >().pg;
7506 pg->state_clear(PG_STATE_RECOVERING);
7507 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7508 release_reservations();
7509 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7510 return transit<Recovered>();
7511 }
7512
7513 boost::statechart::result
7514 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
7515 {
7516 PG *pg = context< RecoveryMachine >().pg;
7517 pg->state_clear(PG_STATE_RECOVERING);
7518 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7519 release_reservations();
7520 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7521 // XXX: Is this needed?
7522 pg->publish_stats_to_osd();
7523 return transit<WaitLocalBackfillReserved>();
7524 }
7525
7526 boost::statechart::result
7527 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7528 {
7529 PG *pg = context< RecoveryMachine >().pg;
7530 if (!pg->state_test(PG_STATE_RECOVERING)) {
7531 // we may have finished recovery and have an AllReplicasRecovered
7532 // event queued to move us to the next state.
7533 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
7534 return discard_event();
7535 }
7536 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7537 pg->state_clear(PG_STATE_RECOVERING);
7538 pg->state_set(PG_STATE_RECOVERY_WAIT);
7539 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7540 release_reservations(true);
7541 pg->schedule_recovery_retry(evt.delay);
7542 return transit<NotRecovering>();
7543 }
7544
7545 boost::statechart::result
7546 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
7547 {
7548 PG *pg = context< RecoveryMachine >().pg;
7549 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
7550 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
7551 pg->state_clear(PG_STATE_RECOVERING);
7552 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7553 release_reservations(true);
7554 return transit<NotRecovering>();
7555 }
7556
7557 void PG::RecoveryState::Recovering::exit()
7558 {
7559 context< RecoveryMachine >().log_exit(state_name, enter_time);
7560 PG *pg = context< RecoveryMachine >().pg;
7561 utime_t dur = ceph_clock_now() - enter_time;
7562 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7563 }
7564
7565 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7566 : my_base(ctx),
7567 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7568 {
7569 pg_shard_t auth_log_shard;
7570
7571 context< RecoveryMachine >().log_enter(state_name);
7572
7573 PG *pg = context< RecoveryMachine >().pg;
7574
7575 assert(!pg->needs_recovery());
7576
7577 // if we finished backfill, all acting are active; recheck if
7578 // DEGRADED | UNDERSIZED is appropriate.
7579 assert(!pg->actingbackfill.empty());
7580 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7581 pg->actingbackfill.size()) {
7582 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7583 pg->publish_stats_to_osd();
7584 }
7585
7586 // adjust acting set? (e.g. because backfill completed...)
7587 bool history_les_bound = false;
7588 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7589 true, &history_les_bound))
7590 assert(pg->want_acting.size());
7591
7592 if (context< Active >().all_replicas_activated)
7593 post_event(GoClean());
7594 }
7595
7596 void PG::RecoveryState::Recovered::exit()
7597 {
7598 context< RecoveryMachine >().log_exit(state_name, enter_time);
7599 PG *pg = context< RecoveryMachine >().pg;
7600 utime_t dur = ceph_clock_now() - enter_time;
7601 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7602 }
7603
7604 PG::RecoveryState::Clean::Clean(my_context ctx)
7605 : my_base(ctx),
7606 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7607 {
7608 context< RecoveryMachine >().log_enter(state_name);
7609
7610 PG *pg = context< RecoveryMachine >().pg;
7611
7612 if (pg->info.last_complete != pg->info.last_update) {
7613 ceph_abort();
7614 }
7615 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7616
7617 if (pg->is_active()) {
7618 pg->mark_clean();
7619 }
7620
7621 pg->share_pg_info();
7622 pg->publish_stats_to_osd();
7623 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7624 }
7625
7626 void PG::RecoveryState::Clean::exit()
7627 {
7628 context< RecoveryMachine >().log_exit(state_name, enter_time);
7629 PG *pg = context< RecoveryMachine >().pg;
7630 pg->state_clear(PG_STATE_CLEAN);
7631 utime_t dur = ceph_clock_now() - enter_time;
7632 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7633 }
7634
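// Helper: collapse a collection of pg_shard_t into a set holding at most
// one shard per OSD, skipping `skip`.  For example, with skip = (1,s0)
// and input {(1,s0), (2,s1), (2,s2), (3,s0)}, only one entry each for
// osd.2 and osd.3 survives.  Used below to build the remote reservation
// target sets without the local shard.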
7635 template <typename T>
7636 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7637 {
7638 set<int> osds_found;
7639 set<pg_shard_t> out;
7640 for (typename T::const_iterator i = in.begin();
7641 i != in.end();
7642 ++i) {
7643 if (*i != skip && !osds_found.count(i->osd)) {
7644 osds_found.insert(i->osd);
7645 out.insert(*i);
7646 }
7647 }
7648 return out;
7649 }
7650
7651 /*---------Active---------*/
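// Active: the primary's steady state once peering has succeeded.  The
// constructor precomputes the remote shards needing recovery and backfill
// reservations (one per OSD, excluding ourselves), starts the flush,
// calls activate(), and keeps every other acting/backfill shard in
// blocked_by until its "activated and committed" MInfoRec arrives
// (handled further below).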
7652 PG::RecoveryState::Active::Active(my_context ctx)
7653 : my_base(ctx),
7654 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7655 remote_shards_to_reserve_recovery(
7656 unique_osd_shard_set(
7657 context< RecoveryMachine >().pg->pg_whoami,
7658 context< RecoveryMachine >().pg->actingbackfill)),
7659 remote_shards_to_reserve_backfill(
7660 unique_osd_shard_set(
7661 context< RecoveryMachine >().pg->pg_whoami,
7662 context< RecoveryMachine >().pg->backfill_targets)),
7663 all_replicas_activated(false)
7664 {
7665 context< RecoveryMachine >().log_enter(state_name);
7666
7667 PG *pg = context< RecoveryMachine >().pg;
7668
7669 assert(!pg->backfill_reserving);
7670 assert(!pg->backfill_reserved);
7671 assert(pg->is_primary());
7672 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7673 pg->start_flush(
7674 context< RecoveryMachine >().get_cur_transaction(),
7675 context< RecoveryMachine >().get_on_applied_context_list(),
7676 context< RecoveryMachine >().get_on_safe_context_list());
7677 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7678 pg->get_osdmap()->get_epoch(),
7679 *context< RecoveryMachine >().get_on_safe_context_list(),
7680 *context< RecoveryMachine >().get_query_map(),
7681 context< RecoveryMachine >().get_info_map(),
7682 context< RecoveryMachine >().get_recovery_ctx());
7683
7684 // everyone has to commit/ack before we are truly active
7685 pg->blocked_by.clear();
7686 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7687 p != pg->actingbackfill.end();
7688 ++p) {
7689 if (p->shard != pg->pg_whoami.shard) {
7690 pg->blocked_by.insert(p->shard);
7691 }
7692 }
7693 pg->publish_stats_to_osd();
7694 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7695 }
7696
7697 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7698 {
7699 PG *pg = context< RecoveryMachine >().pg;
7700 ldout(pg->cct, 10) << "Active advmap" << dendl;
7701 if (!pg->pool.newly_removed_snaps.empty()) {
7702 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7703 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7704 pg->dirty_info = true;
7705 pg->dirty_big_info = true;
7706 }
7707
7708 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7709 int osd = pg->want_acting[i];
7710 if (!advmap.osdmap->is_up(osd)) {
7711 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7712 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7713 }
7714 }
7715
7716 bool need_publish = false;
7717 /* Check for changes in pool size (if the acting set changed as a result,
7718 * this does not matter) */
7719 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7720 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7721 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7722 pg->state_clear(PG_STATE_UNDERSIZED);
7723 } else {
7724 pg->state_set(PG_STATE_UNDERSIZED);
7725 }
7726     // degraded changes will be detected by the call to publish_stats_to_osd()
7727 need_publish = true;
7728 }
7729
7730 // if we haven't reported our PG stats in a long time, do so now.
7731 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7732 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7733 << " epochs" << dendl;
7734 need_publish = true;
7735 }
7736
7737 if (need_publish)
7738 pg->publish_stats_to_osd();
7739
7740 return forward_event();
7741 }
7742
7743 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7744 {
7745 PG *pg = context< RecoveryMachine >().pg;
7746 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7747 assert(pg->is_primary());
7748
7749 if (pg->have_unfound()) {
7750 // object may have become unfound
7751 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7752 }
7753
7754 if (pg->cct->_conf->osd_check_for_log_corruption)
7755 pg->check_log_for_corruption(pg->osd->store);
7756
7757 uint64_t unfound = pg->missing_loc.num_unfound();
7758 if (unfound > 0 &&
7759 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7760 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7761 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7762 << " objects unfound and apparently lost, would automatically "
7763 << "mark these objects lost but this feature is not yet implemented "
7764 << "(osd_auto_mark_unfound_lost)";
7765 } else
7766 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7767 << unfound << " objects unfound and apparently lost";
7768 }
7769
7770 if (pg->is_active()) {
7771 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7772 pg->kick_snap_trim();
7773 }
7774
7775 if (pg->is_peered() &&
7776 !pg->is_clean() &&
7777 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7778 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7779 pg->queue_recovery();
7780 }
7781 return forward_event();
7782 }
7783
7784 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7785 {
7786 PG *pg = context< RecoveryMachine >().pg;
7787 assert(pg->is_primary());
7788 if (pg->peer_info.count(notevt.from)) {
7789 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7790 << ", already have info from that osd, ignoring"
7791 << dendl;
7792 } else if (pg->peer_purged.count(notevt.from)) {
7793 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7794 << ", already purged that peer, ignoring"
7795 << dendl;
7796 } else {
7797 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7798 << ", calling proc_replica_info and discover_all_missing"
7799 << dendl;
7800 pg->proc_replica_info(
7801 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7802 if (pg->have_unfound()) {
7803 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7804 }
7805 }
7806 return discard_event();
7807 }
7808
7809 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7810 {
7811 PG *pg = context< RecoveryMachine >().pg;
7812 assert(pg->is_primary());
7813
7814 assert(!pg->actingbackfill.empty());
7815 // don't update history (yet) if we are active and primary; the replica
7816 // may be telling us they have activated (and committed) but we can't
7817 // share that until _everyone_ does the same.
7818 if (pg->is_actingbackfill(infoevt.from)) {
7819 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7820 << " activated and committed" << dendl;
7821 pg->peer_activated.insert(infoevt.from);
7822 pg->blocked_by.erase(infoevt.from.shard);
7823 pg->publish_stats_to_osd();
7824 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7825 pg->all_activated_and_committed();
7826 }
7827 }
7828 return discard_event();
7829 }
7830
7831 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7832 {
7833 PG *pg = context< RecoveryMachine >().pg;
7834 ldout(pg->cct, 10) << "searching osd." << logevt.from
7835 << " log for unfound items" << dendl;
7836 pg->proc_replica_log(
7837 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7838 bool got_missing = pg->search_for_missing(
7839 pg->peer_info[logevt.from],
7840 pg->peer_missing[logevt.from],
7841 logevt.from,
7842 context< RecoveryMachine >().get_recovery_ctx());
7843 // If there are missing AND we are "fully" active then start recovery now
7844 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
7845 post_event(DoRecovery());
7846 }
7847 return discard_event();
7848 }
7849
7850 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7851 {
7852 PG *pg = context< RecoveryMachine >().pg;
7853
7854 q.f->open_object_section("state");
7855 q.f->dump_string("name", state_name);
7856 q.f->dump_stream("enter_time") << enter_time;
7857
7858 {
7859 q.f->open_array_section("might_have_unfound");
7860 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7861 p != pg->might_have_unfound.end();
7862 ++p) {
7863 q.f->open_object_section("osd");
7864 q.f->dump_stream("osd") << *p;
7865 if (pg->peer_missing.count(*p)) {
7866 q.f->dump_string("status", "already probed");
7867 } else if (pg->peer_missing_requested.count(*p)) {
7868 q.f->dump_string("status", "querying");
7869 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7870 q.f->dump_string("status", "osd is down");
7871 } else {
7872 q.f->dump_string("status", "not queried");
7873 }
7874 q.f->close_section();
7875 }
7876 q.f->close_section();
7877 }
7878 {
7879 q.f->open_object_section("recovery_progress");
7880 pg->dump_recovery_info(q.f);
7881 q.f->close_section();
7882 }
7883
7884 {
7885 q.f->open_object_section("scrub");
7886 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7887 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7888 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7889 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7890 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7891 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
7892 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7893 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7894 {
7895 q.f->open_array_section("scrubber.waiting_on_whom");
7896 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7897 p != pg->scrubber.waiting_on_whom.end();
7898 ++p) {
7899 q.f->dump_stream("shard") << *p;
7900 }
7901 q.f->close_section();
7902 }
7903 q.f->close_section();
7904 }
7905
7906 q.f->close_section();
7907 return forward_event();
7908 }
7909
7910 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7911 {
7912 PG *pg = context< RecoveryMachine >().pg;
7913 all_replicas_activated = true;
7914
7915 pg->state_clear(PG_STATE_ACTIVATING);
7916 pg->state_clear(PG_STATE_CREATING);
7917 if (pg->acting.size() >= pg->pool.info.min_size) {
7918 pg->state_set(PG_STATE_ACTIVE);
7919 } else {
7920 pg->state_set(PG_STATE_PEERED);
7921 }
7922
7923 // info.last_epoch_started is set during activate()
7924 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7925 pg->info.history.last_interval_started = pg->info.last_interval_started;
7926 pg->dirty_info = true;
7927
7928 pg->share_pg_info();
7929 pg->publish_stats_to_osd();
7930
7931 pg->check_local();
7932
7933 // waiters
7934 if (pg->flushes_in_progress == 0) {
7935 pg->requeue_ops(pg->waiting_for_peered);
7936 } else if (!pg->waiting_for_peered.empty()) {
7937 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
7938 << pg->waiting_for_peered.size()
7939 << " items to waiting_for_flush"
7940 << dendl;
7941 assert(pg->waiting_for_flush.empty());
7942 pg->waiting_for_flush.swap(pg->waiting_for_peered);
7943 }
7944
7945 pg->on_activate();
7946
7947 return discard_event();
7948 }
7949
7950 void PG::RecoveryState::Active::exit()
7951 {
7952 context< RecoveryMachine >().log_exit(state_name, enter_time);
7953 PG *pg = context< RecoveryMachine >().pg;
7954 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7955
7956 pg->blocked_by.clear();
7957 pg->backfill_reserved = false;
7958 pg->backfill_reserving = false;
7959 pg->state_clear(PG_STATE_ACTIVATING);
7960 pg->state_clear(PG_STATE_DEGRADED);
7961 pg->state_clear(PG_STATE_UNDERSIZED);
7962 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7963 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7964 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7965 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7966 utime_t dur = ceph_clock_now() - enter_time;
7967 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7968 pg->agent_stop();
7969 }
7970
7971 /*------ReplicaActive-----*/
7972 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7973 : my_base(ctx),
7974 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7975 {
7976 context< RecoveryMachine >().log_enter(state_name);
7977
7978 PG *pg = context< RecoveryMachine >().pg;
7979 pg->start_flush(
7980 context< RecoveryMachine >().get_cur_transaction(),
7981 context< RecoveryMachine >().get_on_applied_context_list(),
7982 context< RecoveryMachine >().get_on_safe_context_list());
7983 }
7984
7985
7986 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7987 const Activate& actevt) {
7988 PG *pg = context< RecoveryMachine >().pg;
7989 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7990 map<int, map<spg_t, pg_query_t> > query_map;
7991 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7992 actevt.activation_epoch,
7993 *context< RecoveryMachine >().get_on_safe_context_list(),
7994 query_map, NULL, NULL);
7995 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7996 return discard_event();
7997 }
7998
7999 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
8000 {
8001 PG *pg = context< RecoveryMachine >().pg;
8002 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
8003 infoevt.info);
8004 return discard_event();
8005 }
8006
8007 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
8008 {
8009 PG *pg = context< RecoveryMachine >().pg;
8010 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
8011 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8012 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
8013 assert(pg->pg_log.get_head() == pg->info.last_update);
8014
8015 return discard_event();
8016 }
8017
8018 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
8019 {
8020 PG *pg = context< RecoveryMachine >().pg;
8021 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8022 context< RecoveryMachine >().send_notify(
8023 pg->get_primary(),
8024 pg_notify_t(
8025 pg->get_primary().shard, pg->pg_whoami.shard,
8026 pg->get_osdmap()->get_epoch(),
8027 pg->get_osdmap()->get_epoch(),
8028 pg->info),
8029 pg->past_intervals);
8030 }
8031 pg->take_waiters();
8032 return discard_event();
8033 }
8034
8035 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8036 const MQuery& query)
8037 {
8038 PG *pg = context< RecoveryMachine >().pg;
8039 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8040 return discard_event();
8041 }
8042
8043 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
8044 {
8045 q.f->open_object_section("state");
8046 q.f->dump_string("name", state_name);
8047 q.f->dump_stream("enter_time") << enter_time;
8048 q.f->close_section();
8049 return forward_event();
8050 }
8051
8052 void PG::RecoveryState::ReplicaActive::exit()
8053 {
8054 context< RecoveryMachine >().log_exit(state_name, enter_time);
8055 PG *pg = context< RecoveryMachine >().pg;
8056 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8057 utime_t dur = ceph_clock_now() - enter_time;
8058 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
8059 }
8060
8061 /*-------Stray---*/
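// Stray: this OSD holds the PG but is neither primary nor peered (see the
// asserts below).  It waits for the primary to send either a full
// info+log (MLogRec) or just an info (MInfoRec); both reactions post
// Activate and transit to ReplicaActive.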
8062 PG::RecoveryState::Stray::Stray(my_context ctx)
8063 : my_base(ctx),
8064 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
8065 {
8066 context< RecoveryMachine >().log_enter(state_name);
8067
8068 PG *pg = context< RecoveryMachine >().pg;
8069 assert(!pg->is_peered());
8070 assert(!pg->is_peering());
8071 assert(!pg->is_primary());
8072 pg->start_flush(
8073 context< RecoveryMachine >().get_cur_transaction(),
8074 context< RecoveryMachine >().get_on_applied_context_list(),
8075 context< RecoveryMachine >().get_on_safe_context_list());
8076 }
8077
8078 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
8079 {
8080 PG *pg = context< RecoveryMachine >().pg;
8081 MOSDPGLog *msg = logevt.msg.get();
8082 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
8083
8084 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8085 if (msg->info.last_backfill == hobject_t()) {
8086 // restart backfill
8087 pg->unreg_next_scrub();
8088 pg->info = msg->info;
8089 pg->reg_next_scrub();
8090 pg->dirty_info = true;
8091 pg->dirty_big_info = true; // maybe.
8092
8093 PGLogEntryHandler rollbacker{pg, t};
8094 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
8095
8096 pg->pg_log.reset_backfill();
8097 } else {
8098 pg->merge_log(*t, msg->info, msg->log, logevt.from);
8099 }
8100
8101 assert(pg->pg_log.get_head() == pg->info.last_update);
8102
8103 post_event(Activate(logevt.msg->info.last_epoch_started));
8104 return transit<ReplicaActive>();
8105 }
8106
8107 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
8108 {
8109 PG *pg = context< RecoveryMachine >().pg;
8110 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
8111
8112 if (pg->info.last_update > infoevt.info.last_update) {
8113 // rewind divergent log entries
8114 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8115 pg->rewind_divergent_log(*t, infoevt.info.last_update);
8116 pg->info.stats = infoevt.info.stats;
8117 pg->info.hit_set = infoevt.info.hit_set;
8118 }
8119
8120 assert(infoevt.info.last_update == pg->info.last_update);
8121 assert(pg->pg_log.get_head() == pg->info.last_update);
8122
8123 post_event(Activate(infoevt.info.last_epoch_started));
8124 return transit<ReplicaActive>();
8125 }
8126
8127 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
8128 {
8129 PG *pg = context< RecoveryMachine >().pg;
8130 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8131 return discard_event();
8132 }
8133
8134 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
8135 {
8136 PG *pg = context< RecoveryMachine >().pg;
8137 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8138 context< RecoveryMachine >().send_notify(
8139 pg->get_primary(),
8140 pg_notify_t(
8141 pg->get_primary().shard, pg->pg_whoami.shard,
8142 pg->get_osdmap()->get_epoch(),
8143 pg->get_osdmap()->get_epoch(),
8144 pg->info),
8145 pg->past_intervals);
8146 }
8147 pg->take_waiters();
8148 return discard_event();
8149 }
8150
8151 void PG::RecoveryState::Stray::exit()
8152 {
8153 context< RecoveryMachine >().log_exit(state_name, enter_time);
8154 PG *pg = context< RecoveryMachine >().pg;
8155 utime_t dur = ceph_clock_now() - enter_time;
8156 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
8157 }
8158
8159 /*--------GetInfo---------*/
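// GetInfo: first peering step on the primary.  build_prior() selects the
// OSDs to probe and get_infos() sends pg_query_t::INFO to each of them,
// tracking them in peer_info_requested and blocked_by.  Once every
// requested info has arrived (and the prior set is not pg_down), GotInfo
// is posted so the state machine can move on (the transition target is
// declared elsewhere).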
8160 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
8161 : my_base(ctx),
8162 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
8163 {
8164 context< RecoveryMachine >().log_enter(state_name);
8165
8166 PG *pg = context< RecoveryMachine >().pg;
8167 pg->check_past_interval_bounds();
8168 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8169
8170 assert(pg->blocked_by.empty());
8171
8172 prior_set = pg->build_prior();
8173
8174 pg->reset_min_peer_features();
8175 get_infos();
8176 if (prior_set.pg_down) {
8177 post_event(IsDown());
8178 } else if (peer_info_requested.empty()) {
8179 post_event(GotInfo());
8180 }
8181 }
8182
8183 void PG::RecoveryState::GetInfo::get_infos()
8184 {
8185 PG *pg = context< RecoveryMachine >().pg;
8186 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8187
8188 pg->blocked_by.clear();
8189 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
8190 it != prior_set.probe.end();
8191 ++it) {
8192 pg_shard_t peer = *it;
8193 if (peer == pg->pg_whoami) {
8194 continue;
8195 }
8196 if (pg->peer_info.count(peer)) {
8197 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
8198 continue;
8199 }
8200 if (peer_info_requested.count(peer)) {
8201 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
8202 pg->blocked_by.insert(peer.osd);
8203 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
8204 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
8205 } else {
8206 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
8207 context< RecoveryMachine >().send_query(
8208 peer, pg_query_t(pg_query_t::INFO,
8209 it->shard, pg->pg_whoami.shard,
8210 pg->info.history,
8211 pg->get_osdmap()->get_epoch()));
8212 peer_info_requested.insert(peer);
8213 pg->blocked_by.insert(peer.osd);
8214 }
8215 }
8216
8217 pg->publish_stats_to_osd();
8218 }
8219
8220 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
8221 {
8222 PG *pg = context< RecoveryMachine >().pg;
8223
8224 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
8225 if (p != peer_info_requested.end()) {
8226 peer_info_requested.erase(p);
8227 pg->blocked_by.erase(infoevt.from.osd);
8228 }
8229
8230 epoch_t old_start = pg->info.history.last_epoch_started;
8231 if (pg->proc_replica_info(
8232 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
8233 // we got something new ...
8234 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8235 if (old_start < pg->info.history.last_epoch_started) {
8236 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
8237 prior_set = pg->build_prior();
8238
8239 // filter out any osds that got dropped from the probe set from
8240 // peer_info_requested. this is less expensive than restarting
8241 // peering (which would re-probe everyone).
8242 set<pg_shard_t>::iterator p = peer_info_requested.begin();
8243 while (p != peer_info_requested.end()) {
8244 if (prior_set.probe.count(*p) == 0) {
8245 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
8246 peer_info_requested.erase(p++);
8247 } else {
8248 ++p;
8249 }
8250 }
8251 get_infos();
8252 }
8253 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
8254 << hex << infoevt.features << dec << dendl;
8255 pg->apply_peer_features(infoevt.features);
8256
8257 // are we done getting everything?
8258 if (peer_info_requested.empty() && !prior_set.pg_down) {
8259 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
8260 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
8261 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
8262 post_event(GotInfo());
8263 }
8264 }
8265 return discard_event();
8266 }
8267
8268 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
8269 {
8270 PG *pg = context< RecoveryMachine >().pg;
8271 q.f->open_object_section("state");
8272 q.f->dump_string("name", state_name);
8273 q.f->dump_stream("enter_time") << enter_time;
8274
8275 q.f->open_array_section("requested_info_from");
8276 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
8277 p != peer_info_requested.end();
8278 ++p) {
8279 q.f->open_object_section("osd");
8280 q.f->dump_stream("osd") << *p;
8281 if (pg->peer_info.count(*p)) {
8282 q.f->open_object_section("got_info");
8283 pg->peer_info[*p].dump(q.f);
8284 q.f->close_section();
8285 }
8286 q.f->close_section();
8287 }
8288 q.f->close_section();
8289
8290 q.f->close_section();
8291 return forward_event();
8292 }
8293
8294 void PG::RecoveryState::GetInfo::exit()
8295 {
8296 context< RecoveryMachine >().log_exit(state_name, enter_time);
8297 PG *pg = context< RecoveryMachine >().pg;
8298 utime_t dur = ceph_clock_now() - enter_time;
8299 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
8300 pg->blocked_by.clear();
8301 pg->publish_stats_to_osd();
8302 }
8303
8304 /*------GetLog------------*/
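// GetLog: choose_acting() picks the authoritative log shard.  If that
// shard is not us, its log is requested from request_log_from onward; the
// MLogRec reaction stores the message and posts GotLog, whose handler
// merges it via proc_master_log() before transiting to GetMissing.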
8305 PG::RecoveryState::GetLog::GetLog(my_context ctx)
8306 : my_base(ctx),
8307 NamedState(
8308 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
8309 msg(0)
8310 {
8311 context< RecoveryMachine >().log_enter(state_name);
8312
8313 PG *pg = context< RecoveryMachine >().pg;
8314
8315 // adjust acting?
8316 if (!pg->choose_acting(auth_log_shard, false,
8317 &context< Peering >().history_les_bound)) {
8318 if (!pg->want_acting.empty()) {
8319 post_event(NeedActingChange());
8320 } else {
8321 post_event(IsIncomplete());
8322 }
8323 return;
8324 }
8325
8326 // am i the best?
8327 if (auth_log_shard == pg->pg_whoami) {
8328 post_event(GotLog());
8329 return;
8330 }
8331
8332 const pg_info_t& best = pg->peer_info[auth_log_shard];
8333
8334 // am i broken?
8335 if (pg->info.last_update < best.log_tail) {
8336 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
8337 post_event(IsIncomplete());
8338 return;
8339 }
8340
8341 // how much log to request?
8342 eversion_t request_log_from = pg->info.last_update;
8343 assert(!pg->actingbackfill.empty());
8344 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
8345 p != pg->actingbackfill.end();
8346 ++p) {
8347 if (*p == pg->pg_whoami) continue;
8348 pg_info_t& ri = pg->peer_info[*p];
8349 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
8350 ri.last_update < request_log_from)
8351 request_log_from = ri.last_update;
8352 }
8353
8354 // how much?
8355 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
8356 context<RecoveryMachine>().send_query(
8357 auth_log_shard,
8358 pg_query_t(
8359 pg_query_t::LOG,
8360 auth_log_shard.shard, pg->pg_whoami.shard,
8361 request_log_from, pg->info.history,
8362 pg->get_osdmap()->get_epoch()));
8363
8364 assert(pg->blocked_by.empty());
8365 pg->blocked_by.insert(auth_log_shard.osd);
8366 pg->publish_stats_to_osd();
8367 }
8368
8369 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
8370 {
8371 PG *pg = context< RecoveryMachine >().pg;
8372 // make sure our log source didn't go down. we need to check
8373 // explicitly because it may not be part of the prior set, which
8374 // means the Peering state check won't catch it going down.
8375 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
8376 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
8377 << auth_log_shard.osd << " went down" << dendl;
8378 post_event(advmap);
8379 return transit< Reset >();
8380 }
8381
8382 // let the Peering state do its checks.
8383 return forward_event();
8384 }
8385
8386 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
8387 {
8388 PG *pg = context< RecoveryMachine >().pg;
8389 assert(!msg);
8390 if (logevt.from != auth_log_shard) {
8391 ldout(pg->cct, 10) << "GetLog: discarding log from "
8392 << "non-auth_log_shard osd." << logevt.from << dendl;
8393 return discard_event();
8394 }
8395 ldout(pg->cct, 10) << "GetLog: received master log from osd"
8396 << logevt.from << dendl;
8397 msg = logevt.msg;
8398 post_event(GotLog());
8399 return discard_event();
8400 }
8401
8402 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
8403 {
8404 PG *pg = context< RecoveryMachine >().pg;
8405 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
8406 if (msg) {
8407 ldout(pg->cct, 10) << "processing master log" << dendl;
8408 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
8409 msg->info, msg->log, msg->missing,
8410 auth_log_shard);
8411 }
8412 pg->start_flush(
8413 context< RecoveryMachine >().get_cur_transaction(),
8414 context< RecoveryMachine >().get_on_applied_context_list(),
8415 context< RecoveryMachine >().get_on_safe_context_list());
8416 return transit< GetMissing >();
8417 }
8418
8419 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
8420 {
8421 q.f->open_object_section("state");
8422 q.f->dump_string("name", state_name);
8423 q.f->dump_stream("enter_time") << enter_time;
8424 q.f->dump_stream("auth_log_shard") << auth_log_shard;
8425 q.f->close_section();
8426 return forward_event();
8427 }
8428
8429 void PG::RecoveryState::GetLog::exit()
8430 {
8431 context< RecoveryMachine >().log_exit(state_name, enter_time);
8432 PG *pg = context< RecoveryMachine >().pg;
8433 utime_t dur = ceph_clock_now() - enter_time;
8434 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
8435 pg->blocked_by.clear();
8436 pg->publish_stats_to_osd();
8437 }
8438
8439 /*------WaitActingChange--------*/
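// WaitActingChange: choose_acting() wants a different acting set
// (want_acting is non-empty), presumably requested as a pg_temp mapping
// elsewhere.  Until the new map arrives we only watch AdvMap for a wanted
// OSD going down and ignore stray log/info/notify messages.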
8440 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
8441 : my_base(ctx),
8442 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
8443 {
8444 context< RecoveryMachine >().log_enter(state_name);
8445 }
8446
8447 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
8448 {
8449 PG *pg = context< RecoveryMachine >().pg;
8450 OSDMapRef osdmap = advmap.osdmap;
8451
8452 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
8453 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
8454 if (!osdmap->is_up(*p)) {
8455 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
8456 post_event(advmap);
8457 return transit< Reset >();
8458 }
8459 }
8460 return forward_event();
8461 }
8462
8463 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
8464 {
8465 PG *pg = context< RecoveryMachine >().pg;
8466 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
8467 return discard_event();
8468 }
8469
8470 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
8471 {
8472 PG *pg = context< RecoveryMachine >().pg;
8473 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
8474 return discard_event();
8475 }
8476
8477 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
8478 {
8479 PG *pg = context< RecoveryMachine >().pg;
8480 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
8481 return discard_event();
8482 }
8483
8484 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
8485 {
8486 q.f->open_object_section("state");
8487 q.f->dump_string("name", state_name);
8488 q.f->dump_stream("enter_time") << enter_time;
8489 q.f->dump_string("comment", "waiting for pg acting set to change");
8490 q.f->close_section();
8491 return forward_event();
8492 }
8493
8494 void PG::RecoveryState::WaitActingChange::exit()
8495 {
8496 context< RecoveryMachine >().log_exit(state_name, enter_time);
8497 PG *pg = context< RecoveryMachine >().pg;
8498 utime_t dur = ceph_clock_now() - enter_time;
8499 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
8500 }
8501
8502 /*------Down--------*/
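// Down: peering cannot proceed because prior-interval OSDs that may hold
// needed data are down.  Those OSDs are recorded in blocked_by so the
// reported stats and `pg query` output show what we are waiting for.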
8503 PG::RecoveryState::Down::Down(my_context ctx)
8504 : my_base(ctx),
8505 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
8506 {
8507 context< RecoveryMachine >().log_enter(state_name);
8508 PG *pg = context< RecoveryMachine >().pg;
8509
8510 pg->state_clear(PG_STATE_PEERING);
8511 pg->state_set(PG_STATE_DOWN);
8512
8513 auto &prior_set = context< Peering >().prior_set;
8514 assert(pg->blocked_by.empty());
8515 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8516 pg->publish_stats_to_osd();
8517 }
8518
8519 void PG::RecoveryState::Down::exit()
8520 {
8521 context< RecoveryMachine >().log_exit(state_name, enter_time);
8522 PG *pg = context< RecoveryMachine >().pg;
8523
8524 pg->state_clear(PG_STATE_DOWN);
8525 utime_t dur = ceph_clock_now() - enter_time;
8526 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
8527
8528 pg->blocked_by.clear();
8529 pg->publish_stats_to_osd();
8530 }
8531
8532 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8533 {
8534 q.f->open_object_section("state");
8535 q.f->dump_string("name", state_name);
8536 q.f->dump_stream("enter_time") << enter_time;
8537 q.f->dump_string("comment",
8538 "not enough up instances of this PG to go active");
8539 q.f->close_section();
8540 return forward_event();
8541 }
8542
8543 /*------Incomplete--------*/
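// Incomplete: GetLog posted IsIncomplete because no usable authoritative
// log could be chosen.  A new replica notify, or a map change that lowers
// the pool's min_size, retries peering via GetLog or Reset.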
8544 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8545 : my_base(ctx),
8546 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8547 {
8548 context< RecoveryMachine >().log_enter(state_name);
8549 PG *pg = context< RecoveryMachine >().pg;
8550
8551 pg->state_clear(PG_STATE_PEERING);
8552 pg->state_set(PG_STATE_INCOMPLETE);
8553
8554 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8555 assert(pg->blocked_by.empty());
8556 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8557 pg->publish_stats_to_osd();
8558 }
8559
8560 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8561 PG *pg = context< RecoveryMachine >().pg;
8562 int64_t poolnum = pg->info.pgid.pool();
8563
8564   // Reset if min_size turned smaller than its previous value; the pg might now be able to go active
8565 if (!advmap.osdmap->have_pg_pool(poolnum) ||
8566 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8567 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8568 post_event(advmap);
8569 return transit< Reset >();
8570 }
8571
8572 return forward_event();
8573 }
8574
8575 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8576 PG *pg = context< RecoveryMachine >().pg;
8577 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8578 if (pg->proc_replica_info(
8579 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8580 // We got something new, try again!
8581 return transit< GetLog >();
8582 } else {
8583 return discard_event();
8584 }
8585 }
8586
8587 boost::statechart::result PG::RecoveryState::Incomplete::react(
8588 const QueryState& q)
8589 {
8590 q.f->open_object_section("state");
8591 q.f->dump_string("name", state_name);
8592 q.f->dump_stream("enter_time") << enter_time;
8593 q.f->dump_string("comment", "not enough complete instances of this PG");
8594 q.f->close_section();
8595 return forward_event();
8596 }
8597
8598 void PG::RecoveryState::Incomplete::exit()
8599 {
8600 context< RecoveryMachine >().log_exit(state_name, enter_time);
8601 PG *pg = context< RecoveryMachine >().pg;
8602
8603 pg->state_clear(PG_STATE_INCOMPLETE);
8604 utime_t dur = ceph_clock_now() - enter_time;
8605 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8606
8607 pg->blocked_by.clear();
8608 pg->publish_stats_to_osd();
8609 }
8610
8611 /*------GetMissing--------*/
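// GetMissing: for each acting/backfill peer whose log may diverge from
// ours, request log+missing back to that peer's last_epoch_started (or
// the full log if its tail is too recent).  Peers that are empty, about
// to be fully backfilled, or already identical to us are skipped.  Once
// nothing is outstanding we either wait for up_thru or post Activate.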
8612 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8613 : my_base(ctx),
8614 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8615 {
8616 context< RecoveryMachine >().log_enter(state_name);
8617
8618 PG *pg = context< RecoveryMachine >().pg;
8619 assert(!pg->actingbackfill.empty());
8620 eversion_t since;
8621 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8622 i != pg->actingbackfill.end();
8623 ++i) {
8624 if (*i == pg->get_primary()) continue;
8625 const pg_info_t& pi = pg->peer_info[*i];
8626     // reset this to make sure the pg_missing_t is initialized and
8627 // has the correct semantics even if we don't need to get a
8628 // missing set from a shard. This way later additions due to
8629 // lost+unfound delete work properly.
8630 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8631
8632 if (pi.is_empty())
8633 continue; // no pg data, nothing divergent
8634
8635 if (pi.last_update < pg->pg_log.get_tail()) {
8636 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8637 pg->peer_missing[*i].clear();
8638 continue;
8639 }
8640 if (pi.last_backfill == hobject_t()) {
8641 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8642 pg->peer_missing[*i].clear();
8643 continue;
8644 }
8645
8646 if (pi.last_update == pi.last_complete && // peer has no missing
8647 pi.last_update == pg->info.last_update) { // peer is up to date
8648 // replica has no missing and identical log as us. no need to
8649 // pull anything.
8650 // FIXME: we can do better here. if last_update==last_complete we
8651 // can infer the rest!
8652 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8653 pg->peer_missing[*i].clear();
8654 continue;
8655 }
8656
8657 // We pull the log from the peer's last_epoch_started to ensure we
8658 // get enough log to detect divergent updates.
8659 since.epoch = pi.last_epoch_started;
8660 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8661 if (pi.log_tail <= since) {
8662 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8663 context< RecoveryMachine >().send_query(
8664 *i,
8665 pg_query_t(
8666 pg_query_t::LOG,
8667 i->shard, pg->pg_whoami.shard,
8668 since, pg->info.history,
8669 pg->get_osdmap()->get_epoch()));
8670 } else {
8671 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8672 << " (want since " << since << " < log.tail "
8673 << pi.log_tail << ")" << dendl;
8674 context< RecoveryMachine >().send_query(
8675 *i, pg_query_t(
8676 pg_query_t::FULLLOG,
8677 i->shard, pg->pg_whoami.shard,
8678 pg->info.history, pg->get_osdmap()->get_epoch()));
8679 }
8680 peer_missing_requested.insert(*i);
8681 pg->blocked_by.insert(i->osd);
8682 }
8683
8684 if (peer_missing_requested.empty()) {
8685 if (pg->need_up_thru) {
8686 ldout(pg->cct, 10) << " still need up_thru update before going active"
8687 << dendl;
8688 post_event(NeedUpThru());
8689 return;
8690 }
8691
8692 // all good!
8693 post_event(Activate(pg->get_osdmap()->get_epoch()));
8694 } else {
8695 pg->publish_stats_to_osd();
8696 }
8697 }
8698
8699 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8700 {
8701 PG *pg = context< RecoveryMachine >().pg;
8702
8703 peer_missing_requested.erase(logevt.from);
8704 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8705
8706 if (peer_missing_requested.empty()) {
8707 if (pg->need_up_thru) {
8708 ldout(pg->cct, 10) << " still need up_thru update before going active"
8709 << dendl;
8710 post_event(NeedUpThru());
8711 } else {
8712 ldout(pg->cct, 10) << "Got last missing, don't need missing "
8713 << "posting Activate" << dendl;
8714 post_event(Activate(pg->get_osdmap()->get_epoch()));
8715 }
8716 }
8717 return discard_event();
8718 }
8719
8720 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8721 {
8722 PG *pg = context< RecoveryMachine >().pg;
8723 q.f->open_object_section("state");
8724 q.f->dump_string("name", state_name);
8725 q.f->dump_stream("enter_time") << enter_time;
8726
8727 q.f->open_array_section("peer_missing_requested");
8728 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8729 p != peer_missing_requested.end();
8730 ++p) {
8731 q.f->open_object_section("osd");
8732 q.f->dump_stream("osd") << *p;
8733 if (pg->peer_missing.count(*p)) {
8734 q.f->open_object_section("got_missing");
8735 pg->peer_missing[*p].dump(q.f);
8736 q.f->close_section();
8737 }
8738 q.f->close_section();
8739 }
8740 q.f->close_section();
8741
8742 q.f->close_section();
8743 return forward_event();
8744 }
8745
8746 void PG::RecoveryState::GetMissing::exit()
8747 {
8748 context< RecoveryMachine >().log_exit(state_name, enter_time);
8749 PG *pg = context< RecoveryMachine >().pg;
8750 utime_t dur = ceph_clock_now() - enter_time;
8751 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8752 pg->blocked_by.clear();
8753 pg->publish_stats_to_osd();
8754 }
8755
8756 /*------WaitUpThru--------*/
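// The PG has the log/missing information it needs but cannot activate until
// the OSDMap records a new up_thru for this OSD; otherwise a later peering
// pass could conclude this interval never went active even though it may
// have served writes.  Each ActMap re-checks need_up_thru and posts Activate
// once the map has caught up; a straggling MLogRec is simply folded into
// peer_missing/peer_info.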
8757 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8758 : my_base(ctx),
8759 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8760 {
8761 context< RecoveryMachine >().log_enter(state_name);
8762 }
8763
8764 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8765 {
8766 PG *pg = context< RecoveryMachine >().pg;
8767 if (!pg->need_up_thru) {
8768 post_event(Activate(pg->get_osdmap()->get_epoch()));
8769 }
8770 return forward_event();
8771 }
8772
8773 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8774 {
8775 PG *pg = context< RecoveryMachine >().pg;
8776 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8777 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8778 pg->peer_info[logevt.from] = logevt.msg->info;
8779 return discard_event();
8780 }
8781
8782 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8783 {
8784 q.f->open_object_section("state");
8785 q.f->dump_string("name", state_name);
8786 q.f->dump_stream("enter_time") << enter_time;
8787 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8788 q.f->close_section();
8789 return forward_event();
8790 }
8791
8792 void PG::RecoveryState::WaitUpThru::exit()
8793 {
8794 context< RecoveryMachine >().log_exit(state_name, enter_time);
8795 PG *pg = context< RecoveryMachine >().pg;
8796 utime_t dur = ceph_clock_now() - enter_time;
8797 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8798 }
8799
8800 /*----RecoveryState::RecoveryMachine Methods-----*/
8801 #undef dout_prefix
8802 #define dout_prefix *_dout << pg->gen_prefix()
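// log_enter/log_exit feed the OSD-wide pg_recovery_stats table: each state
// records when it is entered and, on exit, its total dwell time plus the
// number of events handled and the time spent handling them (event_count
// and event_time are accumulated by end_handle() below and reset here).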
8803
8804 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8805 {
8806 PG *pg = context< RecoveryMachine >().pg;
8807 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8808 pg->osd->pg_recovery_stats.log_enter(state_name);
8809 }
8810
8811 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8812 {
8813 utime_t dur = ceph_clock_now() - enter_time;
8814 PG *pg = context< RecoveryMachine >().pg;
8815 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8816 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8817 event_count, event_time);
8818 event_count = 0;
8819 event_time = utime_t();
8820 }
8821
8822
8823 /*---------------------------------------------------*/
8824 #undef dout_prefix
8825 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8826
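// RecoveryCtx lifecycle: start_handle() installs the caller-supplied
// RecoveryCtx for the duration of one event delivery (routing it through
// messages_pending_flush if buffering is active), begin_block_outgoing()
// diverts any notifies/queries/infos generated by the state machine into a
// BufferedRecoveryMessages holding area, clear_blocked_outgoing() discards
// that buffer without delivering it, end_block_outgoing() merges the buffer
// back into the original context, and end_handle() accounts the elapsed
// time to the machine and drops the context again.
//
// A minimal usage sketch (illustrative only; the real call sites live in
// the event-handling paths of PG/OSD, and 'rs'/'rctx' are placeholder
// names, not identifiers from this file):
//
//   rs.start_handle(&rctx);        // rctx supplied by the caller
//   rs.begin_block_outgoing();     // start buffering outgoing messages
//   ...                            // deliver events to the state machine
//   rs.end_block_outgoing();       // hand buffered messages back to rctx
//   rs.end_handle();               // record timing, detach from rctx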
8827 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8828 assert(!rctx);
8829 assert(!orig_ctx);
8830 orig_ctx = new_ctx;
8831 if (new_ctx) {
8832 if (messages_pending_flush) {
8833 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8834 } else {
8835 rctx = *new_ctx;
8836 }
8837 rctx->start_time = ceph_clock_now();
8838 }
8839 }
8840
8841 void PG::RecoveryState::begin_block_outgoing() {
8842 assert(!messages_pending_flush);
8843 assert(orig_ctx);
8844 assert(rctx);
8845 messages_pending_flush = BufferedRecoveryMessages();
8846 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8847 }
8848
8849 void PG::RecoveryState::clear_blocked_outgoing() {
8850 assert(orig_ctx);
8851 assert(rctx);
8852 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8853 }
8854
8855 void PG::RecoveryState::end_block_outgoing() {
8856 assert(messages_pending_flush);
8857 assert(orig_ctx);
8858 assert(rctx);
8859
8860 rctx = RecoveryCtx(*orig_ctx);
8861 rctx->accept_buffered_messages(*messages_pending_flush);
8862 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8863 }
8864
8865 void PG::RecoveryState::end_handle() {
8866 if (rctx) {
8867 utime_t dur = ceph_clock_now() - rctx->start_time;
8868 machine.event_time += dur;
8869 }
8870
8871 machine.event_count++;
8872 rctx = boost::optional<RecoveryCtx>();
8873 orig_ctx = NULL;
8874 }
8875
8876 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8877 {
8878 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8879 << " " << bi.objects.size() << " objects";
8880 if (!bi.objects.empty())
8881 out << " " << bi.objects;
8882 out << ")";
8883 return out;
8884 }
8885
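// Free functions required by boost::intrusive_ptr<PG>: they forward to the
// PG's tagged reference counting (get()/put()) and, when PG_DEBUG_REFS is
// defined, to the id-tracking variants so individual references can be
// traced.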
8886 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8887 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8888
8889 #ifdef PG_DEBUG_REFS
8890 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8891 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8892 #endif