]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PG.cc
update sources to 12.2.10
[ceph.git] / ceph / src / osd / PG.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if ( tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if(!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if(ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179 if (--ref== 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\tid: " << *i << dendl;
224 }
225 }
226 #endif
227
228
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
241 pi->build_removed_snaps(newly_removed_snaps);
242 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
243 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.swap(removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps.swap(newly_removed_snaps);
251 newly_removed_snaps.clear();
252 }
253 } else
254 newly_removed_snaps.clear();
255 snapc = pi->get_snap_context();
256 } else {
257 /* 1) map->get_epoch() == cached_epoch + 1 &&
258 * 2) pi->get_snap_epoch() != map->get_epoch()
259 *
260 * From the if branch, 1 && 2 must be true. From 2, we know that
261 * this map didn't change the set of removed snaps. From 1, we
262 * know that our cached_removed_snaps matches the previous map.
263 * Thus, from 1 && 2, cached_removed snaps matches the current
264 * set of removed snaps and all we have to do is clear
265 * newly_removed_snaps.
266 */
267 newly_removed_snaps.clear();
268 }
269 cached_epoch = map->get_epoch();
270 lgeneric_subdout(cct, osd, 20)
271 << "PGPool::update cached_removed_snaps "
272 << cached_removed_snaps
273 << " newly_removed_snaps "
274 << newly_removed_snaps
275 << " snapc " << snapc
276 << (updated ? " (updated)":" (no change)")
277 << dendl;
278 }
279
280 PG::PG(OSDService *o, OSDMapRef curmap,
281 const PGPool &_pool, spg_t p) :
282 osd(o),
283 cct(o->cct),
284 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
285 snap_mapper(
286 cct,
287 &osdriver,
288 p.ps(),
289 p.get_split_bits(curmap->get_pg_num(_pool.id)),
290 _pool.id,
291 p.shard),
292 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
293 _lock("PG::_lock"),
294 #ifdef PG_DEBUG_REFS
295 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
296 #endif
297 deleting(false),
298 trace_endpoint("0.0.0.0", 0, "PG"),
299 dirty_info(false), dirty_big_info(false),
300 info(p),
301 info_struct_v(0),
302 coll(p),
303 pg_log(cct),
304 pgmeta_oid(p.make_pgmeta_oid()),
305 missing_loc(this),
306 past_intervals(
307 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
308 *curmap),
309 stat_queue_item(this),
310 scrub_queued(false),
311 recovery_queued(false),
312 recovery_ops_active(0),
313 role(-1),
314 state(0),
315 send_notify(false),
316 pg_whoami(osd->whoami, p.shard),
317 need_up_thru(false),
318 last_peering_reset(0),
319 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
320 backfill_reserved(false),
321 backfill_reserving(false),
322 flushes_in_progress(0),
323 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
324 pg_stats_publish_valid(false),
325 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
326 finish_sync_event(NULL),
327 backoff_lock("PG::backoff_lock"),
328 scrub_after_recovery(false),
329 active_pushes(0),
330 recovery_state(this),
331 pg_id(p),
332 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
334 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
335 last_epoch(0)
336 {
337 #ifdef PG_DEBUG_REFS
338 osd->add_pgid(p, this);
339 #endif
340 #ifdef WITH_BLKIN
341 std::stringstream ss;
342 ss << "PG " << info.pgid;
343 trace_endpoint.copy_name(ss.str());
344 #endif
345 osr->shard_hint = p;
346 }
347
348 PG::~PG()
349 {
350 pgstate_history.set_pg_in_destructor();
351 #ifdef PG_DEBUG_REFS
352 osd->remove_pgid(info.pgid, this);
353 #endif
354 }
355
356 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
357 {
358 handle.suspend_tp_timeout();
359 lock();
360 handle.reset_tp_timeout();
361 }
362
363 void PG::lock(bool no_lockdep) const
364 {
365 _lock.Lock(no_lockdep);
366 // if we have unrecorded dirty state with the lock dropped, there is a bug
367 assert(!dirty_info);
368 assert(!dirty_big_info);
369
370 dout(30) << "lock" << dendl;
371 }
372
373 std::string PG::gen_prefix() const
374 {
375 stringstream out;
376 OSDMapRef mapref = osdmap_ref;
377 if (_lock.is_locked_by_me()) {
378 out << "osd." << osd->whoami
379 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
380 << " " << *this << " ";
381 } else {
382 out << "osd." << osd->whoami
383 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
384 << " pg[" << info.pgid << "(unlocked)] ";
385 }
386 return out.str();
387 }
388
389 /********* PG **********/
390
391 void PG::proc_master_log(
392 ObjectStore::Transaction& t, pg_info_t &oinfo,
393 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
394 {
395 dout(10) << "proc_master_log for osd." << from << ": "
396 << olog << " " << omissing << dendl;
397 assert(!is_peered() && is_primary());
398
399 // merge log into our own log to build master log. no need to
400 // make any adjustments to their missing map; we are taking their
401 // log to be authoritative (i.e., their entries are by definitely
402 // non-divergent).
403 merge_log(t, oinfo, olog, from);
404 peer_info[from] = oinfo;
405 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
406 might_have_unfound.insert(from);
407
408 // See doc/dev/osd_internals/last_epoch_started
409 if (oinfo.last_epoch_started > info.last_epoch_started) {
410 info.last_epoch_started = oinfo.last_epoch_started;
411 dirty_info = true;
412 }
413 if (oinfo.last_interval_started > info.last_interval_started) {
414 info.last_interval_started = oinfo.last_interval_started;
415 dirty_info = true;
416 }
417 update_history(oinfo.history);
418 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
419 info.last_epoch_started >= info.history.last_epoch_started);
420
421 peer_missing[from].claim(omissing);
422 }
423
424 void PG::proc_replica_log(
425 pg_info_t &oinfo,
426 const pg_log_t &olog,
427 pg_missing_t& omissing,
428 pg_shard_t from)
429 {
430 dout(10) << "proc_replica_log for osd." << from << ": "
431 << oinfo << " " << olog << " " << omissing << dendl;
432
433 pg_log.proc_replica_log(oinfo, olog, omissing, from);
434
435 peer_info[from] = oinfo;
436 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
437 might_have_unfound.insert(from);
438
439 for (map<hobject_t, pg_missing_item>::const_iterator i =
440 omissing.get_items().begin();
441 i != omissing.get_items().end();
442 ++i) {
443 dout(20) << " after missing " << i->first << " need " << i->second.need
444 << " have " << i->second.have << dendl;
445 }
446 peer_missing[from].claim(omissing);
447 }
448
449 bool PG::proc_replica_info(
450 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
451 {
452 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
453 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
454 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
455 return false;
456 }
457
458 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
459 dout(10) << " got info " << oinfo << " from down osd." << from
460 << " discarding" << dendl;
461 return false;
462 }
463
464 dout(10) << " got osd." << from << " " << oinfo << dendl;
465 assert(is_primary());
466 peer_info[from] = oinfo;
467 might_have_unfound.insert(from);
468
469 update_history(oinfo.history);
470
471 // stray?
472 if (!is_up(from) && !is_acting(from)) {
473 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
474 stray_set.insert(from);
475 if (is_clean()) {
476 purge_strays();
477 }
478 }
479
480 // was this a new info? if so, update peers!
481 if (p == peer_info.end())
482 update_heartbeat_peers();
483
484 return true;
485 }
486
487 void PG::remove_snap_mapped_object(
488 ObjectStore::Transaction &t, const hobject_t &soid)
489 {
490 t.remove(
491 coll,
492 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
493 clear_object_snap_mapping(&t, soid);
494 }
495
496 void PG::clear_object_snap_mapping(
497 ObjectStore::Transaction *t, const hobject_t &soid)
498 {
499 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
500 if (soid.snap < CEPH_MAXSNAP) {
501 int r = snap_mapper.remove_oid(
502 soid,
503 &_t);
504 if (!(r == 0 || r == -ENOENT)) {
505 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
506 ceph_abort();
507 }
508 }
509 }
510
511 void PG::update_object_snap_mapping(
512 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
513 {
514 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
515 assert(soid.snap < CEPH_MAXSNAP);
516 int r = snap_mapper.remove_oid(
517 soid,
518 &_t);
519 if (!(r == 0 || r == -ENOENT)) {
520 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
521 ceph_abort();
522 }
523 snap_mapper.add_oid(
524 soid,
525 snaps,
526 &_t);
527 }
528
529 void PG::merge_log(
530 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
531 {
532 PGLogEntryHandler rollbacker{this, &t};
533 pg_log.merge_log(
534 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
535 }
536
537 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
538 {
539 PGLogEntryHandler rollbacker{this, &t};
540 pg_log.rewind_divergent_log(
541 newhead, info, &rollbacker, dirty_info, dirty_big_info);
542 }
543
544 /*
545 * Process information from a replica to determine if it could have any
546 * objects that i need.
547 *
548 * TODO: if the missing set becomes very large, this could get expensive.
549 * Instead, we probably want to just iterate over our unfound set.
550 */
551 bool PG::search_for_missing(
552 const pg_info_t &oinfo, const pg_missing_t &omissing,
553 pg_shard_t from,
554 RecoveryCtx *ctx)
555 {
556 uint64_t num_unfound_before = missing_loc.num_unfound();
557 bool found_missing = missing_loc.add_source_info(
558 from, oinfo, omissing, ctx->handle);
559 if (found_missing && num_unfound_before != missing_loc.num_unfound())
560 publish_stats_to_osd();
561 // avoid doing this if the peer is empty. This is abit of paranoia
562 // to avoid doing something rash if add_source_info() above
563 // incorrectly decided we found something new. (if the peer has
564 // last_update=0'0 that's impossible.)
565 if (found_missing &&
566 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
567 CEPH_FEATURE_OSD_ERASURE_CODES) &&
568 oinfo.last_update != eversion_t()) {
569 pg_info_t tinfo(oinfo);
570 tinfo.pgid.shard = pg_whoami.shard;
571 (*(ctx->info_map))[from.osd].push_back(
572 make_pair(
573 pg_notify_t(
574 from.shard, pg_whoami.shard,
575 get_osdmap()->get_epoch(),
576 get_osdmap()->get_epoch(),
577 tinfo),
578 past_intervals));
579 }
580 return found_missing;
581 }
582
583
584 // MissingLoc
585
586 bool PG::MissingLoc::readable_with_acting(
587 const hobject_t &hoid,
588 const set<pg_shard_t> &acting) const {
589 if (!needs_recovery(hoid))
590 return true;
591 if (is_deleted(hoid))
592 return false;
593 auto missing_loc_entry = missing_loc.find(hoid);
594 if (missing_loc_entry == missing_loc.end())
595 return false;
596 const set<pg_shard_t> &locs = missing_loc_entry->second;
597 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
598 set<pg_shard_t> have_acting;
599 for (set<pg_shard_t>::const_iterator i = locs.begin();
600 i != locs.end();
601 ++i) {
602 if (acting.count(*i))
603 have_acting.insert(*i);
604 }
605 return (*is_readable)(have_acting);
606 }
607
608 void PG::MissingLoc::add_batch_sources_info(
609 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
610 {
611 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
612 << sources.size() << dendl;
613 unsigned loop = 0;
614 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
615 i != needs_recovery_map.end();
616 ++i) {
617 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
618 handle->reset_tp_timeout();
619 loop = 0;
620 }
621 if (i->second.is_delete())
622 continue;
623
624 auto p = missing_loc.find(i->first);
625 if (p == missing_loc.end()) {
626 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
627 } else {
628 _dec_count(p->second);
629 }
630 missing_loc[i->first].insert(sources.begin(), sources.end());
631 missing_loc_sources.insert(sources.begin(), sources.end());
632 _inc_count(p->second);
633
634 }
635 }
636
637 bool PG::MissingLoc::add_source_info(
638 pg_shard_t fromosd,
639 const pg_info_t &oinfo,
640 const pg_missing_t &omissing,
641 ThreadPool::TPHandle* handle)
642 {
643 bool found_missing = false;
644 unsigned loop = 0;
645 // found items?
646 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
647 p != needs_recovery_map.end();
648 ++p) {
649 const hobject_t &soid(p->first);
650 eversion_t need = p->second.need;
651 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
652 handle->reset_tp_timeout();
653 loop = 0;
654 }
655 if (p->second.is_delete()) {
656 ldout(pg->cct, 10) << __func__ << " " << soid
657 << " delete, ignoring source" << dendl;
658 continue;
659 }
660 if (oinfo.last_update < need) {
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (last_update " << oinfo.last_update
664 << " < needed " << need << ")" << dendl;
665 continue;
666 }
667 if (!oinfo.last_backfill.is_max() &&
668 !oinfo.last_backfill_bitwise) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd
671 << " (last_backfill " << oinfo.last_backfill
672 << " but with wrong sort order)"
673 << dendl;
674 continue;
675 }
676 if (p->first >= oinfo.last_backfill) {
677 // FIXME: this is _probably_ true, although it could conceivably
678 // be in the undefined region! Hmm!
679 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
680 << " also missing on osd." << fromosd
681 << " (past last_backfill " << oinfo.last_backfill
682 << ")" << dendl;
683 continue;
684 }
685 if (oinfo.last_complete < need) {
686 if (omissing.is_missing(soid)) {
687 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
688 << " also missing on osd." << fromosd << dendl;
689 continue;
690 }
691 }
692
693 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
694 << " is on osd." << fromosd << dendl;
695
696 missing_loc_sources.insert(fromosd);
697 {
698 auto p = missing_loc.find(soid);
699 if (p == missing_loc.end()) {
700 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
701 } else {
702 _dec_count(p->second);
703 }
704 p->second.insert(fromosd);
705 _inc_count(p->second);
706 }
707
708 found_missing = true;
709 }
710
711 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
712 << dendl;
713 return found_missing;
714 }
715
716 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
717 {
718 set<pg_shard_t> now_down;
719 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
720 p != missing_loc_sources.end();
721 ) {
722 if (osdmap->is_up(p->osd)) {
723 ++p;
724 continue;
725 }
726 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
727 now_down.insert(*p);
728 missing_loc_sources.erase(p++);
729 }
730
731 if (now_down.empty()) {
732 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
733 } else {
734 ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
735 << missing_loc_sources << dendl;
736
737 // filter missing_loc
738 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
739 while (p != missing_loc.end()) {
740 set<pg_shard_t>::iterator q = p->second.begin();
741 bool changed = false;
742 while (q != p->second.end()) {
743 if (now_down.count(*q)) {
744 if (!changed) {
745 changed = true;
746 _dec_count(p->second);
747 }
748 p->second.erase(q++);
749 } else {
750 ++q;
751 }
752 }
753 if (p->second.empty()) {
754 missing_loc.erase(p++);
755 } else {
756 if (changed) {
757 _inc_count(p->second);
758 }
759 ++p;
760 }
761 }
762 }
763 }
764
765 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
766 {
767 auto &missing = pg_log.get_missing();
768 uint64_t unfound = get_num_unfound();
769
770 dout(10) << __func__ << " "
771 << missing.num_missing() << " missing, "
772 << unfound << " unfound"
773 << dendl;
774
775 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
776 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
777 for (; m != mend; ++m) {
778 pg_shard_t peer(*m);
779
780 if (!get_osdmap()->is_up(peer.osd)) {
781 dout(20) << __func__ << " skipping down osd." << peer << dendl;
782 continue;
783 }
784
785 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
786 if (iter != peer_info.end() &&
787 (iter->second.is_empty() || iter->second.dne())) {
788 // ignore empty peers
789 continue;
790 }
791
792 // If we've requested any of this stuff, the pg_missing_t information
793 // should be on its way.
794 // TODO: coalsce requested_* into a single data structure
795 if (peer_missing.find(peer) != peer_missing.end()) {
796 dout(20) << __func__ << ": osd." << peer
797 << ": we already have pg_missing_t" << dendl;
798 continue;
799 }
800 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
801 dout(20) << __func__ << ": osd." << peer
802 << ": in peer_log_requested" << dendl;
803 continue;
804 }
805 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
806 dout(20) << __func__ << ": osd." << peer
807 << ": in peer_missing_requested" << dendl;
808 continue;
809 }
810
811 // Request missing
812 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
813 << dendl;
814 peer_missing_requested.insert(peer);
815 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
816 pg_query_t(
817 pg_query_t::FULLLOG,
818 peer.shard, pg_whoami.shard,
819 info.history, get_osdmap()->get_epoch());
820 }
821 }
822
823 /******* PG ***********/
824 bool PG::needs_recovery() const
825 {
826 assert(is_primary());
827
828 auto &missing = pg_log.get_missing();
829
830 if (missing.num_missing()) {
831 dout(10) << __func__ << " primary has " << missing.num_missing()
832 << " missing" << dendl;
833 return true;
834 }
835
836 assert(!actingbackfill.empty());
837 set<pg_shard_t>::const_iterator end = actingbackfill.end();
838 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
839 for (; a != end; ++a) {
840 if (*a == get_primary()) continue;
841 pg_shard_t peer = *a;
842 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
843 if (pm == peer_missing.end()) {
844 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
845 << dendl;
846 continue;
847 }
848 if (pm->second.num_missing()) {
849 dout(10) << __func__ << " osd." << peer << " has "
850 << pm->second.num_missing() << " missing" << dendl;
851 return true;
852 }
853 }
854
855 dout(10) << __func__ << " is recovered" << dendl;
856 return false;
857 }
858
859 bool PG::needs_backfill() const
860 {
861 assert(is_primary());
862
863 // We can assume that only possible osds that need backfill
864 // are on the backfill_targets vector nodes.
865 set<pg_shard_t>::const_iterator end = backfill_targets.end();
866 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
867 for (; a != end; ++a) {
868 pg_shard_t peer = *a;
869 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
870 if (!pi->second.last_backfill.is_max()) {
871 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
872 return true;
873 }
874 }
875
876 dout(10) << __func__ << " does not need backfill" << dendl;
877 return false;
878 }
879
880
881 void PG::check_past_interval_bounds() const
882 {
883 auto rpib = get_required_past_interval_bounds(
884 info,
885 osd->get_superblock().oldest_map);
886 if (rpib.first >= rpib.second) {
887 if (!past_intervals.empty()) {
888 osd->clog->error() << info.pgid << " required past_interval bounds are"
889 << " empty [" << rpib << ") but past_intervals is not: "
890 << past_intervals;
891 derr << info.pgid << " required past_interval bounds are"
892 << " empty [" << rpib << ") but past_intervals is not: "
893 << past_intervals << dendl;
894 }
895 } else {
896 if (past_intervals.empty()) {
897 osd->clog->error() << info.pgid << " required past_interval bounds are"
898 << " not empty [" << rpib << ") but past_intervals "
899 << past_intervals << " is empty";
900 derr << info.pgid << " required past_interval bounds are"
901 << " not empty [" << rpib << ") but past_intervals "
902 << past_intervals << " is empty" << dendl;
903 assert(!past_intervals.empty());
904 }
905
906 auto apib = past_intervals.get_bounds();
907 if (apib.first > rpib.first) {
908 osd->clog->error() << info.pgid << " past_intervals [" << apib
909 << ") start interval does not contain the required"
910 << " bound [" << rpib << ") start";
911 derr << info.pgid << " past_intervals [" << apib
912 << ") start interval does not contain the required"
913 << " bound [" << rpib << ") start" << dendl;
914 assert(0 == "past_interval start interval mismatch");
915 }
916 if (apib.second != rpib.second) {
917 osd->clog->error() << info.pgid << " past_interal bound [" << apib
918 << ") end does not match required [" << rpib
919 << ") end";
920 derr << info.pgid << " past_interal bound [" << apib
921 << ") end does not match required [" << rpib
922 << ") end" << dendl;
923 assert(0 == "past_interval end mismatch");
924 }
925 }
926 }
927
928 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
929 {
930 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
931 if (need_up_thru &&
932 up_thru >= info.history.same_interval_since) {
933 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
934 need_up_thru = false;
935 return true;
936 }
937 return false;
938 }
939
940 void PG::remove_down_peer_info(const OSDMapRef osdmap)
941 {
942 // Remove any downed osds from peer_info
943 bool removed = false;
944 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
945 while (p != peer_info.end()) {
946 if (!osdmap->is_up(p->first.osd)) {
947 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
948 peer_missing.erase(p->first);
949 peer_log_requested.erase(p->first);
950 peer_missing_requested.erase(p->first);
951 peer_info.erase(p++);
952 removed = true;
953 } else
954 ++p;
955 }
956
957 // if we removed anyone, update peers (which include peer_info)
958 if (removed)
959 update_heartbeat_peers();
960 check_recovery_sources(osdmap);
961 }
962
963 /*
964 * Returns true unless there is a non-lost OSD in might_have_unfound.
965 */
966 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
967 {
968 assert(is_primary());
969
970 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
971 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
972 for (; peer != mend; ++peer) {
973 if (peer_missing.count(*peer))
974 continue;
975 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
976 if (iter != peer_info.end() &&
977 (iter->second.is_empty() || iter->second.dne()))
978 continue;
979 if (!osdmap->exists(peer->osd))
980 continue;
981 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
982 if (osd_info.lost_at <= osd_info.up_from) {
983 // If there is even one OSD in might_have_unfound that isn't lost, we
984 // still might retrieve our unfound.
985 return false;
986 }
987 }
988 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
989 << " have been queried or are marked lost" << dendl;
990 return true;
991 }
992
993 PastIntervals::PriorSet PG::build_prior()
994 {
995 if (1) {
996 // sanity check
997 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
998 it != peer_info.end();
999 ++it) {
1000 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
1001 }
1002 }
1003
1004 const OSDMap &osdmap = *get_osdmap();
1005 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1006 pool.info.ec_pool(),
1007 info.history.last_epoch_started,
1008 get_pgbackend()->get_is_recoverable_predicate(),
1009 [&](epoch_t start, int osd, epoch_t *lost_at) {
1010 const osd_info_t *pinfo = 0;
1011 if (osdmap.exists(osd)) {
1012 pinfo = &osdmap.get_info(osd);
1013 if (lost_at)
1014 *lost_at = pinfo->lost_at;
1015 }
1016
1017 if (osdmap.is_up(osd)) {
1018 return PastIntervals::UP;
1019 } else if (!pinfo) {
1020 return PastIntervals::DNE;
1021 } else if (pinfo->lost_at > start) {
1022 return PastIntervals::LOST;
1023 } else {
1024 return PastIntervals::DOWN;
1025 }
1026 },
1027 up,
1028 acting,
1029 this);
1030
1031 if (prior.pg_down) {
1032 state_set(PG_STATE_DOWN);
1033 }
1034
1035 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1036 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1037 << " < same_since " << info.history.same_interval_since
1038 << ", must notify monitor" << dendl;
1039 need_up_thru = true;
1040 } else {
1041 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1042 << " >= same_since " << info.history.same_interval_since
1043 << ", all is well" << dendl;
1044 need_up_thru = false;
1045 }
1046 set_probe_targets(prior.probe);
1047 return prior;
1048 }
1049
1050 void PG::clear_primary_state()
1051 {
1052 dout(10) << "clear_primary_state" << dendl;
1053
1054 // clear peering state
1055 stray_set.clear();
1056 peer_log_requested.clear();
1057 peer_missing_requested.clear();
1058 peer_info.clear();
1059 peer_missing.clear();
1060 need_up_thru = false;
1061 peer_last_complete_ondisk.clear();
1062 peer_activated.clear();
1063 min_last_complete_ondisk = eversion_t();
1064 pg_trim_to = eversion_t();
1065 might_have_unfound.clear();
1066 projected_log = PGLog::IndexedLog();
1067
1068 last_update_ondisk = eversion_t();
1069
1070 snap_trimq.clear();
1071
1072 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1073
1074 missing_loc.clear();
1075
1076 release_pg_backoffs();
1077
1078 pg_log.reset_recovery_pointers();
1079
1080 scrubber.reserved_peers.clear();
1081 scrub_after_recovery = false;
1082
1083 agent_clear();
1084 }
1085
1086 PG::Scrubber::Scrubber()
1087 : reserved(false), reserve_failed(false),
1088 epoch_start(0),
1089 active(false),
1090 shallow_errors(0), deep_errors(0), fixed(0),
1091 must_scrub(false), must_deep_scrub(false), must_repair(false),
1092 auto_repair(false),
1093 num_digest_updates_pending(0),
1094 state(INACTIVE),
1095 deep(false)
1096 {}
1097
1098 PG::Scrubber::~Scrubber() {}
1099
1100 /**
1101 * find_best_info
1102 *
1103 * Returns an iterator to the best info in infos sorted by:
1104 * 1) Prefer newer last_update
1105 * 2) Prefer longer tail if it brings another info into contiguity
1106 * 3) Prefer current primary
1107 */
1108 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1109 const map<pg_shard_t, pg_info_t> &infos,
1110 bool restrict_to_up_acting,
1111 bool *history_les_bound) const
1112 {
1113 assert(history_les_bound);
1114 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1115 * to make changes to this process. Also, make sure to update it
1116 * when you find bugs! */
1117 eversion_t min_last_update_acceptable = eversion_t::max();
1118 epoch_t max_last_epoch_started_found = 0;
1119 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1120 i != infos.end();
1121 ++i) {
1122 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1123 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1124 *history_les_bound = true;
1125 max_last_epoch_started_found = i->second.history.last_epoch_started;
1126 }
1127 if (!i->second.is_incomplete() &&
1128 max_last_epoch_started_found < i->second.last_epoch_started) {
1129 max_last_epoch_started_found = i->second.last_epoch_started;
1130 }
1131 }
1132 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1133 i != infos.end();
1134 ++i) {
1135 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1136 if (min_last_update_acceptable > i->second.last_update)
1137 min_last_update_acceptable = i->second.last_update;
1138 }
1139 }
1140 if (min_last_update_acceptable == eversion_t::max())
1141 return infos.end();
1142
1143 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1144 // find osd with newest last_update (oldest for ec_pool).
1145 // if there are multiples, prefer
1146 // - a longer tail, if it brings another peer into log contiguity
1147 // - the current primary
1148 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1149 p != infos.end();
1150 ++p) {
1151 if (restrict_to_up_acting && !is_up(p->first) &&
1152 !is_acting(p->first))
1153 continue;
1154 // Only consider peers with last_update >= min_last_update_acceptable
1155 if (p->second.last_update < min_last_update_acceptable)
1156 continue;
1157 // Disqualify anyone with a too old last_epoch_started
1158 if (p->second.last_epoch_started < max_last_epoch_started_found)
1159 continue;
1160 // Disqualify anyone who is incomplete (not fully backfilled)
1161 if (p->second.is_incomplete())
1162 continue;
1163 if (best == infos.end()) {
1164 best = p;
1165 continue;
1166 }
1167 // Prefer newer last_update
1168 if (pool.info.require_rollback()) {
1169 if (p->second.last_update > best->second.last_update)
1170 continue;
1171 if (p->second.last_update < best->second.last_update) {
1172 best = p;
1173 continue;
1174 }
1175 } else {
1176 if (p->second.last_update < best->second.last_update)
1177 continue;
1178 if (p->second.last_update > best->second.last_update) {
1179 best = p;
1180 continue;
1181 }
1182 }
1183
1184 // Prefer longer tail
1185 if (p->second.log_tail > best->second.log_tail) {
1186 continue;
1187 } else if (p->second.log_tail < best->second.log_tail) {
1188 best = p;
1189 continue;
1190 }
1191
1192 // prefer current primary (usually the caller), all things being equal
1193 if (p->first == pg_whoami) {
1194 dout(10) << "calc_acting prefer osd." << p->first
1195 << " because it is current primary" << dendl;
1196 best = p;
1197 continue;
1198 }
1199 }
1200 return best;
1201 }
1202
1203 void PG::calc_ec_acting(
1204 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1205 unsigned size,
1206 const vector<int> &acting,
1207 pg_shard_t acting_primary,
1208 const vector<int> &up,
1209 pg_shard_t up_primary,
1210 const map<pg_shard_t, pg_info_t> &all_info,
1211 bool restrict_to_up_acting,
1212 vector<int> *_want,
1213 set<pg_shard_t> *backfill,
1214 set<pg_shard_t> *acting_backfill,
1215 pg_shard_t *want_primary,
1216 ostream &ss)
1217 {
1218 vector<int> want(size, CRUSH_ITEM_NONE);
1219 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1220 unsigned usable = 0;
1221 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1222 i != all_info.end();
1223 ++i) {
1224 all_info_by_shard[i->first.shard].insert(i->first);
1225 }
1226 for (uint8_t i = 0; i < want.size(); ++i) {
1227 ss << "For position " << (unsigned)i << ": ";
1228 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1229 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1230 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1231 auth_log_shard->second.log_tail) {
1232 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1233 want[i] = up[i];
1234 ++usable;
1235 continue;
1236 }
1237 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1238 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1239 << " and ";
1240 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1241 }
1242
1243 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1244 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1245 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1246 auth_log_shard->second.log_tail) {
1247 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1248 want[i] = acting[i];
1249 ++usable;
1250 } else if (!restrict_to_up_acting) {
1251 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1252 j != all_info_by_shard[shard_id_t(i)].end();
1253 ++j) {
1254 assert(j->shard == i);
1255 if (!all_info.find(*j)->second.is_incomplete() &&
1256 all_info.find(*j)->second.last_update >=
1257 auth_log_shard->second.log_tail) {
1258 ss << " selecting stray: " << *j << std::endl;
1259 want[i] = j->osd;
1260 ++usable;
1261 break;
1262 }
1263 }
1264 if (want[i] == CRUSH_ITEM_NONE)
1265 ss << " failed to fill position " << (int)i << std::endl;
1266 }
1267 }
1268
1269 bool found_primary = false;
1270 for (uint8_t i = 0; i < want.size(); ++i) {
1271 if (want[i] != CRUSH_ITEM_NONE) {
1272 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1273 if (!found_primary) {
1274 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1275 found_primary = true;
1276 }
1277 }
1278 }
1279 acting_backfill->insert(backfill->begin(), backfill->end());
1280 _want->swap(want);
1281 }
1282
1283 /**
1284 * calculate the desired acting set.
1285 *
1286 * Choose an appropriate acting set. Prefer up[0], unless it is
1287 * incomplete, or another osd has a longer tail that allows us to
1288 * bring other up nodes up to date.
1289 */
1290 void PG::calc_replicated_acting(
1291 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1292 unsigned size,
1293 const vector<int> &acting,
1294 pg_shard_t acting_primary,
1295 const vector<int> &up,
1296 pg_shard_t up_primary,
1297 const map<pg_shard_t, pg_info_t> &all_info,
1298 bool restrict_to_up_acting,
1299 vector<int> *want,
1300 set<pg_shard_t> *backfill,
1301 set<pg_shard_t> *acting_backfill,
1302 pg_shard_t *want_primary,
1303 ostream &ss)
1304 {
1305 ss << "calc_acting newest update on osd." << auth_log_shard->first
1306 << " with " << auth_log_shard->second
1307 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1308 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1309
1310 // select primary
1311 map<pg_shard_t,pg_info_t>::const_iterator primary;
1312 if (up.size() &&
1313 !all_info.find(up_primary)->second.is_incomplete() &&
1314 all_info.find(up_primary)->second.last_update >=
1315 auth_log_shard->second.log_tail) {
1316 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1317 primary = all_info.find(up_primary); // prefer up[0], all thing being equal
1318 } else {
1319 assert(!auth_log_shard->second.is_incomplete());
1320 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1321 << " selected as primary instead" << std::endl;
1322 primary = auth_log_shard;
1323 }
1324
1325 ss << "calc_acting primary is osd." << primary->first
1326 << " with " << primary->second << std::endl;
1327 *want_primary = primary->first;
1328 want->push_back(primary->first.osd);
1329 acting_backfill->insert(primary->first);
1330 unsigned usable = 1;
1331
1332 // select replicas that have log contiguity with primary.
1333 // prefer up, then acting, then any peer_info osds
1334 for (vector<int>::const_iterator i = up.begin();
1335 i != up.end();
1336 ++i) {
1337 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1338 if (up_cand == primary->first)
1339 continue;
1340 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1341 if (cur_info.is_incomplete() ||
1342 cur_info.last_update < MIN(
1343 primary->second.log_tail,
1344 auth_log_shard->second.log_tail)) {
1345 /* We include auth_log_shard->second.log_tail because in GetLog,
1346 * we will request logs back to the min last_update over our
1347 * acting_backfill set, which will result in our log being extended
1348 * as far backwards as necessary to pick up any peers which can
1349 * be log recovered by auth_log_shard's log */
1350 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1351 backfill->insert(up_cand);
1352 acting_backfill->insert(up_cand);
1353 } else {
1354 want->push_back(*i);
1355 acting_backfill->insert(up_cand);
1356 usable++;
1357 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1358 }
1359 if (want->size() >= size) {
1360 break;
1361 }
1362 }
1363
1364 // This no longer has backfill OSDs, but they are covered above.
1365 for (vector<int>::const_iterator i = acting.begin();
1366 i != acting.end();
1367 ++i) {
1368 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1369 if (usable >= size)
1370 break;
1371
1372 // skip up osds we already considered above
1373 if (acting_cand == primary->first)
1374 continue;
1375 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1376 if (up_it != up.end())
1377 continue;
1378
1379 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1380 if (cur_info.is_incomplete() ||
1381 cur_info.last_update < primary->second.log_tail) {
1382 ss << " shard " << acting_cand << " (stray) REJECTED "
1383 << cur_info << std::endl;
1384 } else {
1385 want->push_back(*i);
1386 acting_backfill->insert(acting_cand);
1387 ss << " shard " << acting_cand << " (stray) accepted "
1388 << cur_info << std::endl;
1389 usable++;
1390 }
1391 }
1392
1393 if (restrict_to_up_acting) {
1394 return;
1395 }
1396 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1397 i != all_info.end();
1398 ++i) {
1399 if (usable >= size)
1400 break;
1401
1402 // skip up osds we already considered above
1403 if (i->first == primary->first)
1404 continue;
1405 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1406 if (up_it != up.end())
1407 continue;
1408 vector<int>::const_iterator acting_it = find(
1409 acting.begin(), acting.end(), i->first.osd);
1410 if (acting_it != acting.end())
1411 continue;
1412
1413 if (i->second.is_incomplete() ||
1414 i->second.last_update < primary->second.log_tail) {
1415 ss << " shard " << i->first << " (stray) REJECTED "
1416 << i->second << std::endl;
1417 } else {
1418 want->push_back(i->first.osd);
1419 acting_backfill->insert(i->first);
1420 ss << " shard " << i->first << " (stray) accepted "
1421 << i->second << std::endl;
1422 usable++;
1423 }
1424 }
1425 }
1426
1427 /**
1428 * choose acting
1429 *
1430 * calculate the desired acting, and request a change with the monitor
1431 * if it differs from the current acting.
1432 *
1433 * if restrict_to_up_acting=true, we filter out anything that's not in
1434 * up/acting. in order to lift this restriction, we need to
1435 * 1) check whether it's worth switching the acting set any time we get
1436 * a new pg info (not just here, when recovery finishes)
1437 * 2) check whether anything in want_acting went down on each new map
1438 * (and, if so, calculate a new want_acting)
1439 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1440 * TODO!
1441 */
1442 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1443 bool restrict_to_up_acting,
1444 bool *history_les_bound)
1445 {
1446 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1447 all_info[pg_whoami] = info;
1448
1449 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1450 p != all_info.end();
1451 ++p) {
1452 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1453 }
1454
1455 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1456 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1457
1458 if (auth_log_shard == all_info.end()) {
1459 if (up != acting) {
1460 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1461 << " reverting to up" << dendl;
1462 want_acting = up;
1463 vector<int> empty;
1464 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1465 } else {
1466 dout(10) << "choose_acting failed" << dendl;
1467 assert(want_acting.empty());
1468 }
1469 return false;
1470 }
1471
1472 assert(!auth_log_shard->second.is_incomplete());
1473 auth_log_shard_id = auth_log_shard->first;
1474
1475 set<pg_shard_t> want_backfill, want_acting_backfill;
1476 vector<int> want;
1477 pg_shard_t want_primary;
1478 stringstream ss;
1479 if (!pool.info.ec_pool())
1480 calc_replicated_acting(
1481 auth_log_shard,
1482 get_osdmap()->get_pg_size(info.pgid.pgid),
1483 acting,
1484 primary,
1485 up,
1486 up_primary,
1487 all_info,
1488 restrict_to_up_acting,
1489 &want,
1490 &want_backfill,
1491 &want_acting_backfill,
1492 &want_primary,
1493 ss);
1494 else
1495 calc_ec_acting(
1496 auth_log_shard,
1497 get_osdmap()->get_pg_size(info.pgid.pgid),
1498 acting,
1499 primary,
1500 up,
1501 up_primary,
1502 all_info,
1503 restrict_to_up_acting,
1504 &want,
1505 &want_backfill,
1506 &want_acting_backfill,
1507 &want_primary,
1508 ss);
1509 dout(10) << ss.str() << dendl;
1510
1511 unsigned num_want_acting = 0;
1512 set<pg_shard_t> have;
1513 for (int i = 0; i < (int)want.size(); ++i) {
1514 if (want[i] != CRUSH_ITEM_NONE) {
1515 ++num_want_acting;
1516 have.insert(
1517 pg_shard_t(
1518 want[i],
1519 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1520 }
1521 }
1522
1523 // We go incomplete if below min_size for ec_pools since backfill
1524 // does not currently maintain rollbackability
1525 // Otherwise, we will go "peered", but not "active"
1526 if (num_want_acting < pool.info.min_size &&
1527 (pool.info.ec_pool() ||
1528 !cct->_conf->osd_allow_recovery_below_min_size)) {
1529 want_acting.clear();
1530 dout(10) << "choose_acting failed, below min size" << dendl;
1531 return false;
1532 }
1533
1534 /* Check whether we have enough acting shards to later perform recovery */
1535 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1536 get_pgbackend()->get_is_recoverable_predicate());
1537 if (!(*recoverable_predicate)(have)) {
1538 want_acting.clear();
1539 dout(10) << "choose_acting failed, not recoverable" << dendl;
1540 return false;
1541 }
1542
1543 if (want != acting) {
1544 dout(10) << "choose_acting want " << want << " != acting " << acting
1545 << ", requesting pg_temp change" << dendl;
1546 want_acting = want;
1547
1548 if (want_acting == up) {
1549 // There can't be any pending backfill if
1550 // want is the same as crush map up OSDs.
1551 assert(want_backfill.empty());
1552 vector<int> empty;
1553 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1554 } else
1555 osd->queue_want_pg_temp(info.pgid.pgid, want);
1556 return false;
1557 }
1558 want_acting.clear();
1559 actingbackfill = want_acting_backfill;
1560 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1561 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1562 if (backfill_targets.empty()) {
1563 // Caller is GetInfo
1564 backfill_targets = want_backfill;
1565 }
1566 // Will not change if already set because up would have had to change
1567 // Verify that nothing in backfill is in stray_set
1568 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1569 i != want_backfill.end();
1570 ++i) {
1571 assert(stray_set.find(*i) == stray_set.end());
1572 }
1573 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1574 << want_backfill << dendl;
1575 return true;
1576 }
1577
1578 /* Build the might_have_unfound set.
1579 *
1580 * This is used by the primary OSD during recovery.
1581 *
1582 * This set tracks the OSDs which might have unfound objects that the primary
1583 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1584 * will remove the OSD from the set.
1585 */
1586 void PG::build_might_have_unfound()
1587 {
1588 assert(might_have_unfound.empty());
1589 assert(is_primary());
1590
1591 dout(10) << __func__ << dendl;
1592
1593 check_past_interval_bounds();
1594
1595 might_have_unfound = past_intervals.get_might_have_unfound(
1596 pg_whoami,
1597 pool.info.ec_pool());
1598
1599 // include any (stray) peers
1600 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1601 p != peer_info.end();
1602 ++p)
1603 might_have_unfound.insert(p->first);
1604
1605 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1606 }
1607
1608 struct C_PG_ActivateCommitted : public Context {
1609 PGRef pg;
1610 epoch_t epoch;
1611 epoch_t activation_epoch;
1612 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1613 : pg(p), epoch(e), activation_epoch(ae) {}
1614 void finish(int r) override {
1615 pg->_activate_committed(epoch, activation_epoch);
1616 }
1617 };
1618
1619 void PG::activate(ObjectStore::Transaction& t,
1620 epoch_t activation_epoch,
1621 list<Context*>& tfin,
1622 map<int, map<spg_t,pg_query_t> >& query_map,
1623 map<int,
1624 vector<
1625 pair<pg_notify_t,
1626 PastIntervals> > > *activator_map,
1627 RecoveryCtx *ctx)
1628 {
1629 assert(!is_peered());
1630 assert(scrubber.callbacks.empty());
1631 assert(callbacks_for_degraded_object.empty());
1632
1633 // twiddle pg state
1634 state_clear(PG_STATE_DOWN);
1635
1636 send_notify = false;
1637
1638 if (is_primary()) {
1639 // only update primary last_epoch_started if we will go active
1640 if (acting.size() >= pool.info.min_size) {
1641 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1642 info.last_epoch_started <= activation_epoch);
1643 info.last_epoch_started = activation_epoch;
1644 info.last_interval_started = info.history.same_interval_since;
1645 }
1646 } else if (is_acting(pg_whoami)) {
1647 /* update last_epoch_started on acting replica to whatever the primary sent
1648 * unless it's smaller (could happen if we are going peered rather than
1649 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1650 if (info.last_epoch_started < activation_epoch) {
1651 info.last_epoch_started = activation_epoch;
1652 info.last_interval_started = info.history.same_interval_since;
1653 }
1654 }
1655
1656 auto &missing = pg_log.get_missing();
1657
1658 if (is_primary()) {
1659 last_update_ondisk = info.last_update;
1660 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1661 }
1662 last_update_applied = info.last_update;
1663 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1664
1665 need_up_thru = false;
1666
1667 // write pg info, log
1668 dirty_info = true;
1669 dirty_big_info = true; // maybe
1670
1671 // find out when we commit
1672 t.register_on_complete(
1673 new C_PG_ActivateCommitted(
1674 this,
1675 get_osdmap()->get_epoch(),
1676 activation_epoch));
1677
1678 // initialize snap_trimq
1679 if (is_primary()) {
1680 dout(20) << "activate - purged_snaps " << info.purged_snaps
1681 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1682 snap_trimq = pool.cached_removed_snaps;
1683 interval_set<snapid_t> intersection;
1684 intersection.intersection_of(snap_trimq, info.purged_snaps);
1685 if (intersection == info.purged_snaps) {
1686 snap_trimq.subtract(info.purged_snaps);
1687 } else {
1688 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1689 << ") is not a subset of pool.cached_removed_snaps ("
1690 << pool.cached_removed_snaps << ")" << dendl;
1691 snap_trimq.subtract(intersection);
1692 }
1693 }
1694
1695 // init complete pointer
1696 if (missing.num_missing() == 0) {
1697 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1698 << " -> " << info.last_update << dendl;
1699 info.last_complete = info.last_update;
1700 pg_log.reset_recovery_pointers();
1701 } else {
1702 dout(10) << "activate - not complete, " << missing << dendl;
1703 pg_log.activate_not_complete(info);
1704 }
1705
1706 log_weirdness();
1707
1708 // if primary..
1709 if (is_primary()) {
1710 assert(ctx);
1711 // start up replicas
1712
1713 assert(!actingbackfill.empty());
1714 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1715 i != actingbackfill.end();
1716 ++i) {
1717 if (*i == pg_whoami) continue;
1718 pg_shard_t peer = *i;
1719 assert(peer_info.count(peer));
1720 pg_info_t& pi = peer_info[peer];
1721
1722 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1723
1724 MOSDPGLog *m = 0;
1725 assert(peer_missing.count(peer));
1726 pg_missing_t& pm = peer_missing[peer];
1727
1728 bool needs_past_intervals = pi.dne();
1729
1730 /*
1731 * cover case where peer sort order was different and
1732 * last_backfill cannot be interpreted
1733 */
1734 bool force_restart_backfill =
1735 !pi.last_backfill.is_max() &&
1736 !pi.last_backfill_bitwise;
1737
1738 if (pi.last_update == info.last_update && !force_restart_backfill) {
1739 // empty log
1740 if (!pi.last_backfill.is_max())
1741 osd->clog->info() << info.pgid << " continuing backfill to osd."
1742 << peer
1743 << " from (" << pi.log_tail << "," << pi.last_update
1744 << "] " << pi.last_backfill
1745 << " to " << info.last_update;
1746 if (!pi.is_empty() && activator_map) {
1747 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1748 (*activator_map)[peer.osd].push_back(
1749 make_pair(
1750 pg_notify_t(
1751 peer.shard, pg_whoami.shard,
1752 get_osdmap()->get_epoch(),
1753 get_osdmap()->get_epoch(),
1754 info),
1755 past_intervals));
1756 } else {
1757 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1758 m = new MOSDPGLog(
1759 i->shard, pg_whoami.shard,
1760 get_osdmap()->get_epoch(), info);
1761 }
1762 } else if (
1763 pg_log.get_tail() > pi.last_update ||
1764 pi.last_backfill == hobject_t() ||
1765 force_restart_backfill ||
1766 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1767 /* ^ This last case covers a situation where a replica is not contiguous
1768 * with the auth_log, but is contiguous with this replica. Reshuffling
1769 * the active set to handle this would be tricky, so instead we just go
1770 * ahead and backfill it anyway. This is probably preferrable in any
1771 * case since the replica in question would have to be significantly
1772 * behind.
1773 */
1774 // backfill
1775 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1776 << " from (" << pi.log_tail << "," << pi.last_update
1777 << "] " << pi.last_backfill
1778 << " to " << info.last_update;
1779
1780 pi.last_update = info.last_update;
1781 pi.last_complete = info.last_update;
1782 pi.set_last_backfill(hobject_t());
1783 pi.last_epoch_started = info.last_epoch_started;
1784 pi.last_interval_started = info.last_interval_started;
1785 pi.history = info.history;
1786 pi.hit_set = info.hit_set;
1787 pi.stats.stats.clear();
1788
1789 // initialize peer with our purged_snaps.
1790 pi.purged_snaps = info.purged_snaps;
1791
1792 m = new MOSDPGLog(
1793 i->shard, pg_whoami.shard,
1794 get_osdmap()->get_epoch(), pi);
1795
1796 // send some recent log, so that op dup detection works well.
1797 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1798 m->info.log_tail = m->log.tail;
1799 pi.log_tail = m->log.tail; // sigh...
1800
1801 pm.clear();
1802 } else {
1803 // catch up
1804 assert(pg_log.get_tail() <= pi.last_update);
1805 m = new MOSDPGLog(
1806 i->shard, pg_whoami.shard,
1807 get_osdmap()->get_epoch(), info);
1808 // send new stuff to append to replicas log
1809 m->log.copy_after(pg_log.get_log(), pi.last_update);
1810 }
1811
1812 // share past_intervals if we are creating the pg on the replica
1813 // based on whether our info for that peer was dne() *before*
1814 // updating pi.history in the backfill block above.
1815 if (m && needs_past_intervals)
1816 m->past_intervals = past_intervals;
1817
1818 // update local version of peer's missing list!
1819 if (m && pi.last_backfill != hobject_t()) {
1820 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1821 p != m->log.log.end();
1822 ++p) {
1823 if (p->soid <= pi.last_backfill &&
1824 !p->is_error()) {
1825 if (perform_deletes_during_peering() && p->is_delete()) {
1826 pm.rm(p->soid, p->version);
1827 } else {
1828 pm.add_next_event(*p);
1829 }
1830 }
1831 }
1832 }
1833
1834 if (m) {
1835 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1836 //m->log.print(cout);
1837 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1838 }
1839
1840 // peer now has
1841 pi.last_update = info.last_update;
1842
1843 // update our missing
1844 if (pm.num_missing() == 0) {
1845 pi.last_complete = pi.last_update;
1846 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1847 } else {
1848 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1849 }
1850 }
1851
1852 // Set up missing_loc
1853 set<pg_shard_t> complete_shards;
1854 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1855 i != actingbackfill.end();
1856 ++i) {
1857 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1858 if (*i == get_primary()) {
1859 missing_loc.add_active_missing(missing);
1860 if (!missing.have_missing())
1861 complete_shards.insert(*i);
1862 } else {
1863 auto peer_missing_entry = peer_missing.find(*i);
1864 assert(peer_missing_entry != peer_missing.end());
1865 missing_loc.add_active_missing(peer_missing_entry->second);
1866 if (!peer_missing_entry->second.have_missing() &&
1867 peer_info[*i].last_backfill.is_max())
1868 complete_shards.insert(*i);
1869 }
1870 }
1871
1872 // If necessary, create might_have_unfound to help us find our unfound objects.
1873 // NOTE: It's important that we build might_have_unfound before trimming the
1874 // past intervals.
1875 might_have_unfound.clear();
1876 if (needs_recovery()) {
1877 // If only one shard has missing, we do a trick to add all others as recovery
1878 // source, this is considered safe since the PGLogs have been merged locally,
1879 // and covers vast majority of the use cases, like one OSD/host is down for
1880 // a while for hardware repairing
1881 if (complete_shards.size() + 1 == actingbackfill.size()) {
1882 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1883 } else {
1884 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1885 ctx->handle);
1886 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1887 i != actingbackfill.end();
1888 ++i) {
1889 if (*i == pg_whoami) continue;
1890 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1891 assert(peer_missing.count(*i));
1892 assert(peer_info.count(*i));
1893 missing_loc.add_source_info(
1894 *i,
1895 peer_info[*i],
1896 peer_missing[*i],
1897 ctx->handle);
1898 }
1899 }
1900 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1901 i != peer_missing.end();
1902 ++i) {
1903 if (is_actingbackfill(i->first))
1904 continue;
1905 assert(peer_info.count(i->first));
1906 search_for_missing(
1907 peer_info[i->first],
1908 i->second,
1909 i->first,
1910 ctx);
1911 }
1912
1913 build_might_have_unfound();
1914
1915 // Always call now so _update_calc_stats() will be accurate
1916 discover_all_missing(query_map);
1917 }
1918
1919 // num_objects_degraded if calculated should reflect this too, unless no
1920 // missing and we are about to go clean.
1921 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1922 state_set(PG_STATE_UNDERSIZED);
1923 }
1924
1925 state_set(PG_STATE_ACTIVATING);
1926 release_pg_backoffs();
1927 projected_last_update = info.last_update;
1928 }
1929 if (acting.size() >= pool.info.min_size) {
1930 PGLogEntryHandler handler{this, &t};
1931 pg_log.roll_forward(&handler);
1932 }
1933 }
1934
1935 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1936 {
1937 // only check MOSDOp
1938 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1939 return true;
1940
1941 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1942
1943 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1944 if (!session) {
1945 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1946 return false;
1947 }
1948 OSDCap& caps = session->caps;
1949 session->put();
1950
1951 const string &key = req->get_hobj().get_key().empty() ?
1952 req->get_oid().name :
1953 req->get_hobj().get_key();
1954
1955 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1956 pool.auid, key,
1957 op->need_read_cap(),
1958 op->need_write_cap(),
1959 op->classes());
1960
1961 dout(20) << "op_has_sufficient_caps "
1962 << "session=" << session
1963 << " pool=" << pool.id << " (" << pool.name
1964 << " " << req->get_hobj().nspace
1965 << ") owner=" << pool.auid
1966 << " need_read_cap=" << op->need_read_cap()
1967 << " need_write_cap=" << op->need_write_cap()
1968 << " classes=" << op->classes()
1969 << " -> " << (cap ? "yes" : "NO")
1970 << dendl;
1971 return cap;
1972 }
1973
1974 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1975 {
1976 lock();
1977 if (pg_has_reset_since(epoch)) {
1978 dout(10) << "_activate_committed " << epoch
1979 << ", that was an old interval" << dendl;
1980 } else if (is_primary()) {
1981 peer_activated.insert(pg_whoami);
1982 dout(10) << "_activate_committed " << epoch
1983 << " peer_activated now " << peer_activated
1984 << " last_interval_started " << info.history.last_interval_started
1985 << " last_epoch_started " << info.history.last_epoch_started
1986 << " same_interval_since " << info.history.same_interval_since << dendl;
1987 assert(!actingbackfill.empty());
1988 if (peer_activated.size() == actingbackfill.size())
1989 all_activated_and_committed();
1990 } else {
1991 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1992 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1993 pg_notify_t i = pg_notify_t(
1994 get_primary().shard, pg_whoami.shard,
1995 get_osdmap()->get_epoch(),
1996 get_osdmap()->get_epoch(),
1997 info);
1998
1999 i.info.history.last_epoch_started = activation_epoch;
2000 i.info.history.last_interval_started = i.info.history.same_interval_since;
2001 if (acting.size() >= pool.info.min_size) {
2002 state_set(PG_STATE_ACTIVE);
2003 } else {
2004 state_set(PG_STATE_PEERED);
2005 }
2006
2007 m->pg_list.push_back(make_pair(i, PastIntervals()));
2008 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
2009
2010 // waiters
2011 if (flushes_in_progress == 0) {
2012 requeue_ops(waiting_for_peered);
2013 } else if (!waiting_for_peered.empty()) {
2014 dout(10) << __func__ << " flushes in progress, moving "
2015 << waiting_for_peered.size() << " items to waiting_for_flush"
2016 << dendl;
2017 assert(waiting_for_flush.empty());
2018 waiting_for_flush.swap(waiting_for_peered);
2019 }
2020 }
2021
2022 assert(!dirty_info);
2023
2024 unlock();
2025 }
2026
2027 /*
2028 * update info.history.last_epoch_started ONLY after we and all
2029 * replicas have activated AND committed the activate transaction
2030 * (i.e. the peering results are stable on disk).
2031 */
2032 void PG::all_activated_and_committed()
2033 {
2034 dout(10) << "all_activated_and_committed" << dendl;
2035 assert(is_primary());
2036 assert(peer_activated.size() == actingbackfill.size());
2037 assert(!actingbackfill.empty());
2038 assert(blocked_by.empty());
2039
2040 // Degraded?
2041 _update_calc_stats();
2042 if (info.stats.stats.sum.num_objects_degraded) {
2043 state_set(PG_STATE_DEGRADED);
2044 } else {
2045 state_clear(PG_STATE_DEGRADED);
2046 }
2047
2048 queue_peering_event(
2049 CephPeeringEvtRef(
2050 std::make_shared<CephPeeringEvt>(
2051 get_osdmap()->get_epoch(),
2052 get_osdmap()->get_epoch(),
2053 AllReplicasActivated())));
2054 }
2055
2056 bool PG::requeue_scrub(bool high_priority)
2057 {
2058 assert(is_locked());
2059 if (scrub_queued) {
2060 dout(10) << __func__ << ": already queued" << dendl;
2061 return false;
2062 } else {
2063 dout(10) << __func__ << ": queueing" << dendl;
2064 scrub_queued = true;
2065 osd->queue_for_scrub(this, high_priority);
2066 return true;
2067 }
2068 }
2069
2070 void PG::queue_recovery()
2071 {
2072 if (!is_primary() || !is_peered()) {
2073 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2074 assert(!recovery_queued);
2075 } else if (recovery_queued) {
2076 dout(10) << "queue_recovery -- already queued" << dendl;
2077 } else {
2078 dout(10) << "queue_recovery -- queuing" << dendl;
2079 recovery_queued = true;
2080 osd->queue_for_recovery(this);
2081 }
2082 }
2083
2084 bool PG::queue_scrub()
2085 {
2086 assert(is_locked());
2087 if (is_scrubbing()) {
2088 return false;
2089 }
2090 scrubber.priority = scrubber.must_scrub ?
2091 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2092 scrubber.must_scrub = false;
2093 state_set(PG_STATE_SCRUBBING);
2094 if (scrubber.must_deep_scrub) {
2095 state_set(PG_STATE_DEEP_SCRUB);
2096 scrubber.must_deep_scrub = false;
2097 }
2098 if (scrubber.must_repair || scrubber.auto_repair) {
2099 state_set(PG_STATE_REPAIR);
2100 scrubber.must_repair = false;
2101 }
2102 requeue_scrub();
2103 return true;
2104 }
2105
2106 unsigned PG::get_scrub_priority()
2107 {
2108 // a higher value -> a higher priority
2109 int pool_scrub_priority = 0;
2110 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2111 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2112 }
2113
2114 struct C_PG_FinishRecovery : public Context {
2115 PGRef pg;
2116 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2117 void finish(int r) override {
2118 pg->_finish_recovery(this);
2119 }
2120 };
2121
2122 void PG::mark_clean()
2123 {
2124 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2125 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2126 state_set(PG_STATE_CLEAN);
2127 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2128 info.history.last_interval_clean = info.history.same_interval_since;
2129 past_intervals.clear();
2130 dirty_big_info = true;
2131 dirty_info = true;
2132 }
2133
2134 kick_snap_trim();
2135 }
2136
2137 void PG::_change_recovery_force_mode(int new_mode, bool clear)
2138 {
2139 if (!deleting) {
2140 // we can't and shouldn't do anything if the PG is being deleted locally
2141 if (clear) {
2142 state_clear(new_mode);
2143 } else {
2144 state_set(new_mode);
2145 }
2146 publish_stats_to_osd();
2147 }
2148 }
2149
2150 inline int PG::clamp_recovery_priority(int priority)
2151 {
2152 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2153 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2154
2155 // Clamp to valid range
2156 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2157 return OSD_RECOVERY_PRIORITY_MAX;
2158 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2159 return OSD_RECOVERY_PRIORITY_MIN;
2160 } else {
2161 return priority;
2162 }
2163 }
2164
2165 unsigned PG::get_recovery_priority()
2166 {
2167 // a higher value -> a higher priority
2168 int ret = 0;
2169
2170 if (state & PG_STATE_FORCED_RECOVERY) {
2171 ret = OSD_RECOVERY_PRIORITY_FORCED;
2172 } else {
2173 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2174 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2175 }
2176 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2177 return static_cast<unsigned>(ret);
2178 }
2179
2180 unsigned PG::get_backfill_priority()
2181 {
2182 // a higher value -> a higher priority
2183 int ret = OSD_BACKFILL_PRIORITY_BASE;
2184 if (state & PG_STATE_FORCED_BACKFILL) {
2185 ret = OSD_RECOVERY_PRIORITY_FORCED;
2186 } else {
2187 if (acting.size() < pool.info.min_size) {
2188 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2189 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2190
2191 } else if (is_undersized()) {
2192 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2193 assert(pool.info.size > actingset.size());
2194 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2195
2196 } else if (is_degraded()) {
2197 // degraded: baseline degraded
2198 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2199 }
2200
2201 // Adjust with pool's recovery priority
2202 int pool_recovery_priority = 0;
2203 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2204
2205 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2206 }
2207
2208 return static_cast<unsigned>(ret);
2209 }
2210
2211 void PG::finish_recovery(list<Context*>& tfin)
2212 {
2213 dout(10) << "finish_recovery" << dendl;
2214 assert(info.last_complete == info.last_update);
2215
2216 clear_recovery_state();
2217
2218 /*
2219 * sync all this before purging strays. but don't block!
2220 */
2221 finish_sync_event = new C_PG_FinishRecovery(this);
2222 tfin.push_back(finish_sync_event);
2223 }
2224
2225 void PG::_finish_recovery(Context *c)
2226 {
2227 lock();
2228 if (deleting) {
2229 unlock();
2230 return;
2231 }
2232 if (c == finish_sync_event) {
2233 dout(10) << "_finish_recovery" << dendl;
2234 finish_sync_event = 0;
2235 purge_strays();
2236
2237 publish_stats_to_osd();
2238
2239 if (scrub_after_recovery) {
2240 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2241 scrub_after_recovery = false;
2242 scrubber.must_deep_scrub = true;
2243 queue_scrub();
2244 }
2245 } else {
2246 dout(10) << "_finish_recovery -- stale" << dendl;
2247 }
2248 unlock();
2249 }
2250
2251 void PG::start_recovery_op(const hobject_t& soid)
2252 {
2253 dout(10) << "start_recovery_op " << soid
2254 #ifdef DEBUG_RECOVERY_OIDS
2255 << " (" << recovering_oids << ")"
2256 #endif
2257 << dendl;
2258 assert(recovery_ops_active >= 0);
2259 recovery_ops_active++;
2260 #ifdef DEBUG_RECOVERY_OIDS
2261 assert(recovering_oids.count(soid) == 0);
2262 recovering_oids.insert(soid);
2263 #endif
2264 osd->start_recovery_op(this, soid);
2265 }
2266
2267 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2268 {
2269 dout(10) << "finish_recovery_op " << soid
2270 #ifdef DEBUG_RECOVERY_OIDS
2271 << " (" << recovering_oids << ")"
2272 #endif
2273 << dendl;
2274 assert(recovery_ops_active > 0);
2275 recovery_ops_active--;
2276 #ifdef DEBUG_RECOVERY_OIDS
2277 assert(recovering_oids.count(soid));
2278 recovering_oids.erase(soid);
2279 #endif
2280 osd->finish_recovery_op(this, soid, dequeue);
2281
2282 if (!dequeue) {
2283 queue_recovery();
2284 }
2285 }
2286
2287 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2288 {
2289 child->update_snap_mapper_bits(split_bits);
2290 child->update_osdmap_ref(get_osdmap());
2291
2292 child->pool = pool;
2293
2294 // Log
2295 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2296 child->info.last_complete = info.last_complete;
2297
2298 info.last_update = pg_log.get_head();
2299 child->info.last_update = child->pg_log.get_head();
2300
2301 child->info.last_user_version = info.last_user_version;
2302
2303 info.log_tail = pg_log.get_tail();
2304 child->info.log_tail = child->pg_log.get_tail();
2305
2306 if (info.last_complete < pg_log.get_tail())
2307 info.last_complete = pg_log.get_tail();
2308 if (child->info.last_complete < child->pg_log.get_tail())
2309 child->info.last_complete = child->pg_log.get_tail();
2310
2311 // Info
2312 child->info.history = info.history;
2313 child->info.history.epoch_created = get_osdmap()->get_epoch();
2314 child->info.purged_snaps = info.purged_snaps;
2315
2316 if (info.last_backfill.is_max()) {
2317 child->info.set_last_backfill(hobject_t::get_max());
2318 } else {
2319 // restart backfill on parent and child to be safe. we could
2320 // probably do better in the bitwise sort case, but it's more
2321 // fragile (there may be special work to do on backfill completion
2322 // in the future).
2323 info.set_last_backfill(hobject_t());
2324 child->info.set_last_backfill(hobject_t());
2325 // restarting backfill implies that the missing set is empty,
2326 // since it is only used for objects prior to last_backfill
2327 pg_log.reset_backfill();
2328 child->pg_log.reset_backfill();
2329 }
2330
2331 child->info.stats = info.stats;
2332 child->info.stats.parent_split_bits = split_bits;
2333 info.stats.stats_invalid = true;
2334 child->info.stats.stats_invalid = true;
2335 child->info.last_epoch_started = info.last_epoch_started;
2336 child->info.last_interval_started = info.last_interval_started;
2337
2338 child->snap_trimq = snap_trimq;
2339
2340 // There can't be recovery/backfill going on now
2341 int primary, up_primary;
2342 vector<int> newup, newacting;
2343 get_osdmap()->pg_to_up_acting_osds(
2344 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2345 child->init_primary_up_acting(
2346 newup,
2347 newacting,
2348 up_primary,
2349 primary);
2350 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2351
2352 // this comparison includes primary rank via pg_shard_t
2353 if (get_primary() != child->get_primary())
2354 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2355
2356 child->info.stats.up = up;
2357 child->info.stats.up_primary = up_primary;
2358 child->info.stats.acting = acting;
2359 child->info.stats.acting_primary = primary;
2360 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2361
2362 // History
2363 child->past_intervals = past_intervals;
2364
2365 _split_into(child_pgid, child, split_bits);
2366
2367 // release all backoffs for simplicity
2368 release_backoffs(hobject_t(), hobject_t::get_max());
2369
2370 child->on_new_interval();
2371
2372 child->dirty_info = true;
2373 child->dirty_big_info = true;
2374 dirty_info = true;
2375 dirty_big_info = true;
2376 }
2377
2378 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2379 {
2380 ConnectionRef con = s->con;
2381 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2382 return;
2383 BackoffRef b(s->have_backoff(info.pgid, begin));
2384 if (b) {
2385 derr << __func__ << " already have backoff for " << s << " begin " << begin
2386 << " " << *b << dendl;
2387 ceph_abort();
2388 }
2389 Mutex::Locker l(backoff_lock);
2390 {
2391 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2392 backoffs[begin].insert(b);
2393 s->add_backoff(b);
2394 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2395 }
2396 con->send_message(
2397 new MOSDBackoff(
2398 info.pgid,
2399 get_osdmap()->get_epoch(),
2400 CEPH_OSD_BACKOFF_OP_BLOCK,
2401 b->id,
2402 begin,
2403 end));
2404 }
2405
2406 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2407 {
2408 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2409 vector<BackoffRef> bv;
2410 {
2411 Mutex::Locker l(backoff_lock);
2412 auto p = backoffs.lower_bound(begin);
2413 while (p != backoffs.end()) {
2414 int r = cmp(p->first, end);
2415 dout(20) << __func__ << " ? " << r << " " << p->first
2416 << " " << p->second << dendl;
2417 // note: must still examine begin=end=p->first case
2418 if (r > 0 || (r == 0 && begin < end)) {
2419 break;
2420 }
2421 dout(20) << __func__ << " checking " << p->first
2422 << " " << p->second << dendl;
2423 auto q = p->second.begin();
2424 while (q != p->second.end()) {
2425 dout(20) << __func__ << " checking " << *q << dendl;
2426 int r = cmp((*q)->begin, begin);
2427 if (r == 0 || (r > 0 && (*q)->end < end)) {
2428 bv.push_back(*q);
2429 q = p->second.erase(q);
2430 } else {
2431 ++q;
2432 }
2433 }
2434 if (p->second.empty()) {
2435 p = backoffs.erase(p);
2436 } else {
2437 ++p;
2438 }
2439 }
2440 }
2441 for (auto b : bv) {
2442 Mutex::Locker l(b->lock);
2443 dout(10) << __func__ << " " << *b << dendl;
2444 if (b->session) {
2445 assert(b->pg == this);
2446 ConnectionRef con = b->session->con;
2447 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2448 con->send_message(
2449 new MOSDBackoff(
2450 info.pgid,
2451 get_osdmap()->get_epoch(),
2452 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2453 b->id,
2454 b->begin,
2455 b->end));
2456 }
2457 if (b->is_new()) {
2458 b->state = Backoff::STATE_DELETING;
2459 } else {
2460 b->session->rm_backoff(b);
2461 b->session.reset();
2462 }
2463 b->pg.reset();
2464 }
2465 }
2466 }
2467
2468 void PG::clear_backoffs()
2469 {
2470 dout(10) << __func__ << " " << dendl;
2471 map<hobject_t,set<BackoffRef>> ls;
2472 {
2473 Mutex::Locker l(backoff_lock);
2474 ls.swap(backoffs);
2475 }
2476 for (auto& p : ls) {
2477 for (auto& b : p.second) {
2478 Mutex::Locker l(b->lock);
2479 dout(10) << __func__ << " " << *b << dendl;
2480 if (b->session) {
2481 assert(b->pg == this);
2482 if (b->is_new()) {
2483 b->state = Backoff::STATE_DELETING;
2484 } else {
2485 b->session->rm_backoff(b);
2486 b->session.reset();
2487 }
2488 b->pg.reset();
2489 }
2490 }
2491 }
2492 }
2493
2494 // called by Session::clear_backoffs()
2495 void PG::rm_backoff(BackoffRef b)
2496 {
2497 dout(10) << __func__ << " " << *b << dendl;
2498 Mutex::Locker l(backoff_lock);
2499 assert(b->lock.is_locked_by_me());
2500 assert(b->pg == this);
2501 auto p = backoffs.find(b->begin);
2502 // may race with release_backoffs()
2503 if (p != backoffs.end()) {
2504 auto q = p->second.find(b);
2505 if (q != p->second.end()) {
2506 p->second.erase(q);
2507 if (p->second.empty()) {
2508 backoffs.erase(p);
2509 }
2510 }
2511 }
2512 }
2513
2514 void PG::clear_recovery_state()
2515 {
2516 dout(10) << "clear_recovery_state" << dendl;
2517
2518 pg_log.reset_recovery_pointers();
2519 finish_sync_event = 0;
2520
2521 hobject_t soid;
2522 while (recovery_ops_active > 0) {
2523 #ifdef DEBUG_RECOVERY_OIDS
2524 soid = *recovering_oids.begin();
2525 #endif
2526 finish_recovery_op(soid, true);
2527 }
2528
2529 backfill_targets.clear();
2530 backfill_info.clear();
2531 peer_backfill_info.clear();
2532 waiting_on_backfill.clear();
2533 _clear_recovery_state(); // pg impl specific hook
2534 }
2535
2536 void PG::cancel_recovery()
2537 {
2538 dout(10) << "cancel_recovery" << dendl;
2539 clear_recovery_state();
2540 }
2541
2542
2543 void PG::purge_strays()
2544 {
2545 dout(10) << "purge_strays " << stray_set << dendl;
2546
2547 bool removed = false;
2548 for (set<pg_shard_t>::iterator p = stray_set.begin();
2549 p != stray_set.end();
2550 ++p) {
2551 assert(!is_actingbackfill(*p));
2552 if (get_osdmap()->is_up(p->osd)) {
2553 dout(10) << "sending PGRemove to osd." << *p << dendl;
2554 vector<spg_t> to_remove;
2555 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2556 MOSDPGRemove *m = new MOSDPGRemove(
2557 get_osdmap()->get_epoch(),
2558 to_remove);
2559 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2560 } else {
2561 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2562 }
2563 peer_missing.erase(*p);
2564 peer_info.erase(*p);
2565 peer_purged.insert(*p);
2566 removed = true;
2567 }
2568
2569 // if we removed anyone, update peers (which include peer_info)
2570 if (removed)
2571 update_heartbeat_peers();
2572
2573 stray_set.clear();
2574
2575 // clear _requested maps; we may have to peer() again if we discover
2576 // (more) stray content
2577 peer_log_requested.clear();
2578 peer_missing_requested.clear();
2579 }
2580
2581 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2582 {
2583 Mutex::Locker l(heartbeat_peer_lock);
2584 probe_targets.clear();
2585 for (set<pg_shard_t>::iterator i = probe_set.begin();
2586 i != probe_set.end();
2587 ++i) {
2588 probe_targets.insert(i->osd);
2589 }
2590 }
2591
2592 void PG::clear_probe_targets()
2593 {
2594 Mutex::Locker l(heartbeat_peer_lock);
2595 probe_targets.clear();
2596 }
2597
2598 void PG::update_heartbeat_peers()
2599 {
2600 assert(is_locked());
2601
2602 if (!is_primary())
2603 return;
2604
2605 set<int> new_peers;
2606 for (unsigned i=0; i<acting.size(); i++) {
2607 if (acting[i] != CRUSH_ITEM_NONE)
2608 new_peers.insert(acting[i]);
2609 }
2610 for (unsigned i=0; i<up.size(); i++) {
2611 if (up[i] != CRUSH_ITEM_NONE)
2612 new_peers.insert(up[i]);
2613 }
2614 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2615 p != peer_info.end();
2616 ++p)
2617 new_peers.insert(p->first.osd);
2618
2619 bool need_update = false;
2620 heartbeat_peer_lock.Lock();
2621 if (new_peers == heartbeat_peers) {
2622 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2623 } else {
2624 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2625 heartbeat_peers.swap(new_peers);
2626 need_update = true;
2627 }
2628 heartbeat_peer_lock.Unlock();
2629
2630 if (need_update)
2631 osd->need_heartbeat_peer_update();
2632 }
2633
2634
2635 bool PG::check_in_progress_op(
2636 const osd_reqid_t &r,
2637 eversion_t *version,
2638 version_t *user_version,
2639 int *return_code) const
2640 {
2641 return (
2642 projected_log.get_request(r, version, user_version, return_code) ||
2643 pg_log.get_log().get_request(r, version, user_version, return_code));
2644 }
2645
2646 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
2647 {
2648 for (auto&p : pgs)
2649 if (p.shard == shard)
2650 return true;
2651 return false;
2652 }
2653
2654 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
2655 {
2656 for (auto&p : pgs) {
2657 if (p == skip)
2658 continue;
2659 if (p.shard == shard)
2660 return p;
2661 }
2662 return pg_shard_t();
2663 }
2664
2665 void PG::_update_calc_stats()
2666 {
2667 info.stats.version = info.last_update;
2668 info.stats.created = info.history.epoch_created;
2669 info.stats.last_scrub = info.history.last_scrub;
2670 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2671 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2672 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2673 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2674 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2675
2676 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2677 info.stats.ondisk_log_size = info.stats.log_size;
2678 info.stats.log_start = pg_log.get_tail();
2679 info.stats.ondisk_log_start = pg_log.get_tail();
2680 info.stats.snaptrimq_len = snap_trimq.size();
2681
2682 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
2683
2684 // In rare case that upset is too large (usually transient), use as target
2685 // for calculations below.
2686 unsigned target = std::max(num_shards, (unsigned)upset.size());
2687 // For undersized actingset may be larger with OSDs out
2688 unsigned nrep = std::max(actingset.size(), upset.size());
2689 // calc num_object_copies
2690 info.stats.stats.calc_copies(MAX(target, nrep));
2691 info.stats.stats.sum.num_objects_degraded = 0;
2692 info.stats.stats.sum.num_objects_unfound = 0;
2693 info.stats.stats.sum.num_objects_misplaced = 0;
2694
2695 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2696 dout(20) << __func__ << " actingset " << actingset << " upset "
2697 << upset << " actingbackfill " << actingbackfill << dendl;
2698 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
2699
2700 assert(!actingbackfill.empty());
2701
2702 bool estimate = false;
2703
2704 // NOTE: we only generate degraded, misplaced and unfound
2705 // values for the summation, not individual stat categories.
2706 int64_t num_objects = info.stats.stats.sum.num_objects;
2707
2708 // Objects missing from up nodes, sorted by # objects.
2709 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
2710 // Objects missing from nodes not in up, sort by # objects
2711 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
2712
2713 // Fill missing_target_objects/acting_source_objects
2714
2715 {
2716 int64_t missing;
2717
2718 // Primary first
2719 missing = pg_log.get_missing().num_missing();
2720 assert(actingbackfill.count(pg_whoami));
2721 if (upset.count(pg_whoami)) {
2722 missing_target_objects.insert(make_pair(missing, pg_whoami));
2723 } else {
2724 acting_source_objects.insert(make_pair(missing, pg_whoami));
2725 }
2726 info.stats.stats.sum.num_objects_missing_on_primary = missing;
2727 dout(20) << __func__ << " shard " << pg_whoami
2728 << " primary objects " << num_objects
2729 << " missing " << missing
2730 << dendl;
2731
2732 }
2733
2734 // All other peers
2735 for (auto& peer : peer_info) {
2736 // Primary should not be in the peer_info, skip if it is.
2737 if (peer.first == pg_whoami) continue;
2738 int64_t missing = 0;
2739 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
2740 // Backfill targets always track num_objects accurately
2741 // all other peers track missing accurately.
2742 if (is_backfill_targets(peer.first)) {
2743 missing = std::max((int64_t)0, num_objects - peer_num_objects);
2744 } else {
2745 if (peer_missing.count(peer.first)) {
2746 missing = peer_missing[peer.first].num_missing();
2747 } else {
2748 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
2749 if (is_recovering()) {
2750 estimate = true;
2751 }
2752 missing = std::max((int64_t)0, num_objects - peer_num_objects);
2753 }
2754 }
2755 if (upset.count(peer.first)) {
2756 missing_target_objects.insert(make_pair(missing, peer.first));
2757 } else if (actingset.count(peer.first)) {
2758 acting_source_objects.insert(make_pair(missing, peer.first));
2759 }
2760 peer.second.stats.stats.sum.num_objects_missing = missing;
2761 dout(20) << __func__ << " shard " << peer.first
2762 << " objects " << peer_num_objects
2763 << " missing " << missing
2764 << dendl;
2765 }
2766
2767 // A misplaced object is not stored on the correct OSD
2768 int64_t misplaced = 0;
2769 // a degraded objects has fewer replicas or EC shards than the pool specifies.
2770 int64_t degraded = 0;
2771
2772 if (is_recovering()) {
2773 for (auto& sml: missing_loc.get_missing_by_count()) {
2774 for (auto& ml: sml.second) {
2775 int missing_shards;
2776 if (sml.first == shard_id_t::NO_SHARD) {
2777 dout(0) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
2778 missing_shards = (int)upset.size() - ml.first.up;
2779 } else {
2780 // Handle shards not even in upset below
2781 if (!find_shard(upset, sml.first))
2782 continue;
2783 missing_shards = std::max(0, 1 - ml.first.up);
2784 dout(0) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
2785 }
2786 int odegraded = ml.second * missing_shards;
2787 // Copies on other osds but limited to the possible degraded
2788 int more_osds = std::min(missing_shards, ml.first.other);
2789 int omisplaced = ml.second * more_osds;
2790 assert(omisplaced <= odegraded);
2791 odegraded -= omisplaced;
2792
2793 misplaced += omisplaced;
2794 degraded += odegraded;
2795 }
2796 }
2797
2798 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
2799 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
2800
2801 // Handle undersized case
2802 if (pool.info.is_replicated()) {
2803 // Add degraded for missing targets (num_objects missing)
2804 assert(target >= upset.size());
2805 unsigned needed = target - upset.size();
2806 degraded += num_objects * needed;
2807 } else {
2808 for (unsigned i = 0 ; i < num_shards; ++i) {
2809 shard_id_t shard(i);
2810
2811 if (!find_shard(upset, shard)) {
2812 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
2813
2814 if (pgs != pg_shard_t()) {
2815 int64_t missing;
2816
2817 if (pgs == pg_whoami)
2818 missing = info.stats.stats.sum.num_objects_missing_on_primary;
2819 else
2820 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
2821
2822 degraded += missing;
2823 misplaced += std::max((int64_t)0, num_objects - missing);
2824 } else {
2825 // No shard anywhere
2826 degraded += num_objects;
2827 }
2828 }
2829 }
2830 }
2831 goto out;
2832 }
2833
2834 // Handle undersized case
2835 if (pool.info.is_replicated()) {
2836 // Add to missing_target_objects
2837 assert(target >= missing_target_objects.size());
2838 unsigned needed = target - missing_target_objects.size();
2839 if (needed)
2840 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
2841 } else {
2842 for (unsigned i = 0 ; i < num_shards; ++i) {
2843 shard_id_t shard(i);
2844 bool found = false;
2845 for (const auto& t : missing_target_objects) {
2846 if (std::get<1>(t).shard == shard) {
2847 found = true;
2848 break;
2849 }
2850 }
2851 if (!found)
2852 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
2853 }
2854 }
2855
2856 for (const auto& item : missing_target_objects)
2857 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2858 for (const auto& item : acting_source_objects)
2859 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2860
2861 // Handle all objects not in missing for remapped
2862 // or backfill
2863 for (auto m = missing_target_objects.rbegin();
2864 m != missing_target_objects.rend(); ++m) {
2865
2866 int64_t extra_missing = -1;
2867
2868 if (pool.info.is_replicated()) {
2869 if (!acting_source_objects.empty()) {
2870 auto extra_copy = acting_source_objects.begin();
2871 extra_missing = std::get<0>(*extra_copy);
2872 acting_source_objects.erase(extra_copy);
2873 }
2874 } else { // Erasure coded
2875 // Use corresponding shard
2876 for (const auto& a : acting_source_objects) {
2877 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
2878 extra_missing = std::get<0>(a);
2879 acting_source_objects.erase(a);
2880 break;
2881 }
2882 }
2883 }
2884
2885 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
2886 // We don't know which of the objects on the target
2887 // are part of extra_missing so assume are all degraded.
2888 misplaced += std::get<0>(*m) - extra_missing;
2889 degraded += extra_missing;
2890 } else {
2891 // 1. extra_missing == -1, more targets than sources so degraded
2892 // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
2893 // previously degraded are now present on the target.
2894 degraded += std::get<0>(*m);
2895 }
2896 }
2897 // If there are still acting that haven't been accounted for
2898 // then they are misplaced
2899 for (const auto& a : acting_source_objects) {
2900 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
2901 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
2902 misplaced += extra_misplaced;
2903 }
2904 out:
2905 // NOTE: Tests use these messages to verify this code
2906 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
2907 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl;
2908
2909 info.stats.stats.sum.num_objects_degraded = degraded;
2910 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2911 info.stats.stats.sum.num_objects_misplaced = misplaced;
2912 }
2913 }
2914
2915 void PG::_update_blocked_by()
2916 {
2917 // set a max on the number of blocking peers we report. if we go
2918 // over, report a random subset. keep the result sorted.
2919 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2920 unsigned skip = blocked_by.size() - keep;
2921 info.stats.blocked_by.clear();
2922 info.stats.blocked_by.resize(keep);
2923 unsigned pos = 0;
2924 for (set<int>::iterator p = blocked_by.begin();
2925 p != blocked_by.end() && keep > 0;
2926 ++p) {
2927 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2928 --skip;
2929 } else {
2930 info.stats.blocked_by[pos++] = *p;
2931 --keep;
2932 }
2933 }
2934 }
2935
2936 void PG::publish_stats_to_osd()
2937 {
2938 if (!is_primary())
2939 return;
2940
2941 pg_stats_publish_lock.Lock();
2942
2943 if (info.stats.stats.sum.num_scrub_errors)
2944 state_set(PG_STATE_INCONSISTENT);
2945 else
2946 state_clear(PG_STATE_INCONSISTENT);
2947
2948 utime_t now = ceph_clock_now();
2949 if (info.stats.state != state) {
2950 info.stats.last_change = now;
2951 // Optimistic estimation, if we just find out an inactive PG,
2952 // assumt it is active till now.
2953 if (!(state & PG_STATE_ACTIVE) &&
2954 (info.stats.state & PG_STATE_ACTIVE))
2955 info.stats.last_active = now;
2956
2957 if ((state & PG_STATE_ACTIVE) &&
2958 !(info.stats.state & PG_STATE_ACTIVE))
2959 info.stats.last_became_active = now;
2960 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2961 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2962 info.stats.last_became_peered = now;
2963 if (!(state & PG_STATE_CREATING) &&
2964 (info.stats.state & PG_STATE_CREATING)) {
2965 osd->send_pg_created(get_pgid().pgid);
2966 }
2967 info.stats.state = state;
2968 }
2969
2970 _update_calc_stats();
2971 if (info.stats.stats.sum.num_objects_degraded) {
2972 state_set(PG_STATE_DEGRADED);
2973 } else {
2974 state_clear(PG_STATE_DEGRADED);
2975 }
2976 _update_blocked_by();
2977
2978 bool publish = false;
2979 pg_stat_t pre_publish = info.stats;
2980 pre_publish.stats.add(unstable_stats);
2981 utime_t cutoff = now;
2982 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2983 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2984 info.stats.last_fresh > cutoff) {
2985 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2986 << ": no change since " << info.stats.last_fresh << dendl;
2987 } else {
2988 // update our stat summary and timestamps
2989 info.stats.reported_epoch = get_osdmap()->get_epoch();
2990 ++info.stats.reported_seq;
2991
2992 info.stats.last_fresh = now;
2993
2994 if (info.stats.state & PG_STATE_CLEAN)
2995 info.stats.last_clean = now;
2996 if (info.stats.state & PG_STATE_ACTIVE)
2997 info.stats.last_active = now;
2998 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2999 info.stats.last_peered = now;
3000 info.stats.last_unstale = now;
3001 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3002 info.stats.last_undegraded = now;
3003 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3004 info.stats.last_fullsized = now;
3005
3006 // do not send pgstat to mon anymore once we are luminous, since mgr takes
3007 // care of this by sending MMonMgrReport to mon.
3008 publish =
3009 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3010 pg_stats_publish_valid = true;
3011 pg_stats_publish = pre_publish;
3012
3013 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3014 << ":" << pg_stats_publish.reported_seq << dendl;
3015 }
3016 pg_stats_publish_lock.Unlock();
3017
3018 if (publish)
3019 osd->pg_stat_queue_enqueue(this);
3020 }
3021
3022 void PG::clear_publish_stats()
3023 {
3024 dout(15) << "clear_stats" << dendl;
3025 pg_stats_publish_lock.Lock();
3026 pg_stats_publish_valid = false;
3027 pg_stats_publish_lock.Unlock();
3028
3029 osd->pg_stat_queue_dequeue(this);
3030 }
3031
3032 /**
3033 * initialize a newly instantiated pg
3034 *
3035 * Initialize PG state, as when a PG is initially created, or when it
3036 * is first instantiated on the current node.
3037 *
3038 * @param role our role/rank
3039 * @param newup up set
3040 * @param newacting acting set
3041 * @param history pg history
3042 * @param pi past_intervals
3043 * @param backfill true if info should be marked as backfill
3044 * @param t transaction to write out our new state in
3045 */
3046 void PG::init(
3047 int role,
3048 const vector<int>& newup, int new_up_primary,
3049 const vector<int>& newacting, int new_acting_primary,
3050 const pg_history_t& history,
3051 const PastIntervals& pi,
3052 bool backfill,
3053 ObjectStore::Transaction *t)
3054 {
3055 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3056 << " history " << history
3057 << " past_intervals " << pi
3058 << dendl;
3059
3060 set_role(role);
3061 acting = newacting;
3062 up = newup;
3063 init_primary_up_acting(
3064 newup,
3065 newacting,
3066 new_up_primary,
3067 new_acting_primary);
3068
3069 info.history = history;
3070 past_intervals = pi;
3071
3072 info.stats.up = up;
3073 info.stats.up_primary = new_up_primary;
3074 info.stats.acting = acting;
3075 info.stats.acting_primary = new_acting_primary;
3076 info.stats.mapping_epoch = info.history.same_interval_since;
3077
3078 if (backfill) {
3079 dout(10) << __func__ << ": Setting backfill" << dendl;
3080 info.set_last_backfill(hobject_t());
3081 info.last_complete = info.last_update;
3082 pg_log.mark_log_for_rewrite();
3083 }
3084
3085 on_new_interval();
3086
3087 dirty_info = true;
3088 dirty_big_info = true;
3089 write_if_dirty(*t);
3090 }
3091
3092 #pragma GCC diagnostic ignored "-Wpragmas"
3093 #pragma GCC diagnostic push
3094 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3095
3096 void PG::upgrade(ObjectStore *store)
3097 {
3098 assert(info_struct_v <= 10);
3099 ObjectStore::Transaction t;
3100
3101 assert(info_struct_v >= 7);
3102
3103 // 7 -> 8
3104 if (info_struct_v <= 7) {
3105 pg_log.mark_log_for_rewrite();
3106 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
3107 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
3108 t.remove(coll_t::meta(), log_oid);
3109 t.remove(coll_t::meta(), biginfo_oid);
3110 t.touch(coll, pgmeta_oid);
3111 }
3112
3113 // 8 -> 9
3114 if (info_struct_v <= 8) {
3115 // no special action needed.
3116 }
3117
3118 // 9 -> 10
3119 if (info_struct_v <= 9) {
3120 // previous versions weren't (as) aggressively clearing past_intervals
3121 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
3122 dout(20) << __func__ << " clearing past_intervals" << dendl;
3123 past_intervals.clear();
3124 }
3125 }
3126
3127 // update infover_key
3128 if (info_struct_v < cur_struct_v) {
3129 map<string,bufferlist> v;
3130 __u8 ver = cur_struct_v;
3131 ::encode(ver, v[infover_key]);
3132 t.omap_setkeys(coll, pgmeta_oid, v);
3133 }
3134
3135 dirty_info = true;
3136 dirty_big_info = true;
3137 write_if_dirty(t);
3138
3139 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3140 ObjectStore::Sequencer>("upgrade"));
3141 int r = store->apply_transaction(osr.get(), std::move(t));
3142 if (r != 0) {
3143 derr << __func__ << ": apply_transaction returned "
3144 << cpp_strerror(r) << dendl;
3145 ceph_abort();
3146 }
3147 assert(r == 0);
3148
3149 C_SaferCond waiter;
3150 if (!osr->flush_commit(&waiter)) {
3151 waiter.wait();
3152 }
3153 }
3154
3155 #pragma GCC diagnostic pop
3156 #pragma GCC diagnostic warning "-Wpragmas"
3157
3158 int PG::_prepare_write_info(CephContext* cct,
3159 map<string,bufferlist> *km,
3160 epoch_t epoch,
3161 pg_info_t &info, pg_info_t &last_written_info,
3162 PastIntervals &past_intervals,
3163 bool dirty_big_info,
3164 bool dirty_epoch,
3165 bool try_fast_info,
3166 PerfCounters *logger)
3167 {
3168 if (dirty_epoch) {
3169 ::encode(epoch, (*km)[epoch_key]);
3170 }
3171
3172 if (logger)
3173 logger->inc(l_osd_pg_info);
3174
3175 // try to do info efficiently?
3176 if (!dirty_big_info && try_fast_info &&
3177 info.last_update > last_written_info.last_update) {
3178 pg_fast_info_t fast;
3179 fast.populate_from(info);
3180 bool did = fast.try_apply_to(&last_written_info);
3181 assert(did); // we verified last_update increased above
3182 if (info == last_written_info) {
3183 ::encode(fast, (*km)[fastinfo_key]);
3184 if (logger)
3185 logger->inc(l_osd_pg_fastinfo);
3186 return 0;
3187 }
3188 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3189 {
3190 JSONFormatter jf(true);
3191 jf.dump_object("info", info);
3192 jf.flush(*_dout);
3193 }
3194 {
3195 *_dout << "\nlast_written_info:\n";
3196 JSONFormatter jf(true);
3197 jf.dump_object("last_written_info", last_written_info);
3198 jf.flush(*_dout);
3199 }
3200 *_dout << dendl;
3201 }
3202 last_written_info = info;
3203
3204 // info. store purged_snaps separately.
3205 interval_set<snapid_t> purged_snaps;
3206 purged_snaps.swap(info.purged_snaps);
3207 ::encode(info, (*km)[info_key]);
3208 purged_snaps.swap(info.purged_snaps);
3209
3210 if (dirty_big_info) {
3211 // potentially big stuff
3212 bufferlist& bigbl = (*km)[biginfo_key];
3213 ::encode(past_intervals, bigbl);
3214 ::encode(info.purged_snaps, bigbl);
3215 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3216 if (logger)
3217 logger->inc(l_osd_pg_biginfo);
3218 }
3219
3220 return 0;
3221 }
3222
3223 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3224 {
3225 coll_t coll(pgid);
3226 t.create_collection(coll, bits);
3227 }
3228
3229 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3230 {
3231 coll_t coll(pgid);
3232
3233 if (pool) {
3234 // Give a hint to the PG collection
3235 bufferlist hint;
3236 uint32_t pg_num = pool->get_pg_num();
3237 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3238 ::encode(pg_num, hint);
3239 ::encode(expected_num_objects_pg, hint);
3240 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3241 t.collection_hint(coll, hint_type, hint);
3242 }
3243
3244 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3245 t.touch(coll, pgmeta_oid);
3246 map<string,bufferlist> values;
3247 __u8 struct_v = cur_struct_v;
3248 ::encode(struct_v, values[infover_key]);
3249 t.omap_setkeys(coll, pgmeta_oid, values);
3250 }
3251
3252 void PG::prepare_write_info(map<string,bufferlist> *km)
3253 {
3254 info.stats.stats.add(unstable_stats);
3255 unstable_stats.clear();
3256
3257 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3258 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3259 info,
3260 last_written_info,
3261 past_intervals,
3262 dirty_big_info, need_update_epoch,
3263 cct->_conf->osd_fast_info,
3264 osd->logger);
3265 assert(ret == 0);
3266 if (need_update_epoch)
3267 last_epoch = get_osdmap()->get_epoch();
3268 last_persisted_osdmap_ref = osdmap_ref;
3269
3270 dirty_info = false;
3271 dirty_big_info = false;
3272 }
3273
3274 #pragma GCC diagnostic ignored "-Wpragmas"
3275 #pragma GCC diagnostic push
3276 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3277
3278 bool PG::_has_removal_flag(ObjectStore *store,
3279 spg_t pgid)
3280 {
3281 coll_t coll(pgid);
3282 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3283
3284 // first try new way
3285 set<string> keys;
3286 keys.insert("_remove");
3287 map<string,bufferlist> values;
3288 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3289 values.size() == 1)
3290 return true;
3291
3292 return false;
3293 }
3294
3295 int PG::peek_map_epoch(ObjectStore *store,
3296 spg_t pgid,
3297 epoch_t *pepoch,
3298 bufferlist *bl)
3299 {
3300 coll_t coll(pgid);
3301 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3302 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3303 epoch_t cur_epoch = 0;
3304
3305 assert(bl);
3306 {
3307 // validate collection name
3308 assert(coll.is_pg());
3309 }
3310
3311 // try for v8
3312 set<string> keys;
3313 keys.insert(infover_key);
3314 keys.insert(epoch_key);
3315 map<string,bufferlist> values;
3316 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3317 if (r == 0) {
3318 assert(values.size() == 2);
3319
3320 // sanity check version
3321 bufferlist::iterator bp = values[infover_key].begin();
3322 __u8 struct_v = 0;
3323 ::decode(struct_v, bp);
3324 assert(struct_v >= 8);
3325
3326 // get epoch
3327 bp = values[epoch_key].begin();
3328 ::decode(cur_epoch, bp);
3329 } else {
3330 // probably bug 10617; see OSD::load_pgs()
3331 return -1;
3332 }
3333
3334 *pepoch = cur_epoch;
3335 return 0;
3336 }
3337
3338 #pragma GCC diagnostic pop
3339 #pragma GCC diagnostic warning "-Wpragmas"
3340
3341 void PG::write_if_dirty(ObjectStore::Transaction& t)
3342 {
3343 map<string,bufferlist> km;
3344 if (dirty_big_info || dirty_info)
3345 prepare_write_info(&km);
3346 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3347 if (!km.empty())
3348 t.omap_setkeys(coll, pgmeta_oid, km);
3349 }
3350
3351 void PG::trim_log()
3352 {
3353 assert(is_primary());
3354 calc_trim_to();
3355 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3356 if (pg_trim_to != eversion_t()) {
3357 // inform peers to trim log
3358 assert(!actingbackfill.empty());
3359 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3360 i != actingbackfill.end();
3361 ++i) {
3362 if (*i == pg_whoami) continue;
3363 osd->send_message_osd_cluster(
3364 i->osd,
3365 new MOSDPGTrim(
3366 get_osdmap()->get_epoch(),
3367 spg_t(info.pgid.pgid, i->shard),
3368 pg_trim_to),
3369 get_osdmap()->get_epoch());
3370 }
3371
3372 // trim primary as well
3373 pg_log.trim(pg_trim_to, info);
3374 dirty_info = true;
3375 }
3376 }
3377
3378 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3379 {
3380 // raise last_complete only if we were previously up to date
3381 if (info.last_complete == info.last_update)
3382 info.last_complete = e.version;
3383
3384 // raise last_update.
3385 assert(e.version > info.last_update);
3386 info.last_update = e.version;
3387
3388 // raise user_version, if it increased (it may have not get bumped
3389 // by all logged updates)
3390 if (e.user_version > info.last_user_version)
3391 info.last_user_version = e.user_version;
3392
3393 // log mutation
3394 pg_log.add(e, applied);
3395 dout(10) << "add_log_entry " << e << dendl;
3396 }
3397
3398
3399 void PG::append_log(
3400 const vector<pg_log_entry_t>& logv,
3401 eversion_t trim_to,
3402 eversion_t roll_forward_to,
3403 ObjectStore::Transaction &t,
3404 bool transaction_applied)
3405 {
3406 if (transaction_applied)
3407 update_snap_map(logv, t);
3408
3409 /* The primary has sent an info updating the history, but it may not
3410 * have arrived yet. We want to make sure that we cannot remember this
3411 * write without remembering that it happened in an interval which went
3412 * active in epoch history.last_epoch_started.
3413 */
3414 if (info.last_epoch_started != info.history.last_epoch_started) {
3415 info.history.last_epoch_started = info.last_epoch_started;
3416 }
3417 if (info.last_interval_started != info.history.last_interval_started) {
3418 info.history.last_interval_started = info.last_interval_started;
3419 }
3420 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3421
3422 PGLogEntryHandler handler{this, &t};
3423 if (!transaction_applied) {
3424 /* We must be a backfill peer, so it's ok if we apply
3425 * out-of-turn since we won't be considered when
3426 * determining a min possible last_update.
3427 */
3428 pg_log.roll_forward(&handler);
3429 }
3430
3431 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3432 p != logv.end();
3433 ++p) {
3434 add_log_entry(*p, transaction_applied);
3435
3436 /* We don't want to leave the rollforward artifacts around
3437 * here past last_backfill. It's ok for the same reason as
3438 * above */
3439 if (transaction_applied &&
3440 p->soid > info.last_backfill) {
3441 pg_log.roll_forward(&handler);
3442 }
3443 }
3444 auto last = logv.rbegin();
3445 if (is_primary() && last != logv.rend()) {
3446 projected_log.skip_can_rollback_to_to_head();
3447 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3448 }
3449
3450 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3451 pg_log.roll_forward_to(
3452 roll_forward_to,
3453 &handler);
3454 t.register_on_applied(
3455 new C_UpdateLastRollbackInfoTrimmedToApplied(
3456 this,
3457 get_osdmap()->get_epoch(),
3458 roll_forward_to));
3459 }
3460
3461 pg_log.trim(trim_to, info);
3462
3463 // update the local pg, pg log
3464 dirty_info = true;
3465 write_if_dirty(t);
3466 }
3467
3468 bool PG::check_log_for_corruption(ObjectStore *store)
3469 {
3470 /// TODO: this method needs to work with the omap log
3471 return true;
3472 }
3473
3474 //! Get the name we're going to save our corrupt page log as
3475 std::string PG::get_corrupt_pg_log_name() const
3476 {
3477 const int MAX_BUF = 512;
3478 char buf[MAX_BUF];
3479 struct tm tm_buf;
3480 time_t my_time(time(NULL));
3481 const struct tm *t = localtime_r(&my_time, &tm_buf);
3482 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3483 if (ret == 0) {
3484 dout(0) << "strftime failed" << dendl;
3485 return "corrupt_log_unknown_time";
3486 }
3487 string out(buf);
3488 out += stringify(info.pgid);
3489 return out;
3490 }
3491
3492 int PG::read_info(
3493 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3494 pg_info_t &info, PastIntervals &past_intervals,
3495 __u8 &struct_v)
3496 {
3497 // try for v8 or later
3498 set<string> keys;
3499 keys.insert(infover_key);
3500 keys.insert(info_key);
3501 keys.insert(biginfo_key);
3502 keys.insert(fastinfo_key);
3503 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3504 map<string,bufferlist> values;
3505 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3506 if (r == 0) {
3507 assert(values.size() == 3 ||
3508 values.size() == 4);
3509
3510 bufferlist::iterator p = values[infover_key].begin();
3511 ::decode(struct_v, p);
3512 assert(struct_v >= 8);
3513
3514 p = values[info_key].begin();
3515 ::decode(info, p);
3516
3517 p = values[biginfo_key].begin();
3518 if (struct_v >= 10) {
3519 ::decode(past_intervals, p);
3520 } else {
3521 past_intervals.decode_classic(p);
3522 }
3523 ::decode(info.purged_snaps, p);
3524
3525 p = values[fastinfo_key].begin();
3526 if (!p.end()) {
3527 pg_fast_info_t fast;
3528 ::decode(fast, p);
3529 fast.try_apply_to(&info);
3530 }
3531 return 0;
3532 }
3533
3534 // legacy (ver < 8)
3535 ghobject_t infos_oid(OSD::make_infos_oid());
3536 bufferlist::iterator p = bl.begin();
3537 ::decode(struct_v, p);
3538 assert(struct_v == 7);
3539
3540 // get info out of leveldb
3541 string k = get_info_key(info.pgid);
3542 string bk = get_biginfo_key(info.pgid);
3543 keys.clear();
3544 keys.insert(k);
3545 keys.insert(bk);
3546 values.clear();
3547 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3548 assert(values.size() == 2);
3549
3550 p = values[k].begin();
3551 ::decode(info, p);
3552
3553 p = values[bk].begin();
3554 ::decode(past_intervals, p);
3555 interval_set<snapid_t> snap_collections; // obsolete
3556 ::decode(snap_collections, p);
3557 ::decode(info.purged_snaps, p);
3558 return 0;
3559 }
3560
3561 void PG::read_state(ObjectStore *store, bufferlist &bl)
3562 {
3563 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3564 info_struct_v);
3565 assert(r >= 0);
3566
3567 last_written_info = info;
3568
3569 // if we are upgrading from jewel, we need to force rebuild of
3570 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3571 // (before kraken). persisted missing set was circa
3572 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3573 // v8 was pre-jewel (per-pg meta object).
3574 bool force_rebuild_missing = info_struct_v < 9;
3575 if (force_rebuild_missing) {
3576 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3577 << dendl;
3578 }
3579
3580 ostringstream oss;
3581 pg_log.read_log_and_missing(
3582 store,
3583 coll,
3584 info_struct_v < 8 ? coll_t::meta() : coll,
3585 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3586 info,
3587 force_rebuild_missing,
3588 oss,
3589 cct->_conf->osd_ignore_stale_divergent_priors,
3590 cct->_conf->osd_debug_verify_missing_on_start);
3591 if (oss.tellp())
3592 osd->clog->error() << oss.str();
3593
3594 if (force_rebuild_missing) {
3595 dout(10) << __func__ << " forced rebuild of missing got "
3596 << pg_log.get_missing()
3597 << dendl;
3598 }
3599
3600 // log any weirdness
3601 log_weirdness();
3602 }
3603
3604 void PG::log_weirdness()
3605 {
3606 if (pg_log.get_tail() != info.log_tail)
3607 osd->clog->error() << info.pgid
3608 << " info mismatch, log.tail " << pg_log.get_tail()
3609 << " != info.log_tail " << info.log_tail;
3610 if (pg_log.get_head() != info.last_update)
3611 osd->clog->error() << info.pgid
3612 << " info mismatch, log.head " << pg_log.get_head()
3613 << " != info.last_update " << info.last_update;
3614
3615 if (!pg_log.get_log().empty()) {
3616 // sloppy check
3617 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3618 osd->clog->error() << info.pgid
3619 << " log bound mismatch, info (tail,head] ("
3620 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3621 << " actual ["
3622 << pg_log.get_log().log.begin()->version << ","
3623 << pg_log.get_log().log.rbegin()->version << "]";
3624 }
3625
3626 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3627 osd->clog->error() << info.pgid
3628 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3629 << " > log size " << pg_log.get_log().log.size();
3630 }
3631 }
3632
3633 void PG::update_snap_map(
3634 const vector<pg_log_entry_t> &log_entries,
3635 ObjectStore::Transaction &t)
3636 {
3637 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3638 i != log_entries.end();
3639 ++i) {
3640 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3641 if (i->soid.snap < CEPH_MAXSNAP) {
3642 if (i->is_delete()) {
3643 int r = snap_mapper.remove_oid(
3644 i->soid,
3645 &_t);
3646 assert(r == 0);
3647 } else if (i->is_update()) {
3648 assert(i->snaps.length() > 0);
3649 vector<snapid_t> snaps;
3650 bufferlist snapbl = i->snaps;
3651 bufferlist::iterator p = snapbl.begin();
3652 try {
3653 ::decode(snaps, p);
3654 } catch (...) {
3655 derr << __func__ << " decode snaps failure on " << *i << dendl;
3656 snaps.clear();
3657 }
3658 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3659
3660 if (i->is_clone() || i->is_promote()) {
3661 snap_mapper.add_oid(
3662 i->soid,
3663 _snaps,
3664 &_t);
3665 } else if (i->is_modify()) {
3666 assert(i->is_modify());
3667 int r = snap_mapper.update_snaps(
3668 i->soid,
3669 _snaps,
3670 0,
3671 &_t);
3672 assert(r == 0);
3673 } else {
3674 assert(i->is_clean());
3675 }
3676 }
3677 }
3678 }
3679 }
3680
3681 /**
3682 * filter trimming|trimmed snaps out of snapcontext
3683 */
3684 void PG::filter_snapc(vector<snapid_t> &snaps)
3685 {
3686 //nothing needs to trim, we can return immediately
3687 if(snap_trimq.empty() && info.purged_snaps.empty())
3688 return;
3689
3690 bool filtering = false;
3691 vector<snapid_t> newsnaps;
3692 for (vector<snapid_t>::iterator p = snaps.begin();
3693 p != snaps.end();
3694 ++p) {
3695 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3696 if (!filtering) {
3697 // start building a new vector with what we've seen so far
3698 dout(10) << "filter_snapc filtering " << snaps << dendl;
3699 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3700 filtering = true;
3701 }
3702 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3703 } else {
3704 if (filtering)
3705 newsnaps.push_back(*p); // continue building new vector
3706 }
3707 }
3708 if (filtering) {
3709 snaps.swap(newsnaps);
3710 dout(10) << "filter_snapc result " << snaps << dendl;
3711 }
3712 }
3713
3714 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3715 {
3716 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3717 it != m.end();
3718 ++it)
3719 requeue_ops(it->second);
3720 m.clear();
3721 }
3722
3723 void PG::requeue_op(OpRequestRef op)
3724 {
3725 auto p = waiting_for_map.find(op->get_source());
3726 if (p != waiting_for_map.end()) {
3727 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3728 << dendl;
3729 p->second.push_front(op);
3730 } else {
3731 dout(20) << __func__ << " " << op << dendl;
3732 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3733 }
3734 }
3735
3736 void PG::requeue_ops(list<OpRequestRef> &ls)
3737 {
3738 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3739 i != ls.rend();
3740 ++i) {
3741 auto p = waiting_for_map.find((*i)->get_source());
3742 if (p != waiting_for_map.end()) {
3743 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3744 << ")" << dendl;
3745 p->second.push_front(*i);
3746 } else {
3747 dout(20) << __func__ << " " << *i << dendl;
3748 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3749 }
3750 }
3751 ls.clear();
3752 }
3753
3754 void PG::requeue_map_waiters()
3755 {
3756 epoch_t epoch = get_osdmap()->get_epoch();
3757 auto p = waiting_for_map.begin();
3758 while (p != waiting_for_map.end()) {
3759 if (epoch < p->second.front()->min_epoch) {
3760 dout(20) << __func__ << " " << p->first << " front op "
3761 << p->second.front() << " must still wait, doing nothing"
3762 << dendl;
3763 ++p;
3764 } else {
3765 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3766 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3767 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3768 }
3769 p = waiting_for_map.erase(p);
3770 }
3771 }
3772 }
3773
3774
3775 // ==========================================================================================
3776 // SCRUB
3777
3778 /*
3779 * when holding pg and sched_scrub_lock, then the states are:
3780 * scheduling:
3781 * scrubber.reserved = true
3782 * scrub_rserved_peers includes whoami
3783 * osd->scrub_pending++
3784 * scheduling, replica declined:
3785 * scrubber.reserved = true
3786 * scrubber.reserved_peers includes -1
3787 * osd->scrub_pending++
3788 * pending:
3789 * scrubber.reserved = true
3790 * scrubber.reserved_peers.size() == acting.size();
3791 * pg on scrub_wq
3792 * osd->scrub_pending++
3793 * scrubbing:
3794 * scrubber.reserved = false;
3795 * scrubber.reserved_peers empty
3796 * osd->scrubber.active++
3797 */
3798
3799 // returns true if a scrub has been newly kicked off
3800 bool PG::sched_scrub()
3801 {
3802 bool nodeep_scrub = false;
3803 assert(is_locked());
3804 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3805 return false;
3806 }
3807
3808 double deep_scrub_interval = 0;
3809 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3810 if (deep_scrub_interval <= 0) {
3811 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3812 }
3813 bool time_for_deep = ceph_clock_now() >=
3814 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3815
3816 bool deep_coin_flip = false;
3817 // Only add random deep scrubs when NOT user initiated scrub
3818 if (!scrubber.must_scrub)
3819 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3820 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3821
3822 time_for_deep = (time_for_deep || deep_coin_flip);
3823
3824 //NODEEP_SCRUB so ignore time initiated deep-scrub
3825 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3826 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3827 time_for_deep = false;
3828 nodeep_scrub = true;
3829 }
3830
3831 if (!scrubber.must_scrub) {
3832 assert(!scrubber.must_deep_scrub);
3833
3834 //NOSCRUB so skip regular scrubs
3835 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3836 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3837 if (scrubber.reserved) {
3838 // cancel scrub if it is still in scheduling,
3839 // so pgs from other pools where scrub are still legal
3840 // have a chance to go ahead with scrubbing.
3841 clear_scrub_reserved();
3842 scrub_unreserve_replicas();
3843 }
3844 return false;
3845 }
3846 }
3847
3848 if (cct->_conf->osd_scrub_auto_repair
3849 && get_pgbackend()->auto_repair_supported()
3850 && time_for_deep
3851 // respect the command from user, and not do auto-repair
3852 && !scrubber.must_repair
3853 && !scrubber.must_scrub
3854 && !scrubber.must_deep_scrub) {
3855 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3856 scrubber.auto_repair = true;
3857 } else {
3858 // this happens when user issue the scrub/repair command during
3859 // the scheduling of the scrub/repair (e.g. request reservation)
3860 scrubber.auto_repair = false;
3861 }
3862
3863 bool ret = true;
3864 if (!scrubber.reserved) {
3865 assert(scrubber.reserved_peers.empty());
3866 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3867 osd->inc_scrubs_pending()) {
3868 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3869 scrubber.reserved = true;
3870 scrubber.reserved_peers.insert(pg_whoami);
3871 scrub_reserve_replicas();
3872 } else {
3873 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3874 ret = false;
3875 }
3876 }
3877 if (scrubber.reserved) {
3878 if (scrubber.reserve_failed) {
3879 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3880 clear_scrub_reserved();
3881 scrub_unreserve_replicas();
3882 ret = false;
3883 } else if (scrubber.reserved_peers.size() == acting.size()) {
3884 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3885 if (time_for_deep) {
3886 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3887 state_set(PG_STATE_DEEP_SCRUB);
3888 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3889 if (!nodeep_scrub) {
3890 osd->clog->info() << "osd." << osd->whoami
3891 << " pg " << info.pgid
3892 << " Deep scrub errors, upgrading scrub to deep-scrub";
3893 state_set(PG_STATE_DEEP_SCRUB);
3894 } else if (!scrubber.must_scrub) {
3895 osd->clog->error() << "osd." << osd->whoami
3896 << " pg " << info.pgid
3897 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3898 clear_scrub_reserved();
3899 scrub_unreserve_replicas();
3900 return false;
3901 } else {
3902 osd->clog->error() << "osd." << osd->whoami
3903 << " pg " << info.pgid
3904 << " Regular scrub request, deep-scrub details will be lost";
3905 }
3906 }
3907 queue_scrub();
3908 } else {
3909 // none declined, since scrubber.reserved is set
3910 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3911 }
3912 }
3913
3914 return ret;
3915 }
3916
3917 void PG::reg_next_scrub()
3918 {
3919 if (!is_primary())
3920 return;
3921
3922 utime_t reg_stamp;
3923 if (scrubber.must_scrub ||
3924 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3925 reg_stamp = ceph_clock_now();
3926 } else {
3927 reg_stamp = info.history.last_scrub_stamp;
3928 }
3929 // note down the sched_time, so we can locate this scrub, and remove it
3930 // later on.
3931 double scrub_min_interval = 0, scrub_max_interval = 0;
3932 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3933 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3934 assert(scrubber.scrub_reg_stamp == utime_t());
3935 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3936 reg_stamp,
3937 scrub_min_interval,
3938 scrub_max_interval,
3939 scrubber.must_scrub);
3940 }
3941
3942 void PG::unreg_next_scrub()
3943 {
3944 if (is_primary()) {
3945 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3946 scrubber.scrub_reg_stamp = utime_t();
3947 }
3948 }
3949
3950 void PG::do_replica_scrub_map(OpRequestRef op)
3951 {
3952 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3953 dout(7) << __func__ << " " << *m << dendl;
3954 if (m->map_epoch < info.history.same_interval_since) {
3955 dout(10) << __func__ << " discarding old from "
3956 << m->map_epoch << " < " << info.history.same_interval_since
3957 << dendl;
3958 return;
3959 }
3960 if (!scrubber.is_chunky_scrub_active()) {
3961 dout(10) << __func__ << " scrub isn't active" << dendl;
3962 return;
3963 }
3964
3965 op->mark_started();
3966
3967 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3968 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3969 dout(10) << "map version is "
3970 << scrubber.received_maps[m->from].valid_through
3971 << dendl;
3972
3973 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
3974 << dendl;
3975 assert(scrubber.waiting_on_whom.count(m->from));
3976 scrubber.waiting_on_whom.erase(m->from);
3977 if (m->preempted) {
3978 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
3979 scrub_preempted = true;
3980 }
3981 if (scrubber.waiting_on_whom.empty()) {
3982 if (ops_blocked_by_scrub()) {
3983 requeue_scrub(true);
3984 } else {
3985 requeue_scrub(false);
3986 }
3987 }
3988 }
3989
3990 void PG::sub_op_scrub_map(OpRequestRef op)
3991 {
3992 // for legacy jewel compatibility only
3993 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3994 assert(m->get_type() == MSG_OSD_SUBOP);
3995 dout(7) << "sub_op_scrub_map" << dendl;
3996
3997 if (m->map_epoch < info.history.same_interval_since) {
3998 dout(10) << "sub_op_scrub discarding old sub_op from "
3999 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
4000 return;
4001 }
4002
4003 if (!scrubber.is_chunky_scrub_active()) {
4004 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
4005 return;
4006 }
4007
4008 op->mark_started();
4009
4010 dout(10) << " got " << m->from << " scrub map" << dendl;
4011 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
4012
4013 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4014 dout(10) << "map version is "
4015 << scrubber.received_maps[m->from].valid_through
4016 << dendl;
4017
4018 scrubber.waiting_on_whom.erase(m->from);
4019
4020 if (scrubber.waiting_on_whom.empty()) {
4021 if (ops_blocked_by_scrub()) {
4022 requeue_scrub(true);
4023 } else {
4024 requeue_scrub(false);
4025 }
4026 }
4027 }
4028
4029 // send scrub v3 messages (chunky scrub)
4030 void PG::_request_scrub_map(
4031 pg_shard_t replica, eversion_t version,
4032 hobject_t start, hobject_t end,
4033 bool deep,
4034 bool allow_preemption)
4035 {
4036 assert(replica != pg_whoami);
4037 dout(10) << "scrub requesting scrubmap from osd." << replica
4038 << " deep " << (int)deep << dendl;
4039 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4040 spg_t(info.pgid.pgid, replica.shard), version,
4041 get_osdmap()->get_epoch(),
4042 get_last_peering_reset(),
4043 start, end, deep,
4044 allow_preemption,
4045 scrubber.priority,
4046 ops_blocked_by_scrub());
4047 // default priority, we want the rep scrub processed prior to any recovery
4048 // or client io messages (we are holding a lock!)
4049 osd->send_message_osd_cluster(
4050 replica.osd, repscrubop, get_osdmap()->get_epoch());
4051 }
4052
4053 void PG::handle_scrub_reserve_request(OpRequestRef op)
4054 {
4055 dout(7) << __func__ << " " << *op->get_req() << dendl;
4056 op->mark_started();
4057 if (scrubber.reserved) {
4058 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4059 << dendl;
4060 return;
4061 }
4062 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4063 osd->inc_scrubs_pending()) {
4064 scrubber.reserved = true;
4065 } else {
4066 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4067 scrubber.reserved = false;
4068 }
4069 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
4070 const MOSDScrubReserve *m =
4071 static_cast<const MOSDScrubReserve*>(op->get_req());
4072 Message *reply = new MOSDScrubReserve(
4073 spg_t(info.pgid.pgid, primary.shard),
4074 m->map_epoch,
4075 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4076 pg_whoami);
4077 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4078 } else {
4079 // for jewel compat only
4080 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
4081 assert(req->get_type() == MSG_OSD_SUBOP);
4082 MOSDSubOpReply *reply = new MOSDSubOpReply(
4083 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
4084 ::encode(scrubber.reserved, reply->get_data());
4085 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4086 }
4087 }
4088
4089 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4090 {
4091 dout(7) << __func__ << " " << *op->get_req() << dendl;
4092 op->mark_started();
4093 if (!scrubber.reserved) {
4094 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4095 return;
4096 }
4097 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4098 dout(10) << " already had osd." << from << " reserved" << dendl;
4099 } else {
4100 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4101 scrubber.reserved_peers.insert(from);
4102 sched_scrub();
4103 }
4104 }
4105
4106 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4107 {
4108 dout(7) << __func__ << " " << *op->get_req() << dendl;
4109 op->mark_started();
4110 if (!scrubber.reserved) {
4111 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4112 return;
4113 }
4114 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4115 dout(10) << " already had osd." << from << " reserved" << dendl;
4116 } else {
4117 /* One decline stops this pg from being scheduled for scrubbing. */
4118 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4119 scrubber.reserve_failed = true;
4120 sched_scrub();
4121 }
4122 }
4123
4124 void PG::handle_scrub_reserve_release(OpRequestRef op)
4125 {
4126 dout(7) << __func__ << " " << *op->get_req() << dendl;
4127 op->mark_started();
4128 clear_scrub_reserved();
4129 }
4130
4131 void PG::reject_reservation()
4132 {
4133 osd->send_message_osd_cluster(
4134 primary.osd,
4135 new MBackfillReserve(
4136 MBackfillReserve::REJECT,
4137 spg_t(info.pgid.pgid, primary.shard),
4138 get_osdmap()->get_epoch()),
4139 get_osdmap()->get_epoch());
4140 }
4141
4142 void PG::schedule_backfill_retry(float delay)
4143 {
4144 Mutex::Locker lock(osd->recovery_request_lock);
4145 osd->recovery_request_timer.add_event_after(
4146 delay,
4147 new QueuePeeringEvt<RequestBackfill>(
4148 this, get_osdmap()->get_epoch(),
4149 RequestBackfill()));
4150 }
4151
4152 void PG::schedule_recovery_retry(float delay)
4153 {
4154 Mutex::Locker lock(osd->recovery_request_lock);
4155 osd->recovery_request_timer.add_event_after(
4156 delay,
4157 new QueuePeeringEvt<DoRecovery>(
4158 this, get_osdmap()->get_epoch(),
4159 DoRecovery()));
4160 }
4161
4162 void PG::clear_scrub_reserved()
4163 {
4164 scrubber.reserved_peers.clear();
4165 scrubber.reserve_failed = false;
4166
4167 if (scrubber.reserved) {
4168 scrubber.reserved = false;
4169 osd->dec_scrubs_pending();
4170 }
4171 }
4172
4173 void PG::scrub_reserve_replicas()
4174 {
4175 assert(backfill_targets.empty());
4176 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4177 i != actingbackfill.end();
4178 ++i) {
4179 if (*i == pg_whoami) continue;
4180 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4181 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4182 osd->send_message_osd_cluster(
4183 i->osd,
4184 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4185 get_osdmap()->get_epoch(),
4186 MOSDScrubReserve::REQUEST, pg_whoami),
4187 get_osdmap()->get_epoch());
4188 } else {
4189 // for jewel compat only
4190 vector<OSDOp> scrub(1);
4191 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
4192 hobject_t poid;
4193 eversion_t v;
4194 osd_reqid_t reqid;
4195 MOSDSubOp *subop = new MOSDSubOp(
4196 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4197 get_osdmap()->get_epoch(), osd->get_tid(), v);
4198 subop->ops = scrub;
4199 osd->send_message_osd_cluster(
4200 i->osd, subop, get_osdmap()->get_epoch());
4201 }
4202 }
4203 }
4204
4205 void PG::scrub_unreserve_replicas()
4206 {
4207 assert(backfill_targets.empty());
4208 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4209 i != actingbackfill.end();
4210 ++i) {
4211 if (*i == pg_whoami) continue;
4212 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4213 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4214 osd->send_message_osd_cluster(
4215 i->osd,
4216 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4217 get_osdmap()->get_epoch(),
4218 MOSDScrubReserve::RELEASE, pg_whoami),
4219 get_osdmap()->get_epoch());
4220 } else {
4221 // for jewel compat only
4222 vector<OSDOp> scrub(1);
4223 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
4224 hobject_t poid;
4225 eversion_t v;
4226 osd_reqid_t reqid;
4227 MOSDSubOp *subop = new MOSDSubOp(
4228 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4229 get_osdmap()->get_epoch(), osd->get_tid(), v);
4230 subop->ops = scrub;
4231 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
4232 }
4233 }
4234 }
4235
4236 void PG::_scan_rollback_obs(
4237 const vector<ghobject_t> &rollback_obs,
4238 ThreadPool::TPHandle &handle)
4239 {
4240 ObjectStore::Transaction t;
4241 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4242 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4243 i != rollback_obs.end();
4244 ++i) {
4245 if (i->generation < trimmed_to.version) {
4246 osd->clog->error() << "osd." << osd->whoami
4247 << " pg " << info.pgid
4248 << " found obsolete rollback obj "
4249 << *i << " generation < trimmed_to "
4250 << trimmed_to
4251 << "...repaired";
4252 t.remove(coll, *i);
4253 }
4254 }
4255 if (!t.empty()) {
4256 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4257 << dendl;
4258 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4259 }
4260 }
4261
4262 void PG::_scan_snaps(ScrubMap &smap)
4263 {
4264 hobject_t head;
4265 SnapSet snapset;
4266
4267 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4268 // caller using clean_meta_map(), and it works properly.
4269 dout(20) << __func__ << " start" << dendl;
4270
4271 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4272 i != smap.objects.rend();
4273 ++i) {
4274 const hobject_t &hoid = i->first;
4275 ScrubMap::object &o = i->second;
4276
4277 dout(20) << __func__ << " " << hoid << dendl;
4278
4279 if (hoid.is_head() || hoid.is_snapdir()) {
4280 // parse the SnapSet
4281 bufferlist bl;
4282 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4283 continue;
4284 }
4285 bl.push_back(o.attrs[SS_ATTR]);
4286 auto p = bl.begin();
4287 try {
4288 ::decode(snapset, p);
4289 } catch(...) {
4290 continue;
4291 }
4292 head = hoid.get_head();
4293 // Make sure head_exists is correct for is_legacy() check
4294 if (hoid.is_head())
4295 snapset.head_exists = true;
4296 continue;
4297 }
4298 if (hoid.snap < CEPH_MAXSNAP) {
4299 // check and if necessary fix snap_mapper
4300 if (hoid.get_head() != head) {
4301 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4302 << dendl;
4303 continue;
4304 }
4305 set<snapid_t> obj_snaps;
4306 if (!snapset.is_legacy()) {
4307 auto p = snapset.clone_snaps.find(hoid.snap);
4308 if (p == snapset.clone_snaps.end()) {
4309 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4310 << dendl;
4311 continue;
4312 }
4313 obj_snaps.insert(p->second.begin(), p->second.end());
4314 } else {
4315 bufferlist bl;
4316 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4317 continue;
4318 }
4319 bl.push_back(o.attrs[OI_ATTR]);
4320 object_info_t oi;
4321 try {
4322 oi.decode(bl);
4323 } catch(...) {
4324 continue;
4325 }
4326 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4327 }
4328 set<snapid_t> cur_snaps;
4329 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4330 if (r != 0 && r != -ENOENT) {
4331 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4332 ceph_abort();
4333 }
4334 if (r == -ENOENT || cur_snaps != obj_snaps) {
4335 ObjectStore::Transaction t;
4336 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4337 if (r == 0) {
4338 r = snap_mapper.remove_oid(hoid, &_t);
4339 if (r != 0) {
4340 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4341 << dendl;
4342 ceph_abort();
4343 }
4344 osd->clog->error() << "osd." << osd->whoami
4345 << " found snap mapper error on pg "
4346 << info.pgid
4347 << " oid " << hoid << " snaps in mapper: "
4348 << cur_snaps << ", oi: "
4349 << obj_snaps
4350 << "...repaired";
4351 } else {
4352 osd->clog->error() << "osd." << osd->whoami
4353 << " found snap mapper error on pg "
4354 << info.pgid
4355 << " oid " << hoid << " snaps missing in mapper"
4356 << ", should be: "
4357 << obj_snaps
4358 << "...repaired";
4359 }
4360 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4361
4362 // wait for repair to apply to avoid confusing other bits of the system.
4363 {
4364 Cond my_cond;
4365 Mutex my_lock("PG::_scan_snaps my_lock");
4366 int r = 0;
4367 bool done;
4368 t.register_on_applied_sync(
4369 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4370 r = osd->store->apply_transaction(osr.get(), std::move(t));
4371 if (r != 0) {
4372 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4373 << dendl;
4374 } else {
4375 my_lock.Lock();
4376 while (!done)
4377 my_cond.Wait(my_lock);
4378 my_lock.Unlock();
4379 }
4380 }
4381 }
4382 }
4383 }
4384 }
4385
4386 void PG::_repair_oinfo_oid(ScrubMap &smap)
4387 {
4388 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4389 i != smap.objects.rend();
4390 ++i) {
4391 const hobject_t &hoid = i->first;
4392 ScrubMap::object &o = i->second;
4393
4394 bufferlist bl;
4395 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4396 continue;
4397 }
4398 bl.push_back(o.attrs[OI_ATTR]);
4399 object_info_t oi;
4400 try {
4401 oi.decode(bl);
4402 } catch(...) {
4403 continue;
4404 }
4405 if (oi.soid != hoid) {
4406 ObjectStore::Transaction t;
4407 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4408 osd->clog->error() << "osd." << osd->whoami
4409 << " found object info error on pg "
4410 << info.pgid
4411 << " oid " << hoid << " oid in object info: "
4412 << oi.soid
4413 << "...repaired";
4414 // Fix object info
4415 oi.soid = hoid;
4416 bl.clear();
4417 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4418
4419 bufferptr bp(bl.c_str(), bl.length());
4420 o.attrs[OI_ATTR] = bp;
4421
4422 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4423 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4424 if (r != 0) {
4425 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4426 << dendl;
4427 }
4428 }
4429 }
4430 }
4431 int PG::build_scrub_map_chunk(
4432 ScrubMap &map,
4433 ScrubMapBuilder &pos,
4434 hobject_t start,
4435 hobject_t end,
4436 bool deep,
4437 ThreadPool::TPHandle &handle)
4438 {
4439 dout(10) << __func__ << " [" << start << "," << end << ") "
4440 << " pos " << pos
4441 << dendl;
4442
4443 // start
4444 while (pos.empty()) {
4445 pos.deep = deep;
4446 map.valid_through = info.last_update;
4447 osr->flush();
4448
4449 // objects
4450 vector<ghobject_t> rollback_obs;
4451 pos.ret = get_pgbackend()->objects_list_range(
4452 start,
4453 end,
4454 0,
4455 &pos.ls,
4456 &rollback_obs);
4457 if (pos.ret < 0) {
4458 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4459 return pos.ret;
4460 }
4461 if (pos.ls.empty()) {
4462 break;
4463 }
4464 _scan_rollback_obs(rollback_obs, handle);
4465 pos.pos = 0;
4466 return -EINPROGRESS;
4467 }
4468
4469 // scan objects
4470 while (!pos.done()) {
4471 int r = get_pgbackend()->be_scan_list(map, pos);
4472 if (r == -EINPROGRESS) {
4473 return r;
4474 }
4475 }
4476
4477 // finish
4478 dout(20) << __func__ << " finishing" << dendl;
4479 assert(pos.done());
4480 _repair_oinfo_oid(map);
4481 if (!is_primary()) {
4482 ScrubMap for_meta_scrub;
4483 // In case we restarted smaller chunk, clear old data
4484 scrubber.cleaned_meta_map.clear_from(scrubber.start);
4485 scrubber.cleaned_meta_map.insert(map);
4486 scrubber.clean_meta_map(for_meta_scrub);
4487 _scan_snaps(for_meta_scrub);
4488 }
4489
4490 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
4491 << dendl;
4492 return 0;
4493 }
4494
4495 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4496 if (!store)
4497 return;
4498 struct OnComplete : Context {
4499 std::unique_ptr<Scrub::Store> store;
4500 OnComplete(
4501 std::unique_ptr<Scrub::Store> &&store)
4502 : store(std::move(store)) {}
4503 void finish(int) override {}
4504 };
4505 store->cleanup(t);
4506 t->register_on_complete(new OnComplete(std::move(store)));
4507 assert(!store);
4508 }
4509
4510 void PG::repair_object(
4511 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4512 pg_shard_t bad_peer)
4513 {
4514 list<pg_shard_t> op_shards;
4515 for (auto i : *ok_peers) {
4516 op_shards.push_back(i.second);
4517 }
4518 dout(10) << "repair_object " << soid << " bad_peer osd."
4519 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4520 ScrubMap::object &po = ok_peers->back().first;
4521 eversion_t v;
4522 bufferlist bv;
4523 bv.push_back(po.attrs[OI_ATTR]);
4524 object_info_t oi;
4525 try {
4526 bufferlist::iterator bliter = bv.begin();
4527 ::decode(oi, bliter);
4528 } catch (...) {
4529 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4530 assert(0);
4531 }
4532 if (bad_peer != primary) {
4533 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4534 } else {
4535 // We should only be scrubbing if the PG is clean.
4536 assert(waiting_for_unreadable_object.empty());
4537
4538 pg_log.missing_add(soid, oi.version, eversion_t());
4539
4540 pg_log.set_last_requested(0);
4541 dout(10) << __func__ << ": primary = " << primary << dendl;
4542 }
4543
4544 if (is_ec_pg() || bad_peer == primary) {
4545 // we'd better collect all shard for EC pg, and prepare good peers as the
4546 // source of pull in the case of replicated pg.
4547 missing_loc.add_missing(soid, oi.version, eversion_t());
4548 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4549 for (i = ok_peers->begin();
4550 i != ok_peers->end();
4551 ++i)
4552 missing_loc.add_location(soid, i->second);
4553 }
4554 }
4555
4556 /* replica_scrub
4557 *
4558 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4559 * for pushes to complete in case of recent recovery. Build a single
4560 * scrubmap of objects that are in the range [msg->start, msg->end).
4561 */
4562 void PG::replica_scrub(
4563 OpRequestRef op,
4564 ThreadPool::TPHandle &handle)
4565 {
4566 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4567 assert(!scrubber.active_rep_scrub);
4568 dout(7) << "replica_scrub" << dendl;
4569
4570 if (msg->map_epoch < info.history.same_interval_since) {
4571 dout(10) << "replica_scrub discarding old replica_scrub from "
4572 << msg->map_epoch << " < " << info.history.same_interval_since
4573 << dendl;
4574 return;
4575 }
4576
4577 assert(msg->chunky);
4578 if (last_update_applied < msg->scrub_to) {
4579 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4580 scrubber.active_rep_scrub = op;
4581 return;
4582 }
4583
4584 if (active_pushes > 0) {
4585 dout(10) << "waiting for active pushes to finish" << dendl;
4586 scrubber.active_rep_scrub = op;
4587 return;
4588 }
4589
4590 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
4591 scrubber.replica_scrub_start = msg->min_epoch;
4592 scrubber.start = msg->start;
4593 scrubber.end = msg->end;
4594 scrubber.max_end = msg->end;
4595 scrubber.deep = msg->deep;
4596 scrubber.epoch_start = info.history.same_interval_since;
4597 if (msg->priority) {
4598 scrubber.priority = msg->priority;
4599 } else {
4600 scrubber.priority = get_scrub_priority();
4601 }
4602
4603 scrub_can_preempt = msg->allow_preemption;
4604 scrub_preempted = false;
4605 scrubber.replica_scrubmap_pos.reset();
4606
4607 requeue_scrub(msg->high_priority);
4608 }
4609
4610 /* Scrub:
4611 * PG_STATE_SCRUBBING is set when the scrub is queued
4612 *
4613 * scrub will be chunky if all OSDs in PG support chunky scrub
4614 * scrub will fail if OSDs are too old.
4615 */
4616 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4617 {
4618 if (cct->_conf->osd_scrub_sleep > 0 &&
4619 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4620 scrubber.state == PG::Scrubber::INACTIVE) &&
4621 scrubber.needs_sleep) {
4622 ceph_assert(!scrubber.sleeping);
4623 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4624
4625 // Do an async sleep so we don't block the op queue
4626 OSDService *osds = osd;
4627 spg_t pgid = get_pgid();
4628 int state = scrubber.state;
4629 auto scrub_requeue_callback =
4630 new FunctionContext([osds, pgid, state](int r) {
4631 PG *pg = osds->osd->lookup_lock_pg(pgid);
4632 if (pg == nullptr) {
4633 lgeneric_dout(osds->osd->cct, 20)
4634 << "scrub_requeue_callback: Could not find "
4635 << "PG " << pgid << " can't complete scrub requeue after sleep"
4636 << dendl;
4637 return;
4638 }
4639 pg->scrubber.sleeping = false;
4640 pg->scrubber.needs_sleep = false;
4641 lgeneric_dout(pg->cct, 20)
4642 << "scrub_requeue_callback: slept for "
4643 << ceph_clock_now() - pg->scrubber.sleep_start
4644 << ", re-queuing scrub with state " << state << dendl;
4645 pg->scrub_queued = false;
4646 pg->requeue_scrub();
4647 pg->scrubber.sleep_start = utime_t();
4648 pg->unlock();
4649 });
4650 Mutex::Locker l(osd->scrub_sleep_lock);
4651 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4652 scrub_requeue_callback);
4653 scrubber.sleeping = true;
4654 scrubber.sleep_start = ceph_clock_now();
4655 return;
4656 }
4657 if (pg_has_reset_since(queued)) {
4658 return;
4659 }
4660 assert(scrub_queued);
4661 scrub_queued = false;
4662 scrubber.needs_sleep = true;
4663
4664 // for the replica
4665 if (!is_primary() &&
4666 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
4667 chunky_scrub(handle);
4668 return;
4669 }
4670
4671 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4672 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4673 state_clear(PG_STATE_SCRUBBING);
4674 state_clear(PG_STATE_REPAIR);
4675 state_clear(PG_STATE_DEEP_SCRUB);
4676 publish_stats_to_osd();
4677 return;
4678 }
4679
4680 if (!scrubber.active) {
4681 assert(backfill_targets.empty());
4682
4683 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4684
4685 dout(10) << "starting a new chunky scrub" << dendl;
4686 }
4687
4688 chunky_scrub(handle);
4689 }
4690
4691 /*
4692 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4693 * chunk.
4694 *
4695 * The object store is partitioned into chunks which end on hash boundaries. For
4696 * each chunk, the following logic is performed:
4697 *
4698 * (1) Block writes on the chunk
4699 * (2) Request maps from replicas
4700 * (3) Wait for pushes to be applied (after recovery)
4701 * (4) Wait for writes to flush on the chunk
4702 * (5) Wait for maps from replicas
4703 * (6) Compare / repair all scrub maps
4704 * (7) Wait for digest updates to apply
4705 *
4706 * This logic is encoded in the mostly linear state machine:
4707 *
4708 * +------------------+
4709 * _________v__________ |
4710 * | | |
4711 * | INACTIVE | |
4712 * |____________________| |
4713 * | |
4714 * | +----------+ |
4715 * _________v___v______ | |
4716 * | | | |
4717 * | NEW_CHUNK | | |
4718 * |____________________| | |
4719 * | | |
4720 * _________v__________ | |
4721 * | | | |
4722 * | WAIT_PUSHES | | |
4723 * |____________________| | |
4724 * | | |
4725 * _________v__________ | |
4726 * | | | |
4727 * | WAIT_LAST_UPDATE | | |
4728 * |____________________| | |
4729 * | | |
4730 * _________v__________ | |
4731 * | | | |
4732 * | BUILD_MAP | | |
4733 * |____________________| | |
4734 * | | |
4735 * _________v__________ | |
4736 * | | | |
4737 * | WAIT_REPLICAS | | |
4738 * |____________________| | |
4739 * | | |
4740 * _________v__________ | |
4741 * | | | |
4742 * | COMPARE_MAPS | | |
4743 * |____________________| | |
4744 * | | |
4745 * | | |
4746 * _________v__________ | |
4747 * | | | |
4748 * |WAIT_DIGEST_UPDATES | | |
4749 * |____________________| | |
4750 * | | | |
4751 * | +----------+ |
4752 * _________v__________ |
4753 * | | |
4754 * | FINISH | |
4755 * |____________________| |
4756 * | |
4757 * +------------------+
4758 *
4759 * The primary determines the last update from the subset by walking the log. If
4760 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4761 * to wait until that update is applied before building a scrub map. Both the
4762 * primary and replicas will wait for any active pushes to be applied.
4763 *
4764 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4765 *
4766 * scrubber.state encodes the current state of the scrub (refer to state diagram
4767 * for details).
4768 */
4769 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4770 {
4771 // check for map changes
4772 if (scrubber.is_chunky_scrub_active()) {
4773 if (scrubber.epoch_start != info.history.same_interval_since) {
4774 dout(10) << "scrub pg changed, aborting" << dendl;
4775 scrub_clear_state();
4776 scrub_unreserve_replicas();
4777 return;
4778 }
4779 }
4780
4781 bool done = false;
4782 int ret;
4783
4784 while (!done) {
4785 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4786 << " [" << scrubber.start << "," << scrubber.end << ")"
4787 << " max_end " << scrubber.max_end << dendl;
4788
4789 switch (scrubber.state) {
4790 case PG::Scrubber::INACTIVE:
4791 dout(10) << "scrub start" << dendl;
4792 assert(is_primary());
4793
4794 publish_stats_to_osd();
4795 scrubber.epoch_start = info.history.same_interval_since;
4796 scrubber.active = true;
4797
4798 osd->inc_scrubs_active(scrubber.reserved);
4799 if (scrubber.reserved) {
4800 scrubber.reserved = false;
4801 scrubber.reserved_peers.clear();
4802 }
4803
4804 {
4805 ObjectStore::Transaction t;
4806 scrubber.cleanup_store(&t);
4807 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4808 info.pgid, coll));
4809 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4810 }
4811
4812 // Don't include temporary objects when scrubbing
4813 scrubber.start = info.pgid.pgid.get_hobj_start();
4814 scrubber.state = PG::Scrubber::NEW_CHUNK;
4815
4816 {
4817 bool repair = state_test(PG_STATE_REPAIR);
4818 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4819 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4820 stringstream oss;
4821 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4822 osd->clog->debug(oss);
4823 }
4824
4825 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
4826 "osd_scrub_max_preemptions");
4827 scrubber.preempt_divisor = 1;
4828 break;
4829
4830 case PG::Scrubber::NEW_CHUNK:
4831 scrubber.primary_scrubmap = ScrubMap();
4832 scrubber.received_maps.clear();
4833
4834 // begin (possible) preemption window
4835 if (scrub_preempted) {
4836 scrubber.preempt_left--;
4837 scrubber.preempt_divisor *= 2;
4838 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
4839 << " left" << dendl;
4840 scrub_preempted = false;
4841 }
4842 scrub_can_preempt = scrubber.preempt_left > 0;
4843
4844 {
4845 /* get the start and end of our scrub chunk
4846 *
4847 * Our scrub chunk has an important restriction we're going to need to
4848 * respect. We can't let head or snapdir be start or end.
4849 * Using a half-open interval means that if end == head|snapdir,
4850 * we'd scrub/lock head and the clone right next to head in different
4851 * chunks which would allow us to miss clones created between
4852 * scrubbing that chunk and scrubbing the chunk including head.
4853 * This isn't true for any of the other clones since clones can
4854 * only be created "just to the left of" head. There is one exception
4855 * to this: promotion of clones which always happens to the left of the
4856 * left-most clone, but promote_object checks the scrubber in that
4857 * case, so it should be ok. Also, it's ok to "miss" clones at the
4858 * left end of the range if we are a tier because they may legitimately
4859 * not exist (see _scrub).
4860 */
4861 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
4862 scrubber.preempt_divisor);
4863 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
4864 scrubber.preempt_divisor);
4865 hobject_t start = scrubber.start;
4866 hobject_t candidate_end;
4867 vector<hobject_t> objects;
4868 osr->flush();
4869 ret = get_pgbackend()->objects_list_partial(
4870 start,
4871 min,
4872 max,
4873 &objects,
4874 &candidate_end);
4875 assert(ret >= 0);
4876
4877 if (!objects.empty()) {
4878 hobject_t back = objects.back();
4879 while (candidate_end.has_snapset() &&
4880 candidate_end.get_head() == back.get_head()) {
4881 candidate_end = back;
4882 objects.pop_back();
4883 if (objects.empty()) {
4884 assert(0 ==
4885 "Somehow we got more than 2 objects which"
4886 "have the same head but are not clones");
4887 }
4888 back = objects.back();
4889 }
4890 if (candidate_end.has_snapset()) {
4891 assert(candidate_end.get_head() != back.get_head());
4892 candidate_end = candidate_end.get_object_boundary();
4893 }
4894 } else {
4895 assert(candidate_end.is_max());
4896 }
4897
4898 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4899 // we'll be requeued by whatever made us unavailable for scrub
4900 dout(10) << __func__ << ": scrub blocked somewhere in range "
4901 << "[" << scrubber.start << ", " << candidate_end << ")"
4902 << dendl;
4903 done = true;
4904 break;
4905 }
4906 scrubber.end = candidate_end;
4907 if (scrubber.end > scrubber.max_end)
4908 scrubber.max_end = scrubber.end;
4909 }
4910
4911 // walk the log to find the latest update that affects our chunk
4912 scrubber.subset_last_update = eversion_t();
4913 for (auto p = projected_log.log.rbegin();
4914 p != projected_log.log.rend();
4915 ++p) {
4916 if (p->soid >= scrubber.start &&
4917 p->soid < scrubber.end) {
4918 scrubber.subset_last_update = p->version;
4919 break;
4920 }
4921 }
4922 if (scrubber.subset_last_update == eversion_t()) {
4923 for (list<pg_log_entry_t>::const_reverse_iterator p =
4924 pg_log.get_log().log.rbegin();
4925 p != pg_log.get_log().log.rend();
4926 ++p) {
4927 if (p->soid >= scrubber.start &&
4928 p->soid < scrubber.end) {
4929 scrubber.subset_last_update = p->version;
4930 break;
4931 }
4932 }
4933 }
4934
4935 // ask replicas to wait until
4936 // last_update_applied >= scrubber.subset_last_update and then scan
4937 scrubber.waiting_on_whom.insert(pg_whoami);
4938
4939 // request maps from replicas
4940 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4941 i != actingbackfill.end();
4942 ++i) {
4943 if (*i == pg_whoami) continue;
4944 _request_scrub_map(*i, scrubber.subset_last_update,
4945 scrubber.start, scrubber.end, scrubber.deep,
4946 scrubber.preempt_left > 0);
4947 scrubber.waiting_on_whom.insert(*i);
4948 }
4949 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
4950 << dendl;
4951
4952 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4953 break;
4954
4955 case PG::Scrubber::WAIT_PUSHES:
4956 if (active_pushes == 0) {
4957 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4958 } else {
4959 dout(15) << "wait for pushes to apply" << dendl;
4960 done = true;
4961 }
4962 break;
4963
4964 case PG::Scrubber::WAIT_LAST_UPDATE:
4965 if (last_update_applied < scrubber.subset_last_update) {
4966 // will be requeued by op_applied
4967 dout(15) << "wait for writes to flush" << dendl;
4968 done = true;
4969 break;
4970 }
4971
4972 scrubber.state = PG::Scrubber::BUILD_MAP;
4973 scrubber.primary_scrubmap_pos.reset();
4974 break;
4975
4976 case PG::Scrubber::BUILD_MAP:
4977 assert(last_update_applied >= scrubber.subset_last_update);
4978
4979 // build my own scrub map
4980 if (scrub_preempted) {
4981 dout(10) << __func__ << " preempted" << dendl;
4982 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
4983 break;
4984 }
4985 ret = build_scrub_map_chunk(
4986 scrubber.primary_scrubmap,
4987 scrubber.primary_scrubmap_pos,
4988 scrubber.start, scrubber.end,
4989 scrubber.deep,
4990 handle);
4991 if (ret == -EINPROGRESS) {
4992 requeue_scrub();
4993 done = true;
4994 break;
4995 }
4996 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
4997 break;
4998
4999 case PG::Scrubber::BUILD_MAP_DONE:
5000 if (scrubber.primary_scrubmap_pos.ret < 0) {
5001 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5002 << ", aborting" << dendl;
5003 scrub_clear_state();
5004 scrub_unreserve_replicas();
5005 return;
5006 }
5007 dout(10) << __func__ << " waiting_on_whom was "
5008 << scrubber.waiting_on_whom << dendl;
5009 assert(scrubber.waiting_on_whom.count(pg_whoami));
5010 scrubber.waiting_on_whom.erase(pg_whoami);
5011
5012 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5013 break;
5014
5015 case PG::Scrubber::WAIT_REPLICAS:
5016 if (!scrubber.waiting_on_whom.empty()) {
5017 // will be requeued by sub_op_scrub_map
5018 dout(10) << "wait for replicas to build scrub map" << dendl;
5019 done = true;
5020 break;
5021 }
5022 // end (possible) preemption window
5023 scrub_can_preempt = false;
5024 if (scrub_preempted) {
5025 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5026 scrubber.state = PG::Scrubber::NEW_CHUNK;
5027 } else {
5028 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5029 }
5030 break;
5031
5032 case PG::Scrubber::COMPARE_MAPS:
5033 assert(last_update_applied >= scrubber.subset_last_update);
5034 assert(scrubber.waiting_on_whom.empty());
5035
5036 scrub_compare_maps();
5037 scrubber.start = scrubber.end;
5038 scrubber.run_callbacks();
5039
5040 // requeue the writes from the chunk that just finished
5041 requeue_ops(waiting_for_scrub);
5042
5043 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5044
5045 // fall-thru
5046
5047 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5048 if (scrubber.num_digest_updates_pending) {
5049 dout(10) << __func__ << " waiting on "
5050 << scrubber.num_digest_updates_pending
5051 << " digest updates" << dendl;
5052 done = true;
5053 break;
5054 }
5055
5056 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
5057 "osd_scrub_max_preemptions");
5058 scrubber.preempt_divisor = 1;
5059
5060 if (!(scrubber.end.is_max())) {
5061 scrubber.state = PG::Scrubber::NEW_CHUNK;
5062 requeue_scrub();
5063 done = true;
5064 } else {
5065 scrubber.state = PG::Scrubber::FINISH;
5066 }
5067
5068 break;
5069
5070 case PG::Scrubber::FINISH:
5071 scrub_finish();
5072 scrubber.state = PG::Scrubber::INACTIVE;
5073 done = true;
5074
5075 if (!snap_trimq.empty()) {
5076 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5077 snap_trimmer_scrub_complete();
5078 }
5079
5080 break;
5081
5082 case PG::Scrubber::BUILD_MAP_REPLICA:
5083 // build my own scrub map
5084 if (scrub_preempted) {
5085 dout(10) << __func__ << " preempted" << dendl;
5086 ret = 0;
5087 } else {
5088 ret = build_scrub_map_chunk(
5089 scrubber.replica_scrubmap,
5090 scrubber.replica_scrubmap_pos,
5091 scrubber.start, scrubber.end,
5092 scrubber.deep,
5093 handle);
5094 }
5095 if (ret == -EINPROGRESS) {
5096 requeue_scrub();
5097 done = true;
5098 break;
5099 }
5100 // reply
5101 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
5102 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5103 spg_t(info.pgid.pgid, get_primary().shard),
5104 scrubber.replica_scrub_start,
5105 pg_whoami);
5106 reply->preempted = scrub_preempted;
5107 ::encode(scrubber.replica_scrubmap, reply->get_data());
5108 osd->send_message_osd_cluster(
5109 get_primary().osd, reply,
5110 scrubber.replica_scrub_start);
5111 } else {
5112 // for jewel compatibility
5113 vector<OSDOp> scrub(1);
5114 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
5115 hobject_t poid;
5116 eversion_t v;
5117 osd_reqid_t reqid;
5118 MOSDSubOp *subop = new MOSDSubOp(
5119 reqid,
5120 pg_whoami,
5121 spg_t(info.pgid.pgid, get_primary().shard),
5122 poid,
5123 0,
5124 scrubber.replica_scrub_start,
5125 osd->get_tid(),
5126 v);
5127 ::encode(scrubber.replica_scrubmap, subop->get_data());
5128 subop->ops = scrub;
5129 osd->send_message_osd_cluster(
5130 get_primary().osd, subop,
5131 scrubber.replica_scrub_start);
5132 }
5133 scrub_preempted = false;
5134 scrub_can_preempt = false;
5135 scrubber.state = PG::Scrubber::INACTIVE;
5136 scrubber.replica_scrubmap = ScrubMap();
5137 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5138 scrubber.start = hobject_t();
5139 scrubber.end = hobject_t();
5140 scrubber.max_end = hobject_t();
5141 done = true;
5142 break;
5143
5144 default:
5145 ceph_abort();
5146 }
5147 }
5148 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5149 << " [" << scrubber.start << "," << scrubber.end << ")"
5150 << " max_end " << scrubber.max_end << dendl;
5151 }
5152
5153 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5154 {
5155 if (soid < scrubber.start || soid >= scrubber.end) {
5156 return false;
5157 }
5158 if (scrub_can_preempt) {
5159 if (!scrub_preempted) {
5160 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5161 scrub_preempted = true;
5162 } else {
5163 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5164 }
5165 return false;
5166 }
5167 return true;
5168 }
5169
5170 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5171 {
5172 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5173 return (start < scrubber.max_end &&
5174 end >= scrubber.start);
5175 }
5176
5177 void PG::scrub_clear_state()
5178 {
5179 assert(is_locked());
5180 state_clear(PG_STATE_SCRUBBING);
5181 state_clear(PG_STATE_REPAIR);
5182 state_clear(PG_STATE_DEEP_SCRUB);
5183 publish_stats_to_osd();
5184
5185 // active -> nothing.
5186 if (scrubber.active)
5187 osd->dec_scrubs_active();
5188
5189 requeue_ops(waiting_for_scrub);
5190
5191 scrubber.reset();
5192
5193 // type-specific state clear
5194 _scrub_clear_state();
5195 }
5196
5197 void PG::scrub_compare_maps()
5198 {
5199 dout(10) << __func__ << " has maps, analyzing" << dendl;
5200
5201 // construct authoritative scrub map for type specific scrubbing
5202 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5203 map<hobject_t,
5204 pair<boost::optional<uint32_t>,
5205 boost::optional<uint32_t>>> missing_digest;
5206
5207 map<pg_shard_t, ScrubMap *> maps;
5208 maps[pg_whoami] = &scrubber.primary_scrubmap;
5209
5210 for (const auto& i : actingbackfill) {
5211 if (i == pg_whoami) continue;
5212 dout(2) << __func__ << " replica " << i << " has "
5213 << scrubber.received_maps[i].objects.size()
5214 << " items" << dendl;
5215 maps[i] = &scrubber.received_maps[i];
5216 }
5217
5218 set<hobject_t> master_set;
5219
5220 // Construct master set
5221 for (const auto map : maps) {
5222 for (const auto i : map.second->objects) {
5223 master_set.insert(i.first);
5224 }
5225 }
5226
5227 stringstream ss;
5228 get_pgbackend()->be_large_omap_check(maps, master_set,
5229 scrubber.large_omap_objects, ss);
5230 if (!ss.str().empty()) {
5231 osd->clog->warn(ss);
5232 }
5233
5234 if (acting.size() > 1) {
5235 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5236
5237 // Map from object with errors to good peer
5238 map<hobject_t, list<pg_shard_t>> authoritative;
5239
5240 dout(2) << __func__ << " osd." << acting[0] << " has "
5241 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5242
5243 ss.str("");
5244 ss.clear();
5245
5246 get_pgbackend()->be_compare_scrubmaps(
5247 maps,
5248 master_set,
5249 state_test(PG_STATE_REPAIR),
5250 scrubber.missing,
5251 scrubber.inconsistent,
5252 authoritative,
5253 missing_digest,
5254 scrubber.shallow_errors,
5255 scrubber.deep_errors,
5256 scrubber.store.get(),
5257 info.pgid, acting,
5258 ss);
5259 dout(2) << ss.str() << dendl;
5260
5261 if (!ss.str().empty()) {
5262 osd->clog->error(ss);
5263 }
5264
5265 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5266 i != authoritative.end();
5267 ++i) {
5268 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5269 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5270 j != i->second.end();
5271 ++j) {
5272 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5273 }
5274 scrubber.authoritative.insert(
5275 make_pair(
5276 i->first,
5277 good_peers));
5278 }
5279
5280 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5281 i != authoritative.end();
5282 ++i) {
5283 scrubber.cleaned_meta_map.objects.erase(i->first);
5284 scrubber.cleaned_meta_map.objects.insert(
5285 *(maps[i->second.back()]->objects.find(i->first))
5286 );
5287 }
5288 }
5289
5290 ScrubMap for_meta_scrub;
5291 scrubber.clean_meta_map(for_meta_scrub);
5292
5293 // ok, do the pg-type specific scrubbing
5294 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5295 // Called here on the primary can use an authoritative map if it isn't the primary
5296 _scan_snaps(for_meta_scrub);
5297 if (!scrubber.store->empty()) {
5298 if (state_test(PG_STATE_REPAIR)) {
5299 dout(10) << __func__ << ": discarding scrub results" << dendl;
5300 scrubber.store->flush(nullptr);
5301 } else {
5302 dout(10) << __func__ << ": updating scrub object" << dendl;
5303 ObjectStore::Transaction t;
5304 scrubber.store->flush(&t);
5305 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
5306 }
5307 }
5308 }
5309
5310 bool PG::scrub_process_inconsistent()
5311 {
5312 dout(10) << __func__ << ": checking authoritative" << dendl;
5313 bool repair = state_test(PG_STATE_REPAIR);
5314 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5315 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5316
5317 // authoriative only store objects which missing or inconsistent.
5318 if (!scrubber.authoritative.empty()) {
5319 stringstream ss;
5320 ss << info.pgid << " " << mode << " "
5321 << scrubber.missing.size() << " missing, "
5322 << scrubber.inconsistent.size() << " inconsistent objects";
5323 dout(2) << ss.str() << dendl;
5324 osd->clog->error(ss);
5325 if (repair) {
5326 state_clear(PG_STATE_CLEAN);
5327 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5328 scrubber.authoritative.begin();
5329 i != scrubber.authoritative.end();
5330 ++i) {
5331 set<pg_shard_t>::iterator j;
5332
5333 auto missing_entry = scrubber.missing.find(i->first);
5334 if (missing_entry != scrubber.missing.end()) {
5335 for (j = missing_entry->second.begin();
5336 j != missing_entry->second.end();
5337 ++j) {
5338 repair_object(
5339 i->first,
5340 &(i->second),
5341 *j);
5342 ++scrubber.fixed;
5343 }
5344 }
5345 if (scrubber.inconsistent.count(i->first)) {
5346 for (j = scrubber.inconsistent[i->first].begin();
5347 j != scrubber.inconsistent[i->first].end();
5348 ++j) {
5349 repair_object(i->first,
5350 &(i->second),
5351 *j);
5352 ++scrubber.fixed;
5353 }
5354 }
5355 }
5356 }
5357 }
5358 return (!scrubber.authoritative.empty() && repair);
5359 }
5360
5361 bool PG::ops_blocked_by_scrub() const {
5362 return (waiting_for_scrub.size() != 0);
5363 }
5364
5365 // the part that actually finalizes a scrub
5366 void PG::scrub_finish()
5367 {
5368 bool repair = state_test(PG_STATE_REPAIR);
5369 // if the repair request comes from auto-repair and large number of errors,
5370 // we would like to cancel auto-repair
5371 if (repair && scrubber.auto_repair
5372 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5373 state_clear(PG_STATE_REPAIR);
5374 repair = false;
5375 }
5376 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5377 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5378
5379 // type-specific finish (can tally more errors)
5380 _scrub_finish();
5381
5382 bool has_error = scrub_process_inconsistent();
5383
5384 {
5385 stringstream oss;
5386 oss << info.pgid.pgid << " " << mode << " ";
5387 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5388 if (total_errors)
5389 oss << total_errors << " errors";
5390 else
5391 oss << "ok";
5392 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5393 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5394 << " remaining deep scrub error details lost)";
5395 if (repair)
5396 oss << ", " << scrubber.fixed << " fixed";
5397 if (total_errors)
5398 osd->clog->error(oss);
5399 else
5400 osd->clog->debug(oss);
5401 }
5402
5403 // finish up
5404 unreg_next_scrub();
5405 utime_t now = ceph_clock_now();
5406 info.history.last_scrub = info.last_update;
5407 info.history.last_scrub_stamp = now;
5408 if (scrubber.deep) {
5409 info.history.last_deep_scrub = info.last_update;
5410 info.history.last_deep_scrub_stamp = now;
5411 }
5412 // Since we don't know which errors were fixed, we can only clear them
5413 // when every one has been fixed.
5414 if (repair) {
5415 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5416 assert(deep_scrub);
5417 scrubber.shallow_errors = scrubber.deep_errors = 0;
5418 } else {
5419 // Deep scrub in order to get corrected error counts
5420 scrub_after_recovery = true;
5421 }
5422 }
5423 if (deep_scrub) {
5424 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5425 info.history.last_clean_scrub_stamp = now;
5426 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5427 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5428 info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
5429 } else {
5430 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5431 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5432 // because of deep-scrub errors
5433 if (scrubber.shallow_errors == 0)
5434 info.history.last_clean_scrub_stamp = now;
5435 }
5436 info.stats.stats.sum.num_scrub_errors =
5437 info.stats.stats.sum.num_shallow_scrub_errors +
5438 info.stats.stats.sum.num_deep_scrub_errors;
5439 reg_next_scrub();
5440
5441 {
5442 ObjectStore::Transaction t;
5443 dirty_info = true;
5444 write_if_dirty(t);
5445 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5446 assert(tr == 0);
5447 }
5448
5449
5450 if (has_error) {
5451 queue_peering_event(
5452 CephPeeringEvtRef(
5453 std::make_shared<CephPeeringEvt>(
5454 get_osdmap()->get_epoch(),
5455 get_osdmap()->get_epoch(),
5456 DoRecovery())));
5457 }
5458
5459 scrub_clear_state();
5460 scrub_unreserve_replicas();
5461
5462 if (is_active() && is_primary()) {
5463 share_pg_info();
5464 }
5465 }
5466
5467 void PG::share_pg_info()
5468 {
5469 dout(10) << "share_pg_info" << dendl;
5470
5471 // share new pg_info_t with replicas
5472 assert(!actingbackfill.empty());
5473 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5474 i != actingbackfill.end();
5475 ++i) {
5476 if (*i == pg_whoami) continue;
5477 pg_shard_t peer = *i;
5478 if (peer_info.count(peer)) {
5479 peer_info[peer].last_epoch_started = info.last_epoch_started;
5480 peer_info[peer].last_interval_started = info.last_interval_started;
5481 peer_info[peer].history.merge(info.history);
5482 }
5483 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5484 m->pg_list.push_back(
5485 make_pair(
5486 pg_notify_t(
5487 peer.shard, pg_whoami.shard,
5488 get_osdmap()->get_epoch(),
5489 get_osdmap()->get_epoch(),
5490 info),
5491 PastIntervals()));
5492 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5493 }
5494 }
5495
5496 bool PG::append_log_entries_update_missing(
5497 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5498 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5499 boost::optional<eversion_t> roll_forward_to)
5500 {
5501 assert(!entries.empty());
5502 assert(entries.begin()->version > info.last_update);
5503
5504 PGLogEntryHandler rollbacker{this, &t};
5505 bool invalidate_stats =
5506 pg_log.append_new_log_entries(info.last_backfill,
5507 info.last_backfill_bitwise,
5508 entries,
5509 &rollbacker);
5510
5511 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5512 pg_log.roll_forward(&rollbacker);
5513 }
5514 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5515 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5516 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5517 }
5518
5519 info.last_update = pg_log.get_head();
5520
5521 if (pg_log.get_missing().num_missing() == 0) {
5522 // advance last_complete since nothing else is missing!
5523 info.last_complete = info.last_update;
5524 }
5525 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5526
5527 dout(20) << __func__ << "trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5528 if (trim_to)
5529 pg_log.trim(*trim_to, info);
5530 dirty_info = true;
5531 write_if_dirty(t);
5532 return invalidate_stats;
5533 }
5534
5535
5536 void PG::merge_new_log_entries(
5537 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5538 ObjectStore::Transaction &t,
5539 boost::optional<eversion_t> trim_to,
5540 boost::optional<eversion_t> roll_forward_to)
5541 {
5542 dout(10) << __func__ << " " << entries << dendl;
5543 assert(is_primary());
5544
5545 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5546 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5547 i != actingbackfill.end();
5548 ++i) {
5549 pg_shard_t peer(*i);
5550 if (peer == pg_whoami) continue;
5551 assert(peer_missing.count(peer));
5552 assert(peer_info.count(peer));
5553 pg_missing_t& pmissing(peer_missing[peer]);
5554 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5555 pg_info_t& pinfo(peer_info[peer]);
5556 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5557 pinfo.last_backfill,
5558 info.last_backfill_bitwise,
5559 entries,
5560 true,
5561 NULL,
5562 pmissing,
5563 NULL,
5564 this);
5565 pinfo.last_update = info.last_update;
5566 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5567 rebuild_missing = rebuild_missing || invalidate_stats;
5568 }
5569
5570 if (!rebuild_missing) {
5571 return;
5572 }
5573
5574 for (auto &&i: entries) {
5575 missing_loc.rebuild(
5576 i.soid,
5577 pg_whoami,
5578 actingbackfill,
5579 info,
5580 pg_log.get_missing(),
5581 peer_missing,
5582 peer_info);
5583 }
5584 }
5585
5586 void PG::update_history(const pg_history_t& new_history)
5587 {
5588 unreg_next_scrub();
5589 if (info.history.merge(new_history)) {
5590 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5591 dirty_info = true;
5592 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5593 dout(20) << __func__ << " clearing past_intervals" << dendl;
5594 past_intervals.clear();
5595 dirty_big_info = true;
5596 }
5597 }
5598 reg_next_scrub();
5599 }
5600
5601 void PG::fulfill_info(
5602 pg_shard_t from, const pg_query_t &query,
5603 pair<pg_shard_t, pg_info_t> &notify_info)
5604 {
5605 assert(from == primary);
5606 assert(query.type == pg_query_t::INFO);
5607
5608 // info
5609 dout(10) << "sending info" << dendl;
5610 notify_info = make_pair(from, info);
5611 }
5612
5613 void PG::fulfill_log(
5614 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5615 {
5616 dout(10) << "log request from " << from << dendl;
5617 assert(from == primary);
5618 assert(query.type != pg_query_t::INFO);
5619 ConnectionRef con = osd->get_con_osd_cluster(
5620 from.osd, get_osdmap()->get_epoch());
5621 if (!con) return;
5622
5623 MOSDPGLog *mlog = new MOSDPGLog(
5624 from.shard, pg_whoami.shard,
5625 get_osdmap()->get_epoch(),
5626 info, query_epoch);
5627 mlog->missing = pg_log.get_missing();
5628
5629 // primary -> other, when building master log
5630 if (query.type == pg_query_t::LOG) {
5631 dout(10) << " sending info+missing+log since " << query.since
5632 << dendl;
5633 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5634 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5635 << " when my log.tail is " << pg_log.get_tail()
5636 << ", sending full log instead";
5637 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5638 } else
5639 mlog->log.copy_after(pg_log.get_log(), query.since);
5640 }
5641 else if (query.type == pg_query_t::FULLLOG) {
5642 dout(10) << " sending info+missing+full log" << dendl;
5643 mlog->log = pg_log.get_log();
5644 }
5645
5646 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5647
5648 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5649 osd->send_message_osd_cluster(mlog, con.get());
5650 }
5651
5652 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
5653 {
5654 if (query.query.type == pg_query_t::INFO) {
5655 pair<pg_shard_t, pg_info_t> notify_info;
5656 update_history(query.query.history);
5657 fulfill_info(query.from, query.query, notify_info);
5658 rctx->send_notify(
5659 notify_info.first,
5660 pg_notify_t(
5661 notify_info.first.shard, pg_whoami.shard,
5662 query.query_epoch,
5663 get_osdmap()->get_epoch(),
5664 notify_info.second),
5665 past_intervals);
5666 } else {
5667 update_history(query.query.history);
5668 fulfill_log(query.from, query.query, query.query_epoch);
5669 }
5670 }
5671
5672 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5673 {
5674 bool changed = false;
5675 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5676 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5677 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5678 changed = true;
5679 }
5680 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5681 assert(pi);
5682 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5683 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5684 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5685 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5686 changed = true;
5687 }
5688 }
5689 if (changed) {
5690 info.history.last_epoch_marked_full = osdmap->get_epoch();
5691 dirty_info = true;
5692 }
5693 }
5694
5695 bool PG::should_restart_peering(
5696 int newupprimary,
5697 int newactingprimary,
5698 const vector<int>& newup,
5699 const vector<int>& newacting,
5700 OSDMapRef lastmap,
5701 OSDMapRef osdmap)
5702 {
5703 if (PastIntervals::is_new_interval(
5704 primary.osd,
5705 newactingprimary,
5706 acting,
5707 newacting,
5708 up_primary.osd,
5709 newupprimary,
5710 up,
5711 newup,
5712 osdmap,
5713 lastmap,
5714 info.pgid.pgid)) {
5715 dout(20) << "new interval newup " << newup
5716 << " newacting " << newacting << dendl;
5717 return true;
5718 }
5719 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
5720 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
5721 return true;
5722 }
5723 return false;
5724 }
5725
5726 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5727 {
5728 if (last_peering_reset > reply_epoch ||
5729 last_peering_reset > query_epoch) {
5730 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5731 << " last_peering_reset " << last_peering_reset
5732 << dendl;
5733 return true;
5734 }
5735 return false;
5736 }
5737
5738 void PG::set_last_peering_reset()
5739 {
5740 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5741 if (last_peering_reset != get_osdmap()->get_epoch()) {
5742 last_peering_reset = get_osdmap()->get_epoch();
5743 reset_interval_flush();
5744 }
5745 }
5746
5747 struct FlushState {
5748 PGRef pg;
5749 epoch_t epoch;
5750 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5751 ~FlushState() {
5752 pg->lock();
5753 if (!pg->pg_has_reset_since(epoch))
5754 pg->queue_flushed(epoch);
5755 pg->unlock();
5756 }
5757 };
5758 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5759
5760 void PG::start_flush(ObjectStore::Transaction *t,
5761 list<Context *> *on_applied,
5762 list<Context *> *on_safe)
5763 {
5764 // flush in progress ops
5765 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5766 this, get_osdmap()->get_epoch()));
5767 t->nop();
5768 flushes_in_progress++;
5769 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5770 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5771 }
5772
5773 void PG::reset_interval_flush()
5774 {
5775 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5776 recovery_state.clear_blocked_outgoing();
5777
5778 Context *c = new QueuePeeringEvt<IntervalFlush>(
5779 this, get_osdmap()->get_epoch(), IntervalFlush());
5780 if (!osr->flush_commit(c)) {
5781 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5782 recovery_state.begin_block_outgoing();
5783 } else {
5784 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5785 delete c;
5786 }
5787 }
5788
5789 /* Called before initializing peering during advance_map */
5790 void PG::start_peering_interval(
5791 const OSDMapRef lastmap,
5792 const vector<int>& newup, int new_up_primary,
5793 const vector<int>& newacting, int new_acting_primary,
5794 ObjectStore::Transaction *t)
5795 {
5796 const OSDMapRef osdmap = get_osdmap();
5797
5798 set_last_peering_reset();
5799
5800 vector<int> oldacting, oldup;
5801 int oldrole = get_role();
5802
5803 unreg_next_scrub();
5804
5805 pg_shard_t old_acting_primary = get_primary();
5806 pg_shard_t old_up_primary = up_primary;
5807 bool was_old_primary = is_primary();
5808 bool was_old_replica = is_replica();
5809
5810 acting.swap(oldacting);
5811 up.swap(oldup);
5812 init_primary_up_acting(
5813 newup,
5814 newacting,
5815 new_up_primary,
5816 new_acting_primary);
5817
5818 if (info.stats.up != up ||
5819 info.stats.acting != acting ||
5820 info.stats.up_primary != new_up_primary ||
5821 info.stats.acting_primary != new_acting_primary) {
5822 info.stats.up = up;
5823 info.stats.up_primary = new_up_primary;
5824 info.stats.acting = acting;
5825 info.stats.acting_primary = new_acting_primary;
5826 info.stats.mapping_epoch = osdmap->get_epoch();
5827 }
5828
5829 pg_stats_publish_lock.Lock();
5830 pg_stats_publish_valid = false;
5831 pg_stats_publish_lock.Unlock();
5832
5833 // This will now be remapped during a backfill in cases
5834 // that it would not have been before.
5835 if (up != acting)
5836 state_set(PG_STATE_REMAPPED);
5837 else
5838 state_clear(PG_STATE_REMAPPED);
5839
5840 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5841 if (pool.info.is_replicated() || role == pg_whoami.shard)
5842 set_role(role);
5843 else
5844 set_role(-1);
5845
5846 // did acting, up, primary|acker change?
5847 if (!lastmap) {
5848 dout(10) << " no lastmap" << dendl;
5849 dirty_info = true;
5850 dirty_big_info = true;
5851 info.history.same_interval_since = osdmap->get_epoch();
5852 } else {
5853 std::stringstream debug;
5854 assert(info.history.same_interval_since != 0);
5855 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5856 get_is_recoverable_predicate());
5857 bool new_interval = PastIntervals::check_new_interval(
5858 old_acting_primary.osd,
5859 new_acting_primary,
5860 oldacting, newacting,
5861 old_up_primary.osd,
5862 new_up_primary,
5863 oldup, newup,
5864 info.history.same_interval_since,
5865 info.history.last_epoch_clean,
5866 osdmap,
5867 lastmap,
5868 info.pgid.pgid,
5869 recoverable.get(),
5870 &past_intervals,
5871 &debug);
5872 dout(10) << __func__ << ": check_new_interval output: "
5873 << debug.str() << dendl;
5874 if (new_interval) {
5875 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5876 info.history.last_epoch_clean < osdmap->get_epoch()) {
5877 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5878 // our information is incomplete and useless; someone else was clean
5879 // after everything we know if osdmaps were trimmed.
5880 past_intervals.clear();
5881 } else {
5882 dout(10) << " noting past " << past_intervals << dendl;
5883 }
5884 dirty_info = true;
5885 dirty_big_info = true;
5886 info.history.same_interval_since = osdmap->get_epoch();
5887 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5888 osdmap->get_pg_num(info.pgid.pgid.pool()),
5889 nullptr)) {
5890 info.history.last_epoch_split = osdmap->get_epoch();
5891 }
5892 }
5893 }
5894
5895 if (old_up_primary != up_primary ||
5896 oldup != up) {
5897 info.history.same_up_since = osdmap->get_epoch();
5898 }
5899 // this comparison includes primary rank via pg_shard_t
5900 if (old_acting_primary != get_primary()) {
5901 info.history.same_primary_since = osdmap->get_epoch();
5902 }
5903
5904 on_new_interval();
5905
5906 dout(1) << __func__ << " up " << oldup << " -> " << up
5907 << ", acting " << oldacting << " -> " << acting
5908 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5909 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5910 << ", role " << oldrole << " -> " << role
5911 << ", features acting " << acting_features
5912 << " upacting " << upacting_features
5913 << dendl;
5914
5915 // deactivate.
5916 state_clear(PG_STATE_ACTIVE);
5917 state_clear(PG_STATE_PEERED);
5918 state_clear(PG_STATE_DOWN);
5919 state_clear(PG_STATE_RECOVERY_WAIT);
5920 state_clear(PG_STATE_RECOVERY_TOOFULL);
5921 state_clear(PG_STATE_RECOVERING);
5922
5923 peer_purged.clear();
5924 actingbackfill.clear();
5925 scrub_queued = false;
5926
5927 // reset primary/replica state?
5928 if (was_old_primary || is_primary()) {
5929 osd->remove_want_pg_temp(info.pgid.pgid);
5930 } else if (was_old_replica || is_replica()) {
5931 osd->remove_want_pg_temp(info.pgid.pgid);
5932 }
5933 clear_primary_state();
5934
5935
5936 // pg->on_*
5937 on_change(t);
5938
5939 projected_last_update = eversion_t();
5940
5941 assert(!deleting);
5942
5943 // should we tell the primary we are here?
5944 send_notify = !is_primary();
5945
5946 if (role != oldrole ||
5947 was_old_primary != is_primary()) {
5948 // did primary change?
5949 if (was_old_primary != is_primary()) {
5950 state_clear(PG_STATE_CLEAN);
5951 clear_publish_stats();
5952 }
5953
5954 on_role_change();
5955
5956 // take active waiters
5957 requeue_ops(waiting_for_peered);
5958
5959 } else {
5960 // no role change.
5961 // did primary change?
5962 if (get_primary() != old_acting_primary) {
5963 dout(10) << *this << " " << oldacting << " -> " << acting
5964 << ", acting primary "
5965 << old_acting_primary << " -> " << get_primary()
5966 << dendl;
5967 } else {
5968 // primary is the same.
5969 if (is_primary()) {
5970 // i am (still) primary. but my replica set changed.
5971 state_clear(PG_STATE_CLEAN);
5972
5973 dout(10) << oldacting << " -> " << acting
5974 << ", replicas changed" << dendl;
5975 }
5976 }
5977 }
5978 cancel_recovery();
5979
5980 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5981 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5982 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5983 }
5984 }
5985
5986 void PG::on_new_interval()
5987 {
5988 const OSDMapRef osdmap = get_osdmap();
5989
5990 reg_next_scrub();
5991
5992 // initialize features
5993 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5994 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5995 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5996 if (*p == CRUSH_ITEM_NONE)
5997 continue;
5998 uint64_t f = osdmap->get_xinfo(*p).features;
5999 acting_features &= f;
6000 upacting_features &= f;
6001 }
6002 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6003 if (*p == CRUSH_ITEM_NONE)
6004 continue;
6005 upacting_features &= osdmap->get_xinfo(*p).features;
6006 }
6007
6008 _on_new_interval();
6009 }
6010
6011 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6012 {
6013 assert(!is_primary());
6014
6015 update_history(oinfo.history);
6016 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6017 info.stats.stats.sum.num_scrub_errors = 0;
6018 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6019 info.stats.stats.sum.num_deep_scrub_errors = 0;
6020 dirty_info = true;
6021 }
6022
6023 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6024 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6025 << dendl;
6026 info.purged_snaps = oinfo.purged_snaps;
6027 dirty_info = true;
6028 dirty_big_info = true;
6029 }
6030 }
6031
6032 ostream& operator<<(ostream& out, const PG& pg)
6033 {
6034 out << "pg[" << pg.info
6035 << " " << pg.up;
6036 if (pg.acting != pg.up)
6037 out << "/" << pg.acting;
6038 if (pg.is_ec_pg())
6039 out << "p" << pg.get_primary();
6040 out << " r=" << pg.get_role();
6041 out << " lpr=" << pg.get_last_peering_reset();
6042
6043 if (!pg.past_intervals.empty()) {
6044 out << " pi=[" << pg.past_intervals.get_bounds()
6045 << ")/" << pg.past_intervals.size();
6046 }
6047
6048 if (pg.is_peered()) {
6049 if (pg.last_update_ondisk != pg.info.last_update)
6050 out << " luod=" << pg.last_update_ondisk;
6051 if (pg.last_update_applied != pg.info.last_update)
6052 out << " lua=" << pg.last_update_applied;
6053 }
6054
6055 if (pg.recovery_ops_active)
6056 out << " rops=" << pg.recovery_ops_active;
6057
6058 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6059 pg.pg_log.get_head() != pg.info.last_update)
6060 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6061
6062 if (!pg.pg_log.get_log().empty()) {
6063 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6064 out << " (log bound mismatch, actual=["
6065 << pg.pg_log.get_log().log.begin()->version << ","
6066 << pg.pg_log.get_log().log.rbegin()->version << "]";
6067 out << ")";
6068 }
6069 }
6070
6071 if (!pg.backfill_targets.empty())
6072 out << " bft=" << pg.backfill_targets;
6073 out << " crt=" << pg.pg_log.get_can_rollback_to();
6074
6075 if (pg.last_complete_ondisk != pg.info.last_complete)
6076 out << " lcod " << pg.last_complete_ondisk;
6077
6078 if (pg.is_primary()) {
6079 out << " mlcod " << pg.min_last_complete_ondisk;
6080 }
6081
6082 out << " " << pg_state_string(pg.get_state());
6083 if (pg.should_send_notify())
6084 out << " NOTIFY";
6085
6086 if (pg.scrubber.must_repair)
6087 out << " MUST_REPAIR";
6088 if (pg.scrubber.auto_repair)
6089 out << " AUTO_REPAIR";
6090 if (pg.scrubber.must_deep_scrub)
6091 out << " MUST_DEEP_SCRUB";
6092 if (pg.scrubber.must_scrub)
6093 out << " MUST_SCRUB";
6094
6095 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6096 if (pg.pg_log.get_missing().num_missing()) {
6097 out << " m=" << pg.pg_log.get_missing().num_missing();
6098 if (pg.is_primary()) {
6099 uint64_t unfound = pg.get_num_unfound();
6100 if (unfound)
6101 out << " u=" << unfound;
6102 }
6103 }
6104 if (pg.snap_trimq.size())
6105 out << " snaptrimq=" << pg.snap_trimq;
6106 if (!pg.is_clean()) {
6107 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6108 }
6109
6110 out << "]";
6111
6112
6113 return out;
6114 }
6115
6116 bool PG::can_discard_op(OpRequestRef& op)
6117 {
6118 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6119 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6120 dout(20) << " discard " << *m << dendl;
6121 return true;
6122 }
6123
6124 if (m->get_map_epoch() < info.history.same_primary_since) {
6125 dout(7) << " changed after " << m->get_map_epoch()
6126 << ", dropping " << *m << dendl;
6127 return true;
6128 }
6129
6130 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6131 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6132 dout(7) << __func__ << " sent before last_force_op_resend "
6133 << pool.info.last_force_op_resend << ", dropping" << *m << dendl;
6134 return true;
6135 }
6136 if (m->get_map_epoch() < info.history.last_epoch_split) {
6137 dout(7) << __func__ << " pg split in "
6138 << info.history.last_epoch_split << ", dropping" << dendl;
6139 return true;
6140 }
6141 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6142 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6143 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6144 << pool.info.last_force_op_resend_preluminous
6145 << ", dropping" << *m << dendl;
6146 return true;
6147 }
6148 }
6149
6150 return false;
6151 }
6152
6153 template<typename T, int MSGTYPE>
6154 bool PG::can_discard_replica_op(OpRequestRef& op)
6155 {
6156 const T *m = static_cast<const T *>(op->get_req());
6157 assert(m->get_type() == MSGTYPE);
6158
6159 int from = m->get_source().num();
6160
6161 // if a repop is replied after a replica goes down in a new osdmap, and
6162 // before the pg advances to this new osdmap, the repop replies before this
6163 // repop can be discarded by that replica OSD, because the primary resets the
6164 // connection to it when handling the new osdmap marking it down, and also
6165 // resets the messenger sesssion when the replica reconnects. to avoid the
6166 // out-of-order replies, the messages from that replica should be discarded.
6167 if (osd->get_osdmap()->is_down(from))
6168 return true;
6169 /* Mostly, this overlaps with the old_peering_msg
6170 * condition. An important exception is pushes
6171 * sent by replicas not in the acting set, since
6172 * if such a replica goes down it does not cause
6173 * a new interval. */
6174 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
6175 return true;
6176
6177 // same pg?
6178 // if pg changes _at all_, we reset and repeer!
6179 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6180 dout(10) << "can_discard_replica_op pg changed " << info.history
6181 << " after " << m->map_epoch
6182 << ", dropping" << dendl;
6183 return true;
6184 }
6185 return false;
6186 }
6187
6188 bool PG::can_discard_scan(OpRequestRef op)
6189 {
6190 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6191 assert(m->get_type() == MSG_OSD_PG_SCAN);
6192
6193 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6194 dout(10) << " got old scan, ignoring" << dendl;
6195 return true;
6196 }
6197 return false;
6198 }
6199
6200 bool PG::can_discard_backfill(OpRequestRef op)
6201 {
6202 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6203 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6204
6205 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6206 dout(10) << " got old backfill, ignoring" << dendl;
6207 return true;
6208 }
6209
6210 return false;
6211
6212 }
6213
6214 bool PG::can_discard_request(OpRequestRef& op)
6215 {
6216 switch (op->get_req()->get_type()) {
6217 case CEPH_MSG_OSD_OP:
6218 return can_discard_op(op);
6219 case CEPH_MSG_OSD_BACKOFF:
6220 return false; // never discard
6221 case MSG_OSD_SUBOP:
6222 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
6223 case MSG_OSD_REPOP:
6224 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6225 case MSG_OSD_PG_PUSH:
6226 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6227 case MSG_OSD_PG_PULL:
6228 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6229 case MSG_OSD_PG_PUSH_REPLY:
6230 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6231 case MSG_OSD_SUBOPREPLY:
6232 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
6233 case MSG_OSD_REPOPREPLY:
6234 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6235 case MSG_OSD_PG_RECOVERY_DELETE:
6236 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6237
6238 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6239 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6240
6241 case MSG_OSD_EC_WRITE:
6242 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6243 case MSG_OSD_EC_WRITE_REPLY:
6244 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6245 case MSG_OSD_EC_READ:
6246 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6247 case MSG_OSD_EC_READ_REPLY:
6248 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6249 case MSG_OSD_REP_SCRUB:
6250 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6251 case MSG_OSD_SCRUB_RESERVE:
6252 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6253 case MSG_OSD_REP_SCRUBMAP:
6254 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6255 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6256 return can_discard_replica_op<
6257 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6258 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6259 return can_discard_replica_op<
6260 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6261
6262 case MSG_OSD_PG_SCAN:
6263 return can_discard_scan(op);
6264 case MSG_OSD_PG_BACKFILL:
6265 return can_discard_backfill(op);
6266 case MSG_OSD_PG_BACKFILL_REMOVE:
6267 return can_discard_replica_op<MOSDPGBackfillRemove,
6268 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6269 }
6270 return true;
6271 }
6272
6273 void PG::take_waiters()
6274 {
6275 dout(10) << "take_waiters" << dendl;
6276 requeue_map_waiters();
6277 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
6278 i != peering_waiters.end();
6279 ++i) osd->queue_for_peering(this);
6280 peering_queue.splice(peering_queue.begin(), peering_waiters,
6281 peering_waiters.begin(), peering_waiters.end());
6282 }
6283
6284 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
6285 {
6286 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
6287 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
6288 dout(10) << "deferring event " << evt->get_desc() << dendl;
6289 peering_waiters.push_back(evt);
6290 return;
6291 }
6292 if (old_peering_evt(evt))
6293 return;
6294 recovery_state.handle_event(evt, rctx);
6295 }
6296
6297 void PG::queue_peering_event(CephPeeringEvtRef evt)
6298 {
6299 if (old_peering_evt(evt))
6300 return;
6301 peering_queue.push_back(evt);
6302 osd->queue_for_peering(this);
6303 }
6304
6305 void PG::queue_null(epoch_t msg_epoch,
6306 epoch_t query_epoch)
6307 {
6308 dout(10) << "null" << dendl;
6309 queue_peering_event(
6310 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6311 NullEvt())));
6312 }
6313
6314 void PG::queue_flushed(epoch_t e)
6315 {
6316 dout(10) << "flushed" << dendl;
6317 queue_peering_event(
6318 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
6319 FlushedEvt())));
6320 }
6321
6322 void PG::queue_query(epoch_t msg_epoch,
6323 epoch_t query_epoch,
6324 pg_shard_t from, const pg_query_t& q)
6325 {
6326 dout(10) << "handle_query " << q << " from replica " << from << dendl;
6327 queue_peering_event(
6328 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6329 MQuery(from, q, query_epoch))));
6330 }
6331
6332 void PG::handle_advance_map(
6333 OSDMapRef osdmap, OSDMapRef lastmap,
6334 vector<int>& newup, int up_primary,
6335 vector<int>& newacting, int acting_primary,
6336 RecoveryCtx *rctx)
6337 {
6338 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6339 assert(lastmap == osdmap_ref);
6340 dout(10) << "handle_advance_map "
6341 << newup << "/" << newacting
6342 << " -- " << up_primary << "/" << acting_primary
6343 << dendl;
6344 update_osdmap_ref(osdmap);
6345 pool.update(osdmap);
6346 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
6347 if (cct->_conf->osd_debug_verify_cached_snaps) {
6348 interval_set<snapid_t> actual_removed_snaps;
6349 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6350 assert(pi);
6351 pi->build_removed_snaps(actual_removed_snaps);
6352 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
6353 derr << __func__ << ": mismatch between the actual removed snaps "
6354 << actual_removed_snaps << " and pool.cached_removed_snaps "
6355 << " pool.cached_removed_snaps " << pool.cached_removed_snaps
6356 << dendl;
6357 }
6358 assert(actual_removed_snaps == pool.cached_removed_snaps);
6359 }
6360 AdvMap evt(
6361 osdmap, lastmap, newup, up_primary,
6362 newacting, acting_primary);
6363 recovery_state.handle_event(evt, rctx);
6364 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6365 on_pool_change();
6366 update_store_with_options();
6367 }
6368 }
6369
6370 void PG::handle_activate_map(RecoveryCtx *rctx)
6371 {
6372 dout(10) << "handle_activate_map " << dendl;
6373 ActMap evt;
6374 recovery_state.handle_event(evt, rctx);
6375 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
6376 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6377 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6378 << last_persisted_osdmap_ref->get_epoch()
6379 << " while current is " << osdmap_ref->get_epoch() << dendl;
6380 dirty_info = true;
6381 } else {
6382 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6383 << last_persisted_osdmap_ref->get_epoch()
6384 << " while current is " << osdmap_ref->get_epoch() << dendl;
6385 }
6386 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
6387 }
6388
6389 void PG::handle_loaded(RecoveryCtx *rctx)
6390 {
6391 dout(10) << "handle_loaded" << dendl;
6392 Load evt;
6393 recovery_state.handle_event(evt, rctx);
6394 }
6395
6396 void PG::handle_create(RecoveryCtx *rctx)
6397 {
6398 dout(10) << "handle_create" << dendl;
6399 rctx->created_pgs.insert(this);
6400 Initialize evt;
6401 recovery_state.handle_event(evt, rctx);
6402 ActMap evt2;
6403 recovery_state.handle_event(evt2, rctx);
6404
6405 rctx->on_applied->add(make_lambda_context([this]() {
6406 update_store_with_options();
6407 }));
6408 }
6409
6410 void PG::handle_query_state(Formatter *f)
6411 {
6412 dout(10) << "handle_query_state" << dendl;
6413 QueryState q(f);
6414 recovery_state.handle_event(q, 0);
6415 }
6416
6417 void PG::update_store_with_options()
6418 {
6419 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
6420 if(r < 0 && r != -EOPNOTSUPP) {
6421 derr << __func__ << " set_collection_opts returns error:" << r << dendl;
6422 }
6423 }
6424
6425 void PG::update_store_on_load()
6426 {
6427 if (osd->store->get_type() == "filestore") {
6428 // legacy filestore didn't store collection bit width; fix.
6429 int bits = osd->store->collection_bits(coll);
6430 if (bits < 0) {
6431 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6432 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
6433 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
6434 ObjectStore::Transaction t;
6435 t.collection_set_bits(coll, bits);
6436 osd->store->apply_transaction(osr.get(), std::move(t));
6437 }
6438 }
6439 }
6440
6441 /*------------ Recovery State Machine----------------*/
6442 #undef dout_prefix
6443 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6444 << "state<" << get_state_name() << ">: ")
6445
6446 /*------Crashed-------*/
6447 PG::RecoveryState::Crashed::Crashed(my_context ctx)
6448 : my_base(ctx),
6449 NamedState(context< RecoveryMachine >().pg, "Crashed")
6450 {
6451 context< RecoveryMachine >().log_enter(state_name);
6452 assert(0 == "we got a bad state machine event");
6453 }
6454
6455
6456 /*------Initial-------*/
6457 PG::RecoveryState::Initial::Initial(my_context ctx)
6458 : my_base(ctx),
6459 NamedState(context< RecoveryMachine >().pg, "Initial")
6460 {
6461 context< RecoveryMachine >().log_enter(state_name);
6462 }
6463
6464 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6465 {
6466 PG *pg = context< RecoveryMachine >().pg;
6467
6468 // do we tell someone we're here?
6469 pg->send_notify = (!pg->is_primary());
6470 pg->update_store_with_options();
6471
6472 pg->update_store_on_load();
6473
6474 return transit< Reset >();
6475 }
6476
6477 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6478 {
6479 PG *pg = context< RecoveryMachine >().pg;
6480 pg->proc_replica_info(
6481 notify.from, notify.notify.info, notify.notify.epoch_sent);
6482 pg->set_last_peering_reset();
6483 return transit< Primary >();
6484 }
6485
6486 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6487 {
6488 PG *pg = context< RecoveryMachine >().pg;
6489 assert(!pg->is_primary());
6490 post_event(i);
6491 return transit< Stray >();
6492 }
6493
6494 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6495 {
6496 PG *pg = context< RecoveryMachine >().pg;
6497 assert(!pg->is_primary());
6498 post_event(i);
6499 return transit< Stray >();
6500 }
6501
6502 void PG::RecoveryState::Initial::exit()
6503 {
6504 context< RecoveryMachine >().log_exit(state_name, enter_time);
6505 PG *pg = context< RecoveryMachine >().pg;
6506 utime_t dur = ceph_clock_now() - enter_time;
6507 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6508 }
6509
6510 /*------Started-------*/
6511 PG::RecoveryState::Started::Started(my_context ctx)
6512 : my_base(ctx),
6513 NamedState(context< RecoveryMachine >().pg, "Started")
6514 {
6515 context< RecoveryMachine >().log_enter(state_name);
6516 }
6517
6518 boost::statechart::result
6519 PG::RecoveryState::Started::react(const IntervalFlush&)
6520 {
6521 PG *pg = context< RecoveryMachine >().pg;
6522 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6523 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6524 return discard_event();
6525 }
6526
6527
6528 boost::statechart::result
6529 PG::RecoveryState::Started::react(const FlushedEvt&)
6530 {
6531 PG *pg = context< RecoveryMachine >().pg;
6532 pg->on_flushed();
6533 return discard_event();
6534 }
6535
6536
6537 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6538 {
6539 PG *pg = context< RecoveryMachine >().pg;
6540 ldout(pg->cct, 10) << "Started advmap" << dendl;
6541 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6542 if (pg->should_restart_peering(
6543 advmap.up_primary,
6544 advmap.acting_primary,
6545 advmap.newup,
6546 advmap.newacting,
6547 advmap.lastmap,
6548 advmap.osdmap)) {
6549 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6550 << dendl;
6551 post_event(advmap);
6552 return transit< Reset >();
6553 }
6554 pg->remove_down_peer_info(advmap.osdmap);
6555 return discard_event();
6556 }
6557
6558 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6559 {
6560 q.f->open_object_section("state");
6561 q.f->dump_string("name", state_name);
6562 q.f->dump_stream("enter_time") << enter_time;
6563 q.f->close_section();
6564 return discard_event();
6565 }
6566
6567 void PG::RecoveryState::Started::exit()
6568 {
6569 context< RecoveryMachine >().log_exit(state_name, enter_time);
6570 PG *pg = context< RecoveryMachine >().pg;
6571 utime_t dur = ceph_clock_now() - enter_time;
6572 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6573 }
6574
6575 /*--------Reset---------*/
6576 PG::RecoveryState::Reset::Reset(my_context ctx)
6577 : my_base(ctx),
6578 NamedState(context< RecoveryMachine >().pg, "Reset")
6579 {
6580 context< RecoveryMachine >().log_enter(state_name);
6581 PG *pg = context< RecoveryMachine >().pg;
6582
6583 pg->flushes_in_progress = 0;
6584 pg->set_last_peering_reset();
6585 }
6586
6587 boost::statechart::result
6588 PG::RecoveryState::Reset::react(const FlushedEvt&)
6589 {
6590 PG *pg = context< RecoveryMachine >().pg;
6591 pg->on_flushed();
6592 return discard_event();
6593 }
6594
6595 boost::statechart::result
6596 PG::RecoveryState::Reset::react(const IntervalFlush&)
6597 {
6598 PG *pg = context< RecoveryMachine >().pg;
6599 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6600 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6601 return discard_event();
6602 }
6603
6604 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6605 {
6606 PG *pg = context< RecoveryMachine >().pg;
6607 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6608
6609 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6610
6611 if (pg->should_restart_peering(
6612 advmap.up_primary,
6613 advmap.acting_primary,
6614 advmap.newup,
6615 advmap.newacting,
6616 advmap.lastmap,
6617 advmap.osdmap)) {
6618 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6619 << dendl;
6620 pg->start_peering_interval(
6621 advmap.lastmap,
6622 advmap.newup, advmap.up_primary,
6623 advmap.newacting, advmap.acting_primary,
6624 context< RecoveryMachine >().get_cur_transaction());
6625 }
6626 pg->remove_down_peer_info(advmap.osdmap);
6627 pg->check_past_interval_bounds();
6628 return discard_event();
6629 }
6630
6631 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6632 {
6633 PG *pg = context< RecoveryMachine >().pg;
6634 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6635 context< RecoveryMachine >().send_notify(
6636 pg->get_primary(),
6637 pg_notify_t(
6638 pg->get_primary().shard, pg->pg_whoami.shard,
6639 pg->get_osdmap()->get_epoch(),
6640 pg->get_osdmap()->get_epoch(),
6641 pg->info),
6642 pg->past_intervals);
6643 }
6644
6645 pg->update_heartbeat_peers();
6646 pg->take_waiters();
6647
6648 return transit< Started >();
6649 }
6650
6651 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6652 {
6653 q.f->open_object_section("state");
6654 q.f->dump_string("name", state_name);
6655 q.f->dump_stream("enter_time") << enter_time;
6656 q.f->close_section();
6657 return discard_event();
6658 }
6659
6660 void PG::RecoveryState::Reset::exit()
6661 {
6662 context< RecoveryMachine >().log_exit(state_name, enter_time);
6663 PG *pg = context< RecoveryMachine >().pg;
6664 utime_t dur = ceph_clock_now() - enter_time;
6665 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6666 }
6667
6668 /*-------Start---------*/
6669 PG::RecoveryState::Start::Start(my_context ctx)
6670 : my_base(ctx),
6671 NamedState(context< RecoveryMachine >().pg, "Start")
6672 {
6673 context< RecoveryMachine >().log_enter(state_name);
6674
6675 PG *pg = context< RecoveryMachine >().pg;
6676 if (pg->is_primary()) {
6677 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6678 post_event(MakePrimary());
6679 } else { //is_stray
6680 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6681 post_event(MakeStray());
6682 }
6683 }
6684
6685 void PG::RecoveryState::Start::exit()
6686 {
6687 context< RecoveryMachine >().log_exit(state_name, enter_time);
6688 PG *pg = context< RecoveryMachine >().pg;
6689 utime_t dur = ceph_clock_now() - enter_time;
6690 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6691 }
6692
6693 /*---------Primary--------*/
6694 PG::RecoveryState::Primary::Primary(my_context ctx)
6695 : my_base(ctx),
6696 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6697 {
6698 context< RecoveryMachine >().log_enter(state_name);
6699 PG *pg = context< RecoveryMachine >().pg;
6700 assert(pg->want_acting.empty());
6701
6702 // set CREATING bit until we have peered for the first time.
6703 if (pg->info.history.last_epoch_started == 0) {
6704 pg->state_set(PG_STATE_CREATING);
6705 // use the history timestamp, which ultimately comes from the
6706 // monitor in the create case.
6707 utime_t t = pg->info.history.last_scrub_stamp;
6708 pg->info.stats.last_fresh = t;
6709 pg->info.stats.last_active = t;
6710 pg->info.stats.last_change = t;
6711 pg->info.stats.last_peered = t;
6712 pg->info.stats.last_clean = t;
6713 pg->info.stats.last_unstale = t;
6714 pg->info.stats.last_undegraded = t;
6715 pg->info.stats.last_fullsized = t;
6716 pg->info.stats.last_scrub_stamp = t;
6717 pg->info.stats.last_deep_scrub_stamp = t;
6718 pg->info.stats.last_clean_scrub_stamp = t;
6719 }
6720 }
6721
6722 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6723 {
6724 PG *pg = context< RecoveryMachine >().pg;
6725 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6726 pg->proc_replica_info(
6727 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6728 return discard_event();
6729 }
6730
6731 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6732 {
6733 PG *pg = context< RecoveryMachine >().pg;
6734 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6735 pg->publish_stats_to_osd();
6736 pg->take_waiters();
6737 return discard_event();
6738 }
6739
6740 void PG::RecoveryState::Primary::exit()
6741 {
6742 context< RecoveryMachine >().log_exit(state_name, enter_time);
6743 PG *pg = context< RecoveryMachine >().pg;
6744 pg->want_acting.clear();
6745 utime_t dur = ceph_clock_now() - enter_time;
6746 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6747 pg->clear_primary_state();
6748 pg->state_clear(PG_STATE_CREATING);
6749 }
6750
6751 /*---------Peering--------*/
6752 PG::RecoveryState::Peering::Peering(my_context ctx)
6753 : my_base(ctx),
6754 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6755 history_les_bound(false)
6756 {
6757 context< RecoveryMachine >().log_enter(state_name);
6758
6759 PG *pg = context< RecoveryMachine >().pg;
6760 assert(!pg->is_peered());
6761 assert(!pg->is_peering());
6762 assert(pg->is_primary());
6763 pg->state_set(PG_STATE_PEERING);
6764 }
6765
6766 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6767 {
6768 PG *pg = context< RecoveryMachine >().pg;
6769 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6770 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6771 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6772 post_event(advmap);
6773 return transit< Reset >();
6774 }
6775
6776 pg->adjust_need_up_thru(advmap.osdmap);
6777
6778 return forward_event();
6779 }
6780
6781 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6782 {
6783 PG *pg = context< RecoveryMachine >().pg;
6784
6785 q.f->open_object_section("state");
6786 q.f->dump_string("name", state_name);
6787 q.f->dump_stream("enter_time") << enter_time;
6788
6789 q.f->open_array_section("past_intervals");
6790 pg->past_intervals.dump(q.f);
6791 q.f->close_section();
6792
6793 q.f->open_array_section("probing_osds");
6794 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6795 p != prior_set.probe.end();
6796 ++p)
6797 q.f->dump_stream("osd") << *p;
6798 q.f->close_section();
6799
6800 if (prior_set.pg_down)
6801 q.f->dump_string("blocked", "peering is blocked due to down osds");
6802
6803 q.f->open_array_section("down_osds_we_would_probe");
6804 for (set<int>::iterator p = prior_set.down.begin();
6805 p != prior_set.down.end();
6806 ++p)
6807 q.f->dump_int("osd", *p);
6808 q.f->close_section();
6809
6810 q.f->open_array_section("peering_blocked_by");
6811 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6812 p != prior_set.blocked_by.end();
6813 ++p) {
6814 q.f->open_object_section("osd");
6815 q.f->dump_int("osd", p->first);
6816 q.f->dump_int("current_lost_at", p->second);
6817 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6818 q.f->close_section();
6819 }
6820 q.f->close_section();
6821
6822 if (history_les_bound) {
6823 q.f->open_array_section("peering_blocked_by_detail");
6824 q.f->open_object_section("item");
6825 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6826 q.f->close_section();
6827 q.f->close_section();
6828 }
6829
6830 q.f->close_section();
6831 return forward_event();
6832 }
6833
6834 void PG::RecoveryState::Peering::exit()
6835 {
6836 PG *pg = context< RecoveryMachine >().pg;
6837 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6838 context< RecoveryMachine >().log_exit(state_name, enter_time);
6839 pg->state_clear(PG_STATE_PEERING);
6840 pg->clear_probe_targets();
6841
6842 utime_t dur = ceph_clock_now() - enter_time;
6843 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6844 }
6845
6846
6847 /*------Backfilling-------*/
6848 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6849 : my_base(ctx),
6850 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6851 {
6852 context< RecoveryMachine >().log_enter(state_name);
6853 PG *pg = context< RecoveryMachine >().pg;
6854 pg->backfill_reserved = true;
6855 pg->queue_recovery();
6856 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6857 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6858 pg->state_set(PG_STATE_BACKFILLING);
6859 pg->publish_stats_to_osd();
6860 }
6861
6862 boost::statechart::result
6863 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6864 {
6865 PG *pg = context< RecoveryMachine >().pg;
6866 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6867 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6868
6869 pg->state_set(PG_STATE_BACKFILL_WAIT);
6870 pg->state_clear(PG_STATE_BACKFILLING);
6871
6872 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6873 it != pg->backfill_targets.end();
6874 ++it) {
6875 assert(*it != pg->pg_whoami);
6876 ConnectionRef con = pg->osd->get_con_osd_cluster(
6877 it->osd, pg->get_osdmap()->get_epoch());
6878 if (con) {
6879 pg->osd->send_message_osd_cluster(
6880 new MBackfillReserve(
6881 MBackfillReserve::REJECT,
6882 spg_t(pg->info.pgid.pgid, it->shard),
6883 pg->get_osdmap()->get_epoch()),
6884 con.get());
6885 }
6886 }
6887
6888
6889 if (!pg->waiting_on_backfill.empty()) {
6890 pg->waiting_on_backfill.clear();
6891 pg->finish_recovery_op(hobject_t::get_max());
6892 }
6893
6894 pg->schedule_backfill_retry(c.delay);
6895 return transit<NotBackfilling>();
6896 }
6897
6898 boost::statechart::result
6899 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
6900 {
6901 PG *pg = context< RecoveryMachine >().pg;
6902 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
6903 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6904
6905 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
6906 pg->state_clear(PG_STATE_BACKFILLING);
6907
6908 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6909 it != pg->backfill_targets.end();
6910 ++it) {
6911 assert(*it != pg->pg_whoami);
6912 ConnectionRef con = pg->osd->get_con_osd_cluster(
6913 it->osd, pg->get_osdmap()->get_epoch());
6914 if (con) {
6915 pg->osd->send_message_osd_cluster(
6916 new MBackfillReserve(
6917 MBackfillReserve::REJECT,
6918 spg_t(pg->info.pgid.pgid, it->shard),
6919 pg->get_osdmap()->get_epoch()),
6920 con.get());
6921 }
6922 }
6923
6924 pg->waiting_on_backfill.clear();
6925
6926 return transit<NotBackfilling>();
6927 }
6928
6929 boost::statechart::result
6930 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6931 {
6932 PG *pg = context< RecoveryMachine >().pg;
6933 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6934 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6935
6936 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6937 it != pg->backfill_targets.end();
6938 ++it) {
6939 assert(*it != pg->pg_whoami);
6940 ConnectionRef con = pg->osd->get_con_osd_cluster(
6941 it->osd, pg->get_osdmap()->get_epoch());
6942 if (con) {
6943 pg->osd->send_message_osd_cluster(
6944 new MBackfillReserve(
6945 MBackfillReserve::REJECT,
6946 spg_t(pg->info.pgid.pgid, it->shard),
6947 pg->get_osdmap()->get_epoch()),
6948 con.get());
6949 }
6950 }
6951
6952 if (!pg->waiting_on_backfill.empty()) {
6953 pg->waiting_on_backfill.clear();
6954 pg->finish_recovery_op(hobject_t::get_max());
6955 }
6956
6957 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6958 return transit<NotBackfilling>();
6959 }
6960
6961 void PG::RecoveryState::Backfilling::exit()
6962 {
6963 context< RecoveryMachine >().log_exit(state_name, enter_time);
6964 PG *pg = context< RecoveryMachine >().pg;
6965 pg->backfill_reserved = false;
6966 pg->backfill_reserving = false;
6967 pg->state_clear(PG_STATE_BACKFILLING);
6968 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6969 utime_t dur = ceph_clock_now() - enter_time;
6970 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6971 }
6972
6973 /*--WaitRemoteBackfillReserved--*/
6974
6975 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6976 : my_base(ctx),
6977 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6978 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6979 {
6980 context< RecoveryMachine >().log_enter(state_name);
6981 PG *pg = context< RecoveryMachine >().pg;
6982 pg->state_set(PG_STATE_BACKFILL_WAIT);
6983 pg->publish_stats_to_osd();
6984 post_event(RemoteBackfillReserved());
6985 }
6986
6987 boost::statechart::result
6988 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6989 {
6990 PG *pg = context< RecoveryMachine >().pg;
6991
6992 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6993 //The primary never backfills itself
6994 assert(*backfill_osd_it != pg->pg_whoami);
6995 ConnectionRef con = pg->osd->get_con_osd_cluster(
6996 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6997 if (con) {
6998 pg->osd->send_message_osd_cluster(
6999 new MBackfillReserve(
7000 MBackfillReserve::REQUEST,
7001 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7002 pg->get_osdmap()->get_epoch(),
7003 pg->get_backfill_priority()),
7004 con.get());
7005 }
7006 ++backfill_osd_it;
7007 } else {
7008 post_event(AllBackfillsReserved());
7009 }
7010 return discard_event();
7011 }
7012
7013 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7014 {
7015 context< RecoveryMachine >().log_exit(state_name, enter_time);
7016 PG *pg = context< RecoveryMachine >().pg;
7017 utime_t dur = ceph_clock_now() - enter_time;
7018 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7019 }
7020
7021 boost::statechart::result
7022 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
7023 {
7024 PG *pg = context< RecoveryMachine >().pg;
7025 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7026
7027 // Send REJECT to all previously acquired reservations
7028 set<pg_shard_t>::const_iterator it, begin, end, next;
7029 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7030 end = context< Active >().remote_shards_to_reserve_backfill.end();
7031 assert(begin != end);
7032 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
7033 //The primary never backfills itself
7034 assert(*it != pg->pg_whoami);
7035 ConnectionRef con = pg->osd->get_con_osd_cluster(
7036 it->osd, pg->get_osdmap()->get_epoch());
7037 if (con) {
7038 pg->osd->send_message_osd_cluster(
7039 new MBackfillReserve(
7040 MBackfillReserve::REJECT,
7041 spg_t(pg->info.pgid.pgid, it->shard),
7042 pg->get_osdmap()->get_epoch()),
7043 con.get());
7044 }
7045 }
7046
7047 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7048 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7049 pg->publish_stats_to_osd();
7050
7051 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
7052
7053 return transit<NotBackfilling>();
7054 }
7055
7056 /*--WaitLocalBackfillReserved--*/
7057 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7058 : my_base(ctx),
7059 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7060 {
7061 context< RecoveryMachine >().log_enter(state_name);
7062 PG *pg = context< RecoveryMachine >().pg;
7063 pg->state_set(PG_STATE_BACKFILL_WAIT);
7064 pg->osd->local_reserver.request_reservation(
7065 pg->info.pgid,
7066 new QueuePeeringEvt<LocalBackfillReserved>(
7067 pg, pg->get_osdmap()->get_epoch(),
7068 LocalBackfillReserved()),
7069 pg->get_backfill_priority(),
7070 new QueuePeeringEvt<DeferBackfill>(
7071 pg, pg->get_osdmap()->get_epoch(),
7072 DeferBackfill(0.0)));
7073 pg->publish_stats_to_osd();
7074 }
7075
7076 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7077 {
7078 context< RecoveryMachine >().log_exit(state_name, enter_time);
7079 PG *pg = context< RecoveryMachine >().pg;
7080 utime_t dur = ceph_clock_now() - enter_time;
7081 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7082 }
7083
7084 /*----NotBackfilling------*/
7085 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7086 : my_base(ctx),
7087 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7088 {
7089 context< RecoveryMachine >().log_enter(state_name);
7090 PG *pg = context< RecoveryMachine >().pg;
7091 pg->publish_stats_to_osd();
7092 }
7093
7094 boost::statechart::result
7095 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7096 {
7097 return discard_event();
7098 }
7099
7100 boost::statechart::result
7101 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
7102 {
7103 return discard_event();
7104 }
7105
7106 void PG::RecoveryState::NotBackfilling::exit()
7107 {
7108 context< RecoveryMachine >().log_exit(state_name, enter_time);
7109 PG *pg = context< RecoveryMachine >().pg;
7110 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7111 utime_t dur = ceph_clock_now() - enter_time;
7112 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7113 }
7114
7115 /*----NotRecovering------*/
7116 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7117 : my_base(ctx),
7118 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7119 {
7120 context< RecoveryMachine >().log_enter(state_name);
7121 PG *pg = context< RecoveryMachine >().pg;
7122 pg->publish_stats_to_osd();
7123 }
7124
7125 void PG::RecoveryState::NotRecovering::exit()
7126 {
7127 context< RecoveryMachine >().log_exit(state_name, enter_time);
7128 PG *pg = context< RecoveryMachine >().pg;
7129 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7130 utime_t dur = ceph_clock_now() - enter_time;
7131 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7132 }
7133
7134 /*---RepNotRecovering----*/
7135 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7136 : my_base(ctx),
7137 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7138 {
7139 context< RecoveryMachine >().log_enter(state_name);
7140 }
7141
7142 boost::statechart::result
7143 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
7144 {
7145 PG *pg = context< RecoveryMachine >().pg;
7146 pg->reject_reservation();
7147 post_event(RemoteReservationRejected());
7148 return discard_event();
7149 }
7150
7151 void PG::RecoveryState::RepNotRecovering::exit()
7152 {
7153 context< RecoveryMachine >().log_exit(state_name, enter_time);
7154 PG *pg = context< RecoveryMachine >().pg;
7155 utime_t dur = ceph_clock_now() - enter_time;
7156 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7157 }
7158
7159 /*---RepWaitRecoveryReserved--*/
7160 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7161 : my_base(ctx),
7162 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7163 {
7164 context< RecoveryMachine >().log_enter(state_name);
7165 PG *pg = context< RecoveryMachine >().pg;
7166
7167 pg->osd->remote_reserver.request_reservation(
7168 pg->info.pgid,
7169 new QueuePeeringEvt<RemoteRecoveryReserved>(
7170 pg, pg->get_osdmap()->get_epoch(),
7171 RemoteRecoveryReserved()),
7172 pg->get_recovery_priority());
7173 }
7174
7175 boost::statechart::result
7176 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7177 {
7178 PG *pg = context< RecoveryMachine >().pg;
7179 pg->osd->send_message_osd_cluster(
7180 pg->primary.osd,
7181 new MRecoveryReserve(
7182 MRecoveryReserve::GRANT,
7183 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7184 pg->get_osdmap()->get_epoch()),
7185 pg->get_osdmap()->get_epoch());
7186 return transit<RepRecovering>();
7187 }
7188
7189 boost::statechart::result
7190 PG::RecoveryState::RepWaitRecoveryReserved::react(
7191 const RemoteReservationCanceled &evt)
7192 {
7193 PG *pg = context< RecoveryMachine >().pg;
7194 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7195 return transit<RepNotRecovering>();
7196 }
7197
7198 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7199 {
7200 context< RecoveryMachine >().log_exit(state_name, enter_time);
7201 PG *pg = context< RecoveryMachine >().pg;
7202 utime_t dur = ceph_clock_now() - enter_time;
7203 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7204 }
7205
7206 /*-RepWaitBackfillReserved*/
7207 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7208 : my_base(ctx),
7209 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7210 {
7211 context< RecoveryMachine >().log_enter(state_name);
7212 }
7213
7214 boost::statechart::result
7215 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7216 {
7217 PG *pg = context< RecoveryMachine >().pg;
7218 ostringstream ss;
7219
7220 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7221 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7222 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7223 << dendl;
7224 post_event(RejectRemoteReservation());
7225 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7226 pg->osd->check_backfill_full(ss)) {
7227 ldout(pg->cct, 10) << "backfill reservation rejected: "
7228 << ss.str() << dendl;
7229 post_event(RejectRemoteReservation());
7230 } else {
7231 pg->osd->remote_reserver.request_reservation(
7232 pg->info.pgid,
7233 new QueuePeeringEvt<RemoteBackfillReserved>(
7234 pg, pg->get_osdmap()->get_epoch(),
7235 RemoteBackfillReserved()), evt.priority);
7236 }
7237 return transit<RepWaitBackfillReserved>();
7238 }
7239
7240 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7241 {
7242 context< RecoveryMachine >().log_exit(state_name, enter_time);
7243 PG *pg = context< RecoveryMachine >().pg;
7244 utime_t dur = ceph_clock_now() - enter_time;
7245 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
7246 }
7247
7248 boost::statechart::result
7249 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
7250 {
7251 PG *pg = context< RecoveryMachine >().pg;
7252
7253 ostringstream ss;
7254 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7255 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7256 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7257 << "failure injection" << dendl;
7258 post_event(RejectRemoteReservation());
7259 return discard_event();
7260 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7261 pg->osd->check_backfill_full(ss)) {
7262 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7263 << ss.str() << dendl;
7264 post_event(RejectRemoteReservation());
7265 return discard_event();
7266 } else {
7267 pg->osd->send_message_osd_cluster(
7268 pg->primary.osd,
7269 new MBackfillReserve(
7270 MBackfillReserve::GRANT,
7271 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7272 pg->get_osdmap()->get_epoch()),
7273 pg->get_osdmap()->get_epoch());
7274 return transit<RepRecovering>();
7275 }
7276 }
7277
7278 boost::statechart::result
7279 PG::RecoveryState::RepWaitBackfillReserved::react(
7280 const RejectRemoteReservation &evt)
7281 {
7282 PG *pg = context< RecoveryMachine >().pg;
7283 pg->reject_reservation();
7284 post_event(RemoteReservationRejected());
7285 return discard_event();
7286 }
7287
7288 boost::statechart::result
7289 PG::RecoveryState::RepWaitBackfillReserved::react(
7290 const RemoteReservationRejected &evt)
7291 {
7292 PG *pg = context< RecoveryMachine >().pg;
7293 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7294 return transit<RepNotRecovering>();
7295 }
7296
7297 boost::statechart::result
7298 PG::RecoveryState::RepWaitBackfillReserved::react(
7299 const RemoteReservationCanceled &evt)
7300 {
7301 PG *pg = context< RecoveryMachine >().pg;
7302 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7303 return transit<RepNotRecovering>();
7304 }
7305
7306 /*---RepRecovering-------*/
7307 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
7308 : my_base(ctx),
7309 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
7310 {
7311 context< RecoveryMachine >().log_enter(state_name);
7312 }
7313
7314 boost::statechart::result
7315 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
7316 {
7317 PG *pg = context< RecoveryMachine >().pg;
7318 pg->reject_reservation();
7319 return discard_event();
7320 }
7321
7322 void PG::RecoveryState::RepRecovering::exit()
7323 {
7324 context< RecoveryMachine >().log_exit(state_name, enter_time);
7325 PG *pg = context< RecoveryMachine >().pg;
7326 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7327 utime_t dur = ceph_clock_now() - enter_time;
7328 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
7329 }
7330
7331 /*------Activating--------*/
7332 PG::RecoveryState::Activating::Activating(my_context ctx)
7333 : my_base(ctx),
7334 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
7335 {
7336 context< RecoveryMachine >().log_enter(state_name);
7337 }
7338
7339 void PG::RecoveryState::Activating::exit()
7340 {
7341 context< RecoveryMachine >().log_exit(state_name, enter_time);
7342 PG *pg = context< RecoveryMachine >().pg;
7343 utime_t dur = ceph_clock_now() - enter_time;
7344 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
7345 }
7346
7347 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
7348 : my_base(ctx),
7349 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
7350 {
7351 context< RecoveryMachine >().log_enter(state_name);
7352 PG *pg = context< RecoveryMachine >().pg;
7353
7354 // Make sure all nodes that part of the recovery aren't full
7355 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
7356 pg->osd->check_osdmap_full(pg->actingbackfill)) {
7357 post_event(RecoveryTooFull());
7358 return;
7359 }
7360
7361 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7362 pg->state_set(PG_STATE_RECOVERY_WAIT);
7363 pg->osd->local_reserver.request_reservation(
7364 pg->info.pgid,
7365 new QueuePeeringEvt<LocalRecoveryReserved>(
7366 pg, pg->get_osdmap()->get_epoch(),
7367 LocalRecoveryReserved()),
7368 pg->get_recovery_priority(),
7369 new QueuePeeringEvt<DeferRecovery>(
7370 pg, pg->get_osdmap()->get_epoch(),
7371 DeferRecovery(0.0)));
7372 pg->publish_stats_to_osd();
7373 }
7374
7375 boost::statechart::result
7376 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
7377 {
7378 PG *pg = context< RecoveryMachine >().pg;
7379 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
7380 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
7381 return transit<NotRecovering>();
7382 }
7383
7384 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7385 {
7386 context< RecoveryMachine >().log_exit(state_name, enter_time);
7387 PG *pg = context< RecoveryMachine >().pg;
7388 utime_t dur = ceph_clock_now() - enter_time;
7389 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
7390 }
7391
7392 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
7393 : my_base(ctx),
7394 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7395 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
7396 {
7397 context< RecoveryMachine >().log_enter(state_name);
7398 post_event(RemoteRecoveryReserved());
7399 }
7400
7401 boost::statechart::result
7402 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
7403 PG *pg = context< RecoveryMachine >().pg;
7404
7405 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
7406 assert(*remote_recovery_reservation_it != pg->pg_whoami);
7407 ConnectionRef con = pg->osd->get_con_osd_cluster(
7408 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
7409 if (con) {
7410 pg->osd->send_message_osd_cluster(
7411 new MRecoveryReserve(
7412 MRecoveryReserve::REQUEST,
7413 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
7414 pg->get_osdmap()->get_epoch()),
7415 con.get());
7416 }
7417 ++remote_recovery_reservation_it;
7418 } else {
7419 post_event(AllRemotesReserved());
7420 }
7421 return discard_event();
7422 }
7423
7424 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7425 {
7426 context< RecoveryMachine >().log_exit(state_name, enter_time);
7427 PG *pg = context< RecoveryMachine >().pg;
7428 utime_t dur = ceph_clock_now() - enter_time;
7429 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
7430 }
7431
7432 PG::RecoveryState::Recovering::Recovering(my_context ctx)
7433 : my_base(ctx),
7434 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
7435 {
7436 context< RecoveryMachine >().log_enter(state_name);
7437
7438 PG *pg = context< RecoveryMachine >().pg;
7439 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7440 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7441 pg->state_set(PG_STATE_RECOVERING);
7442 assert(!pg->state_test(PG_STATE_ACTIVATING));
7443 pg->publish_stats_to_osd();
7444 pg->queue_recovery();
7445 }
7446
7447 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7448 {
7449 PG *pg = context< RecoveryMachine >().pg;
7450 assert(cancel || !pg->pg_log.get_missing().have_missing());
7451
7452 // release remote reservations
7453 for (set<pg_shard_t>::const_iterator i =
7454 context< Active >().remote_shards_to_reserve_recovery.begin();
7455 i != context< Active >().remote_shards_to_reserve_recovery.end();
7456 ++i) {
7457 if (*i == pg->pg_whoami) // skip myself
7458 continue;
7459 ConnectionRef con = pg->osd->get_con_osd_cluster(
7460 i->osd, pg->get_osdmap()->get_epoch());
7461 if (con) {
7462 pg->osd->send_message_osd_cluster(
7463 new MRecoveryReserve(
7464 MRecoveryReserve::RELEASE,
7465 spg_t(pg->info.pgid.pgid, i->shard),
7466 pg->get_osdmap()->get_epoch()),
7467 con.get());
7468 }
7469 }
7470 }
7471
7472 boost::statechart::result
7473 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
7474 {
7475 PG *pg = context< RecoveryMachine >().pg;
7476 pg->state_clear(PG_STATE_RECOVERING);
7477 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7478 release_reservations();
7479 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7480 return transit<Recovered>();
7481 }
7482
7483 boost::statechart::result
7484 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
7485 {
7486 PG *pg = context< RecoveryMachine >().pg;
7487 pg->state_clear(PG_STATE_RECOVERING);
7488 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7489 release_reservations();
7490 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7491 // XXX: Is this needed?
7492 pg->publish_stats_to_osd();
7493 return transit<WaitLocalBackfillReserved>();
7494 }
7495
7496 boost::statechart::result
7497 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7498 {
7499 PG *pg = context< RecoveryMachine >().pg;
7500 if (!pg->state_test(PG_STATE_RECOVERING)) {
7501 // we may have finished recovery and have an AllReplicasRecovered
7502 // event queued to move us to the next state.
7503 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
7504 return discard_event();
7505 }
7506 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7507 pg->state_clear(PG_STATE_RECOVERING);
7508 pg->state_set(PG_STATE_RECOVERY_WAIT);
7509 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7510 release_reservations(true);
7511 pg->schedule_recovery_retry(evt.delay);
7512 return transit<NotRecovering>();
7513 }
7514
7515 boost::statechart::result
7516 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
7517 {
7518 PG *pg = context< RecoveryMachine >().pg;
7519 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
7520 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
7521 pg->state_clear(PG_STATE_RECOVERING);
7522 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7523 release_reservations(true);
7524 return transit<NotRecovering>();
7525 }
7526
7527 void PG::RecoveryState::Recovering::exit()
7528 {
7529 context< RecoveryMachine >().log_exit(state_name, enter_time);
7530 PG *pg = context< RecoveryMachine >().pg;
7531 utime_t dur = ceph_clock_now() - enter_time;
7532 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7533 }
7534
7535 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7536 : my_base(ctx),
7537 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7538 {
7539 pg_shard_t auth_log_shard;
7540
7541 context< RecoveryMachine >().log_enter(state_name);
7542
7543 PG *pg = context< RecoveryMachine >().pg;
7544
7545 assert(!pg->needs_recovery());
7546
7547 // if we finished backfill, all acting are active; recheck if
7548 // DEGRADED | UNDERSIZED is appropriate.
7549 assert(!pg->actingbackfill.empty());
7550 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7551 pg->actingbackfill.size()) {
7552 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7553 pg->publish_stats_to_osd();
7554 }
7555
7556 // trim pglog on recovered
7557 pg->trim_log();
7558
7559 // adjust acting set? (e.g. because backfill completed...)
7560 bool history_les_bound = false;
7561 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7562 true, &history_les_bound))
7563 assert(pg->want_acting.size());
7564
7565 if (context< Active >().all_replicas_activated)
7566 post_event(GoClean());
7567 }
7568
7569 void PG::RecoveryState::Recovered::exit()
7570 {
7571 context< RecoveryMachine >().log_exit(state_name, enter_time);
7572 PG *pg = context< RecoveryMachine >().pg;
7573 utime_t dur = ceph_clock_now() - enter_time;
7574 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7575 }
7576
7577 PG::RecoveryState::Clean::Clean(my_context ctx)
7578 : my_base(ctx),
7579 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7580 {
7581 context< RecoveryMachine >().log_enter(state_name);
7582
7583 PG *pg = context< RecoveryMachine >().pg;
7584
7585 if (pg->info.last_complete != pg->info.last_update) {
7586 ceph_abort();
7587 }
7588 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7589
7590 if (pg->is_active()) {
7591 pg->mark_clean();
7592 }
7593
7594 pg->share_pg_info();
7595 pg->publish_stats_to_osd();
7596 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7597 }
7598
7599 void PG::RecoveryState::Clean::exit()
7600 {
7601 context< RecoveryMachine >().log_exit(state_name, enter_time);
7602 PG *pg = context< RecoveryMachine >().pg;
7603 pg->state_clear(PG_STATE_CLEAN);
7604 utime_t dur = ceph_clock_now() - enter_time;
7605 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7606 }
7607
7608 template <typename T>
7609 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7610 {
7611 set<int> osds_found;
7612 set<pg_shard_t> out;
7613 for (typename T::const_iterator i = in.begin();
7614 i != in.end();
7615 ++i) {
7616 if (*i != skip && !osds_found.count(i->osd)) {
7617 osds_found.insert(i->osd);
7618 out.insert(*i);
7619 }
7620 }
7621 return out;
7622 }
7623
7624 /*---------Active---------*/
7625 PG::RecoveryState::Active::Active(my_context ctx)
7626 : my_base(ctx),
7627 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7628 remote_shards_to_reserve_recovery(
7629 unique_osd_shard_set(
7630 context< RecoveryMachine >().pg->pg_whoami,
7631 context< RecoveryMachine >().pg->actingbackfill)),
7632 remote_shards_to_reserve_backfill(
7633 unique_osd_shard_set(
7634 context< RecoveryMachine >().pg->pg_whoami,
7635 context< RecoveryMachine >().pg->backfill_targets)),
7636 all_replicas_activated(false)
7637 {
7638 context< RecoveryMachine >().log_enter(state_name);
7639
7640 PG *pg = context< RecoveryMachine >().pg;
7641
7642 assert(!pg->backfill_reserving);
7643 assert(!pg->backfill_reserved);
7644 assert(pg->is_primary());
7645 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7646 pg->start_flush(
7647 context< RecoveryMachine >().get_cur_transaction(),
7648 context< RecoveryMachine >().get_on_applied_context_list(),
7649 context< RecoveryMachine >().get_on_safe_context_list());
7650 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7651 pg->get_osdmap()->get_epoch(),
7652 *context< RecoveryMachine >().get_on_safe_context_list(),
7653 *context< RecoveryMachine >().get_query_map(),
7654 context< RecoveryMachine >().get_info_map(),
7655 context< RecoveryMachine >().get_recovery_ctx());
7656
7657 // everyone has to commit/ack before we are truly active
7658 pg->blocked_by.clear();
7659 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7660 p != pg->actingbackfill.end();
7661 ++p) {
7662 if (p->shard != pg->pg_whoami.shard) {
7663 pg->blocked_by.insert(p->shard);
7664 }
7665 }
7666 pg->publish_stats_to_osd();
7667 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7668 }
7669
7670 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7671 {
7672 PG *pg = context< RecoveryMachine >().pg;
7673 ldout(pg->cct, 10) << "Active advmap" << dendl;
7674 if (!pg->pool.newly_removed_snaps.empty()) {
7675 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7676 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7677 pg->dirty_info = true;
7678 pg->dirty_big_info = true;
7679 }
7680
7681 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7682 int osd = pg->want_acting[i];
7683 if (!advmap.osdmap->is_up(osd)) {
7684 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7685 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7686 }
7687 }
7688
7689 bool need_publish = false;
7690 /* Check for changes in pool size (if the acting set changed as a result,
7691 * this does not matter) */
7692 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7693 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7694 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7695 pg->state_clear(PG_STATE_UNDERSIZED);
7696 } else {
7697 pg->state_set(PG_STATE_UNDERSIZED);
7698 }
7699 // degraded changes will be detected by call from publish_stats_to_osd()
7700 need_publish = true;
7701 }
7702
7703 // if we haven't reported our PG stats in a long time, do so now.
7704 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7705 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7706 << " epochs" << dendl;
7707 need_publish = true;
7708 }
7709
7710 if (need_publish)
7711 pg->publish_stats_to_osd();
7712
7713 return forward_event();
7714 }
7715
7716 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7717 {
7718 PG *pg = context< RecoveryMachine >().pg;
7719 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7720 assert(pg->is_primary());
7721
7722 if (pg->have_unfound()) {
7723 // object may have become unfound
7724 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7725 }
7726
7727 if (pg->cct->_conf->osd_check_for_log_corruption)
7728 pg->check_log_for_corruption(pg->osd->store);
7729
7730 uint64_t unfound = pg->missing_loc.num_unfound();
7731 if (unfound > 0 &&
7732 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7733 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7734 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7735 << " objects unfound and apparently lost, would automatically "
7736 << "mark these objects lost but this feature is not yet implemented "
7737 << "(osd_auto_mark_unfound_lost)";
7738 } else
7739 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7740 << unfound << " objects unfound and apparently lost";
7741 }
7742
7743 if (pg->is_active()) {
7744 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7745 pg->kick_snap_trim();
7746 }
7747
7748 if (pg->is_peered() &&
7749 !pg->is_clean() &&
7750 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7751 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7752 pg->queue_recovery();
7753 }
7754 return forward_event();
7755 }
7756
7757 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7758 {
7759 PG *pg = context< RecoveryMachine >().pg;
7760 assert(pg->is_primary());
7761 if (pg->peer_info.count(notevt.from)) {
7762 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7763 << ", already have info from that osd, ignoring"
7764 << dendl;
7765 } else if (pg->peer_purged.count(notevt.from)) {
7766 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7767 << ", already purged that peer, ignoring"
7768 << dendl;
7769 } else {
7770 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7771 << ", calling proc_replica_info and discover_all_missing"
7772 << dendl;
7773 pg->proc_replica_info(
7774 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7775 if (pg->have_unfound()) {
7776 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7777 }
7778 }
7779 return discard_event();
7780 }
7781
7782 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7783 {
7784 PG *pg = context< RecoveryMachine >().pg;
7785 assert(pg->is_primary());
7786
7787 assert(!pg->actingbackfill.empty());
7788 // don't update history (yet) if we are active and primary; the replica
7789 // may be telling us they have activated (and committed) but we can't
7790 // share that until _everyone_ does the same.
7791 if (pg->is_actingbackfill(infoevt.from)) {
7792 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7793 << " activated and committed" << dendl;
7794 pg->peer_activated.insert(infoevt.from);
7795 pg->blocked_by.erase(infoevt.from.shard);
7796 pg->publish_stats_to_osd();
7797 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7798 pg->all_activated_and_committed();
7799 }
7800 }
7801 return discard_event();
7802 }
7803
7804 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7805 {
7806 PG *pg = context< RecoveryMachine >().pg;
7807 ldout(pg->cct, 10) << "searching osd." << logevt.from
7808 << " log for unfound items" << dendl;
7809 pg->proc_replica_log(
7810 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7811 bool got_missing = pg->search_for_missing(
7812 pg->peer_info[logevt.from],
7813 pg->peer_missing[logevt.from],
7814 logevt.from,
7815 context< RecoveryMachine >().get_recovery_ctx());
7816 // If there are missing AND we are "fully" active then start recovery now
7817 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
7818 post_event(DoRecovery());
7819 }
7820 return discard_event();
7821 }
7822
7823 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7824 {
7825 PG *pg = context< RecoveryMachine >().pg;
7826
7827 q.f->open_object_section("state");
7828 q.f->dump_string("name", state_name);
7829 q.f->dump_stream("enter_time") << enter_time;
7830
7831 {
7832 q.f->open_array_section("might_have_unfound");
7833 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7834 p != pg->might_have_unfound.end();
7835 ++p) {
7836 q.f->open_object_section("osd");
7837 q.f->dump_stream("osd") << *p;
7838 if (pg->peer_missing.count(*p)) {
7839 q.f->dump_string("status", "already probed");
7840 } else if (pg->peer_missing_requested.count(*p)) {
7841 q.f->dump_string("status", "querying");
7842 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7843 q.f->dump_string("status", "osd is down");
7844 } else {
7845 q.f->dump_string("status", "not queried");
7846 }
7847 q.f->close_section();
7848 }
7849 q.f->close_section();
7850 }
7851 {
7852 q.f->open_object_section("recovery_progress");
7853 pg->dump_recovery_info(q.f);
7854 q.f->close_section();
7855 }
7856
7857 {
7858 q.f->open_object_section("scrub");
7859 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7860 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7861 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7862 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7863 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7864 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
7865 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7866 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7867 {
7868 q.f->open_array_section("scrubber.waiting_on_whom");
7869 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7870 p != pg->scrubber.waiting_on_whom.end();
7871 ++p) {
7872 q.f->dump_stream("shard") << *p;
7873 }
7874 q.f->close_section();
7875 }
7876 q.f->close_section();
7877 }
7878
7879 q.f->close_section();
7880 return forward_event();
7881 }
7882
7883 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7884 {
7885 PG *pg = context< RecoveryMachine >().pg;
7886 all_replicas_activated = true;
7887
7888 pg->state_clear(PG_STATE_ACTIVATING);
7889 pg->state_clear(PG_STATE_CREATING);
7890 if (pg->acting.size() >= pg->pool.info.min_size) {
7891 pg->state_set(PG_STATE_ACTIVE);
7892 } else {
7893 pg->state_set(PG_STATE_PEERED);
7894 }
7895
7896 // info.last_epoch_started is set during activate()
7897 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7898 pg->info.history.last_interval_started = pg->info.last_interval_started;
7899 pg->dirty_info = true;
7900
7901 pg->share_pg_info();
7902 pg->publish_stats_to_osd();
7903
7904 pg->check_local();
7905
7906 // waiters
7907 if (pg->flushes_in_progress == 0) {
7908 pg->requeue_ops(pg->waiting_for_peered);
7909 } else if (!pg->waiting_for_peered.empty()) {
7910 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
7911 << pg->waiting_for_peered.size()
7912 << " items to waiting_for_flush"
7913 << dendl;
7914 assert(pg->waiting_for_flush.empty());
7915 pg->waiting_for_flush.swap(pg->waiting_for_peered);
7916 }
7917
7918 pg->on_activate();
7919
7920 return discard_event();
7921 }
7922
7923 void PG::RecoveryState::Active::exit()
7924 {
7925 context< RecoveryMachine >().log_exit(state_name, enter_time);
7926 PG *pg = context< RecoveryMachine >().pg;
7927 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7928
7929 pg->blocked_by.clear();
7930 pg->backfill_reserved = false;
7931 pg->backfill_reserving = false;
7932 pg->state_clear(PG_STATE_ACTIVATING);
7933 pg->state_clear(PG_STATE_DEGRADED);
7934 pg->state_clear(PG_STATE_UNDERSIZED);
7935 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7936 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7937 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7938 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7939 utime_t dur = ceph_clock_now() - enter_time;
7940 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7941 pg->agent_stop();
7942 }
7943
7944 /*------ReplicaActive-----*/
7945 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7946 : my_base(ctx),
7947 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7948 {
7949 context< RecoveryMachine >().log_enter(state_name);
7950
7951 PG *pg = context< RecoveryMachine >().pg;
7952 pg->start_flush(
7953 context< RecoveryMachine >().get_cur_transaction(),
7954 context< RecoveryMachine >().get_on_applied_context_list(),
7955 context< RecoveryMachine >().get_on_safe_context_list());
7956 }
7957
7958
7959 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7960 const Activate& actevt) {
7961 PG *pg = context< RecoveryMachine >().pg;
7962 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7963 map<int, map<spg_t, pg_query_t> > query_map;
7964 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7965 actevt.activation_epoch,
7966 *context< RecoveryMachine >().get_on_safe_context_list(),
7967 query_map, NULL, NULL);
7968 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7969 return discard_event();
7970 }
7971
7972 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7973 {
7974 PG *pg = context< RecoveryMachine >().pg;
7975 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7976 infoevt.info);
7977 return discard_event();
7978 }
7979
7980 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7981 {
7982 PG *pg = context< RecoveryMachine >().pg;
7983 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7984 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7985 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7986 assert(pg->pg_log.get_head() == pg->info.last_update);
7987
7988 return discard_event();
7989 }
7990
7991 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7992 {
7993 PG *pg = context< RecoveryMachine >().pg;
7994 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7995 context< RecoveryMachine >().send_notify(
7996 pg->get_primary(),
7997 pg_notify_t(
7998 pg->get_primary().shard, pg->pg_whoami.shard,
7999 pg->get_osdmap()->get_epoch(),
8000 pg->get_osdmap()->get_epoch(),
8001 pg->info),
8002 pg->past_intervals);
8003 }
8004 pg->take_waiters();
8005 return discard_event();
8006 }
8007
8008 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8009 const MQuery& query)
8010 {
8011 PG *pg = context< RecoveryMachine >().pg;
8012 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8013 return discard_event();
8014 }
8015
8016 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
8017 {
8018 q.f->open_object_section("state");
8019 q.f->dump_string("name", state_name);
8020 q.f->dump_stream("enter_time") << enter_time;
8021 q.f->close_section();
8022 return forward_event();
8023 }
8024
8025 void PG::RecoveryState::ReplicaActive::exit()
8026 {
8027 context< RecoveryMachine >().log_exit(state_name, enter_time);
8028 PG *pg = context< RecoveryMachine >().pg;
8029 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8030 utime_t dur = ceph_clock_now() - enter_time;
8031 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
8032 }
8033
8034 /*-------Stray---*/
8035 PG::RecoveryState::Stray::Stray(my_context ctx)
8036 : my_base(ctx),
8037 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
8038 {
8039 context< RecoveryMachine >().log_enter(state_name);
8040
8041 PG *pg = context< RecoveryMachine >().pg;
8042 assert(!pg->is_peered());
8043 assert(!pg->is_peering());
8044 assert(!pg->is_primary());
8045 pg->start_flush(
8046 context< RecoveryMachine >().get_cur_transaction(),
8047 context< RecoveryMachine >().get_on_applied_context_list(),
8048 context< RecoveryMachine >().get_on_safe_context_list());
8049 }
8050
8051 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
8052 {
8053 PG *pg = context< RecoveryMachine >().pg;
8054 MOSDPGLog *msg = logevt.msg.get();
8055 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
8056
8057 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8058 if (msg->info.last_backfill == hobject_t()) {
8059 // restart backfill
8060 pg->unreg_next_scrub();
8061 pg->info = msg->info;
8062 pg->reg_next_scrub();
8063 pg->dirty_info = true;
8064 pg->dirty_big_info = true; // maybe.
8065
8066 PGLogEntryHandler rollbacker{pg, t};
8067 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
8068
8069 pg->pg_log.reset_backfill();
8070 } else {
8071 pg->merge_log(*t, msg->info, msg->log, logevt.from);
8072 }
8073
8074 assert(pg->pg_log.get_head() == pg->info.last_update);
8075
8076 post_event(Activate(logevt.msg->info.last_epoch_started));
8077 return transit<ReplicaActive>();
8078 }
8079
8080 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
8081 {
8082 PG *pg = context< RecoveryMachine >().pg;
8083 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
8084
8085 if (pg->info.last_update > infoevt.info.last_update) {
8086 // rewind divergent log entries
8087 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8088 pg->rewind_divergent_log(*t, infoevt.info.last_update);
8089 pg->info.stats = infoevt.info.stats;
8090 pg->info.hit_set = infoevt.info.hit_set;
8091 }
8092
8093 assert(infoevt.info.last_update == pg->info.last_update);
8094 assert(pg->pg_log.get_head() == pg->info.last_update);
8095
8096 post_event(Activate(infoevt.info.last_epoch_started));
8097 return transit<ReplicaActive>();
8098 }
8099
8100 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
8101 {
8102 PG *pg = context< RecoveryMachine >().pg;
8103 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
8104 return discard_event();
8105 }
8106
8107 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
8108 {
8109 PG *pg = context< RecoveryMachine >().pg;
8110 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8111 context< RecoveryMachine >().send_notify(
8112 pg->get_primary(),
8113 pg_notify_t(
8114 pg->get_primary().shard, pg->pg_whoami.shard,
8115 pg->get_osdmap()->get_epoch(),
8116 pg->get_osdmap()->get_epoch(),
8117 pg->info),
8118 pg->past_intervals);
8119 }
8120 pg->take_waiters();
8121 return discard_event();
8122 }
8123
8124 void PG::RecoveryState::Stray::exit()
8125 {
8126 context< RecoveryMachine >().log_exit(state_name, enter_time);
8127 PG *pg = context< RecoveryMachine >().pg;
8128 utime_t dur = ceph_clock_now() - enter_time;
8129 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
8130 }
8131
8132 /*--------GetInfo---------*/
8133 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
8134 : my_base(ctx),
8135 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
8136 {
8137 context< RecoveryMachine >().log_enter(state_name);
8138
8139 PG *pg = context< RecoveryMachine >().pg;
8140 pg->check_past_interval_bounds();
8141 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8142
8143 assert(pg->blocked_by.empty());
8144
8145 prior_set = pg->build_prior();
8146
8147 pg->reset_min_peer_features();
8148 get_infos();
8149 if (prior_set.pg_down) {
8150 post_event(IsDown());
8151 } else if (peer_info_requested.empty()) {
8152 post_event(GotInfo());
8153 }
8154 }
8155
8156 void PG::RecoveryState::GetInfo::get_infos()
8157 {
8158 PG *pg = context< RecoveryMachine >().pg;
8159 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8160
8161 pg->blocked_by.clear();
8162 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
8163 it != prior_set.probe.end();
8164 ++it) {
8165 pg_shard_t peer = *it;
8166 if (peer == pg->pg_whoami) {
8167 continue;
8168 }
8169 if (pg->peer_info.count(peer)) {
8170 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
8171 continue;
8172 }
8173 if (peer_info_requested.count(peer)) {
8174 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
8175 pg->blocked_by.insert(peer.osd);
8176 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
8177 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
8178 } else {
8179 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
8180 context< RecoveryMachine >().send_query(
8181 peer, pg_query_t(pg_query_t::INFO,
8182 it->shard, pg->pg_whoami.shard,
8183 pg->info.history,
8184 pg->get_osdmap()->get_epoch()));
8185 peer_info_requested.insert(peer);
8186 pg->blocked_by.insert(peer.osd);
8187 }
8188 }
8189
8190 pg->publish_stats_to_osd();
8191 }
8192
8193 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
8194 {
8195 PG *pg = context< RecoveryMachine >().pg;
8196
8197 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
8198 if (p != peer_info_requested.end()) {
8199 peer_info_requested.erase(p);
8200 pg->blocked_by.erase(infoevt.from.osd);
8201 }
8202
8203 epoch_t old_start = pg->info.history.last_epoch_started;
8204 if (pg->proc_replica_info(
8205 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
8206 // we got something new ...
8207 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8208 if (old_start < pg->info.history.last_epoch_started) {
8209 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
8210 prior_set = pg->build_prior();
8211
8212 // filter out any osds that got dropped from the probe set from
8213 // peer_info_requested. this is less expensive than restarting
8214 // peering (which would re-probe everyone).
8215 set<pg_shard_t>::iterator p = peer_info_requested.begin();
8216 while (p != peer_info_requested.end()) {
8217 if (prior_set.probe.count(*p) == 0) {
8218 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
8219 peer_info_requested.erase(p++);
8220 } else {
8221 ++p;
8222 }
8223 }
8224 get_infos();
8225 }
8226 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
8227 << hex << infoevt.features << dec << dendl;
8228 pg->apply_peer_features(infoevt.features);
8229
8230 // are we done getting everything?
8231 if (peer_info_requested.empty() && !prior_set.pg_down) {
8232 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
8233 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
8234 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
8235 post_event(GotInfo());
8236 }
8237 }
8238 return discard_event();
8239 }
8240
8241 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
8242 {
8243 PG *pg = context< RecoveryMachine >().pg;
8244 q.f->open_object_section("state");
8245 q.f->dump_string("name", state_name);
8246 q.f->dump_stream("enter_time") << enter_time;
8247
8248 q.f->open_array_section("requested_info_from");
8249 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
8250 p != peer_info_requested.end();
8251 ++p) {
8252 q.f->open_object_section("osd");
8253 q.f->dump_stream("osd") << *p;
8254 if (pg->peer_info.count(*p)) {
8255 q.f->open_object_section("got_info");
8256 pg->peer_info[*p].dump(q.f);
8257 q.f->close_section();
8258 }
8259 q.f->close_section();
8260 }
8261 q.f->close_section();
8262
8263 q.f->close_section();
8264 return forward_event();
8265 }
8266
8267 void PG::RecoveryState::GetInfo::exit()
8268 {
8269 context< RecoveryMachine >().log_exit(state_name, enter_time);
8270 PG *pg = context< RecoveryMachine >().pg;
8271 utime_t dur = ceph_clock_now() - enter_time;
8272 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
8273 pg->blocked_by.clear();
8274 pg->publish_stats_to_osd();
8275 }
8276
8277 /*------GetLog------------*/
8278 PG::RecoveryState::GetLog::GetLog(my_context ctx)
8279 : my_base(ctx),
8280 NamedState(
8281 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
8282 msg(0)
8283 {
8284 context< RecoveryMachine >().log_enter(state_name);
8285
8286 PG *pg = context< RecoveryMachine >().pg;
8287
8288 // adjust acting?
8289 if (!pg->choose_acting(auth_log_shard, false,
8290 &context< Peering >().history_les_bound)) {
8291 if (!pg->want_acting.empty()) {
8292 post_event(NeedActingChange());
8293 } else {
8294 post_event(IsIncomplete());
8295 }
8296 return;
8297 }
8298
8299 // am i the best?
8300 if (auth_log_shard == pg->pg_whoami) {
8301 post_event(GotLog());
8302 return;
8303 }
8304
8305 const pg_info_t& best = pg->peer_info[auth_log_shard];
8306
8307 // am i broken?
8308 if (pg->info.last_update < best.log_tail) {
8309 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
8310 post_event(IsIncomplete());
8311 return;
8312 }
8313
8314 // how much log to request?
8315 eversion_t request_log_from = pg->info.last_update;
8316 assert(!pg->actingbackfill.empty());
8317 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
8318 p != pg->actingbackfill.end();
8319 ++p) {
8320 if (*p == pg->pg_whoami) continue;
8321 pg_info_t& ri = pg->peer_info[*p];
8322 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
8323 ri.last_update < request_log_from)
8324 request_log_from = ri.last_update;
8325 }
8326
8327 // how much?
8328 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
8329 context<RecoveryMachine>().send_query(
8330 auth_log_shard,
8331 pg_query_t(
8332 pg_query_t::LOG,
8333 auth_log_shard.shard, pg->pg_whoami.shard,
8334 request_log_from, pg->info.history,
8335 pg->get_osdmap()->get_epoch()));
8336
8337 assert(pg->blocked_by.empty());
8338 pg->blocked_by.insert(auth_log_shard.osd);
8339 pg->publish_stats_to_osd();
8340 }
8341
8342 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
8343 {
8344 PG *pg = context< RecoveryMachine >().pg;
8345 // make sure our log source didn't go down. we need to check
8346 // explicitly because it may not be part of the prior set, which
8347 // means the Peering state check won't catch it going down.
8348 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
8349 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
8350 << auth_log_shard.osd << " went down" << dendl;
8351 post_event(advmap);
8352 return transit< Reset >();
8353 }
8354
8355 // let the Peering state do its checks.
8356 return forward_event();
8357 }
8358
8359 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
8360 {
8361 PG *pg = context< RecoveryMachine >().pg;
8362 assert(!msg);
8363 if (logevt.from != auth_log_shard) {
8364 ldout(pg->cct, 10) << "GetLog: discarding log from "
8365 << "non-auth_log_shard osd." << logevt.from << dendl;
8366 return discard_event();
8367 }
8368 ldout(pg->cct, 10) << "GetLog: received master log from osd"
8369 << logevt.from << dendl;
8370 msg = logevt.msg;
8371 post_event(GotLog());
8372 return discard_event();
8373 }
8374
8375 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
8376 {
8377 PG *pg = context< RecoveryMachine >().pg;
8378 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
8379 if (msg) {
8380 ldout(pg->cct, 10) << "processing master log" << dendl;
8381 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
8382 msg->info, msg->log, msg->missing,
8383 auth_log_shard);
8384 }
8385 pg->start_flush(
8386 context< RecoveryMachine >().get_cur_transaction(),
8387 context< RecoveryMachine >().get_on_applied_context_list(),
8388 context< RecoveryMachine >().get_on_safe_context_list());
8389 return transit< GetMissing >();
8390 }
8391
8392 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
8393 {
8394 q.f->open_object_section("state");
8395 q.f->dump_string("name", state_name);
8396 q.f->dump_stream("enter_time") << enter_time;
8397 q.f->dump_stream("auth_log_shard") << auth_log_shard;
8398 q.f->close_section();
8399 return forward_event();
8400 }
8401
8402 void PG::RecoveryState::GetLog::exit()
8403 {
8404 context< RecoveryMachine >().log_exit(state_name, enter_time);
8405 PG *pg = context< RecoveryMachine >().pg;
8406 utime_t dur = ceph_clock_now() - enter_time;
8407 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
8408 pg->blocked_by.clear();
8409 pg->publish_stats_to_osd();
8410 }
8411
8412 /*------WaitActingChange--------*/
8413 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
8414 : my_base(ctx),
8415 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
8416 {
8417 context< RecoveryMachine >().log_enter(state_name);
8418 }
8419
8420 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
8421 {
8422 PG *pg = context< RecoveryMachine >().pg;
8423 OSDMapRef osdmap = advmap.osdmap;
8424
8425 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
8426 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
8427 if (!osdmap->is_up(*p)) {
8428 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
8429 post_event(advmap);
8430 return transit< Reset >();
8431 }
8432 }
8433 return forward_event();
8434 }
8435
8436 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
8437 {
8438 PG *pg = context< RecoveryMachine >().pg;
8439 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
8440 return discard_event();
8441 }
8442
8443 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
8444 {
8445 PG *pg = context< RecoveryMachine >().pg;
8446 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
8447 return discard_event();
8448 }
8449
8450 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
8451 {
8452 PG *pg = context< RecoveryMachine >().pg;
8453 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
8454 return discard_event();
8455 }
8456
8457 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
8458 {
8459 q.f->open_object_section("state");
8460 q.f->dump_string("name", state_name);
8461 q.f->dump_stream("enter_time") << enter_time;
8462 q.f->dump_string("comment", "waiting for pg acting set to change");
8463 q.f->close_section();
8464 return forward_event();
8465 }
8466
8467 void PG::RecoveryState::WaitActingChange::exit()
8468 {
8469 context< RecoveryMachine >().log_exit(state_name, enter_time);
8470 PG *pg = context< RecoveryMachine >().pg;
8471 utime_t dur = ceph_clock_now() - enter_time;
8472 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
8473 }
8474
8475 /*------Down--------*/
8476 PG::RecoveryState::Down::Down(my_context ctx)
8477 : my_base(ctx),
8478 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
8479 {
8480 context< RecoveryMachine >().log_enter(state_name);
8481 PG *pg = context< RecoveryMachine >().pg;
8482
8483 pg->state_clear(PG_STATE_PEERING);
8484 pg->state_set(PG_STATE_DOWN);
8485
8486 auto &prior_set = context< Peering >().prior_set;
8487 assert(pg->blocked_by.empty());
8488 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8489 pg->publish_stats_to_osd();
8490 }
8491
8492 void PG::RecoveryState::Down::exit()
8493 {
8494 context< RecoveryMachine >().log_exit(state_name, enter_time);
8495 PG *pg = context< RecoveryMachine >().pg;
8496
8497 pg->state_clear(PG_STATE_DOWN);
8498 utime_t dur = ceph_clock_now() - enter_time;
8499 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
8500
8501 pg->blocked_by.clear();
8502 pg->publish_stats_to_osd();
8503 }
8504
8505 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8506 {
8507 q.f->open_object_section("state");
8508 q.f->dump_string("name", state_name);
8509 q.f->dump_stream("enter_time") << enter_time;
8510 q.f->dump_string("comment",
8511 "not enough up instances of this PG to go active");
8512 q.f->close_section();
8513 return forward_event();
8514 }
8515
8516 /*------Incomplete--------*/
8517 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8518 : my_base(ctx),
8519 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8520 {
8521 context< RecoveryMachine >().log_enter(state_name);
8522 PG *pg = context< RecoveryMachine >().pg;
8523
8524 pg->state_clear(PG_STATE_PEERING);
8525 pg->state_set(PG_STATE_INCOMPLETE);
8526
8527 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8528 assert(pg->blocked_by.empty());
8529 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8530 pg->publish_stats_to_osd();
8531 }
8532
8533 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8534 PG *pg = context< RecoveryMachine >().pg;
8535 int64_t poolnum = pg->info.pgid.pool();
8536
8537 // Reset if min_size turn smaller than previous value, pg might now be able to go active
8538 if (!advmap.osdmap->have_pg_pool(poolnum) ||
8539 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8540 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8541 post_event(advmap);
8542 return transit< Reset >();
8543 }
8544
8545 return forward_event();
8546 }
8547
8548 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8549 PG *pg = context< RecoveryMachine >().pg;
8550 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8551 if (pg->proc_replica_info(
8552 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8553 // We got something new, try again!
8554 return transit< GetLog >();
8555 } else {
8556 return discard_event();
8557 }
8558 }
8559
8560 boost::statechart::result PG::RecoveryState::Incomplete::react(
8561 const QueryState& q)
8562 {
8563 q.f->open_object_section("state");
8564 q.f->dump_string("name", state_name);
8565 q.f->dump_stream("enter_time") << enter_time;
8566 q.f->dump_string("comment", "not enough complete instances of this PG");
8567 q.f->close_section();
8568 return forward_event();
8569 }
8570
8571 void PG::RecoveryState::Incomplete::exit()
8572 {
8573 context< RecoveryMachine >().log_exit(state_name, enter_time);
8574 PG *pg = context< RecoveryMachine >().pg;
8575
8576 pg->state_clear(PG_STATE_INCOMPLETE);
8577 utime_t dur = ceph_clock_now() - enter_time;
8578 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8579
8580 pg->blocked_by.clear();
8581 pg->publish_stats_to_osd();
8582 }
8583
8584 /*------GetMissing--------*/
8585 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8586 : my_base(ctx),
8587 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8588 {
8589 context< RecoveryMachine >().log_enter(state_name);
8590
8591 PG *pg = context< RecoveryMachine >().pg;
8592 assert(!pg->actingbackfill.empty());
8593 eversion_t since;
8594 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8595 i != pg->actingbackfill.end();
8596 ++i) {
8597 if (*i == pg->get_primary()) continue;
8598 const pg_info_t& pi = pg->peer_info[*i];
8599 // reset this so to make sure the pg_missing_t is initialized and
8600 // has the correct semantics even if we don't need to get a
8601 // missing set from a shard. This way later additions due to
8602 // lost+unfound delete work properly.
8603 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8604
8605 if (pi.is_empty())
8606 continue; // no pg data, nothing divergent
8607
8608 if (pi.last_update < pg->pg_log.get_tail()) {
8609 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8610 pg->peer_missing[*i].clear();
8611 continue;
8612 }
8613 if (pi.last_backfill == hobject_t()) {
8614 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8615 pg->peer_missing[*i].clear();
8616 continue;
8617 }
8618
8619 if (pi.last_update == pi.last_complete && // peer has no missing
8620 pi.last_update == pg->info.last_update) { // peer is up to date
8621 // replica has no missing and identical log as us. no need to
8622 // pull anything.
8623 // FIXME: we can do better here. if last_update==last_complete we
8624 // can infer the rest!
8625 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8626 pg->peer_missing[*i].clear();
8627 continue;
8628 }
8629
8630 // We pull the log from the peer's last_epoch_started to ensure we
8631 // get enough log to detect divergent updates.
8632 since.epoch = pi.last_epoch_started;
8633 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8634 if (pi.log_tail <= since) {
8635 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8636 context< RecoveryMachine >().send_query(
8637 *i,
8638 pg_query_t(
8639 pg_query_t::LOG,
8640 i->shard, pg->pg_whoami.shard,
8641 since, pg->info.history,
8642 pg->get_osdmap()->get_epoch()));
8643 } else {
8644 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8645 << " (want since " << since << " < log.tail "
8646 << pi.log_tail << ")" << dendl;
8647 context< RecoveryMachine >().send_query(
8648 *i, pg_query_t(
8649 pg_query_t::FULLLOG,
8650 i->shard, pg->pg_whoami.shard,
8651 pg->info.history, pg->get_osdmap()->get_epoch()));
8652 }
8653 peer_missing_requested.insert(*i);
8654 pg->blocked_by.insert(i->osd);
8655 }
8656
8657 if (peer_missing_requested.empty()) {
8658 if (pg->need_up_thru) {
8659 ldout(pg->cct, 10) << " still need up_thru update before going active"
8660 << dendl;
8661 post_event(NeedUpThru());
8662 return;
8663 }
8664
8665 // all good!
8666 post_event(Activate(pg->get_osdmap()->get_epoch()));
8667 } else {
8668 pg->publish_stats_to_osd();
8669 }
8670 }
8671
8672 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8673 {
8674 PG *pg = context< RecoveryMachine >().pg;
8675
8676 peer_missing_requested.erase(logevt.from);
8677 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8678
8679 if (peer_missing_requested.empty()) {
8680 if (pg->need_up_thru) {
8681 ldout(pg->cct, 10) << " still need up_thru update before going active"
8682 << dendl;
8683 post_event(NeedUpThru());
8684 } else {
8685 ldout(pg->cct, 10) << "Got last missing, don't need missing "
8686 << "posting Activate" << dendl;
8687 post_event(Activate(pg->get_osdmap()->get_epoch()));
8688 }
8689 }
8690 return discard_event();
8691 }
8692
8693 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8694 {
8695 PG *pg = context< RecoveryMachine >().pg;
8696 q.f->open_object_section("state");
8697 q.f->dump_string("name", state_name);
8698 q.f->dump_stream("enter_time") << enter_time;
8699
8700 q.f->open_array_section("peer_missing_requested");
8701 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8702 p != peer_missing_requested.end();
8703 ++p) {
8704 q.f->open_object_section("osd");
8705 q.f->dump_stream("osd") << *p;
8706 if (pg->peer_missing.count(*p)) {
8707 q.f->open_object_section("got_missing");
8708 pg->peer_missing[*p].dump(q.f);
8709 q.f->close_section();
8710 }
8711 q.f->close_section();
8712 }
8713 q.f->close_section();
8714
8715 q.f->close_section();
8716 return forward_event();
8717 }
8718
8719 void PG::RecoveryState::GetMissing::exit()
8720 {
8721 context< RecoveryMachine >().log_exit(state_name, enter_time);
8722 PG *pg = context< RecoveryMachine >().pg;
8723 utime_t dur = ceph_clock_now() - enter_time;
8724 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8725 pg->blocked_by.clear();
8726 pg->publish_stats_to_osd();
8727 }
8728
8729 /*------WaitUpThru--------*/
8730 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8731 : my_base(ctx),
8732 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8733 {
8734 context< RecoveryMachine >().log_enter(state_name);
8735 }
8736
8737 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8738 {
8739 PG *pg = context< RecoveryMachine >().pg;
8740 if (!pg->need_up_thru) {
8741 post_event(Activate(pg->get_osdmap()->get_epoch()));
8742 }
8743 return forward_event();
8744 }
8745
8746 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8747 {
8748 PG *pg = context< RecoveryMachine >().pg;
8749 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8750 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8751 pg->peer_info[logevt.from] = logevt.msg->info;
8752 return discard_event();
8753 }
8754
8755 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8756 {
8757 q.f->open_object_section("state");
8758 q.f->dump_string("name", state_name);
8759 q.f->dump_stream("enter_time") << enter_time;
8760 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8761 q.f->close_section();
8762 return forward_event();
8763 }
8764
8765 void PG::RecoveryState::WaitUpThru::exit()
8766 {
8767 context< RecoveryMachine >().log_exit(state_name, enter_time);
8768 PG *pg = context< RecoveryMachine >().pg;
8769 utime_t dur = ceph_clock_now() - enter_time;
8770 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8771 }
8772
8773 /*----RecoveryState::RecoveryMachine Methods-----*/
8774 #undef dout_prefix
8775 #define dout_prefix *_dout << pg->gen_prefix()
8776
8777 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8778 {
8779 PG *pg = context< RecoveryMachine >().pg;
8780 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8781 pg->osd->pg_recovery_stats.log_enter(state_name);
8782 }
8783
8784 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8785 {
8786 utime_t dur = ceph_clock_now() - enter_time;
8787 PG *pg = context< RecoveryMachine >().pg;
8788 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8789 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8790 event_count, event_time);
8791 event_count = 0;
8792 event_time = utime_t();
8793 }
8794
8795
8796 /*---------------------------------------------------*/
8797 #undef dout_prefix
8798 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8799
8800 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8801 assert(!rctx);
8802 assert(!orig_ctx);
8803 orig_ctx = new_ctx;
8804 if (new_ctx) {
8805 if (messages_pending_flush) {
8806 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8807 } else {
8808 rctx = *new_ctx;
8809 }
8810 rctx->start_time = ceph_clock_now();
8811 }
8812 }
8813
8814 void PG::RecoveryState::begin_block_outgoing() {
8815 assert(!messages_pending_flush);
8816 assert(orig_ctx);
8817 assert(rctx);
8818 messages_pending_flush = BufferedRecoveryMessages();
8819 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8820 }
8821
8822 void PG::RecoveryState::clear_blocked_outgoing() {
8823 assert(orig_ctx);
8824 assert(rctx);
8825 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8826 }
8827
8828 void PG::RecoveryState::end_block_outgoing() {
8829 assert(messages_pending_flush);
8830 assert(orig_ctx);
8831 assert(rctx);
8832
8833 rctx = RecoveryCtx(*orig_ctx);
8834 rctx->accept_buffered_messages(*messages_pending_flush);
8835 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8836 }
8837
8838 void PG::RecoveryState::end_handle() {
8839 if (rctx) {
8840 utime_t dur = ceph_clock_now() - rctx->start_time;
8841 machine.event_time += dur;
8842 }
8843
8844 machine.event_count++;
8845 rctx = boost::optional<RecoveryCtx>();
8846 orig_ctx = NULL;
8847 }
8848
8849 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8850 {
8851 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8852 << " " << bi.objects.size() << " objects";
8853 if (!bi.objects.empty())
8854 out << " " << bi.objects;
8855 out << ")";
8856 return out;
8857 }
8858
8859 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8860 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8861
8862 #ifdef PG_DEBUG_REFS
8863 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8864 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8865 #endif