1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
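// Record entry into a peering state-machine state. Trimming states are
// ignored; if no PGStateInstance is open yet the entry is buffered in tmppi
// (the PG lock cannot be taken reliably here) and is committed to the
// history when exit() runs.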
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if ( tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if(!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if(ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
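// Intrusive reference counting for PG objects. With PG_DEBUG_REFS each
// get/put is additionally tracked per tag so leaked references can be
// attributed via dump_live_ids().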
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179   if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\tid: " << *i << dendl;
224 }
225 }
226 #endif
227
228
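// Refresh the cached pool metadata from a newer OSDMap and compute
// newly_removed_snaps, the snaps that were removed since the epoch we last
// cached; cached_removed_snaps accumulates the full removed set.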
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 pi->build_removed_snaps(newly_removed_snaps);
241 interval_set<snapid_t> intersection;
242 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243 if (intersection == cached_removed_snaps) {
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.union_of(newly_removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps = newly_removed_snaps;
251 newly_removed_snaps.clear();
252 }
253 snapc = pi->get_snap_context();
254 } else {
255 /* 1) map->get_epoch() == cached_epoch + 1 &&
256 * 2) pi->get_snap_epoch() != map->get_epoch()
257 *
258    * Since the if branch was not taken, 1 && 2 must be true. From 2, we
259    * know that
259 * this map didn't change the set of removed snaps. From 1, we
260 * know that our cached_removed_snaps matches the previous map.
261    * Thus, from 1 && 2, cached_removed_snaps matches the current
262 * set of removed snaps and all we have to do is clear
263 * newly_removed_snaps.
264 */
265 newly_removed_snaps.clear();
266 }
267 cached_epoch = map->get_epoch();
268 lgeneric_subdout(cct, osd, 20)
269 << "PGPool::update cached_removed_snaps "
270 << cached_removed_snaps
271 << " newly_removed_snaps "
272 << newly_removed_snaps
273 << " snapc " << snapc
274 << (updated ? " (updated)":" (no change)")
275 << dendl;
276 }
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279 const PGPool &_pool, spg_t p) :
280 osd(o),
281 cct(o->cct),
282 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283 snap_mapper(
284 cct,
285 &osdriver,
286 p.ps(),
287 p.get_split_bits(curmap->get_pg_num(_pool.id)),
288 _pool.id,
289 p.shard),
290 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291 _lock("PG::_lock"),
292 #ifdef PG_DEBUG_REFS
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294 #endif
295 deleting(false),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
298 info(p),
299 info_struct_v(0),
300 coll(p),
301 pg_log(cct),
302 pgmeta_oid(p.make_pgmeta_oid()),
303 missing_loc(this),
304 past_intervals(
305 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306 *curmap),
307 stat_queue_item(this),
308 scrub_queued(false),
309 recovery_queued(false),
310 recovery_ops_active(0),
311 role(-1),
312 state(0),
313 send_notify(false),
314 pg_whoami(osd->whoami, p.shard),
315 need_up_thru(false),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324 finish_sync_event(NULL),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
327 active_pushes(0),
328 recovery_state(this),
329 pg_id(p),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336 osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339 std::stringstream ss;
340 ss << "PG " << info.pgid;
341 trace_endpoint.copy_name(ss.str());
342 #endif
343 osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348 pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350 osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356 handle.suspend_tp_timeout();
357 lock();
358 handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363 _lock.Lock(no_lockdep);
364 // if we have unrecorded dirty state with the lock dropped, there is a bug
365 assert(!dirty_info);
366 assert(!dirty_big_info);
367
368 dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373 stringstream out;
374 OSDMapRef mapref = osdmap_ref;
375 if (_lock.is_locked_by_me()) {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " " << *this << " ";
379 } else {
380 out << "osd." << osd->whoami
381 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382 << " pg[" << info.pgid << "(unlocked)] ";
383 }
384 return out.str();
385 }
386
387 /********* PG **********/
388
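// Primary-only: merge the authoritative log chosen during peering into our
// own, record the sender's info and missing set, and advance
// last_epoch_started / last_interval_started if the sender's are newer.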
389 void PG::proc_master_log(
390 ObjectStore::Transaction& t, pg_info_t &oinfo,
391 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393 dout(10) << "proc_master_log for osd." << from << ": "
394 << olog << " " << omissing << dendl;
395 assert(!is_peered() && is_primary());
396
397 // merge log into our own log to build master log. no need to
398 // make any adjustments to their missing map; we are taking their
399   // log to be authoritative (i.e., their entries are by definition
400 // non-divergent).
401 merge_log(t, oinfo, olog, from);
402 peer_info[from] = oinfo;
403 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404 might_have_unfound.insert(from);
405
406 // See doc/dev/osd_internals/last_epoch_started
407 if (oinfo.last_epoch_started > info.last_epoch_started) {
408 info.last_epoch_started = oinfo.last_epoch_started;
409 dirty_info = true;
410 }
411 if (oinfo.last_interval_started > info.last_interval_started) {
412 info.last_interval_started = oinfo.last_interval_started;
413 dirty_info = true;
414 }
415 update_history(oinfo.history);
416 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417 info.last_epoch_started >= info.history.last_epoch_started);
418
419 peer_missing[from].claim(omissing);
420 }
421
422 void PG::proc_replica_log(
423 pg_info_t &oinfo,
424 const pg_log_t &olog,
425 pg_missing_t& omissing,
426 pg_shard_t from)
427 {
428 dout(10) << "proc_replica_log for osd." << from << ": "
429 << oinfo << " " << olog << " " << omissing << dendl;
430
431 pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433 peer_info[from] = oinfo;
434 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435 might_have_unfound.insert(from);
436
437 for (map<hobject_t, pg_missing_item>::const_iterator i =
438 omissing.get_items().begin();
439 i != omissing.get_items().end();
440 ++i) {
441 dout(20) << " after missing " << i->first << " need " << i->second.need
442 << " have " << i->second.have << dendl;
443 }
444 peer_missing[from].claim(omissing);
445 }
446
447 bool PG::proc_replica_info(
448 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453 return false;
454 }
455
456 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457 dout(10) << " got info " << oinfo << " from down osd." << from
458 << " discarding" << dendl;
459 return false;
460 }
461
462 dout(10) << " got osd." << from << " " << oinfo << dendl;
463 assert(is_primary());
464 peer_info[from] = oinfo;
465 might_have_unfound.insert(from);
466
467 update_history(oinfo.history);
468
469 // stray?
470 if (!is_up(from) && !is_acting(from)) {
471 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472 stray_set.insert(from);
473 if (is_clean()) {
474 purge_strays();
475 }
476 }
477
478 // was this a new info? if so, update peers!
479 if (p == peer_info.end())
480 update_heartbeat_peers();
481
482 return true;
483 }
484
485 void PG::remove_snap_mapped_object(
486 ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488 t.remove(
489 coll,
490 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491 clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495 ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498 if (soid.snap < CEPH_MAXSNAP) {
499 int r = snap_mapper.remove_oid(
500 soid,
501 &_t);
502 if (!(r == 0 || r == -ENOENT)) {
503 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504 ceph_abort();
505 }
506 }
507 }
508
509 void PG::update_object_snap_mapping(
510 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513 assert(soid.snap < CEPH_MAXSNAP);
514 int r = snap_mapper.remove_oid(
515 soid,
516 &_t);
517 if (!(r == 0 || r == -ENOENT)) {
518 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519 ceph_abort();
520 }
521 snap_mapper.add_oid(
522 soid,
523 snaps,
524 &_t);
525 }
526
527 void PG::merge_log(
528 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530 PGLogEntryHandler rollbacker{this, &t};
531 pg_log.merge_log(
532 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537 PGLogEntryHandler rollbacker{this, &t};
538 pg_log.rewind_divergent_log(
539 newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543 * Process information from a replica to determine if it could have any
544  * objects that I need.
545 *
546 * TODO: if the missing set becomes very large, this could get expensive.
547 * Instead, we probably want to just iterate over our unfound set.
548 */
549 bool PG::search_for_missing(
550 const pg_info_t &oinfo, const pg_missing_t &omissing,
551 pg_shard_t from,
552 RecoveryCtx *ctx)
553 {
554 uint64_t num_unfound_before = missing_loc.num_unfound();
555 bool found_missing = missing_loc.add_source_info(
556 from, oinfo, omissing, ctx->handle);
557 if (found_missing && num_unfound_before != missing_loc.num_unfound())
558 publish_stats_to_osd();
559 if (found_missing &&
560 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
561 CEPH_FEATURE_OSD_ERASURE_CODES)) {
562 pg_info_t tinfo(oinfo);
563 tinfo.pgid.shard = pg_whoami.shard;
564 (*(ctx->info_map))[from.osd].push_back(
565 make_pair(
566 pg_notify_t(
567 from.shard, pg_whoami.shard,
568 get_osdmap()->get_epoch(),
569 get_osdmap()->get_epoch(),
570 tinfo),
571 past_intervals));
572 }
573 return found_missing;
574 }
575
576 bool PG::MissingLoc::readable_with_acting(
577 const hobject_t &hoid,
578 const set<pg_shard_t> &acting) const {
579 if (!needs_recovery(hoid))
580 return true;
581 if (is_deleted(hoid))
582 return false;
583 auto missing_loc_entry = missing_loc.find(hoid);
584 if (missing_loc_entry == missing_loc.end())
585 return false;
586 const set<pg_shard_t> &locs = missing_loc_entry->second;
587 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
588 set<pg_shard_t> have_acting;
589 for (set<pg_shard_t>::const_iterator i = locs.begin();
590 i != locs.end();
591 ++i) {
592 if (acting.count(*i))
593 have_acting.insert(*i);
594 }
595 return (*is_readable)(have_acting);
596 }
597
598 void PG::MissingLoc::add_batch_sources_info(
599 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
600 {
601 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
602 << sources.size() << dendl;
603 unsigned loop = 0;
604 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
605 i != needs_recovery_map.end();
606 ++i) {
607 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
608 handle->reset_tp_timeout();
609 loop = 0;
610 }
611 if (i->second.is_delete())
612 continue;
613 missing_loc[i->first].insert(sources.begin(), sources.end());
614 missing_loc_sources.insert(sources.begin(), sources.end());
615 }
616 }
617
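// Record fromosd as a candidate location for each object in
// needs_recovery_map that it can actually provide, skipping objects beyond
// its last_backfill, objects with a mismatched backfill sort order, and
// objects whose needed version it does not have.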
618 bool PG::MissingLoc::add_source_info(
619 pg_shard_t fromosd,
620 const pg_info_t &oinfo,
621 const pg_missing_t &omissing,
622 ThreadPool::TPHandle* handle)
623 {
624 bool found_missing = false;
625 unsigned loop = 0;
626 // found items?
627 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
628 p != needs_recovery_map.end();
629 ++p) {
630 const hobject_t &soid(p->first);
631 eversion_t need = p->second.need;
632 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
633 handle->reset_tp_timeout();
634 loop = 0;
635 }
636 if (p->second.is_delete()) {
637 ldout(pg->cct, 10) << __func__ << " " << soid
638 << " delete, ignoring source" << dendl;
639 found_missing = true;
640 continue;
641 }
642 if (oinfo.last_update < need) {
643 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
644 << " also missing on osd." << fromosd
645 << " (last_update " << oinfo.last_update
646 << " < needed " << need << ")" << dendl;
647 continue;
648 }
649 if (!oinfo.last_backfill.is_max() &&
650 !oinfo.last_backfill_bitwise) {
651 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
652 << " also missing on osd." << fromosd
653 << " (last_backfill " << oinfo.last_backfill
654 << " but with wrong sort order)"
655 << dendl;
656 continue;
657 }
658 if (p->first >= oinfo.last_backfill) {
659 // FIXME: this is _probably_ true, although it could conceivably
660 // be in the undefined region! Hmm!
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (past last_backfill " << oinfo.last_backfill
664 << ")" << dendl;
665 continue;
666 }
667 if (oinfo.last_complete < need) {
668 if (omissing.is_missing(soid)) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd << dendl;
671 continue;
672 }
673 }
674
675 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
676 << " is on osd." << fromosd << dendl;
677
678 missing_loc[soid].insert(fromosd);
679 missing_loc_sources.insert(fromosd);
680 found_missing = true;
681 }
682
683 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
684 << dendl;
685 return found_missing;
686 }
687
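// Ask every up, non-empty peer in might_have_unfound that we have not
// already queried (for its log or missing set) for a FULLLOG, so that the
// locations of unfound objects can be discovered.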
688 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
689 {
690 auto &missing = pg_log.get_missing();
691 uint64_t unfound = get_num_unfound();
692 assert(unfound > 0);
693
694 dout(10) << __func__ << " "
695 << missing.num_missing() << " missing, "
696 << unfound << " unfound"
697 << dendl;
698
699 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
700 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
701 for (; m != mend; ++m) {
702 pg_shard_t peer(*m);
703
704 if (!get_osdmap()->is_up(peer.osd)) {
705 dout(20) << __func__ << " skipping down osd." << peer << dendl;
706 continue;
707 }
708
709 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
710 if (iter != peer_info.end() &&
711 (iter->second.is_empty() || iter->second.dne())) {
712 // ignore empty peers
713 continue;
714 }
715
716 // If we've requested any of this stuff, the pg_missing_t information
717 // should be on its way.
718     // TODO: coalesce requested_* into a single data structure
719 if (peer_missing.find(peer) != peer_missing.end()) {
720 dout(20) << __func__ << ": osd." << peer
721 << ": we already have pg_missing_t" << dendl;
722 continue;
723 }
724 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
725 dout(20) << __func__ << ": osd." << peer
726 << ": in peer_log_requested" << dendl;
727 continue;
728 }
729 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
730 dout(20) << __func__ << ": osd." << peer
731 << ": in peer_missing_requested" << dendl;
732 continue;
733 }
734
735 // Request missing
736 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
737 << dendl;
738 peer_missing_requested.insert(peer);
739 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
740 pg_query_t(
741 pg_query_t::FULLLOG,
742 peer.shard, pg_whoami.shard,
743 info.history, get_osdmap()->get_epoch());
744 }
745 }
746
747 /******* PG ***********/
748 bool PG::needs_recovery() const
749 {
750 assert(is_primary());
751
752 auto &missing = pg_log.get_missing();
753
754 if (missing.num_missing()) {
755 dout(10) << __func__ << " primary has " << missing.num_missing()
756 << " missing" << dendl;
757 return true;
758 }
759
760 assert(!actingbackfill.empty());
761 set<pg_shard_t>::const_iterator end = actingbackfill.end();
762 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
763 for (; a != end; ++a) {
764 if (*a == get_primary()) continue;
765 pg_shard_t peer = *a;
766 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
767 if (pm == peer_missing.end()) {
768 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
769 << dendl;
770 continue;
771 }
772 if (pm->second.num_missing()) {
773 dout(10) << __func__ << " osd." << peer << " has "
774 << pm->second.num_missing() << " missing" << dendl;
775 return true;
776 }
777 }
778
779 dout(10) << __func__ << " is recovered" << dendl;
780 return false;
781 }
782
783 bool PG::needs_backfill() const
784 {
785 assert(is_primary());
786
787   // We can assume that the only OSDs that might need backfill
788   // are those in backfill_targets.
789 set<pg_shard_t>::const_iterator end = backfill_targets.end();
790 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
791 for (; a != end; ++a) {
792 pg_shard_t peer = *a;
793 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
794 if (!pi->second.last_backfill.is_max()) {
795 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
796 return true;
797 }
798 }
799
800 dout(10) << __func__ << " does not need backfill" << dendl;
801 return false;
802 }
803
804
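// Sanity-check that past_intervals matches the bounds required by our info
// and the oldest stored map: log a cluster error on any discrepancy and
// assert when stored intervals are missing or misaligned.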
805 void PG::check_past_interval_bounds() const
806 {
807 auto rpib = get_required_past_interval_bounds(
808 info,
809 osd->get_superblock().oldest_map);
810 if (rpib.first >= rpib.second) {
811 if (!past_intervals.empty()) {
812 osd->clog->error() << info.pgid << " required past_interval bounds are"
813 << " empty [" << rpib << ") but past_intervals is not: "
814 << past_intervals;
815 derr << info.pgid << " required past_interval bounds are"
816 << " empty [" << rpib << ") but past_intervals is not: "
817 << past_intervals << dendl;
818 }
819 } else {
820 if (past_intervals.empty()) {
821 osd->clog->error() << info.pgid << " required past_interval bounds are"
822 << " not empty [" << rpib << ") but past_intervals "
823 << past_intervals << " is empty";
824 derr << info.pgid << " required past_interval bounds are"
825 << " not empty [" << rpib << ") but past_intervals "
826 << past_intervals << " is empty" << dendl;
827 assert(!past_intervals.empty());
828 }
829
830 auto apib = past_intervals.get_bounds();
831 if (apib.first > rpib.first) {
832 osd->clog->error() << info.pgid << " past_intervals [" << apib
833 << ") start interval does not contain the required"
834 << " bound [" << rpib << ") start";
835 derr << info.pgid << " past_intervals [" << apib
836 << ") start interval does not contain the required"
837 << " bound [" << rpib << ") start" << dendl;
838 assert(0 == "past_interval start interval mismatch");
839 }
840 if (apib.second != rpib.second) {
841       osd->clog->error() << info.pgid << " past_interval bound [" << apib
842 << ") end does not match required [" << rpib
843 << ") end";
844       derr << info.pgid << " past_interval bound [" << apib
845 << ") end does not match required [" << rpib
846 << ") end" << dendl;
847 assert(0 == "past_interval end mismatch");
848 }
849 }
850 }
851
852 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
853 {
854 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
855 if (need_up_thru &&
856 up_thru >= info.history.same_interval_since) {
857 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
858 need_up_thru = false;
859 return true;
860 }
861 return false;
862 }
863
864 void PG::remove_down_peer_info(const OSDMapRef osdmap)
865 {
866 // Remove any downed osds from peer_info
867 bool removed = false;
868 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
869 while (p != peer_info.end()) {
870 if (!osdmap->is_up(p->first.osd)) {
871 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
872 peer_missing.erase(p->first);
873 peer_log_requested.erase(p->first);
874 peer_missing_requested.erase(p->first);
875 peer_info.erase(p++);
876 removed = true;
877 } else
878 ++p;
879 }
880
881 // if we removed anyone, update peers (which include peer_info)
882 if (removed)
883 update_heartbeat_peers();
884 check_recovery_sources(osdmap);
885 }
886
887 /*
888 * Returns true unless there is a non-lost OSD in might_have_unfound.
889 */
890 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
891 {
892 assert(is_primary());
893
894 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
895 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
896 for (; peer != mend; ++peer) {
897 if (peer_missing.count(*peer))
898 continue;
899 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
900 if (iter != peer_info.end() &&
901 (iter->second.is_empty() || iter->second.dne()))
902 continue;
903 if (!osdmap->exists(peer->osd))
904 continue;
905 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
906 if (osd_info.lost_at <= osd_info.up_from) {
907 // If there is even one OSD in might_have_unfound that isn't lost, we
908 // still might retrieve our unfound.
909 return false;
910 }
911 }
912 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
913 << " have been queried or are marked lost" << dendl;
914 return true;
915 }
916
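// Build the set of OSDs to probe during peering from past_intervals,
// classifying each prior OSD as UP, DNE, LOST or DOWN; mark the PG down if
// peering is blocked and decide whether up_thru must be published to the
// monitor first.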
917 PastIntervals::PriorSet PG::build_prior()
918 {
919 if (1) {
920 // sanity check
921 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
922 it != peer_info.end();
923 ++it) {
924 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
925 }
926 }
927
928 const OSDMap &osdmap = *get_osdmap();
929 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
930 pool.info.ec_pool(),
931 info.history.last_epoch_started,
932 get_pgbackend()->get_is_recoverable_predicate(),
933 [&](epoch_t start, int osd, epoch_t *lost_at) {
934 const osd_info_t *pinfo = 0;
935 if (osdmap.exists(osd)) {
936 pinfo = &osdmap.get_info(osd);
937 if (lost_at)
938 *lost_at = pinfo->lost_at;
939 }
940
941 if (osdmap.is_up(osd)) {
942 return PastIntervals::UP;
943 } else if (!pinfo) {
944 return PastIntervals::DNE;
945 } else if (pinfo->lost_at > start) {
946 return PastIntervals::LOST;
947 } else {
948 return PastIntervals::DOWN;
949 }
950 },
951 up,
952 acting,
953 this);
954
955 if (prior.pg_down) {
956 state_set(PG_STATE_DOWN);
957 }
958
959 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
960 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
961 << " < same_since " << info.history.same_interval_since
962 << ", must notify monitor" << dendl;
963 need_up_thru = true;
964 } else {
965 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
966 << " >= same_since " << info.history.same_interval_since
967 << ", all is well" << dendl;
968 need_up_thru = false;
969 }
970 set_probe_targets(prior.probe);
971 return prior;
972 }
973
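// Drop all primary-only peering and recovery state (peer infos and missing
// sets, missing_loc, scrub reservations, ...) so it can be rebuilt on the
// next peering attempt.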
974 void PG::clear_primary_state()
975 {
976 dout(10) << "clear_primary_state" << dendl;
977
978 // clear peering state
979 stray_set.clear();
980 peer_log_requested.clear();
981 peer_missing_requested.clear();
982 peer_info.clear();
983 peer_missing.clear();
984 need_up_thru = false;
985 peer_last_complete_ondisk.clear();
986 peer_activated.clear();
987 min_last_complete_ondisk = eversion_t();
988 pg_trim_to = eversion_t();
989 might_have_unfound.clear();
990 projected_log = PGLog::IndexedLog();
991
992 last_update_ondisk = eversion_t();
993
994 snap_trimq.clear();
995
996 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
997
998 missing_loc.clear();
999
1000 release_pg_backoffs();
1001
1002 pg_log.reset_recovery_pointers();
1003
1004 scrubber.reserved_peers.clear();
1005 scrub_after_recovery = false;
1006
1007 agent_clear();
1008 }
1009
1010 PG::Scrubber::Scrubber()
1011 : reserved(false), reserve_failed(false),
1012 epoch_start(0),
1013 active(false),
1014 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
1015 must_scrub(false), must_deep_scrub(false), must_repair(false),
1016 auto_repair(false),
1017 num_digest_updates_pending(0),
1018 state(INACTIVE),
1019 deep(false),
1020 seed(0)
1021 {}
1022
1023 PG::Scrubber::~Scrubber() {}
1024
1025 /**
1026 * find_best_info
1027 *
1028 * Returns an iterator to the best info in infos sorted by:
1029 * 1) Prefer newer last_update
1030 * 2) Prefer longer tail if it brings another info into contiguity
1031 * 3) Prefer current primary
1032 */
1033 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1034 const map<pg_shard_t, pg_info_t> &infos,
1035 bool restrict_to_up_acting,
1036 bool *history_les_bound) const
1037 {
1038 assert(history_les_bound);
1039 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1040 * to make changes to this process. Also, make sure to update it
1041 * when you find bugs! */
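  // Two passes: first find the newest last_epoch_started across the infos,
  // then take min_last_update_acceptable as the oldest last_update among
  // infos that reached that epoch; anything older cannot be authoritative.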
1042 eversion_t min_last_update_acceptable = eversion_t::max();
1043 epoch_t max_last_epoch_started_found = 0;
1044 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1045 i != infos.end();
1046 ++i) {
1047 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1048 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1049 *history_les_bound = true;
1050 max_last_epoch_started_found = i->second.history.last_epoch_started;
1051 }
1052 if (!i->second.is_incomplete() &&
1053 max_last_epoch_started_found < i->second.last_epoch_started) {
1054 max_last_epoch_started_found = i->second.last_epoch_started;
1055 }
1056 }
1057 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1058 i != infos.end();
1059 ++i) {
1060 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1061 if (min_last_update_acceptable > i->second.last_update)
1062 min_last_update_acceptable = i->second.last_update;
1063 }
1064 }
1065 if (min_last_update_acceptable == eversion_t::max())
1066 return infos.end();
1067
1068 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1069 // find osd with newest last_update (oldest for ec_pool).
1070 // if there are multiples, prefer
1071 // - a longer tail, if it brings another peer into log contiguity
1072 // - the current primary
1073 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1074 p != infos.end();
1075 ++p) {
1076 if (restrict_to_up_acting && !is_up(p->first) &&
1077 !is_acting(p->first))
1078 continue;
1079 // Only consider peers with last_update >= min_last_update_acceptable
1080 if (p->second.last_update < min_last_update_acceptable)
1081 continue;
1082 // Disqualify anyone with a too old last_epoch_started
1083 if (p->second.last_epoch_started < max_last_epoch_started_found)
1084 continue;
1085 // Disqualify anyone who is incomplete (not fully backfilled)
1086 if (p->second.is_incomplete())
1087 continue;
1088 if (best == infos.end()) {
1089 best = p;
1090 continue;
1091 }
1092 // Prefer newer last_update
1093 if (pool.info.require_rollback()) {
1094 if (p->second.last_update > best->second.last_update)
1095 continue;
1096 if (p->second.last_update < best->second.last_update) {
1097 best = p;
1098 continue;
1099 }
1100 } else {
1101 if (p->second.last_update < best->second.last_update)
1102 continue;
1103 if (p->second.last_update > best->second.last_update) {
1104 best = p;
1105 continue;
1106 }
1107 }
1108
1109 // Prefer longer tail
1110 if (p->second.log_tail > best->second.log_tail) {
1111 continue;
1112 } else if (p->second.log_tail < best->second.log_tail) {
1113 best = p;
1114 continue;
1115 }
1116
1117 // prefer current primary (usually the caller), all things being equal
1118 if (p->first == pg_whoami) {
1119 dout(10) << "calc_acting prefer osd." << p->first
1120 << " because it is current primary" << dendl;
1121 best = p;
1122 continue;
1123 }
1124 }
1125 return best;
1126 }
1127
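// Choose the EC acting set shard by shard: prefer the up osd for each
// position, then the current acting osd, then (unless restricted) any
// stray, requiring the candidate to be complete and log-contiguous with the
// auth log shard; up osds that cannot serve their position are marked for
// backfill.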
1128 void PG::calc_ec_acting(
1129 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1130 unsigned size,
1131 const vector<int> &acting,
1132 pg_shard_t acting_primary,
1133 const vector<int> &up,
1134 pg_shard_t up_primary,
1135 const map<pg_shard_t, pg_info_t> &all_info,
1136 bool restrict_to_up_acting,
1137 vector<int> *_want,
1138 set<pg_shard_t> *backfill,
1139 set<pg_shard_t> *acting_backfill,
1140 pg_shard_t *want_primary,
1141 ostream &ss)
1142 {
1143 vector<int> want(size, CRUSH_ITEM_NONE);
1144 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1145 unsigned usable = 0;
1146 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1147 i != all_info.end();
1148 ++i) {
1149 all_info_by_shard[i->first.shard].insert(i->first);
1150 }
1151 for (uint8_t i = 0; i < want.size(); ++i) {
1152 ss << "For position " << (unsigned)i << ": ";
1153 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1154 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1155 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1156 auth_log_shard->second.log_tail) {
1157 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1158 want[i] = up[i];
1159 ++usable;
1160 continue;
1161 }
1162 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1163 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1164 << " and ";
1165 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1166 }
1167
1168 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1169 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1170 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1171 auth_log_shard->second.log_tail) {
1172 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1173 want[i] = acting[i];
1174 ++usable;
1175 } else if (!restrict_to_up_acting) {
1176 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1177 j != all_info_by_shard[shard_id_t(i)].end();
1178 ++j) {
1179 assert(j->shard == i);
1180 if (!all_info.find(*j)->second.is_incomplete() &&
1181 all_info.find(*j)->second.last_update >=
1182 auth_log_shard->second.log_tail) {
1183 ss << " selecting stray: " << *j << std::endl;
1184 want[i] = j->osd;
1185 ++usable;
1186 break;
1187 }
1188 }
1189 if (want[i] == CRUSH_ITEM_NONE)
1190 ss << " failed to fill position " << (int)i << std::endl;
1191 }
1192 }
1193
1194 bool found_primary = false;
1195 for (uint8_t i = 0; i < want.size(); ++i) {
1196 if (want[i] != CRUSH_ITEM_NONE) {
1197 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1198 if (!found_primary) {
1199 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1200 found_primary = true;
1201 }
1202 }
1203 }
1204 acting_backfill->insert(backfill->begin(), backfill->end());
1205 _want->swap(want);
1206 }
1207
1208 /**
1209 * calculate the desired acting set.
1210 *
1211 * Choose an appropriate acting set. Prefer up[0], unless it is
1212 * incomplete, or another osd has a longer tail that allows us to
1213 * bring other up nodes up to date.
1214 */
1215 void PG::calc_replicated_acting(
1216 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1217 unsigned size,
1218 const vector<int> &acting,
1219 pg_shard_t acting_primary,
1220 const vector<int> &up,
1221 pg_shard_t up_primary,
1222 const map<pg_shard_t, pg_info_t> &all_info,
1223 bool restrict_to_up_acting,
1224 vector<int> *want,
1225 set<pg_shard_t> *backfill,
1226 set<pg_shard_t> *acting_backfill,
1227 pg_shard_t *want_primary,
1228 ostream &ss)
1229 {
1230 ss << "calc_acting newest update on osd." << auth_log_shard->first
1231 << " with " << auth_log_shard->second
1232 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1233 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1234
1235 // select primary
1236 map<pg_shard_t,pg_info_t>::const_iterator primary;
1237 if (up.size() &&
1238 !all_info.find(up_primary)->second.is_incomplete() &&
1239 all_info.find(up_primary)->second.last_update >=
1240 auth_log_shard->second.log_tail) {
1241 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1242     primary = all_info.find(up_primary); // prefer up[0], all things being equal
1243 } else {
1244 assert(!auth_log_shard->second.is_incomplete());
1245 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1246 << " selected as primary instead" << std::endl;
1247 primary = auth_log_shard;
1248 }
1249
1250 ss << "calc_acting primary is osd." << primary->first
1251 << " with " << primary->second << std::endl;
1252 *want_primary = primary->first;
1253 want->push_back(primary->first.osd);
1254 acting_backfill->insert(primary->first);
1255 unsigned usable = 1;
1256
1257 // select replicas that have log contiguity with primary.
1258 // prefer up, then acting, then any peer_info osds
1259 for (vector<int>::const_iterator i = up.begin();
1260 i != up.end();
1261 ++i) {
1262 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1263 if (up_cand == primary->first)
1264 continue;
1265 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1266 if (cur_info.is_incomplete() ||
1267 cur_info.last_update < MIN(
1268 primary->second.log_tail,
1269 auth_log_shard->second.log_tail)) {
1270 /* We include auth_log_shard->second.log_tail because in GetLog,
1271 * we will request logs back to the min last_update over our
1272 * acting_backfill set, which will result in our log being extended
1273 * as far backwards as necessary to pick up any peers which can
1274 * be log recovered by auth_log_shard's log */
1275 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1276 backfill->insert(up_cand);
1277 acting_backfill->insert(up_cand);
1278 } else {
1279 want->push_back(*i);
1280 acting_backfill->insert(up_cand);
1281 usable++;
1282 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1283 }
1284 }
1285
1286 // This no longer has backfill OSDs, but they are covered above.
1287 for (vector<int>::const_iterator i = acting.begin();
1288 i != acting.end();
1289 ++i) {
1290 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1291 if (usable >= size)
1292 break;
1293
1294 // skip up osds we already considered above
1295 if (acting_cand == primary->first)
1296 continue;
1297 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1298 if (up_it != up.end())
1299 continue;
1300
1301 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1302 if (cur_info.is_incomplete() ||
1303 cur_info.last_update < primary->second.log_tail) {
1304 ss << " shard " << acting_cand << " (stray) REJECTED "
1305 << cur_info << std::endl;
1306 } else {
1307 want->push_back(*i);
1308 acting_backfill->insert(acting_cand);
1309 ss << " shard " << acting_cand << " (stray) accepted "
1310 << cur_info << std::endl;
1311 usable++;
1312 }
1313 }
1314
1315 if (restrict_to_up_acting) {
1316 return;
1317 }
1318 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1319 i != all_info.end();
1320 ++i) {
1321 if (usable >= size)
1322 break;
1323
1324 // skip up osds we already considered above
1325 if (i->first == primary->first)
1326 continue;
1327 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1328 if (up_it != up.end())
1329 continue;
1330 vector<int>::const_iterator acting_it = find(
1331 acting.begin(), acting.end(), i->first.osd);
1332 if (acting_it != acting.end())
1333 continue;
1334
1335 if (i->second.is_incomplete() ||
1336 i->second.last_update < primary->second.log_tail) {
1337 ss << " shard " << i->first << " (stray) REJECTED "
1338 << i->second << std::endl;
1339 } else {
1340 want->push_back(i->first.osd);
1341 acting_backfill->insert(i->first);
1342 ss << " shard " << i->first << " (stray) accepted "
1343 << i->second << std::endl;
1344 usable++;
1345 }
1346 }
1347 }
1348
1349 /**
1350 * choose acting
1351 *
1352 * calculate the desired acting, and request a change with the monitor
1353 * if it differs from the current acting.
1354 *
1355 * if restrict_to_up_acting=true, we filter out anything that's not in
1356 * up/acting. in order to lift this restriction, we need to
1357 * 1) check whether it's worth switching the acting set any time we get
1358 * a new pg info (not just here, when recovery finishes)
1359 * 2) check whether anything in want_acting went down on each new map
1360 * (and, if so, calculate a new want_acting)
1361 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1362 * TODO!
1363 */
1364 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1365 bool restrict_to_up_acting,
1366 bool *history_les_bound)
1367 {
1368 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1369 all_info[pg_whoami] = info;
1370
1371 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1372 p != all_info.end();
1373 ++p) {
1374 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1375 }
1376
1377 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1378 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1379
1380 if (auth_log_shard == all_info.end()) {
1381 if (up != acting) {
1382 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1383 << " reverting to up" << dendl;
1384 want_acting = up;
1385 vector<int> empty;
1386 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1387 } else {
1388 dout(10) << "choose_acting failed" << dendl;
1389 assert(want_acting.empty());
1390 }
1391 return false;
1392 }
1393
1394 assert(!auth_log_shard->second.is_incomplete());
1395 auth_log_shard_id = auth_log_shard->first;
1396
1397 set<pg_shard_t> want_backfill, want_acting_backfill;
1398 vector<int> want;
1399 pg_shard_t want_primary;
1400 stringstream ss;
1401 if (!pool.info.ec_pool())
1402 calc_replicated_acting(
1403 auth_log_shard,
1404 get_osdmap()->get_pg_size(info.pgid.pgid),
1405 acting,
1406 primary,
1407 up,
1408 up_primary,
1409 all_info,
1410 restrict_to_up_acting,
1411 &want,
1412 &want_backfill,
1413 &want_acting_backfill,
1414 &want_primary,
1415 ss);
1416 else
1417 calc_ec_acting(
1418 auth_log_shard,
1419 get_osdmap()->get_pg_size(info.pgid.pgid),
1420 acting,
1421 primary,
1422 up,
1423 up_primary,
1424 all_info,
1425 restrict_to_up_acting,
1426 &want,
1427 &want_backfill,
1428 &want_acting_backfill,
1429 &want_primary,
1430 ss);
1431 dout(10) << ss.str() << dendl;
1432
1433 unsigned num_want_acting = 0;
1434 set<pg_shard_t> have;
1435 for (int i = 0; i < (int)want.size(); ++i) {
1436 if (want[i] != CRUSH_ITEM_NONE) {
1437 ++num_want_acting;
1438 have.insert(
1439 pg_shard_t(
1440 want[i],
1441 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1442 }
1443 }
1444
1445 // We go incomplete if below min_size for ec_pools since backfill
1446 // does not currently maintain rollbackability
1447 // Otherwise, we will go "peered", but not "active"
1448 if (num_want_acting < pool.info.min_size &&
1449 (pool.info.ec_pool() ||
1450 !cct->_conf->osd_allow_recovery_below_min_size)) {
1451 want_acting.clear();
1452 dout(10) << "choose_acting failed, below min size" << dendl;
1453 return false;
1454 }
1455
1456 /* Check whether we have enough acting shards to later perform recovery */
1457 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1458 get_pgbackend()->get_is_recoverable_predicate());
1459 if (!(*recoverable_predicate)(have)) {
1460 want_acting.clear();
1461 dout(10) << "choose_acting failed, not recoverable" << dendl;
1462 return false;
1463 }
1464
1465 if (want != acting) {
1466 dout(10) << "choose_acting want " << want << " != acting " << acting
1467 << ", requesting pg_temp change" << dendl;
1468 want_acting = want;
1469
1470 if (want_acting == up) {
1471 // There can't be any pending backfill if
1472 // want is the same as crush map up OSDs.
1473 assert(want_backfill.empty());
1474 vector<int> empty;
1475 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1476 } else
1477 osd->queue_want_pg_temp(info.pgid.pgid, want);
1478 return false;
1479 }
1480 want_acting.clear();
1481 actingbackfill = want_acting_backfill;
1482 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1483 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1484 if (backfill_targets.empty()) {
1485 // Caller is GetInfo
1486 backfill_targets = want_backfill;
1487 }
1488 // Will not change if already set because up would have had to change
1489 // Verify that nothing in backfill is in stray_set
1490 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1491 i != want_backfill.end();
1492 ++i) {
1493 assert(stray_set.find(*i) == stray_set.end());
1494 }
1495 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1496 << want_backfill << dendl;
1497 return true;
1498 }
1499
1500 /* Build the might_have_unfound set.
1501 *
1502 * This is used by the primary OSD during recovery.
1503 *
1504 * This set tracks the OSDs which might have unfound objects that the primary
1505 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1506 * will remove the OSD from the set.
1507 */
1508 void PG::build_might_have_unfound()
1509 {
1510 assert(might_have_unfound.empty());
1511 assert(is_primary());
1512
1513 dout(10) << __func__ << dendl;
1514
1515 check_past_interval_bounds();
1516
1517 might_have_unfound = past_intervals.get_might_have_unfound(
1518 pg_whoami,
1519 pool.info.ec_pool());
1520
1521 // include any (stray) peers
1522 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1523 p != peer_info.end();
1524 ++p)
1525 might_have_unfound.insert(p->first);
1526
1527 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1528 }
1529
1530 struct C_PG_ActivateCommitted : public Context {
1531 PGRef pg;
1532 epoch_t epoch;
1533 epoch_t activation_epoch;
1534 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1535 : pg(p), epoch(e), activation_epoch(ae) {}
1536 void finish(int r) override {
1537 pg->_activate_committed(epoch, activation_epoch);
1538 }
1539 };
1540
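// Bring the PG to the active state for this interval: update
// last_epoch_started, register the on-commit callback, and initialize the
// complete pointer; on the primary, also rebuild snap_trimq, send each peer
// either an info, a log tail to catch up from, or a full backfill reset,
// and populate missing_loc, setting DEGRADED/UNDERSIZED as needed.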
1541 void PG::activate(ObjectStore::Transaction& t,
1542 epoch_t activation_epoch,
1543 list<Context*>& tfin,
1544 map<int, map<spg_t,pg_query_t> >& query_map,
1545 map<int,
1546 vector<
1547 pair<pg_notify_t,
1548 PastIntervals> > > *activator_map,
1549 RecoveryCtx *ctx)
1550 {
1551 assert(!is_peered());
1552 assert(scrubber.callbacks.empty());
1553 assert(callbacks_for_degraded_object.empty());
1554
1555 // twiddle pg state
1556 state_clear(PG_STATE_DOWN);
1557
1558 send_notify = false;
1559
1560 if (is_primary()) {
1561 // only update primary last_epoch_started if we will go active
1562 if (acting.size() >= pool.info.min_size) {
1563 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1564 info.last_epoch_started <= activation_epoch);
1565 info.last_epoch_started = activation_epoch;
1566 info.last_interval_started = info.history.same_interval_since;
1567 }
1568 } else if (is_acting(pg_whoami)) {
1569 /* update last_epoch_started on acting replica to whatever the primary sent
1570 * unless it's smaller (could happen if we are going peered rather than
1571 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1572 if (info.last_epoch_started < activation_epoch) {
1573 info.last_epoch_started = activation_epoch;
1574 info.last_interval_started = info.history.same_interval_since;
1575 }
1576 }
1577
1578 auto &missing = pg_log.get_missing();
1579
1580 if (is_primary()) {
1581 last_update_ondisk = info.last_update;
1582 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1583 }
1584 last_update_applied = info.last_update;
1585 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1586
1587 need_up_thru = false;
1588
1589 // write pg info, log
1590 dirty_info = true;
1591 dirty_big_info = true; // maybe
1592
1593 // find out when we commit
1594 t.register_on_complete(
1595 new C_PG_ActivateCommitted(
1596 this,
1597 get_osdmap()->get_epoch(),
1598 activation_epoch));
1599
1600 // initialize snap_trimq
1601 if (is_primary()) {
1602 dout(20) << "activate - purged_snaps " << info.purged_snaps
1603 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1604 snap_trimq = pool.cached_removed_snaps;
1605 interval_set<snapid_t> intersection;
1606 intersection.intersection_of(snap_trimq, info.purged_snaps);
1607 if (intersection == info.purged_snaps) {
1608 snap_trimq.subtract(info.purged_snaps);
1609 } else {
1610 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1611 << ") is not a subset of pool.cached_removed_snaps ("
1612 << pool.cached_removed_snaps << ")" << dendl;
1613 snap_trimq.subtract(intersection);
1614 }
1615 }
1616
1617 // init complete pointer
1618 if (missing.num_missing() == 0) {
1619 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1620 << " -> " << info.last_update << dendl;
1621 info.last_complete = info.last_update;
1622 pg_log.reset_recovery_pointers();
1623 } else {
1624 dout(10) << "activate - not complete, " << missing << dendl;
1625 pg_log.activate_not_complete(info);
1626 }
1627
1628 log_weirdness();
1629
1630 // if primary..
1631 if (is_primary()) {
1632 assert(ctx);
1633 // start up replicas
1634
1635 assert(!actingbackfill.empty());
1636 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1637 i != actingbackfill.end();
1638 ++i) {
1639 if (*i == pg_whoami) continue;
1640 pg_shard_t peer = *i;
1641 assert(peer_info.count(peer));
1642 pg_info_t& pi = peer_info[peer];
1643
1644 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1645
1646 MOSDPGLog *m = 0;
1647 pg_missing_t& pm = peer_missing[peer];
1648
1649 bool needs_past_intervals = pi.dne();
1650
1651 /*
1652 * cover case where peer sort order was different and
1653 * last_backfill cannot be interpreted
1654 */
1655 bool force_restart_backfill =
1656 !pi.last_backfill.is_max() &&
1657 !pi.last_backfill_bitwise;
1658
1659 if (pi.last_update == info.last_update && !force_restart_backfill) {
1660 // empty log
1661 if (!pi.last_backfill.is_max())
1662 osd->clog->info() << info.pgid << " continuing backfill to osd."
1663 << peer
1664 << " from (" << pi.log_tail << "," << pi.last_update
1665 << "] " << pi.last_backfill
1666 << " to " << info.last_update;
1667 if (!pi.is_empty() && activator_map) {
1668 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1669 (*activator_map)[peer.osd].push_back(
1670 make_pair(
1671 pg_notify_t(
1672 peer.shard, pg_whoami.shard,
1673 get_osdmap()->get_epoch(),
1674 get_osdmap()->get_epoch(),
1675 info),
1676 past_intervals));
1677 } else {
1678 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1679 m = new MOSDPGLog(
1680 i->shard, pg_whoami.shard,
1681 get_osdmap()->get_epoch(), info);
1682 }
1683 } else if (
1684 pg_log.get_tail() > pi.last_update ||
1685 pi.last_backfill == hobject_t() ||
1686 force_restart_backfill ||
1687 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1688 /* ^ This last case covers a situation where a replica is not contiguous
1689 * with the auth_log, but is contiguous with this replica. Reshuffling
1690 * the active set to handle this would be tricky, so instead we just go
1691        * ahead and backfill it anyway. This is probably preferable in any
1692 * case since the replica in question would have to be significantly
1693 * behind.
1694 */
1695 // backfill
1696 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1697 << " from (" << pi.log_tail << "," << pi.last_update
1698 << "] " << pi.last_backfill
1699 << " to " << info.last_update;
1700
1701 pi.last_update = info.last_update;
1702 pi.last_complete = info.last_update;
1703 pi.set_last_backfill(hobject_t());
1704 pi.last_epoch_started = info.last_epoch_started;
1705 pi.last_interval_started = info.last_interval_started;
1706 pi.history = info.history;
1707 pi.hit_set = info.hit_set;
1708 pi.stats.stats.clear();
1709
1710 // initialize peer with our purged_snaps.
1711 pi.purged_snaps = info.purged_snaps;
1712
1713 m = new MOSDPGLog(
1714 i->shard, pg_whoami.shard,
1715 get_osdmap()->get_epoch(), pi);
1716
1717 // send some recent log, so that op dup detection works well.
1718 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1719 m->info.log_tail = m->log.tail;
1720 pi.log_tail = m->log.tail; // sigh...
1721
1722 pm.clear();
1723 } else {
1724 // catch up
1725 assert(pg_log.get_tail() <= pi.last_update);
1726 m = new MOSDPGLog(
1727 i->shard, pg_whoami.shard,
1728 get_osdmap()->get_epoch(), info);
1729 // send new stuff to append to replicas log
1730 m->log.copy_after(pg_log.get_log(), pi.last_update);
1731 }
1732
1733       // share past_intervals if we are creating the pg on the replica,
1734       // i.e., if our info for that peer was dne() *before* updating
1735       // pi.history in the backfill block above.
1736 if (m && needs_past_intervals)
1737 m->past_intervals = past_intervals;
1738
1739 // update local version of peer's missing list!
1740 if (m && pi.last_backfill != hobject_t()) {
1741 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1742 p != m->log.log.end();
1743 ++p) {
1744 if (p->soid <= pi.last_backfill &&
1745 !p->is_error()) {
1746 if (perform_deletes_during_peering() && p->is_delete()) {
1747 pm.rm(p->soid, p->version);
1748 } else {
1749 pm.add_next_event(*p);
1750 }
1751 }
1752 }
1753 }
1754
1755 if (m) {
1756 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1757 //m->log.print(cout);
1758 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1759 }
1760
1761 // peer now has the log up to info.last_update
1762 pi.last_update = info.last_update;
1763
1764 // update our missing
1765 if (pm.num_missing() == 0) {
1766 pi.last_complete = pi.last_update;
1767 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1768 } else {
1769 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1770 }
1771 }
1772
1773 // Set up missing_loc
1774 set<pg_shard_t> complete_shards;
1775 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1776 i != actingbackfill.end();
1777 ++i) {
1778 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1779 if (*i == get_primary()) {
1780 missing_loc.add_active_missing(missing);
1781 if (!missing.have_missing())
1782 complete_shards.insert(*i);
1783 } else {
1784 auto peer_missing_entry = peer_missing.find(*i);
1785 assert(peer_missing_entry != peer_missing.end());
1786 missing_loc.add_active_missing(peer_missing_entry->second);
1787 if (!peer_missing_entry->second.have_missing() &&
1788 peer_info[*i].last_backfill.is_max())
1789 complete_shards.insert(*i);
1790 }
1791 }
1792 // If necessary, create might_have_unfound to help us find our unfound objects.
1793 // NOTE: It's important that we build might_have_unfound before trimming the
1794 // past intervals.
1795 might_have_unfound.clear();
1796 if (needs_recovery()) {
1797 // If only one shard has missing objects, we add all others as recovery
1798 // sources; this is considered safe since the PGLogs have been merged locally,
1799 // and it covers the vast majority of use cases, like one OSD/host being down
1800 // for a while for hardware repair
1801 if (complete_shards.size() + 1 == actingbackfill.size()) {
1802 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1803 } else {
1804 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1805 ctx->handle);
1806 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1807 i != actingbackfill.end();
1808 ++i) {
1809 if (*i == pg_whoami) continue;
1810 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1811 assert(peer_missing.count(*i));
1812 assert(peer_info.count(*i));
1813 missing_loc.add_source_info(
1814 *i,
1815 peer_info[*i],
1816 peer_missing[*i],
1817 ctx->handle);
1818 }
1819 }
1820 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1821 i != peer_missing.end();
1822 ++i) {
1823 if (is_actingbackfill(i->first))
1824 continue;
1825 assert(peer_info.count(i->first));
1826 search_for_missing(
1827 peer_info[i->first],
1828 i->second,
1829 i->first,
1830 ctx);
1831 }
1832
1833 build_might_have_unfound();
1834
1835 state_set(PG_STATE_DEGRADED);
1836 if (have_unfound())
1837 discover_all_missing(query_map);
1838 }
1839
1840 // degraded?
1841 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1842 state_set(PG_STATE_DEGRADED);
1843 state_set(PG_STATE_UNDERSIZED);
1844 }
1845
1846 state_set(PG_STATE_ACTIVATING);
1847 release_pg_backoffs();
1848 projected_last_update = info.last_update;
1849 }
1850 if (acting.size() >= pool.info.min_size) {
1851 PGLogEntryHandler handler{this, &t};
1852 pg_log.roll_forward(&handler);
1853 }
1854 }
1855
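// Verify that the client's session caps allow this op on this pool/namespace/object;
// only MOSDOp messages are checked -- everything else is allowed through.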
1856 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1857 {
1858 // only check MOSDOp
1859 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1860 return true;
1861
1862 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1863
1864 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1865 if (!session) {
1866 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1867 return false;
1868 }
1869 OSDCap& caps = session->caps;
1870 session->put();
1871
1872 const string &key = req->get_hobj().get_key().empty() ?
1873 req->get_oid().name :
1874 req->get_hobj().get_key();
1875
1876 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1877 pool.auid, key,
1878 op->need_read_cap(),
1879 op->need_write_cap(),
1880 op->classes());
1881
1882 dout(20) << "op_has_sufficient_caps "
1883 << "session=" << session
1884 << " pool=" << pool.id << " (" << pool.name
1885 << " " << req->get_hobj().nspace
1886 << ") owner=" << pool.auid
1887 << " need_read_cap=" << op->need_read_cap()
1888 << " need_write_cap=" << op->need_write_cap()
1889 << " classes=" << op->classes()
1890 << " -> " << (cap ? "yes" : "NO")
1891 << dendl;
1892 return cap;
1893 }
1894
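// Completion callback for the activation transaction: the primary records its own
// activation (and fires all_activated_and_committed() once every shard has reported),
// while replicas notify the primary and go ACTIVE or PEERED depending on min_size.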
1895 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1896 {
1897 lock();
1898 if (pg_has_reset_since(epoch)) {
1899 dout(10) << "_activate_committed " << epoch
1900 << ", that was an old interval" << dendl;
1901 } else if (is_primary()) {
1902 peer_activated.insert(pg_whoami);
1903 dout(10) << "_activate_committed " << epoch
1904 << " peer_activated now " << peer_activated
1905 << " last_interval_started " << info.history.last_interval_started
1906 << " last_epoch_started " << info.history.last_epoch_started
1907 << " same_interval_since " << info.history.same_interval_since << dendl;
1908 assert(!actingbackfill.empty());
1909 if (peer_activated.size() == actingbackfill.size())
1910 all_activated_and_committed();
1911 } else {
1912 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1913 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1914 pg_notify_t i = pg_notify_t(
1915 get_primary().shard, pg_whoami.shard,
1916 get_osdmap()->get_epoch(),
1917 get_osdmap()->get_epoch(),
1918 info);
1919
1920 i.info.history.last_epoch_started = activation_epoch;
1921 i.info.history.last_interval_started = i.info.history.same_interval_since;
1922 if (acting.size() >= pool.info.min_size) {
1923 state_set(PG_STATE_ACTIVE);
1924 } else {
1925 state_set(PG_STATE_PEERED);
1926 }
1927
1928 m->pg_list.push_back(make_pair(i, PastIntervals()));
1929 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1930
1931 // waiters
1932 if (flushes_in_progress == 0) {
1933 requeue_ops(waiting_for_peered);
1934 }
1935 }
1936
1937 assert(!dirty_info);
1938
1939 unlock();
1940 }
1941
1942 /*
1943 * update info.history.last_epoch_started ONLY after we and all
1944 * replicas have activated AND committed the activate transaction
1945 * (i.e. the peering results are stable on disk).
1946 */
1947 void PG::all_activated_and_committed()
1948 {
1949 dout(10) << "all_activated_and_committed" << dendl;
1950 assert(is_primary());
1951 assert(peer_activated.size() == actingbackfill.size());
1952 assert(!actingbackfill.empty());
1953 assert(blocked_by.empty());
1954
1955 queue_peering_event(
1956 CephPeeringEvtRef(
1957 std::make_shared<CephPeeringEvt>(
1958 get_osdmap()->get_epoch(),
1959 get_osdmap()->get_epoch(),
1960 AllReplicasActivated())));
1961 }
1962
1963 bool PG::requeue_scrub(bool high_priority)
1964 {
1965 assert(is_locked());
1966 if (scrub_queued) {
1967 dout(10) << __func__ << ": already queued" << dendl;
1968 return false;
1969 } else {
1970 dout(10) << __func__ << ": queueing" << dendl;
1971 scrub_queued = true;
1972 osd->queue_for_scrub(this, high_priority);
1973 return true;
1974 }
1975 }
1976
1977 void PG::queue_recovery()
1978 {
1979 if (!is_primary() || !is_peered()) {
1980 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1981 assert(!recovery_queued);
1982 } else if (recovery_queued) {
1983 dout(10) << "queue_recovery -- already queued" << dendl;
1984 } else {
1985 dout(10) << "queue_recovery -- queuing" << dendl;
1986 recovery_queued = true;
1987 osd->queue_for_recovery(this);
1988 }
1989 }
1990
1991 bool PG::queue_scrub()
1992 {
1993 assert(is_locked());
1994 if (is_scrubbing()) {
1995 return false;
1996 }
1997 scrubber.priority = scrubber.must_scrub ?
1998 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
1999 scrubber.must_scrub = false;
2000 state_set(PG_STATE_SCRUBBING);
2001 if (scrubber.must_deep_scrub) {
2002 state_set(PG_STATE_DEEP_SCRUB);
2003 scrubber.must_deep_scrub = false;
2004 }
2005 if (scrubber.must_repair || scrubber.auto_repair) {
2006 state_set(PG_STATE_REPAIR);
2007 scrubber.must_repair = false;
2008 }
2009 requeue_scrub();
2010 return true;
2011 }
2012
2013 unsigned PG::get_scrub_priority()
2014 {
2015 // a higher value -> a higher priority
2016 int pool_scrub_priority = 0;
2017 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2018 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2019 }
2020
2021 struct C_PG_FinishRecovery : public Context {
2022 PGRef pg;
2023 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2024 void finish(int r) override {
2025 pg->_finish_recovery(this);
2026 }
2027 };
2028
2029 void PG::mark_clean()
2030 {
2031 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2032 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2033 state_set(PG_STATE_CLEAN);
2034 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2035 info.history.last_interval_clean = info.history.same_interval_since;
2036 past_intervals.clear();
2037 dirty_big_info = true;
2038 dirty_info = true;
2039 }
2040
2041 kick_snap_trim();
2042 }
2043
2044 void PG::change_recovery_force_mode(int new_mode, bool clear)
2045 {
2046 lock(true);
2047 if (clear) {
2048 state_clear(new_mode);
2049 } else {
2050 state_set(new_mode);
2051 }
2052 publish_stats_to_osd();
2053
2054 unlock();
2055 }
2056
2057 inline int PG::clamp_recovery_priority(int priority)
2058 {
2059 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2060 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2061
2062 // Clamp to valid range
2063 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2064 return OSD_RECOVERY_PRIORITY_MAX;
2065 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2066 return OSD_RECOVERY_PRIORITY_MIN;
2067 } else {
2068 return priority;
2069 }
2070 }
2071
2072 unsigned PG::get_recovery_priority()
2073 {
2074 // a higher value -> a higher priority
2075 int ret = 0;
2076
2077 if (state & PG_STATE_FORCED_RECOVERY) {
2078 ret = OSD_RECOVERY_PRIORITY_FORCED;
2079 } else {
2080 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2081 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2082 }
2083 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2084 return static_cast<unsigned>(ret);
2085 }
2086
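// Backfill priority tiers: forced > inactive (below min_size) > undersized/degraded
// > base, adjusted by the pool's recovery priority and clamped to the valid range.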
2087 unsigned PG::get_backfill_priority()
2088 {
2089 // a higher value -> a higher priority
2090 int ret = OSD_BACKFILL_PRIORITY_BASE;
2091 if (state & PG_STATE_FORCED_BACKFILL) {
2092 ret = OSD_RECOVERY_PRIORITY_FORCED;
2093 } else {
2094 if (acting.size() < pool.info.min_size) {
2095 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2096 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2097
2098 } else if (is_undersized()) {
2099 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2100 assert(pool.info.size > actingset.size());
2101 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2102
2103 } else if (is_degraded()) {
2104 // degraded: baseline degraded
2105 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2106 }
2107
2108 // Adjust with pool's recovery priority
2109 int pool_recovery_priority = 0;
2110 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2111
2112 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2113 }
2114
2115 return static_cast<unsigned>(ret);
2116 }
2117
2118 void PG::finish_recovery(list<Context*>& tfin)
2119 {
2120 dout(10) << "finish_recovery" << dendl;
2121 assert(info.last_complete == info.last_update);
2122
2123 clear_recovery_state();
2124
2125 /*
2126 * sync all this before purging strays. but don't block!
2127 */
2128 finish_sync_event = new C_PG_FinishRecovery(this);
2129 tfin.push_back(finish_sync_event);
2130 }
2131
2132 void PG::_finish_recovery(Context *c)
2133 {
2134 lock();
2135 if (deleting) {
2136 unlock();
2137 return;
2138 }
2139 if (c == finish_sync_event) {
2140 dout(10) << "_finish_recovery" << dendl;
2141 finish_sync_event = 0;
2142 purge_strays();
2143
2144 publish_stats_to_osd();
2145
2146 if (scrub_after_recovery) {
2147 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2148 scrub_after_recovery = false;
2149 scrubber.must_deep_scrub = true;
2150 queue_scrub();
2151 }
2152 } else {
2153 dout(10) << "_finish_recovery -- stale" << dendl;
2154 }
2155 unlock();
2156 }
2157
2158 void PG::start_recovery_op(const hobject_t& soid)
2159 {
2160 dout(10) << "start_recovery_op " << soid
2161 #ifdef DEBUG_RECOVERY_OIDS
2162 << " (" << recovering_oids << ")"
2163 #endif
2164 << dendl;
2165 assert(recovery_ops_active >= 0);
2166 recovery_ops_active++;
2167 #ifdef DEBUG_RECOVERY_OIDS
2168 assert(recovering_oids.count(soid) == 0);
2169 recovering_oids.insert(soid);
2170 #endif
2171 osd->start_recovery_op(this, soid);
2172 }
2173
2174 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2175 {
2176 dout(10) << "finish_recovery_op " << soid
2177 #ifdef DEBUG_RECOVERY_OIDS
2178 << " (" << recovering_oids << ")"
2179 #endif
2180 << dendl;
2181 assert(recovery_ops_active > 0);
2182 recovery_ops_active--;
2183 #ifdef DEBUG_RECOVERY_OIDS
2184 assert(recovering_oids.count(soid));
2185 recovering_oids.erase(soid);
2186 #endif
2187 osd->finish_recovery_op(this, soid, dequeue);
2188
2189 if (!dequeue) {
2190 queue_recovery();
2191 }
2192 }
2193
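// Populate a child PG during a split: divide the log, copy info/history/stats
// (marking stats invalid), recompute up/acting from the current OSDMap, and mark
// both parent and child dirty so the new state is persisted.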
2194 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2195 {
2196 child->update_snap_mapper_bits(split_bits);
2197 child->update_osdmap_ref(get_osdmap());
2198
2199 child->pool = pool;
2200
2201 // Log
2202 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2203 child->info.last_complete = info.last_complete;
2204
2205 info.last_update = pg_log.get_head();
2206 child->info.last_update = child->pg_log.get_head();
2207
2208 child->info.last_user_version = info.last_user_version;
2209
2210 info.log_tail = pg_log.get_tail();
2211 child->info.log_tail = child->pg_log.get_tail();
2212
2213 if (info.last_complete < pg_log.get_tail())
2214 info.last_complete = pg_log.get_tail();
2215 if (child->info.last_complete < child->pg_log.get_tail())
2216 child->info.last_complete = child->pg_log.get_tail();
2217
2218 // Info
2219 child->info.history = info.history;
2220 child->info.history.epoch_created = get_osdmap()->get_epoch();
2221 child->info.purged_snaps = info.purged_snaps;
2222
2223 if (info.last_backfill.is_max()) {
2224 child->info.set_last_backfill(hobject_t::get_max());
2225 } else {
2226 // restart backfill on parent and child to be safe. we could
2227 // probably do better in the bitwise sort case, but it's more
2228 // fragile (there may be special work to do on backfill completion
2229 // in the future).
2230 info.set_last_backfill(hobject_t());
2231 child->info.set_last_backfill(hobject_t());
2232 }
2233
2234 child->info.stats = info.stats;
2235 child->info.stats.parent_split_bits = split_bits;
2236 info.stats.stats_invalid = true;
2237 child->info.stats.stats_invalid = true;
2238 child->info.last_epoch_started = info.last_epoch_started;
2239 child->info.last_interval_started = info.last_interval_started;
2240
2241 child->snap_trimq = snap_trimq;
2242
2243 // There can't be recovery/backfill going on now
2244 int primary, up_primary;
2245 vector<int> newup, newacting;
2246 get_osdmap()->pg_to_up_acting_osds(
2247 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2248 child->init_primary_up_acting(
2249 newup,
2250 newacting,
2251 up_primary,
2252 primary);
2253 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2254
2255 // this comparison includes primary rank via pg_shard_t
2256 if (get_primary() != child->get_primary())
2257 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2258
2259 child->info.stats.up = up;
2260 child->info.stats.up_primary = up_primary;
2261 child->info.stats.acting = acting;
2262 child->info.stats.acting_primary = primary;
2263 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2264
2265 // History
2266 child->past_intervals = past_intervals;
2267
2268 _split_into(child_pgid, child, split_bits);
2269
2270 // release all backoffs for simplicity
2271 release_backoffs(hobject_t(), hobject_t::get_max());
2272
2273 child->on_new_interval();
2274
2275 child->dirty_info = true;
2276 child->dirty_big_info = true;
2277 dirty_info = true;
2278 dirty_big_info = true;
2279 }
2280
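// Register a backoff on [begin,end) for this session and tell the client to block;
// a pre-existing backoff for the same start object is a fatal error.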
2281 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2282 {
2283 ConnectionRef con = s->con;
2284 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2285 return;
2286 BackoffRef b(s->have_backoff(info.pgid, begin));
2287 if (b) {
2288 derr << __func__ << " already have backoff for " << s << " begin " << begin
2289 << " " << *b << dendl;
2290 ceph_abort();
2291 }
2292 Mutex::Locker l(backoff_lock);
2293 {
2294 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2295 backoffs[begin].insert(b);
2296 s->add_backoff(b);
2297 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2298 }
2299 con->send_message(
2300 new MOSDBackoff(
2301 info.pgid,
2302 get_osdmap()->get_epoch(),
2303 CEPH_OSD_BACKOFF_OP_BLOCK,
2304 b->id,
2305 begin,
2306 end));
2307 }
2308
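// Drop the backoffs that fall within [begin,end) and send UNBLOCK messages to the
// owning sessions (when their connections are still open).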
2309 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2310 {
2311 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2312 vector<BackoffRef> bv;
2313 {
2314 Mutex::Locker l(backoff_lock);
2315 auto p = backoffs.lower_bound(begin);
2316 while (p != backoffs.end()) {
2317 int r = cmp(p->first, end);
2318 dout(20) << __func__ << " ? " << r << " " << p->first
2319 << " " << p->second << dendl;
2320 // note: must still examine begin=end=p->first case
2321 if (r > 0 || (r == 0 && begin < end)) {
2322 break;
2323 }
2324 dout(20) << __func__ << " checking " << p->first
2325 << " " << p->second << dendl;
2326 auto q = p->second.begin();
2327 while (q != p->second.end()) {
2328 dout(20) << __func__ << " checking " << *q << dendl;
2329 int r = cmp((*q)->begin, begin);
2330 if (r == 0 || (r > 0 && (*q)->end < end)) {
2331 bv.push_back(*q);
2332 q = p->second.erase(q);
2333 } else {
2334 ++q;
2335 }
2336 }
2337 if (p->second.empty()) {
2338 p = backoffs.erase(p);
2339 } else {
2340 ++p;
2341 }
2342 }
2343 }
2344 for (auto b : bv) {
2345 Mutex::Locker l(b->lock);
2346 dout(10) << __func__ << " " << *b << dendl;
2347 if (b->session) {
2348 assert(b->pg == this);
2349 ConnectionRef con = b->session->con;
2350 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2351 con->send_message(
2352 new MOSDBackoff(
2353 info.pgid,
2354 get_osdmap()->get_epoch(),
2355 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2356 b->id,
2357 b->begin,
2358 b->end));
2359 }
2360 if (b->is_new()) {
2361 b->state = Backoff::STATE_DELETING;
2362 } else {
2363 b->session->rm_backoff(b);
2364 b->session.reset();
2365 }
2366 b->pg.reset();
2367 }
2368 }
2369 }
2370
2371 void PG::clear_backoffs()
2372 {
2373 dout(10) << __func__ << " " << dendl;
2374 map<hobject_t,set<BackoffRef>> ls;
2375 {
2376 Mutex::Locker l(backoff_lock);
2377 ls.swap(backoffs);
2378 }
2379 for (auto& p : ls) {
2380 for (auto& b : p.second) {
2381 Mutex::Locker l(b->lock);
2382 dout(10) << __func__ << " " << *b << dendl;
2383 if (b->session) {
2384 assert(b->pg == this);
2385 if (b->is_new()) {
2386 b->state = Backoff::STATE_DELETING;
2387 } else {
2388 b->session->rm_backoff(b);
2389 b->session.reset();
2390 }
2391 b->pg.reset();
2392 }
2393 }
2394 }
2395 }
2396
2397 // called by Session::clear_backoffs()
2398 void PG::rm_backoff(BackoffRef b)
2399 {
2400 dout(10) << __func__ << " " << *b << dendl;
2401 Mutex::Locker l(backoff_lock);
2402 assert(b->lock.is_locked_by_me());
2403 assert(b->pg == this);
2404 auto p = backoffs.find(b->begin);
2405 // may race with release_backoffs()
2406 if (p != backoffs.end()) {
2407 auto q = p->second.find(b);
2408 if (q != p->second.end()) {
2409 p->second.erase(q);
2410 if (p->second.empty()) {
2411 backoffs.erase(p);
2412 }
2413 }
2414 }
2415 }
2416
2417 void PG::clear_recovery_state()
2418 {
2419 dout(10) << "clear_recovery_state" << dendl;
2420
2421 pg_log.reset_recovery_pointers();
2422 finish_sync_event = 0;
2423
2424 hobject_t soid;
2425 while (recovery_ops_active > 0) {
2426 #ifdef DEBUG_RECOVERY_OIDS
2427 soid = *recovering_oids.begin();
2428 #endif
2429 finish_recovery_op(soid, true);
2430 }
2431
2432 backfill_targets.clear();
2433 backfill_info.clear();
2434 peer_backfill_info.clear();
2435 waiting_on_backfill.clear();
2436 _clear_recovery_state(); // pg impl specific hook
2437 }
2438
2439 void PG::cancel_recovery()
2440 {
2441 dout(10) << "cancel_recovery" << dendl;
2442 clear_recovery_state();
2443 }
2444
2445
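// Ask up-but-stray OSDs to remove their copy of this PG and forget their peer state;
// down strays are skipped.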
2446 void PG::purge_strays()
2447 {
2448 dout(10) << "purge_strays " << stray_set << dendl;
2449
2450 bool removed = false;
2451 for (set<pg_shard_t>::iterator p = stray_set.begin();
2452 p != stray_set.end();
2453 ++p) {
2454 assert(!is_actingbackfill(*p));
2455 if (get_osdmap()->is_up(p->osd)) {
2456 dout(10) << "sending PGRemove to osd." << *p << dendl;
2457 vector<spg_t> to_remove;
2458 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2459 MOSDPGRemove *m = new MOSDPGRemove(
2460 get_osdmap()->get_epoch(),
2461 to_remove);
2462 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2463 } else {
2464 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2465 }
2466 peer_missing.erase(*p);
2467 peer_info.erase(*p);
2468 peer_purged.insert(*p);
2469 removed = true;
2470 }
2471
2472 // if we removed anyone, update peers (which includes peer_info)
2473 if (removed)
2474 update_heartbeat_peers();
2475
2476 stray_set.clear();
2477
2478 // clear _requested maps; we may have to peer() again if we discover
2479 // (more) stray content
2480 peer_log_requested.clear();
2481 peer_missing_requested.clear();
2482 }
2483
2484 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2485 {
2486 Mutex::Locker l(heartbeat_peer_lock);
2487 probe_targets.clear();
2488 for (set<pg_shard_t>::iterator i = probe_set.begin();
2489 i != probe_set.end();
2490 ++i) {
2491 probe_targets.insert(i->osd);
2492 }
2493 }
2494
2495 void PG::clear_probe_targets()
2496 {
2497 Mutex::Locker l(heartbeat_peer_lock);
2498 probe_targets.clear();
2499 }
2500
2501 void PG::update_heartbeat_peers()
2502 {
2503 assert(is_locked());
2504
2505 if (!is_primary())
2506 return;
2507
2508 set<int> new_peers;
2509 for (unsigned i=0; i<acting.size(); i++) {
2510 if (acting[i] != CRUSH_ITEM_NONE)
2511 new_peers.insert(acting[i]);
2512 }
2513 for (unsigned i=0; i<up.size(); i++) {
2514 if (up[i] != CRUSH_ITEM_NONE)
2515 new_peers.insert(up[i]);
2516 }
2517 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2518 p != peer_info.end();
2519 ++p)
2520 new_peers.insert(p->first.osd);
2521
2522 bool need_update = false;
2523 heartbeat_peer_lock.Lock();
2524 if (new_peers == heartbeat_peers) {
2525 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2526 } else {
2527 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2528 heartbeat_peers.swap(new_peers);
2529 need_update = true;
2530 }
2531 heartbeat_peer_lock.Unlock();
2532
2533 if (need_update)
2534 osd->need_heartbeat_peer_update();
2535 }
2536
2537
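// Dup-op lookup: check the projected log first, then the persisted log, filling in
// version/user_version/return_code when the reqid is found.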
2538 bool PG::check_in_progress_op(
2539 const osd_reqid_t &r,
2540 eversion_t *version,
2541 version_t *user_version,
2542 int *return_code) const
2543 {
2544 return (
2545 projected_log.get_request(r, version, user_version, return_code) ||
2546 pg_log.get_log().get_request(r, version, user_version, return_code));
2547 }
2548
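// Recompute derived stat fields (log bounds/size, object copies, and the degraded,
// misplaced and unfound counts) from the current peer info and missing sets.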
2549 void PG::_update_calc_stats()
2550 {
2551 info.stats.version = info.last_update;
2552 info.stats.created = info.history.epoch_created;
2553 info.stats.last_scrub = info.history.last_scrub;
2554 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2555 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2556 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2557 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2558 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2559
2560 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2561 info.stats.ondisk_log_size = info.stats.log_size;
2562 info.stats.log_start = pg_log.get_tail();
2563 info.stats.ondisk_log_start = pg_log.get_tail();
2564
2565 // If actingset is larger than upset we will have misplaced,
2566 // so we will report based on actingset size.
2567
2568 // If upset is larger we will have degraded,
2569 // so we will report based on upset size.
2570
2571 // If target is the largest of them all, it will contribute to
2572 // the degraded count because num_object_copies is
2573 // computed using target and eventually used to get the degraded total.
2574
2575 unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
2576 unsigned nrep = MAX(actingset.size(), upset.size());
2577 // calc num_object_copies
2578 info.stats.stats.calc_copies(MAX(target, nrep));
2579 info.stats.stats.sum.num_objects_degraded = 0;
2580 info.stats.stats.sum.num_objects_unfound = 0;
2581 info.stats.stats.sum.num_objects_misplaced = 0;
2582 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2583 // NOTE: we only generate copies, degraded, misplaced and unfound
2584 // values for the summation, not individual stat categories.
2585 int64_t num_objects = info.stats.stats.sum.num_objects;
2586
2587 // Total sum of all missing
2588 int64_t missing = 0;
2589 // Objects that have arrived backfilled to up OSDs (not in acting)
2590 int64_t backfilled = 0;
2591 // A misplaced object is not stored on the correct OSD
2592 int64_t misplaced = 0;
2593 // Total of object copies/shards found
2594 int64_t object_copies = 0;
2595
2596 // num_objects_missing on each peer
2597 for (map<pg_shard_t, pg_info_t>::iterator pi =
2598 peer_info.begin();
2599 pi != peer_info.end();
2600 ++pi) {
2601 map<pg_shard_t, pg_missing_t>::const_iterator pm =
2602 peer_missing.find(pi->first);
2603 if (pm != peer_missing.end()) {
2604 pi->second.stats.stats.sum.num_objects_missing =
2605 pm->second.num_missing();
2606 }
2607 }
2608
2609 assert(!actingbackfill.empty());
2610 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
2611 i != actingbackfill.end();
2612 ++i) {
2613 const pg_shard_t &p = *i;
2614
2615 bool in_up = (upset.find(p) != upset.end());
2616 bool in_acting = (actingset.find(p) != actingset.end());
2617 assert(in_up || in_acting);
2618
2619 // in acting                  Compute total objects excluding num_missing
2620 // in acting and not in up    Compute misplaced objects excluding num_missing
2621 // in up and not in acting    Compute total objects already backfilled
2622 if (in_acting) {
2623 unsigned osd_missing;
2624 // primary handling
2625 if (p == pg_whoami) {
2626 osd_missing = pg_log.get_missing().num_missing();
2627 info.stats.stats.sum.num_objects_missing_on_primary =
2628 osd_missing;
2629 object_copies += num_objects; // My local (primary) count
2630 } else {
2631 assert(peer_missing.count(p));
2632 osd_missing = peer_missing[p].num_missing();
2633 object_copies += peer_info[p].stats.stats.sum.num_objects;
2634 }
2635 missing += osd_missing;
2636 // Count non-missing objects not in up as misplaced
2637 if (!in_up && num_objects > osd_missing)
2638 misplaced += num_objects - osd_missing;
2639 } else {
2640 assert(in_up && !in_acting);
2641
2642 // If this peer has more objects than it should, ignore them
2643 backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
2644 }
2645 }
2646
2647 // Any objects that have been backfilled to up OSDs can be deducted from misplaced
2648 misplaced = MAX(0, misplaced - backfilled);
2649
2650 // Deduct computed total missing on acting nodes
2651 object_copies -= missing;
2652 // Include computed backfilled objects on up nodes
2653 object_copies += backfilled;
2654 // a degraded object has fewer replicas or EC shards than the
2655 // pool specifies. num_object_copies will never be smaller than target * num_copies.
2656 int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
2657
2658 info.stats.stats.sum.num_objects_degraded = degraded;
2659 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2660 info.stats.stats.sum.num_objects_misplaced = misplaced;
2661 }
2662 }
2663
2664 void PG::_update_blocked_by()
2665 {
2666 // set a max on the number of blocking peers we report. if we go
2667 // over, report a random subset. keep the result sorted.
2668 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2669 unsigned skip = blocked_by.size() - keep;
2670 info.stats.blocked_by.clear();
2671 info.stats.blocked_by.resize(keep);
2672 unsigned pos = 0;
2673 for (set<int>::iterator p = blocked_by.begin();
2674 p != blocked_by.end() && keep > 0;
2675 ++p) {
2676 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2677 --skip;
2678 } else {
2679 info.stats.blocked_by[pos++] = *p;
2680 --keep;
2681 }
2682 }
2683 }
2684
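// Primary-only: refresh PG state flags and timestamps, recompute stats, and queue a
// pg_stat report for the mon when still required (pre-luminous; otherwise mgr handles it).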
2685 void PG::publish_stats_to_osd()
2686 {
2687 if (!is_primary())
2688 return;
2689
2690 pg_stats_publish_lock.Lock();
2691
2692 if (info.stats.stats.sum.num_scrub_errors)
2693 state_set(PG_STATE_INCONSISTENT);
2694 else
2695 state_clear(PG_STATE_INCONSISTENT);
2696
2697 utime_t now = ceph_clock_now();
2698 if (info.stats.state != state) {
2699 info.stats.last_change = now;
2700 // Optimistic estimation: if we just found out a PG is inactive,
2701 // assume it was active until now.
2702 if (!(state & PG_STATE_ACTIVE) &&
2703 (info.stats.state & PG_STATE_ACTIVE))
2704 info.stats.last_active = now;
2705
2706 if ((state & PG_STATE_ACTIVE) &&
2707 !(info.stats.state & PG_STATE_ACTIVE))
2708 info.stats.last_became_active = now;
2709 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2710 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2711 info.stats.last_became_peered = now;
2712 if (!(state & PG_STATE_CREATING) &&
2713 (info.stats.state & PG_STATE_CREATING)) {
2714 osd->send_pg_created(get_pgid().pgid);
2715 }
2716 info.stats.state = state;
2717 }
2718
2719 _update_calc_stats();
2720 _update_blocked_by();
2721
2722 bool publish = false;
2723 pg_stat_t pre_publish = info.stats;
2724 pre_publish.stats.add(unstable_stats);
2725 utime_t cutoff = now;
2726 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2727 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2728 info.stats.last_fresh > cutoff) {
2729 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2730 << ": no change since " << info.stats.last_fresh << dendl;
2731 } else {
2732 // update our stat summary and timestamps
2733 info.stats.reported_epoch = get_osdmap()->get_epoch();
2734 ++info.stats.reported_seq;
2735
2736 info.stats.last_fresh = now;
2737
2738 if (info.stats.state & PG_STATE_CLEAN)
2739 info.stats.last_clean = now;
2740 if (info.stats.state & PG_STATE_ACTIVE)
2741 info.stats.last_active = now;
2742 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2743 info.stats.last_peered = now;
2744 info.stats.last_unstale = now;
2745 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2746 info.stats.last_undegraded = now;
2747 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2748 info.stats.last_fullsized = now;
2749
2750 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2751 // care of this by sending MMonMgrReport to mon.
2752 publish =
2753 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2754 pg_stats_publish_valid = true;
2755 pg_stats_publish = pre_publish;
2756
2757 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2758 << ":" << pg_stats_publish.reported_seq << dendl;
2759 }
2760 pg_stats_publish_lock.Unlock();
2761
2762 if (publish)
2763 osd->pg_stat_queue_enqueue(this);
2764 }
2765
2766 void PG::clear_publish_stats()
2767 {
2768 dout(15) << "clear_stats" << dendl;
2769 pg_stats_publish_lock.Lock();
2770 pg_stats_publish_valid = false;
2771 pg_stats_publish_lock.Unlock();
2772
2773 osd->pg_stat_queue_dequeue(this);
2774 }
2775
2776 /**
2777 * initialize a newly instantiated pg
2778 *
2779 * Initialize PG state, as when a PG is initially created, or when it
2780 * is first instantiated on the current node.
2781 *
2782 * @param role our role/rank
2783 * @param newup up set
2784 * @param newacting acting set
2785 * @param history pg history
2786 * @param pi past_intervals
2787 * @param backfill true if info should be marked as backfill
2788 * @param t transaction to write out our new state in
2789 */
2790 void PG::init(
2791 int role,
2792 const vector<int>& newup, int new_up_primary,
2793 const vector<int>& newacting, int new_acting_primary,
2794 const pg_history_t& history,
2795 const PastIntervals& pi,
2796 bool backfill,
2797 ObjectStore::Transaction *t)
2798 {
2799 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2800 << " history " << history
2801 << " past_intervals " << pi
2802 << dendl;
2803
2804 set_role(role);
2805 acting = newacting;
2806 up = newup;
2807 init_primary_up_acting(
2808 newup,
2809 newacting,
2810 new_up_primary,
2811 new_acting_primary);
2812
2813 info.history = history;
2814 past_intervals = pi;
2815
2816 info.stats.up = up;
2817 info.stats.up_primary = new_up_primary;
2818 info.stats.acting = acting;
2819 info.stats.acting_primary = new_acting_primary;
2820 info.stats.mapping_epoch = info.history.same_interval_since;
2821
2822 if (backfill) {
2823 dout(10) << __func__ << ": Setting backfill" << dendl;
2824 info.set_last_backfill(hobject_t());
2825 info.last_complete = info.last_update;
2826 pg_log.mark_log_for_rewrite();
2827 }
2828
2829 on_new_interval();
2830
2831 dirty_info = true;
2832 dirty_big_info = true;
2833 write_if_dirty(*t);
2834 }
2835
2836 #pragma GCC diagnostic ignored "-Wpragmas"
2837 #pragma GCC diagnostic push
2838 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2839
2840 void PG::upgrade(ObjectStore *store)
2841 {
2842 assert(info_struct_v <= 10);
2843 ObjectStore::Transaction t;
2844
2845 assert(info_struct_v >= 7);
2846
2847 // 7 -> 8
2848 if (info_struct_v <= 7) {
2849 pg_log.mark_log_for_rewrite();
2850 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2851 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2852 t.remove(coll_t::meta(), log_oid);
2853 t.remove(coll_t::meta(), biginfo_oid);
2854 t.touch(coll, pgmeta_oid);
2855 }
2856
2857 // 8 -> 9
2858 if (info_struct_v <= 8) {
2859 // no special action needed.
2860 }
2861
2862 // 9 -> 10
2863 if (info_struct_v <= 9) {
2864 // previous versions weren't (as) aggressively clearing past_intervals
2865 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2866 dout(20) << __func__ << " clearing past_intervals" << dendl;
2867 past_intervals.clear();
2868 }
2869 }
2870
2871 // update infover_key
2872 if (info_struct_v < cur_struct_v) {
2873 map<string,bufferlist> v;
2874 __u8 ver = cur_struct_v;
2875 ::encode(ver, v[infover_key]);
2876 t.omap_setkeys(coll, pgmeta_oid, v);
2877 }
2878
2879 dirty_info = true;
2880 dirty_big_info = true;
2881 write_if_dirty(t);
2882
2883 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2884 ObjectStore::Sequencer>("upgrade"));
2885 int r = store->apply_transaction(osr.get(), std::move(t));
2886 if (r != 0) {
2887 derr << __func__ << ": apply_transaction returned "
2888 << cpp_strerror(r) << dendl;
2889 ceph_abort();
2890 }
2891 assert(r == 0);
2892
2893 C_SaferCond waiter;
2894 if (!osr->flush_commit(&waiter)) {
2895 waiter.wait();
2896 }
2897 }
2898
2899 #pragma GCC diagnostic pop
2900 #pragma GCC diagnostic warning "-Wpragmas"
2901
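// Encode the pg info for persistence: write the epoch key when dirty, try the compact
// fastinfo delta when only the small fast fields changed, otherwise fall back to the
// full info (plus past_intervals/purged_snaps in biginfo when dirty_big_info is set).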
2902 int PG::_prepare_write_info(CephContext* cct,
2903 map<string,bufferlist> *km,
2904 epoch_t epoch,
2905 pg_info_t &info, pg_info_t &last_written_info,
2906 PastIntervals &past_intervals,
2907 bool dirty_big_info,
2908 bool dirty_epoch,
2909 bool try_fast_info,
2910 PerfCounters *logger)
2911 {
2912 if (dirty_epoch) {
2913 ::encode(epoch, (*km)[epoch_key]);
2914 }
2915
2916 if (logger)
2917 logger->inc(l_osd_pg_info);
2918
2919 // try to do info efficiently?
2920 if (!dirty_big_info && try_fast_info &&
2921 info.last_update > last_written_info.last_update) {
2922 pg_fast_info_t fast;
2923 fast.populate_from(info);
2924 bool did = fast.try_apply_to(&last_written_info);
2925 assert(did); // we verified last_update increased above
2926 if (info == last_written_info) {
2927 ::encode(fast, (*km)[fastinfo_key]);
2928 if (logger)
2929 logger->inc(l_osd_pg_fastinfo);
2930 return 0;
2931 }
2932 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
2933 {
2934 JSONFormatter jf(true);
2935 jf.dump_object("info", info);
2936 jf.flush(*_dout);
2937 }
2938 {
2939 *_dout << "\nlast_written_info:\n";
2940 JSONFormatter jf(true);
2941 jf.dump_object("last_written_info", last_written_info);
2942 jf.flush(*_dout);
2943 }
2944 *_dout << dendl;
2945 }
2946 last_written_info = info;
2947
2948 // info. store purged_snaps separately.
2949 interval_set<snapid_t> purged_snaps;
2950 purged_snaps.swap(info.purged_snaps);
2951 ::encode(info, (*km)[info_key]);
2952 purged_snaps.swap(info.purged_snaps);
2953
2954 if (dirty_big_info) {
2955 // potentially big stuff
2956 bufferlist& bigbl = (*km)[biginfo_key];
2957 ::encode(past_intervals, bigbl);
2958 ::encode(info.purged_snaps, bigbl);
2959 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2960 if (logger)
2961 logger->inc(l_osd_pg_biginfo);
2962 }
2963
2964 return 0;
2965 }
2966
2967 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
2968 {
2969 coll_t coll(pgid);
2970 t.create_collection(coll, bits);
2971 }
2972
2973 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
2974 {
2975 coll_t coll(pgid);
2976
2977 if (pool) {
2978 // Give a hint to the PG collection
2979 bufferlist hint;
2980 uint32_t pg_num = pool->get_pg_num();
2981 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
2982 ::encode(pg_num, hint);
2983 ::encode(expected_num_objects_pg, hint);
2984 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
2985 t.collection_hint(coll, hint_type, hint);
2986 }
2987
2988 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2989 t.touch(coll, pgmeta_oid);
2990 map<string,bufferlist> values;
2991 __u8 struct_v = cur_struct_v;
2992 ::encode(struct_v, values[infover_key]);
2993 t.omap_setkeys(coll, pgmeta_oid, values);
2994 }
2995
2996 void PG::prepare_write_info(map<string,bufferlist> *km)
2997 {
2998 info.stats.stats.add(unstable_stats);
2999 unstable_stats.clear();
3000
3001 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3002 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3003 info,
3004 last_written_info,
3005 past_intervals,
3006 dirty_big_info, need_update_epoch,
3007 cct->_conf->osd_fast_info,
3008 osd->logger);
3009 assert(ret == 0);
3010 if (need_update_epoch)
3011 last_epoch = get_osdmap()->get_epoch();
3012 last_persisted_osdmap_ref = osdmap_ref;
3013
3014 dirty_info = false;
3015 dirty_big_info = false;
3016 }
3017
3018 #pragma GCC diagnostic ignored "-Wpragmas"
3019 #pragma GCC diagnostic push
3020 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3021
3022 bool PG::_has_removal_flag(ObjectStore *store,
3023 spg_t pgid)
3024 {
3025 coll_t coll(pgid);
3026 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3027
3028 // first try new way
3029 set<string> keys;
3030 keys.insert("_remove");
3031 map<string,bufferlist> values;
3032 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3033 values.size() == 1)
3034 return true;
3035
3036 return false;
3037 }
3038
3039 int PG::peek_map_epoch(ObjectStore *store,
3040 spg_t pgid,
3041 epoch_t *pepoch,
3042 bufferlist *bl)
3043 {
3044 coll_t coll(pgid);
3045 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3046 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3047 epoch_t cur_epoch = 0;
3048
3049 assert(bl);
3050 {
3051 // validate collection name
3052 assert(coll.is_pg());
3053 }
3054
3055 // try for v8
3056 set<string> keys;
3057 keys.insert(infover_key);
3058 keys.insert(epoch_key);
3059 map<string,bufferlist> values;
3060 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3061 if (r == 0) {
3062 assert(values.size() == 2);
3063
3064 // sanity check version
3065 bufferlist::iterator bp = values[infover_key].begin();
3066 __u8 struct_v = 0;
3067 ::decode(struct_v, bp);
3068 assert(struct_v >= 8);
3069
3070 // get epoch
3071 bp = values[epoch_key].begin();
3072 ::decode(cur_epoch, bp);
3073 } else {
3074 // probably bug 10617; see OSD::load_pgs()
3075 return -1;
3076 }
3077
3078 *pepoch = cur_epoch;
3079 return 0;
3080 }
3081
3082 #pragma GCC diagnostic pop
3083 #pragma GCC diagnostic warning "-Wpragmas"
3084
3085 void PG::write_if_dirty(ObjectStore::Transaction& t)
3086 {
3087 map<string,bufferlist> km;
3088 if (dirty_big_info || dirty_info)
3089 prepare_write_info(&km);
3090 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3091 if (!km.empty())
3092 t.omap_setkeys(coll, pgmeta_oid, km);
3093 }
3094
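// Primary-only: compute the new trim point, tell all other acting/backfill shards to
// trim their logs to it, then trim the local log.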
3095 void PG::trim_log()
3096 {
3097 assert(is_primary());
3098 calc_trim_to();
3099 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3100 if (pg_trim_to != eversion_t()) {
3101 // inform peers to trim log
3102 assert(!actingbackfill.empty());
3103 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3104 i != actingbackfill.end();
3105 ++i) {
3106 if (*i == pg_whoami) continue;
3107 osd->send_message_osd_cluster(
3108 i->osd,
3109 new MOSDPGTrim(
3110 get_osdmap()->get_epoch(),
3111 spg_t(info.pgid.pgid, i->shard),
3112 pg_trim_to),
3113 get_osdmap()->get_epoch());
3114 }
3115
3116 // trim primary as well
3117 pg_log.trim(pg_trim_to, info);
3118 dirty_info = true;
3119 }
3120 }
3121
3122 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3123 {
3124 // raise last_complete only if we were previously up to date
3125 if (info.last_complete == info.last_update)
3126 info.last_complete = e.version;
3127
3128 // raise last_update.
3129 assert(e.version > info.last_update);
3130 info.last_update = e.version;
3131
3132 // raise user_version, if it increased (it may not have been bumped
3133 // by all logged updates)
3134 if (e.user_version > info.last_user_version)
3135 info.last_user_version = e.user_version;
3136
3137 // log mutation
3138 pg_log.add(e, applied);
3139 dout(10) << "add_log_entry " << e << dendl;
3140 }
3141
3142
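// Append replicated log entries from the current op: update the snap mapper, roll the
// log forward where required (non-applied/backfill cases), trim to trim_to, and persist
// the dirty info and log via the transaction.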
3143 void PG::append_log(
3144 const vector<pg_log_entry_t>& logv,
3145 eversion_t trim_to,
3146 eversion_t roll_forward_to,
3147 ObjectStore::Transaction &t,
3148 bool transaction_applied)
3149 {
3150 if (transaction_applied)
3151 update_snap_map(logv, t);
3152
3153 /* The primary has sent an info updating the history, but it may not
3154 * have arrived yet. We want to make sure that we cannot remember this
3155 * write without remembering that it happened in an interval which went
3156 * active in epoch history.last_epoch_started.
3157 */
3158 if (info.last_epoch_started != info.history.last_epoch_started) {
3159 info.history.last_epoch_started = info.last_epoch_started;
3160 }
3161 if (info.last_interval_started != info.history.last_interval_started) {
3162 info.history.last_interval_started = info.last_interval_started;
3163 }
3164 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3165
3166 PGLogEntryHandler handler{this, &t};
3167 if (!transaction_applied) {
3168 /* We must be a backfill peer, so it's ok if we apply
3169 * out-of-turn since we won't be considered when
3170 * determining a min possible last_update.
3171 */
3172 pg_log.roll_forward(&handler);
3173 }
3174
3175 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3176 p != logv.end();
3177 ++p) {
3178 add_log_entry(*p, transaction_applied);
3179
3180 /* We don't want to leave the rollforward artifacts around
3181 * here past last_backfill. It's ok for the same reason as
3182 * above */
3183 if (transaction_applied &&
3184 p->soid > info.last_backfill) {
3185 pg_log.roll_forward(&handler);
3186 }
3187 }
3188 auto last = logv.rbegin();
3189 if (is_primary() && last != logv.rend()) {
3190 projected_log.skip_can_rollback_to_to_head();
3191 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3192 }
3193
3194 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3195 pg_log.roll_forward_to(
3196 roll_forward_to,
3197 &handler);
3198 t.register_on_applied(
3199 new C_UpdateLastRollbackInfoTrimmedToApplied(
3200 this,
3201 get_osdmap()->get_epoch(),
3202 roll_forward_to));
3203 }
3204
3205 pg_log.trim(trim_to, info);
3206
3207 // update the local pg, pg log
3208 dirty_info = true;
3209 write_if_dirty(t);
3210 }
3211
3212 bool PG::check_log_for_corruption(ObjectStore *store)
3213 {
3214 /// TODO: this method needs to work with the omap log
3215 return true;
3216 }
3217
3218 //! Get the name we're going to save our corrupt pg log as
3219 std::string PG::get_corrupt_pg_log_name() const
3220 {
3221 const int MAX_BUF = 512;
3222 char buf[MAX_BUF];
3223 struct tm tm_buf;
3224 time_t my_time(time(NULL));
3225 const struct tm *t = localtime_r(&my_time, &tm_buf);
3226 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3227 if (ret == 0) {
3228 dout(0) << "strftime failed" << dendl;
3229 return "corrupt_log_unknown_time";
3230 }
3231 string out(buf);
3232 out += stringify(info.pgid);
3233 return out;
3234 }
3235
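// Load pg_info and past_intervals: prefer the pgmeta omap keys (struct_v >= 8, with an
// optional fastinfo overlay), falling back to the legacy infos object for struct_v 7.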
3236 int PG::read_info(
3237 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3238 pg_info_t &info, PastIntervals &past_intervals,
3239 __u8 &struct_v)
3240 {
3241 // try for v8 or later
3242 set<string> keys;
3243 keys.insert(infover_key);
3244 keys.insert(info_key);
3245 keys.insert(biginfo_key);
3246 keys.insert(fastinfo_key);
3247 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3248 map<string,bufferlist> values;
3249 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3250 if (r == 0) {
3251 assert(values.size() == 3 ||
3252 values.size() == 4);
3253
3254 bufferlist::iterator p = values[infover_key].begin();
3255 ::decode(struct_v, p);
3256 assert(struct_v >= 8);
3257
3258 p = values[info_key].begin();
3259 ::decode(info, p);
3260
3261 p = values[biginfo_key].begin();
3262 if (struct_v >= 10) {
3263 ::decode(past_intervals, p);
3264 } else {
3265 past_intervals.decode_classic(p);
3266 }
3267 ::decode(info.purged_snaps, p);
3268
3269 p = values[fastinfo_key].begin();
3270 if (!p.end()) {
3271 pg_fast_info_t fast;
3272 ::decode(fast, p);
3273 fast.try_apply_to(&info);
3274 }
3275 return 0;
3276 }
3277
3278 // legacy (ver < 8)
3279 ghobject_t infos_oid(OSD::make_infos_oid());
3280 bufferlist::iterator p = bl.begin();
3281 ::decode(struct_v, p);
3282 assert(struct_v == 7);
3283
3284 // get info out of leveldb
3285 string k = get_info_key(info.pgid);
3286 string bk = get_biginfo_key(info.pgid);
3287 keys.clear();
3288 keys.insert(k);
3289 keys.insert(bk);
3290 values.clear();
3291 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3292 assert(values.size() == 2);
3293
3294 p = values[k].begin();
3295 ::decode(info, p);
3296
3297 p = values[bk].begin();
3298 ::decode(past_intervals, p);
3299 interval_set<snapid_t> snap_collections; // obsolete
3300 ::decode(snap_collections, p);
3301 ::decode(info.purged_snaps, p);
3302 return 0;
3303 }
3304
3305 void PG::read_state(ObjectStore *store, bufferlist &bl)
3306 {
3307 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3308 info_struct_v);
3309 assert(r >= 0);
3310
3311 last_written_info = info;
3312
3313 ostringstream oss;
3314 pg_log.read_log_and_missing(
3315 store,
3316 coll,
3317 info_struct_v < 8 ? coll_t::meta() : coll,
3318 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3319 info,
3320 oss,
3321 cct->_conf->osd_ignore_stale_divergent_priors,
3322 cct->_conf->osd_debug_verify_missing_on_start);
3323 if (oss.tellp())
3324 osd->clog->error() << oss.rdbuf();
3325
3326 // log any weirdness
3327 log_weirdness();
3328 }
3329
3330 void PG::log_weirdness()
3331 {
3332 if (pg_log.get_tail() != info.log_tail)
3333 osd->clog->error() << info.pgid
3334 << " info mismatch, log.tail " << pg_log.get_tail()
3335 << " != info.log_tail " << info.log_tail;
3336 if (pg_log.get_head() != info.last_update)
3337 osd->clog->error() << info.pgid
3338 << " info mismatch, log.head " << pg_log.get_head()
3339 << " != info.last_update " << info.last_update;
3340
3341 if (!pg_log.get_log().empty()) {
3342 // sloppy check
3343 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3344 osd->clog->error() << info.pgid
3345 << " log bound mismatch, info (tail,head] ("
3346 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3347 << " actual ["
3348 << pg_log.get_log().log.begin()->version << ","
3349 << pg_log.get_log().log.rbegin()->version << "]";
3350 }
3351
3352 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3353 osd->clog->error() << info.pgid
3354 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3355 << " > log size " << pg_log.get_log().log.size();
3356 }
3357 }
3358
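// Keep the snap mapper consistent with the log: remove mappings for deletes, add them
// for clones/promotes, and update them for modifies.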
3359 void PG::update_snap_map(
3360 const vector<pg_log_entry_t> &log_entries,
3361 ObjectStore::Transaction &t)
3362 {
3363 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3364 i != log_entries.end();
3365 ++i) {
3366 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3367 if (i->soid.snap < CEPH_MAXSNAP) {
3368 if (i->is_delete()) {
3369 int r = snap_mapper.remove_oid(
3370 i->soid,
3371 &_t);
3372 assert(r == 0);
3373 } else if (i->is_update()) {
3374 assert(i->snaps.length() > 0);
3375 vector<snapid_t> snaps;
3376 bufferlist snapbl = i->snaps;
3377 bufferlist::iterator p = snapbl.begin();
3378 try {
3379 ::decode(snaps, p);
3380 } catch (...) {
3381 snaps.clear();
3382 }
3383 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3384
3385 if (i->is_clone() || i->is_promote()) {
3386 snap_mapper.add_oid(
3387 i->soid,
3388 _snaps,
3389 &_t);
3390 } else if (i->is_modify()) {
3391 assert(i->is_modify());
3392 int r = snap_mapper.update_snaps(
3393 i->soid,
3394 _snaps,
3395 0,
3396 &_t);
3397 assert(r == 0);
3398 } else {
3399 assert(i->is_clean());
3400 }
3401 }
3402 }
3403 }
3404 }
3405
3406 /**
3407 * filter trimming|trimmed snaps out of snapcontext
3408 */
3409 void PG::filter_snapc(vector<snapid_t> &snaps)
3410 {
3411 // nothing needs trimming, we can return immediately
3412 if(snap_trimq.empty() && info.purged_snaps.empty())
3413 return;
3414
3415 bool filtering = false;
3416 vector<snapid_t> newsnaps;
3417 for (vector<snapid_t>::iterator p = snaps.begin();
3418 p != snaps.end();
3419 ++p) {
3420 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3421 if (!filtering) {
3422 // start building a new vector with what we've seen so far
3423 dout(10) << "filter_snapc filtering " << snaps << dendl;
3424 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3425 filtering = true;
3426 }
3427 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3428 } else {
3429 if (filtering)
3430 newsnaps.push_back(*p); // continue building new vector
3431 }
3432 }
3433 if (filtering) {
3434 snaps.swap(newsnaps);
3435 dout(10) << "filter_snapc result " << snaps << dendl;
3436 }
3437 }
3438
3439 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3440 {
3441 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3442 it != m.end();
3443 ++it)
3444 requeue_ops(it->second);
3445 m.clear();
3446 }
3447
3448 void PG::requeue_op(OpRequestRef op)
3449 {
3450 auto p = waiting_for_map.find(op->get_source());
3451 if (p != waiting_for_map.end()) {
3452 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3453 << dendl;
3454 p->second.push_front(op);
3455 } else {
3456 dout(20) << __func__ << " " << op << dendl;
3457 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3458 }
3459 }
3460
3461 void PG::requeue_ops(list<OpRequestRef> &ls)
3462 {
3463 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3464 i != ls.rend();
3465 ++i) {
3466 auto p = waiting_for_map.find((*i)->get_source());
3467 if (p != waiting_for_map.end()) {
3468 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3469 << ")" << dendl;
3470 p->second.push_front(*i);
3471 } else {
3472 dout(20) << __func__ << " " << *i << dendl;
3473 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3474 }
3475 }
3476 ls.clear();
3477 }
3478
3479 void PG::requeue_map_waiters()
3480 {
3481 epoch_t epoch = get_osdmap()->get_epoch();
3482 auto p = waiting_for_map.begin();
3483 while (p != waiting_for_map.end()) {
3484 if (epoch < p->second.front()->min_epoch) {
3485 dout(20) << __func__ << " " << p->first << " front op "
3486 << p->second.front() << " must still wait, doing nothing"
3487 << dendl;
3488 ++p;
3489 } else {
3490 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3491 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3492 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3493 }
3494 p = waiting_for_map.erase(p);
3495 }
3496 }
3497 }
3498
3499
3500 // ==========================================================================================
3501 // SCRUB
3502
3503 /*
3504 * when holding pg and sched_scrub_lock, then the states are:
3505 * scheduling:
3506 * scrubber.reserved = true
3507 * scrubber.reserved_peers includes whoami
3508 * osd->scrub_pending++
3509 * scheduling, replica declined:
3510 * scrubber.reserved = true
3511 * scrubber.reserved_peers includes -1
3512 * osd->scrub_pending++
3513 * pending:
3514 * scrubber.reserved = true
3515 * scrubber.reserved_peers.size() == acting.size();
3516 * pg on scrub_wq
3517 * osd->scrub_pending++
3518 * scrubbing:
3519 * scrubber.reserved = false;
3520 * scrubber.reserved_peers empty
3521 * osd->scrubber.active++
3522 */
3523
3524 // returns true if a scrub has been newly kicked off
3525 bool PG::sched_scrub()
3526 {
3527 bool nodeep_scrub = false;
3528 assert(is_locked());
3529 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3530 return false;
3531 }
3532
3533 double deep_scrub_interval = 0;
3534 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3535 if (deep_scrub_interval <= 0) {
3536 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3537 }
3538 bool time_for_deep = ceph_clock_now() >=
3539 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3540
3541 bool deep_coin_flip = false;
3542 // Only add random deep scrubs when NOT a user-initiated scrub
3543 if (!scrubber.must_scrub)
3544 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3545 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3546
3547 time_for_deep = (time_for_deep || deep_coin_flip);
3548
3549 //NODEEP_SCRUB so ignore time-initiated deep-scrub
3550 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3551 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3552 time_for_deep = false;
3553 nodeep_scrub = true;
3554 }
3555
3556 if (!scrubber.must_scrub) {
3557 assert(!scrubber.must_deep_scrub);
3558
3559 //NOSCRUB so skip regular scrubs
3560 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3561 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3562 if (scrubber.reserved) {
3563 // cancel the scrub if it is still being scheduled,
3564 // so pgs from other pools where scrubs are still legal
3565 // have a chance to go ahead with scrubbing.
3566 clear_scrub_reserved();
3567 scrub_unreserve_replicas();
3568 }
3569 return false;
3570 }
3571 }
3572
3573 if (cct->_conf->osd_scrub_auto_repair
3574 && get_pgbackend()->auto_repair_supported()
3575 && time_for_deep
3576 // respect the command from the user, and do not do auto-repair
3577 && !scrubber.must_repair
3578 && !scrubber.must_scrub
3579 && !scrubber.must_deep_scrub) {
3580 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3581 scrubber.auto_repair = true;
3582 } else {
3583 // this happens when the user issues a scrub/repair command during
3584 // the scheduling of the scrub/repair (e.g. while requesting reservations)
3585 scrubber.auto_repair = false;
3586 }
3587
3588 bool ret = true;
3589 if (!scrubber.reserved) {
3590 assert(scrubber.reserved_peers.empty());
3591 if (osd->inc_scrubs_pending()) {
3592 dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
3593 scrubber.reserved = true;
3594 scrubber.reserved_peers.insert(pg_whoami);
3595 scrub_reserve_replicas();
3596 } else {
3597 dout(20) << "sched_scrub: failed to reserve locally" << dendl;
3598 ret = false;
3599 }
3600 }
3601 if (scrubber.reserved) {
3602 if (scrubber.reserve_failed) {
3603 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3604 clear_scrub_reserved();
3605 scrub_unreserve_replicas();
3606 ret = false;
3607 } else if (scrubber.reserved_peers.size() == acting.size()) {
3608 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3609 if (time_for_deep) {
3610 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3611 state_set(PG_STATE_DEEP_SCRUB);
3612 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3613 if (!nodeep_scrub) {
3614 osd->clog->info() << "osd." << osd->whoami
3615 << " pg " << info.pgid
3616 << " Deep scrub errors, upgrading scrub to deep-scrub";
3617 state_set(PG_STATE_DEEP_SCRUB);
3618 } else if (!scrubber.must_scrub) {
3619 osd->clog->error() << "osd." << osd->whoami
3620 << " pg " << info.pgid
3621 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3622 clear_scrub_reserved();
3623 scrub_unreserve_replicas();
3624 return false;
3625 } else {
3626 osd->clog->error() << "osd." << osd->whoami
3627 << " pg " << info.pgid
3628 << " Regular scrub request, deep-scrub details will be lost";
3629 }
3630 }
3631 queue_scrub();
3632 } else {
3633 // no replica has declined (reserve_failed is not set); still waiting for the remaining grants
3634 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3635 }
3636 }
3637
3638 return ret;
3639 }
3640
3641 void PG::reg_next_scrub()
3642 {
3643 if (!is_primary())
3644 return;
3645
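// a user-requested scrub (or one forced by invalid stats) is scheduled for "now";
// otherwise scheduling is keyed off the last scrub stamp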
3646 utime_t reg_stamp;
3647 if (scrubber.must_scrub ||
3648 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3649 reg_stamp = ceph_clock_now();
3650 } else {
3651 reg_stamp = info.history.last_scrub_stamp;
3652 }
3653 // note down the sched_time, so we can locate this scrub, and remove it
3654 // later on.
3655 double scrub_min_interval = 0, scrub_max_interval = 0;
3656 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3657 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3658 assert(scrubber.scrub_reg_stamp == utime_t());
3659 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3660 reg_stamp,
3661 scrub_min_interval,
3662 scrub_max_interval,
3663 scrubber.must_scrub);
3664 }
3665
3666 void PG::unreg_next_scrub()
3667 {
3668 if (is_primary()) {
3669 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3670 scrubber.scrub_reg_stamp = utime_t();
3671 }
3672 }
3673
3674 void PG::do_replica_scrub_map(OpRequestRef op)
3675 {
3676 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3677 dout(7) << __func__ << " " << *m << dendl;
3678 if (m->map_epoch < info.history.same_interval_since) {
3679 dout(10) << __func__ << " discarding old from "
3680 << m->map_epoch << " < " << info.history.same_interval_since
3681 << dendl;
3682 return;
3683 }
3684 if (!scrubber.is_chunky_scrub_active()) {
3685 dout(10) << __func__ << " scrub isn't active" << dendl;
3686 return;
3687 }
3688
3689 op->mark_started();
3690
3691 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3692 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3693 dout(10) << "map version is "
3694 << scrubber.received_maps[m->from].valid_through
3695 << dendl;
3696
3697 --scrubber.waiting_on;
3698 scrubber.waiting_on_whom.erase(m->from);
3699 if (scrubber.waiting_on == 0) {
3700 if (ops_blocked_by_scrub()) {
3701 requeue_scrub(true);
3702 } else {
3703 requeue_scrub(false);
3704 }
3705 }
3706 }
3707
3708 void PG::sub_op_scrub_map(OpRequestRef op)
3709 {
3710 // for legacy jewel compatibility only
3711 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3712 assert(m->get_type() == MSG_OSD_SUBOP);
3713 dout(7) << "sub_op_scrub_map" << dendl;
3714
3715 if (m->map_epoch < info.history.same_interval_since) {
3716 dout(10) << "sub_op_scrub discarding old sub_op from "
3717 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3718 return;
3719 }
3720
3721 if (!scrubber.is_chunky_scrub_active()) {
3722 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3723 return;
3724 }
3725
3726 op->mark_started();
3727
3728 dout(10) << " got " << m->from << " scrub map" << dendl;
3729 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3730
3731 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3732 dout(10) << "map version is "
3733 << scrubber.received_maps[m->from].valid_through
3734 << dendl;
3735
3736 --scrubber.waiting_on;
3737 scrubber.waiting_on_whom.erase(m->from);
3738
3739 if (scrubber.waiting_on == 0) {
3740 if (ops_blocked_by_scrub()) {
3741 requeue_scrub(true);
3742 } else {
3743 requeue_scrub(false);
3744 }
3745 }
3746 }
3747
3748 // send scrub v3 messages (chunky scrub)
3749 void PG::_request_scrub_map(
3750 pg_shard_t replica, eversion_t version,
3751 hobject_t start, hobject_t end,
3752 bool deep, uint32_t seed)
3753 {
3754 assert(replica != pg_whoami);
3755 dout(10) << "scrub requesting scrubmap from osd." << replica
3756 << " deep " << (int)deep << " seed " << seed << dendl;
3757 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3758 spg_t(info.pgid.pgid, replica.shard), version,
3759 get_osdmap()->get_epoch(),
3760 get_last_peering_reset(),
3761 start, end, deep, seed);
3762 // default priority, we want the rep scrub processed prior to any recovery
3763 // or client io messages (we are holding a lock!)
3764 osd->send_message_osd_cluster(
3765 replica.osd, repscrubop, get_osdmap()->get_epoch());
3766 }
3767
3768 void PG::handle_scrub_reserve_request(OpRequestRef op)
3769 {
3770 dout(7) << __func__ << " " << *op->get_req() << dendl;
3771 op->mark_started();
3772 if (scrubber.reserved) {
3773 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3774 << dendl;
3775 return;
3776 }
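// try to take a local scrub reservation slot on this replica; the result is
// reported back to the primary as GRANT or REJECT (or encoded in the reply
// for pre-luminous peers below)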
3777 scrubber.reserved = osd->inc_scrubs_pending();
3778 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3779 const MOSDScrubReserve *m =
3780 static_cast<const MOSDScrubReserve*>(op->get_req());
3781 Message *reply = new MOSDScrubReserve(
3782 spg_t(info.pgid.pgid, primary.shard),
3783 m->map_epoch,
3784 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3785 pg_whoami);
3786 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3787 } else {
3788 // for jewel compat only
3789 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3790 assert(req->get_type() == MSG_OSD_SUBOP);
3791 MOSDSubOpReply *reply = new MOSDSubOpReply(
3792 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3793 ::encode(scrubber.reserved, reply->get_data());
3794 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3795 }
3796 }
3797
3798 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3799 {
3800 dout(7) << __func__ << " " << *op->get_req() << dendl;
3801 op->mark_started();
3802 if (!scrubber.reserved) {
3803 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3804 return;
3805 }
3806 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3807 dout(10) << " already had osd." << from << " reserved" << dendl;
3808 } else {
3809 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3810 scrubber.reserved_peers.insert(from);
3811 sched_scrub();
3812 }
3813 }
3814
3815 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3816 {
3817 dout(7) << __func__ << " " << *op->get_req() << dendl;
3818 op->mark_started();
3819 if (!scrubber.reserved) {
3820 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3821 return;
3822 }
3823 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3824 dout(10) << " already had osd." << from << " reserved" << dendl;
3825 } else {
3826 /* One decline stops this pg from being scheduled for scrubbing. */
3827 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3828 scrubber.reserve_failed = true;
3829 sched_scrub();
3830 }
3831 }
3832
3833 void PG::handle_scrub_reserve_release(OpRequestRef op)
3834 {
3835 dout(7) << __func__ << " " << *op->get_req() << dendl;
3836 op->mark_started();
3837 clear_scrub_reserved();
3838 }
3839
3840 void PG::reject_reservation()
3841 {
3842 osd->send_message_osd_cluster(
3843 primary.osd,
3844 new MBackfillReserve(
3845 MBackfillReserve::REJECT,
3846 spg_t(info.pgid.pgid, primary.shard),
3847 get_osdmap()->get_epoch()),
3848 get_osdmap()->get_epoch());
3849 }
3850
3851 void PG::schedule_backfill_full_retry()
3852 {
3853 Mutex::Locker lock(osd->recovery_request_lock);
3854 osd->recovery_request_timer.add_event_after(
3855 cct->_conf->osd_backfill_retry_interval,
3856 new QueuePeeringEvt<RequestBackfill>(
3857 this, get_osdmap()->get_epoch(),
3858 RequestBackfill()));
3859 }
3860
3861 void PG::schedule_recovery_full_retry()
3862 {
3863 Mutex::Locker lock(osd->recovery_request_lock);
3864 osd->recovery_request_timer.add_event_after(
3865 cct->_conf->osd_recovery_retry_interval,
3866 new QueuePeeringEvt<DoRecovery>(
3867 this, get_osdmap()->get_epoch(),
3868 DoRecovery()));
3869 }
3870
3871 void PG::clear_scrub_reserved()
3872 {
3873 scrubber.reserved_peers.clear();
3874 scrubber.reserve_failed = false;
3875
3876 if (scrubber.reserved) {
3877 scrubber.reserved = false;
3878 osd->dec_scrubs_pending();
3879 }
3880 }
3881
3882 void PG::scrub_reserve_replicas()
3883 {
3884 assert(backfill_targets.empty());
3885 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3886 i != actingbackfill.end();
3887 ++i) {
3888 if (*i == pg_whoami) continue;
3889 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3890 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3891 osd->send_message_osd_cluster(
3892 i->osd,
3893 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3894 get_osdmap()->get_epoch(),
3895 MOSDScrubReserve::REQUEST, pg_whoami),
3896 get_osdmap()->get_epoch());
3897 } else {
3898 // for jewel compat only
3899 vector<OSDOp> scrub(1);
3900 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
3901 hobject_t poid;
3902 eversion_t v;
3903 osd_reqid_t reqid;
3904 MOSDSubOp *subop = new MOSDSubOp(
3905 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3906 get_osdmap()->get_epoch(), osd->get_tid(), v);
3907 subop->ops = scrub;
3908 osd->send_message_osd_cluster(
3909 i->osd, subop, get_osdmap()->get_epoch());
3910 }
3911 }
3912 }
3913
3914 void PG::scrub_unreserve_replicas()
3915 {
3916 assert(backfill_targets.empty());
3917 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3918 i != actingbackfill.end();
3919 ++i) {
3920 if (*i == pg_whoami) continue;
3921 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
3922 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3923 osd->send_message_osd_cluster(
3924 i->osd,
3925 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3926 get_osdmap()->get_epoch(),
3927 MOSDScrubReserve::RELEASE, pg_whoami),
3928 get_osdmap()->get_epoch());
3929 } else {
3930 // for jewel compat only
3931 vector<OSDOp> scrub(1);
3932 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
3933 hobject_t poid;
3934 eversion_t v;
3935 osd_reqid_t reqid;
3936 MOSDSubOp *subop = new MOSDSubOp(
3937 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3938 get_osdmap()->get_epoch(), osd->get_tid(), v);
3939 subop->ops = scrub;
3940 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
3941 }
3942 }
3943 }
3944
3945 void PG::_scan_rollback_obs(
3946 const vector<ghobject_t> &rollback_obs,
3947 ThreadPool::TPHandle &handle)
3948 {
3949 ObjectStore::Transaction t;
3950 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
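// rollback objects whose generation predates trimmed_to can never be needed
// for a rollback again; treat them as stale leftovers and remove them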
3951 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
3952 i != rollback_obs.end();
3953 ++i) {
3954 if (i->generation < trimmed_to.version) {
3955 osd->clog->error() << "osd." << osd->whoami
3956 << " pg " << info.pgid
3957 << " found obsolete rollback obj "
3958 << *i << " generation < trimmed_to "
3959 << trimmed_to
3960 << "...repaired";
3961 t.remove(coll, *i);
3962 }
3963 }
3964 if (!t.empty()) {
3965 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
3966 << dendl;
3967 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3968 }
3969 }
3970
3971 void PG::_scan_snaps(ScrubMap &smap)
3972 {
3973 hobject_t head;
3974 SnapSet snapset;
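// iterate the map in reverse so each head/snapdir (which carries the SnapSet)
// is seen before the clones that depend on it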
3975 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
3976 i != smap.objects.rend();
3977 ++i) {
3978 const hobject_t &hoid = i->first;
3979 ScrubMap::object &o = i->second;
3980
3981 if (hoid.is_head() || hoid.is_snapdir()) {
3982 // parse the SnapSet
3983 bufferlist bl;
3984 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
3985 continue;
3986 }
3987 bl.push_back(o.attrs[SS_ATTR]);
3988 auto p = bl.begin();
3989 try {
3990 ::decode(snapset, p);
3991 } catch(...) {
3992 continue;
3993 }
3994 head = hoid.get_head();
3995 continue;
3996 }
3997 if (hoid.snap < CEPH_MAXSNAP) {
3998 // check and if necessary fix snap_mapper
3999 if (hoid.get_head() != head) {
4000 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4001 << dendl;
4002 continue;
4003 }
4004 set<snapid_t> obj_snaps;
4005 if (!snapset.is_legacy()) {
4006 auto p = snapset.clone_snaps.find(hoid.snap);
4007 if (p == snapset.clone_snaps.end()) {
4008 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4009 << dendl;
4010 continue;
4011 }
4012 obj_snaps.insert(p->second.begin(), p->second.end());
4013 } else {
4014 bufferlist bl;
4015 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4016 continue;
4017 }
4018 bl.push_back(o.attrs[OI_ATTR]);
4019 object_info_t oi;
4020 try {
4021 oi.decode(bl);
4022 } catch(...) {
4023 continue;
4024 }
4025 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4026 }
4027 set<snapid_t> cur_snaps;
4028 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4029 if (r != 0 && r != -ENOENT) {
4030 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4031 ceph_abort();
4032 }
4033 if (r == -ENOENT || cur_snaps != obj_snaps) {
4034 ObjectStore::Transaction t;
4035 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4036 if (r == 0) {
4037 r = snap_mapper.remove_oid(hoid, &_t);
4038 if (r != 0) {
4039 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4040 << dendl;
4041 ceph_abort();
4042 }
4043 osd->clog->error() << "osd." << osd->whoami
4044 << " found snap mapper error on pg "
4045 << info.pgid
4046 << " oid " << hoid << " snaps in mapper: "
4047 << cur_snaps << ", oi: "
4048 << obj_snaps
4049 << "...repaired";
4050 } else {
4051 osd->clog->error() << "osd." << osd->whoami
4052 << " found snap mapper error on pg "
4053 << info.pgid
4054 << " oid " << hoid << " snaps missing in mapper"
4055 << ", should be: "
4056 << obj_snaps
4057 << "...repaired";
4058 }
4059 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4060 r = osd->store->apply_transaction(osr.get(), std::move(t));
4061 if (r != 0) {
4062 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4063 << dendl;
4064 }
4065 }
4066 }
4067 }
4068 }
4069
4070 void PG::_repair_oinfo_oid(ScrubMap &smap)
4071 {
4072 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4073 i != smap.objects.rend();
4074 ++i) {
4075 const hobject_t &hoid = i->first;
4076 ScrubMap::object &o = i->second;
4077
4078 bufferlist bl;
4079 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4080 continue;
4081 }
4082 bl.push_back(o.attrs[OI_ATTR]);
4083 object_info_t oi;
4084 try {
4085 oi.decode(bl);
4086 } catch(...) {
4087 continue;
4088 }
4089 if (oi.soid != hoid) {
4090 ObjectStore::Transaction t;
4091 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4092 osd->clog->error() << "osd." << osd->whoami
4093 << " found object info error on pg "
4094 << info.pgid
4095 << " oid " << hoid << " oid in object info: "
4096 << oi.soid
4097 << "...repaired";
4098 // Fix object info
4099 oi.soid = hoid;
4100 bl.clear();
4101 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4102
4103 bufferptr bp(bl.c_str(), bl.length());
4104 o.attrs[OI_ATTR] = bp;
4105
4106 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4107 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4108 if (r != 0) {
4109 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4110 << dendl;
4111 }
4112 }
4113 }
4114 }
4115
4116 /*
4117 * build a scrub map over a chunk without releasing the lock
4118 * only used by chunky scrub
4119 */
4120 int PG::build_scrub_map_chunk(
4121 ScrubMap &map,
4122 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4123 ThreadPool::TPHandle &handle)
4124 {
4125 dout(10) << __func__ << " [" << start << "," << end << ") "
4126 << " seed " << seed << dendl;
4127
4128 map.valid_through = info.last_update;
4129
4130 // objects
4131 vector<hobject_t> ls;
4132 vector<ghobject_t> rollback_obs;
4133 int ret = get_pgbackend()->objects_list_range(
4134 start,
4135 end,
4136 0,
4137 &ls,
4138 &rollback_obs);
4139 if (ret < 0) {
4140 dout(5) << "objects_list_range error: " << ret << dendl;
4141 return ret;
4142 }
4143
4144
4145 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4146 _scan_rollback_obs(rollback_obs, handle);
4147 _scan_snaps(map);
4148 _repair_oinfo_oid(map);
4149
4150 dout(20) << __func__ << " done" << dendl;
4151 return 0;
4152 }
4153
4154 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4155 if (!store)
4156 return;
4157 struct OnComplete : Context {
4158 std::unique_ptr<Scrub::Store> store;
4159 OnComplete(
4160 std::unique_ptr<Scrub::Store> &&store)
4161 : store(std::move(store)) {}
4162 void finish(int) override {}
4163 };
4164 store->cleanup(t);
4165 t->register_on_complete(new OnComplete(std::move(store)));
4166 assert(!store);
4167 }
4168
4169 void PG::repair_object(
4170 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4171 pg_shard_t bad_peer)
4172 {
4173 list<pg_shard_t> op_shards;
4174 for (auto i : *ok_peers) {
4175 op_shards.push_back(i.second);
4176 }
4177 dout(10) << "repair_object " << soid << " bad_peer osd."
4178 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
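// decode the object_info from the last known-good peer; its version is what
// gets recorded in the missing sets below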
4179 ScrubMap::object &po = ok_peers->back().first;
4180 eversion_t v;
4181 bufferlist bv;
4182 bv.push_back(po.attrs[OI_ATTR]);
4183 object_info_t oi;
4184 try {
4185 bufferlist::iterator bliter = bv.begin();
4186 ::decode(oi, bliter);
4187 } catch (...) {
4188 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4189 assert(0);
4190 }
4191 if (bad_peer != primary) {
4192 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4193 } else {
4194 // We should only be scrubbing if the PG is clean.
4195 assert(waiting_for_unreadable_object.empty());
4196
4197 pg_log.missing_add(soid, oi.version, eversion_t());
4198
4199 pg_log.set_last_requested(0);
4200 dout(10) << __func__ << ": primary = " << primary << dendl;
4201 }
4202
4203 if (is_ec_pg() || bad_peer == primary) {
4204 // we'd better collect all shards for an EC pg, and prepare the good peers as the
4205 // source of a pull in the case of a replicated pg.
4206 missing_loc.add_missing(soid, oi.version, eversion_t());
4207 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4208 for (i = ok_peers->begin();
4209 i != ok_peers->end();
4210 ++i)
4211 missing_loc.add_location(soid, i->second);
4212 }
4213 }
4214
4215 /* replica_scrub
4216 *
4217 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4218 * for pushes to complete in case of recent recovery. Build a single
4219 * scrubmap of objects that are in the range [msg->start, msg->end).
4220 */
4221 void PG::replica_scrub(
4222 OpRequestRef op,
4223 ThreadPool::TPHandle &handle)
4224 {
4225 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4226 assert(!scrubber.active_rep_scrub);
4227 dout(7) << "replica_scrub" << dendl;
4228
4229 if (msg->map_epoch < info.history.same_interval_since) {
4230 dout(10) << "replica_scrub discarding old replica_scrub from "
4231 << msg->map_epoch << " < " << info.history.same_interval_since
4232 << dendl;
4233 return;
4234 }
4235
4236 ScrubMap map;
4237
4238 assert(msg->chunky);
4239 if (last_update_applied < msg->scrub_to) {
4240 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4241 scrubber.active_rep_scrub = op;
4242 return;
4243 }
4244
4245 if (active_pushes > 0) {
4246 dout(10) << "waiting for active pushes to finish" << dendl;
4247 scrubber.active_rep_scrub = op;
4248 return;
4249 }
4250
4251 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4252 hobject_t start = msg->start;
4253 hobject_t end = msg->end;
4254 if (!start.is_max())
4255 start.pool = info.pgid.pool();
4256 if (!end.is_max())
4257 end.pool = info.pgid.pool();
4258
4259 build_scrub_map_chunk(
4260 map, start, end, msg->deep, msg->seed,
4261 handle);
4262
4263 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4264 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4265 spg_t(info.pgid.pgid, get_primary().shard),
4266 msg->map_epoch,
4267 pg_whoami);
4268 ::encode(map, reply->get_data());
4269 osd->send_message_osd_cluster(reply, msg->get_connection());
4270 } else {
4271 // for jewel compatibility
4272 vector<OSDOp> scrub(1);
4273 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4274 hobject_t poid;
4275 eversion_t v;
4276 osd_reqid_t reqid;
4277 MOSDSubOp *subop = new MOSDSubOp(
4278 reqid,
4279 pg_whoami,
4280 spg_t(info.pgid.pgid, get_primary().shard),
4281 poid,
4282 0,
4283 msg->map_epoch,
4284 osd->get_tid(),
4285 v);
4286 ::encode(map, subop->get_data());
4287 subop->ops = scrub;
4288 osd->send_message_osd_cluster(subop, msg->get_connection());
4289 }
4290 }
4291
4292 /* Scrub:
4293 * PG_STATE_SCRUBBING is set when the scrub is queued
4294 *
4295 * scrub will be chunky if all OSDs in PG support chunky scrub
4296 * scrub will fail if OSDs are too old.
4297 */
4298 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4299 {
4300 if (cct->_conf->osd_scrub_sleep > 0 &&
4301 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4302 scrubber.state == PG::Scrubber::INACTIVE) &&
4303 scrubber.needs_sleep) {
4304 ceph_assert(!scrubber.sleeping);
4305 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4306
4307 // Do an async sleep so we don't block the op queue
4308 OSDService *osds = osd;
4309 spg_t pgid = get_pgid();
4310 int state = scrubber.state;
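// capture only plain values here; the callback runs after the lock is dropped
// and must re-look-up and re-lock the PG, which may have changed or gone away
// while we slept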
4311 auto scrub_requeue_callback =
4312 new FunctionContext([osds, pgid, state](int r) {
4313 PG *pg = osds->osd->lookup_lock_pg(pgid);
4314 if (pg == nullptr) {
4315 lgeneric_dout(osds->osd->cct, 20)
4316 << "scrub_requeue_callback: Could not find "
4317 << "PG " << pgid << " can't complete scrub requeue after sleep"
4318 << dendl;
4319 return;
4320 }
4321 pg->scrubber.sleeping = false;
4322 pg->scrubber.needs_sleep = false;
4323 lgeneric_dout(pg->cct, 20)
4324 << "scrub_requeue_callback: slept for "
4325 << ceph_clock_now() - pg->scrubber.sleep_start
4326 << ", re-queuing scrub with state " << state << dendl;
4327 pg->scrub_queued = false;
4328 pg->requeue_scrub();
4329 pg->scrubber.sleep_start = utime_t();
4330 pg->unlock();
4331 });
4332 Mutex::Locker l(osd->scrub_sleep_lock);
4333 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4334 scrub_requeue_callback);
4335 scrubber.sleeping = true;
4336 scrubber.sleep_start = ceph_clock_now();
4337 return;
4338 }
4339 if (pg_has_reset_since(queued)) {
4340 return;
4341 }
4342 assert(scrub_queued);
4343 scrub_queued = false;
4344 scrubber.needs_sleep = true;
4345
4346 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4347 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4348 state_clear(PG_STATE_SCRUBBING);
4349 state_clear(PG_STATE_REPAIR);
4350 state_clear(PG_STATE_DEEP_SCRUB);
4351 publish_stats_to_osd();
4352 return;
4353 }
4354
4355 if (!scrubber.active) {
4356 assert(backfill_targets.empty());
4357
4358 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4359
4360 dout(10) << "starting a new chunky scrub" << dendl;
4361 }
4362
4363 chunky_scrub(handle);
4364 }
4365
4366 /*
4367 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4368 * chunk.
4369 *
4370 * The object store is partitioned into chunks which end on hash boundaries. For
4371 * each chunk, the following logic is performed:
4372 *
4373 * (1) Block writes on the chunk
4374 * (2) Request maps from replicas
4375 * (3) Wait for pushes to be applied (after recovery)
4376 * (4) Wait for writes to flush on the chunk
4377 * (5) Wait for maps from replicas
4378 * (6) Compare / repair all scrub maps
4379 * (7) Wait for digest updates to apply
4380 *
4381 * This logic is encoded in the mostly linear state machine:
4382 *
4383 * +------------------+
4384 * _________v__________ |
4385 * | | |
4386 * | INACTIVE | |
4387 * |____________________| |
4388 * | |
4389 * | +----------+ |
4390 * _________v___v______ | |
4391 * | | | |
4392 * | NEW_CHUNK | | |
4393 * |____________________| | |
4394 * | | |
4395 * _________v__________ | |
4396 * | | | |
4397 * | WAIT_PUSHES | | |
4398 * |____________________| | |
4399 * | | |
4400 * _________v__________ | |
4401 * | | | |
4402 * | WAIT_LAST_UPDATE | | |
4403 * |____________________| | |
4404 * | | |
4405 * _________v__________ | |
4406 * | | | |
4407 * | BUILD_MAP | | |
4408 * |____________________| | |
4409 * | | |
4410 * _________v__________ | |
4411 * | | | |
4412 * | WAIT_REPLICAS | | |
4413 * |____________________| | |
4414 * | | |
4415 * _________v__________ | |
4416 * | | | |
4417 * | COMPARE_MAPS | | |
4418 * |____________________| | |
4419 * | | |
4420 * | | |
4421 * _________v__________ | |
4422 * | | | |
4423 * |WAIT_DIGEST_UPDATES | | |
4424 * |____________________| | |
4425 * | | | |
4426 * | +----------+ |
4427 * _________v__________ |
4428 * | | |
4429 * | FINISH | |
4430 * |____________________| |
4431 * | |
4432 * +------------------+
4433 *
4434 * The primary determines the last update from the subset by walking the log. If
4435 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4436 * to wait until that update is applied before building a scrub map. Both the
4437 * primary and replicas will wait for any active pushes to be applied.
4438 *
4439 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4440 *
4441 * scrubber.state encodes the current state of the scrub (refer to state diagram
4442 * for details).
4443 */
4444 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4445 {
4446 // check for map changes
4447 if (scrubber.is_chunky_scrub_active()) {
4448 if (scrubber.epoch_start != info.history.same_interval_since) {
4449 dout(10) << "scrub pg changed, aborting" << dendl;
4450 scrub_clear_state();
4451 scrub_unreserve_replicas();
4452 return;
4453 }
4454 }
4455
4456 bool done = false;
4457 int ret;
4458
4459 while (!done) {
4460 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4461 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4462
4463 switch (scrubber.state) {
4464 case PG::Scrubber::INACTIVE:
4465 dout(10) << "scrub start" << dendl;
4466
4467 publish_stats_to_osd();
4468 scrubber.epoch_start = info.history.same_interval_since;
4469 scrubber.active = true;
4470
4471 osd->inc_scrubs_active(scrubber.reserved);
4472 if (scrubber.reserved) {
4473 scrubber.reserved = false;
4474 scrubber.reserved_peers.clear();
4475 }
4476
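// (re)create the persistent scrub store used to record inconsistencies found
// during this scrub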
4477 {
4478 ObjectStore::Transaction t;
4479 scrubber.cleanup_store(&t);
4480 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4481 info.pgid, coll));
4482 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4483 }
4484
4485 // Don't include temporary objects when scrubbing
4486 scrubber.start = info.pgid.pgid.get_hobj_start();
4487 scrubber.state = PG::Scrubber::NEW_CHUNK;
4488
4489 {
4490 bool repair = state_test(PG_STATE_REPAIR);
4491 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4492 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4493 stringstream oss;
4494 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4495 osd->clog->info(oss);
4496 }
4497
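// a single seed value is shared with the replicas (see _request_scrub_map)
// so the primary and replica scrub maps for a chunk are built consistently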
4498 scrubber.seed = -1;
4499
4500 break;
4501
4502 case PG::Scrubber::NEW_CHUNK:
4503 scrubber.primary_scrubmap = ScrubMap();
4504 scrubber.received_maps.clear();
4505
4506 {
4507 /* get the start and end of our scrub chunk
4508 *
4509 * Our scrub chunk has an important restriction we're going to need to
4510 * respect. We can't let head or snapdir be start or end.
4511 * Using a half-open interval means that if end == head|snapdir,
4512 * we'd scrub/lock head and the clone right next to head in different
4513 * chunks which would allow us to miss clones created between
4514 * scrubbing that chunk and scrubbing the chunk including head.
4515 * This isn't true for any of the other clones since clones can
4516 * only be created "just to the left of" head. There is one exception
4517 * to this: promotion of clones which always happens to the left of the
4518 * left-most clone, but promote_object checks the scrubber in that
4519 * case, so it should be ok. Also, it's ok to "miss" clones at the
4520 * left end of the range if we are a tier because they may legitimately
4521 * not exist (see _scrub).
4522 */
4523 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4524 hobject_t start = scrubber.start;
4525 hobject_t candidate_end;
4526 vector<hobject_t> objects;
4527 ret = get_pgbackend()->objects_list_partial(
4528 start,
4529 min,
4530 MAX(min, cct->_conf->osd_scrub_chunk_max),
4531 &objects,
4532 &candidate_end);
4533 assert(ret >= 0);
4534
4535 if (!objects.empty()) {
4536 hobject_t back = objects.back();
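// walk candidate_end back past any objects that share its head, so a head
// and its clones never straddle a chunk boundary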
4537 while (candidate_end.has_snapset() &&
4538 candidate_end.get_head() == back.get_head()) {
4539 candidate_end = back;
4540 objects.pop_back();
4541 if (objects.empty()) {
4542 assert(0 ==
4543 "Somehow we got more than 2 objects which"
4544 "have the same head but are not clones");
4545 }
4546 back = objects.back();
4547 }
4548 if (candidate_end.has_snapset()) {
4549 assert(candidate_end.get_head() != back.get_head());
4550 candidate_end = candidate_end.get_object_boundary();
4551 }
4552 } else {
4553 assert(candidate_end.is_max());
4554 }
4555
4556 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4557 // we'll be requeued by whatever made us unavailable for scrub
4558 dout(10) << __func__ << ": scrub blocked somewhere in range "
4559 << "[" << scrubber.start << ", " << candidate_end << ")"
4560 << dendl;
4561 done = true;
4562 break;
4563 }
4564 scrubber.end = candidate_end;
4565 }
4566
4567 // walk the log to find the latest update that affects our chunk
4568 scrubber.subset_last_update = eversion_t();
4569 for (auto p = projected_log.log.rbegin();
4570 p != projected_log.log.rend();
4571 ++p) {
4572 if (p->soid >= scrubber.start &&
4573 p->soid < scrubber.end) {
4574 scrubber.subset_last_update = p->version;
4575 break;
4576 }
4577 }
4578 if (scrubber.subset_last_update == eversion_t()) {
4579 for (list<pg_log_entry_t>::const_reverse_iterator p =
4580 pg_log.get_log().log.rbegin();
4581 p != pg_log.get_log().log.rend();
4582 ++p) {
4583 if (p->soid >= scrubber.start &&
4584 p->soid < scrubber.end) {
4585 scrubber.subset_last_update = p->version;
4586 break;
4587 }
4588 }
4589 }
4590
4591 // ask replicas to wait until
4592 // last_update_applied >= scrubber.subset_last_update and then scan
4593 scrubber.waiting_on_whom.insert(pg_whoami);
4594 ++scrubber.waiting_on;
4595
4596 // request maps from replicas
4597 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4598 i != actingbackfill.end();
4599 ++i) {
4600 if (*i == pg_whoami) continue;
4601 _request_scrub_map(*i, scrubber.subset_last_update,
4602 scrubber.start, scrubber.end, scrubber.deep,
4603 scrubber.seed);
4604 scrubber.waiting_on_whom.insert(*i);
4605 ++scrubber.waiting_on;
4606 }
4607
4608 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4609
4610 break;
4611
4612 case PG::Scrubber::WAIT_PUSHES:
4613 if (active_pushes == 0) {
4614 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4615 } else {
4616 dout(15) << "wait for pushes to apply" << dendl;
4617 done = true;
4618 }
4619 break;
4620
4621 case PG::Scrubber::WAIT_LAST_UPDATE:
4622 if (last_update_applied >= scrubber.subset_last_update) {
4623 scrubber.state = PG::Scrubber::BUILD_MAP;
4624 } else {
4625 // will be requeued by op_applied
4626 dout(15) << "wait for writes to flush" << dendl;
4627 done = true;
4628 }
4629 break;
4630
4631 case PG::Scrubber::BUILD_MAP:
4632 assert(last_update_applied >= scrubber.subset_last_update);
4633
4634 // build my own scrub map
4635 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4636 scrubber.start, scrubber.end,
4637 scrubber.deep, scrubber.seed,
4638 handle);
4639 if (ret < 0) {
4640 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4641 scrub_clear_state();
4642 scrub_unreserve_replicas();
4643 return;
4644 }
4645
4646 --scrubber.waiting_on;
4647 scrubber.waiting_on_whom.erase(pg_whoami);
4648
4649 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4650 break;
4651
4652 case PG::Scrubber::WAIT_REPLICAS:
4653 if (scrubber.waiting_on > 0) {
4654 // will be requeued by sub_op_scrub_map
4655 dout(10) << "wait for replicas to build scrub map" << dendl;
4656 done = true;
4657 } else {
4658 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4659 }
4660 break;
4661
4662 case PG::Scrubber::COMPARE_MAPS:
4663 assert(last_update_applied >= scrubber.subset_last_update);
4664 assert(scrubber.waiting_on == 0);
4665
4666 scrub_compare_maps();
4667 scrubber.start = scrubber.end;
4668 scrubber.run_callbacks();
4669
4670 // requeue the writes from the chunk that just finished
4671 requeue_ops(waiting_for_scrub);
4672
4673 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4674
4675 // fall-thru
4676
4677 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4678 if (scrubber.num_digest_updates_pending) {
4679 dout(10) << __func__ << " waiting on "
4680 << scrubber.num_digest_updates_pending
4681 << " digest updates" << dendl;
4682 done = true;
4683 break;
4684 }
4685
4686 if (!(scrubber.end.is_max())) {
4687 scrubber.state = PG::Scrubber::NEW_CHUNK;
4688 requeue_scrub();
4689 done = true;
4690 } else {
4691 scrubber.state = PG::Scrubber::FINISH;
4692 }
4693
4694 break;
4695
4696 case PG::Scrubber::FINISH:
4697 scrub_finish();
4698 scrubber.state = PG::Scrubber::INACTIVE;
4699 done = true;
4700
4701 if (!snap_trimq.empty()) {
4702 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4703 snap_trimmer_scrub_complete();
4704 }
4705
4706 break;
4707
4708 default:
4709 ceph_abort();
4710 }
4711 }
4712 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4713 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4714 }
4715
4716 void PG::scrub_clear_state()
4717 {
4718 assert(is_locked());
4719 state_clear(PG_STATE_SCRUBBING);
4720 state_clear(PG_STATE_REPAIR);
4721 state_clear(PG_STATE_DEEP_SCRUB);
4722 publish_stats_to_osd();
4723
4724 // active -> nothing.
4725 if (scrubber.active)
4726 osd->dec_scrubs_active();
4727
4728 requeue_ops(waiting_for_scrub);
4729
4730 scrubber.reset();
4731
4732 // type-specific state clear
4733 _scrub_clear_state();
4734 }
4735
4736 void PG::scrub_compare_maps()
4737 {
4738 dout(10) << __func__ << " has maps, analyzing" << dendl;
4739
4740 // construct authoritative scrub map for type specific scrubbing
4741 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4742 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4743
4744 if (acting.size() > 1) {
4745 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4746
4747 stringstream ss;
4748
4749 // Map from object with errors to good peer
4750 map<hobject_t, list<pg_shard_t>> authoritative;
4751 map<pg_shard_t, ScrubMap *> maps;
4752
4753 dout(2) << __func__ << " osd." << acting[0] << " has "
4754 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4755 maps[pg_whoami] = &scrubber.primary_scrubmap;
4756
4757 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4758 i != actingbackfill.end();
4759 ++i) {
4760 if (*i == pg_whoami) continue;
4761 dout(2) << __func__ << " replica " << *i << " has "
4762 << scrubber.received_maps[*i].objects.size()
4763 << " items" << dendl;
4764 maps[*i] = &scrubber.received_maps[*i];
4765 }
4766
4767 get_pgbackend()->be_compare_scrubmaps(
4768 maps,
4769 state_test(PG_STATE_REPAIR),
4770 scrubber.missing,
4771 scrubber.inconsistent,
4772 authoritative,
4773 missing_digest,
4774 scrubber.shallow_errors,
4775 scrubber.deep_errors,
4776 scrubber.store.get(),
4777 info.pgid, acting,
4778 ss);
4779 dout(2) << ss.str() << dendl;
4780
4781 if (!ss.str().empty()) {
4782 osd->clog->error(ss);
4783 }
4784
4785 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4786 i != authoritative.end();
4787 ++i) {
4788 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4789 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4790 j != i->second.end();
4791 ++j) {
4792 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4793 }
4794 scrubber.authoritative.insert(
4795 make_pair(
4796 i->first,
4797 good_peers));
4798 }
4799
4800 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4801 i != authoritative.end();
4802 ++i) {
4803 scrubber.cleaned_meta_map.objects.erase(i->first);
4804 scrubber.cleaned_meta_map.objects.insert(
4805 *(maps[i->second.back()]->objects.find(i->first))
4806 );
4807 }
4808 }
4809
4810 ScrubMap for_meta_scrub;
4811 if (scrubber.end.is_max() ||
4812 scrubber.cleaned_meta_map.objects.empty()) {
4813 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4814 } else {
4815 auto iter = scrubber.cleaned_meta_map.objects.end();
4816 --iter; // not empty, see the if clause above
4817 auto begin = scrubber.cleaned_meta_map.objects.begin();
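// back up over the trailing objects that share the last object's head; they
// stay in cleaned_meta_map for the next chunk, while everything before them
// is handed to the metadata scrub now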
4818 while (iter != begin) {
4819 auto next = iter--;
4820 if (next->first.get_head() != iter->first.get_head()) {
4821 ++iter;
4822 break;
4823 }
4824 }
4825 for_meta_scrub.objects.insert(begin, iter);
4826 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4827 }
4828
4829 // ok, do the pg-type specific scrubbing
4830 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4831 if (!scrubber.store->empty()) {
4832 if (state_test(PG_STATE_REPAIR)) {
4833 dout(10) << __func__ << ": discarding scrub results" << dendl;
4834 scrubber.store->flush(nullptr);
4835 } else {
4836 dout(10) << __func__ << ": updating scrub object" << dendl;
4837 ObjectStore::Transaction t;
4838 scrubber.store->flush(&t);
4839 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4840 }
4841 }
4842 }
4843
4844 bool PG::scrub_process_inconsistent()
4845 {
4846 dout(10) << __func__ << ": checking authoritative" << dendl;
4847 bool repair = state_test(PG_STATE_REPAIR);
4848 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4849 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4850
4851 // scrubber.authoritative only stores objects which are missing or inconsistent.
4852 if (!scrubber.authoritative.empty()) {
4853 stringstream ss;
4854 ss << info.pgid << " " << mode << " "
4855 << scrubber.missing.size() << " missing, "
4856 << scrubber.inconsistent.size() << " inconsistent objects";
4857 dout(2) << ss.str() << dendl;
4858 osd->clog->error(ss);
4859 if (repair) {
4860 state_clear(PG_STATE_CLEAN);
4861 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4862 scrubber.authoritative.begin();
4863 i != scrubber.authoritative.end();
4864 ++i) {
4865 set<pg_shard_t>::iterator j;
4866
4867 auto missing_entry = scrubber.missing.find(i->first);
4868 if (missing_entry != scrubber.missing.end()) {
4869 for (j = missing_entry->second.begin();
4870 j != missing_entry->second.end();
4871 ++j) {
4872 repair_object(
4873 i->first,
4874 &(i->second),
4875 *j);
4876 ++scrubber.fixed;
4877 }
4878 }
4879 if (scrubber.inconsistent.count(i->first)) {
4880 for (j = scrubber.inconsistent[i->first].begin();
4881 j != scrubber.inconsistent[i->first].end();
4882 ++j) {
4883 repair_object(i->first,
4884 &(i->second),
4885 *j);
4886 ++scrubber.fixed;
4887 }
4888 }
4889 }
4890 }
4891 }
4892 return (!scrubber.authoritative.empty() && repair);
4893 }
4894
4895 bool PG::ops_blocked_by_scrub() const {
4896 return (waiting_for_scrub.size() != 0);
4897 }
4898
4899 // the part that actually finalizes a scrub
4900 void PG::scrub_finish()
4901 {
4902 bool repair = state_test(PG_STATE_REPAIR);
4903 // if the repair request comes from auto-repair and there are a large number
4904 // of errors, we would like to cancel the auto-repair
4905 if (repair && scrubber.auto_repair
4906 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
4907 state_clear(PG_STATE_REPAIR);
4908 repair = false;
4909 }
4910 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4911 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4912
4913 // type-specific finish (can tally more errors)
4914 _scrub_finish();
4915
4916 bool has_error = scrub_process_inconsistent();
4917
4918 {
4919 stringstream oss;
4920 oss << info.pgid.pgid << " " << mode << " ";
4921 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
4922 if (total_errors)
4923 oss << total_errors << " errors";
4924 else
4925 oss << "ok";
4926 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
4927 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
4928 << " remaining deep scrub error details lost)";
4929 if (repair)
4930 oss << ", " << scrubber.fixed << " fixed";
4931 if (total_errors)
4932 osd->clog->error(oss);
4933 else
4934 osd->clog->info(oss);
4935 }
4936
4937 // finish up
4938 unreg_next_scrub();
4939 utime_t now = ceph_clock_now();
4940 info.history.last_scrub = info.last_update;
4941 info.history.last_scrub_stamp = now;
4942 if (scrubber.deep) {
4943 info.history.last_deep_scrub = info.last_update;
4944 info.history.last_deep_scrub_stamp = now;
4945 }
4946 // Since we don't know which errors were fixed, we can only clear them
4947 // when every one has been fixed.
4948 if (repair) {
4949 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
4950 assert(deep_scrub);
4951 scrubber.shallow_errors = scrubber.deep_errors = 0;
4952 } else {
4953 // Deep scrub in order to get corrected error counts
4954 scrub_after_recovery = true;
4955 }
4956 }
4957 if (deep_scrub) {
4958 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
4959 info.history.last_clean_scrub_stamp = now;
4960 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4961 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
4962 } else {
4963 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4964 // XXX: a fresh last_clean_scrub_stamp doesn't mean the pg is free of
4965 // inconsistencies caused by deep-scrub errors
4966 if (scrubber.shallow_errors == 0)
4967 info.history.last_clean_scrub_stamp = now;
4968 }
4969 info.stats.stats.sum.num_scrub_errors =
4970 info.stats.stats.sum.num_shallow_scrub_errors +
4971 info.stats.stats.sum.num_deep_scrub_errors;
4972 reg_next_scrub();
4973
4974 {
4975 ObjectStore::Transaction t;
4976 dirty_info = true;
4977 write_if_dirty(t);
4978 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4979 assert(tr == 0);
4980 }
4981
4982
4983 if (has_error) {
4984 queue_peering_event(
4985 CephPeeringEvtRef(
4986 std::make_shared<CephPeeringEvt>(
4987 get_osdmap()->get_epoch(),
4988 get_osdmap()->get_epoch(),
4989 DoRecovery())));
4990 }
4991
4992 scrub_clear_state();
4993 scrub_unreserve_replicas();
4994
4995 if (is_active() && is_primary()) {
4996 share_pg_info();
4997 }
4998 }
4999
5000 void PG::share_pg_info()
5001 {
5002 dout(10) << "share_pg_info" << dendl;
5003
5004 // share new pg_info_t with replicas
5005 assert(!actingbackfill.empty());
5006 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5007 i != actingbackfill.end();
5008 ++i) {
5009 if (*i == pg_whoami) continue;
5010 pg_shard_t peer = *i;
5011 if (peer_info.count(peer)) {
5012 peer_info[peer].last_epoch_started = info.last_epoch_started;
5013 peer_info[peer].last_interval_started = info.last_interval_started;
5014 peer_info[peer].history.merge(info.history);
5015 }
5016 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5017 m->pg_list.push_back(
5018 make_pair(
5019 pg_notify_t(
5020 peer.shard, pg_whoami.shard,
5021 get_osdmap()->get_epoch(),
5022 get_osdmap()->get_epoch(),
5023 info),
5024 PastIntervals()));
5025 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5026 }
5027 }
5028
5029 bool PG::append_log_entries_update_missing(
5030 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5031 ObjectStore::Transaction &t)
5032 {
5033 assert(!entries.empty());
5034 assert(entries.begin()->version > info.last_update);
5035
5036 PGLogEntryHandler rollbacker{this, &t};
5037 bool invalidate_stats =
5038 pg_log.append_new_log_entries(info.last_backfill,
5039 info.last_backfill_bitwise,
5040 entries,
5041 &rollbacker);
5042 info.last_update = pg_log.get_head();
5043
5044 if (pg_log.get_missing().num_missing() == 0) {
5045 // advance last_complete since nothing else is missing!
5046 info.last_complete = info.last_update;
5047 }
5048
5049 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5050 dirty_info = true;
5051 write_if_dirty(t);
5052 return invalidate_stats;
5053 }
5054
5055
5056 void PG::merge_new_log_entries(
5057 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5058 ObjectStore::Transaction &t)
5059 {
5060 dout(10) << __func__ << " " << entries << dendl;
5061 assert(is_primary());
5062
5063 bool rebuild_missing = append_log_entries_update_missing(entries, t);
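// mirror the new entries into each peer's cached missing set and info so the
// primary's view of its replicas stays consistent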
5064 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5065 i != actingbackfill.end();
5066 ++i) {
5067 pg_shard_t peer(*i);
5068 if (peer == pg_whoami) continue;
5069 assert(peer_missing.count(peer));
5070 assert(peer_info.count(peer));
5071 pg_missing_t& pmissing(peer_missing[peer]);
5072 pg_info_t& pinfo(peer_info[peer]);
5073 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5074 pinfo.last_backfill,
5075 info.last_backfill_bitwise,
5076 entries,
5077 true,
5078 NULL,
5079 pmissing,
5080 NULL,
5081 this);
5082 pinfo.last_update = info.last_update;
5083 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5084 rebuild_missing = rebuild_missing || invalidate_stats;
5085 }
5086
5087 if (!rebuild_missing) {
5088 return;
5089 }
5090
5091 for (auto &&i: entries) {
5092 missing_loc.rebuild(
5093 i.soid,
5094 pg_whoami,
5095 actingbackfill,
5096 info,
5097 pg_log.get_missing(),
5098 peer_missing,
5099 peer_info);
5100 }
5101 }
5102
5103 void PG::update_history(const pg_history_t& new_history)
5104 {
5105 unreg_next_scrub();
5106 if (info.history.merge(new_history)) {
5107 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5108 dirty_info = true;
5109 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5110 dout(20) << __func__ << " clearing past_intervals" << dendl;
5111 past_intervals.clear();
5112 dirty_big_info = true;
5113 }
5114 }
5115 reg_next_scrub();
5116 }
5117
5118 void PG::fulfill_info(
5119 pg_shard_t from, const pg_query_t &query,
5120 pair<pg_shard_t, pg_info_t> &notify_info)
5121 {
5122 assert(from == primary);
5123 assert(query.type == pg_query_t::INFO);
5124
5125 // info
5126 dout(10) << "sending info" << dendl;
5127 notify_info = make_pair(from, info);
5128 }
5129
5130 void PG::fulfill_log(
5131 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5132 {
5133 dout(10) << "log request from " << from << dendl;
5134 assert(from == primary);
5135 assert(query.type != pg_query_t::INFO);
5136 ConnectionRef con = osd->get_con_osd_cluster(
5137 from.osd, get_osdmap()->get_epoch());
5138 if (!con) return;
5139
5140 MOSDPGLog *mlog = new MOSDPGLog(
5141 from.shard, pg_whoami.shard,
5142 get_osdmap()->get_epoch(),
5143 info, query_epoch);
5144 mlog->missing = pg_log.get_missing();
5145
5146 // primary -> other, when building master log
5147 if (query.type == pg_query_t::LOG) {
5148 dout(10) << " sending info+missing+log since " << query.since
5149 << dendl;
5150 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5151 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5152 << " when my log.tail is " << pg_log.get_tail()
5153 << ", sending full log instead";
5154 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5155 } else
5156 mlog->log.copy_after(pg_log.get_log(), query.since);
5157 }
5158 else if (query.type == pg_query_t::FULLLOG) {
5159 dout(10) << " sending info+missing+full log" << dendl;
5160 mlog->log = pg_log.get_log();
5161 }
5162
5163 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5164
5165 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5166 osd->send_message_osd_cluster(mlog, con.get());
5167 }
5168
5169 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5170 {
5171 bool changed = false;
5172 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5173 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5174 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5175 changed = true;
5176 }
5177 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5178 assert(pi);
5179 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5180 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5181 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5182 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5183 changed = true;
5184 }
5185 }
5186 if (changed) {
5187 info.history.last_epoch_marked_full = osdmap->get_epoch();
5188 dirty_info = true;
5189 }
5190 }
5191
5192 bool PG::should_restart_peering(
5193 int newupprimary,
5194 int newactingprimary,
5195 const vector<int>& newup,
5196 const vector<int>& newacting,
5197 OSDMapRef lastmap,
5198 OSDMapRef osdmap)
5199 {
5200 if (PastIntervals::is_new_interval(
5201 primary.osd,
5202 newactingprimary,
5203 acting,
5204 newacting,
5205 up_primary.osd,
5206 newupprimary,
5207 up,
5208 newup,
5209 osdmap,
5210 lastmap,
5211 info.pgid.pgid)) {
5212 dout(20) << "new interval newup " << newup
5213 << " newacting " << newacting << dendl;
5214 return true;
5215 } else {
5216 return false;
5217 }
5218 }
5219
5220 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5221 {
5222 if (last_peering_reset > reply_epoch ||
5223 last_peering_reset > query_epoch) {
5224 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5225 << " last_peering_reset " << last_peering_reset
5226 << dendl;
5227 return true;
5228 }
5229 return false;
5230 }
5231
5232 void PG::set_last_peering_reset()
5233 {
5234 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5235 if (last_peering_reset != get_osdmap()->get_epoch()) {
5236 last_peering_reset = get_osdmap()->get_epoch();
5237 reset_interval_flush();
5238 }
5239 }
5240
5241 struct FlushState {
5242 PGRef pg;
5243 epoch_t epoch;
5244 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5245 ~FlushState() {
5246 pg->lock();
5247 if (!pg->pg_has_reset_since(epoch))
5248 pg->queue_flushed(epoch);
5249 pg->unlock();
5250 }
5251 };
5252 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5253
5254 void PG::start_flush(ObjectStore::Transaction *t,
5255 list<Context *> *on_applied,
5256 list<Context *> *on_safe)
5257 {
5258 // flush in progress ops
5259 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5260 this, get_osdmap()->get_epoch()));
5261 t->nop();
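// the nop keeps the transaction non-empty; once it has applied and committed
// on this PG's sequencer, everything queued before it has as well, and the
// last FlushState reference going away queues the flushed event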
5262 flushes_in_progress++;
5263 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5264 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5265 }
5266
5267 void PG::reset_interval_flush()
5268 {
5269 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5270 recovery_state.clear_blocked_outgoing();
5271
5272 Context *c = new QueuePeeringEvt<IntervalFlush>(
5273 this, get_osdmap()->get_epoch(), IntervalFlush());
5274 if (!osr->flush_commit(c)) {
5275 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5276 recovery_state.begin_block_outgoing();
5277 } else {
5278 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5279 delete c;
5280 }
5281 }
5282
5283 /* Called before initializing peering during advance_map */
5284 void PG::start_peering_interval(
5285 const OSDMapRef lastmap,
5286 const vector<int>& newup, int new_up_primary,
5287 const vector<int>& newacting, int new_acting_primary,
5288 ObjectStore::Transaction *t)
5289 {
5290 const OSDMapRef osdmap = get_osdmap();
5291
5292 set_last_peering_reset();
5293
5294 vector<int> oldacting, oldup;
5295 int oldrole = get_role();
5296
5297 unreg_next_scrub();
5298
5299 pg_shard_t old_acting_primary = get_primary();
5300 pg_shard_t old_up_primary = up_primary;
5301 bool was_old_primary = is_primary();
5302
5303 acting.swap(oldacting);
5304 up.swap(oldup);
5305 init_primary_up_acting(
5306 newup,
5307 newacting,
5308 new_up_primary,
5309 new_acting_primary);
5310
5311 if (info.stats.up != up ||
5312 info.stats.acting != acting ||
5313 info.stats.up_primary != new_up_primary ||
5314 info.stats.acting_primary != new_acting_primary) {
5315 info.stats.up = up;
5316 info.stats.up_primary = new_up_primary;
5317 info.stats.acting = acting;
5318 info.stats.acting_primary = new_acting_primary;
5319 info.stats.mapping_epoch = osdmap->get_epoch();
5320 }
5321
5322 pg_stats_publish_lock.Lock();
5323 pg_stats_publish_valid = false;
5324 pg_stats_publish_lock.Unlock();
5325
5326 // The PG will now be flagged as remapped during a backfill in cases
5327 // where it would not have been before.
5328 if (up != acting)
5329 state_set(PG_STATE_REMAPPED);
5330 else
5331 state_clear(PG_STATE_REMAPPED);
5332
5333 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5334 if (pool.info.is_replicated() || role == pg_whoami.shard)
5335 set_role(role);
5336 else
5337 set_role(-1);
5338
5339 // did acting, up, primary|acker change?
5340 if (!lastmap) {
5341 dout(10) << " no lastmap" << dendl;
5342 dirty_info = true;
5343 dirty_big_info = true;
5344 info.history.same_interval_since = osdmap->get_epoch();
5345 } else {
5346 std::stringstream debug;
5347 assert(info.history.same_interval_since != 0);
5348 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5349 get_is_recoverable_predicate());
5350 bool new_interval = PastIntervals::check_new_interval(
5351 old_acting_primary.osd,
5352 new_acting_primary,
5353 oldacting, newacting,
5354 old_up_primary.osd,
5355 new_up_primary,
5356 oldup, newup,
5357 info.history.same_interval_since,
5358 info.history.last_epoch_clean,
5359 osdmap,
5360 lastmap,
5361 info.pgid.pgid,
5362 recoverable.get(),
5363 &past_intervals,
5364 &debug);
5365 dout(10) << __func__ << ": check_new_interval output: "
5366 << debug.str() << dendl;
5367 if (new_interval) {
5368 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5369 info.history.last_epoch_clean < osdmap->get_epoch()) {
5370 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5371 // our information is incomplete and useless; if osdmaps have been trimmed,
5372 // someone else must have been clean after everything we know about.
5373 past_intervals.clear();
5374 } else {
5375 dout(10) << " noting past " << past_intervals << dendl;
5376 }
5377 dirty_info = true;
5378 dirty_big_info = true;
5379 info.history.same_interval_since = osdmap->get_epoch();
5380 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5381 osdmap->get_pg_num(info.pgid.pgid.pool()),
5382 nullptr)) {
5383 info.history.last_epoch_split = osdmap->get_epoch();
5384 }
5385 }
5386 }
5387
5388 if (old_up_primary != up_primary ||
5389 oldup != up) {
5390 info.history.same_up_since = osdmap->get_epoch();
5391 }
5392 // this comparison includes primary rank via pg_shard_t
5393 if (old_acting_primary != get_primary()) {
5394 info.history.same_primary_since = osdmap->get_epoch();
5395 }
5396
5397 on_new_interval();
5398
5399 dout(1) << __func__ << " up " << oldup << " -> " << up
5400 << ", acting " << oldacting << " -> " << acting
5401 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5402 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5403 << ", role " << oldrole << " -> " << role
5404 << ", features acting " << acting_features
5405 << " upacting " << upacting_features
5406 << dendl;
5407
5408 // deactivate.
5409 state_clear(PG_STATE_ACTIVE);
5410 state_clear(PG_STATE_PEERED);
5411 state_clear(PG_STATE_DOWN);
5412 state_clear(PG_STATE_RECOVERY_WAIT);
5413 state_clear(PG_STATE_RECOVERY_TOOFULL);
5414 state_clear(PG_STATE_RECOVERING);
5415
5416 peer_purged.clear();
5417 actingbackfill.clear();
5418 scrub_queued = false;
5419
5420 // reset primary state?
5421 if (was_old_primary || is_primary()) {
5422 osd->remove_want_pg_temp(info.pgid.pgid);
5423 }
5424 clear_primary_state();
5425
5426
5427 // pg->on_*
5428 on_change(t);
5429
5430 projected_last_update = eversion_t();
5431
5432 assert(!deleting);
5433
5434 // should we tell the primary we are here?
5435 send_notify = !is_primary();
5436
5437 if (role != oldrole ||
5438 was_old_primary != is_primary()) {
5439 // did primary change?
5440 if (was_old_primary != is_primary()) {
5441 state_clear(PG_STATE_CLEAN);
5442 clear_publish_stats();
5443 }
5444
5445 on_role_change();
5446
5447 // take active waiters
5448 requeue_ops(waiting_for_peered);
5449
5450 } else {
5451 // no role change.
5452 // did primary change?
5453 if (get_primary() != old_acting_primary) {
5454 dout(10) << *this << " " << oldacting << " -> " << acting
5455 << ", acting primary "
5456 << old_acting_primary << " -> " << get_primary()
5457 << dendl;
5458 } else {
5459 // primary is the same.
5460 if (is_primary()) {
5461 // i am (still) primary. but my replica set changed.
5462 state_clear(PG_STATE_CLEAN);
5463
5464 dout(10) << oldacting << " -> " << acting
5465 << ", replicas changed" << dendl;
5466 }
5467 }
5468 }
5469 cancel_recovery();
5470
5471 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5472 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5473 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5474 }
5475 }
5476
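// Per-interval (re)initialization: re-register the next scrub and recompute
// the feature intersections (acting_features over the acting set,
// upacting_features over both up and acting) before handing off to the
// backend-specific _on_new_interval().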
5477 void PG::on_new_interval()
5478 {
5479 const OSDMapRef osdmap = get_osdmap();
5480
5481 reg_next_scrub();
5482
5483 // initialize features
5484 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5485 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5486 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5487 if (*p == CRUSH_ITEM_NONE)
5488 continue;
5489 uint64_t f = osdmap->get_xinfo(*p).features;
5490 acting_features &= f;
5491 upacting_features &= f;
5492 }
5493 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5494 if (*p == CRUSH_ITEM_NONE)
5495 continue;
5496 upacting_features &= osdmap->get_xinfo(*p).features;
5497 }
5498
5499 assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
5500
5501 _on_new_interval();
5502 }
5503
5504 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5505 {
5506 assert(!is_primary());
5507
5508 update_history(oinfo.history);
5509
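// Only adopt the primary's purged_snaps once our on-disk state has caught up
// with the most recent activation (last_complete_ondisk.epoch >=
// last_epoch_started); the debug block below cross-checks snap_mapper first.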
5510 if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
5511 // DEBUG: verify that the snaps are empty in snap_mapper
5512 if (cct->_conf->osd_debug_verify_snaps_on_info) {
5513 interval_set<snapid_t> p;
5514 p.union_of(oinfo.purged_snaps, info.purged_snaps);
5515 p.subtract(info.purged_snaps);
5516 if (!p.empty()) {
5517 for (interval_set<snapid_t>::iterator i = p.begin();
5518 i != p.end();
5519 ++i) {
5520 for (snapid_t snap = i.get_start();
5521 snap != i.get_len() + i.get_start();
5522 ++snap) {
5523 vector<hobject_t> hoids;
5524 int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
5525 if (r != 0 && r != -ENOENT) {
5526 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5527 << cpp_strerror(r) << dendl;
5528 ceph_abort();
5529 } else if (r != -ENOENT) {
5530 assert(!hoids.empty());
5531 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5532 << cpp_strerror(r) << " for object "
5533 << hoids[0] << " on snap " << snap
5534 << " which should have been fully trimmed " << dendl;
5535 ceph_abort();
5536 }
5537 }
5538 }
5539 }
5540 }
5541 info.purged_snaps = oinfo.purged_snaps;
5542 dirty_info = true;
5543 dirty_big_info = true;
5544 }
5545 }
5546
5547 ostream& operator<<(ostream& out, const PG& pg)
5548 {
5549 out << "pg[" << pg.info
5550 << " " << pg.up;
5551 if (pg.acting != pg.up)
5552 out << "/" << pg.acting;
5553 out << " r=" << pg.get_role();
5554 out << " lpr=" << pg.get_last_peering_reset();
5555
5556 if (!pg.past_intervals.empty()) {
5557 out << " pi=[" << pg.past_intervals.get_bounds()
5558 << ")/" << pg.past_intervals.size();
5559 }
5560
5561 if (pg.is_peered()) {
5562 if (pg.last_update_ondisk != pg.info.last_update)
5563 out << " luod=" << pg.last_update_ondisk;
5564 if (pg.last_update_applied != pg.info.last_update)
5565 out << " lua=" << pg.last_update_applied;
5566 }
5567
5568 if (pg.recovery_ops_active)
5569 out << " rops=" << pg.recovery_ops_active;
5570
5571 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5572 pg.pg_log.get_head() != pg.info.last_update)
5573 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5574
5575 if (!pg.pg_log.get_log().empty()) {
5576 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5577 out << " (log bound mismatch, actual=["
5578 << pg.pg_log.get_log().log.begin()->version << ","
5579 << pg.pg_log.get_log().log.rbegin()->version << "]";
5580 out << ")";
5581 }
5582 }
5583
5584 if (!pg.backfill_targets.empty())
5585 out << " bft=" << pg.backfill_targets;
5586 out << " crt=" << pg.pg_log.get_can_rollback_to();
5587
5588 if (pg.last_complete_ondisk != pg.info.last_complete)
5589 out << " lcod " << pg.last_complete_ondisk;
5590
5591 if (pg.is_primary()) {
5592 out << " mlcod " << pg.min_last_complete_ondisk;
5593 }
5594
5595 out << " " << pg_state_string(pg.get_state());
5596 if (pg.should_send_notify())
5597 out << " NOTIFY";
5598
5599 if (pg.scrubber.must_repair)
5600 out << " MUST_REPAIR";
5601 if (pg.scrubber.auto_repair)
5602 out << " AUTO_REPAIR";
5603 if (pg.scrubber.must_deep_scrub)
5604 out << " MUST_DEEP_SCRUB";
5605 if (pg.scrubber.must_scrub)
5606 out << " MUST_SCRUB";
5607
5608 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5609 if (pg.pg_log.get_missing().num_missing()) {
5610 out << " m=" << pg.pg_log.get_missing().num_missing();
5611 if (pg.is_primary()) {
5612 uint64_t unfound = pg.get_num_unfound();
5613 if (unfound)
5614 out << " u=" << unfound;
5615 }
5616 }
5617 if (pg.snap_trimq.size())
5618 out << " snaptrimq=" << pg.snap_trimq;
5619
5620 out << "]";
5621
5622
5623 return out;
5624 }
5625
5626 bool PG::can_discard_op(OpRequestRef& op)
5627 {
5628 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5629 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5630 dout(20) << " discard " << *m << dendl;
5631 return true;
5632 }
5633
5634 if (m->get_map_epoch() < info.history.same_primary_since) {
5635 dout(7) << " changed after " << m->get_map_epoch()
5636 << ", dropping " << *m << dendl;
5637 return true;
5638 }
5639
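// Clients with RESEND_ON_SPLIT resend their ops after a pg split or a
// pool-forced resend, so ops predating last_force_op_resend or
// last_epoch_split can be dropped here; older POOLRESEND-only clients are
// checked against the pre-luminous force-resend epoch instead.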
5640 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5641 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5642 dout(7) << __func__ << " sent before last_force_op_resend "
5643 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5644 return true;
5645 }
5646 if (m->get_map_epoch() < info.history.last_epoch_split) {
5647 dout(7) << __func__ << " pg split in "
5648 << info.history.last_epoch_split << ", dropping" << dendl;
5649 return true;
5650 }
5651 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5652 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5653 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5654 << pool.info.last_force_op_resend_preluminous
5655 << ", dropping" << *m << dendl;
5656 return true;
5657 }
5658 }
5659
5660 return false;
5661 }
5662
5663 template<typename T, int MSGTYPE>
5664 bool PG::can_discard_replica_op(OpRequestRef& op)
5665 {
5666 const T *m = static_cast<const T *>(op->get_req());
5667 assert(m->get_type() == MSGTYPE);
5668
5669 /* Mostly, this overlaps with the old_peering_msg
5670 * condition. An important exception is pushes
5671 * sent by replicas not in the acting set, since
5672 * if such a replica goes down it does not cause
5673 * a new interval. */
5674 int from = m->get_source().num();
5675 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5676 return true;
5677
5678 // same pg?
5679 // if pg changes _at all_, we reset and repeer!
5680 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5681 dout(10) << "can_discard_replica_op pg changed " << info.history
5682 << " after " << m->map_epoch
5683 << ", dropping" << dendl;
5684 return true;
5685 }
5686 return false;
5687 }
5688
5689 bool PG::can_discard_scan(OpRequestRef op)
5690 {
5691 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5692 assert(m->get_type() == MSG_OSD_PG_SCAN);
5693
5694 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5695 dout(10) << " got old scan, ignoring" << dendl;
5696 return true;
5697 }
5698 return false;
5699 }
5700
5701 bool PG::can_discard_backfill(OpRequestRef op)
5702 {
5703 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5704 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5705
5706 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5707 dout(10) << " got old backfill, ignoring" << dendl;
5708 return true;
5709 }
5710
5711 return false;
5712
5713 }
5714
5715 bool PG::can_discard_request(OpRequestRef& op)
5716 {
5717 switch (op->get_req()->get_type()) {
5718 case CEPH_MSG_OSD_OP:
5719 return can_discard_op(op);
5720 case CEPH_MSG_OSD_BACKOFF:
5721 return false; // never discard
5722 case MSG_OSD_SUBOP:
5723 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5724 case MSG_OSD_REPOP:
5725 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5726 case MSG_OSD_PG_PUSH:
5727 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5728 case MSG_OSD_PG_PULL:
5729 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5730 case MSG_OSD_PG_PUSH_REPLY:
5731 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5732 case MSG_OSD_SUBOPREPLY:
5733 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5734 case MSG_OSD_REPOPREPLY:
5735 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5736 case MSG_OSD_PG_RECOVERY_DELETE:
5737 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
5738
5739 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
5740 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
5741
5742 case MSG_OSD_EC_WRITE:
5743 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5744 case MSG_OSD_EC_WRITE_REPLY:
5745 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5746 case MSG_OSD_EC_READ:
5747 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5748 case MSG_OSD_EC_READ_REPLY:
5749 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5750 case MSG_OSD_REP_SCRUB:
5751 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5752 case MSG_OSD_SCRUB_RESERVE:
5753 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5754 case MSG_OSD_REP_SCRUBMAP:
5755 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5756 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5757 return can_discard_replica_op<
5758 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5759 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5760 return can_discard_replica_op<
5761 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5762
5763 case MSG_OSD_PG_SCAN:
5764 return can_discard_scan(op);
5765 case MSG_OSD_PG_BACKFILL:
5766 return can_discard_backfill(op);
5767 case MSG_OSD_PG_BACKFILL_REMOVE:
5768 return can_discard_replica_op<MOSDPGBackfillRemove,
5769 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5770 }
5771 return true;
5772 }
5773
5774 void PG::take_waiters()
5775 {
5776 dout(10) << "take_waiters" << dendl;
5777 requeue_map_waiters();
5778 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5779 i != peering_waiters.end();
5780 ++i) osd->queue_for_peering(this);
5781 peering_queue.splice(peering_queue.begin(), peering_waiters,
5782 peering_waiters.begin(), peering_waiters.end());
5783 }
5784
5785 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5786 {
5787 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5788 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5789 dout(10) << "deferring event " << evt->get_desc() << dendl;
5790 peering_waiters.push_back(evt);
5791 return;
5792 }
5793 if (old_peering_evt(evt))
5794 return;
5795 recovery_state.handle_event(evt, rctx);
5796 }
5797
5798 void PG::queue_peering_event(CephPeeringEvtRef evt)
5799 {
5800 if (old_peering_evt(evt))
5801 return;
5802 peering_queue.push_back(evt);
5803 osd->queue_for_peering(this);
5804 }
5805
5806 void PG::queue_null(epoch_t msg_epoch,
5807 epoch_t query_epoch)
5808 {
5809 dout(10) << "null" << dendl;
5810 queue_peering_event(
5811 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5812 NullEvt())));
5813 }
5814
5815 void PG::queue_flushed(epoch_t e)
5816 {
5817 dout(10) << "flushed" << dendl;
5818 queue_peering_event(
5819 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5820 FlushedEvt())));
5821 }
5822
5823 void PG::queue_query(epoch_t msg_epoch,
5824 epoch_t query_epoch,
5825 pg_shard_t from, const pg_query_t& q)
5826 {
5827 dout(10) << "handle_query " << q << " from replica " << from << dendl;
5828 queue_peering_event(
5829 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5830 MQuery(from, q, query_epoch))));
5831 }
5832
5833 void PG::handle_advance_map(
5834 OSDMapRef osdmap, OSDMapRef lastmap,
5835 vector<int>& newup, int up_primary,
5836 vector<int>& newacting, int acting_primary,
5837 RecoveryCtx *rctx)
5838 {
5839 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5840 assert(lastmap == osdmap_ref);
5841 dout(10) << "handle_advance_map "
5842 << newup << "/" << newacting
5843 << " -- " << up_primary << "/" << acting_primary
5844 << dendl;
5845 update_osdmap_ref(osdmap);
5846 pool.update(osdmap);
5847 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5848 if (cct->_conf->osd_debug_verify_cached_snaps) {
5849 interval_set<snapid_t> actual_removed_snaps;
5850 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5851 assert(pi);
5852 pi->build_removed_snaps(actual_removed_snaps);
5853 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5854 derr << __func__ << ": mismatch between the actual removed snaps "
5855 << actual_removed_snaps
5856 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
5857 << dendl;
5858 }
5859 assert(actual_removed_snaps == pool.cached_removed_snaps);
5860 }
5861 AdvMap evt(
5862 osdmap, lastmap, newup, up_primary,
5863 newacting, acting_primary);
5864 recovery_state.handle_event(evt, rctx);
5865 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5866 on_pool_change();
5867 update_store_with_options();
5868 }
5869 }
5870
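// Called once all map advances in a batch have been processed: deliver ActMap
// to the state machine, dirty our info if the last persisted osdmap epoch has
// fallen more than osd_pg_epoch_persisted_max_stale epochs behind, and recheck
// watchers against any new blacklist entries.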
5871 void PG::handle_activate_map(RecoveryCtx *rctx)
5872 {
5873 dout(10) << "handle_activate_map " << dendl;
5874 ActMap evt;
5875 recovery_state.handle_event(evt, rctx);
5876 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
5877 cct->_conf->osd_pg_epoch_persisted_max_stale) {
5878 dout(20) << __func__ << ": Dirtying info: last_persisted is "
5879 << last_persisted_osdmap_ref->get_epoch()
5880 << " while current is " << osdmap_ref->get_epoch() << dendl;
5881 dirty_info = true;
5882 } else {
5883 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
5884 << last_persisted_osdmap_ref->get_epoch()
5885 << " while current is " << osdmap_ref->get_epoch() << dendl;
5886 }
5887 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
5888 }
5889
5890 void PG::handle_loaded(RecoveryCtx *rctx)
5891 {
5892 dout(10) << "handle_loaded" << dendl;
5893 Load evt;
5894 recovery_state.handle_event(evt, rctx);
5895 }
5896
5897 void PG::handle_create(RecoveryCtx *rctx)
5898 {
5899 dout(10) << "handle_create" << dendl;
5900 rctx->created_pgs.insert(this);
5901 Initialize evt;
5902 recovery_state.handle_event(evt, rctx);
5903 ActMap evt2;
5904 recovery_state.handle_event(evt2, rctx);
5905 }
5906
5907 void PG::handle_query_state(Formatter *f)
5908 {
5909 dout(10) << "handle_query_state" << dendl;
5910 QueryState q(f);
5911 recovery_state.handle_event(q, 0);
5912 }
5913
5914 void PG::update_store_with_options()
5915 {
5916 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
5917 if (r < 0 && r != -EOPNOTSUPP) {
5918 derr << __func__ << " set_collection_opts returned error: " << r << dendl;
5919 }
5920 }
5921
5922 void PG::update_store_on_load()
5923 {
5924 if (osd->store->get_type() == "filestore") {
5925 // legacy filestore didn't store collection bit width; fix.
5926 int bits = osd->store->collection_bits(coll);
5927 if (bits < 0) {
5928 if (coll.is_meta())
5929 bits = 0;
5930 else
5931 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
5932 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
5933 ObjectStore::Transaction t;
5934 t.collection_set_bits(coll, bits);
5935 osd->store->apply_transaction(osr.get(), std::move(t));
5936 }
5937 }
5938 }
5939
5940 /*------------ Recovery State Machine----------------*/
5941 #undef dout_prefix
5942 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5943 << "state<" << get_state_name() << ">: ")
5944
5945 /*------Crashed-------*/
5946 PG::RecoveryState::Crashed::Crashed(my_context ctx)
5947 : my_base(ctx),
5948 NamedState(context< RecoveryMachine >().pg, "Crashed")
5949 {
5950 context< RecoveryMachine >().log_enter(state_name);
5951 assert(0 == "we got a bad state machine event");
5952 }
5953
5954
5955 /*------Initial-------*/
5956 PG::RecoveryState::Initial::Initial(my_context ctx)
5957 : my_base(ctx),
5958 NamedState(context< RecoveryMachine >().pg, "Initial")
5959 {
5960 context< RecoveryMachine >().log_enter(state_name);
5961 }
5962
5963 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
5964 {
5965 PG *pg = context< RecoveryMachine >().pg;
5966
5967 // do we tell someone we're here?
5968 pg->send_notify = (!pg->is_primary());
5969 pg->update_store_with_options();
5970
5971 pg->update_store_on_load();
5972
5973 return transit< Reset >();
5974 }
5975
5976 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
5977 {
5978 PG *pg = context< RecoveryMachine >().pg;
5979 pg->proc_replica_info(
5980 notify.from, notify.notify.info, notify.notify.epoch_sent);
5981 pg->set_last_peering_reset();
5982 return transit< Primary >();
5983 }
5984
5985 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
5986 {
5987 PG *pg = context< RecoveryMachine >().pg;
5988 assert(!pg->is_primary());
5989 post_event(i);
5990 return transit< Stray >();
5991 }
5992
5993 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
5994 {
5995 PG *pg = context< RecoveryMachine >().pg;
5996 assert(!pg->is_primary());
5997 post_event(i);
5998 return transit< Stray >();
5999 }
6000
6001 void PG::RecoveryState::Initial::exit()
6002 {
6003 context< RecoveryMachine >().log_exit(state_name, enter_time);
6004 PG *pg = context< RecoveryMachine >().pg;
6005 utime_t dur = ceph_clock_now() - enter_time;
6006 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6007 }
6008
6009 /*------Started-------*/
6010 PG::RecoveryState::Started::Started(my_context ctx)
6011 : my_base(ctx),
6012 NamedState(context< RecoveryMachine >().pg, "Started")
6013 {
6014 context< RecoveryMachine >().log_enter(state_name);
6015 }
6016
6017 boost::statechart::result
6018 PG::RecoveryState::Started::react(const IntervalFlush&)
6019 {
6020 PG *pg = context< RecoveryMachine >().pg;
6021 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6022 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6023 return discard_event();
6024 }
6025
6026
6027 boost::statechart::result
6028 PG::RecoveryState::Started::react(const FlushedEvt&)
6029 {
6030 PG *pg = context< RecoveryMachine >().pg;
6031 pg->on_flushed();
6032 return discard_event();
6033 }
6034
6035
6036 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6037 {
6038 PG *pg = context< RecoveryMachine >().pg;
6039 ldout(pg->cct, 10) << "Started advmap" << dendl;
6040 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6041 if (pg->should_restart_peering(
6042 advmap.up_primary,
6043 advmap.acting_primary,
6044 advmap.newup,
6045 advmap.newacting,
6046 advmap.lastmap,
6047 advmap.osdmap)) {
6048 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6049 << dendl;
6050 post_event(advmap);
6051 return transit< Reset >();
6052 }
6053 pg->remove_down_peer_info(advmap.osdmap);
6054 return discard_event();
6055 }
6056
6057 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6058 {
6059 q.f->open_object_section("state");
6060 q.f->dump_string("name", state_name);
6061 q.f->dump_stream("enter_time") << enter_time;
6062 q.f->close_section();
6063 return discard_event();
6064 }
6065
6066 void PG::RecoveryState::Started::exit()
6067 {
6068 context< RecoveryMachine >().log_exit(state_name, enter_time);
6069 PG *pg = context< RecoveryMachine >().pg;
6070 utime_t dur = ceph_clock_now() - enter_time;
6071 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6072 }
6073
6074 /*--------Reset---------*/
6075 PG::RecoveryState::Reset::Reset(my_context ctx)
6076 : my_base(ctx),
6077 NamedState(context< RecoveryMachine >().pg, "Reset")
6078 {
6079 context< RecoveryMachine >().log_enter(state_name);
6080 PG *pg = context< RecoveryMachine >().pg;
6081
6082 pg->flushes_in_progress = 0;
6083 pg->set_last_peering_reset();
6084 }
6085
6086 boost::statechart::result
6087 PG::RecoveryState::Reset::react(const FlushedEvt&)
6088 {
6089 PG *pg = context< RecoveryMachine >().pg;
6090 pg->on_flushed();
6091 return discard_event();
6092 }
6093
6094 boost::statechart::result
6095 PG::RecoveryState::Reset::react(const IntervalFlush&)
6096 {
6097 PG *pg = context< RecoveryMachine >().pg;
6098 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6099 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6100 return discard_event();
6101 }
6102
6103 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6104 {
6105 PG *pg = context< RecoveryMachine >().pg;
6106 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6107
6108 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6109
6110 if (pg->should_restart_peering(
6111 advmap.up_primary,
6112 advmap.acting_primary,
6113 advmap.newup,
6114 advmap.newacting,
6115 advmap.lastmap,
6116 advmap.osdmap)) {
6117 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6118 << dendl;
6119 pg->start_peering_interval(
6120 advmap.lastmap,
6121 advmap.newup, advmap.up_primary,
6122 advmap.newacting, advmap.acting_primary,
6123 context< RecoveryMachine >().get_cur_transaction());
6124 }
6125 pg->remove_down_peer_info(advmap.osdmap);
6126 pg->check_past_interval_bounds();
6127 return discard_event();
6128 }
6129
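// ActMap while in Reset: send our pg_notify to the current primary (when
// send_notify is set and a primary exists), refresh heartbeat peers, requeue
// any waiters, and move on to Started.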
6130 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6131 {
6132 PG *pg = context< RecoveryMachine >().pg;
6133 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6134 context< RecoveryMachine >().send_notify(
6135 pg->get_primary(),
6136 pg_notify_t(
6137 pg->get_primary().shard, pg->pg_whoami.shard,
6138 pg->get_osdmap()->get_epoch(),
6139 pg->get_osdmap()->get_epoch(),
6140 pg->info),
6141 pg->past_intervals);
6142 }
6143
6144 pg->update_heartbeat_peers();
6145 pg->take_waiters();
6146
6147 return transit< Started >();
6148 }
6149
6150 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6151 {
6152 q.f->open_object_section("state");
6153 q.f->dump_string("name", state_name);
6154 q.f->dump_stream("enter_time") << enter_time;
6155 q.f->close_section();
6156 return discard_event();
6157 }
6158
6159 void PG::RecoveryState::Reset::exit()
6160 {
6161 context< RecoveryMachine >().log_exit(state_name, enter_time);
6162 PG *pg = context< RecoveryMachine >().pg;
6163 utime_t dur = ceph_clock_now() - enter_time;
6164 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6165 }
6166
6167 /*-------Start---------*/
6168 PG::RecoveryState::Start::Start(my_context ctx)
6169 : my_base(ctx),
6170 NamedState(context< RecoveryMachine >().pg, "Start")
6171 {
6172 context< RecoveryMachine >().log_enter(state_name);
6173
6174 PG *pg = context< RecoveryMachine >().pg;
6175 if (pg->is_primary()) {
6176 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6177 post_event(MakePrimary());
6178 } else { //is_stray
6179 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6180 post_event(MakeStray());
6181 }
6182 }
6183
6184 void PG::RecoveryState::Start::exit()
6185 {
6186 context< RecoveryMachine >().log_exit(state_name, enter_time);
6187 PG *pg = context< RecoveryMachine >().pg;
6188 utime_t dur = ceph_clock_now() - enter_time;
6189 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6190 }
6191
6192 /*---------Primary--------*/
6193 PG::RecoveryState::Primary::Primary(my_context ctx)
6194 : my_base(ctx),
6195 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6196 {
6197 context< RecoveryMachine >().log_enter(state_name);
6198 PG *pg = context< RecoveryMachine >().pg;
6199 assert(pg->want_acting.empty());
6200
6201 // set CREATING bit until we have peered for the first time.
6202 if (pg->info.history.last_epoch_started == 0) {
6203 pg->state_set(PG_STATE_CREATING);
6204 // use the history timestamp, which ultimately comes from the
6205 // monitor in the create case.
6206 utime_t t = pg->info.history.last_scrub_stamp;
6207 pg->info.stats.last_fresh = t;
6208 pg->info.stats.last_active = t;
6209 pg->info.stats.last_change = t;
6210 pg->info.stats.last_peered = t;
6211 pg->info.stats.last_clean = t;
6212 pg->info.stats.last_unstale = t;
6213 pg->info.stats.last_undegraded = t;
6214 pg->info.stats.last_fullsized = t;
6215 pg->info.stats.last_scrub_stamp = t;
6216 pg->info.stats.last_deep_scrub_stamp = t;
6217 pg->info.stats.last_clean_scrub_stamp = t;
6218 }
6219 }
6220
6221 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6222 {
6223 PG *pg = context< RecoveryMachine >().pg;
6224 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6225 pg->proc_replica_info(
6226 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6227 return discard_event();
6228 }
6229
6230 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6231 {
6232 PG *pg = context< RecoveryMachine >().pg;
6233 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6234 pg->publish_stats_to_osd();
6235 pg->take_waiters();
6236 return discard_event();
6237 }
6238
6239 void PG::RecoveryState::Primary::exit()
6240 {
6241 context< RecoveryMachine >().log_exit(state_name, enter_time);
6242 PG *pg = context< RecoveryMachine >().pg;
6243 pg->want_acting.clear();
6244 utime_t dur = ceph_clock_now() - enter_time;
6245 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6246 pg->clear_primary_state();
6247 pg->state_clear(PG_STATE_CREATING);
6248 }
6249
6250 /*---------Peering--------*/
6251 PG::RecoveryState::Peering::Peering(my_context ctx)
6252 : my_base(ctx),
6253 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6254 history_les_bound(false)
6255 {
6256 context< RecoveryMachine >().log_enter(state_name);
6257
6258 PG *pg = context< RecoveryMachine >().pg;
6259 assert(!pg->is_peered());
6260 assert(!pg->is_peering());
6261 assert(pg->is_primary());
6262 pg->state_set(PG_STATE_PEERING);
6263 }
6264
6265 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6266 {
6267 PG *pg = context< RecoveryMachine >().pg;
6268 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6269 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6270 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6271 post_event(advmap);
6272 return transit< Reset >();
6273 }
6274
6275 pg->adjust_need_up_thru(advmap.osdmap);
6276
6277 return forward_event();
6278 }
6279
6280 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6281 {
6282 PG *pg = context< RecoveryMachine >().pg;
6283
6284 q.f->open_object_section("state");
6285 q.f->dump_string("name", state_name);
6286 q.f->dump_stream("enter_time") << enter_time;
6287
6288 q.f->open_array_section("past_intervals");
6289 pg->past_intervals.dump(q.f);
6290 q.f->close_section();
6291
6292 q.f->open_array_section("probing_osds");
6293 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6294 p != prior_set.probe.end();
6295 ++p)
6296 q.f->dump_stream("osd") << *p;
6297 q.f->close_section();
6298
6299 if (prior_set.pg_down)
6300 q.f->dump_string("blocked", "peering is blocked due to down osds");
6301
6302 q.f->open_array_section("down_osds_we_would_probe");
6303 for (set<int>::iterator p = prior_set.down.begin();
6304 p != prior_set.down.end();
6305 ++p)
6306 q.f->dump_int("osd", *p);
6307 q.f->close_section();
6308
6309 q.f->open_array_section("peering_blocked_by");
6310 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6311 p != prior_set.blocked_by.end();
6312 ++p) {
6313 q.f->open_object_section("osd");
6314 q.f->dump_int("osd", p->first);
6315 q.f->dump_int("current_lost_at", p->second);
6316 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6317 q.f->close_section();
6318 }
6319 q.f->close_section();
6320
6321 if (history_les_bound) {
6322 q.f->open_array_section("peering_blocked_by_detail");
6323 q.f->open_object_section("item");
6324 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6325 q.f->close_section();
6326 q.f->close_section();
6327 }
6328
6329 q.f->close_section();
6330 return forward_event();
6331 }
6332
6333 void PG::RecoveryState::Peering::exit()
6334 {
6335 PG *pg = context< RecoveryMachine >().pg;
6336 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6337 context< RecoveryMachine >().log_exit(state_name, enter_time);
6338 pg->state_clear(PG_STATE_PEERING);
6339 pg->clear_probe_targets();
6340
6341 utime_t dur = ceph_clock_now() - enter_time;
6342 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6343 }
6344
6345
6346 /*------Backfilling-------*/
6347 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6348 : my_base(ctx),
6349 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6350 {
6351 context< RecoveryMachine >().log_enter(state_name);
6352 PG *pg = context< RecoveryMachine >().pg;
6353 pg->backfill_reserved = true;
6354 pg->queue_recovery();
6355 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6356 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6357 pg->state_set(PG_STATE_BACKFILL);
6358 pg->publish_stats_to_osd();
6359 }
6360
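// Backfill is being cancelled: drop the local reservation, tell every backfill
// target to release its remote reservation (REJECT), and schedule a retry
// before falling back to NotBackfilling.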
6361 boost::statechart::result
6362 PG::RecoveryState::Backfilling::react(const CancelBackfill &)
6363 {
6364 PG *pg = context< RecoveryMachine >().pg;
6365 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6366 // XXX: Add a new pg state so user can see why backfill isn't proceeding
6367 // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
6368 //pg->state_set(PG_STATE_BACKFILL_STALLED????);
6369
6370 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6371 it != pg->backfill_targets.end();
6372 ++it) {
6373 assert(*it != pg->pg_whoami);
6374 ConnectionRef con = pg->osd->get_con_osd_cluster(
6375 it->osd, pg->get_osdmap()->get_epoch());
6376 if (con) {
6377 pg->osd->send_message_osd_cluster(
6378 new MBackfillReserve(
6379 MBackfillReserve::REJECT,
6380 spg_t(pg->info.pgid.pgid, it->shard),
6381 pg->get_osdmap()->get_epoch()),
6382 con.get());
6383 }
6384 }
6385
6386 pg->waiting_on_backfill.clear();
6387
6388 pg->schedule_backfill_full_retry();
6389 return transit<NotBackfilling>();
6390 }
6391
6392 boost::statechart::result
6393 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6394 {
6395 PG *pg = context< RecoveryMachine >().pg;
6396 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6397 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6398
6399 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6400 it != pg->backfill_targets.end();
6401 ++it) {
6402 assert(*it != pg->pg_whoami);
6403 ConnectionRef con = pg->osd->get_con_osd_cluster(
6404 it->osd, pg->get_osdmap()->get_epoch());
6405 if (con) {
6406 pg->osd->send_message_osd_cluster(
6407 new MBackfillReserve(
6408 MBackfillReserve::REJECT,
6409 spg_t(pg->info.pgid.pgid, it->shard),
6410 pg->get_osdmap()->get_epoch()),
6411 con.get());
6412 }
6413 }
6414
6415 pg->waiting_on_backfill.clear();
6416 pg->finish_recovery_op(hobject_t::get_max());
6417
6418 pg->schedule_backfill_full_retry();
6419 return transit<NotBackfilling>();
6420 }
6421
6422 void PG::RecoveryState::Backfilling::exit()
6423 {
6424 context< RecoveryMachine >().log_exit(state_name, enter_time);
6425 PG *pg = context< RecoveryMachine >().pg;
6426 pg->backfill_reserved = false;
6427 pg->backfill_reserving = false;
6428 pg->state_clear(PG_STATE_BACKFILL);
6429 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6430 utime_t dur = ceph_clock_now() - enter_time;
6431 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6432 }
6433
6434 /*--WaitRemoteBackfillReserved--*/
6435
6436 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6437 : my_base(ctx),
6438 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6439 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6440 {
6441 context< RecoveryMachine >().log_enter(state_name);
6442 PG *pg = context< RecoveryMachine >().pg;
6443 pg->state_set(PG_STATE_BACKFILL_WAIT);
6444 pg->publish_stats_to_osd();
6445 post_event(RemoteBackfillReserved());
6446 }
6447
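// Remote backfill reservations are acquired one target at a time: each
// RemoteBackfillReserved grant sends a REQUEST to the next shard, and once the
// iterator is exhausted AllBackfillsReserved is posted.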
6448 boost::statechart::result
6449 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6450 {
6451 PG *pg = context< RecoveryMachine >().pg;
6452
6453 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6454 //The primary never backfills itself
6455 assert(*backfill_osd_it != pg->pg_whoami);
6456 ConnectionRef con = pg->osd->get_con_osd_cluster(
6457 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6458 if (con) {
6459 pg->osd->send_message_osd_cluster(
6460 new MBackfillReserve(
6461 MBackfillReserve::REQUEST,
6462 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6463 pg->get_osdmap()->get_epoch(),
6464 pg->get_backfill_priority()),
6465 con.get());
6466 }
6467 ++backfill_osd_it;
6468 } else {
6469 post_event(AllBackfillsReserved());
6470 }
6471 return discard_event();
6472 }
6473
6474 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6475 {
6476 context< RecoveryMachine >().log_exit(state_name, enter_time);
6477 PG *pg = context< RecoveryMachine >().pg;
6478 utime_t dur = ceph_clock_now() - enter_time;
6479 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6480 }
6481
6482 boost::statechart::result
6483 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6484 {
6485 PG *pg = context< RecoveryMachine >().pg;
6486 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6487
6488 // Send REJECT to all previously acquired reservations
6489 set<pg_shard_t>::const_iterator it, begin, end, next;
6490 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6491 end = context< Active >().remote_shards_to_reserve_backfill.end();
6492 assert(begin != end);
6493 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6494 //The primary never backfills itself
6495 assert(*it != pg->pg_whoami);
6496 ConnectionRef con = pg->osd->get_con_osd_cluster(
6497 it->osd, pg->get_osdmap()->get_epoch());
6498 if (con) {
6499 pg->osd->send_message_osd_cluster(
6500 new MBackfillReserve(
6501 MBackfillReserve::REJECT,
6502 spg_t(pg->info.pgid.pgid, it->shard),
6503 pg->get_osdmap()->get_epoch()),
6504 con.get());
6505 }
6506 }
6507
6508 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6509 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6510 pg->publish_stats_to_osd();
6511
6512 pg->schedule_backfill_full_retry();
6513
6514 return transit<NotBackfilling>();
6515 }
6516
6517 /*--WaitLocalBackfillReserved--*/
6518 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6519 : my_base(ctx),
6520 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6521 {
6522 context< RecoveryMachine >().log_enter(state_name);
6523 PG *pg = context< RecoveryMachine >().pg;
6524 pg->state_set(PG_STATE_BACKFILL_WAIT);
6525 pg->osd->local_reserver.request_reservation(
6526 pg->info.pgid,
6527 new QueuePeeringEvt<LocalBackfillReserved>(
6528 pg, pg->get_osdmap()->get_epoch(),
6529 LocalBackfillReserved()),
6530 pg->get_backfill_priority());
6531 pg->publish_stats_to_osd();
6532 }
6533
6534 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6535 {
6536 context< RecoveryMachine >().log_exit(state_name, enter_time);
6537 PG *pg = context< RecoveryMachine >().pg;
6538 utime_t dur = ceph_clock_now() - enter_time;
6539 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6540 }
6541
6542 /*----NotBackfilling------*/
6543 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6544 : my_base(ctx),
6545 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6546 {
6547 context< RecoveryMachine >().log_enter(state_name);
6548 PG *pg = context< RecoveryMachine >().pg;
6549 pg->publish_stats_to_osd();
6550 }
6551
6552 boost::statechart::result
6553 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6554 {
6555 return discard_event();
6556 }
6557
6558 boost::statechart::result
6559 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6560 {
6561 return discard_event();
6562 }
6563
6564 void PG::RecoveryState::NotBackfilling::exit()
6565 {
6566 context< RecoveryMachine >().log_exit(state_name, enter_time);
6567 PG *pg = context< RecoveryMachine >().pg;
6568 utime_t dur = ceph_clock_now() - enter_time;
6569 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6570 }
6571
6572 /*----NotRecovering------*/
6573 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6574 : my_base(ctx),
6575 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6576 {
6577 context< RecoveryMachine >().log_enter(state_name);
6578 PG *pg = context< RecoveryMachine >().pg;
6579 pg->publish_stats_to_osd();
6580 }
6581
6582 void PG::RecoveryState::NotRecovering::exit()
6583 {
6584 context< RecoveryMachine >().log_exit(state_name, enter_time);
6585 PG *pg = context< RecoveryMachine >().pg;
6586 utime_t dur = ceph_clock_now() - enter_time;
6587 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6588 }
6589
6590 /*---RepNotRecovering----*/
6591 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6592 : my_base(ctx),
6593 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6594 {
6595 context< RecoveryMachine >().log_enter(state_name);
6596 }
6597
6598 void PG::RecoveryState::RepNotRecovering::exit()
6599 {
6600 context< RecoveryMachine >().log_exit(state_name, enter_time);
6601 PG *pg = context< RecoveryMachine >().pg;
6602 utime_t dur = ceph_clock_now() - enter_time;
6603 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6604 }
6605
6606 /*---RepWaitRecoveryReserved--*/
6607 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6608 : my_base(ctx),
6609 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6610 {
6611 context< RecoveryMachine >().log_enter(state_name);
6612 PG *pg = context< RecoveryMachine >().pg;
6613
6614 pg->osd->remote_reserver.request_reservation(
6615 pg->info.pgid,
6616 new QueuePeeringEvt<RemoteRecoveryReserved>(
6617 pg, pg->get_osdmap()->get_epoch(),
6618 RemoteRecoveryReserved()),
6619 pg->get_recovery_priority());
6620 }
6621
6622 boost::statechart::result
6623 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6624 {
6625 PG *pg = context< RecoveryMachine >().pg;
6626 pg->osd->send_message_osd_cluster(
6627 pg->primary.osd,
6628 new MRecoveryReserve(
6629 MRecoveryReserve::GRANT,
6630 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6631 pg->get_osdmap()->get_epoch()),
6632 pg->get_osdmap()->get_epoch());
6633 return transit<RepRecovering>();
6634 }
6635
6636 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6637 {
6638 context< RecoveryMachine >().log_exit(state_name, enter_time);
6639 PG *pg = context< RecoveryMachine >().pg;
6640 utime_t dur = ceph_clock_now() - enter_time;
6641 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6642 }
6643
6644 /*-RepWaitBackfillReserved*/
6645 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6646 : my_base(ctx),
6647 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6648 {
6649 context< RecoveryMachine >().log_enter(state_name);
6650 }
6651
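// Backfill reservation request from the primary: reject immediately if failure
// injection fires or this OSD is too full for backfill, otherwise queue a
// remote reservation; either way we move to RepWaitBackfillReserved, which
// handles the granted or rejected reservation.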
6652 boost::statechart::result
6653 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6654 {
6655 PG *pg = context< RecoveryMachine >().pg;
6656 ostringstream ss;
6657
6658 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6659 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6660 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6661 << dendl;
6662 post_event(RemoteReservationRejected());
6663 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6664 pg->osd->check_backfill_full(ss)) {
6665 ldout(pg->cct, 10) << "backfill reservation rejected: "
6666 << ss.str() << dendl;
6667 post_event(RemoteReservationRejected());
6668 } else {
6669 pg->osd->remote_reserver.request_reservation(
6670 pg->info.pgid,
6671 new QueuePeeringEvt<RemoteBackfillReserved>(
6672 pg, pg->get_osdmap()->get_epoch(),
6673 RemoteBackfillReserved()), evt.priority);
6674 }
6675 return transit<RepWaitBackfillReserved>();
6676 }
6677
6678 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6679 {
6680 context< RecoveryMachine >().log_exit(state_name, enter_time);
6681 PG *pg = context< RecoveryMachine >().pg;
6682 utime_t dur = ceph_clock_now() - enter_time;
6683 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6684 }
6685
6686 boost::statechart::result
6687 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6688 {
6689 PG *pg = context< RecoveryMachine >().pg;
6690
6691 ostringstream ss;
6692 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6693 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6694 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6695 << "failure injection" << dendl;
6696 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6697 post_event(RemoteReservationRejected());
6698 return discard_event();
6699 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6700 pg->osd->check_backfill_full(ss)) {
6701 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6702 << ss.str() << dendl;
6703 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6704 post_event(RemoteReservationRejected());
6705 return discard_event();
6706 } else {
6707 pg->osd->send_message_osd_cluster(
6708 pg->primary.osd,
6709 new MBackfillReserve(
6710 MBackfillReserve::GRANT,
6711 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6712 pg->get_osdmap()->get_epoch()),
6713 pg->get_osdmap()->get_epoch());
6714 return transit<RepRecovering>();
6715 }
6716 }
6717
6718 boost::statechart::result
6719 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
6720 {
6721 PG *pg = context< RecoveryMachine >().pg;
6722 pg->reject_reservation();
6723 return transit<RepNotRecovering>();
6724 }
6725
6726 /*---RepRecovering-------*/
6727 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6728 : my_base(ctx),
6729 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6730 {
6731 context< RecoveryMachine >().log_enter(state_name);
6732 }
6733
6734 boost::statechart::result
6735 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6736 {
6737 PG *pg = context< RecoveryMachine >().pg;
6738 pg->reject_reservation();
6739 return discard_event();
6740 }
6741
6742 void PG::RecoveryState::RepRecovering::exit()
6743 {
6744 context< RecoveryMachine >().log_exit(state_name, enter_time);
6745 PG *pg = context< RecoveryMachine >().pg;
6746 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6747 utime_t dur = ceph_clock_now() - enter_time;
6748 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6749 }
6750
6751 /*------Activating--------*/
6752 PG::RecoveryState::Activating::Activating(my_context ctx)
6753 : my_base(ctx),
6754 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6755 {
6756 context< RecoveryMachine >().log_enter(state_name);
6757 }
6758
6759 void PG::RecoveryState::Activating::exit()
6760 {
6761 context< RecoveryMachine >().log_exit(state_name, enter_time);
6762 PG *pg = context< RecoveryMachine >().pg;
6763 utime_t dur = ceph_clock_now() - enter_time;
6764 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6765 }
6766
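// Before reserving locally for recovery, bail out with RecoveryTooFull if any
// OSD in actingbackfill is full (unless the debug override is set); otherwise
// enter RECOVERY_WAIT and queue a reservation with the local reserver.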
6767 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6768 : my_base(ctx),
6769 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6770 {
6771 context< RecoveryMachine >().log_enter(state_name);
6772 PG *pg = context< RecoveryMachine >().pg;
6773
6774 // Make sure all nodes that are part of the recovery aren't full
6775 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6776 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6777 post_event(RecoveryTooFull());
6778 return;
6779 }
6780
6781 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6782 pg->state_set(PG_STATE_RECOVERY_WAIT);
6783 pg->osd->local_reserver.request_reservation(
6784 pg->info.pgid,
6785 new QueuePeeringEvt<LocalRecoveryReserved>(
6786 pg, pg->get_osdmap()->get_epoch(),
6787 LocalRecoveryReserved()),
6788 pg->get_recovery_priority());
6789 pg->publish_stats_to_osd();
6790 }
6791
6792 boost::statechart::result
6793 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
6794 {
6795 PG *pg = context< RecoveryMachine >().pg;
6796 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
6797 pg->schedule_recovery_full_retry();
6798 return transit<NotRecovering>();
6799 }
6800
6801 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6802 {
6803 context< RecoveryMachine >().log_exit(state_name, enter_time);
6804 PG *pg = context< RecoveryMachine >().pg;
6805 utime_t dur = ceph_clock_now() - enter_time;
6806 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
6807 }
6808
6809 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
6810 : my_base(ctx),
6811 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6812 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
6813 {
6814 context< RecoveryMachine >().log_enter(state_name);
6815 post_event(RemoteRecoveryReserved());
6816 }
6817
6818 boost::statechart::result
6819 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
6820 PG *pg = context< RecoveryMachine >().pg;
6821
6822 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
6823 assert(*remote_recovery_reservation_it != pg->pg_whoami);
6824 ConnectionRef con = pg->osd->get_con_osd_cluster(
6825 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
6826 if (con) {
6827 pg->osd->send_message_osd_cluster(
6828 new MRecoveryReserve(
6829 MRecoveryReserve::REQUEST,
6830 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
6831 pg->get_osdmap()->get_epoch()),
6832 con.get());
6833 }
6834 ++remote_recovery_reservation_it;
6835 } else {
6836 post_event(AllRemotesReserved());
6837 }
6838 return discard_event();
6839 }
6840
6841 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6842 {
6843 context< RecoveryMachine >().log_exit(state_name, enter_time);
6844 PG *pg = context< RecoveryMachine >().pg;
6845 utime_t dur = ceph_clock_now() - enter_time;
6846 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
6847 }
6848
6849 PG::RecoveryState::Recovering::Recovering(my_context ctx)
6850 : my_base(ctx),
6851 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
6852 {
6853 context< RecoveryMachine >().log_enter(state_name);
6854
6855 PG *pg = context< RecoveryMachine >().pg;
6856 pg->state_clear(PG_STATE_RECOVERY_WAIT);
6857 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6858 pg->state_set(PG_STATE_RECOVERING);
6859 pg->publish_stats_to_osd();
6860 pg->queue_recovery();
6861 }
6862
6863 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
6864 {
6865 PG *pg = context< RecoveryMachine >().pg;
6866 assert(cancel || !pg->pg_log.get_missing().have_missing());
6867
6868 // release remote reservations
6869 for (set<pg_shard_t>::const_iterator i =
6870 context< Active >().remote_shards_to_reserve_recovery.begin();
6871 i != context< Active >().remote_shards_to_reserve_recovery.end();
6872 ++i) {
6873 if (*i == pg->pg_whoami) // skip myself
6874 continue;
6875 ConnectionRef con = pg->osd->get_con_osd_cluster(
6876 i->osd, pg->get_osdmap()->get_epoch());
6877 if (con) {
6878 pg->osd->send_message_osd_cluster(
6879 new MRecoveryReserve(
6880 MRecoveryReserve::RELEASE,
6881 spg_t(pg->info.pgid.pgid, i->shard),
6882 pg->get_osdmap()->get_epoch()),
6883 con.get());
6884 }
6885 }
6886 }
6887
6888 boost::statechart::result
6889 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
6890 {
6891 PG *pg = context< RecoveryMachine >().pg;
6892 pg->state_clear(PG_STATE_RECOVERING);
6893 pg->state_clear(PG_STATE_FORCED_RECOVERY);
6894 release_reservations();
6895 return transit<Recovered>();
6896 }
6897
6898 boost::statechart::result
6899 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
6900 {
6901 PG *pg = context< RecoveryMachine >().pg;
6902 pg->state_clear(PG_STATE_RECOVERING);
6903 pg->state_clear(PG_STATE_FORCED_RECOVERY);
6904 release_reservations();
6905 return transit<WaitRemoteBackfillReserved>();
6906 }
6907
6908 boost::statechart::result
6909 PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
6910 {
6911 PG *pg = context< RecoveryMachine >().pg;
6912 pg->state_clear(PG_STATE_RECOVERING);
6913 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6914 release_reservations(true);
6915 pg->schedule_recovery_full_retry();
6916 return transit<NotRecovering>();
6917 }
6918
6919 void PG::RecoveryState::Recovering::exit()
6920 {
6921 context< RecoveryMachine >().log_exit(state_name, enter_time);
6922 PG *pg = context< RecoveryMachine >().pg;
6923 utime_t dur = ceph_clock_now() - enter_time;
6924 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
6925 }
6926
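// Recovery finished with nothing missing: drop the local reservation, clear
// DEGRADED and the forced recovery/backfill flags if the pool size is covered
// by actingbackfill, trim the pg log, possibly adjust the acting set, and post
// GoClean once all replicas have activated.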
6927 PG::RecoveryState::Recovered::Recovered(my_context ctx)
6928 : my_base(ctx),
6929 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
6930 {
6931 pg_shard_t auth_log_shard;
6932
6933 context< RecoveryMachine >().log_enter(state_name);
6934
6935 PG *pg = context< RecoveryMachine >().pg;
6936 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6937
6938 assert(!pg->needs_recovery());
6939
6940 // if we finished backfill, all acting are active; recheck if
6941 // DEGRADED | UNDERSIZED is appropriate.
6942 assert(!pg->actingbackfill.empty());
6943 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
6944 pg->actingbackfill.size()) {
6945 pg->state_clear(PG_STATE_DEGRADED);
6946 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6947 pg->publish_stats_to_osd();
6948 }
6949
6950 // trim pglog on recovered
6951 pg->trim_log();
6952
6953 // adjust acting set? (e.g. because backfill completed...)
6954 bool history_les_bound = false;
6955 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
6956 true, &history_les_bound))
6957 assert(pg->want_acting.size());
6958
6959 if (context< Active >().all_replicas_activated)
6960 post_event(GoClean());
6961 }
6962
6963 void PG::RecoveryState::Recovered::exit()
6964 {
6965 context< RecoveryMachine >().log_exit(state_name, enter_time);
6966 PG *pg = context< RecoveryMachine >().pg;
6967 utime_t dur = ceph_clock_now() - enter_time;
6968 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
6969 }
6970
6971 PG::RecoveryState::Clean::Clean(my_context ctx)
6972 : my_base(ctx),
6973 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
6974 {
6975 context< RecoveryMachine >().log_enter(state_name);
6976
6977 PG *pg = context< RecoveryMachine >().pg;
6978
6979 if (pg->info.last_complete != pg->info.last_update) {
6980 ceph_abort();
6981 }
6982 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
6983
6984 if (pg->is_active()) {
6985 pg->mark_clean();
6986 }
6987
6988 pg->share_pg_info();
6989 pg->publish_stats_to_osd();
6990 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
6991 }
6992
6993 void PG::RecoveryState::Clean::exit()
6994 {
6995 context< RecoveryMachine >().log_exit(state_name, enter_time);
6996 PG *pg = context< RecoveryMachine >().pg;
6997 pg->state_clear(PG_STATE_CLEAN);
6998 utime_t dur = ceph_clock_now() - enter_time;
6999 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7000 }
7001
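// Collapse a set of pg_shard_t to at most one shard per OSD, skipping 'skip'
// (normally ourself), so that recovery/backfill reservations are requested
// only once per remote OSD.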
7002 template <typename T>
7003 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7004 {
7005 set<int> osds_found;
7006 set<pg_shard_t> out;
7007 for (typename T::const_iterator i = in.begin();
7008 i != in.end();
7009 ++i) {
7010 if (*i != skip && !osds_found.count(i->osd)) {
7011 osds_found.insert(i->osd);
7012 out.insert(*i);
7013 }
7014 }
7015 return out;
7016 }
7017
7018 /*---------Active---------*/
7019 PG::RecoveryState::Active::Active(my_context ctx)
7020 : my_base(ctx),
7021 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7022 remote_shards_to_reserve_recovery(
7023 unique_osd_shard_set(
7024 context< RecoveryMachine >().pg->pg_whoami,
7025 context< RecoveryMachine >().pg->actingbackfill)),
7026 remote_shards_to_reserve_backfill(
7027 unique_osd_shard_set(
7028 context< RecoveryMachine >().pg->pg_whoami,
7029 context< RecoveryMachine >().pg->backfill_targets)),
7030 all_replicas_activated(false)
7031 {
7032 context< RecoveryMachine >().log_enter(state_name);
7033
7034 PG *pg = context< RecoveryMachine >().pg;
7035
7036 assert(!pg->backfill_reserving);
7037 assert(!pg->backfill_reserved);
7038 assert(pg->is_primary());
7039 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7040 pg->start_flush(
7041 context< RecoveryMachine >().get_cur_transaction(),
7042 context< RecoveryMachine >().get_on_applied_context_list(),
7043 context< RecoveryMachine >().get_on_safe_context_list());
7044 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7045 pg->get_osdmap()->get_epoch(),
7046 *context< RecoveryMachine >().get_on_safe_context_list(),
7047 *context< RecoveryMachine >().get_query_map(),
7048 context< RecoveryMachine >().get_info_map(),
7049 context< RecoveryMachine >().get_recovery_ctx());
7050
7051 // everyone has to commit/ack before we are truly active
7052 pg->blocked_by.clear();
7053 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7054 p != pg->actingbackfill.end();
7055 ++p) {
7056 if (p->shard != pg->pg_whoami.shard) {
7057 pg->blocked_by.insert(p->shard);
7058 }
7059 }
7060 pg->publish_stats_to_osd();
7061 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7062 }
7063
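// On each osdmap advance while Active: pick up newly removed snaps for
// trimming, re-evaluate UNDERSIZED/DEGRADED if the pool size changed, and
// republish stats if we have not reported for osd_pg_stat_report_interval_max
// epochs.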
7064 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7065 {
7066 PG *pg = context< RecoveryMachine >().pg;
7067 ldout(pg->cct, 10) << "Active advmap" << dendl;
7068 if (!pg->pool.newly_removed_snaps.empty()) {
7069 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7070 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7071 pg->dirty_info = true;
7072 pg->dirty_big_info = true;
7073 }
7074
7075 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7076 int osd = pg->want_acting[i];
7077 if (!advmap.osdmap->is_up(osd)) {
7078 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7079 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7080 }
7081 }
7082
7083 bool need_publish = false;
7084 /* Check for changes in pool size (if the acting set changed as a result,
7085 * this does not matter) */
7086 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7087 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7088 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7089 pg->state_clear(PG_STATE_UNDERSIZED);
7090 if (pg->needs_recovery()) {
7091 pg->state_set(PG_STATE_DEGRADED);
7092 } else {
7093 pg->state_clear(PG_STATE_DEGRADED);
7094 }
7095 } else {
7096 pg->state_set(PG_STATE_UNDERSIZED);
7097 pg->state_set(PG_STATE_DEGRADED);
7098 }
7099 need_publish = true; // degraded may have changed
7100 }
7101
7102 // if we haven't reported our PG stats in a long time, do so now.
7103 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7104 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7105 << " epochs" << dendl;
7106 need_publish = true;
7107 }
7108
7109 if (need_publish)
7110 pg->publish_stats_to_osd();
7111
7112 return forward_event();
7113 }
7114
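// ActMap tick for the primary: re-probe peers for unfound objects, log
// unfound-and-apparently-lost objects, kick snap trimming, and queue
// recovery/backfill unless the NOBACKFILL/NOREBALANCE osdmap flags forbid it.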
7115 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7116 {
7117 PG *pg = context< RecoveryMachine >().pg;
7118 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7119 assert(pg->is_primary());
7120
7121 if (pg->have_unfound()) {
7122 // object may have become unfound
7123 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7124 }
7125
7126 if (pg->cct->_conf->osd_check_for_log_corruption)
7127 pg->check_log_for_corruption(pg->osd->store);
7128
7129 uint64_t unfound = pg->missing_loc.num_unfound();
7130 if (unfound > 0 &&
7131 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7132 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7133 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7134 << " objects unfound and apparently lost, would automatically "
7135 << "mark these objects lost but this feature is not yet implemented "
7136 << "(osd_auto_mark_unfound_lost)";
7137 } else
7138 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7139 << unfound << " objects unfound and apparently lost";
7140 }
7141
7142 if (pg->is_active()) {
7143 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7144 pg->kick_snap_trim();
7145 }
7146
7147 if (pg->is_peered() &&
7148 !pg->is_clean() &&
7149 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7150 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7151 pg->queue_recovery();
7152 }
7153 return forward_event();
7154 }
7155
7156 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7157 {
7158 PG *pg = context< RecoveryMachine >().pg;
7159 assert(pg->is_primary());
7160 if (pg->peer_info.count(notevt.from)) {
7161 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7162 << ", already have info from that osd, ignoring"
7163 << dendl;
7164 } else if (pg->peer_purged.count(notevt.from)) {
7165 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7166 << ", already purged that peer, ignoring"
7167 << dendl;
7168 } else {
7169 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7170 << ", calling proc_replica_info and discover_all_missing"
7171 << dendl;
7172 pg->proc_replica_info(
7173 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7174 if (pg->have_unfound()) {
7175 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7176 }
7177 }
7178 return discard_event();
7179 }
7180
7181 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7182 {
7183 PG *pg = context< RecoveryMachine >().pg;
7184 assert(pg->is_primary());
7185
7186 assert(!pg->actingbackfill.empty());
7187 // don't update history (yet) if we are active and primary; the replica
7188 // may be telling us they have activated (and committed) but we can't
7189 // share that until _everyone_ does the same.
7190 if (pg->is_actingbackfill(infoevt.from)) {
7191 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7192 << " activated and committed" << dendl;
7193 pg->peer_activated.insert(infoevt.from);
7194 pg->blocked_by.erase(infoevt.from.shard);
7195 pg->publish_stats_to_osd();
7196 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7197 pg->all_activated_and_committed();
7198 }
7199 }
7200 return discard_event();
7201 }
7202
7203 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7204 {
7205 PG *pg = context< RecoveryMachine >().pg;
7206 ldout(pg->cct, 10) << "searching osd." << logevt.from
7207 << " log for unfound items" << dendl;
7208 pg->proc_replica_log(
7209 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7210 bool got_missing = pg->search_for_missing(
7211 pg->peer_info[logevt.from],
7212 pg->peer_missing[logevt.from],
7213 logevt.from,
7214 context< RecoveryMachine >().get_recovery_ctx());
7215 if (pg->is_peered() &&
7216 got_missing)
7217 pg->queue_recovery();
7218 return discard_event();
7219 }
7220
7221 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7222 {
7223 PG *pg = context< RecoveryMachine >().pg;
7224
7225 q.f->open_object_section("state");
7226 q.f->dump_string("name", state_name);
7227 q.f->dump_stream("enter_time") << enter_time;
7228
7229 {
7230 q.f->open_array_section("might_have_unfound");
7231 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7232 p != pg->might_have_unfound.end();
7233 ++p) {
7234 q.f->open_object_section("osd");
7235 q.f->dump_stream("osd") << *p;
7236 if (pg->peer_missing.count(*p)) {
7237 q.f->dump_string("status", "already probed");
7238 } else if (pg->peer_missing_requested.count(*p)) {
7239 q.f->dump_string("status", "querying");
7240 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7241 q.f->dump_string("status", "osd is down");
7242 } else {
7243 q.f->dump_string("status", "not queried");
7244 }
7245 q.f->close_section();
7246 }
7247 q.f->close_section();
7248 }
7249 {
7250 q.f->open_object_section("recovery_progress");
7251 pg->dump_recovery_info(q.f);
7252 q.f->close_section();
7253 }
7254
7255 {
7256 q.f->open_object_section("scrub");
7257 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7258 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7259 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7260 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7261 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7262 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7263 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7264 q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7265 q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7266 {
7267 q.f->open_array_section("scrubber.waiting_on_whom");
7268 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7269 p != pg->scrubber.waiting_on_whom.end();
7270 ++p) {
7271 q.f->dump_stream("shard") << *p;
7272 }
7273 q.f->close_section();
7274 }
7275 q.f->close_section();
7276 }
7277
7278 q.f->close_section();
7279 return forward_event();
7280 }
7281
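// Every acting/backfill peer has activated and committed: move the PG to
// ACTIVE (or PEERED if we are below min_size), persist last_epoch_started in
// the history, and requeue ops that were waiting for the PG to peer.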
7282 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7283 {
7284 PG *pg = context< RecoveryMachine >().pg;
7285 all_replicas_activated = true;
7286
7287 pg->state_clear(PG_STATE_ACTIVATING);
7288 pg->state_clear(PG_STATE_CREATING);
7289 if (pg->acting.size() >= pg->pool.info.min_size) {
7290 pg->state_set(PG_STATE_ACTIVE);
7291 } else {
7292 pg->state_set(PG_STATE_PEERED);
7293 }
7294
7295 // info.last_epoch_started is set during activate()
7296 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7297 pg->info.history.last_interval_started = pg->info.last_interval_started;
7298 pg->dirty_info = true;
7299
7300 pg->share_pg_info();
7301 pg->publish_stats_to_osd();
7302
7303 pg->check_local();
7304
7305 // waiters
7306 if (pg->flushes_in_progress == 0) {
7307 pg->requeue_ops(pg->waiting_for_peered);
7308 }
7309
7310 pg->on_activate();
7311
7312 return discard_event();
7313 }
7314
7315 void PG::RecoveryState::Active::exit()
7316 {
7317 context< RecoveryMachine >().log_exit(state_name, enter_time);
7318 PG *pg = context< RecoveryMachine >().pg;
7319 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7320
7321 pg->blocked_by.clear();
7322 pg->backfill_reserved = false;
7323 pg->backfill_reserving = false;
7324 pg->state_clear(PG_STATE_ACTIVATING);
7325 pg->state_clear(PG_STATE_DEGRADED);
7326 pg->state_clear(PG_STATE_UNDERSIZED);
7327 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7328 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7329 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7330 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7331 utime_t dur = ceph_clock_now() - enter_time;
7332 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7333 pg->agent_stop();
7334 }
7335
7336 /*------ReplicaActive-----*/
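// ReplicaActive: this OSD is an active, non-primary member of the PG.  It
// activates on the Activate event, applies info/log updates sent by the
// primary, and re-notifies the primary on each ActMap.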
7337 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7338 : my_base(ctx),
7339 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7340 {
7341 context< RecoveryMachine >().log_enter(state_name);
7342
7343 PG *pg = context< RecoveryMachine >().pg;
7344 pg->start_flush(
7345 context< RecoveryMachine >().get_cur_transaction(),
7346 context< RecoveryMachine >().get_on_applied_context_list(),
7347 context< RecoveryMachine >().get_on_safe_context_list());
7348 }
7349
7350
7351 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7352 const Activate& actevt) {
7353 PG *pg = context< RecoveryMachine >().pg;
7354 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7355 map<int, map<spg_t, pg_query_t> > query_map;
7356 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7357 actevt.activation_epoch,
7358 *context< RecoveryMachine >().get_on_safe_context_list(),
7359 query_map, NULL, NULL);
7360 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7361 return discard_event();
7362 }
7363
7364 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7365 {
7366 PG *pg = context< RecoveryMachine >().pg;
7367 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7368 infoevt.info);
7369 return discard_event();
7370 }
7371
7372 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7373 {
7374 PG *pg = context< RecoveryMachine >().pg;
7375 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7376 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7377 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7378 assert(pg->pg_log.get_head() == pg->info.last_update);
7379
7380 return discard_event();
7381 }
7382
7383 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7384 {
7385 PG *pg = context< RecoveryMachine >().pg;
7386 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7387 context< RecoveryMachine >().send_notify(
7388 pg->get_primary(),
7389 pg_notify_t(
7390 pg->get_primary().shard, pg->pg_whoami.shard,
7391 pg->get_osdmap()->get_epoch(),
7392 pg->get_osdmap()->get_epoch(),
7393 pg->info),
7394 pg->past_intervals);
7395 }
7396 pg->take_waiters();
7397 return discard_event();
7398 }
7399
7400 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7401 {
7402 PG *pg = context< RecoveryMachine >().pg;
7403 if (query.query.type == pg_query_t::MISSING) {
7404 pg->update_history(query.query.history);
7405 pg->fulfill_log(query.from, query.query, query.query_epoch);
7406 } // else: from prior to activation, safe to ignore
7407 return discard_event();
7408 }
7409
7410 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7411 {
7412 q.f->open_object_section("state");
7413 q.f->dump_string("name", state_name);
7414 q.f->dump_stream("enter_time") << enter_time;
7415 q.f->close_section();
7416 return forward_event();
7417 }
7418
7419 void PG::RecoveryState::ReplicaActive::exit()
7420 {
7421 context< RecoveryMachine >().log_exit(state_name, enter_time);
7422 PG *pg = context< RecoveryMachine >().pg;
7423 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7424 utime_t dur = ceph_clock_now() - enter_time;
7425 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7426 }
7427
7428 /*-------Stray---*/
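// Stray: this OSD holds the PG but is neither primary nor peered; answer the
// primary's queries and wait for an info or log that lets us activate as a
// replica.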
7429 PG::RecoveryState::Stray::Stray(my_context ctx)
7430 : my_base(ctx),
7431 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7432 {
7433 context< RecoveryMachine >().log_enter(state_name);
7434
7435 PG *pg = context< RecoveryMachine >().pg;
7436 assert(!pg->is_peered());
7437 assert(!pg->is_peering());
7438 assert(!pg->is_primary());
7439 pg->start_flush(
7440 context< RecoveryMachine >().get_cur_transaction(),
7441 context< RecoveryMachine >().get_on_applied_context_list(),
7442 context< RecoveryMachine >().get_on_safe_context_list());
7443 }
7444
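// The primary sent us an authoritative info+log.  An empty last_backfill
// means our local history is unusable and backfill will restart from scratch;
// otherwise merge the primary's log into ours.  Either way, activate and
// become ReplicaActive.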
7445 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7446 {
7447 PG *pg = context< RecoveryMachine >().pg;
7448 MOSDPGLog *msg = logevt.msg.get();
7449 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7450
7451 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7452 if (msg->info.last_backfill == hobject_t()) {
7453 // restart backfill
7454 pg->unreg_next_scrub();
7455 pg->info = msg->info;
7456 pg->reg_next_scrub();
7457 pg->dirty_info = true;
7458 pg->dirty_big_info = true; // may not be strictly needed, but be safe.
7459
7460 PGLogEntryHandler rollbacker{pg, t};
7461 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7462
7463 pg->pg_log.reset_backfill();
7464 } else {
7465 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7466 }
7467
7468 assert(pg->pg_log.get_head() == pg->info.last_update);
7469
7470 post_event(Activate(logevt.msg->info.last_epoch_started));
7471 return transit<ReplicaActive>();
7472 }
7473
7474 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7475 {
7476 PG *pg = context< RecoveryMachine >().pg;
7477 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7478
7479 if (pg->info.last_update > infoevt.info.last_update) {
7480 // rewind divergent log entries
7481 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7482 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7483 pg->info.stats = infoevt.info.stats;
7484 pg->info.hit_set = infoevt.info.hit_set;
7485 }
7486
7487 assert(infoevt.info.last_update == pg->info.last_update);
7488 assert(pg->pg_log.get_head() == pg->info.last_update);
7489
7490 post_event(Activate(infoevt.info.last_epoch_started));
7491 return transit<ReplicaActive>();
7492 }
7493
7494 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7495 {
7496 PG *pg = context< RecoveryMachine >().pg;
7497 if (query.query.type == pg_query_t::INFO) {
7498 pair<pg_shard_t, pg_info_t> notify_info;
7499 pg->update_history(query.query.history);
7500 pg->fulfill_info(query.from, query.query, notify_info);
7501 context< RecoveryMachine >().send_notify(
7502 notify_info.first,
7503 pg_notify_t(
7504 notify_info.first.shard, pg->pg_whoami.shard,
7505 query.query_epoch,
7506 pg->get_osdmap()->get_epoch(),
7507 notify_info.second),
7508 pg->past_intervals);
7509 } else {
7510 pg->fulfill_log(query.from, query.query, query.query_epoch);
7511 }
7512 return discard_event();
7513 }
7514
7515 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7516 {
7517 PG *pg = context< RecoveryMachine >().pg;
7518 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7519 context< RecoveryMachine >().send_notify(
7520 pg->get_primary(),
7521 pg_notify_t(
7522 pg->get_primary().shard, pg->pg_whoami.shard,
7523 pg->get_osdmap()->get_epoch(),
7524 pg->get_osdmap()->get_epoch(),
7525 pg->info),
7526 pg->past_intervals);
7527 }
7528 pg->take_waiters();
7529 return discard_event();
7530 }
7531
7532 void PG::RecoveryState::Stray::exit()
7533 {
7534 context< RecoveryMachine >().log_exit(state_name, enter_time);
7535 PG *pg = context< RecoveryMachine >().pg;
7536 utime_t dur = ceph_clock_now() - enter_time;
7537 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7538 }
7539
7540 /*--------GetInfo---------*/
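// First peering phase: build the prior set and ask every OSD that may hold a
// copy of this PG for its pg_info_t.  We post GotInfo once all requested
// infos arrive, or IsDown if the prior set says the PG cannot peer yet.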
7541 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7542 : my_base(ctx),
7543 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7544 {
7545 context< RecoveryMachine >().log_enter(state_name);
7546
7547 PG *pg = context< RecoveryMachine >().pg;
7548 pg->check_past_interval_bounds();
7549 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7550
7551 assert(pg->blocked_by.empty());
7552
7553 prior_set = pg->build_prior();
7554
7555 pg->reset_min_peer_features();
7556 get_infos();
7557 if (prior_set.pg_down) {
7558 post_event(IsDown());
7559 } else if (peer_info_requested.empty()) {
7560 post_event(GotInfo());
7561 }
7562 }
7563
7564 void PG::RecoveryState::GetInfo::get_infos()
7565 {
7566 PG *pg = context< RecoveryMachine >().pg;
7567 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7568
7569 pg->blocked_by.clear();
7570 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7571 it != prior_set.probe.end();
7572 ++it) {
7573 pg_shard_t peer = *it;
7574 if (peer == pg->pg_whoami) {
7575 continue;
7576 }
7577 if (pg->peer_info.count(peer)) {
7578 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7579 continue;
7580 }
7581 if (peer_info_requested.count(peer)) {
7582 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7583 pg->blocked_by.insert(peer.osd);
7584 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7585 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7586 } else {
7587 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7588 context< RecoveryMachine >().send_query(
7589 peer, pg_query_t(pg_query_t::INFO,
7590 it->shard, pg->pg_whoami.shard,
7591 pg->info.history,
7592 pg->get_osdmap()->get_epoch()));
7593 peer_info_requested.insert(peer);
7594 pg->blocked_by.insert(peer.osd);
7595 }
7596 }
7597
7598 pg->publish_stats_to_osd();
7599 }
7600
7601 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
7602 {
7603 PG *pg = context< RecoveryMachine >().pg;
7604
7605 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7606 if (p != peer_info_requested.end()) {
7607 peer_info_requested.erase(p);
7608 pg->blocked_by.erase(infoevt.from.osd);
7609 }
7610
7611 epoch_t old_start = pg->info.history.last_epoch_started;
7612 if (pg->proc_replica_info(
7613 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7614 // we got something new ...
7615 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7616 if (old_start < pg->info.history.last_epoch_started) {
7617 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7618 prior_set = pg->build_prior();
7619
7620 // filter out any osds that got dropped from the probe set from
7621 // peer_info_requested. this is less expensive than restarting
7622 // peering (which would re-probe everyone).
7623 set<pg_shard_t>::iterator p = peer_info_requested.begin();
7624 while (p != peer_info_requested.end()) {
7625 if (prior_set.probe.count(*p) == 0) {
7626 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7627 peer_info_requested.erase(p++);
7628 } else {
7629 ++p;
7630 }
7631 }
7632 get_infos();
7633 }
7634 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7635 << hex << infoevt.features << dec << dendl;
7636 pg->apply_peer_features(infoevt.features);
7637
7638 // are we done getting everything?
7639 if (peer_info_requested.empty() && !prior_set.pg_down) {
7640 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7641 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7642 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7643 post_event(GotInfo());
7644 }
7645 }
7646 return discard_event();
7647 }
7648
7649 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7650 {
7651 PG *pg = context< RecoveryMachine >().pg;
7652 q.f->open_object_section("state");
7653 q.f->dump_string("name", state_name);
7654 q.f->dump_stream("enter_time") << enter_time;
7655
7656 q.f->open_array_section("requested_info_from");
7657 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7658 p != peer_info_requested.end();
7659 ++p) {
7660 q.f->open_object_section("osd");
7661 q.f->dump_stream("osd") << *p;
7662 if (pg->peer_info.count(*p)) {
7663 q.f->open_object_section("got_info");
7664 pg->peer_info[*p].dump(q.f);
7665 q.f->close_section();
7666 }
7667 q.f->close_section();
7668 }
7669 q.f->close_section();
7670
7671 q.f->close_section();
7672 return forward_event();
7673 }
7674
7675 void PG::RecoveryState::GetInfo::exit()
7676 {
7677 context< RecoveryMachine >().log_exit(state_name, enter_time);
7678 PG *pg = context< RecoveryMachine >().pg;
7679 utime_t dur = ceph_clock_now() - enter_time;
7680 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7681 pg->blocked_by.clear();
7682 pg->publish_stats_to_osd();
7683 }
7684
7685 /*------GetLog------------*/
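// Second peering phase: settle the acting set and pick the authoritative log
// shard.  If another OSD has the best log, request enough of it to cover the
// oldest last_update among our acting/backfill peers.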
7686 PG::RecoveryState::GetLog::GetLog(my_context ctx)
7687 : my_base(ctx),
7688 NamedState(
7689 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7690 msg(0)
7691 {
7692 context< RecoveryMachine >().log_enter(state_name);
7693
7694 PG *pg = context< RecoveryMachine >().pg;
7695
7696 // adjust acting?
7697 if (!pg->choose_acting(auth_log_shard, false,
7698 &context< Peering >().history_les_bound)) {
7699 if (!pg->want_acting.empty()) {
7700 post_event(NeedActingChange());
7701 } else {
7702 post_event(IsIncomplete());
7703 }
7704 return;
7705 }
7706
7707 // am i the best?
7708 if (auth_log_shard == pg->pg_whoami) {
7709 post_event(GotLog());
7710 return;
7711 }
7712
7713 const pg_info_t& best = pg->peer_info[auth_log_shard];
7714
7715 // am i broken?
7716 if (pg->info.last_update < best.log_tail) {
7717 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7718 post_event(IsIncomplete());
7719 return;
7720 }
7721
7722 // how much log to request?
7723 eversion_t request_log_from = pg->info.last_update;
7724 assert(!pg->actingbackfill.empty());
7725 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7726 p != pg->actingbackfill.end();
7727 ++p) {
7728 if (*p == pg->pg_whoami) continue;
7729 pg_info_t& ri = pg->peer_info[*p];
7730 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7731 ri.last_update < request_log_from)
7732 request_log_from = ri.last_update;
7733 }
7734
7735 // request the log from the auth shard
7736 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7737 context<RecoveryMachine>().send_query(
7738 auth_log_shard,
7739 pg_query_t(
7740 pg_query_t::LOG,
7741 auth_log_shard.shard, pg->pg_whoami.shard,
7742 request_log_from, pg->info.history,
7743 pg->get_osdmap()->get_epoch()));
7744
7745 assert(pg->blocked_by.empty());
7746 pg->blocked_by.insert(auth_log_shard.osd);
7747 pg->publish_stats_to_osd();
7748 }
7749
7750 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7751 {
7752 PG *pg = context< RecoveryMachine >().pg;
7753 // make sure our log source didn't go down. we need to check
7754 // explicitly because it may not be part of the prior set, which
7755 // means the Peering state check won't catch it going down.
7756 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7757 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7758 << auth_log_shard.osd << " went down" << dendl;
7759 post_event(advmap);
7760 return transit< Reset >();
7761 }
7762
7763 // let the Peering state do its checks.
7764 return forward_event();
7765 }
7766
7767 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7768 {
7769 PG *pg = context< RecoveryMachine >().pg;
7770 assert(!msg);
7771 if (logevt.from != auth_log_shard) {
7772 ldout(pg->cct, 10) << "GetLog: discarding log from "
7773 << "non-auth_log_shard osd." << logevt.from << dendl;
7774 return discard_event();
7775 }
7776 ldout(pg->cct, 10) << "GetLog: received master log from osd"
7777 << logevt.from << dendl;
7778 msg = logevt.msg;
7779 post_event(GotLog());
7780 return discard_event();
7781 }
7782
7783 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
7784 {
7785 PG *pg = context< RecoveryMachine >().pg;
7786 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
7787 if (msg) {
7788 ldout(pg->cct, 10) << "processing master log" << dendl;
7789 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
7790 msg->info, msg->log, msg->missing,
7791 auth_log_shard);
7792 }
7793 pg->start_flush(
7794 context< RecoveryMachine >().get_cur_transaction(),
7795 context< RecoveryMachine >().get_on_applied_context_list(),
7796 context< RecoveryMachine >().get_on_safe_context_list());
7797 return transit< GetMissing >();
7798 }
7799
7800 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
7801 {
7802 q.f->open_object_section("state");
7803 q.f->dump_string("name", state_name);
7804 q.f->dump_stream("enter_time") << enter_time;
7805 q.f->dump_stream("auth_log_shard") << auth_log_shard;
7806 q.f->close_section();
7807 return forward_event();
7808 }
7809
7810 void PG::RecoveryState::GetLog::exit()
7811 {
7812 context< RecoveryMachine >().log_exit(state_name, enter_time);
7813 PG *pg = context< RecoveryMachine >().pg;
7814 utime_t dur = ceph_clock_now() - enter_time;
7815 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
7816 pg->blocked_by.clear();
7817 pg->publish_stats_to_osd();
7818 }
7819
7820 /*------WaitActingChange--------*/
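// An acting-set change has been requested (want_acting is non-empty); wait
// for it to take effect, resetting if any wanted OSD goes down in the
// meantime and ignoring stray log/info/notify messages.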
7821 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
7822 : my_base(ctx),
7823 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
7824 {
7825 context< RecoveryMachine >().log_enter(state_name);
7826 }
7827
7828 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
7829 {
7830 PG *pg = context< RecoveryMachine >().pg;
7831 OSDMapRef osdmap = advmap.osdmap;
7832
7833 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
7834 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
7835 if (!osdmap->is_up(*p)) {
7836 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7837 post_event(advmap);
7838 return transit< Reset >();
7839 }
7840 }
7841 return forward_event();
7842 }
7843
7844 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
7845 {
7846 PG *pg = context< RecoveryMachine >().pg;
7847 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
7848 return discard_event();
7849 }
7850
7851 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
7852 {
7853 PG *pg = context< RecoveryMachine >().pg;
7854 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7855 return discard_event();
7856 }
7857
7858 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
7859 {
7860 PG *pg = context< RecoveryMachine >().pg;
7861 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7862 return discard_event();
7863 }
7864
7865 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
7866 {
7867 q.f->open_object_section("state");
7868 q.f->dump_string("name", state_name);
7869 q.f->dump_stream("enter_time") << enter_time;
7870 q.f->dump_string("comment", "waiting for pg acting set to change");
7871 q.f->close_section();
7872 return forward_event();
7873 }
7874
7875 void PG::RecoveryState::WaitActingChange::exit()
7876 {
7877 context< RecoveryMachine >().log_exit(state_name, enter_time);
7878 PG *pg = context< RecoveryMachine >().pg;
7879 utime_t dur = ceph_clock_now() - enter_time;
7880 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
7881 }
7882
7883 /*------Down--------*/
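// Down: too few OSDs from a prior interval are up for peering to proceed;
// record them in blocked_by and wait for them to return.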
7884 PG::RecoveryState::Down::Down(my_context ctx)
7885 : my_base(ctx),
7886 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
7887 {
7888 context< RecoveryMachine >().log_enter(state_name);
7889 PG *pg = context< RecoveryMachine >().pg;
7890
7891 pg->state_clear(PG_STATE_PEERING);
7892 pg->state_set(PG_STATE_DOWN);
7893
7894 auto &prior_set = context< Peering >().prior_set;
7895 assert(pg->blocked_by.empty());
7896 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7897 pg->publish_stats_to_osd();
7898 }
7899
7900 void PG::RecoveryState::Down::exit()
7901 {
7902 context< RecoveryMachine >().log_exit(state_name, enter_time);
7903 PG *pg = context< RecoveryMachine >().pg;
7904
7905 pg->state_clear(PG_STATE_DOWN);
7906 utime_t dur = ceph_clock_now() - enter_time;
7907 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
7908
7909 pg->blocked_by.clear();
7910 pg->publish_stats_to_osd();
7911 }
7912
7913 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
7914 {
7915 q.f->open_object_section("state");
7916 q.f->dump_string("name", state_name);
7917 q.f->dump_stream("enter_time") << enter_time;
7918 q.f->dump_string("comment",
7919 "not enough up instances of this PG to go active");
7920 q.f->close_section();
7921 return forward_event();
7922 }
7923
7924 /*------Incomplete--------*/
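// Incomplete: peering could not find enough complete instances of this PG.
// We retry via GetLog when a new replica info arrives, or reset if a new
// osdmap lowers the pool's min_size.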
7925 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
7926 : my_base(ctx),
7927 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
7928 {
7929 context< RecoveryMachine >().log_enter(state_name);
7930 PG *pg = context< RecoveryMachine >().pg;
7931
7932 pg->state_clear(PG_STATE_PEERING);
7933 pg->state_set(PG_STATE_INCOMPLETE);
7934
7935 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7936 assert(pg->blocked_by.empty());
7937 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7938 pg->publish_stats_to_osd();
7939 }
7940
7941 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
7942 PG *pg = context< RecoveryMachine >().pg;
7943 int64_t poolnum = pg->info.pgid.pool();
7944
7945 // Reset if min_size became smaller than the previous value; the pg might now be able to go active
7946 if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7947 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7948 post_event(advmap);
7949 return transit< Reset >();
7950 }
7951
7952 return forward_event();
7953 }
7954
7955 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
7956 PG *pg = context< RecoveryMachine >().pg;
7957 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7958 if (pg->proc_replica_info(
7959 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7960 // We got something new, try again!
7961 return transit< GetLog >();
7962 } else {
7963 return discard_event();
7964 }
7965 }
7966
7967 boost::statechart::result PG::RecoveryState::Incomplete::react(
7968 const QueryState& q)
7969 {
7970 q.f->open_object_section("state");
7971 q.f->dump_string("name", state_name);
7972 q.f->dump_stream("enter_time") << enter_time;
7973 q.f->dump_string("comment", "not enough complete instances of this PG");
7974 q.f->close_section();
7975 return forward_event();
7976 }
7977
7978 void PG::RecoveryState::Incomplete::exit()
7979 {
7980 context< RecoveryMachine >().log_exit(state_name, enter_time);
7981 PG *pg = context< RecoveryMachine >().pg;
7982
7983 pg->state_clear(PG_STATE_INCOMPLETE);
7984 utime_t dur = ceph_clock_now() - enter_time;
7985 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
7986
7987 pg->blocked_by.clear();
7988 pg->publish_stats_to_osd();
7989 }
7990
7991 /*------GetMissing--------*/
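// Third peering phase: request log+missing from each acting/backfill peer
// whose missing set we cannot already infer.  Peers that are empty, will be
// fully backfilled, or are provably up to date get an implicitly empty
// missing set instead of a query.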
7992 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
7993 : my_base(ctx),
7994 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
7995 {
7996 context< RecoveryMachine >().log_enter(state_name);
7997
7998 PG *pg = context< RecoveryMachine >().pg;
7999 assert(!pg->actingbackfill.empty());
8000 eversion_t since;
8001 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8002 i != pg->actingbackfill.end();
8003 ++i) {
8004 if (*i == pg->get_primary()) continue;
8005 const pg_info_t& pi = pg->peer_info[*i];
8006
8007 if (pi.is_empty())
8008 continue; // no pg data, nothing divergent
8009
8010 if (pi.last_update < pg->pg_log.get_tail()) {
8011 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8012 pg->peer_missing[*i];
8013 continue;
8014 }
8015 if (pi.last_backfill == hobject_t()) {
8016 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8017 pg->peer_missing[*i];
8018 continue;
8019 }
8020
8021 if (pi.last_update == pi.last_complete && // peer has no missing
8022 pi.last_update == pg->info.last_update) { // peer is up to date
8023 // replica has no missing and identical log as us. no need to
8024 // pull anything.
8025 // FIXME: we can do better here. if last_update==last_complete we
8026 // can infer the rest!
8027 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8028 pg->peer_missing[*i];
8029 continue;
8030 }
8031
8032 // We pull the log from the peer's last_epoch_started to ensure we
8033 // get enough log to detect divergent updates.
8034 since.epoch = pi.last_epoch_started;
8035 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8036 if (pi.log_tail <= since) {
8037 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8038 context< RecoveryMachine >().send_query(
8039 *i,
8040 pg_query_t(
8041 pg_query_t::LOG,
8042 i->shard, pg->pg_whoami.shard,
8043 since, pg->info.history,
8044 pg->get_osdmap()->get_epoch()));
8045 } else {
8046 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8047 << " (want since " << since << " < log.tail "
8048 << pi.log_tail << ")" << dendl;
8049 context< RecoveryMachine >().send_query(
8050 *i, pg_query_t(
8051 pg_query_t::FULLLOG,
8052 i->shard, pg->pg_whoami.shard,
8053 pg->info.history, pg->get_osdmap()->get_epoch()));
8054 }
8055 peer_missing_requested.insert(*i);
8056 pg->blocked_by.insert(i->osd);
8057 }
8058
8059 if (peer_missing_requested.empty()) {
8060 if (pg->need_up_thru) {
8061 ldout(pg->cct, 10) << " still need up_thru update before going active"
8062 << dendl;
8063 post_event(NeedUpThru());
8064 return;
8065 }
8066
8067 // all good!
8068 post_event(Activate(pg->get_osdmap()->get_epoch()));
8069 } else {
8070 pg->publish_stats_to_osd();
8071 }
8072 }
8073
8074 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8075 {
8076 PG *pg = context< RecoveryMachine >().pg;
8077
8078 peer_missing_requested.erase(logevt.from);
8079 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8080
8081 if (peer_missing_requested.empty()) {
8082 if (pg->need_up_thru) {
8083 ldout(pg->cct, 10) << " still need up_thru update before going active"
8084 << dendl;
8085 post_event(NeedUpThru());
8086 } else {
8087 ldout(pg->cct, 10) << "Got last missing, don't need missing "
8088 << "posting Activate" << dendl;
8089 post_event(Activate(pg->get_osdmap()->get_epoch()));
8090 }
8091 }
8092 return discard_event();
8093 }
8094
8095 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8096 {
8097 PG *pg = context< RecoveryMachine >().pg;
8098 q.f->open_object_section("state");
8099 q.f->dump_string("name", state_name);
8100 q.f->dump_stream("enter_time") << enter_time;
8101
8102 q.f->open_array_section("peer_missing_requested");
8103 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8104 p != peer_missing_requested.end();
8105 ++p) {
8106 q.f->open_object_section("osd");
8107 q.f->dump_stream("osd") << *p;
8108 if (pg->peer_missing.count(*p)) {
8109 q.f->open_object_section("got_missing");
8110 pg->peer_missing[*p].dump(q.f);
8111 q.f->close_section();
8112 }
8113 q.f->close_section();
8114 }
8115 q.f->close_section();
8116
8117 q.f->close_section();
8118 return forward_event();
8119 }
8120
8121 void PG::RecoveryState::GetMissing::exit()
8122 {
8123 context< RecoveryMachine >().log_exit(state_name, enter_time);
8124 PG *pg = context< RecoveryMachine >().pg;
8125 utime_t dur = ceph_clock_now() - enter_time;
8126 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8127 pg->blocked_by.clear();
8128 pg->publish_stats_to_osd();
8129 }
8130
8131 /*------WaitUpThru--------*/
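// WaitUpThru: peering is otherwise done, but we cannot activate until the
// osdmap reflects a new up_thru for this OSD (see need_up_thru).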
8132 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8133 : my_base(ctx),
8134 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8135 {
8136 context< RecoveryMachine >().log_enter(state_name);
8137 }
8138
8139 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8140 {
8141 PG *pg = context< RecoveryMachine >().pg;
8142 if (!pg->need_up_thru) {
8143 post_event(Activate(pg->get_osdmap()->get_epoch()));
8144 }
8145 return forward_event();
8146 }
8147
8148 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8149 {
8150 PG *pg = context< RecoveryMachine >().pg;
8151 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8152 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8153 pg->peer_info[logevt.from] = logevt.msg->info;
8154 return discard_event();
8155 }
8156
8157 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8158 {
8159 q.f->open_object_section("state");
8160 q.f->dump_string("name", state_name);
8161 q.f->dump_stream("enter_time") << enter_time;
8162 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8163 q.f->close_section();
8164 return forward_event();
8165 }
8166
8167 void PG::RecoveryState::WaitUpThru::exit()
8168 {
8169 context< RecoveryMachine >().log_exit(state_name, enter_time);
8170 PG *pg = context< RecoveryMachine >().pg;
8171 utime_t dur = ceph_clock_now() - enter_time;
8172 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8173 }
8174
8175 /*----RecoveryState::RecoveryMachine Methods-----*/
8176 #undef dout_prefix
8177 #define dout_prefix *_dout << pg->gen_prefix()
8178
8179 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8180 {
8181 PG *pg = context< RecoveryMachine >().pg;
8182 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8183 pg->osd->pg_recovery_stats.log_enter(state_name);
8184 }
8185
8186 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8187 {
8188 utime_t dur = ceph_clock_now() - enter_time;
8189 PG *pg = context< RecoveryMachine >().pg;
8190 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8191 pg->osd->pg_recovery_stats.log_exit(state_name, dur,
8192 event_count, event_time);
8193 event_count = 0;
8194 event_time = utime_t();
8195 }
8196
8197
8198 /*---------------------------------------------------*/
8199 #undef dout_prefix
8200 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8201
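// RecoveryCtx plumbing: start_handle()/end_handle() bracket each delivered
// event.  Between begin_block_outgoing() and end_block_outgoing(), outgoing
// messages are staged in messages_pending_flush; end_block_outgoing() merges
// them back into the original ctx, while clear_blocked_outgoing() drops them.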
8202 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8203 assert(!rctx);
8204 assert(!orig_ctx);
8205 orig_ctx = new_ctx;
8206 if (new_ctx) {
8207 if (messages_pending_flush) {
8208 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8209 } else {
8210 rctx = *new_ctx;
8211 }
8212 rctx->start_time = ceph_clock_now();
8213 }
8214 }
8215
8216 void PG::RecoveryState::begin_block_outgoing() {
8217 assert(!messages_pending_flush);
8218 assert(orig_ctx);
8219 assert(rctx);
8220 messages_pending_flush = BufferedRecoveryMessages();
8221 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8222 }
8223
8224 void PG::RecoveryState::clear_blocked_outgoing() {
8225 assert(orig_ctx);
8226 assert(rctx);
8227 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8228 }
8229
8230 void PG::RecoveryState::end_block_outgoing() {
8231 assert(messages_pending_flush);
8232 assert(orig_ctx);
8233 assert(rctx);
8234
8235 rctx = RecoveryCtx(*orig_ctx);
8236 rctx->accept_buffered_messages(*messages_pending_flush);
8237 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8238 }
8239
8240 void PG::RecoveryState::end_handle() {
8241 if (rctx) {
8242 utime_t dur = ceph_clock_now() - rctx->start_time;
8243 machine.event_time += dur;
8244 }
8245
8246 machine.event_count++;
8247 rctx = boost::optional<RecoveryCtx>();
8248 orig_ctx = NULL;
8249 }
8250
8251 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8252 {
8253 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8254 << " " << bi.objects.size() << " objects";
8255 if (!bi.objects.empty())
8256 out << " " << bi.objects;
8257 out << ")";
8258 return out;
8259 }
8260
8261 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8262 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8263
8264 #ifdef PG_DEBUG_REFS
8265 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8266 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8267 #endif