1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
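// Record entry into a peering state-machine state.  If there is no active
// PGStateInstance yet (the PG lock may not be safely takeable here), stage the
// entry in tmppi; exit() attaches it to the history once the lock is held.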
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if (tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if (!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if (ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
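// Manual reference counting for PG objects; the PG deletes itself when the
// count drops to zero.  With PG_DEBUG_REFS, per-tag and per-id accounting is
// also kept so leaked references can be tracked down (see dump_live_ids).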
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179 if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\ttag: " << *i << dendl;
224 }
225 }
226 #endif
227
228
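// Refresh cached pool metadata from a new OSDMap.  If this map is not the
// direct successor of the cached epoch, or it changed the pool's snaps, the
// removed-snaps set is rebuilt and newly_removed_snaps is derived as the delta
// against cached_removed_snaps (falling back to a full reset if the cache is
// not a subset of the new set).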
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 pi->build_removed_snaps(newly_removed_snaps);
241 interval_set<snapid_t> intersection;
242 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243 if (intersection == cached_removed_snaps) {
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.union_of(newly_removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps = newly_removed_snaps;
251 newly_removed_snaps.clear();
252 }
253 snapc = pi->get_snap_context();
254 } else {
255 /* 1) map->get_epoch() == cached_epoch + 1 &&
256 * 2) pi->get_snap_epoch() != map->get_epoch()
257 *
258 * Since the if branch was not taken, 1 && 2 must both hold. From 2, we
259 * know that this map didn't change the set of removed snaps. From 1, we
260 * know that our cached_removed_snaps matches the previous map.
261 * Thus, from 1 && 2, cached_removed_snaps matches the current
262 * set of removed snaps and all we have to do is clear
263 * newly_removed_snaps.
264 */
265 newly_removed_snaps.clear();
266 }
267 cached_epoch = map->get_epoch();
268 lgeneric_subdout(cct, osd, 20)
269 << "PGPool::update cached_removed_snaps "
270 << cached_removed_snaps
271 << " newly_removed_snaps "
272 << newly_removed_snaps
273 << " snapc " << snapc
274 << (updated ? " (updated)":" (no change)")
275 << dendl;
276 }
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279 const PGPool &_pool, spg_t p) :
280 osd(o),
281 cct(o->cct),
282 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283 snap_mapper(
284 cct,
285 &osdriver,
286 p.ps(),
287 p.get_split_bits(curmap->get_pg_num(_pool.id)),
288 _pool.id,
289 p.shard),
290 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291 _lock("PG::_lock"),
292 #ifdef PG_DEBUG_REFS
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294 #endif
295 deleting(false),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
298 info(p),
299 info_struct_v(0),
300 coll(p),
301 pg_log(cct),
302 pgmeta_oid(p.make_pgmeta_oid()),
303 missing_loc(this),
304 past_intervals(
305 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306 *curmap),
307 stat_queue_item(this),
308 scrub_queued(false),
309 recovery_queued(false),
310 recovery_ops_active(0),
311 role(-1),
312 state(0),
313 send_notify(false),
314 pg_whoami(osd->whoami, p.shard),
315 need_up_thru(false),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324 finish_sync_event(NULL),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
327 active_pushes(0),
328 recovery_state(this),
329 pg_id(p),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336 osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339 std::stringstream ss;
340 ss << "PG " << info.pgid;
341 trace_endpoint.copy_name(ss.str());
342 #endif
343 osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348 pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350 osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356 handle.suspend_tp_timeout();
357 lock();
358 handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363 _lock.Lock(no_lockdep);
364 // if we have unrecorded dirty state with the lock dropped, there is a bug
365 assert(!dirty_info);
366 assert(!dirty_big_info);
367
368 dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373 stringstream out;
374 OSDMapRef mapref = osdmap_ref;
375 if (_lock.is_locked_by_me()) {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " " << *this << " ";
379 } else {
380 out << "osd." << osd->whoami
381 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382 << " pg[" << info.pgid << "(unlocked)] ";
383 }
384 return out.str();
385 }
386
387 /********* PG **********/
388
389 void PG::proc_master_log(
390 ObjectStore::Transaction& t, pg_info_t &oinfo,
391 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393 dout(10) << "proc_master_log for osd." << from << ": "
394 << olog << " " << omissing << dendl;
395 assert(!is_peered() && is_primary());
396
397 // merge log into our own log to build master log. no need to
398 // make any adjustments to their missing map; we are taking their
399 // log to be authoritative (i.e., their entries are, by definition,
400 // non-divergent).
401 merge_log(t, oinfo, olog, from);
402 peer_info[from] = oinfo;
403 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404 might_have_unfound.insert(from);
405
406 // See doc/dev/osd_internals/last_epoch_started
407 if (oinfo.last_epoch_started > info.last_epoch_started) {
408 info.last_epoch_started = oinfo.last_epoch_started;
409 dirty_info = true;
410 }
411 if (oinfo.last_interval_started > info.last_interval_started) {
412 info.last_interval_started = oinfo.last_interval_started;
413 dirty_info = true;
414 }
415 update_history(oinfo.history);
416 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417 info.last_epoch_started >= info.history.last_epoch_started);
418
419 peer_missing[from].claim(omissing);
420 }
421
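// Record a replica's info, log, and missing set.  Unlike proc_master_log, the
// replica's log is not merged into ours; PGLog::proc_replica_log only adjusts
// the replica's missing set for entries that diverge from our log.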
422 void PG::proc_replica_log(
423 pg_info_t &oinfo,
424 const pg_log_t &olog,
425 pg_missing_t& omissing,
426 pg_shard_t from)
427 {
428 dout(10) << "proc_replica_log for osd." << from << ": "
429 << oinfo << " " << olog << " " << omissing << dendl;
430
431 pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433 peer_info[from] = oinfo;
434 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435 might_have_unfound.insert(from);
436
437 for (map<hobject_t, pg_missing_item>::const_iterator i =
438 omissing.get_items().begin();
439 i != omissing.get_items().end();
440 ++i) {
441 dout(20) << " after missing " << i->first << " need " << i->second.need
442 << " have " << i->second.have << dendl;
443 }
444 peer_missing[from].claim(omissing);
445 }
446
447 bool PG::proc_replica_info(
448 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453 return false;
454 }
455
456 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457 dout(10) << " got info " << oinfo << " from down osd." << from
458 << " discarding" << dendl;
459 return false;
460 }
461
462 dout(10) << " got osd." << from << " " << oinfo << dendl;
463 assert(is_primary());
464 peer_info[from] = oinfo;
465 might_have_unfound.insert(from);
466
467 update_history(oinfo.history);
468
469 // stray?
470 if (!is_up(from) && !is_acting(from)) {
471 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472 stray_set.insert(from);
473 if (is_clean()) {
474 purge_strays();
475 }
476 }
477
478 // was this a new info? if so, update peers!
479 if (p == peer_info.end())
480 update_heartbeat_peers();
481
482 return true;
483 }
484
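// Remove an object and, for clone objects, its snap mapper entry in a single
// transaction.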
485 void PG::remove_snap_mapped_object(
486 ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488 t.remove(
489 coll,
490 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491 clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495 ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498 if (soid.snap < CEPH_MAXSNAP) {
499 int r = snap_mapper.remove_oid(
500 soid,
501 &_t);
502 if (!(r == 0 || r == -ENOENT)) {
503 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504 ceph_abort();
505 }
506 }
507 }
508
509 void PG::update_object_snap_mapping(
510 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513 assert(soid.snap < CEPH_MAXSNAP);
514 int r = snap_mapper.remove_oid(
515 soid,
516 &_t);
517 if (!(r == 0 || r == -ENOENT)) {
518 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519 ceph_abort();
520 }
521 snap_mapper.add_oid(
522 soid,
523 snaps,
524 &_t);
525 }
526
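// Thin wrappers around PGLog: merge an authoritative peer log into ours, or
// rewind our log to a new head; divergent entries are rolled back through
// PGLogEntryHandler as part of the transaction.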
527 void PG::merge_log(
528 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530 PGLogEntryHandler rollbacker{this, &t};
531 pg_log.merge_log(
532 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537 PGLogEntryHandler rollbacker{this, &t};
538 pg_log.rewind_divergent_log(
539 newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543 * Process information from a replica to determine if it could have any
544 * objects that I need.
545 *
546 * TODO: if the missing set becomes very large, this could get expensive.
547 * Instead, we probably want to just iterate over our unfound set.
548 */
549 bool PG::search_for_missing(
550 const pg_info_t &oinfo, const pg_missing_t &omissing,
551 pg_shard_t from,
552 RecoveryCtx *ctx)
553 {
554 uint64_t num_unfound_before = missing_loc.num_unfound();
555 bool found_missing = missing_loc.add_source_info(
556 from, oinfo, omissing, ctx->handle);
557 if (found_missing && num_unfound_before != missing_loc.num_unfound())
558 publish_stats_to_osd();
559 if (found_missing &&
560 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
561 CEPH_FEATURE_OSD_ERASURE_CODES)) {
562 pg_info_t tinfo(oinfo);
563 tinfo.pgid.shard = pg_whoami.shard;
564 (*(ctx->info_map))[from.osd].push_back(
565 make_pair(
566 pg_notify_t(
567 from.shard, pg_whoami.shard,
568 get_osdmap()->get_epoch(),
569 get_osdmap()->get_epoch(),
570 tinfo),
571 past_intervals));
572 }
573 return found_missing;
574 }
575
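// An object is readable with the given acting set if it needs no recovery, or
// if the shards known to hold it that are also in the acting set satisfy the
// is_readable predicate.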
576 bool PG::MissingLoc::readable_with_acting(
577 const hobject_t &hoid,
578 const set<pg_shard_t> &acting) const {
579 if (!needs_recovery(hoid))
580 return true;
581 if (is_deleted(hoid))
582 return false;
583 auto missing_loc_entry = missing_loc.find(hoid);
584 if (missing_loc_entry == missing_loc.end())
585 return false;
586 const set<pg_shard_t> &locs = missing_loc_entry->second;
587 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
588 set<pg_shard_t> have_acting;
589 for (set<pg_shard_t>::const_iterator i = locs.begin();
590 i != locs.end();
591 ++i) {
592 if (acting.count(*i))
593 have_acting.insert(*i);
594 }
595 return (*is_readable)(have_acting);
596 }
597
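// Register every shard in 'sources' as a location for every object in
// needs_recovery_map (delete entries excluded), periodically resetting the
// thread-pool timeout while iterating.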
598 void PG::MissingLoc::add_batch_sources_info(
599 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
600 {
601 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
602 << sources.size() << dendl;
603 unsigned loop = 0;
604 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
605 i != needs_recovery_map.end();
606 ++i) {
607 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
608 handle->reset_tp_timeout();
609 loop = 0;
610 }
611 if (i->second.is_delete())
612 continue;
613 missing_loc[i->first].insert(sources.begin(), sources.end());
614 missing_loc_sources.insert(sources.begin(), sources.end());
615 }
616 }
617
618 bool PG::MissingLoc::add_source_info(
619 pg_shard_t fromosd,
620 const pg_info_t &oinfo,
621 const pg_missing_t &omissing,
622 ThreadPool::TPHandle* handle)
623 {
624 bool found_missing = false;
625 unsigned loop = 0;
626 // found items?
627 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
628 p != needs_recovery_map.end();
629 ++p) {
630 const hobject_t &soid(p->first);
631 eversion_t need = p->second.need;
632 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
633 handle->reset_tp_timeout();
634 loop = 0;
635 }
636 if (p->second.is_delete()) {
637 ldout(pg->cct, 10) << __func__ << " " << soid
638 << " delete, ignoring source" << dendl;
639 found_missing = true;
640 continue;
641 }
642 if (oinfo.last_update < need) {
643 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
644 << " also missing on osd." << fromosd
645 << " (last_update " << oinfo.last_update
646 << " < needed " << need << ")" << dendl;
647 continue;
648 }
649 if (!oinfo.last_backfill.is_max() &&
650 !oinfo.last_backfill_bitwise) {
651 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
652 << " also missing on osd." << fromosd
653 << " (last_backfill " << oinfo.last_backfill
654 << " but with wrong sort order)"
655 << dendl;
656 continue;
657 }
658 if (p->first >= oinfo.last_backfill) {
659 // FIXME: this is _probably_ true, although it could conceivably
660 // be in the undefined region! Hmm!
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (past last_backfill " << oinfo.last_backfill
664 << ")" << dendl;
665 continue;
666 }
667 if (oinfo.last_complete < need) {
668 if (omissing.is_missing(soid)) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd << dendl;
671 continue;
672 }
673 }
674
675 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
676 << " is on osd." << fromosd << dendl;
677
678 missing_loc[soid].insert(fromosd);
679 missing_loc_sources.insert(fromosd);
680 found_missing = true;
681 }
682
683 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
684 << dendl;
685 return found_missing;
686 }
687
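// Send FULLLOG queries to every up peer in might_have_unfound that is not
// known to be empty and has not already been asked for its log or missing set,
// so unfound objects can be located.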
688 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
689 {
690 auto &missing = pg_log.get_missing();
691 uint64_t unfound = get_num_unfound();
692 assert(unfound > 0);
693
694 dout(10) << __func__ << " "
695 << missing.num_missing() << " missing, "
696 << unfound << " unfound"
697 << dendl;
698
699 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
700 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
701 for (; m != mend; ++m) {
702 pg_shard_t peer(*m);
703
704 if (!get_osdmap()->is_up(peer.osd)) {
705 dout(20) << __func__ << " skipping down osd." << peer << dendl;
706 continue;
707 }
708
709 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
710 if (iter != peer_info.end() &&
711 (iter->second.is_empty() || iter->second.dne())) {
712 // ignore empty peers
713 continue;
714 }
715
716 // If we've requested any of this stuff, the pg_missing_t information
717 // should be on its way.
718 // TODO: coalesce requested_* into a single data structure
719 if (peer_missing.find(peer) != peer_missing.end()) {
720 dout(20) << __func__ << ": osd." << peer
721 << ": we already have pg_missing_t" << dendl;
722 continue;
723 }
724 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
725 dout(20) << __func__ << ": osd." << peer
726 << ": in peer_log_requested" << dendl;
727 continue;
728 }
729 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
730 dout(20) << __func__ << ": osd." << peer
731 << ": in peer_missing_requested" << dendl;
732 continue;
733 }
734
735 // Request missing
736 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
737 << dendl;
738 peer_missing_requested.insert(peer);
739 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
740 pg_query_t(
741 pg_query_t::FULLLOG,
742 peer.shard, pg_whoami.shard,
743 info.history, get_osdmap()->get_epoch());
744 }
745 }
746
747 /******* PG ***********/
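// True if the primary or any other shard in actingbackfill still has entries
// in its missing set.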
748 bool PG::needs_recovery() const
749 {
750 assert(is_primary());
751
752 auto &missing = pg_log.get_missing();
753
754 if (missing.num_missing()) {
755 dout(10) << __func__ << " primary has " << missing.num_missing()
756 << " missing" << dendl;
757 return true;
758 }
759
760 assert(!actingbackfill.empty());
761 set<pg_shard_t>::const_iterator end = actingbackfill.end();
762 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
763 for (; a != end; ++a) {
764 if (*a == get_primary()) continue;
765 pg_shard_t peer = *a;
766 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
767 if (pm == peer_missing.end()) {
768 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
769 << dendl;
770 continue;
771 }
772 if (pm->second.num_missing()) {
773 dout(10) << __func__ << " osd." << peer << " has "
774 << pm->second.num_missing() << " missing" << dendl;
775 return true;
776 }
777 }
778
779 dout(10) << __func__ << " is recovered" << dendl;
780 return false;
781 }
782
783 bool PG::needs_backfill() const
784 {
785 assert(is_primary());
786
787 // The only OSDs that can possibly need backfill are those listed in
788 // backfill_targets.
789 set<pg_shard_t>::const_iterator end = backfill_targets.end();
790 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
791 for (; a != end; ++a) {
792 pg_shard_t peer = *a;
793 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
794 if (!pi->second.last_backfill.is_max()) {
795 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
796 return true;
797 }
798 }
799
800 dout(10) << __func__ << " does not need backfill" << dendl;
801 return false;
802 }
803
804
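// Sanity-check past_intervals against the bounds required by
// get_required_past_interval_bounds(): either both are empty or they must
// cover the same range; mismatches are logged and (when fatal) assert.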
805 void PG::check_past_interval_bounds() const
806 {
807 auto rpib = get_required_past_interval_bounds(
808 info,
809 osd->get_superblock().oldest_map);
810 if (rpib.first >= rpib.second) {
811 if (!past_intervals.empty()) {
812 osd->clog->error() << info.pgid << " required past_interval bounds are"
813 << " empty [" << rpib << ") but past_intervals is not: "
814 << past_intervals;
815 derr << info.pgid << " required past_interval bounds are"
816 << " empty [" << rpib << ") but past_intervals is not: "
817 << past_intervals << dendl;
818 }
819 } else {
820 if (past_intervals.empty()) {
821 osd->clog->error() << info.pgid << " required past_interval bounds are"
822 << " not empty [" << rpib << ") but past_intervals "
823 << past_intervals << " is empty";
824 derr << info.pgid << " required past_interval bounds are"
825 << " not empty [" << rpib << ") but past_intervals "
826 << past_intervals << " is empty" << dendl;
827 assert(!past_intervals.empty());
828 }
829
830 auto apib = past_intervals.get_bounds();
831 if (apib.first > rpib.first) {
832 osd->clog->error() << info.pgid << " past_intervals [" << apib
833 << ") start interval does not contain the required"
834 << " bound [" << rpib << ") start";
835 derr << info.pgid << " past_intervals [" << apib
836 << ") start interval does not contain the required"
837 << " bound [" << rpib << ") start" << dendl;
838 assert(0 == "past_interval start interval mismatch");
839 }
840 if (apib.second != rpib.second) {
841 osd->clog->error() << info.pgid << " past_interval bound [" << apib
842 << ") end does not match required [" << rpib
843 << ") end";
844 derr << info.pgid << " past_interval bound [" << apib
845 << ") end does not match required [" << rpib
846 << ") end" << dendl;
847 assert(0 == "past_interval end mismatch");
848 }
849 }
850 }
851
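// Clear need_up_thru once the map records an up_thru for this OSD at or
// beyond the start of the current interval; returns true if the flag was
// cleared.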
852 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
853 {
854 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
855 if (need_up_thru &&
856 up_thru >= info.history.same_interval_since) {
857 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
858 need_up_thru = false;
859 return true;
860 }
861 return false;
862 }
863
864 void PG::remove_down_peer_info(const OSDMapRef osdmap)
865 {
866 // Remove any downed osds from peer_info
867 bool removed = false;
868 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
869 while (p != peer_info.end()) {
870 if (!osdmap->is_up(p->first.osd)) {
871 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
872 peer_missing.erase(p->first);
873 peer_log_requested.erase(p->first);
874 peer_missing_requested.erase(p->first);
875 peer_info.erase(p++);
876 removed = true;
877 } else
878 ++p;
879 }
880
881 // if we removed anyone, update peers (which include peer_info)
882 if (removed)
883 update_heartbeat_peers();
884 check_recovery_sources(osdmap);
885 }
886
887 /*
888 * Returns false if any non-lost OSD in might_have_unfound has yet to be queried; otherwise true.
889 */
890 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
891 {
892 assert(is_primary());
893
894 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
895 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
896 for (; peer != mend; ++peer) {
897 if (peer_missing.count(*peer))
898 continue;
899 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
900 if (iter != peer_info.end() &&
901 (iter->second.is_empty() || iter->second.dne()))
902 continue;
903 if (!osdmap->exists(peer->osd))
904 continue;
905 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
906 if (osd_info.lost_at <= osd_info.up_from) {
907 // If there is even one OSD in might_have_unfound that isn't lost, we
908 // still might retrieve our unfound.
909 return false;
910 }
911 }
912 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
913 << " have been queried or are marked lost" << dendl;
914 return true;
915 }
916
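// Build the prior set: the OSDs from past intervals that must be probed (or
// are known to be up, down, lost, or nonexistent) before peering can proceed,
// and decide whether the monitor must first record a new up_thru for us.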
917 PastIntervals::PriorSet PG::build_prior()
918 {
919 if (1) {
920 // sanity check
921 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
922 it != peer_info.end();
923 ++it) {
924 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
925 }
926 }
927
928 const OSDMap &osdmap = *get_osdmap();
929 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
930 pool.info.ec_pool(),
931 info.history.last_epoch_started,
932 get_pgbackend()->get_is_recoverable_predicate(),
933 [&](epoch_t start, int osd, epoch_t *lost_at) {
934 const osd_info_t *pinfo = 0;
935 if (osdmap.exists(osd)) {
936 pinfo = &osdmap.get_info(osd);
937 if (lost_at)
938 *lost_at = pinfo->lost_at;
939 }
940
941 if (osdmap.is_up(osd)) {
942 return PastIntervals::UP;
943 } else if (!pinfo) {
944 return PastIntervals::DNE;
945 } else if (pinfo->lost_at > start) {
946 return PastIntervals::LOST;
947 } else {
948 return PastIntervals::DOWN;
949 }
950 },
951 up,
952 acting,
953 this);
954
955 if (prior.pg_down) {
956 state_set(PG_STATE_DOWN);
957 }
958
959 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
960 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
961 << " < same_since " << info.history.same_interval_since
962 << ", must notify monitor" << dendl;
963 need_up_thru = true;
964 } else {
965 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
966 << " >= same_since " << info.history.same_interval_since
967 << ", all is well" << dendl;
968 need_up_thru = false;
969 }
970 set_probe_targets(prior.probe);
971 return prior;
972 }
973
974 void PG::clear_primary_state()
975 {
976 dout(10) << "clear_primary_state" << dendl;
977
978 // clear peering state
979 stray_set.clear();
980 peer_log_requested.clear();
981 peer_missing_requested.clear();
982 peer_info.clear();
983 peer_missing.clear();
984 need_up_thru = false;
985 peer_last_complete_ondisk.clear();
986 peer_activated.clear();
987 min_last_complete_ondisk = eversion_t();
988 pg_trim_to = eversion_t();
989 might_have_unfound.clear();
990 projected_log = PGLog::IndexedLog();
991
992 last_update_ondisk = eversion_t();
993
994 snap_trimq.clear();
995
996 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
997
998 missing_loc.clear();
999
1000 release_pg_backoffs();
1001
1002 pg_log.reset_recovery_pointers();
1003
1004 scrubber.reserved_peers.clear();
1005 scrub_after_recovery = false;
1006
1007 agent_clear();
1008 }
1009
1010 PG::Scrubber::Scrubber()
1011 : reserved(false), reserve_failed(false),
1012 epoch_start(0),
1013 active(false),
1014 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
1015 must_scrub(false), must_deep_scrub(false), must_repair(false),
1016 auto_repair(false),
1017 num_digest_updates_pending(0),
1018 state(INACTIVE),
1019 deep(false),
1020 seed(0)
1021 {}
1022
1023 PG::Scrubber::~Scrubber() {}
1024
1025 /**
1026 * find_best_info
1027 *
1028 * Returns an iterator to the best info in infos sorted by:
1029 * 1) Prefer newer last_update
1030 * 2) Prefer longer tail if it brings another info into contiguity
1031 * 3) Prefer current primary
1032 */
1033 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1034 const map<pg_shard_t, pg_info_t> &infos,
1035 bool restrict_to_up_acting,
1036 bool *history_les_bound) const
1037 {
1038 assert(history_les_bound);
1039 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1040 * to make changes to this process. Also, make sure to update it
1041 * when you find bugs! */
1042 eversion_t min_last_update_acceptable = eversion_t::max();
1043 epoch_t max_last_epoch_started_found = 0;
1044 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1045 i != infos.end();
1046 ++i) {
1047 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1048 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1049 *history_les_bound = true;
1050 max_last_epoch_started_found = i->second.history.last_epoch_started;
1051 }
1052 if (!i->second.is_incomplete() &&
1053 max_last_epoch_started_found < i->second.last_epoch_started) {
1054 max_last_epoch_started_found = i->second.last_epoch_started;
1055 }
1056 }
1057 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1058 i != infos.end();
1059 ++i) {
1060 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1061 if (min_last_update_acceptable > i->second.last_update)
1062 min_last_update_acceptable = i->second.last_update;
1063 }
1064 }
1065 if (min_last_update_acceptable == eversion_t::max())
1066 return infos.end();
1067
1068 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1069 // find osd with newest last_update (oldest for ec_pool).
1070 // if there are multiples, prefer
1071 // - a longer tail, if it brings another peer into log contiguity
1072 // - the current primary
1073 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1074 p != infos.end();
1075 ++p) {
1076 if (restrict_to_up_acting && !is_up(p->first) &&
1077 !is_acting(p->first))
1078 continue;
1079 // Only consider peers with last_update >= min_last_update_acceptable
1080 if (p->second.last_update < min_last_update_acceptable)
1081 continue;
1082 // Disqualify anyone with a too old last_epoch_started
1083 if (p->second.last_epoch_started < max_last_epoch_started_found)
1084 continue;
1085 // Disqualify anyone who is incomplete (not fully backfilled)
1086 if (p->second.is_incomplete())
1087 continue;
1088 if (best == infos.end()) {
1089 best = p;
1090 continue;
1091 }
1092 // Prefer newer last_update
1093 if (pool.info.require_rollback()) {
1094 if (p->second.last_update > best->second.last_update)
1095 continue;
1096 if (p->second.last_update < best->second.last_update) {
1097 best = p;
1098 continue;
1099 }
1100 } else {
1101 if (p->second.last_update < best->second.last_update)
1102 continue;
1103 if (p->second.last_update > best->second.last_update) {
1104 best = p;
1105 continue;
1106 }
1107 }
1108
1109 // Prefer longer tail
1110 if (p->second.log_tail > best->second.log_tail) {
1111 continue;
1112 } else if (p->second.log_tail < best->second.log_tail) {
1113 best = p;
1114 continue;
1115 }
1116
1117 // prefer current primary (usually the caller), all things being equal
1118 if (p->first == pg_whoami) {
1119 dout(10) << "calc_acting prefer osd." << p->first
1120 << " because it is current primary" << dendl;
1121 best = p;
1122 continue;
1123 }
1124 }
1125 return best;
1126 }
1127
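/**
 * calculate the desired acting set for an erasure-coded pool.
 *
 * Each position i is filled by up[i] if it is complete and its log reaches the
 * auth shard's tail, otherwise by acting[i], otherwise (unless
 * restrict_to_up_acting) by any suitable stray shard for that position; an
 * up[i] that cannot be used directly is queued for backfill.
 */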
1128 void PG::calc_ec_acting(
1129 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1130 unsigned size,
1131 const vector<int> &acting,
1132 pg_shard_t acting_primary,
1133 const vector<int> &up,
1134 pg_shard_t up_primary,
1135 const map<pg_shard_t, pg_info_t> &all_info,
1136 bool restrict_to_up_acting,
1137 vector<int> *_want,
1138 set<pg_shard_t> *backfill,
1139 set<pg_shard_t> *acting_backfill,
1140 pg_shard_t *want_primary,
1141 ostream &ss)
1142 {
1143 vector<int> want(size, CRUSH_ITEM_NONE);
1144 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1145 unsigned usable = 0;
1146 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1147 i != all_info.end();
1148 ++i) {
1149 all_info_by_shard[i->first.shard].insert(i->first);
1150 }
1151 for (uint8_t i = 0; i < want.size(); ++i) {
1152 ss << "For position " << (unsigned)i << ": ";
1153 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1154 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1155 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1156 auth_log_shard->second.log_tail) {
1157 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1158 want[i] = up[i];
1159 ++usable;
1160 continue;
1161 }
1162 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1163 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1164 << " and ";
1165 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1166 }
1167
1168 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1169 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1170 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1171 auth_log_shard->second.log_tail) {
1172 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1173 want[i] = acting[i];
1174 ++usable;
1175 } else if (!restrict_to_up_acting) {
1176 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1177 j != all_info_by_shard[shard_id_t(i)].end();
1178 ++j) {
1179 assert(j->shard == i);
1180 if (!all_info.find(*j)->second.is_incomplete() &&
1181 all_info.find(*j)->second.last_update >=
1182 auth_log_shard->second.log_tail) {
1183 ss << " selecting stray: " << *j << std::endl;
1184 want[i] = j->osd;
1185 ++usable;
1186 break;
1187 }
1188 }
1189 if (want[i] == CRUSH_ITEM_NONE)
1190 ss << " failed to fill position " << (int)i << std::endl;
1191 }
1192 }
1193
1194 bool found_primary = false;
1195 for (uint8_t i = 0; i < want.size(); ++i) {
1196 if (want[i] != CRUSH_ITEM_NONE) {
1197 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1198 if (!found_primary) {
1199 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1200 found_primary = true;
1201 }
1202 }
1203 }
1204 acting_backfill->insert(backfill->begin(), backfill->end());
1205 _want->swap(want);
1206 }
1207
1208 /**
1209 * calculate the desired acting set.
1210 *
1211 * Choose an appropriate acting set. Prefer up[0], unless it is
1212 * incomplete, or another osd has a longer tail that allows us to
1213 * bring other up nodes up to date.
1214 */
1215 void PG::calc_replicated_acting(
1216 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1217 unsigned size,
1218 const vector<int> &acting,
1219 pg_shard_t acting_primary,
1220 const vector<int> &up,
1221 pg_shard_t up_primary,
1222 const map<pg_shard_t, pg_info_t> &all_info,
1223 bool restrict_to_up_acting,
1224 vector<int> *want,
1225 set<pg_shard_t> *backfill,
1226 set<pg_shard_t> *acting_backfill,
1227 pg_shard_t *want_primary,
1228 ostream &ss)
1229 {
1230 ss << "calc_acting newest update on osd." << auth_log_shard->first
1231 << " with " << auth_log_shard->second
1232 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1233 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1234
1235 // select primary
1236 map<pg_shard_t,pg_info_t>::const_iterator primary;
1237 if (up.size() &&
1238 !all_info.find(up_primary)->second.is_incomplete() &&
1239 all_info.find(up_primary)->second.last_update >=
1240 auth_log_shard->second.log_tail) {
1241 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1242 primary = all_info.find(up_primary); // prefer up[0], all thing being equal
1243 } else {
1244 assert(!auth_log_shard->second.is_incomplete());
1245 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1246 << " selected as primary instead" << std::endl;
1247 primary = auth_log_shard;
1248 }
1249
1250 ss << "calc_acting primary is osd." << primary->first
1251 << " with " << primary->second << std::endl;
1252 *want_primary = primary->first;
1253 want->push_back(primary->first.osd);
1254 acting_backfill->insert(primary->first);
1255 unsigned usable = 1;
1256
1257 // select replicas that have log contiguity with primary.
1258 // prefer up, then acting, then any peer_info osds
1259 for (vector<int>::const_iterator i = up.begin();
1260 i != up.end();
1261 ++i) {
1262 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1263 if (up_cand == primary->first)
1264 continue;
1265 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1266 if (cur_info.is_incomplete() ||
1267 cur_info.last_update < MIN(
1268 primary->second.log_tail,
1269 auth_log_shard->second.log_tail)) {
1270 /* We include auth_log_shard->second.log_tail because in GetLog,
1271 * we will request logs back to the min last_update over our
1272 * acting_backfill set, which will result in our log being extended
1273 * as far backwards as necessary to pick up any peers which can
1274 * be log recovered by auth_log_shard's log */
1275 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1276 backfill->insert(up_cand);
1277 acting_backfill->insert(up_cand);
1278 } else {
1279 want->push_back(*i);
1280 acting_backfill->insert(up_cand);
1281 usable++;
1282 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1283 }
1284 }
1285
1286 // This no longer has backfill OSDs, but they are covered above.
1287 for (vector<int>::const_iterator i = acting.begin();
1288 i != acting.end();
1289 ++i) {
1290 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1291 if (usable >= size)
1292 break;
1293
1294 // skip up osds we already considered above
1295 if (acting_cand == primary->first)
1296 continue;
1297 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1298 if (up_it != up.end())
1299 continue;
1300
1301 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1302 if (cur_info.is_incomplete() ||
1303 cur_info.last_update < primary->second.log_tail) {
1304 ss << " shard " << acting_cand << " (stray) REJECTED "
1305 << cur_info << std::endl;
1306 } else {
1307 want->push_back(*i);
1308 acting_backfill->insert(acting_cand);
1309 ss << " shard " << acting_cand << " (stray) accepted "
1310 << cur_info << std::endl;
1311 usable++;
1312 }
1313 }
1314
1315 if (restrict_to_up_acting) {
1316 return;
1317 }
1318 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1319 i != all_info.end();
1320 ++i) {
1321 if (usable >= size)
1322 break;
1323
1324 // skip up osds we already considered above
1325 if (i->first == primary->first)
1326 continue;
1327 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1328 if (up_it != up.end())
1329 continue;
1330 vector<int>::const_iterator acting_it = find(
1331 acting.begin(), acting.end(), i->first.osd);
1332 if (acting_it != acting.end())
1333 continue;
1334
1335 if (i->second.is_incomplete() ||
1336 i->second.last_update < primary->second.log_tail) {
1337 ss << " shard " << i->first << " (stray) REJECTED "
1338 << i->second << std::endl;
1339 } else {
1340 want->push_back(i->first.osd);
1341 acting_backfill->insert(i->first);
1342 ss << " shard " << i->first << " (stray) accepted "
1343 << i->second << std::endl;
1344 usable++;
1345 }
1346 }
1347 }
1348
1349 /**
1350 * choose acting
1351 *
1352 * calculate the desired acting, and request a change with the monitor
1353 * if it differs from the current acting.
1354 *
1355 * if restrict_to_up_acting=true, we filter out anything that's not in
1356 * up/acting. in order to lift this restriction, we need to
1357 * 1) check whether it's worth switching the acting set any time we get
1358 * a new pg info (not just here, when recovery finishes)
1359 * 2) check whether anything in want_acting went down on each new map
1360 * (and, if so, calculate a new want_acting)
1361 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1362 * TODO!
1363 */
1364 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1365 bool restrict_to_up_acting,
1366 bool *history_les_bound)
1367 {
1368 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1369 all_info[pg_whoami] = info;
1370
1371 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1372 p != all_info.end();
1373 ++p) {
1374 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1375 }
1376
1377 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1378 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1379
1380 if (auth_log_shard == all_info.end()) {
1381 if (up != acting) {
1382 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1383 << " reverting to up" << dendl;
1384 want_acting = up;
1385 vector<int> empty;
1386 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1387 } else {
1388 dout(10) << "choose_acting failed" << dendl;
1389 assert(want_acting.empty());
1390 }
1391 return false;
1392 }
1393
1394 assert(!auth_log_shard->second.is_incomplete());
1395 auth_log_shard_id = auth_log_shard->first;
1396
1397 set<pg_shard_t> want_backfill, want_acting_backfill;
1398 vector<int> want;
1399 pg_shard_t want_primary;
1400 stringstream ss;
1401 if (!pool.info.ec_pool())
1402 calc_replicated_acting(
1403 auth_log_shard,
1404 get_osdmap()->get_pg_size(info.pgid.pgid),
1405 acting,
1406 primary,
1407 up,
1408 up_primary,
1409 all_info,
1410 restrict_to_up_acting,
1411 &want,
1412 &want_backfill,
1413 &want_acting_backfill,
1414 &want_primary,
1415 ss);
1416 else
1417 calc_ec_acting(
1418 auth_log_shard,
1419 get_osdmap()->get_pg_size(info.pgid.pgid),
1420 acting,
1421 primary,
1422 up,
1423 up_primary,
1424 all_info,
1425 restrict_to_up_acting,
1426 &want,
1427 &want_backfill,
1428 &want_acting_backfill,
1429 &want_primary,
1430 ss);
1431 dout(10) << ss.str() << dendl;
1432
1433 unsigned num_want_acting = 0;
1434 set<pg_shard_t> have;
1435 for (int i = 0; i < (int)want.size(); ++i) {
1436 if (want[i] != CRUSH_ITEM_NONE) {
1437 ++num_want_acting;
1438 have.insert(
1439 pg_shard_t(
1440 want[i],
1441 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1442 }
1443 }
1444
1445 // We go incomplete if below min_size for ec_pools since backfill
1446 // does not currently maintain rollbackability
1447 // Otherwise, we will go "peered", but not "active"
1448 if (num_want_acting < pool.info.min_size &&
1449 (pool.info.ec_pool() ||
1450 !cct->_conf->osd_allow_recovery_below_min_size)) {
1451 want_acting.clear();
1452 dout(10) << "choose_acting failed, below min size" << dendl;
1453 return false;
1454 }
1455
1456 /* Check whether we have enough acting shards to later perform recovery */
1457 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1458 get_pgbackend()->get_is_recoverable_predicate());
1459 if (!(*recoverable_predicate)(have)) {
1460 want_acting.clear();
1461 dout(10) << "choose_acting failed, not recoverable" << dendl;
1462 return false;
1463 }
1464
1465 if (want != acting) {
1466 dout(10) << "choose_acting want " << want << " != acting " << acting
1467 << ", requesting pg_temp change" << dendl;
1468 want_acting = want;
1469
1470 if (want_acting == up) {
1471 // There can't be any pending backfill if
1472 // want is the same as crush map up OSDs.
1473 assert(want_backfill.empty());
1474 vector<int> empty;
1475 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1476 } else
1477 osd->queue_want_pg_temp(info.pgid.pgid, want);
1478 return false;
1479 }
1480 want_acting.clear();
1481 actingbackfill = want_acting_backfill;
1482 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1483 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1484 if (backfill_targets.empty()) {
1485 // Caller is GetInfo
1486 backfill_targets = want_backfill;
1487 }
1488 // Will not change if already set because up would have had to change
1489 // Verify that nothing in backfill is in stray_set
1490 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1491 i != want_backfill.end();
1492 ++i) {
1493 assert(stray_set.find(*i) == stray_set.end());
1494 }
1495 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1496 << want_backfill << dendl;
1497 return true;
1498 }
1499
1500 /* Build the might_have_unfound set.
1501 *
1502 * This is used by the primary OSD during recovery.
1503 *
1504 * This set tracks the OSDs which might have unfound objects that the primary
1505 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1506 * will remove the OSD from the set.
1507 */
1508 void PG::build_might_have_unfound()
1509 {
1510 assert(might_have_unfound.empty());
1511 assert(is_primary());
1512
1513 dout(10) << __func__ << dendl;
1514
1515 check_past_interval_bounds();
1516
1517 might_have_unfound = past_intervals.get_might_have_unfound(
1518 pg_whoami,
1519 pool.info.ec_pool());
1520
1521 // include any (stray) peers
1522 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1523 p != peer_info.end();
1524 ++p)
1525 might_have_unfound.insert(p->first);
1526
1527 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1528 }
1529
1530 struct C_PG_ActivateCommitted : public Context {
1531 PGRef pg;
1532 epoch_t epoch;
1533 epoch_t activation_epoch;
1534 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1535 : pg(p), epoch(e), activation_epoch(ae) {}
1536 void finish(int r) override {
1537 pg->_activate_committed(epoch, activation_epoch);
1538 }
1539 };
1540
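/**
 * activate
 *
 * Leave peering: update last_epoch_started/last_interval_started, register the
 * on-commit callback, initialize snap_trimq and the last_complete pointer, and,
 * on the primary, bring each actingbackfill peer up to date with an MOSDPGLog
 * (or reset it for backfill) and populate missing_loc for recovery.
 */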
1541 void PG::activate(ObjectStore::Transaction& t,
1542 epoch_t activation_epoch,
1543 list<Context*>& tfin,
1544 map<int, map<spg_t,pg_query_t> >& query_map,
1545 map<int,
1546 vector<
1547 pair<pg_notify_t,
1548 PastIntervals> > > *activator_map,
1549 RecoveryCtx *ctx)
1550 {
1551 assert(!is_peered());
1552 assert(scrubber.callbacks.empty());
1553 assert(callbacks_for_degraded_object.empty());
1554
1555 // twiddle pg state
1556 state_clear(PG_STATE_DOWN);
1557
1558 send_notify = false;
1559
1560 if (is_primary()) {
1561 // only update primary last_epoch_started if we will go active
1562 if (acting.size() >= pool.info.min_size) {
1563 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1564 info.last_epoch_started <= activation_epoch);
1565 info.last_epoch_started = activation_epoch;
1566 info.last_interval_started = info.history.same_interval_since;
1567 }
1568 } else if (is_acting(pg_whoami)) {
1569 /* update last_epoch_started on acting replica to whatever the primary sent
1570 * unless it's smaller (could happen if we are going peered rather than
1571 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1572 if (info.last_epoch_started < activation_epoch) {
1573 info.last_epoch_started = activation_epoch;
1574 info.last_interval_started = info.history.same_interval_since;
1575 }
1576 }
1577
1578 auto &missing = pg_log.get_missing();
1579
1580 if (is_primary()) {
1581 last_update_ondisk = info.last_update;
1582 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1583 }
1584 last_update_applied = info.last_update;
1585 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1586
1587 need_up_thru = false;
1588
1589 // write pg info, log
1590 dirty_info = true;
1591 dirty_big_info = true; // maybe
1592
1593 // find out when we commit
1594 t.register_on_complete(
1595 new C_PG_ActivateCommitted(
1596 this,
1597 get_osdmap()->get_epoch(),
1598 activation_epoch));
1599
1600 // initialize snap_trimq
1601 if (is_primary()) {
1602 dout(20) << "activate - purged_snaps " << info.purged_snaps
1603 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1604 snap_trimq = pool.cached_removed_snaps;
1605 interval_set<snapid_t> intersection;
1606 intersection.intersection_of(snap_trimq, info.purged_snaps);
1607 if (intersection == info.purged_snaps) {
1608 snap_trimq.subtract(info.purged_snaps);
1609 } else {
1610 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1611 << ") is not a subset of pool.cached_removed_snaps ("
1612 << pool.cached_removed_snaps << ")" << dendl;
1613 snap_trimq.subtract(intersection);
1614 }
1615 }
1616
1617 // init complete pointer
1618 if (missing.num_missing() == 0) {
1619 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1620 << " -> " << info.last_update << dendl;
1621 info.last_complete = info.last_update;
1622 pg_log.reset_recovery_pointers();
1623 } else {
1624 dout(10) << "activate - not complete, " << missing << dendl;
1625 pg_log.activate_not_complete(info);
1626 }
1627
1628 log_weirdness();
1629
1630 // if primary..
1631 if (is_primary()) {
1632 assert(ctx);
1633 // start up replicas
1634
1635 assert(!actingbackfill.empty());
1636 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1637 i != actingbackfill.end();
1638 ++i) {
1639 if (*i == pg_whoami) continue;
1640 pg_shard_t peer = *i;
1641 assert(peer_info.count(peer));
1642 pg_info_t& pi = peer_info[peer];
1643
1644 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1645
1646 MOSDPGLog *m = 0;
1647 assert(peer_missing.count(peer));
1648 pg_missing_t& pm = peer_missing[peer];
1649
1650 bool needs_past_intervals = pi.dne();
1651
1652 /*
1653 * cover case where peer sort order was different and
1654 * last_backfill cannot be interpreted
1655 */
1656 bool force_restart_backfill =
1657 !pi.last_backfill.is_max() &&
1658 !pi.last_backfill_bitwise;
1659
1660 if (pi.last_update == info.last_update && !force_restart_backfill) {
1661 // empty log
1662 if (!pi.last_backfill.is_max())
1663 osd->clog->info() << info.pgid << " continuing backfill to osd."
1664 << peer
1665 << " from (" << pi.log_tail << "," << pi.last_update
1666 << "] " << pi.last_backfill
1667 << " to " << info.last_update;
1668 if (!pi.is_empty() && activator_map) {
1669 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1670 (*activator_map)[peer.osd].push_back(
1671 make_pair(
1672 pg_notify_t(
1673 peer.shard, pg_whoami.shard,
1674 get_osdmap()->get_epoch(),
1675 get_osdmap()->get_epoch(),
1676 info),
1677 past_intervals));
1678 } else {
1679 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1680 m = new MOSDPGLog(
1681 i->shard, pg_whoami.shard,
1682 get_osdmap()->get_epoch(), info);
1683 }
1684 } else if (
1685 pg_log.get_tail() > pi.last_update ||
1686 pi.last_backfill == hobject_t() ||
1687 force_restart_backfill ||
1688 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1689 /* ^ This last case covers a situation where a replica is not contiguous
1690 * with the auth_log, but is contiguous with this replica. Reshuffling
1691 * the active set to handle this would be tricky, so instead we just go
1692 * ahead and backfill it anyway. This is probably preferable in any
1693 * case since the replica in question would have to be significantly
1694 * behind.
1695 */
1696 // backfill
1697 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1698 << " from (" << pi.log_tail << "," << pi.last_update
1699 << "] " << pi.last_backfill
1700 << " to " << info.last_update;
1701
1702 pi.last_update = info.last_update;
1703 pi.last_complete = info.last_update;
1704 pi.set_last_backfill(hobject_t());
1705 pi.last_epoch_started = info.last_epoch_started;
1706 pi.last_interval_started = info.last_interval_started;
1707 pi.history = info.history;
1708 pi.hit_set = info.hit_set;
1709 pi.stats.stats.clear();
1710
1711 // initialize peer with our purged_snaps.
1712 pi.purged_snaps = info.purged_snaps;
1713
1714 m = new MOSDPGLog(
1715 i->shard, pg_whoami.shard,
1716 get_osdmap()->get_epoch(), pi);
1717
1718 // send some recent log, so that op dup detection works well.
1719 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1720 m->info.log_tail = m->log.tail;
1721 pi.log_tail = m->log.tail; // sigh...
1722
1723 pm.clear();
1724 } else {
1725 // catch up
1726 assert(pg_log.get_tail() <= pi.last_update);
1727 m = new MOSDPGLog(
1728 i->shard, pg_whoami.shard,
1729 get_osdmap()->get_epoch(), info);
1730 // send new stuff to append to replicas log
1731 m->log.copy_after(pg_log.get_log(), pi.last_update);
1732 }
1733
1734 // share past_intervals if we are creating the pg on the replica
1735 // based on whether our info for that peer was dne() *before*
1736 // updating pi.history in the backfill block above.
1737 if (m && needs_past_intervals)
1738 m->past_intervals = past_intervals;
1739
1740 // update local version of peer's missing list!
1741 if (m && pi.last_backfill != hobject_t()) {
1742 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1743 p != m->log.log.end();
1744 ++p) {
1745 if (p->soid <= pi.last_backfill &&
1746 !p->is_error()) {
1747 if (perform_deletes_during_peering() && p->is_delete()) {
1748 pm.rm(p->soid, p->version);
1749 } else {
1750 pm.add_next_event(*p);
1751 }
1752 }
1753 }
1754 }
1755
1756 if (m) {
1757 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1758 //m->log.print(cout);
1759 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1760 }
1761
1762 // peer now has
1763 pi.last_update = info.last_update;
1764
1765 // update our missing
1766 if (pm.num_missing() == 0) {
1767 pi.last_complete = pi.last_update;
1768 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1769 } else {
1770 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1771 }
1772 }
1773
1774 // Set up missing_loc
1775 set<pg_shard_t> complete_shards;
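// a shard is "complete" if it has nothing missing and (for non-primary
// shards) is fully backfilled; if all but one shard turns out to be
// complete, they can be batch-added as recovery sources below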
1776 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1777 i != actingbackfill.end();
1778 ++i) {
1779 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1780 if (*i == get_primary()) {
1781 missing_loc.add_active_missing(missing);
1782 if (!missing.have_missing())
1783 complete_shards.insert(*i);
1784 } else {
1785 auto peer_missing_entry = peer_missing.find(*i);
1786 assert(peer_missing_entry != peer_missing.end());
1787 missing_loc.add_active_missing(peer_missing_entry->second);
1788 if (!peer_missing_entry->second.have_missing() &&
1789 peer_info[*i].last_backfill.is_max())
1790 complete_shards.insert(*i);
1791 }
1792 }
1793 // If necessary, create might_have_unfound to help us find our unfound objects.
1794 // NOTE: It's important that we build might_have_unfound before trimming the
1795 // past intervals.
1796 might_have_unfound.clear();
1797 if (needs_recovery()) {
1798 // If only one shard has missing objects, we add all other shards as recovery
1799 // sources. This is considered safe since the PGLogs have been merged locally,
1800 // and it covers the vast majority of use cases, e.g. one OSD/host being down
1801 // for a while for hardware repair.
1802 if (complete_shards.size() + 1 == actingbackfill.size()) {
1803 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1804 } else {
1805 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1806 ctx->handle);
1807 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1808 i != actingbackfill.end();
1809 ++i) {
1810 if (*i == pg_whoami) continue;
1811 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1812 assert(peer_missing.count(*i));
1813 assert(peer_info.count(*i));
1814 missing_loc.add_source_info(
1815 *i,
1816 peer_info[*i],
1817 peer_missing[*i],
1818 ctx->handle);
1819 }
1820 }
1821 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1822 i != peer_missing.end();
1823 ++i) {
1824 if (is_actingbackfill(i->first))
1825 continue;
1826 assert(peer_info.count(i->first));
1827 search_for_missing(
1828 peer_info[i->first],
1829 i->second,
1830 i->first,
1831 ctx);
1832 }
1833
1834 build_might_have_unfound();
1835
1836 if (have_unfound())
1837 discover_all_missing(query_map);
1838 }
1839
1840 // num_objects_degraded, if calculated, should reflect this too, unless
1841 // nothing is missing and we are about to go clean.
1842 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1843 state_set(PG_STATE_UNDERSIZED);
1844 }
1845
1846 state_set(PG_STATE_ACTIVATING);
1847 release_pg_backoffs();
1848 projected_last_update = info.last_update;
1849 }
1850 if (acting.size() >= pool.info.min_size) {
1851 PGLogEntryHandler handler{this, &t};
1852 pg_log.roll_forward(&handler);
1853 }
1854 }
1855
1856 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1857 {
1858 // only check MOSDOp
1859 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1860 return true;
1861
1862 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1863
1864 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1865 if (!session) {
1866 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1867 return false;
1868 }
1869 OSDCap& caps = session->caps;
1870 session->put();
1871
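// use the object locator key for the cap check when one is set,
// otherwise fall back to the object name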
1872 const string &key = req->get_hobj().get_key().empty() ?
1873 req->get_oid().name :
1874 req->get_hobj().get_key();
1875
1876 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1877 pool.auid, key,
1878 op->need_read_cap(),
1879 op->need_write_cap(),
1880 op->classes());
1881
1882 dout(20) << "op_has_sufficient_caps "
1883 << "session=" << session
1884 << " pool=" << pool.id << " (" << pool.name
1885 << " " << req->get_hobj().nspace
1886 << ") owner=" << pool.auid
1887 << " need_read_cap=" << op->need_read_cap()
1888 << " need_write_cap=" << op->need_write_cap()
1889 << " classes=" << op->classes()
1890 << " -> " << (cap ? "yes" : "NO")
1891 << dendl;
1892 return cap;
1893 }
1894
1895 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1896 {
1897 lock();
1898 if (pg_has_reset_since(epoch)) {
1899 dout(10) << "_activate_committed " << epoch
1900 << ", that was an old interval" << dendl;
1901 } else if (is_primary()) {
1902 peer_activated.insert(pg_whoami);
1903 dout(10) << "_activate_committed " << epoch
1904 << " peer_activated now " << peer_activated
1905 << " last_interval_started " << info.history.last_interval_started
1906 << " last_epoch_started " << info.history.last_epoch_started
1907 << " same_interval_since " << info.history.same_interval_since << dendl;
1908 assert(!actingbackfill.empty());
1909 if (peer_activated.size() == actingbackfill.size())
1910 all_activated_and_committed();
1911 } else {
1912 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1913 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1914 pg_notify_t i = pg_notify_t(
1915 get_primary().shard, pg_whoami.shard,
1916 get_osdmap()->get_epoch(),
1917 get_osdmap()->get_epoch(),
1918 info);
1919
1920 i.info.history.last_epoch_started = activation_epoch;
1921 i.info.history.last_interval_started = i.info.history.same_interval_since;
1922 if (acting.size() >= pool.info.min_size) {
1923 state_set(PG_STATE_ACTIVE);
1924 } else {
1925 state_set(PG_STATE_PEERED);
1926 }
1927
1928 m->pg_list.push_back(make_pair(i, PastIntervals()));
1929 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1930
1931 // waiters
1932 if (flushes_in_progress == 0) {
1933 requeue_ops(waiting_for_peered);
1934 } else if (!waiting_for_peered.empty()) {
1935 dout(10) << __func__ << " flushes in progress, moving "
1936 << waiting_for_peered.size() << " items to waiting_for_flush"
1937 << dendl;
1938 assert(waiting_for_flush.empty());
1939 waiting_for_flush.swap(waiting_for_peered);
1940 }
1941 }
1942
1943 assert(!dirty_info);
1944
1945 unlock();
1946 }
1947
1948 /*
1949 * update info.history.last_epoch_started ONLY after we and all
1950 * replicas have activated AND committed the activate transaction
1951 * (i.e. the peering results are stable on disk).
1952 */
1953 void PG::all_activated_and_committed()
1954 {
1955 dout(10) << "all_activated_and_committed" << dendl;
1956 assert(is_primary());
1957 assert(peer_activated.size() == actingbackfill.size());
1958 assert(!actingbackfill.empty());
1959 assert(blocked_by.empty());
1960
1961 // Degraded?
1962 _update_calc_stats();
1963 if (info.stats.stats.sum.num_objects_degraded) {
1964 state_set(PG_STATE_DEGRADED);
1965 } else {
1966 state_clear(PG_STATE_DEGRADED);
1967 }
1968
1969 queue_peering_event(
1970 CephPeeringEvtRef(
1971 std::make_shared<CephPeeringEvt>(
1972 get_osdmap()->get_epoch(),
1973 get_osdmap()->get_epoch(),
1974 AllReplicasActivated())));
1975 }
1976
1977 bool PG::requeue_scrub(bool high_priority)
1978 {
1979 assert(is_locked());
1980 if (scrub_queued) {
1981 dout(10) << __func__ << ": already queued" << dendl;
1982 return false;
1983 } else {
1984 dout(10) << __func__ << ": queueing" << dendl;
1985 scrub_queued = true;
1986 osd->queue_for_scrub(this, high_priority);
1987 return true;
1988 }
1989 }
1990
1991 void PG::queue_recovery()
1992 {
1993 if (!is_primary() || !is_peered()) {
1994 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1995 assert(!recovery_queued);
1996 } else if (recovery_queued) {
1997 dout(10) << "queue_recovery -- already queued" << dendl;
1998 } else {
1999 dout(10) << "queue_recovery -- queuing" << dendl;
2000 recovery_queued = true;
2001 osd->queue_for_recovery(this);
2002 }
2003 }
2004
2005 bool PG::queue_scrub()
2006 {
2007 assert(is_locked());
2008 if (is_scrubbing()) {
2009 return false;
2010 }
2011 scrubber.priority = scrubber.must_scrub ?
2012 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2013 scrubber.must_scrub = false;
2014 state_set(PG_STATE_SCRUBBING);
2015 if (scrubber.must_deep_scrub) {
2016 state_set(PG_STATE_DEEP_SCRUB);
2017 scrubber.must_deep_scrub = false;
2018 }
2019 if (scrubber.must_repair || scrubber.auto_repair) {
2020 state_set(PG_STATE_REPAIR);
2021 scrubber.must_repair = false;
2022 }
2023 requeue_scrub();
2024 return true;
2025 }
2026
2027 unsigned PG::get_scrub_priority()
2028 {
2029 // a higher value -> a higher priority
2030 int pool_scrub_priority = 0;
2031 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2032 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2033 }
2034
2035 struct C_PG_FinishRecovery : public Context {
2036 PGRef pg;
2037 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2038 void finish(int r) override {
2039 pg->_finish_recovery(this);
2040 }
2041 };
2042
2043 void PG::mark_clean()
2044 {
2045 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2046 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2047 state_set(PG_STATE_CLEAN);
2048 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2049 info.history.last_interval_clean = info.history.same_interval_since;
2050 past_intervals.clear();
2051 dirty_big_info = true;
2052 dirty_info = true;
2053 }
2054
2055 kick_snap_trim();
2056 }
2057
2058 void PG::_change_recovery_force_mode(int new_mode, bool clear)
2059 {
2060 if (!deleting) {
2061 // we can't and shouldn't do anything if the PG is being deleted locally
2062 if (clear) {
2063 state_clear(new_mode);
2064 } else {
2065 state_set(new_mode);
2066 }
2067 publish_stats_to_osd();
2068 }
2069 }
2070
2071 inline int PG::clamp_recovery_priority(int priority)
2072 {
2073 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2074 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2075
2076 // Clamp to valid range
2077 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2078 return OSD_RECOVERY_PRIORITY_MAX;
2079 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2080 return OSD_RECOVERY_PRIORITY_MIN;
2081 } else {
2082 return priority;
2083 }
2084 }
2085
2086 unsigned PG::get_recovery_priority()
2087 {
2088 // a higher value -> a higher priority
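// forced recovery uses a fixed high priority; otherwise the pool's
// recovery_priority option (if any) is added to the base value and
// clamped to the valid range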
2089 int ret = 0;
2090
2091 if (state & PG_STATE_FORCED_RECOVERY) {
2092 ret = OSD_RECOVERY_PRIORITY_FORCED;
2093 } else {
2094 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2095 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2096 }
2097 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2098 return static_cast<unsigned>(ret);
2099 }
2100
2101 unsigned PG::get_backfill_priority()
2102 {
2103 // a higher value -> a higher priority
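// priority ladder, highest first: forced backfill, inactive (acting below
// min_size), undersized/degraded, then the plain base; non-forced values
// are offset by the pool's recovery_priority option and clamped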
2104 int ret = OSD_BACKFILL_PRIORITY_BASE;
2105 if (state & PG_STATE_FORCED_BACKFILL) {
2106 ret = OSD_RECOVERY_PRIORITY_FORCED;
2107 } else {
2108 if (acting.size() < pool.info.min_size) {
2109 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2110 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2111
2112 } else if (is_undersized()) {
2113 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2114 assert(pool.info.size > actingset.size());
2115 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2116
2117 } else if (is_degraded()) {
2118 // degraded: baseline degraded
2119 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2120 }
2121
2122 // Adjust with pool's recovery priority
2123 int pool_recovery_priority = 0;
2124 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2125
2126 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2127 }
2128
2129 return static_cast<unsigned>(ret);
2130 }
2131
2132 void PG::finish_recovery(list<Context*>& tfin)
2133 {
2134 dout(10) << "finish_recovery" << dendl;
2135 assert(info.last_complete == info.last_update);
2136
2137 clear_recovery_state();
2138
2139 /*
2140 * sync all this before purging strays. but don't block!
2141 */
2142 finish_sync_event = new C_PG_FinishRecovery(this);
2143 tfin.push_back(finish_sync_event);
2144 }
2145
2146 void PG::_finish_recovery(Context *c)
2147 {
2148 lock();
2149 if (deleting) {
2150 unlock();
2151 return;
2152 }
2153 if (c == finish_sync_event) {
2154 dout(10) << "_finish_recovery" << dendl;
2155 finish_sync_event = 0;
2156 purge_strays();
2157
2158 publish_stats_to_osd();
2159
2160 if (scrub_after_recovery) {
2161 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2162 scrub_after_recovery = false;
2163 scrubber.must_deep_scrub = true;
2164 queue_scrub();
2165 }
2166 } else {
2167 dout(10) << "_finish_recovery -- stale" << dendl;
2168 }
2169 unlock();
2170 }
2171
2172 void PG::start_recovery_op(const hobject_t& soid)
2173 {
2174 dout(10) << "start_recovery_op " << soid
2175 #ifdef DEBUG_RECOVERY_OIDS
2176 << " (" << recovering_oids << ")"
2177 #endif
2178 << dendl;
2179 assert(recovery_ops_active >= 0);
2180 recovery_ops_active++;
2181 #ifdef DEBUG_RECOVERY_OIDS
2182 assert(recovering_oids.count(soid) == 0);
2183 recovering_oids.insert(soid);
2184 #endif
2185 osd->start_recovery_op(this, soid);
2186 }
2187
2188 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2189 {
2190 dout(10) << "finish_recovery_op " << soid
2191 #ifdef DEBUG_RECOVERY_OIDS
2192 << " (" << recovering_oids << ")"
2193 #endif
2194 << dendl;
2195 assert(recovery_ops_active > 0);
2196 recovery_ops_active--;
2197 #ifdef DEBUG_RECOVERY_OIDS
2198 assert(recovering_oids.count(soid));
2199 recovering_oids.erase(soid);
2200 #endif
2201 osd->finish_recovery_op(this, soid, dequeue);
2202
2203 if (!dequeue) {
2204 queue_recovery();
2205 }
2206 }
2207
2208 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2209 {
2210 child->update_snap_mapper_bits(split_bits);
2211 child->update_osdmap_ref(get_osdmap());
2212
2213 child->pool = pool;
2214
2215 // Log
2216 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2217 child->info.last_complete = info.last_complete;
2218
2219 info.last_update = pg_log.get_head();
2220 child->info.last_update = child->pg_log.get_head();
2221
2222 child->info.last_user_version = info.last_user_version;
2223
2224 info.log_tail = pg_log.get_tail();
2225 child->info.log_tail = child->pg_log.get_tail();
2226
2227 if (info.last_complete < pg_log.get_tail())
2228 info.last_complete = pg_log.get_tail();
2229 if (child->info.last_complete < child->pg_log.get_tail())
2230 child->info.last_complete = child->pg_log.get_tail();
2231
2232 // Info
2233 child->info.history = info.history;
2234 child->info.history.epoch_created = get_osdmap()->get_epoch();
2235 child->info.purged_snaps = info.purged_snaps;
2236
2237 if (info.last_backfill.is_max()) {
2238 child->info.set_last_backfill(hobject_t::get_max());
2239 } else {
2240 // restart backfill on parent and child to be safe. we could
2241 // probably do better in the bitwise sort case, but it's more
2242 // fragile (there may be special work to do on backfill completion
2243 // in the future).
2244 info.set_last_backfill(hobject_t());
2245 child->info.set_last_backfill(hobject_t());
2246 // restarting backfill implies that the missing set is empty,
2247 // since it is only used for objects prior to last_backfill
2248 pg_log.reset_backfill();
2249 child->pg_log.reset_backfill();
2250 }
2251
2252 child->info.stats = info.stats;
2253 child->info.stats.parent_split_bits = split_bits;
2254 info.stats.stats_invalid = true;
2255 child->info.stats.stats_invalid = true;
2256 child->info.last_epoch_started = info.last_epoch_started;
2257 child->info.last_interval_started = info.last_interval_started;
2258
2259 child->snap_trimq = snap_trimq;
2260
2261 // There can't be recovery/backfill going on now
2262 int primary, up_primary;
2263 vector<int> newup, newacting;
2264 get_osdmap()->pg_to_up_acting_osds(
2265 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2266 child->init_primary_up_acting(
2267 newup,
2268 newacting,
2269 up_primary,
2270 primary);
2271 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2272
2273 // this comparison includes primary rank via pg_shard_t
2274 if (get_primary() != child->get_primary())
2275 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2276
2277 child->info.stats.up = up;
2278 child->info.stats.up_primary = up_primary;
2279 child->info.stats.acting = acting;
2280 child->info.stats.acting_primary = primary;
2281 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2282
2283 // History
2284 child->past_intervals = past_intervals;
2285
2286 _split_into(child_pgid, child, split_bits);
2287
2288 // release all backoffs for simplicity
2289 release_backoffs(hobject_t(), hobject_t::get_max());
2290
2291 child->on_new_interval();
2292
2293 child->dirty_info = true;
2294 child->dirty_big_info = true;
2295 dirty_info = true;
2296 dirty_big_info = true;
2297 }
2298
2299 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2300 {
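// register a new backoff for [begin, end) on this session and tell the
// client to block matching requests until it is released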
2301 ConnectionRef con = s->con;
2302 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2303 return;
2304 BackoffRef b(s->have_backoff(info.pgid, begin));
2305 if (b) {
2306 derr << __func__ << " already have backoff for " << s << " begin " << begin
2307 << " " << *b << dendl;
2308 ceph_abort();
2309 }
2310 Mutex::Locker l(backoff_lock);
2311 {
2312 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2313 backoffs[begin].insert(b);
2314 s->add_backoff(b);
2315 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2316 }
2317 con->send_message(
2318 new MOSDBackoff(
2319 info.pgid,
2320 get_osdmap()->get_epoch(),
2321 CEPH_OSD_BACKOFF_OP_BLOCK,
2322 b->id,
2323 begin,
2324 end));
2325 }
2326
2327 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2328 {
2329 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2330 vector<BackoffRef> bv;
2331 {
2332 Mutex::Locker l(backoff_lock);
2333 auto p = backoffs.lower_bound(begin);
2334 while (p != backoffs.end()) {
2335 int r = cmp(p->first, end);
2336 dout(20) << __func__ << " ? " << r << " " << p->first
2337 << " " << p->second << dendl;
2338 // note: must still examine begin=end=p->first case
2339 if (r > 0 || (r == 0 && begin < end)) {
2340 break;
2341 }
2342 dout(20) << __func__ << " checking " << p->first
2343 << " " << p->second << dendl;
2344 auto q = p->second.begin();
2345 while (q != p->second.end()) {
2346 dout(20) << __func__ << " checking " << *q << dendl;
2347 int r = cmp((*q)->begin, begin);
2348 if (r == 0 || (r > 0 && (*q)->end < end)) {
2349 bv.push_back(*q);
2350 q = p->second.erase(q);
2351 } else {
2352 ++q;
2353 }
2354 }
2355 if (p->second.empty()) {
2356 p = backoffs.erase(p);
2357 } else {
2358 ++p;
2359 }
2360 }
2361 }
2362 for (auto b : bv) {
2363 Mutex::Locker l(b->lock);
2364 dout(10) << __func__ << " " << *b << dendl;
2365 if (b->session) {
2366 assert(b->pg == this);
2367 ConnectionRef con = b->session->con;
2368 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2369 con->send_message(
2370 new MOSDBackoff(
2371 info.pgid,
2372 get_osdmap()->get_epoch(),
2373 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2374 b->id,
2375 b->begin,
2376 b->end));
2377 }
2378 if (b->is_new()) {
2379 b->state = Backoff::STATE_DELETING;
2380 } else {
2381 b->session->rm_backoff(b);
2382 b->session.reset();
2383 }
2384 b->pg.reset();
2385 }
2386 }
2387 }
2388
2389 void PG::clear_backoffs()
2390 {
2391 dout(10) << __func__ << " " << dendl;
2392 map<hobject_t,set<BackoffRef>> ls;
2393 {
2394 Mutex::Locker l(backoff_lock);
2395 ls.swap(backoffs);
2396 }
2397 for (auto& p : ls) {
2398 for (auto& b : p.second) {
2399 Mutex::Locker l(b->lock);
2400 dout(10) << __func__ << " " << *b << dendl;
2401 if (b->session) {
2402 assert(b->pg == this);
2403 if (b->is_new()) {
2404 b->state = Backoff::STATE_DELETING;
2405 } else {
2406 b->session->rm_backoff(b);
2407 b->session.reset();
2408 }
2409 b->pg.reset();
2410 }
2411 }
2412 }
2413 }
2414
2415 // called by Session::clear_backoffs()
2416 void PG::rm_backoff(BackoffRef b)
2417 {
2418 dout(10) << __func__ << " " << *b << dendl;
2419 Mutex::Locker l(backoff_lock);
2420 assert(b->lock.is_locked_by_me());
2421 assert(b->pg == this);
2422 auto p = backoffs.find(b->begin);
2423 // may race with release_backoffs()
2424 if (p != backoffs.end()) {
2425 auto q = p->second.find(b);
2426 if (q != p->second.end()) {
2427 p->second.erase(q);
2428 if (p->second.empty()) {
2429 backoffs.erase(p);
2430 }
2431 }
2432 }
2433 }
2434
2435 void PG::clear_recovery_state()
2436 {
2437 dout(10) << "clear_recovery_state" << dendl;
2438
2439 pg_log.reset_recovery_pointers();
2440 finish_sync_event = 0;
2441
2442 hobject_t soid;
2443 while (recovery_ops_active > 0) {
2444 #ifdef DEBUG_RECOVERY_OIDS
2445 soid = *recovering_oids.begin();
2446 #endif
2447 finish_recovery_op(soid, true);
2448 }
2449
2450 backfill_targets.clear();
2451 backfill_info.clear();
2452 peer_backfill_info.clear();
2453 waiting_on_backfill.clear();
2454 _clear_recovery_state(); // pg impl specific hook
2455 }
2456
2457 void PG::cancel_recovery()
2458 {
2459 dout(10) << "cancel_recovery" << dendl;
2460 clear_recovery_state();
2461 }
2462
2463
2464 void PG::purge_strays()
2465 {
2466 dout(10) << "purge_strays " << stray_set << dendl;
2467
2468 bool removed = false;
2469 for (set<pg_shard_t>::iterator p = stray_set.begin();
2470 p != stray_set.end();
2471 ++p) {
2472 assert(!is_actingbackfill(*p));
2473 if (get_osdmap()->is_up(p->osd)) {
2474 dout(10) << "sending PGRemove to osd." << *p << dendl;
2475 vector<spg_t> to_remove;
2476 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2477 MOSDPGRemove *m = new MOSDPGRemove(
2478 get_osdmap()->get_epoch(),
2479 to_remove);
2480 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2481 } else {
2482 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2483 }
2484 peer_missing.erase(*p);
2485 peer_info.erase(*p);
2486 peer_purged.insert(*p);
2487 removed = true;
2488 }
2489
2490 // if we removed anyone, update peers (which includes peer_info)
2491 if (removed)
2492 update_heartbeat_peers();
2493
2494 stray_set.clear();
2495
2496 // clear _requested maps; we may have to peer() again if we discover
2497 // (more) stray content
2498 peer_log_requested.clear();
2499 peer_missing_requested.clear();
2500 }
2501
2502 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2503 {
2504 Mutex::Locker l(heartbeat_peer_lock);
2505 probe_targets.clear();
2506 for (set<pg_shard_t>::iterator i = probe_set.begin();
2507 i != probe_set.end();
2508 ++i) {
2509 probe_targets.insert(i->osd);
2510 }
2511 }
2512
2513 void PG::clear_probe_targets()
2514 {
2515 Mutex::Locker l(heartbeat_peer_lock);
2516 probe_targets.clear();
2517 }
2518
2519 void PG::update_heartbeat_peers()
2520 {
2521 assert(is_locked());
2522
2523 if (!is_primary())
2524 return;
2525
2526 set<int> new_peers;
2527 for (unsigned i=0; i<acting.size(); i++) {
2528 if (acting[i] != CRUSH_ITEM_NONE)
2529 new_peers.insert(acting[i]);
2530 }
2531 for (unsigned i=0; i<up.size(); i++) {
2532 if (up[i] != CRUSH_ITEM_NONE)
2533 new_peers.insert(up[i]);
2534 }
2535 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2536 p != peer_info.end();
2537 ++p)
2538 new_peers.insert(p->first.osd);
2539
2540 bool need_update = false;
2541 heartbeat_peer_lock.Lock();
2542 if (new_peers == heartbeat_peers) {
2543 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2544 } else {
2545 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2546 heartbeat_peers.swap(new_peers);
2547 need_update = true;
2548 }
2549 heartbeat_peer_lock.Unlock();
2550
2551 if (need_update)
2552 osd->need_heartbeat_peer_update();
2553 }
2554
2555
2556 bool PG::check_in_progress_op(
2557 const osd_reqid_t &r,
2558 eversion_t *version,
2559 version_t *user_version,
2560 int *return_code) const
2561 {
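// a request counts as in progress if it can be found either in the
// projected (not yet persisted) log or in the persisted pg log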
2562 return (
2563 projected_log.get_request(r, version, user_version, return_code) ||
2564 pg_log.get_log().get_request(r, version, user_version, return_code));
2565 }
2566
2567 void PG::_update_calc_stats()
2568 {
2569 info.stats.version = info.last_update;
2570 info.stats.created = info.history.epoch_created;
2571 info.stats.last_scrub = info.history.last_scrub;
2572 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2573 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2574 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2575 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2576 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2577
2578 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2579 info.stats.ondisk_log_size = info.stats.log_size;
2580 info.stats.log_start = pg_log.get_tail();
2581 info.stats.ondisk_log_start = pg_log.get_tail();
2582 info.stats.snaptrimq_len = snap_trimq.size();
2583
2584 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
2585
2586 // In the rare case that upset is too large (usually transient), use it as
2587 // the target for calculations below.
2588 unsigned target = std::max(num_shards, (unsigned)upset.size());
2589 // Not sure this could ever happen (actingset > upset), and it only
2590 // matters if actingset > num_shards.
2591 unsigned nrep = std::max(actingset.size(), upset.size());
2592 // calc num_object_copies
2593 info.stats.stats.calc_copies(MAX(target, nrep));
2594 info.stats.stats.sum.num_objects_degraded = 0;
2595 info.stats.stats.sum.num_objects_unfound = 0;
2596 info.stats.stats.sum.num_objects_misplaced = 0;
2597 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2598 dout(20) << __func__ << " actingset " << actingset << " upset "
2599 << upset << " actingbackfill " << actingbackfill << dendl;
2600 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
2601
2602 assert(!actingbackfill.empty());
2603
2604 // NOTE: we only generate degraded, misplaced and unfound
2605 // values for the summation, not individual stat categories.
2606 int64_t num_objects = info.stats.stats.sum.num_objects;
2607
2608 // Objects missing from up nodes, sorted by # objects.
2609 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
2610 // Objects missing from nodes not in up, sorted by # objects
2611 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
2612
2613 int64_t missing;
2614
2615 // Primary first
2616 missing = pg_log.get_missing().num_missing();
2617 assert(actingbackfill.count(pg_whoami));
2618 if (upset.count(pg_whoami)) {
2619 missing_target_objects.insert(make_pair(missing, pg_whoami));
2620 } else {
2621 acting_source_objects.insert(make_pair(missing, pg_whoami));
2622 }
2623 info.stats.stats.sum.num_objects_missing_on_primary = missing;
2624
2625 // All other peers
2626 for (auto& peer : peer_info) {
2627 // Ignore other peers until we add code to look at detailed missing
2628 // information. (recovery)
2629 if (!actingbackfill.count(peer.first)) {
2630 continue;
2631 }
2632 missing = 0;
2633 // Backfill targets always track num_objects accurately;
2634 // all other peers track missing accurately.
2635 if (is_backfill_targets(peer.first)) {
2636 missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
2637 } else {
2638 if (peer_missing.count(peer.first)) {
2639 missing = peer_missing[peer.first].num_missing();
2640 } else {
2641 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
2642 }
2643 }
2644 if (upset.count(peer.first)) {
2645 missing_target_objects.insert(make_pair(missing, peer.first));
2646 } else {
2647 acting_source_objects.insert(make_pair(missing, peer.first));
2648 }
2649 peer.second.stats.stats.sum.num_objects_missing = missing;
2650 }
2651
2652 if (pool.info.is_replicated()) {
2653 // Add to missing_target_objects up to target elements (num_objects missing)
2654 assert(target >= missing_target_objects.size());
2655 unsigned needed = target - missing_target_objects.size();
2656 for (; needed; --needed)
2657 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
2658 } else {
2659 for (unsigned i = 0 ; i < num_shards; ++i) {
2660 shard_id_t shard(i);
2661 bool found = false;
2662 for (const auto& t : missing_target_objects) {
2663 if (std::get<1>(t).shard == shard) {
2664 found = true;
2665 break;
2666 }
2667 }
2668 if (!found)
2669 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
2670 }
2671 }
2672
2673 for (const auto& item : missing_target_objects)
2674 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2675 for (const auto& item : acting_source_objects)
2676 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2677
2678 // A misplaced object is not stored on the correct OSD
2679 int64_t misplaced = 0;
2680 // A degraded object has fewer replicas or EC shards than the pool specifies.
2681 int64_t degraded = 0;
2682
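// pair each target shard (most missing first) with an acting source shard:
// the fewest-missing remaining source for replicated pools, the matching
// shard for EC pools. Objects the source is also missing count as degraded;
// objects present on the source but missing from the target count as
// misplaced.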
2683 for (auto m = missing_target_objects.rbegin();
2684 m != missing_target_objects.rend(); ++m) {
2685
2686 int64_t extra_missing = -1;
2687
2688 if (pool.info.is_replicated()) {
2689 if (!acting_source_objects.empty()) {
2690 auto extra_copy = acting_source_objects.begin();
2691 extra_missing = std::get<0>(*extra_copy);
2692 acting_source_objects.erase(extra_copy);
2693 }
2694 } else { // Erasure coded
2695 // Use corresponding shard
2696 for (const auto& a : acting_source_objects) {
2697 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
2698 extra_missing = std::get<0>(a);
2699 acting_source_objects.erase(a);
2700 break;
2701 }
2702 }
2703 }
2704
2705 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
2706 // We don't know which of the objects on the target
2707 // are part of extra_missing, so assume they are all degraded.
2708 misplaced += std::get<0>(*m) - extra_missing;
2709 degraded += extra_missing;
2710 } else {
2711 // 1. extra_missing == -1, more targets than sources so degraded
2712 // 2. extra_missing > std::get<0>(*m), so we know that some of the
2713 // previously degraded (extra_missing) objects are now present on the target.
2714 degraded += std::get<0>(*m);
2715 }
2716 }
2717 // If there are still acting shards that haven't been accounted for,
2718 // their objects are misplaced
2719 for (const auto& a : acting_source_objects) {
2720 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
2721 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
2722 misplaced += extra_misplaced;
2723 }
2724 dout(20) << __func__ << " degraded " << degraded << dendl;
2725 dout(20) << __func__ << " misplaced " << misplaced << dendl;
2726
2727 info.stats.stats.sum.num_objects_degraded = degraded;
2728 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2729 info.stats.stats.sum.num_objects_misplaced = misplaced;
2730 }
2731 }
2732
2733 void PG::_update_blocked_by()
2734 {
2735 // set a max on the number of blocking peers we report. if we go
2736 // over, report a random subset. keep the result sorted.
2737 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2738 unsigned skip = blocked_by.size() - keep;
2739 info.stats.blocked_by.clear();
2740 info.stats.blocked_by.resize(keep);
2741 unsigned pos = 0;
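// keep each remaining candidate with probability keep/(skip+keep)
// (selection sampling), which yields a uniform random subset while
// preserving the sorted order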
2742 for (set<int>::iterator p = blocked_by.begin();
2743 p != blocked_by.end() && keep > 0;
2744 ++p) {
2745 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2746 --skip;
2747 } else {
2748 info.stats.blocked_by[pos++] = *p;
2749 --keep;
2750 }
2751 }
2752 }
2753
2754 void PG::publish_stats_to_osd()
2755 {
2756 if (!is_primary())
2757 return;
2758
2759 pg_stats_publish_lock.Lock();
2760
2761 if (info.stats.stats.sum.num_scrub_errors)
2762 state_set(PG_STATE_INCONSISTENT);
2763 else
2764 state_clear(PG_STATE_INCONSISTENT);
2765
2766 utime_t now = ceph_clock_now();
2767 if (info.stats.state != state) {
2768 info.stats.last_change = now;
2769 // Optimistic estimation: if we just found out a PG is inactive,
2770 // assume it was active until now.
2771 if (!(state & PG_STATE_ACTIVE) &&
2772 (info.stats.state & PG_STATE_ACTIVE))
2773 info.stats.last_active = now;
2774
2775 if ((state & PG_STATE_ACTIVE) &&
2776 !(info.stats.state & PG_STATE_ACTIVE))
2777 info.stats.last_became_active = now;
2778 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2779 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2780 info.stats.last_became_peered = now;
2781 if (!(state & PG_STATE_CREATING) &&
2782 (info.stats.state & PG_STATE_CREATING)) {
2783 osd->send_pg_created(get_pgid().pgid);
2784 }
2785 info.stats.state = state;
2786 }
2787
2788 _update_calc_stats();
2789 if (info.stats.stats.sum.num_objects_degraded) {
2790 state_set(PG_STATE_DEGRADED);
2791 } else {
2792 state_clear(PG_STATE_DEGRADED);
2793 }
2794 _update_blocked_by();
2795
2796 bool publish = false;
2797 pg_stat_t pre_publish = info.stats;
2798 pre_publish.stats.add(unstable_stats);
2799 utime_t cutoff = now;
2800 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2801 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2802 info.stats.last_fresh > cutoff) {
2803 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2804 << ": no change since " << info.stats.last_fresh << dendl;
2805 } else {
2806 // update our stat summary and timestamps
2807 info.stats.reported_epoch = get_osdmap()->get_epoch();
2808 ++info.stats.reported_seq;
2809
2810 info.stats.last_fresh = now;
2811
2812 if (info.stats.state & PG_STATE_CLEAN)
2813 info.stats.last_clean = now;
2814 if (info.stats.state & PG_STATE_ACTIVE)
2815 info.stats.last_active = now;
2816 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2817 info.stats.last_peered = now;
2818 info.stats.last_unstale = now;
2819 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2820 info.stats.last_undegraded = now;
2821 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2822 info.stats.last_fullsized = now;
2823
2824 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2825 // care of this by sending MMonMgrReport to mon.
2826 publish =
2827 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2828 pg_stats_publish_valid = true;
2829 pg_stats_publish = pre_publish;
2830
2831 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2832 << ":" << pg_stats_publish.reported_seq << dendl;
2833 }
2834 pg_stats_publish_lock.Unlock();
2835
2836 if (publish)
2837 osd->pg_stat_queue_enqueue(this);
2838 }
2839
2840 void PG::clear_publish_stats()
2841 {
2842 dout(15) << "clear_stats" << dendl;
2843 pg_stats_publish_lock.Lock();
2844 pg_stats_publish_valid = false;
2845 pg_stats_publish_lock.Unlock();
2846
2847 osd->pg_stat_queue_dequeue(this);
2848 }
2849
2850 /**
2851 * initialize a newly instantiated pg
2852 *
2853 * Initialize PG state, as when a PG is initially created, or when it
2854 * is first instantiated on the current node.
2855 *
2856 * @param role our role/rank
2857 * @param newup up set
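 * @param new_up_primary up primary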
2858 * @param newacting acting set
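 * @param new_acting_primary acting primary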
2859 * @param history pg history
2860 * @param pi past_intervals
2861 * @param backfill true if info should be marked as backfill
2862 * @param t transaction to write out our new state in
2863 */
2864 void PG::init(
2865 int role,
2866 const vector<int>& newup, int new_up_primary,
2867 const vector<int>& newacting, int new_acting_primary,
2868 const pg_history_t& history,
2869 const PastIntervals& pi,
2870 bool backfill,
2871 ObjectStore::Transaction *t)
2872 {
2873 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2874 << " history " << history
2875 << " past_intervals " << pi
2876 << dendl;
2877
2878 set_role(role);
2879 acting = newacting;
2880 up = newup;
2881 init_primary_up_acting(
2882 newup,
2883 newacting,
2884 new_up_primary,
2885 new_acting_primary);
2886
2887 info.history = history;
2888 past_intervals = pi;
2889
2890 info.stats.up = up;
2891 info.stats.up_primary = new_up_primary;
2892 info.stats.acting = acting;
2893 info.stats.acting_primary = new_acting_primary;
2894 info.stats.mapping_epoch = info.history.same_interval_since;
2895
2896 if (backfill) {
2897 dout(10) << __func__ << ": Setting backfill" << dendl;
2898 info.set_last_backfill(hobject_t());
2899 info.last_complete = info.last_update;
2900 pg_log.mark_log_for_rewrite();
2901 }
2902
2903 on_new_interval();
2904
2905 dirty_info = true;
2906 dirty_big_info = true;
2907 write_if_dirty(*t);
2908 }
2909
2910 #pragma GCC diagnostic ignored "-Wpragmas"
2911 #pragma GCC diagnostic push
2912 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2913
2914 void PG::upgrade(ObjectStore *store)
2915 {
2916 assert(info_struct_v <= 10);
2917 ObjectStore::Transaction t;
2918
2919 assert(info_struct_v >= 7);
2920
2921 // 7 -> 8
2922 if (info_struct_v <= 7) {
2923 pg_log.mark_log_for_rewrite();
2924 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2925 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2926 t.remove(coll_t::meta(), log_oid);
2927 t.remove(coll_t::meta(), biginfo_oid);
2928 t.touch(coll, pgmeta_oid);
2929 }
2930
2931 // 8 -> 9
2932 if (info_struct_v <= 8) {
2933 // no special action needed.
2934 }
2935
2936 // 9 -> 10
2937 if (info_struct_v <= 9) {
2938 // previous versions weren't (as) aggressively clearing past_intervals
2939 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2940 dout(20) << __func__ << " clearing past_intervals" << dendl;
2941 past_intervals.clear();
2942 }
2943 }
2944
2945 // update infover_key
2946 if (info_struct_v < cur_struct_v) {
2947 map<string,bufferlist> v;
2948 __u8 ver = cur_struct_v;
2949 ::encode(ver, v[infover_key]);
2950 t.omap_setkeys(coll, pgmeta_oid, v);
2951 }
2952
2953 dirty_info = true;
2954 dirty_big_info = true;
2955 write_if_dirty(t);
2956
2957 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2958 ObjectStore::Sequencer>("upgrade"));
2959 int r = store->apply_transaction(osr.get(), std::move(t));
2960 if (r != 0) {
2961 derr << __func__ << ": apply_transaction returned "
2962 << cpp_strerror(r) << dendl;
2963 ceph_abort();
2964 }
2965 assert(r == 0);
2966
2967 C_SaferCond waiter;
2968 if (!osr->flush_commit(&waiter)) {
2969 waiter.wait();
2970 }
2971 }
2972
2973 #pragma GCC diagnostic pop
2974 #pragma GCC diagnostic warning "-Wpragmas"
2975
2976 int PG::_prepare_write_info(CephContext* cct,
2977 map<string,bufferlist> *km,
2978 epoch_t epoch,
2979 pg_info_t &info, pg_info_t &last_written_info,
2980 PastIntervals &past_intervals,
2981 bool dirty_big_info,
2982 bool dirty_epoch,
2983 bool try_fast_info,
2984 PerfCounters *logger)
2985 {
2986 if (dirty_epoch) {
2987 ::encode(epoch, (*km)[epoch_key]);
2988 }
2989
2990 if (logger)
2991 logger->inc(l_osd_pg_info);
2992
2993 // try to do info efficiently?
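// if nothing "big" changed and last_update moved forward, persist only the
// small pg_fast_info_t delta instead of re-encoding the full pg_info_t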
2994 if (!dirty_big_info && try_fast_info &&
2995 info.last_update > last_written_info.last_update) {
2996 pg_fast_info_t fast;
2997 fast.populate_from(info);
2998 bool did = fast.try_apply_to(&last_written_info);
2999 assert(did); // we verified last_update increased above
3000 if (info == last_written_info) {
3001 ::encode(fast, (*km)[fastinfo_key]);
3002 if (logger)
3003 logger->inc(l_osd_pg_fastinfo);
3004 return 0;
3005 }
3006 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3007 {
3008 JSONFormatter jf(true);
3009 jf.dump_object("info", info);
3010 jf.flush(*_dout);
3011 }
3012 {
3013 *_dout << "\nlast_written_info:\n";
3014 JSONFormatter jf(true);
3015 jf.dump_object("last_written_info", last_written_info);
3016 jf.flush(*_dout);
3017 }
3018 *_dout << dendl;
3019 }
3020 last_written_info = info;
3021
3022 // info. store purged_snaps separately.
3023 interval_set<snapid_t> purged_snaps;
3024 purged_snaps.swap(info.purged_snaps);
3025 ::encode(info, (*km)[info_key]);
3026 purged_snaps.swap(info.purged_snaps);
3027
3028 if (dirty_big_info) {
3029 // potentially big stuff
3030 bufferlist& bigbl = (*km)[biginfo_key];
3031 ::encode(past_intervals, bigbl);
3032 ::encode(info.purged_snaps, bigbl);
3033 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3034 if (logger)
3035 logger->inc(l_osd_pg_biginfo);
3036 }
3037
3038 return 0;
3039 }
3040
3041 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3042 {
3043 coll_t coll(pgid);
3044 t.create_collection(coll, bits);
3045 }
3046
3047 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3048 {
3049 coll_t coll(pgid);
3050
3051 if (pool) {
3052 // Give a hint to the PG collection
3053 bufferlist hint;
3054 uint32_t pg_num = pool->get_pg_num();
3055 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3056 ::encode(pg_num, hint);
3057 ::encode(expected_num_objects_pg, hint);
3058 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3059 t.collection_hint(coll, hint_type, hint);
3060 }
3061
3062 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3063 t.touch(coll, pgmeta_oid);
3064 map<string,bufferlist> values;
3065 __u8 struct_v = cur_struct_v;
3066 ::encode(struct_v, values[infover_key]);
3067 t.omap_setkeys(coll, pgmeta_oid, values);
3068 }
3069
3070 void PG::prepare_write_info(map<string,bufferlist> *km)
3071 {
3072 info.stats.stats.add(unstable_stats);
3073 unstable_stats.clear();
3074
3075 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3076 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3077 info,
3078 last_written_info,
3079 past_intervals,
3080 dirty_big_info, need_update_epoch,
3081 cct->_conf->osd_fast_info,
3082 osd->logger);
3083 assert(ret == 0);
3084 if (need_update_epoch)
3085 last_epoch = get_osdmap()->get_epoch();
3086 last_persisted_osdmap_ref = osdmap_ref;
3087
3088 dirty_info = false;
3089 dirty_big_info = false;
3090 }
3091
3092 #pragma GCC diagnostic ignored "-Wpragmas"
3093 #pragma GCC diagnostic push
3094 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3095
3096 bool PG::_has_removal_flag(ObjectStore *store,
3097 spg_t pgid)
3098 {
3099 coll_t coll(pgid);
3100 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3101
3102 // first try new way
3103 set<string> keys;
3104 keys.insert("_remove");
3105 map<string,bufferlist> values;
3106 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3107 values.size() == 1)
3108 return true;
3109
3110 return false;
3111 }
3112
3113 int PG::peek_map_epoch(ObjectStore *store,
3114 spg_t pgid,
3115 epoch_t *pepoch,
3116 bufferlist *bl)
3117 {
3118 coll_t coll(pgid);
3119 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3120 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3121 epoch_t cur_epoch = 0;
3122
3123 assert(bl);
3124 {
3125 // validate collection name
3126 assert(coll.is_pg());
3127 }
3128
3129 // try for v8
3130 set<string> keys;
3131 keys.insert(infover_key);
3132 keys.insert(epoch_key);
3133 map<string,bufferlist> values;
3134 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3135 if (r == 0) {
3136 assert(values.size() == 2);
3137
3138 // sanity check version
3139 bufferlist::iterator bp = values[infover_key].begin();
3140 __u8 struct_v = 0;
3141 ::decode(struct_v, bp);
3142 assert(struct_v >= 8);
3143
3144 // get epoch
3145 bp = values[epoch_key].begin();
3146 ::decode(cur_epoch, bp);
3147 } else {
3148 // probably bug 10617; see OSD::load_pgs()
3149 return -1;
3150 }
3151
3152 *pepoch = cur_epoch;
3153 return 0;
3154 }
3155
3156 #pragma GCC diagnostic pop
3157 #pragma GCC diagnostic warning "-Wpragmas"
3158
3159 void PG::write_if_dirty(ObjectStore::Transaction& t)
3160 {
3161 map<string,bufferlist> km;
3162 if (dirty_big_info || dirty_info)
3163 prepare_write_info(&km);
3164 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3165 if (!km.empty())
3166 t.omap_setkeys(coll, pgmeta_oid, km);
3167 }
3168
3169 void PG::trim_log()
3170 {
3171 assert(is_primary());
3172 calc_trim_to();
3173 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3174 if (pg_trim_to != eversion_t()) {
3175 // inform peers to trim log
3176 assert(!actingbackfill.empty());
3177 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3178 i != actingbackfill.end();
3179 ++i) {
3180 if (*i == pg_whoami) continue;
3181 osd->send_message_osd_cluster(
3182 i->osd,
3183 new MOSDPGTrim(
3184 get_osdmap()->get_epoch(),
3185 spg_t(info.pgid.pgid, i->shard),
3186 pg_trim_to),
3187 get_osdmap()->get_epoch());
3188 }
3189
3190 // trim primary as well
3191 pg_log.trim(pg_trim_to, info);
3192 dirty_info = true;
3193 }
3194 }
3195
3196 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3197 {
3198 // raise last_complete only if we were previously up to date
3199 if (info.last_complete == info.last_update)
3200 info.last_complete = e.version;
3201
3202 // raise last_update.
3203 assert(e.version > info.last_update);
3204 info.last_update = e.version;
3205
3206 // raise user_version, if it increased (it may not have been bumped
3207 // by all logged updates)
3208 if (e.user_version > info.last_user_version)
3209 info.last_user_version = e.user_version;
3210
3211 // log mutation
3212 pg_log.add(e, applied);
3213 dout(10) << "add_log_entry " << e << dendl;
3214 }
3215
3216
3217 void PG::append_log(
3218 const vector<pg_log_entry_t>& logv,
3219 eversion_t trim_to,
3220 eversion_t roll_forward_to,
3221 ObjectStore::Transaction &t,
3222 bool transaction_applied)
3223 {
3224 if (transaction_applied)
3225 update_snap_map(logv, t);
3226
3227 /* The primary has sent an info updating the history, but it may not
3228 * have arrived yet. We want to make sure that we cannot remember this
3229 * write without remembering that it happened in an interval which went
3230 * active in epoch history.last_epoch_started.
3231 */
3232 if (info.last_epoch_started != info.history.last_epoch_started) {
3233 info.history.last_epoch_started = info.last_epoch_started;
3234 }
3235 if (info.last_interval_started != info.history.last_interval_started) {
3236 info.history.last_interval_started = info.last_interval_started;
3237 }
3238 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3239
3240 PGLogEntryHandler handler{this, &t};
3241 if (!transaction_applied) {
3242 /* We must be a backfill peer, so it's ok if we apply
3243 * out-of-turn since we won't be considered when
3244 * determining a min possible last_update.
3245 */
3246 pg_log.roll_forward(&handler);
3247 }
3248
3249 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3250 p != logv.end();
3251 ++p) {
3252 add_log_entry(*p, transaction_applied);
3253
3254 /* We don't want to leave the rollforward artifacts around
3255 * here past last_backfill. It's ok for the same reason as
3256 * above */
3257 if (transaction_applied &&
3258 p->soid > info.last_backfill) {
3259 pg_log.roll_forward(&handler);
3260 }
3261 }
3262 auto last = logv.rbegin();
3263 if (is_primary() && last != logv.rend()) {
3264 projected_log.skip_can_rollback_to_to_head();
3265 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3266 }
3267
3268 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3269 pg_log.roll_forward_to(
3270 roll_forward_to,
3271 &handler);
3272 t.register_on_applied(
3273 new C_UpdateLastRollbackInfoTrimmedToApplied(
3274 this,
3275 get_osdmap()->get_epoch(),
3276 roll_forward_to));
3277 }
3278
3279 pg_log.trim(trim_to, info);
3280
3281 // update the local pg, pg log
3282 dirty_info = true;
3283 write_if_dirty(t);
3284 }
3285
3286 bool PG::check_log_for_corruption(ObjectStore *store)
3287 {
3288 /// TODO: this method needs to work with the omap log
3289 return true;
3290 }
3291
3292 //! Get the name we're going to save our corrupt pg log as
3293 std::string PG::get_corrupt_pg_log_name() const
3294 {
3295 const int MAX_BUF = 512;
3296 char buf[MAX_BUF];
3297 struct tm tm_buf;
3298 time_t my_time(time(NULL));
3299 const struct tm *t = localtime_r(&my_time, &tm_buf);
3300 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3301 if (ret == 0) {
3302 dout(0) << "strftime failed" << dendl;
3303 return "corrupt_log_unknown_time";
3304 }
3305 string out(buf);
3306 out += stringify(info.pgid);
3307 return out;
3308 }
3309
3310 int PG::read_info(
3311 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3312 pg_info_t &info, PastIntervals &past_intervals,
3313 __u8 &struct_v)
3314 {
3315 // try for v8 or later
3316 set<string> keys;
3317 keys.insert(infover_key);
3318 keys.insert(info_key);
3319 keys.insert(biginfo_key);
3320 keys.insert(fastinfo_key);
3321 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3322 map<string,bufferlist> values;
3323 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3324 if (r == 0) {
3325 assert(values.size() == 3 ||
3326 values.size() == 4);
3327
3328 bufferlist::iterator p = values[infover_key].begin();
3329 ::decode(struct_v, p);
3330 assert(struct_v >= 8);
3331
3332 p = values[info_key].begin();
3333 ::decode(info, p);
3334
3335 p = values[biginfo_key].begin();
3336 if (struct_v >= 10) {
3337 ::decode(past_intervals, p);
3338 } else {
3339 past_intervals.decode_classic(p);
3340 }
3341 ::decode(info.purged_snaps, p);
3342
3343 p = values[fastinfo_key].begin();
3344 if (!p.end()) {
3345 pg_fast_info_t fast;
3346 ::decode(fast, p);
3347 fast.try_apply_to(&info);
3348 }
3349 return 0;
3350 }
3351
3352 // legacy (ver < 8)
3353 ghobject_t infos_oid(OSD::make_infos_oid());
3354 bufferlist::iterator p = bl.begin();
3355 ::decode(struct_v, p);
3356 assert(struct_v == 7);
3357
3358 // get info out of leveldb
3359 string k = get_info_key(info.pgid);
3360 string bk = get_biginfo_key(info.pgid);
3361 keys.clear();
3362 keys.insert(k);
3363 keys.insert(bk);
3364 values.clear();
3365 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3366 assert(values.size() == 2);
3367
3368 p = values[k].begin();
3369 ::decode(info, p);
3370
3371 p = values[bk].begin();
3372 ::decode(past_intervals, p);
3373 interval_set<snapid_t> snap_collections; // obsolete
3374 ::decode(snap_collections, p);
3375 ::decode(info.purged_snaps, p);
3376 return 0;
3377 }
3378
3379 void PG::read_state(ObjectStore *store, bufferlist &bl)
3380 {
3381 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3382 info_struct_v);
3383 assert(r >= 0);
3384
3385 last_written_info = info;
3386
3387 // if we are upgrading from jewel, we need to force rebuild of
3388 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3389 // (before kraken). persisted missing set was circa
3390 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3391 // v8 was pre-jewel (per-pg meta object).
3392 bool force_rebuild_missing = info_struct_v < 9;
3393 if (force_rebuild_missing) {
3394 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3395 << dendl;
3396 }
3397
3398 ostringstream oss;
3399 pg_log.read_log_and_missing(
3400 store,
3401 coll,
3402 info_struct_v < 8 ? coll_t::meta() : coll,
3403 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3404 info,
3405 force_rebuild_missing,
3406 oss,
3407 cct->_conf->osd_ignore_stale_divergent_priors,
3408 cct->_conf->osd_debug_verify_missing_on_start);
3409 if (oss.tellp())
3410 osd->clog->error() << oss.str();
3411
3412 if (force_rebuild_missing) {
3413 dout(10) << __func__ << " forced rebuild of missing got "
3414 << pg_log.get_missing()
3415 << dendl;
3416 }
3417
3418 // log any weirdness
3419 log_weirdness();
3420 }
3421
3422 void PG::log_weirdness()
3423 {
3424 if (pg_log.get_tail() != info.log_tail)
3425 osd->clog->error() << info.pgid
3426 << " info mismatch, log.tail " << pg_log.get_tail()
3427 << " != info.log_tail " << info.log_tail;
3428 if (pg_log.get_head() != info.last_update)
3429 osd->clog->error() << info.pgid
3430 << " info mismatch, log.head " << pg_log.get_head()
3431 << " != info.last_update " << info.last_update;
3432
3433 if (!pg_log.get_log().empty()) {
3434 // sloppy check
3435 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3436 osd->clog->error() << info.pgid
3437 << " log bound mismatch, info (tail,head] ("
3438 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3439 << " actual ["
3440 << pg_log.get_log().log.begin()->version << ","
3441 << pg_log.get_log().log.rbegin()->version << "]";
3442 }
3443
3444 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3445 osd->clog->error() << info.pgid
3446 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3447 << " > log size " << pg_log.get_log().log.size();
3448 }
3449 }
3450
3451 void PG::update_snap_map(
3452 const vector<pg_log_entry_t> &log_entries,
3453 ObjectStore::Transaction &t)
3454 {
3455 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3456 i != log_entries.end();
3457 ++i) {
3458 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3459 if (i->soid.snap < CEPH_MAXSNAP) {
3460 if (i->is_delete()) {
3461 int r = snap_mapper.remove_oid(
3462 i->soid,
3463 &_t);
3464 assert(r == 0);
3465 } else if (i->is_update()) {
3466 assert(i->snaps.length() > 0);
3467 vector<snapid_t> snaps;
3468 bufferlist snapbl = i->snaps;
3469 bufferlist::iterator p = snapbl.begin();
3470 try {
3471 ::decode(snaps, p);
3472 } catch (...) {
3473 derr << __func__ << " decode snaps failure on " << *i << dendl;
3474 snaps.clear();
3475 }
3476 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3477
3478 if (i->is_clone() || i->is_promote()) {
3479 snap_mapper.add_oid(
3480 i->soid,
3481 _snaps,
3482 &_t);
3483 } else if (i->is_modify()) {
3484 assert(i->is_modify());
3485 int r = snap_mapper.update_snaps(
3486 i->soid,
3487 _snaps,
3488 0,
3489 &_t);
3490 assert(r == 0);
3491 } else {
3492 assert(i->is_clean());
3493 }
3494 }
3495 }
3496 }
3497 }
3498
3499 /**
3500 * filter trimming|trimmed snaps out of snapcontext
3501 */
3502 void PG::filter_snapc(vector<snapid_t> &snaps)
3503 {
3504 // nothing needs trimming; we can return immediately
3505 if(snap_trimq.empty() && info.purged_snaps.empty())
3506 return;
3507
3508 bool filtering = false;
3509 vector<snapid_t> newsnaps;
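// copy lazily: newsnaps is only built once the first snap that must be
// filtered out is encountered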
3510 for (vector<snapid_t>::iterator p = snaps.begin();
3511 p != snaps.end();
3512 ++p) {
3513 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3514 if (!filtering) {
3515 // start building a new vector with what we've seen so far
3516 dout(10) << "filter_snapc filtering " << snaps << dendl;
3517 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3518 filtering = true;
3519 }
3520 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3521 } else {
3522 if (filtering)
3523 newsnaps.push_back(*p); // continue building new vector
3524 }
3525 }
3526 if (filtering) {
3527 snaps.swap(newsnaps);
3528 dout(10) << "filter_snapc result " << snaps << dendl;
3529 }
3530 }
3531
3532 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3533 {
3534 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3535 it != m.end();
3536 ++it)
3537 requeue_ops(it->second);
3538 m.clear();
3539 }
3540
3541 void PG::requeue_op(OpRequestRef op)
3542 {
3543 auto p = waiting_for_map.find(op->get_source());
3544 if (p != waiting_for_map.end()) {
3545 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3546 << dendl;
3547 p->second.push_front(op);
3548 } else {
3549 dout(20) << __func__ << " " << op << dendl;
3550 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3551 }
3552 }
3553
3554 void PG::requeue_ops(list<OpRequestRef> &ls)
3555 {
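// walk the list in reverse so that repeated push_front/enqueue_front
// restores the original op ordering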
3556 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3557 i != ls.rend();
3558 ++i) {
3559 auto p = waiting_for_map.find((*i)->get_source());
3560 if (p != waiting_for_map.end()) {
3561 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3562 << ")" << dendl;
3563 p->second.push_front(*i);
3564 } else {
3565 dout(20) << __func__ << " " << *i << dendl;
3566 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3567 }
3568 }
3569 ls.clear();
3570 }
3571
3572 void PG::requeue_map_waiters()
3573 {
3574 epoch_t epoch = get_osdmap()->get_epoch();
3575 auto p = waiting_for_map.begin();
3576 while (p != waiting_for_map.end()) {
3577 if (epoch < p->second.front()->min_epoch) {
3578 dout(20) << __func__ << " " << p->first << " front op "
3579 << p->second.front() << " must still wait, doing nothing"
3580 << dendl;
3581 ++p;
3582 } else {
3583 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3584 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3585 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3586 }
3587 p = waiting_for_map.erase(p);
3588 }
3589 }
3590 }
3591
3592
3593 // ==========================================================================================
3594 // SCRUB
3595
3596 /*
3597 * when holding the pg lock and sched_scrub_lock, the states are:
3598 * scheduling:
3599 * scrubber.reserved = true
3600 * scrubber.reserved_peers includes whoami
3601 * osd->scrub_pending++
3602 * scheduling, replica declined:
3603 * scrubber.reserved = true
3604 * scrubber.reserved_peers includes -1
3605 * osd->scrub_pending++
3606 * pending:
3607 * scrubber.reserved = true
3608 * scrubber.reserved_peers.size() == acting.size();
3609 * pg on scrub_wq
3610 * osd->scrub_pending++
3611 * scrubbing:
3612 * scrubber.reserved = false;
3613 * scrubber.reserved_peers empty
3614 * osd->scrubber.active++
3615 */
3616
3617 // returns true if a scrub has been newly kicked off
3618 bool PG::sched_scrub()
3619 {
3620 bool nodeep_scrub = false;
3621 assert(is_locked());
3622 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3623 return false;
3624 }
3625
3626 double deep_scrub_interval = 0;
3627 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3628 if (deep_scrub_interval <= 0) {
3629 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3630 }
3631 bool time_for_deep = ceph_clock_now() >=
3632 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3633
3634 bool deep_coin_flip = false;
3635 // Only add random deep scrubs when NOT a user-initiated scrub
3636 if (!scrubber.must_scrub)
3637 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3638 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3639
3640 time_for_deep = (time_for_deep || deep_coin_flip);
3641
3642 // NODEEP_SCRUB is set, so ignore time-initiated deep scrubs
3643 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3644 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3645 time_for_deep = false;
3646 nodeep_scrub = true;
3647 }
3648
3649 if (!scrubber.must_scrub) {
3650 assert(!scrubber.must_deep_scrub);
3651
3652 // NOSCRUB is set, so skip regular scrubs
3653 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3654 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3655 if (scrubber.reserved) {
3656 // cancel the scrub if it is still being scheduled, so that pgs from
3657 // other pools where scrubs are still legal have a chance to go ahead
3658 // with scrubbing.
3659 clear_scrub_reserved();
3660 scrub_unreserve_replicas();
3661 }
3662 return false;
3663 }
3664 }
3665
3666 if (cct->_conf->osd_scrub_auto_repair
3667 && get_pgbackend()->auto_repair_supported()
3668 && time_for_deep
3669 // respect the user's command and do not auto-repair
3670 && !scrubber.must_repair
3671 && !scrubber.must_scrub
3672 && !scrubber.must_deep_scrub) {
3673 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3674 scrubber.auto_repair = true;
3675 } else {
3676 // this happens when the user issues a scrub/repair command while the
3677 // scrub/repair is still being scheduled (e.g. requesting reservations)
3678 scrubber.auto_repair = false;
3679 }
3680
3681 bool ret = true;
3682 if (!scrubber.reserved) {
3683 assert(scrubber.reserved_peers.empty());
3684 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3685 osd->inc_scrubs_pending()) {
3686 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3687 scrubber.reserved = true;
3688 scrubber.reserved_peers.insert(pg_whoami);
3689 scrub_reserve_replicas();
3690 } else {
3691 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3692 ret = false;
3693 }
3694 }
3695 if (scrubber.reserved) {
3696 if (scrubber.reserve_failed) {
3697 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3698 clear_scrub_reserved();
3699 scrub_unreserve_replicas();
3700 ret = false;
3701 } else if (scrubber.reserved_peers.size() == acting.size()) {
3702 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3703 if (time_for_deep) {
3704 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3705 state_set(PG_STATE_DEEP_SCRUB);
3706 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3707 if (!nodeep_scrub) {
3708 osd->clog->info() << "osd." << osd->whoami
3709 << " pg " << info.pgid
3710 << " Deep scrub errors, upgrading scrub to deep-scrub";
3711 state_set(PG_STATE_DEEP_SCRUB);
3712 } else if (!scrubber.must_scrub) {
3713 osd->clog->error() << "osd." << osd->whoami
3714 << " pg " << info.pgid
3715 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3716 clear_scrub_reserved();
3717 scrub_unreserve_replicas();
3718 return false;
3719 } else {
3720 osd->clog->error() << "osd." << osd->whoami
3721 << " pg " << info.pgid
3722 << " Regular scrub request, deep-scrub details will be lost";
3723 }
3724 }
3725 queue_scrub();
3726 } else {
3727 // none declined, since scrubber.reserved is set
3728 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3729 }
3730 }
3731
3732 return ret;
3733 }
3734
3735 void PG::reg_next_scrub()
3736 {
3737 if (!is_primary())
3738 return;
3739
3740 utime_t reg_stamp;
3741 if (scrubber.must_scrub ||
3742 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3743 reg_stamp = ceph_clock_now();
3744 } else {
3745 reg_stamp = info.history.last_scrub_stamp;
3746 }
3747 // note down the sched_time, so we can locate this scrub, and remove it
3748 // later on.
3749 double scrub_min_interval = 0, scrub_max_interval = 0;
3750 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3751 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3752 assert(scrubber.scrub_reg_stamp == utime_t());
3753 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3754 reg_stamp,
3755 scrub_min_interval,
3756 scrub_max_interval,
3757 scrubber.must_scrub);
3758 }
3759
3760 void PG::unreg_next_scrub()
3761 {
3762 if (is_primary()) {
3763 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3764 scrubber.scrub_reg_stamp = utime_t();
3765 }
3766 }
3767
3768 void PG::do_replica_scrub_map(OpRequestRef op)
3769 {
3770 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3771 dout(7) << __func__ << " " << *m << dendl;
3772 if (m->map_epoch < info.history.same_interval_since) {
3773 dout(10) << __func__ << " discarding old from "
3774 << m->map_epoch << " < " << info.history.same_interval_since
3775 << dendl;
3776 return;
3777 }
3778 if (!scrubber.is_chunky_scrub_active()) {
3779 dout(10) << __func__ << " scrub isn't active" << dendl;
3780 return;
3781 }
3782
3783 op->mark_started();
3784
3785 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3786 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3787 dout(10) << "map version is "
3788 << scrubber.received_maps[m->from].valid_through
3789 << dendl;
3790
3791 --scrubber.waiting_on;
3792 scrubber.waiting_on_whom.erase(m->from);
3793 if (scrubber.waiting_on == 0) {
3794 if (ops_blocked_by_scrub()) {
3795 requeue_scrub(true);
3796 } else {
3797 requeue_scrub(false);
3798 }
3799 }
3800 }
3801
3802 void PG::sub_op_scrub_map(OpRequestRef op)
3803 {
3804 // for legacy jewel compatibility only
3805 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3806 assert(m->get_type() == MSG_OSD_SUBOP);
3807 dout(7) << "sub_op_scrub_map" << dendl;
3808
3809 if (m->map_epoch < info.history.same_interval_since) {
3810 dout(10) << "sub_op_scrub discarding old sub_op from "
3811 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3812 return;
3813 }
3814
3815 if (!scrubber.is_chunky_scrub_active()) {
3816 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3817 return;
3818 }
3819
3820 op->mark_started();
3821
3822 dout(10) << " got " << m->from << " scrub map" << dendl;
3823 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3824
3825 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3826 dout(10) << "map version is "
3827 << scrubber.received_maps[m->from].valid_through
3828 << dendl;
3829
3830 --scrubber.waiting_on;
3831 scrubber.waiting_on_whom.erase(m->from);
3832
3833 if (scrubber.waiting_on == 0) {
3834 if (ops_blocked_by_scrub()) {
3835 requeue_scrub(true);
3836 } else {
3837 requeue_scrub(false);
3838 }
3839 }
3840 }
3841
3842 // send scrub v3 messages (chunky scrub)
3843 void PG::_request_scrub_map(
3844 pg_shard_t replica, eversion_t version,
3845 hobject_t start, hobject_t end,
3846 bool deep, uint32_t seed)
3847 {
3848 assert(replica != pg_whoami);
3849 dout(10) << "scrub requesting scrubmap from osd." << replica
3850 << " deep " << (int)deep << " seed " << seed << dendl;
3851 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3852 spg_t(info.pgid.pgid, replica.shard), version,
3853 get_osdmap()->get_epoch(),
3854 get_last_peering_reset(),
3855 start, end, deep, seed);
3856 // default priority, we want the rep scrub processed prior to any recovery
3857 // or client io messages (we are holding a lock!)
3858 osd->send_message_osd_cluster(
3859 replica.osd, repscrubop, get_osdmap()->get_epoch());
3860 }
3861
3862 void PG::handle_scrub_reserve_request(OpRequestRef op)
3863 {
3864 dout(7) << __func__ << " " << *op->get_req() << dendl;
3865 op->mark_started();
3866 if (scrubber.reserved) {
3867 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3868 << dendl;
3869 return;
3870 }
3871 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3872 osd->inc_scrubs_pending()) {
3873 scrubber.reserved = true;
3874 } else {
3875 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
3876 scrubber.reserved = false;
3877 }
3878 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3879 const MOSDScrubReserve *m =
3880 static_cast<const MOSDScrubReserve*>(op->get_req());
3881 Message *reply = new MOSDScrubReserve(
3882 spg_t(info.pgid.pgid, primary.shard),
3883 m->map_epoch,
3884 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3885 pg_whoami);
3886 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3887 } else {
3888 // for jewel compat only
3889 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3890 assert(req->get_type() == MSG_OSD_SUBOP);
3891 MOSDSubOpReply *reply = new MOSDSubOpReply(
3892 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3893 ::encode(scrubber.reserved, reply->get_data());
3894 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3895 }
3896 }
3897
3898 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3899 {
3900 dout(7) << __func__ << " " << *op->get_req() << dendl;
3901 op->mark_started();
3902 if (!scrubber.reserved) {
3903 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3904 return;
3905 }
3906 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3907 dout(10) << " already had osd." << from << " reserved" << dendl;
3908 } else {
3909 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3910 scrubber.reserved_peers.insert(from);
3911 sched_scrub();
3912 }
3913 }
3914
3915 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3916 {
3917 dout(7) << __func__ << " " << *op->get_req() << dendl;
3918 op->mark_started();
3919 if (!scrubber.reserved) {
3920 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3921 return;
3922 }
3923 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3924 dout(10) << " already had osd." << from << " reserved" << dendl;
3925 } else {
3926 /* One decline stops this pg from being scheduled for scrubbing. */
3927 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3928 scrubber.reserve_failed = true;
3929 sched_scrub();
3930 }
3931 }
3932
3933 void PG::handle_scrub_reserve_release(OpRequestRef op)
3934 {
3935 dout(7) << __func__ << " " << *op->get_req() << dendl;
3936 op->mark_started();
3937 clear_scrub_reserved();
3938 }
3939
3940 void PG::reject_reservation()
3941 {
3942 osd->send_message_osd_cluster(
3943 primary.osd,
3944 new MBackfillReserve(
3945 MBackfillReserve::REJECT,
3946 spg_t(info.pgid.pgid, primary.shard),
3947 get_osdmap()->get_epoch()),
3948 get_osdmap()->get_epoch());
3949 }
3950
3951 void PG::schedule_backfill_retry(float delay)
3952 {
3953 Mutex::Locker lock(osd->recovery_request_lock);
3954 osd->recovery_request_timer.add_event_after(
3955 delay,
3956 new QueuePeeringEvt<RequestBackfill>(
3957 this, get_osdmap()->get_epoch(),
3958 RequestBackfill()));
3959 }
3960
3961 void PG::schedule_recovery_retry(float delay)
3962 {
3963 Mutex::Locker lock(osd->recovery_request_lock);
3964 osd->recovery_request_timer.add_event_after(
3965 delay,
3966 new QueuePeeringEvt<DoRecovery>(
3967 this, get_osdmap()->get_epoch(),
3968 DoRecovery()));
3969 }
3970
3971 void PG::clear_scrub_reserved()
3972 {
3973 scrubber.reserved_peers.clear();
3974 scrubber.reserve_failed = false;
3975
3976 if (scrubber.reserved) {
3977 scrubber.reserved = false;
3978 osd->dec_scrubs_pending();
3979 }
3980 }
3981
3982 void PG::scrub_reserve_replicas()
3983 {
3984 assert(backfill_targets.empty());
3985 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3986 i != actingbackfill.end();
3987 ++i) {
3988 if (*i == pg_whoami) continue;
3989 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3990 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3991 osd->send_message_osd_cluster(
3992 i->osd,
3993 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3994 get_osdmap()->get_epoch(),
3995 MOSDScrubReserve::REQUEST, pg_whoami),
3996 get_osdmap()->get_epoch());
3997 } else {
3998 // for jewel compat only
3999 vector<OSDOp> scrub(1);
4000 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
4001 hobject_t poid;
4002 eversion_t v;
4003 osd_reqid_t reqid;
4004 MOSDSubOp *subop = new MOSDSubOp(
4005 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4006 get_osdmap()->get_epoch(), osd->get_tid(), v);
4007 subop->ops = scrub;
4008 osd->send_message_osd_cluster(
4009 i->osd, subop, get_osdmap()->get_epoch());
4010 }
4011 }
4012 }
4013
4014 void PG::scrub_unreserve_replicas()
4015 {
4016 assert(backfill_targets.empty());
4017 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4018 i != actingbackfill.end();
4019 ++i) {
4020 if (*i == pg_whoami) continue;
4021 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4022 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4023 osd->send_message_osd_cluster(
4024 i->osd,
4025 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4026 get_osdmap()->get_epoch(),
4027 MOSDScrubReserve::RELEASE, pg_whoami),
4028 get_osdmap()->get_epoch());
4029 } else {
4030 // for jewel compat only
4031 vector<OSDOp> scrub(1);
4032 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
4033 hobject_t poid;
4034 eversion_t v;
4035 osd_reqid_t reqid;
4036 MOSDSubOp *subop = new MOSDSubOp(
4037 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4038 get_osdmap()->get_epoch(), osd->get_tid(), v);
4039 subop->ops = scrub;
4040 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
4041 }
4042 }
4043 }
4044
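// Remove rollback objects whose generation is older than the last trimmed-to
// version; they can no longer be needed for a rollback, so treat them as
// garbage and delete them in a background transaction.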
4045 void PG::_scan_rollback_obs(
4046 const vector<ghobject_t> &rollback_obs,
4047 ThreadPool::TPHandle &handle)
4048 {
4049 ObjectStore::Transaction t;
4050 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4051 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4052 i != rollback_obs.end();
4053 ++i) {
4054 if (i->generation < trimmed_to.version) {
4055 osd->clog->error() << "osd." << osd->whoami
4056 << " pg " << info.pgid
4057 << " found obsolete rollback obj "
4058 << *i << " generation < trimmed_to "
4059 << trimmed_to
4060 << "...repaired";
4061 t.remove(coll, *i);
4062 }
4063 }
4064 if (!t.empty()) {
4065 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4066 << dendl;
4067 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4068 }
4069 }
4070
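// Walk the scrub map from highest to lowest object so each head/snapdir's
// SnapSet is decoded before its clones. For every clone, compare the snaps
// recorded in the SnapSet (or in legacy object_info) against the snap
// mapper, and rewrite the mapper entry if they disagree.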
4071 void PG::_scan_snaps(ScrubMap &smap)
4072 {
4073 hobject_t head;
4074 SnapSet snapset;
4075 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4076 i != smap.objects.rend();
4077 ++i) {
4078 const hobject_t &hoid = i->first;
4079 ScrubMap::object &o = i->second;
4080
4081 if (hoid.is_head() || hoid.is_snapdir()) {
4082 // parse the SnapSet
4083 bufferlist bl;
4084 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4085 continue;
4086 }
4087 bl.push_back(o.attrs[SS_ATTR]);
4088 auto p = bl.begin();
4089 try {
4090 ::decode(snapset, p);
4091 } catch(...) {
4092 continue;
4093 }
4094 head = hoid.get_head();
4095 // Make sure head_exists is correct for is_legacy() check
4096 if (hoid.is_head())
4097 snapset.head_exists = true;
4098 continue;
4099 }
4100 if (hoid.snap < CEPH_MAXSNAP) {
4101 // check and if necessary fix snap_mapper
4102 if (hoid.get_head() != head) {
4103 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4104 << dendl;
4105 continue;
4106 }
4107 set<snapid_t> obj_snaps;
4108 if (!snapset.is_legacy()) {
4109 auto p = snapset.clone_snaps.find(hoid.snap);
4110 if (p == snapset.clone_snaps.end()) {
4111 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4112 << dendl;
4113 continue;
4114 }
4115 obj_snaps.insert(p->second.begin(), p->second.end());
4116 } else {
4117 bufferlist bl;
4118 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4119 continue;
4120 }
4121 bl.push_back(o.attrs[OI_ATTR]);
4122 object_info_t oi;
4123 try {
4124 oi.decode(bl);
4125 } catch(...) {
4126 continue;
4127 }
4128 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4129 }
4130 set<snapid_t> cur_snaps;
4131 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4132 if (r != 0 && r != -ENOENT) {
4133 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4134 ceph_abort();
4135 }
4136 if (r == -ENOENT || cur_snaps != obj_snaps) {
4137 ObjectStore::Transaction t;
4138 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4139 if (r == 0) {
4140 r = snap_mapper.remove_oid(hoid, &_t);
4141 if (r != 0) {
4142 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4143 << dendl;
4144 ceph_abort();
4145 }
4146 osd->clog->error() << "osd." << osd->whoami
4147 << " found snap mapper error on pg "
4148 << info.pgid
4149 << " oid " << hoid << " snaps in mapper: "
4150 << cur_snaps << ", oi: "
4151 << obj_snaps
4152 << "...repaired";
4153 } else {
4154 osd->clog->error() << "osd." << osd->whoami
4155 << " found snap mapper error on pg "
4156 << info.pgid
4157 << " oid " << hoid << " snaps missing in mapper"
4158 << ", should be: "
4159 << obj_snaps
4160 << "...repaired";
4161 }
4162 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4163
4164 // wait for repair to apply to avoid confusing other bits of the system.
4165 {
4166 Cond my_cond;
4167 Mutex my_lock("PG::_scan_snaps my_lock");
4168 int r = 0;
4169 bool done;
4170 t.register_on_applied_sync(
4171 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4172 r = osd->store->apply_transaction(osr.get(), std::move(t));
4173 if (r != 0) {
4174 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4175 << dendl;
4176 } else {
4177 my_lock.Lock();
4178 while (!done)
4179 my_cond.Wait(my_lock);
4180 my_lock.Unlock();
4181 }
4182 }
4183 }
4184 }
4185 }
4186 }
4187
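// Repair object_info_t entries whose recorded soid does not match the
// object's actual oid: rewrite OI_ATTR both in the in-memory scrub map and
// on disk, logging the repair to the cluster log.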
4188 void PG::_repair_oinfo_oid(ScrubMap &smap)
4189 {
4190 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4191 i != smap.objects.rend();
4192 ++i) {
4193 const hobject_t &hoid = i->first;
4194 ScrubMap::object &o = i->second;
4195
4196 bufferlist bl;
4197 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4198 continue;
4199 }
4200 bl.push_back(o.attrs[OI_ATTR]);
4201 object_info_t oi;
4202 try {
4203 oi.decode(bl);
4204 } catch(...) {
4205 continue;
4206 }
4207 if (oi.soid != hoid) {
4208 ObjectStore::Transaction t;
4209 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4210 osd->clog->error() << "osd." << osd->whoami
4211 << " found object info error on pg "
4212 << info.pgid
4213 << " oid " << hoid << " oid in object info: "
4214 << oi.soid
4215 << "...repaired";
4216 // Fix object info
4217 oi.soid = hoid;
4218 bl.clear();
4219 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4220
4221 bufferptr bp(bl.c_str(), bl.length());
4222 o.attrs[OI_ATTR] = bp;
4223
4224 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4225 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4226 if (r != 0) {
4227 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4228 << dendl;
4229 }
4230 }
4231 }
4232 }
4233
4234 /*
4235 * build a scrub map over a chunk without releasing the lock
4236 * only used by chunky scrub
4237 */
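// The chunk is built in stages: list the objects in [start, end), scan them
// (shallow or deep) into the map, drop obsolete rollback objects, verify and
// repair the snap mapper, and fix any object_info oid mismatches.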
4238 int PG::build_scrub_map_chunk(
4239 ScrubMap &map,
4240 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4241 ThreadPool::TPHandle &handle)
4242 {
4243 dout(10) << __func__ << " [" << start << "," << end << ") "
4244 << " seed " << seed << dendl;
4245
4246 map.valid_through = info.last_update;
4247
4248 // objects
4249 vector<hobject_t> ls;
4250 vector<ghobject_t> rollback_obs;
4251 int ret = get_pgbackend()->objects_list_range(
4252 start,
4253 end,
4254 0,
4255 &ls,
4256 &rollback_obs);
4257 if (ret < 0) {
4258 dout(5) << "objects_list_range error: " << ret << dendl;
4259 return ret;
4260 }
4261
4262
4263 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4264 _scan_rollback_obs(rollback_obs, handle);
4265 _scan_snaps(map);
4266 _repair_oinfo_oid(map);
4267
4268 dout(20) << __func__ << " done" << dendl;
4269 return 0;
4270 }
4271
4272 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4273 if (!store)
4274 return;
4275 struct OnComplete : Context {
4276 std::unique_ptr<Scrub::Store> store;
4277 OnComplete(
4278 std::unique_ptr<Scrub::Store> &&store)
4279 : store(std::move(store)) {}
4280 void finish(int) override {}
4281 };
4282 store->cleanup(t);
4283 t->register_on_complete(new OnComplete(std::move(store)));
4284 assert(!store);
4285 }
4286
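// Record a bad copy of soid for later recovery: if the bad peer is a
// replica, add the object to that peer's missing set; if it is the primary
// itself, add it to our own missing set. For EC pools (or a bad primary),
// also register the good peers as recovery sources in missing_loc.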
4287 void PG::repair_object(
4288 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4289 pg_shard_t bad_peer)
4290 {
4291 list<pg_shard_t> op_shards;
4292 for (auto i : *ok_peers) {
4293 op_shards.push_back(i.second);
4294 }
4295 dout(10) << "repair_object " << soid << " bad_peer osd."
4296 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4297 ScrubMap::object &po = ok_peers->back().first;
4298 eversion_t v;
4299 bufferlist bv;
4300 bv.push_back(po.attrs[OI_ATTR]);
4301 object_info_t oi;
4302 try {
4303 bufferlist::iterator bliter = bv.begin();
4304 ::decode(oi, bliter);
4305 } catch (...) {
4306 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4307 assert(0);
4308 }
4309 if (bad_peer != primary) {
4310 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4311 } else {
4312 // We should only be scrubbing if the PG is clean.
4313 assert(waiting_for_unreadable_object.empty());
4314
4315 pg_log.missing_add(soid, oi.version, eversion_t());
4316
4317 pg_log.set_last_requested(0);
4318 dout(10) << __func__ << ": primary = " << primary << dendl;
4319 }
4320
4321 if (is_ec_pg() || bad_peer == primary) {
4322 // we'd better collect all shards for an EC pg, and prepare good peers
4323 // as the source of the pull in the case of a replicated pg.
4324 missing_loc.add_missing(soid, oi.version, eversion_t());
4325 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4326 for (i = ok_peers->begin();
4327 i != ok_peers->end();
4328 ++i)
4329 missing_loc.add_location(soid, i->second);
4330 }
4331 }
4332
4333 /* replica_scrub
4334 *
4335 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4336 * for pushes to complete in case of recent recovery. Build a single
4337 * scrubmap of objects that are in the range [msg->start, msg->end).
4338 */
4339 void PG::replica_scrub(
4340 OpRequestRef op,
4341 ThreadPool::TPHandle &handle)
4342 {
4343 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4344 assert(!scrubber.active_rep_scrub);
4345 dout(7) << "replica_scrub" << dendl;
4346
4347 if (msg->map_epoch < info.history.same_interval_since) {
4348 dout(10) << "replica_scrub discarding old replica_scrub from "
4349 << msg->map_epoch << " < " << info.history.same_interval_since
4350 << dendl;
4351 return;
4352 }
4353
4354 ScrubMap map;
4355
4356 assert(msg->chunky);
4357 if (last_update_applied < msg->scrub_to) {
4358 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4359 scrubber.active_rep_scrub = op;
4360 return;
4361 }
4362
4363 if (active_pushes > 0) {
4364 dout(10) << "waiting for active pushes to finish" << dendl;
4365 scrubber.active_rep_scrub = op;
4366 return;
4367 }
4368
4369 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4370 hobject_t start = msg->start;
4371 hobject_t end = msg->end;
4372 if (!start.is_max())
4373 start.pool = info.pgid.pool();
4374 if (!end.is_max())
4375 end.pool = info.pgid.pool();
4376
4377 build_scrub_map_chunk(
4378 map, start, end, msg->deep, msg->seed,
4379 handle);
4380
4381 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4382 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4383 spg_t(info.pgid.pgid, get_primary().shard),
4384 msg->map_epoch,
4385 pg_whoami);
4386 ::encode(map, reply->get_data());
4387 osd->send_message_osd_cluster(reply, msg->get_connection());
4388 } else {
4389 // for jewel compatibility
4390 vector<OSDOp> scrub(1);
4391 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4392 hobject_t poid;
4393 eversion_t v;
4394 osd_reqid_t reqid;
4395 MOSDSubOp *subop = new MOSDSubOp(
4396 reqid,
4397 pg_whoami,
4398 spg_t(info.pgid.pgid, get_primary().shard),
4399 poid,
4400 0,
4401 msg->map_epoch,
4402 osd->get_tid(),
4403 v);
4404 ::encode(map, subop->get_data());
4405 subop->ops = scrub;
4406 osd->send_message_osd_cluster(subop, msg->get_connection());
4407 }
4408 }
4409
4410 /* Scrub:
4411 * PG_STATE_SCRUBBING is set when the scrub is queued
4412 *
4413 * scrub will be chunky if all OSDs in the PG support chunky scrub
4414 * scrub will fail if OSDs are too old.
4415 */
4416 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4417 {
4418 if (cct->_conf->osd_scrub_sleep > 0 &&
4419 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4420 scrubber.state == PG::Scrubber::INACTIVE) &&
4421 scrubber.needs_sleep) {
4422 ceph_assert(!scrubber.sleeping);
4423 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4424
4425 // Do an async sleep so we don't block the op queue
4426 OSDService *osds = osd;
4427 spg_t pgid = get_pgid();
4428 int state = scrubber.state;
4429 auto scrub_requeue_callback =
4430 new FunctionContext([osds, pgid, state](int r) {
4431 PG *pg = osds->osd->lookup_lock_pg(pgid);
4432 if (pg == nullptr) {
4433 lgeneric_dout(osds->osd->cct, 20)
4434 << "scrub_requeue_callback: Could not find "
4435 << "PG " << pgid << " can't complete scrub requeue after sleep"
4436 << dendl;
4437 return;
4438 }
4439 pg->scrubber.sleeping = false;
4440 pg->scrubber.needs_sleep = false;
4441 lgeneric_dout(pg->cct, 20)
4442 << "scrub_requeue_callback: slept for "
4443 << ceph_clock_now() - pg->scrubber.sleep_start
4444 << ", re-queuing scrub with state " << state << dendl;
4445 pg->scrub_queued = false;
4446 pg->requeue_scrub();
4447 pg->scrubber.sleep_start = utime_t();
4448 pg->unlock();
4449 });
4450 Mutex::Locker l(osd->scrub_sleep_lock);
4451 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4452 scrub_requeue_callback);
4453 scrubber.sleeping = true;
4454 scrubber.sleep_start = ceph_clock_now();
4455 return;
4456 }
4457 if (pg_has_reset_since(queued)) {
4458 return;
4459 }
4460 assert(scrub_queued);
4461 scrub_queued = false;
4462 scrubber.needs_sleep = true;
4463
4464 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4465 dout(10) << "scrub -- not primary, not active, not clean, or not scrubbing" << dendl;
4466 state_clear(PG_STATE_SCRUBBING);
4467 state_clear(PG_STATE_REPAIR);
4468 state_clear(PG_STATE_DEEP_SCRUB);
4469 publish_stats_to_osd();
4470 return;
4471 }
4472
4473 if (!scrubber.active) {
4474 assert(backfill_targets.empty());
4475
4476 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4477
4478 dout(10) << "starting a new chunky scrub" << dendl;
4479 }
4480
4481 chunky_scrub(handle);
4482 }
4483
4484 /*
4485 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4486 * chunk.
4487 *
4488 * The object store is partitioned into chunks which end on hash boundaries. For
4489 * each chunk, the following logic is performed:
4490 *
4491 * (1) Block writes on the chunk
4492 * (2) Request maps from replicas
4493 * (3) Wait for pushes to be applied (after recovery)
4494 * (4) Wait for writes to flush on the chunk
4495 * (5) Wait for maps from replicas
4496 * (6) Compare / repair all scrub maps
4497 * (7) Wait for digest updates to apply
4498 *
4499 * This logic is encoded in the mostly linear state machine:
4500 *
4501 * +------------------+
4502 * _________v__________ |
4503 * | | |
4504 * | INACTIVE | |
4505 * |____________________| |
4506 * | |
4507 * | +----------+ |
4508 * _________v___v______ | |
4509 * | | | |
4510 * | NEW_CHUNK | | |
4511 * |____________________| | |
4512 * | | |
4513 * _________v__________ | |
4514 * | | | |
4515 * | WAIT_PUSHES | | |
4516 * |____________________| | |
4517 * | | |
4518 * _________v__________ | |
4519 * | | | |
4520 * | WAIT_LAST_UPDATE | | |
4521 * |____________________| | |
4522 * | | |
4523 * _________v__________ | |
4524 * | | | |
4525 * | BUILD_MAP | | |
4526 * |____________________| | |
4527 * | | |
4528 * _________v__________ | |
4529 * | | | |
4530 * | WAIT_REPLICAS | | |
4531 * |____________________| | |
4532 * | | |
4533 * _________v__________ | |
4534 * | | | |
4535 * | COMPARE_MAPS | | |
4536 * |____________________| | |
4537 * | | |
4538 * | | |
4539 * _________v__________ | |
4540 * | | | |
4541 * |WAIT_DIGEST_UPDATES | | |
4542 * |____________________| | |
4543 * | | | |
4544 * | +----------+ |
4545 * _________v__________ |
4546 * | | |
4547 * | FINISH | |
4548 * |____________________| |
4549 * | |
4550 * +------------------+
4551 *
4552 * The primary determines the last update that affects the chunk by walking the log. If
4553 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4554 * to wait until that update is applied before building a scrub map. Both the
4555 * primary and replicas will wait for any active pushes to be applied.
4556 *
4557 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4558 *
4559 * scrubber.state encodes the current state of the scrub (refer to state diagram
4560 * for details).
4561 */
4562 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4563 {
4564 // check for map changes
4565 if (scrubber.is_chunky_scrub_active()) {
4566 if (scrubber.epoch_start != info.history.same_interval_since) {
4567 dout(10) << "scrub pg changed, aborting" << dendl;
4568 scrub_clear_state();
4569 scrub_unreserve_replicas();
4570 return;
4571 }
4572 }
4573
4574 bool done = false;
4575 int ret;
4576
4577 while (!done) {
4578 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4579 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4580
4581 switch (scrubber.state) {
4582 case PG::Scrubber::INACTIVE:
4583 dout(10) << "scrub start" << dendl;
4584
4585 publish_stats_to_osd();
4586 scrubber.epoch_start = info.history.same_interval_since;
4587 scrubber.active = true;
4588
4589 osd->inc_scrubs_active(scrubber.reserved);
4590 if (scrubber.reserved) {
4591 scrubber.reserved = false;
4592 scrubber.reserved_peers.clear();
4593 }
4594
4595 {
4596 ObjectStore::Transaction t;
4597 scrubber.cleanup_store(&t);
4598 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4599 info.pgid, coll));
4600 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4601 }
4602
4603 // Don't include temporary objects when scrubbing
4604 scrubber.start = info.pgid.pgid.get_hobj_start();
4605 scrubber.state = PG::Scrubber::NEW_CHUNK;
4606
4607 {
4608 bool repair = state_test(PG_STATE_REPAIR);
4609 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4610 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4611 stringstream oss;
4612 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4613 osd->clog->debug(oss);
4614 }
4615
4616 scrubber.seed = -1;
4617
4618 break;
4619
4620 case PG::Scrubber::NEW_CHUNK:
4621 scrubber.primary_scrubmap = ScrubMap();
4622 scrubber.received_maps.clear();
4623
4624 {
4625 /* get the start and end of our scrub chunk
4626 *
4627 * Our scrub chunk has an important restriction we're going to need to
4628 * respect. We can't let head or snapdir be start or end.
4629 * Using a half-open interval means that if end == head|snapdir,
4630 * we'd scrub/lock head and the clone right next to head in different
4631 * chunks which would allow us to miss clones created between
4632 * scrubbing that chunk and scrubbing the chunk including head.
4633 * This isn't true for any of the other clones since clones can
4634 * only be created "just to the left of" head. There is one exception
4635 * to this: promotion of clones which always happens to the left of the
4636 * left-most clone, but promote_object checks the scrubber in that
4637 * case, so it should be ok. Also, it's ok to "miss" clones at the
4638 * left end of the range if we are a tier because they may legitimately
4639 * not exist (see _scrub).
4640 */
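// Example: if objects_list_partial ends with candidate_end on a head or
// snapdir, the loop below walks candidate_end back onto the newest listed
// clone sharing that head (or to the object's boundary), so a head is never
// scrubbed in a different chunk from the clone adjacent to it.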
4641 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4642 hobject_t start = scrubber.start;
4643 hobject_t candidate_end;
4644 vector<hobject_t> objects;
4645 ret = get_pgbackend()->objects_list_partial(
4646 start,
4647 min,
4648 MAX(min, cct->_conf->osd_scrub_chunk_max),
4649 &objects,
4650 &candidate_end);
4651 assert(ret >= 0);
4652
4653 if (!objects.empty()) {
4654 hobject_t back = objects.back();
4655 while (candidate_end.has_snapset() &&
4656 candidate_end.get_head() == back.get_head()) {
4657 candidate_end = back;
4658 objects.pop_back();
4659 if (objects.empty()) {
4660 assert(0 ==
4661 "Somehow we got more than 2 objects which"
4662 "have the same head but are not clones");
4663 }
4664 back = objects.back();
4665 }
4666 if (candidate_end.has_snapset()) {
4667 assert(candidate_end.get_head() != back.get_head());
4668 candidate_end = candidate_end.get_object_boundary();
4669 }
4670 } else {
4671 assert(candidate_end.is_max());
4672 }
4673
4674 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4675 // we'll be requeued by whatever made us unavailable for scrub
4676 dout(10) << __func__ << ": scrub blocked somewhere in range "
4677 << "[" << scrubber.start << ", " << candidate_end << ")"
4678 << dendl;
4679 done = true;
4680 break;
4681 }
4682 scrubber.end = candidate_end;
4683 }
4684
4685 // walk the log to find the latest update that affects our chunk
4686 scrubber.subset_last_update = eversion_t();
4687 for (auto p = projected_log.log.rbegin();
4688 p != projected_log.log.rend();
4689 ++p) {
4690 if (p->soid >= scrubber.start &&
4691 p->soid < scrubber.end) {
4692 scrubber.subset_last_update = p->version;
4693 break;
4694 }
4695 }
4696 if (scrubber.subset_last_update == eversion_t()) {
4697 for (list<pg_log_entry_t>::const_reverse_iterator p =
4698 pg_log.get_log().log.rbegin();
4699 p != pg_log.get_log().log.rend();
4700 ++p) {
4701 if (p->soid >= scrubber.start &&
4702 p->soid < scrubber.end) {
4703 scrubber.subset_last_update = p->version;
4704 break;
4705 }
4706 }
4707 }
4708
4709 // ask replicas to wait until
4710 // last_update_applied >= scrubber.subset_last_update and then scan
4711 scrubber.waiting_on_whom.insert(pg_whoami);
4712 ++scrubber.waiting_on;
4713
4714 // request maps from replicas
4715 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4716 i != actingbackfill.end();
4717 ++i) {
4718 if (*i == pg_whoami) continue;
4719 _request_scrub_map(*i, scrubber.subset_last_update,
4720 scrubber.start, scrubber.end, scrubber.deep,
4721 scrubber.seed);
4722 scrubber.waiting_on_whom.insert(*i);
4723 ++scrubber.waiting_on;
4724 }
4725
4726 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4727
4728 break;
4729
4730 case PG::Scrubber::WAIT_PUSHES:
4731 if (active_pushes == 0) {
4732 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4733 } else {
4734 dout(15) << "wait for pushes to apply" << dendl;
4735 done = true;
4736 }
4737 break;
4738
4739 case PG::Scrubber::WAIT_LAST_UPDATE:
4740 if (last_update_applied >= scrubber.subset_last_update) {
4741 scrubber.state = PG::Scrubber::BUILD_MAP;
4742 } else {
4743 // will be requeued by op_applied
4744 dout(15) << "wait for writes to flush" << dendl;
4745 done = true;
4746 }
4747 break;
4748
4749 case PG::Scrubber::BUILD_MAP:
4750 assert(last_update_applied >= scrubber.subset_last_update);
4751
4752 // build my own scrub map
4753 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4754 scrubber.start, scrubber.end,
4755 scrubber.deep, scrubber.seed,
4756 handle);
4757 if (ret < 0) {
4758 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4759 scrub_clear_state();
4760 scrub_unreserve_replicas();
4761 return;
4762 }
4763
4764 --scrubber.waiting_on;
4765 scrubber.waiting_on_whom.erase(pg_whoami);
4766
4767 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4768 break;
4769
4770 case PG::Scrubber::WAIT_REPLICAS:
4771 if (scrubber.waiting_on > 0) {
4772 // will be requeued by sub_op_scrub_map
4773 dout(10) << "wait for replicas to build scrub map" << dendl;
4774 done = true;
4775 } else {
4776 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4777 }
4778 break;
4779
4780 case PG::Scrubber::COMPARE_MAPS:
4781 assert(last_update_applied >= scrubber.subset_last_update);
4782 assert(scrubber.waiting_on == 0);
4783
4784 scrub_compare_maps();
4785 scrubber.start = scrubber.end;
4786 scrubber.run_callbacks();
4787
4788 // requeue the writes from the chunk that just finished
4789 requeue_ops(waiting_for_scrub);
4790
4791 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4792
4793 // fall-thru
4794
4795 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4796 if (scrubber.num_digest_updates_pending) {
4797 dout(10) << __func__ << " waiting on "
4798 << scrubber.num_digest_updates_pending
4799 << " digest updates" << dendl;
4800 done = true;
4801 break;
4802 }
4803
4804 if (!(scrubber.end.is_max())) {
4805 scrubber.state = PG::Scrubber::NEW_CHUNK;
4806 requeue_scrub();
4807 done = true;
4808 } else {
4809 scrubber.state = PG::Scrubber::FINISH;
4810 }
4811
4812 break;
4813
4814 case PG::Scrubber::FINISH:
4815 scrub_finish();
4816 scrubber.state = PG::Scrubber::INACTIVE;
4817 done = true;
4818
4819 if (!snap_trimq.empty()) {
4820 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4821 snap_trimmer_scrub_complete();
4822 }
4823
4824 break;
4825
4826 default:
4827 ceph_abort();
4828 }
4829 }
4830 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4831 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4832 }
4833
4834 void PG::scrub_clear_state()
4835 {
4836 assert(is_locked());
4837 state_clear(PG_STATE_SCRUBBING);
4838 state_clear(PG_STATE_REPAIR);
4839 state_clear(PG_STATE_DEEP_SCRUB);
4840 publish_stats_to_osd();
4841
4842 // active -> nothing.
4843 if (scrubber.active)
4844 osd->dec_scrubs_active();
4845
4846 requeue_ops(waiting_for_scrub);
4847
4848 scrubber.reset();
4849
4850 // type-specific state clear
4851 _scrub_clear_state();
4852 }
4853
4854 void PG::scrub_compare_maps()
4855 {
4856 dout(10) << __func__ << " has maps, analyzing" << dendl;
4857
4858 // construct authoritative scrub map for type specific scrubbing
4859 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4860 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4861
4862 if (acting.size() > 1) {
4863 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4864
4865 stringstream ss;
4866
4867 // Map from object with errors to good peer
4868 map<hobject_t, list<pg_shard_t>> authoritative;
4869 map<pg_shard_t, ScrubMap *> maps;
4870
4871 dout(2) << __func__ << " osd." << acting[0] << " has "
4872 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4873 maps[pg_whoami] = &scrubber.primary_scrubmap;
4874
4875 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4876 i != actingbackfill.end();
4877 ++i) {
4878 if (*i == pg_whoami) continue;
4879 dout(2) << __func__ << " replica " << *i << " has "
4880 << scrubber.received_maps[*i].objects.size()
4881 << " items" << dendl;
4882 maps[*i] = &scrubber.received_maps[*i];
4883 }
4884
4885 get_pgbackend()->be_compare_scrubmaps(
4886 maps,
4887 state_test(PG_STATE_REPAIR),
4888 scrubber.missing,
4889 scrubber.inconsistent,
4890 authoritative,
4891 missing_digest,
4892 scrubber.shallow_errors,
4893 scrubber.deep_errors,
4894 scrubber.store.get(),
4895 info.pgid, acting,
4896 ss);
4897 dout(2) << ss.str() << dendl;
4898
4899 if (!ss.str().empty()) {
4900 osd->clog->error(ss);
4901 }
4902
4903 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4904 i != authoritative.end();
4905 ++i) {
4906 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4907 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4908 j != i->second.end();
4909 ++j) {
4910 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4911 }
4912 scrubber.authoritative.insert(
4913 make_pair(
4914 i->first,
4915 good_peers));
4916 }
4917
4918 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4919 i != authoritative.end();
4920 ++i) {
4921 scrubber.cleaned_meta_map.objects.erase(i->first);
4922 scrubber.cleaned_meta_map.objects.insert(
4923 *(maps[i->second.back()]->objects.find(i->first))
4924 );
4925 }
4926 }
4927
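// Carve off only complete head+clone groups for metadata scrubbing: the
// trailing run of objects that share the final head stays in
// cleaned_meta_map, since the rest of that group may arrive with a later
// chunk; everything before it is handed to scrub_snapshot_metadata below.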
4928 ScrubMap for_meta_scrub;
4929 if (scrubber.end.is_max() ||
4930 scrubber.cleaned_meta_map.objects.empty()) {
4931 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4932 } else {
4933 auto iter = scrubber.cleaned_meta_map.objects.end();
4934 --iter; // not empty, see the if clause above
4935 auto begin = scrubber.cleaned_meta_map.objects.begin();
4936 while (iter != begin) {
4937 auto next = iter--;
4938 if (next->first.get_head() != iter->first.get_head()) {
4939 ++iter;
4940 break;
4941 }
4942 }
4943 for_meta_scrub.objects.insert(begin, iter);
4944 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4945 }
4946
4947 // ok, do the pg-type specific scrubbing
4948 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4949 if (!scrubber.store->empty()) {
4950 if (state_test(PG_STATE_REPAIR)) {
4951 dout(10) << __func__ << ": discarding scrub results" << dendl;
4952 scrubber.store->flush(nullptr);
4953 } else {
4954 dout(10) << __func__ << ": updating scrub object" << dendl;
4955 ObjectStore::Transaction t;
4956 scrubber.store->flush(&t);
4957 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4958 }
4959 }
4960 }
4961
4962 bool PG::scrub_process_inconsistent()
4963 {
4964 dout(10) << __func__ << ": checking authoritative" << dendl;
4965 bool repair = state_test(PG_STATE_REPAIR);
4966 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4967 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4968
4969 // scrubber.authoritative only holds objects that are missing or inconsistent.
4970 if (!scrubber.authoritative.empty()) {
4971 stringstream ss;
4972 ss << info.pgid << " " << mode << " "
4973 << scrubber.missing.size() << " missing, "
4974 << scrubber.inconsistent.size() << " inconsistent objects";
4975 dout(2) << ss.str() << dendl;
4976 osd->clog->error(ss);
4977 if (repair) {
4978 state_clear(PG_STATE_CLEAN);
4979 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4980 scrubber.authoritative.begin();
4981 i != scrubber.authoritative.end();
4982 ++i) {
4983 set<pg_shard_t>::iterator j;
4984
4985 auto missing_entry = scrubber.missing.find(i->first);
4986 if (missing_entry != scrubber.missing.end()) {
4987 for (j = missing_entry->second.begin();
4988 j != missing_entry->second.end();
4989 ++j) {
4990 repair_object(
4991 i->first,
4992 &(i->second),
4993 *j);
4994 ++scrubber.fixed;
4995 }
4996 }
4997 if (scrubber.inconsistent.count(i->first)) {
4998 for (j = scrubber.inconsistent[i->first].begin();
4999 j != scrubber.inconsistent[i->first].end();
5000 ++j) {
5001 repair_object(i->first,
5002 &(i->second),
5003 *j);
5004 ++scrubber.fixed;
5005 }
5006 }
5007 }
5008 }
5009 }
5010 return (!scrubber.authoritative.empty() && repair);
5011 }
5012
5013 bool PG::ops_blocked_by_scrub() const {
5014 return (waiting_for_scrub.size() != 0);
5015 }
5016
5017 // the part that actually finalizes a scrub
5018 void PG::scrub_finish()
5019 {
5020 bool repair = state_test(PG_STATE_REPAIR);
5021 // if the repair request comes from auto-repair and a large number of
5022 // errors were found, cancel the auto-repair
5023 if (repair && scrubber.auto_repair
5024 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5025 state_clear(PG_STATE_REPAIR);
5026 repair = false;
5027 }
5028 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5029 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5030
5031 // type-specific finish (can tally more errors)
5032 _scrub_finish();
5033
5034 bool has_error = scrub_process_inconsistent();
5035
5036 {
5037 stringstream oss;
5038 oss << info.pgid.pgid << " " << mode << " ";
5039 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5040 if (total_errors)
5041 oss << total_errors << " errors";
5042 else
5043 oss << "ok";
5044 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5045 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5046 << " remaining deep scrub error details lost)";
5047 if (repair)
5048 oss << ", " << scrubber.fixed << " fixed";
5049 if (total_errors)
5050 osd->clog->error(oss);
5051 else
5052 osd->clog->debug(oss);
5053 }
5054
5055 // finish up
5056 unreg_next_scrub();
5057 utime_t now = ceph_clock_now();
5058 info.history.last_scrub = info.last_update;
5059 info.history.last_scrub_stamp = now;
5060 if (scrubber.deep) {
5061 info.history.last_deep_scrub = info.last_update;
5062 info.history.last_deep_scrub_stamp = now;
5063 }
5064 // Since we don't know which errors were fixed, we can only clear them
5065 // when every one has been fixed.
5066 if (repair) {
5067 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5068 assert(deep_scrub);
5069 scrubber.shallow_errors = scrubber.deep_errors = 0;
5070 } else {
5071 // Deep scrub in order to get corrected error counts
5072 scrub_after_recovery = true;
5073 }
5074 }
5075 if (deep_scrub) {
5076 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5077 info.history.last_clean_scrub_stamp = now;
5078 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5079 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5080 } else {
5081 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5082 // XXX: last_clean_scrub_stamp does not guarantee the pg is free of
5083 // inconsistencies caused by deep-scrub errors
5084 if (scrubber.shallow_errors == 0)
5085 info.history.last_clean_scrub_stamp = now;
5086 }
5087 info.stats.stats.sum.num_scrub_errors =
5088 info.stats.stats.sum.num_shallow_scrub_errors +
5089 info.stats.stats.sum.num_deep_scrub_errors;
5090 reg_next_scrub();
5091
5092 {
5093 ObjectStore::Transaction t;
5094 dirty_info = true;
5095 write_if_dirty(t);
5096 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5097 assert(tr == 0);
5098 }
5099
5100
5101 if (has_error) {
5102 queue_peering_event(
5103 CephPeeringEvtRef(
5104 std::make_shared<CephPeeringEvt>(
5105 get_osdmap()->get_epoch(),
5106 get_osdmap()->get_epoch(),
5107 DoRecovery())));
5108 }
5109
5110 scrub_clear_state();
5111 scrub_unreserve_replicas();
5112
5113 if (is_active() && is_primary()) {
5114 share_pg_info();
5115 }
5116 }
5117
5118 void PG::share_pg_info()
5119 {
5120 dout(10) << "share_pg_info" << dendl;
5121
5122 // share new pg_info_t with replicas
5123 assert(!actingbackfill.empty());
5124 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5125 i != actingbackfill.end();
5126 ++i) {
5127 if (*i == pg_whoami) continue;
5128 pg_shard_t peer = *i;
5129 if (peer_info.count(peer)) {
5130 peer_info[peer].last_epoch_started = info.last_epoch_started;
5131 peer_info[peer].last_interval_started = info.last_interval_started;
5132 peer_info[peer].history.merge(info.history);
5133 }
5134 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5135 m->pg_list.push_back(
5136 make_pair(
5137 pg_notify_t(
5138 peer.shard, pg_whoami.shard,
5139 get_osdmap()->get_epoch(),
5140 get_osdmap()->get_epoch(),
5141 info),
5142 PastIntervals()));
5143 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5144 }
5145 }
5146
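// Append the given log entries to the PG log and update the local missing
// set (respecting last_backfill), optionally rolling the log forward and
// trimming to trim_to. Advances info.last_update/last_complete and returns
// whether the PG stats were invalidated.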
5147 bool PG::append_log_entries_update_missing(
5148 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5149 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5150 boost::optional<eversion_t> roll_forward_to)
5151 {
5152 assert(!entries.empty());
5153 assert(entries.begin()->version > info.last_update);
5154
5155 PGLogEntryHandler rollbacker{this, &t};
5156 if (roll_forward_to) {
5157 pg_log.roll_forward(&rollbacker);
5158 }
5159 bool invalidate_stats =
5160 pg_log.append_new_log_entries(info.last_backfill,
5161 info.last_backfill_bitwise,
5162 entries,
5163 &rollbacker);
5164
5165 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5166 pg_log.roll_forward(&rollbacker);
5167 }
5168 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5169 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5170 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5171 }
5172
5173 info.last_update = pg_log.get_head();
5174
5175 if (pg_log.get_missing().num_missing() == 0) {
5176 // advance last_complete since nothing else is missing!
5177 info.last_complete = info.last_update;
5178 }
5179 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5180
5181 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5182 if (trim_to)
5183 pg_log.trim(*trim_to, info);
5184 dirty_info = true;
5185 write_if_dirty(t);
5186 return invalidate_stats;
5187 }
5188
5189
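// Primary-side wrapper: apply the entries locally via
// append_log_entries_update_missing, mirror the update into each
// actingbackfill peer's peer_missing/peer_info, and, if any of the updates
// invalidated stats, rebuild missing_loc for the affected objects.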
5190 void PG::merge_new_log_entries(
5191 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5192 ObjectStore::Transaction &t,
5193 boost::optional<eversion_t> trim_to,
5194 boost::optional<eversion_t> roll_forward_to)
5195 {
5196 dout(10) << __func__ << " " << entries << dendl;
5197 assert(is_primary());
5198
5199 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5200 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5201 i != actingbackfill.end();
5202 ++i) {
5203 pg_shard_t peer(*i);
5204 if (peer == pg_whoami) continue;
5205 assert(peer_missing.count(peer));
5206 assert(peer_info.count(peer));
5207 pg_missing_t& pmissing(peer_missing[peer]);
5208 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5209 pg_info_t& pinfo(peer_info[peer]);
5210 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5211 pinfo.last_backfill,
5212 info.last_backfill_bitwise,
5213 entries,
5214 true,
5215 NULL,
5216 pmissing,
5217 NULL,
5218 this);
5219 pinfo.last_update = info.last_update;
5220 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5221 rebuild_missing = rebuild_missing || invalidate_stats;
5222 }
5223
5224 if (!rebuild_missing) {
5225 return;
5226 }
5227
5228 for (auto &&i: entries) {
5229 missing_loc.rebuild(
5230 i.soid,
5231 pg_whoami,
5232 actingbackfill,
5233 info,
5234 pg_log.get_missing(),
5235 peer_missing,
5236 peer_info);
5237 }
5238 }
5239
5240 void PG::update_history(const pg_history_t& new_history)
5241 {
5242 unreg_next_scrub();
5243 if (info.history.merge(new_history)) {
5244 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5245 dirty_info = true;
5246 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5247 dout(20) << __func__ << " clearing past_intervals" << dendl;
5248 past_intervals.clear();
5249 dirty_big_info = true;
5250 }
5251 }
5252 reg_next_scrub();
5253 }
5254
5255 void PG::fulfill_info(
5256 pg_shard_t from, const pg_query_t &query,
5257 pair<pg_shard_t, pg_info_t> &notify_info)
5258 {
5259 assert(from == primary);
5260 assert(query.type == pg_query_t::INFO);
5261
5262 // info
5263 dout(10) << "sending info" << dendl;
5264 notify_info = make_pair(from, info);
5265 }
5266
5267 void PG::fulfill_log(
5268 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5269 {
5270 dout(10) << "log request from " << from << dendl;
5271 assert(from == primary);
5272 assert(query.type != pg_query_t::INFO);
5273 ConnectionRef con = osd->get_con_osd_cluster(
5274 from.osd, get_osdmap()->get_epoch());
5275 if (!con) return;
5276
5277 MOSDPGLog *mlog = new MOSDPGLog(
5278 from.shard, pg_whoami.shard,
5279 get_osdmap()->get_epoch(),
5280 info, query_epoch);
5281 mlog->missing = pg_log.get_missing();
5282
5283 // primary -> other, when building master log
5284 if (query.type == pg_query_t::LOG) {
5285 dout(10) << " sending info+missing+log since " << query.since
5286 << dendl;
5287 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5288 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5289 << " when my log.tail is " << pg_log.get_tail()
5290 << ", sending full log instead";
5291 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5292 } else
5293 mlog->log.copy_after(pg_log.get_log(), query.since);
5294 }
5295 else if (query.type == pg_query_t::FULLLOG) {
5296 dout(10) << " sending info+missing+full log" << dendl;
5297 mlog->log = pg_log.get_log();
5298 }
5299
5300 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5301
5302 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5303 osd->send_message_osd_cluster(mlog, con.get());
5304 }
5305
5306 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5307 {
5308 bool changed = false;
5309 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5310 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5311 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5312 changed = true;
5313 }
5314 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5315 assert(pi);
5316 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5317 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5318 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5319 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5320 changed = true;
5321 }
5322 }
5323 if (changed) {
5324 info.history.last_epoch_marked_full = osdmap->get_epoch();
5325 dirty_info = true;
5326 }
5327 }
5328
5329 bool PG::should_restart_peering(
5330 int newupprimary,
5331 int newactingprimary,
5332 const vector<int>& newup,
5333 const vector<int>& newacting,
5334 OSDMapRef lastmap,
5335 OSDMapRef osdmap)
5336 {
5337 if (PastIntervals::is_new_interval(
5338 primary.osd,
5339 newactingprimary,
5340 acting,
5341 newacting,
5342 up_primary.osd,
5343 newupprimary,
5344 up,
5345 newup,
5346 osdmap,
5347 lastmap,
5348 info.pgid.pgid)) {
5349 dout(20) << "new interval newup " << newup
5350 << " newacting " << newacting << dendl;
5351 return true;
5352 } else {
5353 return false;
5354 }
5355 }
5356
5357 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5358 {
5359 if (last_peering_reset > reply_epoch ||
5360 last_peering_reset > query_epoch) {
5361 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5362 << " last_peering_reset " << last_peering_reset
5363 << dendl;
5364 return true;
5365 }
5366 return false;
5367 }
5368
5369 void PG::set_last_peering_reset()
5370 {
5371 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5372 if (last_peering_reset != get_osdmap()->get_epoch()) {
5373 last_peering_reset = get_osdmap()->get_epoch();
5374 reset_interval_flush();
5375 }
5376 }
5377
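// RAII flush trigger: start_flush() hands one shared FlushState to both the
// on_applied and on_safe callback lists of a transaction. When the last
// reference is dropped (the transaction has both applied and committed), the
// destructor queues a flushed event for the PG -- unless the PG has been reset
// since the epoch captured here, in which case the event would be stale.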
5378 struct FlushState {
5379 PGRef pg;
5380 epoch_t epoch;
5381 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5382 ~FlushState() {
5383 pg->lock();
5384 if (!pg->pg_has_reset_since(epoch))
5385 pg->queue_flushed(epoch);
5386 pg->unlock();
5387 }
5388 };
5389 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5390
5391 void PG::start_flush(ObjectStore::Transaction *t,
5392 list<Context *> *on_applied,
5393 list<Context *> *on_safe)
5394 {
5395 // flush in progress ops
5396 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5397 this, get_osdmap()->get_epoch()));
5398 t->nop();
5399 flushes_in_progress++;
5400 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5401 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5402 }
5403
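// Called from set_last_peering_reset(): ask the objectstore sequencer to queue
// an IntervalFlush peering event once everything from the prior interval has
// committed; until then, outgoing recovery messages are blocked by
// recovery_state. If the sequencer is already flushed, nothing is blocked.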
5404 void PG::reset_interval_flush()
5405 {
5406 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5407 recovery_state.clear_blocked_outgoing();
5408
5409 Context *c = new QueuePeeringEvt<IntervalFlush>(
5410 this, get_osdmap()->get_epoch(), IntervalFlush());
5411 if (!osr->flush_commit(c)) {
5412 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5413 recovery_state.begin_block_outgoing();
5414 } else {
5415 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5416 delete c;
5417 }
5418 }
5419
5420 /* Called before initializing peering during advance_map */
5421 void PG::start_peering_interval(
5422 const OSDMapRef lastmap,
5423 const vector<int>& newup, int new_up_primary,
5424 const vector<int>& newacting, int new_acting_primary,
5425 ObjectStore::Transaction *t)
5426 {
5427 const OSDMapRef osdmap = get_osdmap();
5428
5429 set_last_peering_reset();
5430
5431 vector<int> oldacting, oldup;
5432 int oldrole = get_role();
5433
5434 unreg_next_scrub();
5435
5436 pg_shard_t old_acting_primary = get_primary();
5437 pg_shard_t old_up_primary = up_primary;
5438 bool was_old_primary = is_primary();
5439 bool was_old_replica = is_replica();
5440
5441 acting.swap(oldacting);
5442 up.swap(oldup);
5443 init_primary_up_acting(
5444 newup,
5445 newacting,
5446 new_up_primary,
5447 new_acting_primary);
5448
5449 if (info.stats.up != up ||
5450 info.stats.acting != acting ||
5451 info.stats.up_primary != new_up_primary ||
5452 info.stats.acting_primary != new_acting_primary) {
5453 info.stats.up = up;
5454 info.stats.up_primary = new_up_primary;
5455 info.stats.acting = acting;
5456 info.stats.acting_primary = new_acting_primary;
5457 info.stats.mapping_epoch = osdmap->get_epoch();
5458 }
5459
5460 pg_stats_publish_lock.Lock();
5461 pg_stats_publish_valid = false;
5462 pg_stats_publish_lock.Unlock();
5463
5464 // This will now be remapped during a backfill in cases
5465 // that it would not have been before.
5466 if (up != acting)
5467 state_set(PG_STATE_REMAPPED);
5468 else
5469 state_clear(PG_STATE_REMAPPED);
5470
5471 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5472 if (pool.info.is_replicated() || role == pg_whoami.shard)
5473 set_role(role);
5474 else
5475 set_role(-1);
5476
5477 // did acting, up, primary|acker change?
5478 if (!lastmap) {
5479 dout(10) << " no lastmap" << dendl;
5480 dirty_info = true;
5481 dirty_big_info = true;
5482 info.history.same_interval_since = osdmap->get_epoch();
5483 } else {
5484 std::stringstream debug;
5485 assert(info.history.same_interval_since != 0);
5486 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5487 get_is_recoverable_predicate());
5488 bool new_interval = PastIntervals::check_new_interval(
5489 old_acting_primary.osd,
5490 new_acting_primary,
5491 oldacting, newacting,
5492 old_up_primary.osd,
5493 new_up_primary,
5494 oldup, newup,
5495 info.history.same_interval_since,
5496 info.history.last_epoch_clean,
5497 osdmap,
5498 lastmap,
5499 info.pgid.pgid,
5500 recoverable.get(),
5501 &past_intervals,
5502 &debug);
5503 dout(10) << __func__ << ": check_new_interval output: "
5504 << debug.str() << dendl;
5505 if (new_interval) {
5506 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5507 info.history.last_epoch_clean < osdmap->get_epoch()) {
5508 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5509 // our information is incomplete and useless; if osdmaps have been
5510 // trimmed, someone else was clean after everything we know about.
5511 past_intervals.clear();
5512 } else {
5513 dout(10) << " noting past " << past_intervals << dendl;
5514 }
5515 dirty_info = true;
5516 dirty_big_info = true;
5517 info.history.same_interval_since = osdmap->get_epoch();
5518 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5519 osdmap->get_pg_num(info.pgid.pgid.pool()),
5520 nullptr)) {
5521 info.history.last_epoch_split = osdmap->get_epoch();
5522 }
5523 }
5524 }
5525
5526 if (old_up_primary != up_primary ||
5527 oldup != up) {
5528 info.history.same_up_since = osdmap->get_epoch();
5529 }
5530 // this comparison includes primary rank via pg_shard_t
5531 if (old_acting_primary != get_primary()) {
5532 info.history.same_primary_since = osdmap->get_epoch();
5533 }
5534
5535 on_new_interval();
5536
5537 dout(1) << __func__ << " up " << oldup << " -> " << up
5538 << ", acting " << oldacting << " -> " << acting
5539 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5540 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5541 << ", role " << oldrole << " -> " << role
5542 << ", features acting " << acting_features
5543 << " upacting " << upacting_features
5544 << dendl;
5545
5546 // deactivate.
5547 state_clear(PG_STATE_ACTIVE);
5548 state_clear(PG_STATE_PEERED);
5549 state_clear(PG_STATE_DOWN);
5550 state_clear(PG_STATE_RECOVERY_WAIT);
5551 state_clear(PG_STATE_RECOVERY_TOOFULL);
5552 state_clear(PG_STATE_RECOVERING);
5553
5554 peer_purged.clear();
5555 actingbackfill.clear();
5556 scrub_queued = false;
5557
5558 // reset primary/replica state?
5559 if (was_old_primary || is_primary()) {
5560 osd->remove_want_pg_temp(info.pgid.pgid);
5561 } else if (was_old_replica || is_replica()) {
5562 osd->remove_want_pg_temp(info.pgid.pgid);
5563 }
5564 clear_primary_state();
5565
5566
5567 // pg->on_*
5568 on_change(t);
5569
5570 projected_last_update = eversion_t();
5571
5572 assert(!deleting);
5573
5574 // should we tell the primary we are here?
5575 send_notify = !is_primary();
5576
5577 if (role != oldrole ||
5578 was_old_primary != is_primary()) {
5579 // did primary change?
5580 if (was_old_primary != is_primary()) {
5581 state_clear(PG_STATE_CLEAN);
5582 clear_publish_stats();
5583 }
5584
5585 on_role_change();
5586
5587 // take active waiters
5588 requeue_ops(waiting_for_peered);
5589
5590 } else {
5591 // no role change.
5592 // did primary change?
5593 if (get_primary() != old_acting_primary) {
5594 dout(10) << *this << " " << oldacting << " -> " << acting
5595 << ", acting primary "
5596 << old_acting_primary << " -> " << get_primary()
5597 << dendl;
5598 } else {
5599 // primary is the same.
5600 if (is_primary()) {
5601 // i am (still) primary. but my replica set changed.
5602 state_clear(PG_STATE_CLEAN);
5603
5604 dout(10) << oldacting << " -> " << acting
5605 << ", replicas changed" << dendl;
5606 }
5607 }
5608 }
5609 cancel_recovery();
5610
5611 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5612 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5613 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5614 }
5615 }
5616
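// Per-interval (re)initialization that does not depend on the old interval:
// re-register the next scrub and recompute feature masks -- acting_features is
// the intersection of features over the acting set, upacting_features over
// both up and acting -- then let the backend react via _on_new_interval().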
5617 void PG::on_new_interval()
5618 {
5619 const OSDMapRef osdmap = get_osdmap();
5620
5621 reg_next_scrub();
5622
5623 // initialize features
5624 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5625 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5626 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5627 if (*p == CRUSH_ITEM_NONE)
5628 continue;
5629 uint64_t f = osdmap->get_xinfo(*p).features;
5630 acting_features &= f;
5631 upacting_features &= f;
5632 }
5633 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5634 if (*p == CRUSH_ITEM_NONE)
5635 continue;
5636 upacting_features &= osdmap->get_xinfo(*p).features;
5637 }
5638
5639 _on_new_interval();
5640 }
5641
5642 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5643 {
5644 assert(!is_primary());
5645
5646 update_history(oinfo.history);
5647 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
5648 info.stats.stats.sum.num_scrub_errors = 0;
5649 info.stats.stats.sum.num_shallow_scrub_errors = 0;
5650 info.stats.stats.sum.num_deep_scrub_errors = 0;
5651 dirty_info = true;
5652 }
5653
5654 if (!(info.purged_snaps == oinfo.purged_snaps)) {
5655 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
5656 << dendl;
5657 info.purged_snaps = oinfo.purged_snaps;
5658 dirty_info = true;
5659 dirty_big_info = true;
5660 }
5661 }
5662
5663 ostream& operator<<(ostream& out, const PG& pg)
5664 {
5665 out << "pg[" << pg.info
5666 << " " << pg.up;
5667 if (pg.acting != pg.up)
5668 out << "/" << pg.acting;
5669 if (pg.is_ec_pg())
5670 out << "p" << pg.get_primary();
5671 out << " r=" << pg.get_role();
5672 out << " lpr=" << pg.get_last_peering_reset();
5673
5674 if (!pg.past_intervals.empty()) {
5675 out << " pi=[" << pg.past_intervals.get_bounds()
5676 << ")/" << pg.past_intervals.size();
5677 }
5678
5679 if (pg.is_peered()) {
5680 if (pg.last_update_ondisk != pg.info.last_update)
5681 out << " luod=" << pg.last_update_ondisk;
5682 if (pg.last_update_applied != pg.info.last_update)
5683 out << " lua=" << pg.last_update_applied;
5684 }
5685
5686 if (pg.recovery_ops_active)
5687 out << " rops=" << pg.recovery_ops_active;
5688
5689 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5690 pg.pg_log.get_head() != pg.info.last_update)
5691 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5692
5693 if (!pg.pg_log.get_log().empty()) {
5694 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5695 out << " (log bound mismatch, actual=["
5696 << pg.pg_log.get_log().log.begin()->version << ","
5697 << pg.pg_log.get_log().log.rbegin()->version << "]";
5698 out << ")";
5699 }
5700 }
5701
5702 if (!pg.backfill_targets.empty())
5703 out << " bft=" << pg.backfill_targets;
5704 out << " crt=" << pg.pg_log.get_can_rollback_to();
5705
5706 if (pg.last_complete_ondisk != pg.info.last_complete)
5707 out << " lcod " << pg.last_complete_ondisk;
5708
5709 if (pg.is_primary()) {
5710 out << " mlcod " << pg.min_last_complete_ondisk;
5711 }
5712
5713 out << " " << pg_state_string(pg.get_state());
5714 if (pg.should_send_notify())
5715 out << " NOTIFY";
5716
5717 if (pg.scrubber.must_repair)
5718 out << " MUST_REPAIR";
5719 if (pg.scrubber.auto_repair)
5720 out << " AUTO_REPAIR";
5721 if (pg.scrubber.must_deep_scrub)
5722 out << " MUST_DEEP_SCRUB";
5723 if (pg.scrubber.must_scrub)
5724 out << " MUST_SCRUB";
5725
5726 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5727 if (pg.pg_log.get_missing().num_missing()) {
5728 out << " m=" << pg.pg_log.get_missing().num_missing();
5729 if (pg.is_primary()) {
5730 uint64_t unfound = pg.get_num_unfound();
5731 if (unfound)
5732 out << " u=" << unfound;
5733 }
5734 }
5735 if (pg.snap_trimq.size())
5736 out << " snaptrimq=" << pg.snap_trimq;
5737
5738 out << "]";
5739
5740
5741 return out;
5742 }
5743
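// Decide whether a client op can be dropped outright: the client has
// disconnected (and osd_discard_disconnected_ops is set), the op predates the
// current primary interval, or it predates a forced-resend epoch / the last
// split -- which of those checks apply depends on the client's
// RESEND_ON_SPLIT / OSD_POOLRESEND feature bits.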
5744 bool PG::can_discard_op(OpRequestRef& op)
5745 {
5746 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5747 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5748 dout(20) << " discard " << *m << dendl;
5749 return true;
5750 }
5751
5752 if (m->get_map_epoch() < info.history.same_primary_since) {
5753 dout(7) << " changed after " << m->get_map_epoch()
5754 << ", dropping " << *m << dendl;
5755 return true;
5756 }
5757
5758 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5759 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5760 dout(7) << __func__ << " sent before last_force_op_resend "
5761 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5762 return true;
5763 }
5764 if (m->get_map_epoch() < info.history.last_epoch_split) {
5765 dout(7) << __func__ << " pg split in "
5766 << info.history.last_epoch_split << ", dropping" << dendl;
5767 return true;
5768 }
5769 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5770 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5771 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5772 << pool.info.last_force_op_resend_preluminous
5773 << ", dropping" << *m << dendl;
5774 return true;
5775 }
5776 }
5777
5778 return false;
5779 }
5780
5781 template<typename T, int MSGTYPE>
5782 bool PG::can_discard_replica_op(OpRequestRef& op)
5783 {
5784 const T *m = static_cast<const T *>(op->get_req());
5785 assert(m->get_type() == MSGTYPE);
5786
5787 int from = m->get_source().num();
5788
5789 // if a repop is replied after a replica goes down in a new osdmap, and
5790 // before the pg advances to this new osdmap, the repop replies before this
5791 // repop can be discarded by that replica OSD, because the primary resets the
5792 // connection to it when handling the new osdmap marking it down, and also
5793 // resets the messenger session when the replica reconnects. to avoid the
5794 // out-of-order replies, the messages from that replica should be discarded.
5795 if (osd->get_osdmap()->is_down(from))
5796 return true;
5797 /* Mostly, this overlaps with the old_peering_msg
5798 * condition. An important exception is pushes
5799 * sent by replicas not in the acting set, since
5800 * if such a replica goes down it does not cause
5801 * a new interval. */
5802 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5803 return true;
5804
5805 // same pg?
5806 // if pg changes _at all_, we reset and repeer!
5807 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5808 dout(10) << "can_discard_replica_op pg changed " << info.history
5809 << " after " << m->map_epoch
5810 << ", dropping" << dendl;
5811 return true;
5812 }
5813 return false;
5814 }
5815
5816 bool PG::can_discard_scan(OpRequestRef op)
5817 {
5818 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5819 assert(m->get_type() == MSG_OSD_PG_SCAN);
5820
5821 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5822 dout(10) << " got old scan, ignoring" << dendl;
5823 return true;
5824 }
5825 return false;
5826 }
5827
5828 bool PG::can_discard_backfill(OpRequestRef op)
5829 {
5830 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5831 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5832
5833 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5834 dout(10) << " got old backfill, ignoring" << dendl;
5835 return true;
5836 }
5837
5838 return false;
5839
5840 }
5841
5842 bool PG::can_discard_request(OpRequestRef& op)
5843 {
5844 switch (op->get_req()->get_type()) {
5845 case CEPH_MSG_OSD_OP:
5846 return can_discard_op(op);
5847 case CEPH_MSG_OSD_BACKOFF:
5848 return false; // never discard
5849 case MSG_OSD_SUBOP:
5850 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5851 case MSG_OSD_REPOP:
5852 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5853 case MSG_OSD_PG_PUSH:
5854 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5855 case MSG_OSD_PG_PULL:
5856 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5857 case MSG_OSD_PG_PUSH_REPLY:
5858 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5859 case MSG_OSD_SUBOPREPLY:
5860 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5861 case MSG_OSD_REPOPREPLY:
5862 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5863 case MSG_OSD_PG_RECOVERY_DELETE:
5864 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
5865
5866 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
5867 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
5868
5869 case MSG_OSD_EC_WRITE:
5870 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5871 case MSG_OSD_EC_WRITE_REPLY:
5872 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5873 case MSG_OSD_EC_READ:
5874 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5875 case MSG_OSD_EC_READ_REPLY:
5876 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5877 case MSG_OSD_REP_SCRUB:
5878 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5879 case MSG_OSD_SCRUB_RESERVE:
5880 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5881 case MSG_OSD_REP_SCRUBMAP:
5882 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5883 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5884 return can_discard_replica_op<
5885 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5886 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5887 return can_discard_replica_op<
5888 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5889
5890 case MSG_OSD_PG_SCAN:
5891 return can_discard_scan(op);
5892 case MSG_OSD_PG_BACKFILL:
5893 return can_discard_backfill(op);
5894 case MSG_OSD_PG_BACKFILL_REMOVE:
5895 return can_discard_replica_op<MOSDPGBackfillRemove,
5896 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5897 }
5898 return true;
5899 }
5900
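// Requeue ops that were waiting for a newer osdmap and move deferred peering
// events from peering_waiters back onto peering_queue, scheduling one peering
// run per requeued event.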
5901 void PG::take_waiters()
5902 {
5903 dout(10) << "take_waiters" << dendl;
5904 requeue_map_waiters();
5905 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5906 i != peering_waiters.end();
5907 ++i) osd->queue_for_peering(this);
5908 peering_queue.splice(peering_queue.begin(), peering_waiters,
5909 peering_waiters.begin(), peering_waiters.end());
5910 }
5911
5912 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5913 {
5914 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5915 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5916 dout(10) << "deferring event " << evt->get_desc() << dendl;
5917 peering_waiters.push_back(evt);
5918 return;
5919 }
5920 if (old_peering_evt(evt))
5921 return;
5922 recovery_state.handle_event(evt, rctx);
5923 }
5924
5925 void PG::queue_peering_event(CephPeeringEvtRef evt)
5926 {
5927 if (old_peering_evt(evt))
5928 return;
5929 peering_queue.push_back(evt);
5930 osd->queue_for_peering(this);
5931 }
5932
5933 void PG::queue_null(epoch_t msg_epoch,
5934 epoch_t query_epoch)
5935 {
5936 dout(10) << "null" << dendl;
5937 queue_peering_event(
5938 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5939 NullEvt())));
5940 }
5941
5942 void PG::queue_flushed(epoch_t e)
5943 {
5944 dout(10) << "flushed" << dendl;
5945 queue_peering_event(
5946 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5947 FlushedEvt())));
5948 }
5949
5950 void PG::queue_query(epoch_t msg_epoch,
5951 epoch_t query_epoch,
5952 pg_shard_t from, const pg_query_t& q)
5953 {
5954 dout(10) << "handle_query " << q << " from replica " << from << dendl;
5955 queue_peering_event(
5956 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5957 MQuery(from, q, query_epoch))));
5958 }
5959
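// Advance this PG from its current cached osdmap (asserted to equal lastmap)
// to osdmap: refresh the cached pool info, optionally cross-check the pool's
// removed-snaps cache when osd_debug_verify_cached_snaps is set, then drive
// the recovery state machine with an AdvMap event. If the pool itself changed
// in this epoch, notify the backend and refresh per-collection store options.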
5960 void PG::handle_advance_map(
5961 OSDMapRef osdmap, OSDMapRef lastmap,
5962 vector<int>& newup, int up_primary,
5963 vector<int>& newacting, int acting_primary,
5964 RecoveryCtx *rctx)
5965 {
5966 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5967 assert(lastmap == osdmap_ref);
5968 dout(10) << "handle_advance_map "
5969 << newup << "/" << newacting
5970 << " -- " << up_primary << "/" << acting_primary
5971 << dendl;
5972 update_osdmap_ref(osdmap);
5973 pool.update(osdmap);
5974 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5975 if (cct->_conf->osd_debug_verify_cached_snaps) {
5976 interval_set<snapid_t> actual_removed_snaps;
5977 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5978 assert(pi);
5979 pi->build_removed_snaps(actual_removed_snaps);
5980 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5981 derr << __func__ << ": mismatch between the actual removed snaps "
5982 << actual_removed_snaps
5983 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
5984 << dendl;
5985 }
5986 assert(actual_removed_snaps == pool.cached_removed_snaps);
5987 }
5988 AdvMap evt(
5989 osdmap, lastmap, newup, up_primary,
5990 newacting, acting_primary);
5991 recovery_state.handle_event(evt, rctx);
5992 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5993 on_pool_change();
5994 update_store_with_options();
5995 }
5996 }
5997
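// After the map advance, drive the state machine with an ActMap event. If too
// many epochs have passed since the pg info was last persisted
// (osd_pg_epoch_persisted_max_stale), dirty it so it is written again, and
// re-check registered watchers against any new blacklist entries.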
5998 void PG::handle_activate_map(RecoveryCtx *rctx)
5999 {
6000 dout(10) << "handle_activate_map " << dendl;
6001 ActMap evt;
6002 recovery_state.handle_event(evt, rctx);
6003 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
6004 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6005 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6006 << last_persisted_osdmap_ref->get_epoch()
6007 << " while current is " << osdmap_ref->get_epoch() << dendl;
6008 dirty_info = true;
6009 } else {
6010 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6011 << last_persisted_osdmap_ref->get_epoch()
6012 << " while current is " << osdmap_ref->get_epoch() << dendl;
6013 }
6014 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
6015 }
6016
6017 void PG::handle_loaded(RecoveryCtx *rctx)
6018 {
6019 dout(10) << "handle_loaded" << dendl;
6020 Load evt;
6021 recovery_state.handle_event(evt, rctx);
6022 }
6023
6024 void PG::handle_create(RecoveryCtx *rctx)
6025 {
6026 dout(10) << "handle_create" << dendl;
6027 rctx->created_pgs.insert(this);
6028 Initialize evt;
6029 recovery_state.handle_event(evt, rctx);
6030 ActMap evt2;
6031 recovery_state.handle_event(evt2, rctx);
6032
6033 rctx->on_applied->add(make_lambda_context([this]() {
6034 update_store_with_options();
6035 }));
6036 }
6037
6038 void PG::handle_query_state(Formatter *f)
6039 {
6040 dout(10) << "handle_query_state" << dendl;
6041 QueryState q(f);
6042 recovery_state.handle_event(q, 0);
6043 }
6044
6045 void PG::update_store_with_options()
6046 {
6047 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
6048 if (r < 0 && r != -EOPNOTSUPP) {
6049 derr << __func__ << " set_collection_opts returns error: " << r << dendl;
6050 }
6051 }
6052
6053 void PG::update_store_on_load()
6054 {
6055 if (osd->store->get_type() == "filestore") {
6056 // legacy filestore didn't store collection bit width; fix.
6057 int bits = osd->store->collection_bits(coll);
6058 if (bits < 0) {
6059 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6060 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
6061 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
6062 ObjectStore::Transaction t;
6063 t.collection_set_bits(coll, bits);
6064 osd->store->apply_transaction(osr.get(), std::move(t));
6065 }
6066 }
6067 }
6068
6069 /*------------ Recovery State Machine----------------*/
6070 #undef dout_prefix
6071 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6072 << "state<" << get_state_name() << ">: ")
6073
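// The states below are boost::statechart reactions driven through
// RecoveryState::handle_event(). Each state logs entry/exit via
// RecoveryMachine::log_enter()/log_exit() and, on exit, records its residency
// time in a per-state recoverystate_perf latency counter. Constructors may
// post_event() to chain straight into the next state (e.g. Start posts
// MakePrimary or MakeStray).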
6074 /*------Crashed-------*/
6075 PG::RecoveryState::Crashed::Crashed(my_context ctx)
6076 : my_base(ctx),
6077 NamedState(context< RecoveryMachine >().pg, "Crashed")
6078 {
6079 context< RecoveryMachine >().log_enter(state_name);
6080 assert(0 == "we got a bad state machine event");
6081 }
6082
6083
6084 /*------Initial-------*/
6085 PG::RecoveryState::Initial::Initial(my_context ctx)
6086 : my_base(ctx),
6087 NamedState(context< RecoveryMachine >().pg, "Initial")
6088 {
6089 context< RecoveryMachine >().log_enter(state_name);
6090 }
6091
6092 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6093 {
6094 PG *pg = context< RecoveryMachine >().pg;
6095
6096 // do we tell someone we're here?
6097 pg->send_notify = (!pg->is_primary());
6098 pg->update_store_with_options();
6099
6100 pg->update_store_on_load();
6101
6102 return transit< Reset >();
6103 }
6104
6105 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6106 {
6107 PG *pg = context< RecoveryMachine >().pg;
6108 pg->proc_replica_info(
6109 notify.from, notify.notify.info, notify.notify.epoch_sent);
6110 pg->set_last_peering_reset();
6111 return transit< Primary >();
6112 }
6113
6114 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6115 {
6116 PG *pg = context< RecoveryMachine >().pg;
6117 assert(!pg->is_primary());
6118 post_event(i);
6119 return transit< Stray >();
6120 }
6121
6122 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6123 {
6124 PG *pg = context< RecoveryMachine >().pg;
6125 assert(!pg->is_primary());
6126 post_event(i);
6127 return transit< Stray >();
6128 }
6129
6130 void PG::RecoveryState::Initial::exit()
6131 {
6132 context< RecoveryMachine >().log_exit(state_name, enter_time);
6133 PG *pg = context< RecoveryMachine >().pg;
6134 utime_t dur = ceph_clock_now() - enter_time;
6135 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6136 }
6137
6138 /*------Started-------*/
6139 PG::RecoveryState::Started::Started(my_context ctx)
6140 : my_base(ctx),
6141 NamedState(context< RecoveryMachine >().pg, "Started")
6142 {
6143 context< RecoveryMachine >().log_enter(state_name);
6144 }
6145
6146 boost::statechart::result
6147 PG::RecoveryState::Started::react(const IntervalFlush&)
6148 {
6149 PG *pg = context< RecoveryMachine >().pg;
6150 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6151 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6152 return discard_event();
6153 }
6154
6155
6156 boost::statechart::result
6157 PG::RecoveryState::Started::react(const FlushedEvt&)
6158 {
6159 PG *pg = context< RecoveryMachine >().pg;
6160 pg->on_flushed();
6161 return discard_event();
6162 }
6163
6164
6165 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6166 {
6167 PG *pg = context< RecoveryMachine >().pg;
6168 ldout(pg->cct, 10) << "Started advmap" << dendl;
6169 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6170 if (pg->should_restart_peering(
6171 advmap.up_primary,
6172 advmap.acting_primary,
6173 advmap.newup,
6174 advmap.newacting,
6175 advmap.lastmap,
6176 advmap.osdmap)) {
6177 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6178 << dendl;
6179 post_event(advmap);
6180 return transit< Reset >();
6181 }
6182 pg->remove_down_peer_info(advmap.osdmap);
6183 return discard_event();
6184 }
6185
6186 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6187 {
6188 q.f->open_object_section("state");
6189 q.f->dump_string("name", state_name);
6190 q.f->dump_stream("enter_time") << enter_time;
6191 q.f->close_section();
6192 return discard_event();
6193 }
6194
6195 void PG::RecoveryState::Started::exit()
6196 {
6197 context< RecoveryMachine >().log_exit(state_name, enter_time);
6198 PG *pg = context< RecoveryMachine >().pg;
6199 utime_t dur = ceph_clock_now() - enter_time;
6200 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6201 }
6202
6203 /*--------Reset---------*/
6204 PG::RecoveryState::Reset::Reset(my_context ctx)
6205 : my_base(ctx),
6206 NamedState(context< RecoveryMachine >().pg, "Reset")
6207 {
6208 context< RecoveryMachine >().log_enter(state_name);
6209 PG *pg = context< RecoveryMachine >().pg;
6210
6211 pg->flushes_in_progress = 0;
6212 pg->set_last_peering_reset();
6213 }
6214
6215 boost::statechart::result
6216 PG::RecoveryState::Reset::react(const FlushedEvt&)
6217 {
6218 PG *pg = context< RecoveryMachine >().pg;
6219 pg->on_flushed();
6220 return discard_event();
6221 }
6222
6223 boost::statechart::result
6224 PG::RecoveryState::Reset::react(const IntervalFlush&)
6225 {
6226 PG *pg = context< RecoveryMachine >().pg;
6227 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6228 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6229 return discard_event();
6230 }
6231
6232 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6233 {
6234 PG *pg = context< RecoveryMachine >().pg;
6235 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6236
6237 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6238
6239 if (pg->should_restart_peering(
6240 advmap.up_primary,
6241 advmap.acting_primary,
6242 advmap.newup,
6243 advmap.newacting,
6244 advmap.lastmap,
6245 advmap.osdmap)) {
6246 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6247 << dendl;
6248 pg->start_peering_interval(
6249 advmap.lastmap,
6250 advmap.newup, advmap.up_primary,
6251 advmap.newacting, advmap.acting_primary,
6252 context< RecoveryMachine >().get_cur_transaction());
6253 }
6254 pg->remove_down_peer_info(advmap.osdmap);
6255 pg->check_past_interval_bounds();
6256 return discard_event();
6257 }
6258
6259 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6260 {
6261 PG *pg = context< RecoveryMachine >().pg;
6262 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6263 context< RecoveryMachine >().send_notify(
6264 pg->get_primary(),
6265 pg_notify_t(
6266 pg->get_primary().shard, pg->pg_whoami.shard,
6267 pg->get_osdmap()->get_epoch(),
6268 pg->get_osdmap()->get_epoch(),
6269 pg->info),
6270 pg->past_intervals);
6271 }
6272
6273 pg->update_heartbeat_peers();
6274 pg->take_waiters();
6275
6276 return transit< Started >();
6277 }
6278
6279 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6280 {
6281 q.f->open_object_section("state");
6282 q.f->dump_string("name", state_name);
6283 q.f->dump_stream("enter_time") << enter_time;
6284 q.f->close_section();
6285 return discard_event();
6286 }
6287
6288 void PG::RecoveryState::Reset::exit()
6289 {
6290 context< RecoveryMachine >().log_exit(state_name, enter_time);
6291 PG *pg = context< RecoveryMachine >().pg;
6292 utime_t dur = ceph_clock_now() - enter_time;
6293 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6294 }
6295
6296 /*-------Start---------*/
6297 PG::RecoveryState::Start::Start(my_context ctx)
6298 : my_base(ctx),
6299 NamedState(context< RecoveryMachine >().pg, "Start")
6300 {
6301 context< RecoveryMachine >().log_enter(state_name);
6302
6303 PG *pg = context< RecoveryMachine >().pg;
6304 if (pg->is_primary()) {
6305 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6306 post_event(MakePrimary());
6307 } else { //is_stray
6308 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6309 post_event(MakeStray());
6310 }
6311 }
6312
6313 void PG::RecoveryState::Start::exit()
6314 {
6315 context< RecoveryMachine >().log_exit(state_name, enter_time);
6316 PG *pg = context< RecoveryMachine >().pg;
6317 utime_t dur = ceph_clock_now() - enter_time;
6318 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6319 }
6320
6321 /*---------Primary--------*/
6322 PG::RecoveryState::Primary::Primary(my_context ctx)
6323 : my_base(ctx),
6324 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6325 {
6326 context< RecoveryMachine >().log_enter(state_name);
6327 PG *pg = context< RecoveryMachine >().pg;
6328 assert(pg->want_acting.empty());
6329
6330 // set CREATING bit until we have peered for the first time.
6331 if (pg->info.history.last_epoch_started == 0) {
6332 pg->state_set(PG_STATE_CREATING);
6333 // use the history timestamp, which ultimately comes from the
6334 // monitor in the create case.
6335 utime_t t = pg->info.history.last_scrub_stamp;
6336 pg->info.stats.last_fresh = t;
6337 pg->info.stats.last_active = t;
6338 pg->info.stats.last_change = t;
6339 pg->info.stats.last_peered = t;
6340 pg->info.stats.last_clean = t;
6341 pg->info.stats.last_unstale = t;
6342 pg->info.stats.last_undegraded = t;
6343 pg->info.stats.last_fullsized = t;
6344 pg->info.stats.last_scrub_stamp = t;
6345 pg->info.stats.last_deep_scrub_stamp = t;
6346 pg->info.stats.last_clean_scrub_stamp = t;
6347 }
6348 }
6349
6350 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6351 {
6352 PG *pg = context< RecoveryMachine >().pg;
6353 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6354 pg->proc_replica_info(
6355 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6356 return discard_event();
6357 }
6358
6359 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6360 {
6361 PG *pg = context< RecoveryMachine >().pg;
6362 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6363 pg->publish_stats_to_osd();
6364 pg->take_waiters();
6365 return discard_event();
6366 }
6367
6368 void PG::RecoveryState::Primary::exit()
6369 {
6370 context< RecoveryMachine >().log_exit(state_name, enter_time);
6371 PG *pg = context< RecoveryMachine >().pg;
6372 pg->want_acting.clear();
6373 utime_t dur = ceph_clock_now() - enter_time;
6374 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6375 pg->clear_primary_state();
6376 pg->state_clear(PG_STATE_CREATING);
6377 }
6378
6379 /*---------Peering--------*/
6380 PG::RecoveryState::Peering::Peering(my_context ctx)
6381 : my_base(ctx),
6382 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6383 history_les_bound(false)
6384 {
6385 context< RecoveryMachine >().log_enter(state_name);
6386
6387 PG *pg = context< RecoveryMachine >().pg;
6388 assert(!pg->is_peered());
6389 assert(!pg->is_peering());
6390 assert(pg->is_primary());
6391 pg->state_set(PG_STATE_PEERING);
6392 }
6393
6394 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6395 {
6396 PG *pg = context< RecoveryMachine >().pg;
6397 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6398 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6399 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6400 post_event(advmap);
6401 return transit< Reset >();
6402 }
6403
6404 pg->adjust_need_up_thru(advmap.osdmap);
6405
6406 return forward_event();
6407 }
6408
6409 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6410 {
6411 PG *pg = context< RecoveryMachine >().pg;
6412
6413 q.f->open_object_section("state");
6414 q.f->dump_string("name", state_name);
6415 q.f->dump_stream("enter_time") << enter_time;
6416
6417 q.f->open_array_section("past_intervals");
6418 pg->past_intervals.dump(q.f);
6419 q.f->close_section();
6420
6421 q.f->open_array_section("probing_osds");
6422 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6423 p != prior_set.probe.end();
6424 ++p)
6425 q.f->dump_stream("osd") << *p;
6426 q.f->close_section();
6427
6428 if (prior_set.pg_down)
6429 q.f->dump_string("blocked", "peering is blocked due to down osds");
6430
6431 q.f->open_array_section("down_osds_we_would_probe");
6432 for (set<int>::iterator p = prior_set.down.begin();
6433 p != prior_set.down.end();
6434 ++p)
6435 q.f->dump_int("osd", *p);
6436 q.f->close_section();
6437
6438 q.f->open_array_section("peering_blocked_by");
6439 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6440 p != prior_set.blocked_by.end();
6441 ++p) {
6442 q.f->open_object_section("osd");
6443 q.f->dump_int("osd", p->first);
6444 q.f->dump_int("current_lost_at", p->second);
6445 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6446 q.f->close_section();
6447 }
6448 q.f->close_section();
6449
6450 if (history_les_bound) {
6451 q.f->open_array_section("peering_blocked_by_detail");
6452 q.f->open_object_section("item");
6453 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6454 q.f->close_section();
6455 q.f->close_section();
6456 }
6457
6458 q.f->close_section();
6459 return forward_event();
6460 }
6461
6462 void PG::RecoveryState::Peering::exit()
6463 {
6464 PG *pg = context< RecoveryMachine >().pg;
6465 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6466 context< RecoveryMachine >().log_exit(state_name, enter_time);
6467 pg->state_clear(PG_STATE_PEERING);
6468 pg->clear_probe_targets();
6469
6470 utime_t dur = ceph_clock_now() - enter_time;
6471 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6472 }
6473
6474
6475 /*------Backfilling-------*/
6476 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6477 : my_base(ctx),
6478 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6479 {
6480 context< RecoveryMachine >().log_enter(state_name);
6481 PG *pg = context< RecoveryMachine >().pg;
6482 pg->backfill_reserved = true;
6483 pg->queue_recovery();
6484 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6485 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6486 pg->state_set(PG_STATE_BACKFILLING);
6487 pg->publish_stats_to_osd();
6488 }
6489
6490 boost::statechart::result
6491 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6492 {
6493 PG *pg = context< RecoveryMachine >().pg;
6494 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6495 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6496
6497 pg->state_set(PG_STATE_BACKFILL_WAIT);
6498 pg->state_clear(PG_STATE_BACKFILLING);
6499
6500 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6501 it != pg->backfill_targets.end();
6502 ++it) {
6503 assert(*it != pg->pg_whoami);
6504 ConnectionRef con = pg->osd->get_con_osd_cluster(
6505 it->osd, pg->get_osdmap()->get_epoch());
6506 if (con) {
6507 pg->osd->send_message_osd_cluster(
6508 new MBackfillReserve(
6509 MBackfillReserve::REJECT,
6510 spg_t(pg->info.pgid.pgid, it->shard),
6511 pg->get_osdmap()->get_epoch()),
6512 con.get());
6513 }
6514 }
6515
6516
6517 if (!pg->waiting_on_backfill.empty()) {
6518 pg->waiting_on_backfill.clear();
6519 pg->finish_recovery_op(hobject_t::get_max());
6520 }
6521
6522 pg->schedule_backfill_retry(c.delay);
6523 return transit<NotBackfilling>();
6524 }
6525
6526 boost::statechart::result
6527 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
6528 {
6529 PG *pg = context< RecoveryMachine >().pg;
6530 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
6531 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6532
6533 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
6534 pg->state_clear(PG_STATE_BACKFILLING);
6535
6536 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6537 it != pg->backfill_targets.end();
6538 ++it) {
6539 assert(*it != pg->pg_whoami);
6540 ConnectionRef con = pg->osd->get_con_osd_cluster(
6541 it->osd, pg->get_osdmap()->get_epoch());
6542 if (con) {
6543 pg->osd->send_message_osd_cluster(
6544 new MBackfillReserve(
6545 MBackfillReserve::REJECT,
6546 spg_t(pg->info.pgid.pgid, it->shard),
6547 pg->get_osdmap()->get_epoch()),
6548 con.get());
6549 }
6550 }
6551
6552 pg->waiting_on_backfill.clear();
6553
6554 return transit<NotBackfilling>();
6555 }
6556
6557 boost::statechart::result
6558 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6559 {
6560 PG *pg = context< RecoveryMachine >().pg;
6561 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6562 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6563
6564 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6565 it != pg->backfill_targets.end();
6566 ++it) {
6567 assert(*it != pg->pg_whoami);
6568 ConnectionRef con = pg->osd->get_con_osd_cluster(
6569 it->osd, pg->get_osdmap()->get_epoch());
6570 if (con) {
6571 pg->osd->send_message_osd_cluster(
6572 new MBackfillReserve(
6573 MBackfillReserve::REJECT,
6574 spg_t(pg->info.pgid.pgid, it->shard),
6575 pg->get_osdmap()->get_epoch()),
6576 con.get());
6577 }
6578 }
6579
6580 if (!pg->waiting_on_backfill.empty()) {
6581 pg->waiting_on_backfill.clear();
6582 pg->finish_recovery_op(hobject_t::get_max());
6583 }
6584
6585 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6586 return transit<NotBackfilling>();
6587 }
6588
6589 void PG::RecoveryState::Backfilling::exit()
6590 {
6591 context< RecoveryMachine >().log_exit(state_name, enter_time);
6592 PG *pg = context< RecoveryMachine >().pg;
6593 pg->backfill_reserved = false;
6594 pg->backfill_reserving = false;
6595 pg->state_clear(PG_STATE_BACKFILLING);
6596 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6597 utime_t dur = ceph_clock_now() - enter_time;
6598 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6599 }
6600
6601 /*--WaitRemoteBackfillReserved--*/
6602
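// Reservation fan-out: the constructor posts a RemoteBackfillReserved event to
// prime the loop; each reaction sends a reservation REQUEST to the next shard
// in remote_shards_to_reserve_backfill and advances the iterator (remote
// grants come back as further RemoteBackfillReserved events). Once the
// iterator is exhausted, AllBackfillsReserved is posted.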
6603 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6604 : my_base(ctx),
6605 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6606 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6607 {
6608 context< RecoveryMachine >().log_enter(state_name);
6609 PG *pg = context< RecoveryMachine >().pg;
6610 pg->state_set(PG_STATE_BACKFILL_WAIT);
6611 pg->publish_stats_to_osd();
6612 post_event(RemoteBackfillReserved());
6613 }
6614
6615 boost::statechart::result
6616 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6617 {
6618 PG *pg = context< RecoveryMachine >().pg;
6619
6620 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6621 //The primary never backfills itself
6622 assert(*backfill_osd_it != pg->pg_whoami);
6623 ConnectionRef con = pg->osd->get_con_osd_cluster(
6624 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6625 if (con) {
6626 pg->osd->send_message_osd_cluster(
6627 new MBackfillReserve(
6628 MBackfillReserve::REQUEST,
6629 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6630 pg->get_osdmap()->get_epoch(),
6631 pg->get_backfill_priority()),
6632 con.get());
6633 }
6634 ++backfill_osd_it;
6635 } else {
6636 post_event(AllBackfillsReserved());
6637 }
6638 return discard_event();
6639 }
6640
6641 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6642 {
6643 context< RecoveryMachine >().log_exit(state_name, enter_time);
6644 PG *pg = context< RecoveryMachine >().pg;
6645 utime_t dur = ceph_clock_now() - enter_time;
6646 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6647 }
6648
6649 boost::statechart::result
6650 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6651 {
6652 PG *pg = context< RecoveryMachine >().pg;
6653 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6654
6655 // Send REJECT to all previously acquired reservations
6656 set<pg_shard_t>::const_iterator it, begin, end, next;
6657 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6658 end = context< Active >().remote_shards_to_reserve_backfill.end();
6659 assert(begin != end);
6660 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6661 //The primary never backfills itself
6662 assert(*it != pg->pg_whoami);
6663 ConnectionRef con = pg->osd->get_con_osd_cluster(
6664 it->osd, pg->get_osdmap()->get_epoch());
6665 if (con) {
6666 pg->osd->send_message_osd_cluster(
6667 new MBackfillReserve(
6668 MBackfillReserve::REJECT,
6669 spg_t(pg->info.pgid.pgid, it->shard),
6670 pg->get_osdmap()->get_epoch()),
6671 con.get());
6672 }
6673 }
6674
6675 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6676 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6677 pg->publish_stats_to_osd();
6678
6679 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6680
6681 return transit<NotBackfilling>();
6682 }
6683
6684 /*--WaitLocalBackfillReserved--*/
6685 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6686 : my_base(ctx),
6687 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6688 {
6689 context< RecoveryMachine >().log_enter(state_name);
6690 PG *pg = context< RecoveryMachine >().pg;
6691 pg->state_set(PG_STATE_BACKFILL_WAIT);
6692 pg->osd->local_reserver.request_reservation(
6693 pg->info.pgid,
6694 new QueuePeeringEvt<LocalBackfillReserved>(
6695 pg, pg->get_osdmap()->get_epoch(),
6696 LocalBackfillReserved()),
6697 pg->get_backfill_priority(),
6698 new QueuePeeringEvt<DeferBackfill>(
6699 pg, pg->get_osdmap()->get_epoch(),
6700 DeferBackfill(0.0)));
6701 pg->publish_stats_to_osd();
6702 }
6703
6704 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6705 {
6706 context< RecoveryMachine >().log_exit(state_name, enter_time);
6707 PG *pg = context< RecoveryMachine >().pg;
6708 utime_t dur = ceph_clock_now() - enter_time;
6709 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6710 }
6711
6712 /*----NotBackfilling------*/
6713 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6714 : my_base(ctx),
6715 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6716 {
6717 context< RecoveryMachine >().log_enter(state_name);
6718 PG *pg = context< RecoveryMachine >().pg;
6719 pg->publish_stats_to_osd();
6720 }
6721
6722 boost::statechart::result
6723 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6724 {
6725 return discard_event();
6726 }
6727
6728 boost::statechart::result
6729 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6730 {
6731 return discard_event();
6732 }
6733
6734 void PG::RecoveryState::NotBackfilling::exit()
6735 {
6736 context< RecoveryMachine >().log_exit(state_name, enter_time);
6737 PG *pg = context< RecoveryMachine >().pg;
6738 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
6739 utime_t dur = ceph_clock_now() - enter_time;
6740 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6741 }
6742
6743 /*----NotRecovering------*/
6744 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6745 : my_base(ctx),
6746 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6747 {
6748 context< RecoveryMachine >().log_enter(state_name);
6749 PG *pg = context< RecoveryMachine >().pg;
6750 pg->publish_stats_to_osd();
6751 }
6752
6753 void PG::RecoveryState::NotRecovering::exit()
6754 {
6755 context< RecoveryMachine >().log_exit(state_name, enter_time);
6756 PG *pg = context< RecoveryMachine >().pg;
6757 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
6758 utime_t dur = ceph_clock_now() - enter_time;
6759 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6760 }
6761
6762 /*---RepNotRecovering----*/
6763 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6764 : my_base(ctx),
6765 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6766 {
6767 context< RecoveryMachine >().log_enter(state_name);
6768 }
6769
6770 boost::statechart::result
6771 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
6772 {
6773 PG *pg = context< RecoveryMachine >().pg;
6774 pg->reject_reservation();
6775 post_event(RemoteReservationRejected());
6776 return discard_event();
6777 }
6778
6779 void PG::RecoveryState::RepNotRecovering::exit()
6780 {
6781 context< RecoveryMachine >().log_exit(state_name, enter_time);
6782 PG *pg = context< RecoveryMachine >().pg;
6783 utime_t dur = ceph_clock_now() - enter_time;
6784 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6785 }
6786
6787 /*---RepWaitRecoveryReserved--*/
6788 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6789 : my_base(ctx),
6790 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6791 {
6792 context< RecoveryMachine >().log_enter(state_name);
6793 PG *pg = context< RecoveryMachine >().pg;
6794
6795 pg->osd->remote_reserver.request_reservation(
6796 pg->info.pgid,
6797 new QueuePeeringEvt<RemoteRecoveryReserved>(
6798 pg, pg->get_osdmap()->get_epoch(),
6799 RemoteRecoveryReserved()),
6800 pg->get_recovery_priority());
6801 }
6802
6803 boost::statechart::result
6804 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6805 {
6806 PG *pg = context< RecoveryMachine >().pg;
6807 pg->osd->send_message_osd_cluster(
6808 pg->primary.osd,
6809 new MRecoveryReserve(
6810 MRecoveryReserve::GRANT,
6811 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6812 pg->get_osdmap()->get_epoch()),
6813 pg->get_osdmap()->get_epoch());
6814 return transit<RepRecovering>();
6815 }
6816
6817 boost::statechart::result
6818 PG::RecoveryState::RepWaitRecoveryReserved::react(
6819 const RemoteReservationCanceled &evt)
6820 {
6821 PG *pg = context< RecoveryMachine >().pg;
6822 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6823 return transit<RepNotRecovering>();
6824 }
6825
6826 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6827 {
6828 context< RecoveryMachine >().log_exit(state_name, enter_time);
6829 PG *pg = context< RecoveryMachine >().pg;
6830 utime_t dur = ceph_clock_now() - enter_time;
6831 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6832 }
6833
6834 /*-RepWaitBackfillReserved*/
6835 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6836 : my_base(ctx),
6837 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6838 {
6839 context< RecoveryMachine >().log_enter(state_name);
6840 }
6841
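// Replica-side handling of the primary's backfill reservation request: reject
// immediately if failure injection fires or this OSD looks too full to accept
// backfill, otherwise ask remote_reserver for a slot. Either way we move to
// RepWaitBackfillReserved, which reports the grant or rejection back to the
// primary.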
6842 boost::statechart::result
6843 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6844 {
6845 PG *pg = context< RecoveryMachine >().pg;
6846 ostringstream ss;
6847
6848 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6849 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6850 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6851 << dendl;
6852 post_event(RejectRemoteReservation());
6853 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6854 pg->osd->check_backfill_full(ss)) {
6855 ldout(pg->cct, 10) << "backfill reservation rejected: "
6856 << ss.str() << dendl;
6857 post_event(RejectRemoteReservation());
6858 } else {
6859 pg->osd->remote_reserver.request_reservation(
6860 pg->info.pgid,
6861 new QueuePeeringEvt<RemoteBackfillReserved>(
6862 pg, pg->get_osdmap()->get_epoch(),
6863 RemoteBackfillReserved()), evt.priority);
6864 }
6865 return transit<RepWaitBackfillReserved>();
6866 }
6867
6868 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6869 {
6870 context< RecoveryMachine >().log_exit(state_name, enter_time);
6871 PG *pg = context< RecoveryMachine >().pg;
6872 utime_t dur = ceph_clock_now() - enter_time;
6873 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6874 }
6875
6876 boost::statechart::result
6877 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6878 {
6879 PG *pg = context< RecoveryMachine >().pg;
6880
6881 ostringstream ss;
6882 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6883 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6884 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6885 << "failure injection" << dendl;
6886 post_event(RejectRemoteReservation());
6887 return discard_event();
6888 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6889 pg->osd->check_backfill_full(ss)) {
6890 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6891 << ss.str() << dendl;
6892 post_event(RejectRemoteReservation());
6893 return discard_event();
6894 } else {
6895 pg->osd->send_message_osd_cluster(
6896 pg->primary.osd,
6897 new MBackfillReserve(
6898 MBackfillReserve::GRANT,
6899 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6900 pg->get_osdmap()->get_epoch()),
6901 pg->get_osdmap()->get_epoch());
6902 return transit<RepRecovering>();
6903 }
6904 }
6905
6906 boost::statechart::result
6907 PG::RecoveryState::RepWaitBackfillReserved::react(
6908 const RejectRemoteReservation &evt)
6909 {
6910 PG *pg = context< RecoveryMachine >().pg;
6911 pg->reject_reservation();
6912 post_event(RemoteReservationRejected());
6913 return discard_event();
6914 }
6915
6916 boost::statechart::result
6917 PG::RecoveryState::RepWaitBackfillReserved::react(
6918 const RemoteReservationRejected &evt)
6919 {
6920 PG *pg = context< RecoveryMachine >().pg;
6921 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6922 return transit<RepNotRecovering>();
6923 }
6924
6925 boost::statechart::result
6926 PG::RecoveryState::RepWaitBackfillReserved::react(
6927 const RemoteReservationCanceled &evt)
6928 {
6929 PG *pg = context< RecoveryMachine >().pg;
6930 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6931 return transit<RepNotRecovering>();
6932 }
6933
6934 /*---RepRecovering-------*/
6935 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6936 : my_base(ctx),
6937 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6938 {
6939 context< RecoveryMachine >().log_enter(state_name);
6940 }
6941
6942 boost::statechart::result
6943 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6944 {
6945 PG *pg = context< RecoveryMachine >().pg;
6946 pg->reject_reservation();
6947 return discard_event();
6948 }
6949
6950 void PG::RecoveryState::RepRecovering::exit()
6951 {
6952 context< RecoveryMachine >().log_exit(state_name, enter_time);
6953 PG *pg = context< RecoveryMachine >().pg;
6954 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6955 utime_t dur = ceph_clock_now() - enter_time;
6956 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6957 }
6958
6959 /*------Activating--------*/
6960 PG::RecoveryState::Activating::Activating(my_context ctx)
6961 : my_base(ctx),
6962 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6963 {
6964 context< RecoveryMachine >().log_enter(state_name);
6965 }
6966
6967 void PG::RecoveryState::Activating::exit()
6968 {
6969 context< RecoveryMachine >().log_exit(state_name, enter_time);
6970 PG *pg = context< RecoveryMachine >().pg;
6971 utime_t dur = ceph_clock_now() - enter_time;
6972 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6973 }
6974
6975 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6976 : my_base(ctx),
6977 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6978 {
6979 context< RecoveryMachine >().log_enter(state_name);
6980 PG *pg = context< RecoveryMachine >().pg;
6981
6982 // Make sure all nodes that are part of the recovery aren't full
6983 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6984 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6985 post_event(RecoveryTooFull());
6986 return;
6987 }
6988
6989 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6990 pg->state_set(PG_STATE_RECOVERY_WAIT);
6991 pg->osd->local_reserver.request_reservation(
6992 pg->info.pgid,
6993 new QueuePeeringEvt<LocalRecoveryReserved>(
6994 pg, pg->get_osdmap()->get_epoch(),
6995 LocalRecoveryReserved()),
6996 pg->get_recovery_priority(),
6997 new QueuePeeringEvt<DeferRecovery>(
6998 pg, pg->get_osdmap()->get_epoch(),
6999 DeferRecovery(0.0)));
7000 pg->publish_stats_to_osd();
7001 }
7002
7003 boost::statechart::result
7004 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
7005 {
7006 PG *pg = context< RecoveryMachine >().pg;
7007 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
7008 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
7009 return transit<NotRecovering>();
7010 }
7011
7012 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7013 {
7014 context< RecoveryMachine >().log_exit(state_name, enter_time);
7015 PG *pg = context< RecoveryMachine >().pg;
7016 utime_t dur = ceph_clock_now() - enter_time;
7017 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
7018 }
7019
7020 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
7021 : my_base(ctx),
7022 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7023 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
7024 {
7025 context< RecoveryMachine >().log_enter(state_name);
7026 post_event(RemoteRecoveryReserved());
7027 }
7028
7029 boost::statechart::result
7030 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
7031 PG *pg = context< RecoveryMachine >().pg;
7032
7033 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
7034 assert(*remote_recovery_reservation_it != pg->pg_whoami);
7035 ConnectionRef con = pg->osd->get_con_osd_cluster(
7036 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
7037 if (con) {
7038 pg->osd->send_message_osd_cluster(
7039 new MRecoveryReserve(
7040 MRecoveryReserve::REQUEST,
7041 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
7042 pg->get_osdmap()->get_epoch()),
7043 con.get());
7044 }
7045 ++remote_recovery_reservation_it;
7046 } else {
7047 post_event(AllRemotesReserved());
7048 }
7049 return discard_event();
7050 }
7051
7052 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7053 {
7054 context< RecoveryMachine >().log_exit(state_name, enter_time);
7055 PG *pg = context< RecoveryMachine >().pg;
7056 utime_t dur = ceph_clock_now() - enter_time;
7057 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
7058 }
7059
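// Recovering: local and remote reservations are held; mark the PG as
// recovering and queue the actual recovery work. We leave via
// AllReplicasRecovered, RequestBackfill, DeferRecovery or UnfoundRecovery,
// releasing reservations on the way out.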
7060 PG::RecoveryState::Recovering::Recovering(my_context ctx)
7061 : my_base(ctx),
7062 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
7063 {
7064 context< RecoveryMachine >().log_enter(state_name);
7065
7066 PG *pg = context< RecoveryMachine >().pg;
7067 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7068 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7069 pg->state_set(PG_STATE_RECOVERING);
7070 assert(!pg->state_test(PG_STATE_ACTIVATING));
7071 pg->publish_stats_to_osd();
7072 pg->queue_recovery();
7073 }
7074
7075 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7076 {
7077 PG *pg = context< RecoveryMachine >().pg;
7078 assert(cancel || !pg->pg_log.get_missing().have_missing());
7079
7080 // release remote reservations
7081 for (set<pg_shard_t>::const_iterator i =
7082 context< Active >().remote_shards_to_reserve_recovery.begin();
7083 i != context< Active >().remote_shards_to_reserve_recovery.end();
7084 ++i) {
7085 if (*i == pg->pg_whoami) // skip myself
7086 continue;
7087 ConnectionRef con = pg->osd->get_con_osd_cluster(
7088 i->osd, pg->get_osdmap()->get_epoch());
7089 if (con) {
7090 pg->osd->send_message_osd_cluster(
7091 new MRecoveryReserve(
7092 MRecoveryReserve::RELEASE,
7093 spg_t(pg->info.pgid.pgid, i->shard),
7094 pg->get_osdmap()->get_epoch()),
7095 con.get());
7096 }
7097 }
7098 }
7099
7100 boost::statechart::result
7101 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
7102 {
7103 PG *pg = context< RecoveryMachine >().pg;
7104 pg->state_clear(PG_STATE_RECOVERING);
7105 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7106 release_reservations();
7107 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7108 return transit<Recovered>();
7109 }
7110
7111 boost::statechart::result
7112 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
7113 {
7114 PG *pg = context< RecoveryMachine >().pg;
7115 pg->state_clear(PG_STATE_RECOVERING);
7116 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7117 release_reservations();
7118 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7119 // XXX: Is this needed?
7120 pg->publish_stats_to_osd();
7121 return transit<WaitLocalBackfillReserved>();
7122 }
7123
7124 boost::statechart::result
7125 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7126 {
7127 PG *pg = context< RecoveryMachine >().pg;
7128 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7129 pg->state_clear(PG_STATE_RECOVERING);
7130 pg->state_set(PG_STATE_RECOVERY_WAIT);
7131 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7132 release_reservations(true);
7133 pg->schedule_recovery_retry(evt.delay);
7134 return transit<NotRecovering>();
7135 }
7136
7137 boost::statechart::result
7138 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
7139 {
7140 PG *pg = context< RecoveryMachine >().pg;
7141 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
7142 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
7143 pg->state_clear(PG_STATE_RECOVERING);
7144 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7145 release_reservations(true);
7146 return transit<NotRecovering>();
7147 }
7148
7149 void PG::RecoveryState::Recovering::exit()
7150 {
7151 context< RecoveryMachine >().log_exit(state_name, enter_time);
7152 PG *pg = context< RecoveryMachine >().pg;
7153 utime_t dur = ceph_clock_now() - enter_time;
7154 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7155 }
7156
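// Recovered: nothing is missing any more. Trim the pg log, re-evaluate the
// acting set now that recovery/backfill is done, and post GoClean as soon as
// all replicas have activated.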
7157 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7158 : my_base(ctx),
7159 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7160 {
7161 pg_shard_t auth_log_shard;
7162
7163 context< RecoveryMachine >().log_enter(state_name);
7164
7165 PG *pg = context< RecoveryMachine >().pg;
7166
7167 assert(!pg->needs_recovery());
7168
7169 // if we finished backfill, all acting are active; recheck if
7170 // DEGRADED | UNDERSIZED is appropriate.
7171 assert(!pg->actingbackfill.empty());
7172 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7173 pg->actingbackfill.size()) {
7174 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7175 pg->publish_stats_to_osd();
7176 }
7177
7178 // trim pglog on recovered
7179 pg->trim_log();
7180
7181 // adjust acting set? (e.g. because backfill completed...)
7182 bool history_les_bound = false;
7183 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7184 true, &history_les_bound))
7185 assert(pg->want_acting.size());
7186
7187 if (context< Active >().all_replicas_activated)
7188 post_event(GoClean());
7189 }
7190
7191 void PG::RecoveryState::Recovered::exit()
7192 {
7193 context< RecoveryMachine >().log_exit(state_name, enter_time);
7194 PG *pg = context< RecoveryMachine >().pg;
7195 utime_t dur = ceph_clock_now() - enter_time;
7196 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7197 }
7198
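// Clean: the terminal happy state. last_complete must equal last_update on
// entry; we finish recovery, mark the PG clean, share the updated info, and
// requeue any ops that were waiting for the PG to become clean.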
7199 PG::RecoveryState::Clean::Clean(my_context ctx)
7200 : my_base(ctx),
7201 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7202 {
7203 context< RecoveryMachine >().log_enter(state_name);
7204
7205 PG *pg = context< RecoveryMachine >().pg;
7206
7207 if (pg->info.last_complete != pg->info.last_update) {
7208 ceph_abort();
7209 }
7210 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7211
7212 if (pg->is_active()) {
7213 pg->mark_clean();
7214 }
7215
7216 pg->share_pg_info();
7217 pg->publish_stats_to_osd();
7218 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7219 }
7220
7221 void PG::RecoveryState::Clean::exit()
7222 {
7223 context< RecoveryMachine >().log_exit(state_name, enter_time);
7224 PG *pg = context< RecoveryMachine >().pg;
7225 pg->state_clear(PG_STATE_CLEAN);
7226 utime_t dur = ceph_clock_now() - enter_time;
7227 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7228 }
7229
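// Reduce a collection of pg_shard_t to at most one entry per OSD, skipping
// 'skip' (normally ourselves). Used below to build the remote recovery and
// backfill reservation sets.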
7230 template <typename T>
7231 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7232 {
7233 set<int> osds_found;
7234 set<pg_shard_t> out;
7235 for (typename T::const_iterator i = in.begin();
7236 i != in.end();
7237 ++i) {
7238 if (*i != skip && !osds_found.count(i->osd)) {
7239 osds_found.insert(i->osd);
7240 out.insert(*i);
7241 }
7242 }
7243 return out;
7244 }
7245
7246 /*---------Active---------*/
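// Active: the primary has an authoritative log and calls activate() on the
// current transaction. Every other actingbackfill shard is recorded in
// blocked_by until it confirms (via MInfoRec) that it has activated and
// committed.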
7247 PG::RecoveryState::Active::Active(my_context ctx)
7248 : my_base(ctx),
7249 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7250 remote_shards_to_reserve_recovery(
7251 unique_osd_shard_set(
7252 context< RecoveryMachine >().pg->pg_whoami,
7253 context< RecoveryMachine >().pg->actingbackfill)),
7254 remote_shards_to_reserve_backfill(
7255 unique_osd_shard_set(
7256 context< RecoveryMachine >().pg->pg_whoami,
7257 context< RecoveryMachine >().pg->backfill_targets)),
7258 all_replicas_activated(false)
7259 {
7260 context< RecoveryMachine >().log_enter(state_name);
7261
7262 PG *pg = context< RecoveryMachine >().pg;
7263
7264 assert(!pg->backfill_reserving);
7265 assert(!pg->backfill_reserved);
7266 assert(pg->is_primary());
7267 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7268 pg->start_flush(
7269 context< RecoveryMachine >().get_cur_transaction(),
7270 context< RecoveryMachine >().get_on_applied_context_list(),
7271 context< RecoveryMachine >().get_on_safe_context_list());
7272 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7273 pg->get_osdmap()->get_epoch(),
7274 *context< RecoveryMachine >().get_on_safe_context_list(),
7275 *context< RecoveryMachine >().get_query_map(),
7276 context< RecoveryMachine >().get_info_map(),
7277 context< RecoveryMachine >().get_recovery_ctx());
7278
7279 // everyone has to commit/ack before we are truly active
7280 pg->blocked_by.clear();
7281 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7282 p != pg->actingbackfill.end();
7283 ++p) {
7284 if (p->shard != pg->pg_whoami.shard) {
7285 pg->blocked_by.insert(p->shard);
7286 }
7287 }
7288 pg->publish_stats_to_osd();
7289 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7290 }
7291
7292 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7293 {
7294 PG *pg = context< RecoveryMachine >().pg;
7295 ldout(pg->cct, 10) << "Active advmap" << dendl;
7296 if (!pg->pool.newly_removed_snaps.empty()) {
7297 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7298 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7299 pg->dirty_info = true;
7300 pg->dirty_big_info = true;
7301 }
7302
7303 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7304 int osd = pg->want_acting[i];
7305 if (!advmap.osdmap->is_up(osd)) {
7306 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7307 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7308 }
7309 }
7310
7311 bool need_publish = false;
7312 /* Check for changes in pool size (if the acting set changed as a result,
7313 * this does not matter) */
7314 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7315 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7316 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7317 pg->state_clear(PG_STATE_UNDERSIZED);
7318 } else {
7319 pg->state_set(PG_STATE_UNDERSIZED);
7320 }
7321 // degraded changes will be detected by the call to publish_stats_to_osd()
7322 need_publish = true;
7323 }
7324
7325 // if we haven't reported our PG stats in a long time, do so now.
7326 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7327 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7328 << " epochs" << dendl;
7329 need_publish = true;
7330 }
7331
7332 if (need_publish)
7333 pg->publish_stats_to_osd();
7334
7335 return forward_event();
7336 }
7337
7338 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7339 {
7340 PG *pg = context< RecoveryMachine >().pg;
7341 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7342 assert(pg->is_primary());
7343
7344 if (pg->have_unfound()) {
7345 // objects may have become unfound
7346 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7347 }
7348
7349 if (pg->cct->_conf->osd_check_for_log_corruption)
7350 pg->check_log_for_corruption(pg->osd->store);
7351
7352 uint64_t unfound = pg->missing_loc.num_unfound();
7353 if (unfound > 0 &&
7354 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7355 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7356 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7357 << " objects unfound and apparently lost, would automatically "
7358 << "mark these objects lost but this feature is not yet implemented "
7359 << "(osd_auto_mark_unfound_lost)";
7360 } else
7361 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7362 << unfound << " objects unfound and apparently lost";
7363 }
7364
7365 if (pg->is_active()) {
7366 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7367 pg->kick_snap_trim();
7368 }
7369
7370 if (pg->is_peered() &&
7371 !pg->is_clean() &&
7372 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7373 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7374 pg->queue_recovery();
7375 }
7376 return forward_event();
7377 }
7378
7379 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7380 {
7381 PG *pg = context< RecoveryMachine >().pg;
7382 assert(pg->is_primary());
7383 if (pg->peer_info.count(notevt.from)) {
7384 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7385 << ", already have info from that osd, ignoring"
7386 << dendl;
7387 } else if (pg->peer_purged.count(notevt.from)) {
7388 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7389 << ", already purged that peer, ignoring"
7390 << dendl;
7391 } else {
7392 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7393 << ", calling proc_replica_info and discover_all_missing"
7394 << dendl;
7395 pg->proc_replica_info(
7396 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7397 if (pg->have_unfound()) {
7398 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7399 }
7400 }
7401 return discard_event();
7402 }
7403
7404 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7405 {
7406 PG *pg = context< RecoveryMachine >().pg;
7407 assert(pg->is_primary());
7408
7409 assert(!pg->actingbackfill.empty());
7410 // don't update history (yet) if we are active and primary; the replica
7411 // may be telling us it has activated (and committed), but we can't
7412 // share that until _everyone_ does the same.
7413 if (pg->is_actingbackfill(infoevt.from)) {
7414 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7415 << " activated and committed" << dendl;
7416 pg->peer_activated.insert(infoevt.from);
7417 pg->blocked_by.erase(infoevt.from.shard);
7418 pg->publish_stats_to_osd();
7419 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7420 pg->all_activated_and_committed();
7421 }
7422 }
7423 return discard_event();
7424 }
7425
7426 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7427 {
7428 PG *pg = context< RecoveryMachine >().pg;
7429 ldout(pg->cct, 10) << "searching osd." << logevt.from
7430 << " log for unfound items" << dendl;
7431 pg->proc_replica_log(
7432 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7433 bool got_missing = pg->search_for_missing(
7434 pg->peer_info[logevt.from],
7435 pg->peer_missing[logevt.from],
7436 logevt.from,
7437 context< RecoveryMachine >().get_recovery_ctx());
7438 // If there are missing objects AND we are "fully" active then start recovery now
7439 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
7440 post_event(DoRecovery());
7441 }
7442 return discard_event();
7443 }
7444
7445 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7446 {
7447 PG *pg = context< RecoveryMachine >().pg;
7448
7449 q.f->open_object_section("state");
7450 q.f->dump_string("name", state_name);
7451 q.f->dump_stream("enter_time") << enter_time;
7452
7453 {
7454 q.f->open_array_section("might_have_unfound");
7455 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7456 p != pg->might_have_unfound.end();
7457 ++p) {
7458 q.f->open_object_section("osd");
7459 q.f->dump_stream("osd") << *p;
7460 if (pg->peer_missing.count(*p)) {
7461 q.f->dump_string("status", "already probed");
7462 } else if (pg->peer_missing_requested.count(*p)) {
7463 q.f->dump_string("status", "querying");
7464 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7465 q.f->dump_string("status", "osd is down");
7466 } else {
7467 q.f->dump_string("status", "not queried");
7468 }
7469 q.f->close_section();
7470 }
7471 q.f->close_section();
7472 }
7473 {
7474 q.f->open_object_section("recovery_progress");
7475 pg->dump_recovery_info(q.f);
7476 q.f->close_section();
7477 }
7478
7479 {
7480 q.f->open_object_section("scrub");
7481 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7482 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7483 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7484 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7485 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7486 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7487 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7488 q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7489 q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7490 {
7491 q.f->open_array_section("scrubber.waiting_on_whom");
7492 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7493 p != pg->scrubber.waiting_on_whom.end();
7494 ++p) {
7495 q.f->dump_stream("shard") << *p;
7496 }
7497 q.f->close_section();
7498 }
7499 q.f->close_section();
7500 }
7501
7502 q.f->close_section();
7503 return forward_event();
7504 }
7505
7506 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7507 {
7508 PG *pg = context< RecoveryMachine >().pg;
7509 all_replicas_activated = true;
7510
7511 pg->state_clear(PG_STATE_ACTIVATING);
7512 pg->state_clear(PG_STATE_CREATING);
7513 if (pg->acting.size() >= pg->pool.info.min_size) {
7514 pg->state_set(PG_STATE_ACTIVE);
7515 } else {
7516 pg->state_set(PG_STATE_PEERED);
7517 }
7518
7519 // info.last_epoch_started is set during activate()
7520 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7521 pg->info.history.last_interval_started = pg->info.last_interval_started;
7522 pg->dirty_info = true;
7523
7524 pg->share_pg_info();
7525 pg->publish_stats_to_osd();
7526
7527 pg->check_local();
7528
7529 // waiters
7530 if (pg->flushes_in_progress == 0) {
7531 pg->requeue_ops(pg->waiting_for_peered);
7532 } else if (!pg->waiting_for_peered.empty()) {
7533 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
7534 << pg->waiting_for_peered.size()
7535 << " items to waiting_for_flush"
7536 << dendl;
7537 assert(pg->waiting_for_flush.empty());
7538 pg->waiting_for_flush.swap(pg->waiting_for_peered);
7539 }
7540
7541 pg->on_activate();
7542
7543 return discard_event();
7544 }
7545
7546 void PG::RecoveryState::Active::exit()
7547 {
7548 context< RecoveryMachine >().log_exit(state_name, enter_time);
7549 PG *pg = context< RecoveryMachine >().pg;
7550 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7551
7552 pg->blocked_by.clear();
7553 pg->backfill_reserved = false;
7554 pg->backfill_reserving = false;
7555 pg->state_clear(PG_STATE_ACTIVATING);
7556 pg->state_clear(PG_STATE_DEGRADED);
7557 pg->state_clear(PG_STATE_UNDERSIZED);
7558 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7559 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7560 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7561 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7562 utime_t dur = ceph_clock_now() - enter_time;
7563 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7564 pg->agent_stop();
7565 }
7566
7567 /*------ReplicaActive-----*/
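// ReplicaActive: this OSD holds an active replica (or backfill target) of
// the PG; it activates on the primary's request and applies incoming info
// and log updates, but makes no peering decisions of its own.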
7568 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7569 : my_base(ctx),
7570 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7571 {
7572 context< RecoveryMachine >().log_enter(state_name);
7573
7574 PG *pg = context< RecoveryMachine >().pg;
7575 pg->start_flush(
7576 context< RecoveryMachine >().get_cur_transaction(),
7577 context< RecoveryMachine >().get_on_applied_context_list(),
7578 context< RecoveryMachine >().get_on_safe_context_list());
7579 }
7580
7581
7582 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7583 const Activate& actevt) {
7584 PG *pg = context< RecoveryMachine >().pg;
7585 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7586 map<int, map<spg_t, pg_query_t> > query_map;
7587 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7588 actevt.activation_epoch,
7589 *context< RecoveryMachine >().get_on_safe_context_list(),
7590 query_map, NULL, NULL);
7591 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7592 return discard_event();
7593 }
7594
7595 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7596 {
7597 PG *pg = context< RecoveryMachine >().pg;
7598 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7599 infoevt.info);
7600 return discard_event();
7601 }
7602
7603 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7604 {
7605 PG *pg = context< RecoveryMachine >().pg;
7606 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7607 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7608 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7609 assert(pg->pg_log.get_head() == pg->info.last_update);
7610
7611 return discard_event();
7612 }
7613
7614 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7615 {
7616 PG *pg = context< RecoveryMachine >().pg;
7617 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7618 context< RecoveryMachine >().send_notify(
7619 pg->get_primary(),
7620 pg_notify_t(
7621 pg->get_primary().shard, pg->pg_whoami.shard,
7622 pg->get_osdmap()->get_epoch(),
7623 pg->get_osdmap()->get_epoch(),
7624 pg->info),
7625 pg->past_intervals);
7626 }
7627 pg->take_waiters();
7628 return discard_event();
7629 }
7630
7631 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7632 {
7633 PG *pg = context< RecoveryMachine >().pg;
7634 if (query.query.type == pg_query_t::MISSING) {
7635 pg->update_history(query.query.history);
7636 pg->fulfill_log(query.from, query.query, query.query_epoch);
7637 } // else: from prior to activation, safe to ignore
7638 return discard_event();
7639 }
7640
7641 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7642 {
7643 q.f->open_object_section("state");
7644 q.f->dump_string("name", state_name);
7645 q.f->dump_stream("enter_time") << enter_time;
7646 q.f->close_section();
7647 return forward_event();
7648 }
7649
7650 void PG::RecoveryState::ReplicaActive::exit()
7651 {
7652 context< RecoveryMachine >().log_exit(state_name, enter_time);
7653 PG *pg = context< RecoveryMachine >().pg;
7654 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7655 utime_t dur = ceph_clock_now() - enter_time;
7656 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7657 }
7658
7659 /*-------Stray---*/
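// Stray: we hold data for this PG but are not (yet) part of its active set.
// The primary will either send an info (MInfoRec) or info+log (MLogRec);
// either lets us post Activate and transition to ReplicaActive.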
7660 PG::RecoveryState::Stray::Stray(my_context ctx)
7661 : my_base(ctx),
7662 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7663 {
7664 context< RecoveryMachine >().log_enter(state_name);
7665
7666 PG *pg = context< RecoveryMachine >().pg;
7667 assert(!pg->is_peered());
7668 assert(!pg->is_peering());
7669 assert(!pg->is_primary());
7670 pg->start_flush(
7671 context< RecoveryMachine >().get_cur_transaction(),
7672 context< RecoveryMachine >().get_on_applied_context_list(),
7673 context< RecoveryMachine >().get_on_safe_context_list());
7674 }
7675
7676 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7677 {
7678 PG *pg = context< RecoveryMachine >().pg;
7679 MOSDPGLog *msg = logevt.msg.get();
7680 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7681
7682 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7683 if (msg->info.last_backfill == hobject_t()) {
7684 // restart backfill
7685 pg->unreg_next_scrub();
7686 pg->info = msg->info;
7687 pg->reg_next_scrub();
7688 pg->dirty_info = true;
7689 pg->dirty_big_info = true; // maybe.
7690
7691 PGLogEntryHandler rollbacker{pg, t};
7692 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7693
7694 pg->pg_log.reset_backfill();
7695 } else {
7696 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7697 }
7698
7699 assert(pg->pg_log.get_head() == pg->info.last_update);
7700
7701 post_event(Activate(logevt.msg->info.last_epoch_started));
7702 return transit<ReplicaActive>();
7703 }
7704
7705 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7706 {
7707 PG *pg = context< RecoveryMachine >().pg;
7708 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7709
7710 if (pg->info.last_update > infoevt.info.last_update) {
7711 // rewind divergent log entries
7712 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7713 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7714 pg->info.stats = infoevt.info.stats;
7715 pg->info.hit_set = infoevt.info.hit_set;
7716 }
7717
7718 assert(infoevt.info.last_update == pg->info.last_update);
7719 assert(pg->pg_log.get_head() == pg->info.last_update);
7720
7721 post_event(Activate(infoevt.info.last_epoch_started));
7722 return transit<ReplicaActive>();
7723 }
7724
7725 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7726 {
7727 PG *pg = context< RecoveryMachine >().pg;
7728 if (query.query.type == pg_query_t::INFO) {
7729 pair<pg_shard_t, pg_info_t> notify_info;
7730 pg->update_history(query.query.history);
7731 pg->fulfill_info(query.from, query.query, notify_info);
7732 context< RecoveryMachine >().send_notify(
7733 notify_info.first,
7734 pg_notify_t(
7735 notify_info.first.shard, pg->pg_whoami.shard,
7736 query.query_epoch,
7737 pg->get_osdmap()->get_epoch(),
7738 notify_info.second),
7739 pg->past_intervals);
7740 } else {
7741 pg->fulfill_log(query.from, query.query, query.query_epoch);
7742 }
7743 return discard_event();
7744 }
7745
7746 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7747 {
7748 PG *pg = context< RecoveryMachine >().pg;
7749 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7750 context< RecoveryMachine >().send_notify(
7751 pg->get_primary(),
7752 pg_notify_t(
7753 pg->get_primary().shard, pg->pg_whoami.shard,
7754 pg->get_osdmap()->get_epoch(),
7755 pg->get_osdmap()->get_epoch(),
7756 pg->info),
7757 pg->past_intervals);
7758 }
7759 pg->take_waiters();
7760 return discard_event();
7761 }
7762
7763 void PG::RecoveryState::Stray::exit()
7764 {
7765 context< RecoveryMachine >().log_exit(state_name, enter_time);
7766 PG *pg = context< RecoveryMachine >().pg;
7767 utime_t dur = ceph_clock_now() - enter_time;
7768 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7769 }
7770
7771 /*--------GetInfo---------*/
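// GetInfo: first peering stage on the primary. Build the prior set, query
// pg_info_t from every OSD we might need to hear from, and post GotInfo once
// all requested infos have arrived (or IsDown if the prior set says pg_down).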
7772 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7773 : my_base(ctx),
7774 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7775 {
7776 context< RecoveryMachine >().log_enter(state_name);
7777
7778 PG *pg = context< RecoveryMachine >().pg;
7779 pg->check_past_interval_bounds();
7780 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7781
7782 assert(pg->blocked_by.empty());
7783
7784 prior_set = pg->build_prior();
7785
7786 pg->reset_min_peer_features();
7787 get_infos();
7788 if (prior_set.pg_down) {
7789 post_event(IsDown());
7790 } else if (peer_info_requested.empty()) {
7791 post_event(GotInfo());
7792 }
7793 }
7794
7795 void PG::RecoveryState::GetInfo::get_infos()
7796 {
7797 PG *pg = context< RecoveryMachine >().pg;
7798 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7799
7800 pg->blocked_by.clear();
7801 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7802 it != prior_set.probe.end();
7803 ++it) {
7804 pg_shard_t peer = *it;
7805 if (peer == pg->pg_whoami) {
7806 continue;
7807 }
7808 if (pg->peer_info.count(peer)) {
7809 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7810 continue;
7811 }
7812 if (peer_info_requested.count(peer)) {
7813 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7814 pg->blocked_by.insert(peer.osd);
7815 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7816 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7817 } else {
7818 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7819 context< RecoveryMachine >().send_query(
7820 peer, pg_query_t(pg_query_t::INFO,
7821 it->shard, pg->pg_whoami.shard,
7822 pg->info.history,
7823 pg->get_osdmap()->get_epoch()));
7824 peer_info_requested.insert(peer);
7825 pg->blocked_by.insert(peer.osd);
7826 }
7827 }
7828
7829 pg->publish_stats_to_osd();
7830 }
7831
7832 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
7833 {
7834 PG *pg = context< RecoveryMachine >().pg;
7835
7836 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7837 if (p != peer_info_requested.end()) {
7838 peer_info_requested.erase(p);
7839 pg->blocked_by.erase(infoevt.from.osd);
7840 }
7841
7842 epoch_t old_start = pg->info.history.last_epoch_started;
7843 if (pg->proc_replica_info(
7844 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7845 // we got something new ...
7846 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7847 if (old_start < pg->info.history.last_epoch_started) {
7848 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7849 prior_set = pg->build_prior();
7850
7851 // filter out any osds that got dropped from the probe set from
7852 // peer_info_requested. this is less expensive than restarting
7853 // peering (which would re-probe everyone).
7854 set<pg_shard_t>::iterator p = peer_info_requested.begin();
7855 while (p != peer_info_requested.end()) {
7856 if (prior_set.probe.count(*p) == 0) {
7857 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7858 peer_info_requested.erase(p++);
7859 } else {
7860 ++p;
7861 }
7862 }
7863 get_infos();
7864 }
7865 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7866 << hex << infoevt.features << dec << dendl;
7867 pg->apply_peer_features(infoevt.features);
7868
7869 // are we done getting everything?
7870 if (peer_info_requested.empty() && !prior_set.pg_down) {
7871 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7872 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7873 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7874 post_event(GotInfo());
7875 }
7876 }
7877 return discard_event();
7878 }
7879
7880 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7881 {
7882 PG *pg = context< RecoveryMachine >().pg;
7883 q.f->open_object_section("state");
7884 q.f->dump_string("name", state_name);
7885 q.f->dump_stream("enter_time") << enter_time;
7886
7887 q.f->open_array_section("requested_info_from");
7888 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7889 p != peer_info_requested.end();
7890 ++p) {
7891 q.f->open_object_section("osd");
7892 q.f->dump_stream("osd") << *p;
7893 if (pg->peer_info.count(*p)) {
7894 q.f->open_object_section("got_info");
7895 pg->peer_info[*p].dump(q.f);
7896 q.f->close_section();
7897 }
7898 q.f->close_section();
7899 }
7900 q.f->close_section();
7901
7902 q.f->close_section();
7903 return forward_event();
7904 }
7905
7906 void PG::RecoveryState::GetInfo::exit()
7907 {
7908 context< RecoveryMachine >().log_exit(state_name, enter_time);
7909 PG *pg = context< RecoveryMachine >().pg;
7910 utime_t dur = ceph_clock_now() - enter_time;
7911 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7912 pg->blocked_by.clear();
7913 pg->publish_stats_to_osd();
7914 }
7915
7916 /*------GetLog------------*/
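// GetLog: choose_acting() has picked the shard with the authoritative log.
// If that is not us, request its log back to the oldest last_update among
// the peers we must bring up to date; proc_master_log() merges it once
// GotLog fires and we move on to GetMissing.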
7917 PG::RecoveryState::GetLog::GetLog(my_context ctx)
7918 : my_base(ctx),
7919 NamedState(
7920 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7921 msg(0)
7922 {
7923 context< RecoveryMachine >().log_enter(state_name);
7924
7925 PG *pg = context< RecoveryMachine >().pg;
7926
7927 // adjust acting?
7928 if (!pg->choose_acting(auth_log_shard, false,
7929 &context< Peering >().history_les_bound)) {
7930 if (!pg->want_acting.empty()) {
7931 post_event(NeedActingChange());
7932 } else {
7933 post_event(IsIncomplete());
7934 }
7935 return;
7936 }
7937
7938 // am i the best?
7939 if (auth_log_shard == pg->pg_whoami) {
7940 post_event(GotLog());
7941 return;
7942 }
7943
7944 const pg_info_t& best = pg->peer_info[auth_log_shard];
7945
7946 // am i broken?
7947 if (pg->info.last_update < best.log_tail) {
7948 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7949 post_event(IsIncomplete());
7950 return;
7951 }
7952
7953 // how much log to request?
7954 eversion_t request_log_from = pg->info.last_update;
7955 assert(!pg->actingbackfill.empty());
7956 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7957 p != pg->actingbackfill.end();
7958 ++p) {
7959 if (*p == pg->pg_whoami) continue;
7960 pg_info_t& ri = pg->peer_info[*p];
7961 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7962 ri.last_update < request_log_from)
7963 request_log_from = ri.last_update;
7964 }
7965
7966 // request the log from the auth shard
7967 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7968 context<RecoveryMachine>().send_query(
7969 auth_log_shard,
7970 pg_query_t(
7971 pg_query_t::LOG,
7972 auth_log_shard.shard, pg->pg_whoami.shard,
7973 request_log_from, pg->info.history,
7974 pg->get_osdmap()->get_epoch()));
7975
7976 assert(pg->blocked_by.empty());
7977 pg->blocked_by.insert(auth_log_shard.osd);
7978 pg->publish_stats_to_osd();
7979 }
7980
7981 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7982 {
7983 PG *pg = context< RecoveryMachine >().pg;
7984 // make sure our log source didn't go down. we need to check
7985 // explicitly because it may not be part of the prior set, which
7986 // means the Peering state check won't catch it going down.
7987 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7988 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7989 << auth_log_shard.osd << " went down" << dendl;
7990 post_event(advmap);
7991 return transit< Reset >();
7992 }
7993
7994 // let the Peering state do its checks.
7995 return forward_event();
7996 }
7997
7998 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7999 {
8000 PG *pg = context< RecoveryMachine >().pg;
8001 assert(!msg);
8002 if (logevt.from != auth_log_shard) {
8003 ldout(pg->cct, 10) << "GetLog: discarding log from "
8004 << "non-auth_log_shard osd." << logevt.from << dendl;
8005 return discard_event();
8006 }
8007 ldout(pg->cct, 10) << "GetLog: received master log from osd."
8008 << logevt.from << dendl;
8009 msg = logevt.msg;
8010 post_event(GotLog());
8011 return discard_event();
8012 }
8013
8014 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
8015 {
8016 PG *pg = context< RecoveryMachine >().pg;
8017 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
8018 if (msg) {
8019 ldout(pg->cct, 10) << "processing master log" << dendl;
8020 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
8021 msg->info, msg->log, msg->missing,
8022 auth_log_shard);
8023 }
8024 pg->start_flush(
8025 context< RecoveryMachine >().get_cur_transaction(),
8026 context< RecoveryMachine >().get_on_applied_context_list(),
8027 context< RecoveryMachine >().get_on_safe_context_list());
8028 return transit< GetMissing >();
8029 }
8030
8031 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
8032 {
8033 q.f->open_object_section("state");
8034 q.f->dump_string("name", state_name);
8035 q.f->dump_stream("enter_time") << enter_time;
8036 q.f->dump_stream("auth_log_shard") << auth_log_shard;
8037 q.f->close_section();
8038 return forward_event();
8039 }
8040
8041 void PG::RecoveryState::GetLog::exit()
8042 {
8043 context< RecoveryMachine >().log_exit(state_name, enter_time);
8044 PG *pg = context< RecoveryMachine >().pg;
8045 utime_t dur = ceph_clock_now() - enter_time;
8046 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
8047 pg->blocked_by.clear();
8048 pg->publish_stats_to_osd();
8049 }
8050
8051 /*------WaitActingChange--------*/
8052 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
8053 : my_base(ctx),
8054 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
8055 {
8056 context< RecoveryMachine >().log_enter(state_name);
8057 }
8058
8059 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
8060 {
8061 PG *pg = context< RecoveryMachine >().pg;
8062 OSDMapRef osdmap = advmap.osdmap;
8063
8064 ldout(pg->cct, 10) << "verifying want_acting " << pg->want_acting << " targets are still up" << dendl;
8065 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
8066 if (!osdmap->is_up(*p)) {
8067 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
8068 post_event(advmap);
8069 return transit< Reset >();
8070 }
8071 }
8072 return forward_event();
8073 }
8074
8075 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
8076 {
8077 PG *pg = context< RecoveryMachine >().pg;
8078 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
8079 return discard_event();
8080 }
8081
8082 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
8083 {
8084 PG *pg = context< RecoveryMachine >().pg;
8085 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
8086 return discard_event();
8087 }
8088
8089 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
8090 {
8091 PG *pg = context< RecoveryMachine >().pg;
8092 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
8093 return discard_event();
8094 }
8095
8096 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
8097 {
8098 q.f->open_object_section("state");
8099 q.f->dump_string("name", state_name);
8100 q.f->dump_stream("enter_time") << enter_time;
8101 q.f->dump_string("comment", "waiting for pg acting set to change");
8102 q.f->close_section();
8103 return forward_event();
8104 }
8105
8106 void PG::RecoveryState::WaitActingChange::exit()
8107 {
8108 context< RecoveryMachine >().log_exit(state_name, enter_time);
8109 PG *pg = context< RecoveryMachine >().pg;
8110 utime_t dur = ceph_clock_now() - enter_time;
8111 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
8112 }
8113
8114 /*------Down--------*/
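// Down: peering cannot make progress because too few OSDs from the prior
// intervals are up. Record them in blocked_by so the stats/pg-query output
// shows who we are waiting for.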
8115 PG::RecoveryState::Down::Down(my_context ctx)
8116 : my_base(ctx),
8117 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
8118 {
8119 context< RecoveryMachine >().log_enter(state_name);
8120 PG *pg = context< RecoveryMachine >().pg;
8121
8122 pg->state_clear(PG_STATE_PEERING);
8123 pg->state_set(PG_STATE_DOWN);
8124
8125 auto &prior_set = context< Peering >().prior_set;
8126 assert(pg->blocked_by.empty());
8127 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8128 pg->publish_stats_to_osd();
8129 }
8130
8131 void PG::RecoveryState::Down::exit()
8132 {
8133 context< RecoveryMachine >().log_exit(state_name, enter_time);
8134 PG *pg = context< RecoveryMachine >().pg;
8135
8136 pg->state_clear(PG_STATE_DOWN);
8137 utime_t dur = ceph_clock_now() - enter_time;
8138 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
8139
8140 pg->blocked_by.clear();
8141 pg->publish_stats_to_osd();
8142 }
8143
8144 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8145 {
8146 q.f->open_object_section("state");
8147 q.f->dump_string("name", state_name);
8148 q.f->dump_stream("enter_time") << enter_time;
8149 q.f->dump_string("comment",
8150 "not enough up instances of this PG to go active");
8151 q.f->close_section();
8152 return forward_event();
8153 }
8154
8155 /*------Incomplete--------*/
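// Incomplete: the OSDs that are up cannot provide a complete, contiguous
// history for this PG. New info from a peer (MNotifyRec) sends us back to
// GetLog; a lowered pool min_size triggers a Reset.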
8156 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8157 : my_base(ctx),
8158 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8159 {
8160 context< RecoveryMachine >().log_enter(state_name);
8161 PG *pg = context< RecoveryMachine >().pg;
8162
8163 pg->state_clear(PG_STATE_PEERING);
8164 pg->state_set(PG_STATE_INCOMPLETE);
8165
8166 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8167 assert(pg->blocked_by.empty());
8168 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8169 pg->publish_stats_to_osd();
8170 }
8171
8172 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8173 PG *pg = context< RecoveryMachine >().pg;
8174 int64_t poolnum = pg->info.pgid.pool();
8175
8176 // Reset if min_size has decreased from the previous value; the pg might now be able to go active
8177 if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8178 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8179 post_event(advmap);
8180 return transit< Reset >();
8181 }
8182
8183 return forward_event();
8184 }
8185
8186 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8187 PG *pg = context< RecoveryMachine >().pg;
8188 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8189 if (pg->proc_replica_info(
8190 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8191 // We got something new, try again!
8192 return transit< GetLog >();
8193 } else {
8194 return discard_event();
8195 }
8196 }
8197
8198 boost::statechart::result PG::RecoveryState::Incomplete::react(
8199 const QueryState& q)
8200 {
8201 q.f->open_object_section("state");
8202 q.f->dump_string("name", state_name);
8203 q.f->dump_stream("enter_time") << enter_time;
8204 q.f->dump_string("comment", "not enough complete instances of this PG");
8205 q.f->close_section();
8206 return forward_event();
8207 }
8208
8209 void PG::RecoveryState::Incomplete::exit()
8210 {
8211 context< RecoveryMachine >().log_exit(state_name, enter_time);
8212 PG *pg = context< RecoveryMachine >().pg;
8213
8214 pg->state_clear(PG_STATE_INCOMPLETE);
8215 utime_t dur = ceph_clock_now() - enter_time;
8216 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8217
8218 pg->blocked_by.clear();
8219 pg->publish_stats_to_osd();
8220 }
8221
8222 /*------GetMissing--------*/
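// GetMissing: for every actingbackfill peer whose log may have diverged,
// request log+missing (from its last_epoch_started onward, or the full log
// if its log does not reach back that far). When all replies are in we
// either wait for up_thru or post Activate.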
8223 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8224 : my_base(ctx),
8225 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8226 {
8227 context< RecoveryMachine >().log_enter(state_name);
8228
8229 PG *pg = context< RecoveryMachine >().pg;
8230 assert(!pg->actingbackfill.empty());
8231 eversion_t since;
8232 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8233 i != pg->actingbackfill.end();
8234 ++i) {
8235 if (*i == pg->get_primary()) continue;
8236 const pg_info_t& pi = pg->peer_info[*i];
8237 // reset this to make sure the pg_missing_t is initialized and
8238 // has the correct semantics even if we don't need to get a
8239 // missing set from a shard. This way later additions due to
8240 // lost+unfound delete work properly.
8241 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8242
8243 if (pi.is_empty())
8244 continue; // no pg data, nothing divergent
8245
8246 if (pi.last_update < pg->pg_log.get_tail()) {
8247 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8248 pg->peer_missing[*i].clear();
8249 continue;
8250 }
8251 if (pi.last_backfill == hobject_t()) {
8252 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8253 pg->peer_missing[*i].clear();
8254 continue;
8255 }
8256
8257 if (pi.last_update == pi.last_complete && // peer has no missing
8258 pi.last_update == pg->info.last_update) { // peer is up to date
8259 // replica has no missing and an identical log to ours. no need to
8260 // pull anything.
8261 // FIXME: we can do better here. if last_update==last_complete we
8262 // can infer the rest!
8263 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8264 pg->peer_missing[*i].clear();
8265 continue;
8266 }
8267
8268 // We pull the log from the peer's last_epoch_started to ensure we
8269 // get enough log to detect divergent updates.
8270 since.epoch = pi.last_epoch_started;
8271 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8272 if (pi.log_tail <= since) {
8273 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8274 context< RecoveryMachine >().send_query(
8275 *i,
8276 pg_query_t(
8277 pg_query_t::LOG,
8278 i->shard, pg->pg_whoami.shard,
8279 since, pg->info.history,
8280 pg->get_osdmap()->get_epoch()));
8281 } else {
8282 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8283 << " (want since " << since << " < log.tail "
8284 << pi.log_tail << ")" << dendl;
8285 context< RecoveryMachine >().send_query(
8286 *i, pg_query_t(
8287 pg_query_t::FULLLOG,
8288 i->shard, pg->pg_whoami.shard,
8289 pg->info.history, pg->get_osdmap()->get_epoch()));
8290 }
8291 peer_missing_requested.insert(*i);
8292 pg->blocked_by.insert(i->osd);
8293 }
8294
8295 if (peer_missing_requested.empty()) {
8296 if (pg->need_up_thru) {
8297 ldout(pg->cct, 10) << " still need up_thru update before going active"
8298 << dendl;
8299 post_event(NeedUpThru());
8300 return;
8301 }
8302
8303 // all good!
8304 post_event(Activate(pg->get_osdmap()->get_epoch()));
8305 } else {
8306 pg->publish_stats_to_osd();
8307 }
8308 }
8309
8310 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8311 {
8312 PG *pg = context< RecoveryMachine >().pg;
8313
8314 peer_missing_requested.erase(logevt.from);
8315 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8316
8317 if (peer_missing_requested.empty()) {
8318 if (pg->need_up_thru) {
8319 ldout(pg->cct, 10) << " still need up_thru update before going active"
8320 << dendl;
8321 post_event(NeedUpThru());
8322 } else {
8323 ldout(pg->cct, 10) << "Got last missing, don't need more; "
8324 << "posting Activate" << dendl;
8325 post_event(Activate(pg->get_osdmap()->get_epoch()));
8326 }
8327 }
8328 return discard_event();
8329 }
8330
8331 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8332 {
8333 PG *pg = context< RecoveryMachine >().pg;
8334 q.f->open_object_section("state");
8335 q.f->dump_string("name", state_name);
8336 q.f->dump_stream("enter_time") << enter_time;
8337
8338 q.f->open_array_section("peer_missing_requested");
8339 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8340 p != peer_missing_requested.end();
8341 ++p) {
8342 q.f->open_object_section("osd");
8343 q.f->dump_stream("osd") << *p;
8344 if (pg->peer_missing.count(*p)) {
8345 q.f->open_object_section("got_missing");
8346 pg->peer_missing[*p].dump(q.f);
8347 q.f->close_section();
8348 }
8349 q.f->close_section();
8350 }
8351 q.f->close_section();
8352
8353 q.f->close_section();
8354 return forward_event();
8355 }
8356
8357 void PG::RecoveryState::GetMissing::exit()
8358 {
8359 context< RecoveryMachine >().log_exit(state_name, enter_time);
8360 PG *pg = context< RecoveryMachine >().pg;
8361 utime_t dur = ceph_clock_now() - enter_time;
8362 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8363 pg->blocked_by.clear();
8364 pg->publish_stats_to_osd();
8365 }
8366
8367 /*------WaitUpThru--------*/
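// WaitUpThru: peering is otherwise complete, but we cannot activate until an
// osdmap reflecting our new up_thru arrives (see the QueryState comment
// below); ActMap re-checks need_up_thru and posts Activate when it clears.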
8368 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8369 : my_base(ctx),
8370 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8371 {
8372 context< RecoveryMachine >().log_enter(state_name);
8373 }
8374
8375 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8376 {
8377 PG *pg = context< RecoveryMachine >().pg;
8378 if (!pg->need_up_thru) {
8379 post_event(Activate(pg->get_osdmap()->get_epoch()));
8380 }
8381 return forward_event();
8382 }
8383
8384 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8385 {
8386 PG *pg = context< RecoveryMachine >().pg;
8387 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8388 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8389 pg->peer_info[logevt.from] = logevt.msg->info;
8390 return discard_event();
8391 }
8392
8393 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8394 {
8395 q.f->open_object_section("state");
8396 q.f->dump_string("name", state_name);
8397 q.f->dump_stream("enter_time") << enter_time;
8398 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8399 q.f->close_section();
8400 return forward_event();
8401 }
8402
8403 void PG::RecoveryState::WaitUpThru::exit()
8404 {
8405 context< RecoveryMachine >().log_exit(state_name, enter_time);
8406 PG *pg = context< RecoveryMachine >().pg;
8407 utime_t dur = ceph_clock_now() - enter_time;
8408 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8409 }
8410
8411 /*----RecoveryState::RecoveryMachine Methods-----*/
8412 #undef dout_prefix
8413 #define dout_prefix *_dout << pg->gen_prefix()
8414
8415 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8416 {
8417 PG *pg = context< RecoveryMachine >().pg;
8418 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8419 pg->osd->pg_recovery_stats.log_enter(state_name);
8420 }
8421
8422 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8423 {
8424 utime_t dur = ceph_clock_now() - enter_time;
8425 PG *pg = context< RecoveryMachine >().pg;
8426 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8427 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8428 event_count, event_time);
8429 event_count = 0;
8430 event_time = utime_t();
8431 }
8432
8433
8434 /*---------------------------------------------------*/
8435 #undef dout_prefix
8436 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8437
8438 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8439 assert(!rctx);
8440 assert(!orig_ctx);
8441 orig_ctx = new_ctx;
8442 if (new_ctx) {
8443 if (messages_pending_flush) {
8444 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8445 } else {
8446 rctx = *new_ctx;
8447 }
8448 rctx->start_time = ceph_clock_now();
8449 }
8450 }
8451
8452 void PG::RecoveryState::begin_block_outgoing() {
8453 assert(!messages_pending_flush);
8454 assert(orig_ctx);
8455 assert(rctx);
8456 messages_pending_flush = BufferedRecoveryMessages();
8457 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8458 }
8459
8460 void PG::RecoveryState::clear_blocked_outgoing() {
8461 assert(orig_ctx);
8462 assert(rctx);
8463 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8464 }
8465
8466 void PG::RecoveryState::end_block_outgoing() {
8467 assert(messages_pending_flush);
8468 assert(orig_ctx);
8469 assert(rctx);
8470
8471 rctx = RecoveryCtx(*orig_ctx);
8472 rctx->accept_buffered_messages(*messages_pending_flush);
8473 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8474 }
8475
8476 void PG::RecoveryState::end_handle() {
8477 if (rctx) {
8478 utime_t dur = ceph_clock_now() - rctx->start_time;
8479 machine.event_time += dur;
8480 }
8481
8482 machine.event_count++;
8483 rctx = boost::optional<RecoveryCtx>();
8484 orig_ctx = NULL;
8485 }
8486
8487 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8488 {
8489 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8490 << " " << bi.objects.size() << " objects";
8491 if (!bi.objects.empty())
8492 out << " " << bi.objects;
8493 out << ")";
8494 return out;
8495 }
8496
8497 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8498 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8499
8500 #ifdef PG_DEBUG_REFS
8501 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8502 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8503 #endif