1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
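// [Editor's illustrative sketch, not part of the original PG.cc.]
// The "_" prefix above is what lets PGLog::read_log_and_missing() cheaply
// skip these bookkeeping keys while scanning the pgmeta object's omap.
// A self-contained model of that filter (function names here are hypothetical):
#include <string>
#include <vector>

static inline bool is_pgmeta_bookkeeping_key(const std::string& key)
{
  return !key.empty() && key[0] == '_';   // e.g. "_info", "_biginfo", "_epoch"
}

static std::vector<std::string> log_entry_keys_only(
  const std::vector<std::string>& omap_keys)
{
  std::vector<std::string> out;
  for (const auto& k : omap_keys) {
    if (!is_pgmeta_bookkeeping_key(k))
      out.push_back(k);                   // keep only pg log entry keys
  }
  return out;
}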
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if (tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if (!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if (ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179 if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\ttag: " << *i << dendl;
224 }
225 }
226 #endif
227
228
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 pi->build_removed_snaps(newly_removed_snaps);
241 interval_set<snapid_t> intersection;
242 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243 if (intersection == cached_removed_snaps) {
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.union_of(newly_removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps = newly_removed_snaps;
251 newly_removed_snaps.clear();
252 }
253 snapc = pi->get_snap_context();
254 } else {
255 /* 1) map->get_epoch() == cached_epoch + 1 &&
256 * 2) pi->get_snap_epoch() != map->get_epoch()
257 *
258 * Since the if branch was not taken, 1 && 2 must be true. From 2, we know that
259 * this map didn't change the set of removed snaps. From 1, we
260 * know that our cached_removed_snaps matches the previous map.
261 * Thus, from 1 && 2, cached_removed_snaps matches the current
262 * set of removed snaps and all we have to do is clear
263 * newly_removed_snaps.
264 */
265 newly_removed_snaps.clear();
266 }
267 cached_epoch = map->get_epoch();
268 lgeneric_subdout(cct, osd, 20)
269 << "PGPool::update cached_removed_snaps "
270 << cached_removed_snaps
271 << " newly_removed_snaps "
272 << newly_removed_snaps
273 << " snapc " << snapc
274 << (updated ? " (updated)":" (no change)")
275 << dendl;
276 }
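// [Editor's illustrative sketch, not part of the original PG.cc.]
// The cached_removed_snaps bookkeeping above boils down to: "is the
// previously cached set still contained in the freshly computed set?"
// If yes, the delta is exactly the snaps removed since the last map;
// if not, the cache "shrank" and is rebuilt from scratch.  A stand-alone
// model with std::set standing in for interval_set<snapid_t> (names are
// hypothetical):
#include <algorithm>
#include <iterator>
#include <set>

static std::set<int> update_removed_snap_cache(
  std::set<int>& cached,             // snaps we already knew were removed
  const std::set<int>& current)      // removed snaps per the new map
{
  std::set<int> newly;               // delta to report to the caller
  if (std::includes(current.begin(), current.end(),
                    cached.begin(), cached.end())) {
    // normal case: the set only grew; newly = current - cached
    std::set_difference(current.begin(), current.end(),
                        cached.begin(), cached.end(),
                        std::inserter(newly, newly.end()));
    cached.insert(newly.begin(), newly.end());
  } else {
    // cached set shrank (should not normally happen): reset the cache
    cached = current;
  }
  return newly;
}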
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279 const PGPool &_pool, spg_t p) :
280 osd(o),
281 cct(o->cct),
282 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283 snap_mapper(
284 cct,
285 &osdriver,
286 p.ps(),
287 p.get_split_bits(curmap->get_pg_num(_pool.id)),
288 _pool.id,
289 p.shard),
290 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291 _lock("PG::_lock"),
292 #ifdef PG_DEBUG_REFS
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294 #endif
295 deleting(false),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
298 info(p),
299 info_struct_v(0),
300 coll(p),
301 pg_log(cct),
302 pgmeta_oid(p.make_pgmeta_oid()),
303 missing_loc(this),
304 past_intervals(
305 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306 *curmap),
307 stat_queue_item(this),
308 scrub_queued(false),
309 recovery_queued(false),
310 recovery_ops_active(0),
311 role(-1),
312 state(0),
313 send_notify(false),
314 pg_whoami(osd->whoami, p.shard),
315 need_up_thru(false),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324 finish_sync_event(NULL),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
327 active_pushes(0),
328 recovery_state(this),
329 pg_id(p),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336 osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339 std::stringstream ss;
340 ss << "PG " << info.pgid;
341 trace_endpoint.copy_name(ss.str());
342 #endif
343 osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348 pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350 osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356 handle.suspend_tp_timeout();
357 lock();
358 handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363 _lock.Lock(no_lockdep);
364 // if we have unrecorded dirty state with the lock dropped, there is a bug
365 assert(!dirty_info);
366 assert(!dirty_big_info);
367
368 dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373 stringstream out;
374 OSDMapRef mapref = osdmap_ref;
375 if (_lock.is_locked_by_me()) {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " " << *this << " ";
379 } else {
380 out << "osd." << osd->whoami
381 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382 << " pg[" << info.pgid << "(unlocked)] ";
383 }
384 return out.str();
385 }
386
387 /********* PG **********/
388
389 void PG::proc_master_log(
390 ObjectStore::Transaction& t, pg_info_t &oinfo,
391 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393 dout(10) << "proc_master_log for osd." << from << ": "
394 << olog << " " << omissing << dendl;
395 assert(!is_peered() && is_primary());
396
397 // merge log into our own log to build master log. no need to
398 // make any adjustments to their missing map; we are taking their
399 // log to be authoritative (i.e., their entries are by definition
400 // non-divergent).
401 merge_log(t, oinfo, olog, from);
402 peer_info[from] = oinfo;
403 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404 might_have_unfound.insert(from);
405
406 // See doc/dev/osd_internals/last_epoch_started
407 if (oinfo.last_epoch_started > info.last_epoch_started) {
408 info.last_epoch_started = oinfo.last_epoch_started;
409 dirty_info = true;
410 }
411 if (oinfo.last_interval_started > info.last_interval_started) {
412 info.last_interval_started = oinfo.last_interval_started;
413 dirty_info = true;
414 }
415 update_history(oinfo.history);
416 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417 info.last_epoch_started >= info.history.last_epoch_started);
418
419 peer_missing[from].claim(omissing);
420 }
421
422 void PG::proc_replica_log(
423 pg_info_t &oinfo,
424 const pg_log_t &olog,
425 pg_missing_t& omissing,
426 pg_shard_t from)
427 {
428 dout(10) << "proc_replica_log for osd." << from << ": "
429 << oinfo << " " << olog << " " << omissing << dendl;
430
431 pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433 peer_info[from] = oinfo;
434 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435 might_have_unfound.insert(from);
436
437 for (map<hobject_t, pg_missing_item>::const_iterator i =
438 omissing.get_items().begin();
439 i != omissing.get_items().end();
440 ++i) {
441 dout(20) << " after missing " << i->first << " need " << i->second.need
442 << " have " << i->second.have << dendl;
443 }
444 peer_missing[from].claim(omissing);
445 }
446
447 bool PG::proc_replica_info(
448 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453 return false;
454 }
455
456 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457 dout(10) << " got info " << oinfo << " from down osd." << from
458 << " discarding" << dendl;
459 return false;
460 }
461
462 dout(10) << " got osd." << from << " " << oinfo << dendl;
463 assert(is_primary());
464 peer_info[from] = oinfo;
465 might_have_unfound.insert(from);
466
467 update_history(oinfo.history);
468
469 // stray?
470 if (!is_up(from) && !is_acting(from)) {
471 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472 stray_set.insert(from);
473 if (is_clean()) {
474 purge_strays();
475 }
476 }
477
478 // was this a new info? if so, update peers!
479 if (p == peer_info.end())
480 update_heartbeat_peers();
481
482 return true;
483 }
484
485 void PG::remove_snap_mapped_object(
486 ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488 t.remove(
489 coll,
490 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491 clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495 ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498 if (soid.snap < CEPH_MAXSNAP) {
499 int r = snap_mapper.remove_oid(
500 soid,
501 &_t);
502 if (!(r == 0 || r == -ENOENT)) {
503 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504 ceph_abort();
505 }
506 }
507 }
508
509 void PG::update_object_snap_mapping(
510 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513 assert(soid.snap < CEPH_MAXSNAP);
514 int r = snap_mapper.remove_oid(
515 soid,
516 &_t);
517 if (!(r == 0 || r == -ENOENT)) {
518 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519 ceph_abort();
520 }
521 snap_mapper.add_oid(
522 soid,
523 snaps,
524 &_t);
525 }
526
527 void PG::merge_log(
528 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530 PGLogEntryHandler rollbacker{this, &t};
531 pg_log.merge_log(
532 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537 PGLogEntryHandler rollbacker{this, &t};
538 pg_log.rewind_divergent_log(
539 newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543 * Process information from a replica to determine if it could have any
544 * objects that I need.
545 *
546 * TODO: if the missing set becomes very large, this could get expensive.
547 * Instead, we probably want to just iterate over our unfound set.
548 */
549 bool PG::search_for_missing(
550 const pg_info_t &oinfo, const pg_missing_t &omissing,
551 pg_shard_t from,
552 RecoveryCtx *ctx)
553 {
554 uint64_t num_unfound_before = missing_loc.num_unfound();
555 bool found_missing = missing_loc.add_source_info(
556 from, oinfo, omissing, ctx->handle);
557 if (found_missing && num_unfound_before != missing_loc.num_unfound())
558 publish_stats_to_osd();
559 if (found_missing &&
560 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
561 CEPH_FEATURE_OSD_ERASURE_CODES)) {
562 pg_info_t tinfo(oinfo);
563 tinfo.pgid.shard = pg_whoami.shard;
564 (*(ctx->info_map))[from.osd].push_back(
565 make_pair(
566 pg_notify_t(
567 from.shard, pg_whoami.shard,
568 get_osdmap()->get_epoch(),
569 get_osdmap()->get_epoch(),
570 tinfo),
571 past_intervals));
572 }
573 return found_missing;
574 }
575
576 bool PG::MissingLoc::readable_with_acting(
577 const hobject_t &hoid,
578 const set<pg_shard_t> &acting) const {
579 if (!needs_recovery(hoid))
580 return true;
581 if (is_deleted(hoid))
582 return false;
583 auto missing_loc_entry = missing_loc.find(hoid);
584 if (missing_loc_entry == missing_loc.end())
585 return false;
586 const set<pg_shard_t> &locs = missing_loc_entry->second;
587 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
588 set<pg_shard_t> have_acting;
589 for (set<pg_shard_t>::const_iterator i = locs.begin();
590 i != locs.end();
591 ++i) {
592 if (acting.count(*i))
593 have_acting.insert(*i);
594 }
595 return (*is_readable)(have_acting);
596 }
597
598 void PG::MissingLoc::add_batch_sources_info(
599 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
600 {
601 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
602 << sources.size() << dendl;
603 unsigned loop = 0;
604 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
605 i != needs_recovery_map.end();
606 ++i) {
607 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
608 handle->reset_tp_timeout();
609 loop = 0;
610 }
611 if (i->second.is_delete())
612 continue;
613 missing_loc[i->first].insert(sources.begin(), sources.end());
614 missing_loc_sources.insert(sources.begin(), sources.end());
615 }
616 }
617
618 bool PG::MissingLoc::add_source_info(
619 pg_shard_t fromosd,
620 const pg_info_t &oinfo,
621 const pg_missing_t &omissing,
622 ThreadPool::TPHandle* handle)
623 {
624 bool found_missing = false;
625 unsigned loop = 0;
626 // found items?
627 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
628 p != needs_recovery_map.end();
629 ++p) {
630 const hobject_t &soid(p->first);
631 eversion_t need = p->second.need;
632 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
633 handle->reset_tp_timeout();
634 loop = 0;
635 }
636 if (p->second.is_delete()) {
637 ldout(pg->cct, 10) << __func__ << " " << soid
638 << " delete, ignoring source" << dendl;
639 found_missing = true;
640 continue;
641 }
642 if (oinfo.last_update < need) {
643 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
644 << " also missing on osd." << fromosd
645 << " (last_update " << oinfo.last_update
646 << " < needed " << need << ")" << dendl;
647 continue;
648 }
649 if (!oinfo.last_backfill.is_max() &&
650 !oinfo.last_backfill_bitwise) {
651 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
652 << " also missing on osd." << fromosd
653 << " (last_backfill " << oinfo.last_backfill
654 << " but with wrong sort order)"
655 << dendl;
656 continue;
657 }
658 if (p->first >= oinfo.last_backfill) {
659 // FIXME: this is _probably_ true, although it could conceivably
660 // be in the undefined region! Hmm!
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (past last_backfill " << oinfo.last_backfill
664 << ")" << dendl;
665 continue;
666 }
667 if (oinfo.last_complete < need) {
668 if (omissing.is_missing(soid)) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd << dendl;
671 continue;
672 }
673 }
674
675 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
676 << " is on osd." << fromosd << dendl;
677
678 missing_loc[soid].insert(fromosd);
679 missing_loc_sources.insert(fromosd);
680 found_missing = true;
681 }
682
683 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
684 << dendl;
685 return found_missing;
686 }
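// [Editor's illustrative sketch, not part of the original PG.cc.]
// The per-object tests above can be read as one predicate: "could this peer
// hold a usable copy of soid at version `need`?"  A simplified stand-alone
// version (types and names are hypothetical, and the bitwise sort-order
// check is omitted):
#include <set>

struct PeerSummary {
  unsigned last_update;          // newest version the peer has logged
  unsigned last_complete;        // everything at or below this is present
  unsigned last_backfill;        // backfill progress watermark
  std::set<unsigned> missing;    // objects the peer itself is missing
};

static bool peer_may_have_object(const PeerSummary& peer,
                                 unsigned soid, unsigned need)
{
  if (peer.last_update < need)
    return false;                       // peer never saw this version
  if (soid >= peer.last_backfill)
    return false;                       // not backfilled that far yet
  if (peer.last_complete < need && peer.missing.count(soid))
    return false;                       // peer is missing it as well
  return true;                          // worth recording as a source
}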
687
688 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
689 {
690 auto &missing = pg_log.get_missing();
691 uint64_t unfound = get_num_unfound();
692 assert(unfound > 0);
693
694 dout(10) << __func__ << " "
695 << missing.num_missing() << " missing, "
696 << unfound << " unfound"
697 << dendl;
698
699 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
700 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
701 for (; m != mend; ++m) {
702 pg_shard_t peer(*m);
703
704 if (!get_osdmap()->is_up(peer.osd)) {
705 dout(20) << __func__ << " skipping down osd." << peer << dendl;
706 continue;
707 }
708
709 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
710 if (iter != peer_info.end() &&
711 (iter->second.is_empty() || iter->second.dne())) {
712 // ignore empty peers
713 continue;
714 }
715
716 // If we've requested any of this stuff, the pg_missing_t information
717 // should be on its way.
718 // TODO: coalesce requested_* into a single data structure
719 if (peer_missing.find(peer) != peer_missing.end()) {
720 dout(20) << __func__ << ": osd." << peer
721 << ": we already have pg_missing_t" << dendl;
722 continue;
723 }
724 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
725 dout(20) << __func__ << ": osd." << peer
726 << ": in peer_log_requested" << dendl;
727 continue;
728 }
729 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
730 dout(20) << __func__ << ": osd." << peer
731 << ": in peer_missing_requested" << dendl;
732 continue;
733 }
734
735 // Request missing
736 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
737 << dendl;
738 peer_missing_requested.insert(peer);
739 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
740 pg_query_t(
741 pg_query_t::FULLLOG,
742 peer.shard, pg_whoami.shard,
743 info.history, get_osdmap()->get_epoch());
744 }
745 }
746
747 /******* PG ***********/
748 bool PG::needs_recovery() const
749 {
750 assert(is_primary());
751
752 auto &missing = pg_log.get_missing();
753
754 if (missing.num_missing()) {
755 dout(10) << __func__ << " primary has " << missing.num_missing()
756 << " missing" << dendl;
757 return true;
758 }
759
760 assert(!actingbackfill.empty());
761 set<pg_shard_t>::const_iterator end = actingbackfill.end();
762 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
763 for (; a != end; ++a) {
764 if (*a == get_primary()) continue;
765 pg_shard_t peer = *a;
766 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
767 if (pm == peer_missing.end()) {
768 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
769 << dendl;
770 continue;
771 }
772 if (pm->second.num_missing()) {
773 dout(10) << __func__ << " osd." << peer << " has "
774 << pm->second.num_missing() << " missing" << dendl;
775 return true;
776 }
777 }
778
779 dout(10) << __func__ << " is recovered" << dendl;
780 return false;
781 }
782
783 bool PG::needs_backfill() const
784 {
785 assert(is_primary());
786
787 // We can assume that the only OSDs that could need backfill
788 // are those listed in backfill_targets.
789 set<pg_shard_t>::const_iterator end = backfill_targets.end();
790 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
791 for (; a != end; ++a) {
792 pg_shard_t peer = *a;
793 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
794 if (!pi->second.last_backfill.is_max()) {
795 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
796 return true;
797 }
798 }
799
800 dout(10) << __func__ << " does not need backfill" << dendl;
801 return false;
802 }
803
804
805 void PG::check_past_interval_bounds() const
806 {
807 auto rpib = get_required_past_interval_bounds(
808 info,
809 osd->get_superblock().oldest_map);
810 if (rpib.first >= rpib.second) {
811 if (!past_intervals.empty()) {
812 osd->clog->error() << info.pgid << " required past_interval bounds are"
813 << " empty [" << rpib << ") but past_intervals is not: "
814 << past_intervals;
815 derr << info.pgid << " required past_interval bounds are"
816 << " empty [" << rpib << ") but past_intervals is not: "
817 << past_intervals << dendl;
818 }
819 } else {
820 if (past_intervals.empty()) {
821 osd->clog->error() << info.pgid << " required past_interval bounds are"
822 << " not empty [" << rpib << ") but past_intervals "
823 << past_intervals << " is empty";
824 derr << info.pgid << " required past_interval bounds are"
825 << " not empty [" << rpib << ") but past_intervals "
826 << past_intervals << " is empty" << dendl;
827 assert(!past_intervals.empty());
828 }
829
830 auto apib = past_intervals.get_bounds();
831 if (apib.first > rpib.first) {
832 osd->clog->error() << info.pgid << " past_intervals [" << apib
833 << ") start interval does not contain the required"
834 << " bound [" << rpib << ") start";
835 derr << info.pgid << " past_intervals [" << apib
836 << ") start interval does not contain the required"
837 << " bound [" << rpib << ") start" << dendl;
838 assert(0 == "past_interval start interval mismatch");
839 }
840 if (apib.second != rpib.second) {
841 osd->clog->error() << info.pgid << " past_interval bound [" << apib
842 << ") end does not match required [" << rpib
843 << ") end";
844 derr << info.pgid << " past_interval bound [" << apib
845 << ") end does not match required [" << rpib
846 << ") end" << dendl;
847 assert(0 == "past_interval end mismatch");
848 }
849 }
850 }
851
852 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
853 {
854 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
855 if (need_up_thru &&
856 up_thru >= info.history.same_interval_since) {
857 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
858 need_up_thru = false;
859 return true;
860 }
861 return false;
862 }
863
864 void PG::remove_down_peer_info(const OSDMapRef osdmap)
865 {
866 // Remove any downed osds from peer_info
867 bool removed = false;
868 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
869 while (p != peer_info.end()) {
870 if (!osdmap->is_up(p->first.osd)) {
871 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
872 peer_missing.erase(p->first);
873 peer_log_requested.erase(p->first);
874 peer_missing_requested.erase(p->first);
875 peer_info.erase(p++);
876 removed = true;
877 } else
878 ++p;
879 }
880
881 // if we removed anyone, update peers (which include peer_info)
882 if (removed)
883 update_heartbeat_peers();
884 check_recovery_sources(osdmap);
885 }
886
887 /*
888 * Returns true unless there is a non-lost OSD in might_have_unfound.
889 */
890 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
891 {
892 assert(is_primary());
893
894 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
895 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
896 for (; peer != mend; ++peer) {
897 if (peer_missing.count(*peer))
898 continue;
899 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
900 if (iter != peer_info.end() &&
901 (iter->second.is_empty() || iter->second.dne()))
902 continue;
903 if (!osdmap->exists(peer->osd))
904 continue;
905 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
906 if (osd_info.lost_at <= osd_info.up_from) {
907 // If there is even one OSD in might_have_unfound that isn't lost, we
908 // still might retrieve our unfound.
909 return false;
910 }
911 }
912 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
913 << " have been queried or are marked lost" << dendl;
914 return true;
915 }
916
917 PastIntervals::PriorSet PG::build_prior()
918 {
919 if (1) {
920 // sanity check
921 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
922 it != peer_info.end();
923 ++it) {
924 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
925 }
926 }
927
928 const OSDMap &osdmap = *get_osdmap();
929 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
930 pool.info.ec_pool(),
931 info.history.last_epoch_started,
932 get_pgbackend()->get_is_recoverable_predicate(),
933 [&](epoch_t start, int osd, epoch_t *lost_at) {
934 const osd_info_t *pinfo = 0;
935 if (osdmap.exists(osd)) {
936 pinfo = &osdmap.get_info(osd);
937 if (lost_at)
938 *lost_at = pinfo->lost_at;
939 }
940
941 if (osdmap.is_up(osd)) {
942 return PastIntervals::UP;
943 } else if (!pinfo) {
944 return PastIntervals::DNE;
945 } else if (pinfo->lost_at > start) {
946 return PastIntervals::LOST;
947 } else {
948 return PastIntervals::DOWN;
949 }
950 },
951 up,
952 acting,
953 this);
954
955 if (prior.pg_down) {
956 state_set(PG_STATE_DOWN);
957 }
958
959 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
960 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
961 << " < same_since " << info.history.same_interval_since
962 << ", must notify monitor" << dendl;
963 need_up_thru = true;
964 } else {
965 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
966 << " >= same_since " << info.history.same_interval_since
967 << ", all is well" << dendl;
968 need_up_thru = false;
969 }
970 set_probe_targets(prior.probe);
971 return prior;
972 }
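// [Editor's illustrative sketch, not part of the original PG.cc.]
// The lambda passed to get_prior_set() above classifies an OSD relative to a
// past interval that started at `interval_start`.  The same decision written
// as a stand-alone function over a hypothetical map of OSD histories:
#include <map>

enum class PriorState { UP, DNE, DOWN, LOST };

struct OsdHistory {
  bool up;           // currently up in the OSDMap
  unsigned lost_at;  // epoch at which it was marked lost (0 = never)
};

static PriorState classify_prior_osd(
  const std::map<int, OsdHistory>& osds, int osd, unsigned interval_start)
{
  auto it = osds.find(osd);
  if (it == osds.end())
    return PriorState::DNE;              // no longer exists in the map
  if (it->second.up)
    return PriorState::UP;               // can be probed directly
  if (it->second.lost_at > interval_start)
    return PriorState::LOST;             // marked lost after the interval began
  return PriorState::DOWN;               // down, but may still come back
}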
973
974 void PG::clear_primary_state()
975 {
976 dout(10) << "clear_primary_state" << dendl;
977
978 // clear peering state
979 stray_set.clear();
980 peer_log_requested.clear();
981 peer_missing_requested.clear();
982 peer_info.clear();
983 peer_missing.clear();
984 need_up_thru = false;
985 peer_last_complete_ondisk.clear();
986 peer_activated.clear();
987 min_last_complete_ondisk = eversion_t();
988 pg_trim_to = eversion_t();
989 might_have_unfound.clear();
990 projected_log = PGLog::IndexedLog();
991
992 last_update_ondisk = eversion_t();
993
994 snap_trimq.clear();
995
996 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
997
998 missing_loc.clear();
999
1000 release_pg_backoffs();
1001
1002 pg_log.reset_recovery_pointers();
1003
1004 scrubber.reserved_peers.clear();
1005 scrub_after_recovery = false;
1006
1007 agent_clear();
1008 }
1009
1010 PG::Scrubber::Scrubber()
1011 : reserved(false), reserve_failed(false),
1012 epoch_start(0),
1013 active(false),
1014 shallow_errors(0), deep_errors(0), fixed(0),
1015 must_scrub(false), must_deep_scrub(false), must_repair(false),
1016 auto_repair(false),
1017 num_digest_updates_pending(0),
1018 state(INACTIVE),
1019 deep(false)
1020 {}
1021
1022 PG::Scrubber::~Scrubber() {}
1023
1024 /**
1025 * find_best_info
1026 *
1027 * Returns an iterator to the best info in infos sorted by:
1028 * 1) Prefer newer last_update
1029 * 2) Prefer longer tail if it brings another info into contiguity
1030 * 3) Prefer current primary
1031 */
1032 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1033 const map<pg_shard_t, pg_info_t> &infos,
1034 bool restrict_to_up_acting,
1035 bool *history_les_bound) const
1036 {
1037 assert(history_les_bound);
1038 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1039 * to make changes to this process. Also, make sure to update it
1040 * when you find bugs! */
1041 eversion_t min_last_update_acceptable = eversion_t::max();
1042 epoch_t max_last_epoch_started_found = 0;
1043 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1044 i != infos.end();
1045 ++i) {
1046 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1047 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1048 *history_les_bound = true;
1049 max_last_epoch_started_found = i->second.history.last_epoch_started;
1050 }
1051 if (!i->second.is_incomplete() &&
1052 max_last_epoch_started_found < i->second.last_epoch_started) {
1053 max_last_epoch_started_found = i->second.last_epoch_started;
1054 }
1055 }
1056 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1057 i != infos.end();
1058 ++i) {
1059 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1060 if (min_last_update_acceptable > i->second.last_update)
1061 min_last_update_acceptable = i->second.last_update;
1062 }
1063 }
1064 if (min_last_update_acceptable == eversion_t::max())
1065 return infos.end();
1066
1067 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1068 // find osd with newest last_update (oldest for ec_pool).
1069 // if there are multiples, prefer
1070 // - a longer tail, if it brings another peer into log contiguity
1071 // - the current primary
1072 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1073 p != infos.end();
1074 ++p) {
1075 if (restrict_to_up_acting && !is_up(p->first) &&
1076 !is_acting(p->first))
1077 continue;
1078 // Only consider peers with last_update >= min_last_update_acceptable
1079 if (p->second.last_update < min_last_update_acceptable)
1080 continue;
1081 // Disqualify anyone with a too old last_epoch_started
1082 if (p->second.last_epoch_started < max_last_epoch_started_found)
1083 continue;
1084 // Disqualify anyone who is incomplete (not fully backfilled)
1085 if (p->second.is_incomplete())
1086 continue;
1087 if (best == infos.end()) {
1088 best = p;
1089 continue;
1090 }
1091 // Prefer newer last_update
1092 if (pool.info.require_rollback()) {
1093 if (p->second.last_update > best->second.last_update)
1094 continue;
1095 if (p->second.last_update < best->second.last_update) {
1096 best = p;
1097 continue;
1098 }
1099 } else {
1100 if (p->second.last_update < best->second.last_update)
1101 continue;
1102 if (p->second.last_update > best->second.last_update) {
1103 best = p;
1104 continue;
1105 }
1106 }
1107
1108 // Prefer longer tail
1109 if (p->second.log_tail > best->second.log_tail) {
1110 continue;
1111 } else if (p->second.log_tail < best->second.log_tail) {
1112 best = p;
1113 continue;
1114 }
1115
1116 // prefer current primary (usually the caller), all things being equal
1117 if (p->first == pg_whoami) {
1118 dout(10) << "calc_acting prefer osd." << p->first
1119 << " because it is current primary" << dendl;
1120 best = p;
1121 continue;
1122 }
1123 }
1124 return best;
1125 }
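// [Editor's illustrative sketch, not part of the original PG.cc.]
// Ignoring the min_last_update_acceptable / last_epoch_started filtering,
// the tie-breaking above orders candidates by: newest last_update (oldest
// for require_rollback / EC pools), then longest log tail, then "is the
// current primary".  A stand-alone comparator over a hypothetical summary
// struct:
struct InfoSummary {
  unsigned last_update;
  unsigned log_tail;          // smaller tail == longer log
  bool is_current_primary;
};

// returns true if `a` is a strictly better choice than `b`
static bool better_info(const InfoSummary& a, const InfoSummary& b,
                        bool prefer_oldest_update /* EC pools */)
{
  if (a.last_update != b.last_update)
    return prefer_oldest_update ? a.last_update < b.last_update
                                : a.last_update > b.last_update;
  if (a.log_tail != b.log_tail)
    return a.log_tail < b.log_tail;      // longer tail wins
  return a.is_current_primary && !b.is_current_primary;
}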
1126
1127 void PG::calc_ec_acting(
1128 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1129 unsigned size,
1130 const vector<int> &acting,
1131 pg_shard_t acting_primary,
1132 const vector<int> &up,
1133 pg_shard_t up_primary,
1134 const map<pg_shard_t, pg_info_t> &all_info,
1135 bool restrict_to_up_acting,
1136 vector<int> *_want,
1137 set<pg_shard_t> *backfill,
1138 set<pg_shard_t> *acting_backfill,
1139 pg_shard_t *want_primary,
1140 ostream &ss)
1141 {
1142 vector<int> want(size, CRUSH_ITEM_NONE);
1143 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1144 unsigned usable = 0;
1145 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1146 i != all_info.end();
1147 ++i) {
1148 all_info_by_shard[i->first.shard].insert(i->first);
1149 }
1150 for (uint8_t i = 0; i < want.size(); ++i) {
1151 ss << "For position " << (unsigned)i << ": ";
1152 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1153 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1154 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1155 auth_log_shard->second.log_tail) {
1156 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1157 want[i] = up[i];
1158 ++usable;
1159 continue;
1160 }
1161 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1162 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1163 << " and ";
1164 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1165 }
1166
1167 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1168 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1169 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1170 auth_log_shard->second.log_tail) {
1171 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1172 want[i] = acting[i];
1173 ++usable;
1174 } else if (!restrict_to_up_acting) {
1175 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1176 j != all_info_by_shard[shard_id_t(i)].end();
1177 ++j) {
1178 assert(j->shard == i);
1179 if (!all_info.find(*j)->second.is_incomplete() &&
1180 all_info.find(*j)->second.last_update >=
1181 auth_log_shard->second.log_tail) {
1182 ss << " selecting stray: " << *j << std::endl;
1183 want[i] = j->osd;
1184 ++usable;
1185 break;
1186 }
1187 }
1188 if (want[i] == CRUSH_ITEM_NONE)
1189 ss << " failed to fill position " << (int)i << std::endl;
1190 }
1191 }
1192
1193 bool found_primary = false;
1194 for (uint8_t i = 0; i < want.size(); ++i) {
1195 if (want[i] != CRUSH_ITEM_NONE) {
1196 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1197 if (!found_primary) {
1198 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1199 found_primary = true;
1200 }
1201 }
1202 }
1203 acting_backfill->insert(backfill->begin(), backfill->end());
1204 _want->swap(want);
1205 }
1206
1207 /**
1208 * calculate the desired acting set.
1209 *
1210 * Choose an appropriate acting set. Prefer up[0], unless it is
1211 * incomplete, or another osd has a longer tail that allows us to
1212 * bring other up nodes up to date.
1213 */
1214 void PG::calc_replicated_acting(
1215 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1216 unsigned size,
1217 const vector<int> &acting,
1218 pg_shard_t acting_primary,
1219 const vector<int> &up,
1220 pg_shard_t up_primary,
1221 const map<pg_shard_t, pg_info_t> &all_info,
1222 bool restrict_to_up_acting,
1223 vector<int> *want,
1224 set<pg_shard_t> *backfill,
1225 set<pg_shard_t> *acting_backfill,
1226 pg_shard_t *want_primary,
1227 ostream &ss)
1228 {
1229 ss << "calc_acting newest update on osd." << auth_log_shard->first
1230 << " with " << auth_log_shard->second
1231 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1232 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1233
1234 // select primary
1235 map<pg_shard_t,pg_info_t>::const_iterator primary;
1236 if (up.size() &&
1237 !all_info.find(up_primary)->second.is_incomplete() &&
1238 all_info.find(up_primary)->second.last_update >=
1239 auth_log_shard->second.log_tail) {
1240 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1241 primary = all_info.find(up_primary); // prefer up[0], all thing being equal
1242 } else {
1243 assert(!auth_log_shard->second.is_incomplete());
1244 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1245 << " selected as primary instead" << std::endl;
1246 primary = auth_log_shard;
1247 }
1248
1249 ss << "calc_acting primary is osd." << primary->first
1250 << " with " << primary->second << std::endl;
1251 *want_primary = primary->first;
1252 want->push_back(primary->first.osd);
1253 acting_backfill->insert(primary->first);
1254 unsigned usable = 1;
1255
1256 // select replicas that have log contiguity with primary.
1257 // prefer up, then acting, then any peer_info osds
1258 for (vector<int>::const_iterator i = up.begin();
1259 i != up.end();
1260 ++i) {
1261 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1262 if (up_cand == primary->first)
1263 continue;
1264 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1265 if (cur_info.is_incomplete() ||
1266 cur_info.last_update < MIN(
1267 primary->second.log_tail,
1268 auth_log_shard->second.log_tail)) {
1269 /* We include auth_log_shard->second.log_tail because in GetLog,
1270 * we will request logs back to the min last_update over our
1271 * acting_backfill set, which will result in our log being extended
1272 * as far backwards as necessary to pick up any peers which can
1273 * be log recovered by auth_log_shard's log */
1274 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1275 backfill->insert(up_cand);
1276 acting_backfill->insert(up_cand);
1277 } else {
1278 want->push_back(*i);
1279 acting_backfill->insert(up_cand);
1280 usable++;
1281 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1282 }
1283 }
1284
1285 // This no longer has backfill OSDs, but they are covered above.
1286 for (vector<int>::const_iterator i = acting.begin();
1287 i != acting.end();
1288 ++i) {
1289 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1290 if (usable >= size)
1291 break;
1292
1293 // skip up osds we already considered above
1294 if (acting_cand == primary->first)
1295 continue;
1296 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1297 if (up_it != up.end())
1298 continue;
1299
1300 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1301 if (cur_info.is_incomplete() ||
1302 cur_info.last_update < primary->second.log_tail) {
1303 ss << " shard " << acting_cand << " (stray) REJECTED "
1304 << cur_info << std::endl;
1305 } else {
1306 want->push_back(*i);
1307 acting_backfill->insert(acting_cand);
1308 ss << " shard " << acting_cand << " (stray) accepted "
1309 << cur_info << std::endl;
1310 usable++;
1311 }
1312 }
1313
1314 if (restrict_to_up_acting) {
1315 return;
1316 }
1317 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1318 i != all_info.end();
1319 ++i) {
1320 if (usable >= size)
1321 break;
1322
1323 // skip up osds we already considered above
1324 if (i->first == primary->first)
1325 continue;
1326 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1327 if (up_it != up.end())
1328 continue;
1329 vector<int>::const_iterator acting_it = find(
1330 acting.begin(), acting.end(), i->first.osd);
1331 if (acting_it != acting.end())
1332 continue;
1333
1334 if (i->second.is_incomplete() ||
1335 i->second.last_update < primary->second.log_tail) {
1336 ss << " shard " << i->first << " (stray) REJECTED "
1337 << i->second << std::endl;
1338 } else {
1339 want->push_back(i->first.osd);
1340 acting_backfill->insert(i->first);
1341 ss << " shard " << i->first << " (stray) accepted "
1342 << i->second << std::endl;
1343 usable++;
1344 }
1345 }
1346 }
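// [Editor's illustrative sketch, not part of the original PG.cc.]
// For each "up" replica the function above makes a single yes/no call: keep
// it in the acting set (it can be log-recovered) or send it to backfill.
// Simplified stand-alone form, with hypothetical names and plain unsigned
// versions in place of eversion_t:
#include <algorithm>

struct ReplicaSummary {
  bool incomplete;          // still mid-backfill from an earlier interval
  unsigned last_update;     // newest log entry it has
};

static bool can_log_recover(const ReplicaSummary& r,
                            unsigned primary_log_tail,
                            unsigned auth_log_tail)
{
  // a replica is usable as-is only if its log overlaps the log we will end
  // up serving from (see the GetLog note in the function above)
  return !r.incomplete &&
         r.last_update >= std::min(primary_log_tail, auth_log_tail);
}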
1347
1348 /**
1349 * choose acting
1350 *
1351 * calculate the desired acting, and request a change with the monitor
1352 * if it differs from the current acting.
1353 *
1354 * if restrict_to_up_acting=true, we filter out anything that's not in
1355 * up/acting. in order to lift this restriction, we need to
1356 * 1) check whether it's worth switching the acting set any time we get
1357 * a new pg info (not just here, when recovery finishes)
1358 * 2) check whether anything in want_acting went down on each new map
1359 * (and, if so, calculate a new want_acting)
1360 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1361 * TODO!
1362 */
1363 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1364 bool restrict_to_up_acting,
1365 bool *history_les_bound)
1366 {
1367 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1368 all_info[pg_whoami] = info;
1369
1370 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1371 p != all_info.end();
1372 ++p) {
1373 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1374 }
1375
1376 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1377 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1378
1379 if (auth_log_shard == all_info.end()) {
1380 if (up != acting) {
1381 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1382 << " reverting to up" << dendl;
1383 want_acting = up;
1384 vector<int> empty;
1385 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1386 } else {
1387 dout(10) << "choose_acting failed" << dendl;
1388 assert(want_acting.empty());
1389 }
1390 return false;
1391 }
1392
1393 assert(!auth_log_shard->second.is_incomplete());
1394 auth_log_shard_id = auth_log_shard->first;
1395
1396 set<pg_shard_t> want_backfill, want_acting_backfill;
1397 vector<int> want;
1398 pg_shard_t want_primary;
1399 stringstream ss;
1400 if (!pool.info.ec_pool())
1401 calc_replicated_acting(
1402 auth_log_shard,
1403 get_osdmap()->get_pg_size(info.pgid.pgid),
1404 acting,
1405 primary,
1406 up,
1407 up_primary,
1408 all_info,
1409 restrict_to_up_acting,
1410 &want,
1411 &want_backfill,
1412 &want_acting_backfill,
1413 &want_primary,
1414 ss);
1415 else
1416 calc_ec_acting(
1417 auth_log_shard,
1418 get_osdmap()->get_pg_size(info.pgid.pgid),
1419 acting,
1420 primary,
1421 up,
1422 up_primary,
1423 all_info,
1424 restrict_to_up_acting,
1425 &want,
1426 &want_backfill,
1427 &want_acting_backfill,
1428 &want_primary,
1429 ss);
1430 dout(10) << ss.str() << dendl;
1431
1432 unsigned num_want_acting = 0;
1433 set<pg_shard_t> have;
1434 for (int i = 0; i < (int)want.size(); ++i) {
1435 if (want[i] != CRUSH_ITEM_NONE) {
1436 ++num_want_acting;
1437 have.insert(
1438 pg_shard_t(
1439 want[i],
1440 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1441 }
1442 }
1443
1444 // We go incomplete if below min_size for ec_pools since backfill
1445 // does not currently maintain rollbackability
1446 // Otherwise, we will go "peered", but not "active"
1447 if (num_want_acting < pool.info.min_size &&
1448 (pool.info.ec_pool() ||
1449 !cct->_conf->osd_allow_recovery_below_min_size)) {
1450 want_acting.clear();
1451 dout(10) << "choose_acting failed, below min size" << dendl;
1452 return false;
1453 }
1454
1455 /* Check whether we have enough acting shards to later perform recovery */
1456 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1457 get_pgbackend()->get_is_recoverable_predicate());
1458 if (!(*recoverable_predicate)(have)) {
1459 want_acting.clear();
1460 dout(10) << "choose_acting failed, not recoverable" << dendl;
1461 return false;
1462 }
1463
1464 if (want != acting) {
1465 dout(10) << "choose_acting want " << want << " != acting " << acting
1466 << ", requesting pg_temp change" << dendl;
1467 want_acting = want;
1468
1469 if (want_acting == up) {
1470 // There can't be any pending backfill if
1471 // want is the same as crush map up OSDs.
1472 assert(want_backfill.empty());
1473 vector<int> empty;
1474 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1475 } else
1476 osd->queue_want_pg_temp(info.pgid.pgid, want);
1477 return false;
1478 }
1479 want_acting.clear();
1480 actingbackfill = want_acting_backfill;
1481 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1482 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1483 if (backfill_targets.empty()) {
1484 // Caller is GetInfo
1485 backfill_targets = want_backfill;
1486 }
1487 // Will not change if already set because up would have had to change
1488 // Verify that nothing in backfill is in stray_set
1489 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1490 i != want_backfill.end();
1491 ++i) {
1492 assert(stray_set.find(*i) == stray_set.end());
1493 }
1494 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1495 << want_backfill << dendl;
1496 return true;
1497 }
1498
1499 /* Build the might_have_unfound set.
1500 *
1501 * This is used by the primary OSD during recovery.
1502 *
1503 * This set tracks the OSDs which might have unfound objects that the primary
1504 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1505 * will remove the OSD from the set.
1506 */
1507 void PG::build_might_have_unfound()
1508 {
1509 assert(might_have_unfound.empty());
1510 assert(is_primary());
1511
1512 dout(10) << __func__ << dendl;
1513
1514 check_past_interval_bounds();
1515
1516 might_have_unfound = past_intervals.get_might_have_unfound(
1517 pg_whoami,
1518 pool.info.ec_pool());
1519
1520 // include any (stray) peers
1521 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1522 p != peer_info.end();
1523 ++p)
1524 might_have_unfound.insert(p->first);
1525
1526 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1527 }
1528
1529 struct C_PG_ActivateCommitted : public Context {
1530 PGRef pg;
1531 epoch_t epoch;
1532 epoch_t activation_epoch;
1533 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1534 : pg(p), epoch(e), activation_epoch(ae) {}
1535 void finish(int r) override {
1536 pg->_activate_committed(epoch, activation_epoch);
1537 }
1538 };
1539
1540 void PG::activate(ObjectStore::Transaction& t,
1541 epoch_t activation_epoch,
1542 list<Context*>& tfin,
1543 map<int, map<spg_t,pg_query_t> >& query_map,
1544 map<int,
1545 vector<
1546 pair<pg_notify_t,
1547 PastIntervals> > > *activator_map,
1548 RecoveryCtx *ctx)
1549 {
1550 assert(!is_peered());
1551 assert(scrubber.callbacks.empty());
1552 assert(callbacks_for_degraded_object.empty());
1553
1554 // twiddle pg state
1555 state_clear(PG_STATE_DOWN);
1556
1557 send_notify = false;
1558
1559 if (is_primary()) {
1560 // only update primary last_epoch_started if we will go active
1561 if (acting.size() >= pool.info.min_size) {
1562 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1563 info.last_epoch_started <= activation_epoch);
1564 info.last_epoch_started = activation_epoch;
1565 info.last_interval_started = info.history.same_interval_since;
1566 }
1567 } else if (is_acting(pg_whoami)) {
1568 /* update last_epoch_started on acting replica to whatever the primary sent
1569 * unless it's smaller (could happen if we are going peered rather than
1570 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1571 if (info.last_epoch_started < activation_epoch) {
1572 info.last_epoch_started = activation_epoch;
1573 info.last_interval_started = info.history.same_interval_since;
1574 }
1575 }
1576
1577 auto &missing = pg_log.get_missing();
1578
1579 if (is_primary()) {
1580 last_update_ondisk = info.last_update;
1581 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1582 }
1583 last_update_applied = info.last_update;
1584 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1585
1586 need_up_thru = false;
1587
1588 // write pg info, log
1589 dirty_info = true;
1590 dirty_big_info = true; // maybe
1591
1592 // find out when we commit
1593 t.register_on_complete(
1594 new C_PG_ActivateCommitted(
1595 this,
1596 get_osdmap()->get_epoch(),
1597 activation_epoch));
1598
1599 // initialize snap_trimq
1600 if (is_primary()) {
1601 dout(20) << "activate - purged_snaps " << info.purged_snaps
1602 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1603 snap_trimq = pool.cached_removed_snaps;
1604 interval_set<snapid_t> intersection;
1605 intersection.intersection_of(snap_trimq, info.purged_snaps);
1606 if (intersection == info.purged_snaps) {
1607 snap_trimq.subtract(info.purged_snaps);
1608 } else {
1609 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1610 << ") is not a subset of pool.cached_removed_snaps ("
1611 << pool.cached_removed_snaps << ")" << dendl;
1612 snap_trimq.subtract(intersection);
1613 }
1614 }
1615
1616 // init complete pointer
1617 if (missing.num_missing() == 0) {
1618 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1619 << " -> " << info.last_update << dendl;
1620 info.last_complete = info.last_update;
1621 pg_log.reset_recovery_pointers();
1622 } else {
1623 dout(10) << "activate - not complete, " << missing << dendl;
1624 pg_log.activate_not_complete(info);
1625 }
1626
1627 log_weirdness();
1628
1629 // if primary..
1630 if (is_primary()) {
1631 assert(ctx);
1632 // start up replicas
1633
1634 assert(!actingbackfill.empty());
1635 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1636 i != actingbackfill.end();
1637 ++i) {
1638 if (*i == pg_whoami) continue;
1639 pg_shard_t peer = *i;
1640 assert(peer_info.count(peer));
1641 pg_info_t& pi = peer_info[peer];
1642
1643 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1644
1645 MOSDPGLog *m = 0;
1646 assert(peer_missing.count(peer));
1647 pg_missing_t& pm = peer_missing[peer];
1648
1649 bool needs_past_intervals = pi.dne();
1650
1651 /*
1652 * cover case where peer sort order was different and
1653 * last_backfill cannot be interpreted
1654 */
1655 bool force_restart_backfill =
1656 !pi.last_backfill.is_max() &&
1657 !pi.last_backfill_bitwise;
1658
1659 if (pi.last_update == info.last_update && !force_restart_backfill) {
1660 // empty log
1661 if (!pi.last_backfill.is_max())
1662 osd->clog->info() << info.pgid << " continuing backfill to osd."
1663 << peer
1664 << " from (" << pi.log_tail << "," << pi.last_update
1665 << "] " << pi.last_backfill
1666 << " to " << info.last_update;
1667 if (!pi.is_empty() && activator_map) {
1668 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1669 (*activator_map)[peer.osd].push_back(
1670 make_pair(
1671 pg_notify_t(
1672 peer.shard, pg_whoami.shard,
1673 get_osdmap()->get_epoch(),
1674 get_osdmap()->get_epoch(),
1675 info),
1676 past_intervals));
1677 } else {
1678 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1679 m = new MOSDPGLog(
1680 i->shard, pg_whoami.shard,
1681 get_osdmap()->get_epoch(), info);
1682 }
1683 } else if (
1684 pg_log.get_tail() > pi.last_update ||
1685 pi.last_backfill == hobject_t() ||
1686 force_restart_backfill ||
1687 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1688 /* ^ This last case covers a situation where a replica is not contiguous
1689 * with the auth_log, but is contiguous with this replica. Reshuffling
1690 * the active set to handle this would be tricky, so instead we just go
1691 * ahead and backfill it anyway. This is probably preferable in any
1692 * case since the replica in question would have to be significantly
1693 * behind.
1694 */
1695 // backfill
1696 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1697 << " from (" << pi.log_tail << "," << pi.last_update
1698 << "] " << pi.last_backfill
1699 << " to " << info.last_update;
1700
1701 pi.last_update = info.last_update;
1702 pi.last_complete = info.last_update;
1703 pi.set_last_backfill(hobject_t());
1704 pi.last_epoch_started = info.last_epoch_started;
1705 pi.last_interval_started = info.last_interval_started;
1706 pi.history = info.history;
1707 pi.hit_set = info.hit_set;
1708 pi.stats.stats.clear();
1709
1710 // initialize peer with our purged_snaps.
1711 pi.purged_snaps = info.purged_snaps;
1712
1713 m = new MOSDPGLog(
1714 i->shard, pg_whoami.shard,
1715 get_osdmap()->get_epoch(), pi);
1716
1717 // send some recent log, so that op dup detection works well.
1718 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1719 m->info.log_tail = m->log.tail;
1720 pi.log_tail = m->log.tail; // sigh...
1721
1722 pm.clear();
1723 } else {
1724 // catch up
1725 assert(pg_log.get_tail() <= pi.last_update);
1726 m = new MOSDPGLog(
1727 i->shard, pg_whoami.shard,
1728 get_osdmap()->get_epoch(), info);
1729 // send new stuff to append to replicas log
1730 m->log.copy_after(pg_log.get_log(), pi.last_update);
1731 }
1732
1733 // share past_intervals if we are creating the pg on the replica
1734 // based on whether our info for that peer was dne() *before*
1735 // updating pi.history in the backfill block above.
1736 if (m && needs_past_intervals)
1737 m->past_intervals = past_intervals;
1738
1739 // update local version of peer's missing list!
1740 if (m && pi.last_backfill != hobject_t()) {
1741 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1742 p != m->log.log.end();
1743 ++p) {
1744 if (p->soid <= pi.last_backfill &&
1745 !p->is_error()) {
1746 if (perform_deletes_during_peering() && p->is_delete()) {
1747 pm.rm(p->soid, p->version);
1748 } else {
1749 pm.add_next_event(*p);
1750 }
1751 }
1752 }
1753 }
1754
1755 if (m) {
1756 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1757 //m->log.print(cout);
1758 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1759 }
1760
1761 // peer now has
1762 pi.last_update = info.last_update;
1763
1764 // update our missing
1765 if (pm.num_missing() == 0) {
1766 pi.last_complete = pi.last_update;
1767 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1768 } else {
1769 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1770 }
1771 }
1772
1773 // Set up missing_loc
1774 set<pg_shard_t> complete_shards;
1775 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1776 i != actingbackfill.end();
1777 ++i) {
1778 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1779 if (*i == get_primary()) {
1780 missing_loc.add_active_missing(missing);
1781 if (!missing.have_missing())
1782 complete_shards.insert(*i);
1783 } else {
1784 auto peer_missing_entry = peer_missing.find(*i);
1785 assert(peer_missing_entry != peer_missing.end());
1786 missing_loc.add_active_missing(peer_missing_entry->second);
1787 if (!peer_missing_entry->second.have_missing() &&
1788 peer_info[*i].last_backfill.is_max())
1789 complete_shards.insert(*i);
1790 }
1791 }
1792 // If necessary, create might_have_unfound to help us find our unfound objects.
1793 // NOTE: It's important that we build might_have_unfound before trimming the
1794 // past intervals.
1795 might_have_unfound.clear();
1796 if (needs_recovery()) {
1797 // If only one shard has missing objects, we do a trick and add all the
1798 // others as recovery sources. This is considered safe since the PGLogs have
1799 // been merged locally, and it covers the vast majority of use cases, such as
1800 // one OSD/host being down for a while for hardware repair.
1801 if (complete_shards.size() + 1 == actingbackfill.size()) {
1802 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1803 } else {
1804 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1805 ctx->handle);
1806 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1807 i != actingbackfill.end();
1808 ++i) {
1809 if (*i == pg_whoami) continue;
1810 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1811 assert(peer_missing.count(*i));
1812 assert(peer_info.count(*i));
1813 missing_loc.add_source_info(
1814 *i,
1815 peer_info[*i],
1816 peer_missing[*i],
1817 ctx->handle);
1818 }
1819 }
1820 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1821 i != peer_missing.end();
1822 ++i) {
1823 if (is_actingbackfill(i->first))
1824 continue;
1825 assert(peer_info.count(i->first));
1826 search_for_missing(
1827 peer_info[i->first],
1828 i->second,
1829 i->first,
1830 ctx);
1831 }
1832
1833 build_might_have_unfound();
1834
1835 if (have_unfound())
1836 discover_all_missing(query_map);
1837 }
1838
1839 // num_objects_degraded, if calculated, should reflect this too, unless there
1840 // is nothing missing and we are about to go clean.
1841 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1842 state_set(PG_STATE_UNDERSIZED);
1843 }
1844
1845 state_set(PG_STATE_ACTIVATING);
1846 release_pg_backoffs();
1847 projected_last_update = info.last_update;
1848 }
1849 if (acting.size() >= pool.info.min_size) {
1850 PGLogEntryHandler handler{this, &t};
1851 pg_log.roll_forward(&handler);
1852 }
1853 }
1854
1855 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1856 {
1857 // only check MOSDOp
1858 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1859 return true;
1860
1861 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1862
1863 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1864 if (!session) {
1865 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1866 return false;
1867 }
1868 OSDCap& caps = session->caps;
1869 session->put();
1870
1871 const string &key = req->get_hobj().get_key().empty() ?
1872 req->get_oid().name :
1873 req->get_hobj().get_key();
1874
1875 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1876 pool.auid, key,
1877 op->need_read_cap(),
1878 op->need_write_cap(),
1879 op->classes());
1880
1881 dout(20) << "op_has_sufficient_caps "
1882 << "session=" << session
1883 << " pool=" << pool.id << " (" << pool.name
1884 << " " << req->get_hobj().nspace
1885 << ") owner=" << pool.auid
1886 << " need_read_cap=" << op->need_read_cap()
1887 << " need_write_cap=" << op->need_write_cap()
1888 << " classes=" << op->classes()
1889 << " -> " << (cap ? "yes" : "NO")
1890 << dendl;
1891 return cap;
1892 }
1893
1894 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1895 {
1896 lock();
1897 if (pg_has_reset_since(epoch)) {
1898 dout(10) << "_activate_committed " << epoch
1899 << ", that was an old interval" << dendl;
1900 } else if (is_primary()) {
1901 peer_activated.insert(pg_whoami);
1902 dout(10) << "_activate_committed " << epoch
1903 << " peer_activated now " << peer_activated
1904 << " last_interval_started " << info.history.last_interval_started
1905 << " last_epoch_started " << info.history.last_epoch_started
1906 << " same_interval_since " << info.history.same_interval_since << dendl;
1907 assert(!actingbackfill.empty());
1908 if (peer_activated.size() == actingbackfill.size())
1909 all_activated_and_committed();
1910 } else {
1911 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1912 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1913 pg_notify_t i = pg_notify_t(
1914 get_primary().shard, pg_whoami.shard,
1915 get_osdmap()->get_epoch(),
1916 get_osdmap()->get_epoch(),
1917 info);
1918
1919 i.info.history.last_epoch_started = activation_epoch;
1920 i.info.history.last_interval_started = i.info.history.same_interval_since;
1921 if (acting.size() >= pool.info.min_size) {
1922 state_set(PG_STATE_ACTIVE);
1923 } else {
1924 state_set(PG_STATE_PEERED);
1925 }
1926
1927 m->pg_list.push_back(make_pair(i, PastIntervals()));
1928 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1929
1930 // waiters
1931 if (flushes_in_progress == 0) {
1932 requeue_ops(waiting_for_peered);
1933 } else if (!waiting_for_peered.empty()) {
1934 dout(10) << __func__ << " flushes in progress, moving "
1935 << waiting_for_peered.size() << " items to waiting_for_flush"
1936 << dendl;
1937 assert(waiting_for_flush.empty());
1938 waiting_for_flush.swap(waiting_for_peered);
1939 }
1940 }
1941
1942 assert(!dirty_info);
1943
1944 unlock();
1945 }
1946
1947 /*
1948 * update info.history.last_epoch_started ONLY after we and all
1949 * replicas have activated AND committed the activate transaction
1950 * (i.e. the peering results are stable on disk).
1951 */
1952 void PG::all_activated_and_committed()
1953 {
1954 dout(10) << "all_activated_and_committed" << dendl;
1955 assert(is_primary());
1956 assert(peer_activated.size() == actingbackfill.size());
1957 assert(!actingbackfill.empty());
1958 assert(blocked_by.empty());
1959
1960 // Degraded?
1961 _update_calc_stats();
1962 if (info.stats.stats.sum.num_objects_degraded) {
1963 state_set(PG_STATE_DEGRADED);
1964 } else {
1965 state_clear(PG_STATE_DEGRADED);
1966 }
1967
1968 queue_peering_event(
1969 CephPeeringEvtRef(
1970 std::make_shared<CephPeeringEvt>(
1971 get_osdmap()->get_epoch(),
1972 get_osdmap()->get_epoch(),
1973 AllReplicasActivated())));
1974 }
1975
1976 bool PG::requeue_scrub(bool high_priority)
1977 {
1978 assert(is_locked());
1979 if (scrub_queued) {
1980 dout(10) << __func__ << ": already queued" << dendl;
1981 return false;
1982 } else {
1983 dout(10) << __func__ << ": queueing" << dendl;
1984 scrub_queued = true;
1985 osd->queue_for_scrub(this, high_priority);
1986 return true;
1987 }
1988 }
1989
1990 void PG::queue_recovery()
1991 {
1992 if (!is_primary() || !is_peered()) {
1993 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1994 assert(!recovery_queued);
1995 } else if (recovery_queued) {
1996 dout(10) << "queue_recovery -- already queued" << dendl;
1997 } else {
1998 dout(10) << "queue_recovery -- queuing" << dendl;
1999 recovery_queued = true;
2000 osd->queue_for_recovery(this);
2001 }
2002 }
2003
2004 bool PG::queue_scrub()
2005 {
2006 assert(is_locked());
2007 if (is_scrubbing()) {
2008 return false;
2009 }
2010 scrubber.priority = scrubber.must_scrub ?
2011 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2012 scrubber.must_scrub = false;
2013 state_set(PG_STATE_SCRUBBING);
2014 if (scrubber.must_deep_scrub) {
2015 state_set(PG_STATE_DEEP_SCRUB);
2016 scrubber.must_deep_scrub = false;
2017 }
2018 if (scrubber.must_repair || scrubber.auto_repair) {
2019 state_set(PG_STATE_REPAIR);
2020 scrubber.must_repair = false;
2021 }
2022 requeue_scrub();
2023 return true;
2024 }
2025
2026 unsigned PG::get_scrub_priority()
2027 {
2028 // a higher value -> a higher priority
2029 int pool_scrub_priority = 0;
2030 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2031 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2032 }
2033
2034 struct C_PG_FinishRecovery : public Context {
2035 PGRef pg;
2036 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2037 void finish(int r) override {
2038 pg->_finish_recovery(this);
2039 }
2040 };
2041
2042 void PG::mark_clean()
2043 {
2044 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2045 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2046 state_set(PG_STATE_CLEAN);
2047 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2048 info.history.last_interval_clean = info.history.same_interval_since;
2049 past_intervals.clear();
2050 dirty_big_info = true;
2051 dirty_info = true;
2052 }
2053
2054 kick_snap_trim();
2055 }
2056
2057 void PG::_change_recovery_force_mode(int new_mode, bool clear)
2058 {
2059 if (!deleting) {
2060 // we can't and shouldn't do anything if the PG is being deleted locally
2061 if (clear) {
2062 state_clear(new_mode);
2063 } else {
2064 state_set(new_mode);
2065 }
2066 publish_stats_to_osd();
2067 }
2068 }
2069
2070 inline int PG::clamp_recovery_priority(int priority)
2071 {
2072 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2073 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2074
2075 // Clamp to valid range
2076 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2077 return OSD_RECOVERY_PRIORITY_MAX;
2078 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2079 return OSD_RECOVERY_PRIORITY_MIN;
2080 } else {
2081 return priority;
2082 }
2083 }
2084
2085 unsigned PG::get_recovery_priority()
2086 {
2087 // a higher value -> a higher priority
2088 int ret = 0;
2089
2090 if (state & PG_STATE_FORCED_RECOVERY) {
2091 ret = OSD_RECOVERY_PRIORITY_FORCED;
2092 } else {
2093 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2094 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2095 }
2096 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2097 return static_cast<unsigned>(ret);
2098 }
2099
2100 unsigned PG::get_backfill_priority()
2101 {
2102 // a higher value -> a higher priority
2103 int ret = OSD_BACKFILL_PRIORITY_BASE;
2104 if (state & PG_STATE_FORCED_BACKFILL) {
2105 ret = OSD_RECOVERY_PRIORITY_FORCED;
2106 } else {
2107 if (acting.size() < pool.info.min_size) {
2108 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2109 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2110
2111 } else if (is_undersized()) {
2112 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2113 assert(pool.info.size > actingset.size());
2114 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2115
2116 } else if (is_degraded()) {
2117 // degraded: baseline degraded
2118 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2119 }
2120
2121 // Adjust with pool's recovery priority
2122 int pool_recovery_priority = 0;
2123 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2124
2125 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2126 }
2127
2128 return static_cast<unsigned>(ret);
2129 }
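// A minimal sketch of how the arithmetic above plays out, using hypothetical
// values (replicated pool, size=3, min_size=2, no pool-level RECOVERY_PRIORITY
// option set):
//
//   acting.size()==1 (below min_size) -> OSD_BACKFILL_INACTIVE_PRIORITY_BASE + 1
//   undersized, actingset.size()==2   -> OSD_BACKFILL_DEGRADED_PRIORITY_BASE + 1
//   degraded but not undersized       -> OSD_BACKFILL_DEGRADED_PRIORITY_BASE
//
// The non-forced result is then passed through clamp_recovery_priority(), so a
// pool's RECOVERY_PRIORITY offset can never push the value outside
// [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX].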
2130
2131 void PG::finish_recovery(list<Context*>& tfin)
2132 {
2133 dout(10) << "finish_recovery" << dendl;
2134 assert(info.last_complete == info.last_update);
2135
2136 clear_recovery_state();
2137
2138 /*
2139 * sync all this before purging strays. but don't block!
2140 */
2141 finish_sync_event = new C_PG_FinishRecovery(this);
2142 tfin.push_back(finish_sync_event);
2143 }
2144
2145 void PG::_finish_recovery(Context *c)
2146 {
2147 lock();
2148 if (deleting) {
2149 unlock();
2150 return;
2151 }
2152 if (c == finish_sync_event) {
2153 dout(10) << "_finish_recovery" << dendl;
2154 finish_sync_event = 0;
2155 purge_strays();
2156
2157 publish_stats_to_osd();
2158
2159 if (scrub_after_recovery) {
2160 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2161 scrub_after_recovery = false;
2162 scrubber.must_deep_scrub = true;
2163 queue_scrub();
2164 }
2165 } else {
2166 dout(10) << "_finish_recovery -- stale" << dendl;
2167 }
2168 unlock();
2169 }
2170
2171 void PG::start_recovery_op(const hobject_t& soid)
2172 {
2173 dout(10) << "start_recovery_op " << soid
2174 #ifdef DEBUG_RECOVERY_OIDS
2175 << " (" << recovering_oids << ")"
2176 #endif
2177 << dendl;
2178 assert(recovery_ops_active >= 0);
2179 recovery_ops_active++;
2180 #ifdef DEBUG_RECOVERY_OIDS
2181 assert(recovering_oids.count(soid) == 0);
2182 recovering_oids.insert(soid);
2183 #endif
2184 osd->start_recovery_op(this, soid);
2185 }
2186
2187 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2188 {
2189 dout(10) << "finish_recovery_op " << soid
2190 #ifdef DEBUG_RECOVERY_OIDS
2191 << " (" << recovering_oids << ")"
2192 #endif
2193 << dendl;
2194 assert(recovery_ops_active > 0);
2195 recovery_ops_active--;
2196 #ifdef DEBUG_RECOVERY_OIDS
2197 assert(recovering_oids.count(soid));
2198 recovering_oids.erase(soid);
2199 #endif
2200 osd->finish_recovery_op(this, soid, dequeue);
2201
2202 if (!dequeue) {
2203 queue_recovery();
2204 }
2205 }
2206
2207 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2208 {
2209 child->update_snap_mapper_bits(split_bits);
2210 child->update_osdmap_ref(get_osdmap());
2211
2212 child->pool = pool;
2213
2214 // Log
2215 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2216 child->info.last_complete = info.last_complete;
2217
2218 info.last_update = pg_log.get_head();
2219 child->info.last_update = child->pg_log.get_head();
2220
2221 child->info.last_user_version = info.last_user_version;
2222
2223 info.log_tail = pg_log.get_tail();
2224 child->info.log_tail = child->pg_log.get_tail();
2225
2226 if (info.last_complete < pg_log.get_tail())
2227 info.last_complete = pg_log.get_tail();
2228 if (child->info.last_complete < child->pg_log.get_tail())
2229 child->info.last_complete = child->pg_log.get_tail();
2230
2231 // Info
2232 child->info.history = info.history;
2233 child->info.history.epoch_created = get_osdmap()->get_epoch();
2234 child->info.purged_snaps = info.purged_snaps;
2235
2236 if (info.last_backfill.is_max()) {
2237 child->info.set_last_backfill(hobject_t::get_max());
2238 } else {
2239 // restart backfill on parent and child to be safe. we could
2240 // probably do better in the bitwise sort case, but it's more
2241 // fragile (there may be special work to do on backfill completion
2242 // in the future).
2243 info.set_last_backfill(hobject_t());
2244 child->info.set_last_backfill(hobject_t());
2245 // restarting backfill implies that the missing set is empty,
2246 // since it is only used for objects prior to last_backfill
2247 pg_log.reset_backfill();
2248 child->pg_log.reset_backfill();
2249 }
2250
2251 child->info.stats = info.stats;
2252 child->info.stats.parent_split_bits = split_bits;
2253 info.stats.stats_invalid = true;
2254 child->info.stats.stats_invalid = true;
2255 child->info.last_epoch_started = info.last_epoch_started;
2256 child->info.last_interval_started = info.last_interval_started;
2257
2258 child->snap_trimq = snap_trimq;
2259
2260 // There can't be recovery/backfill going on now
2261 int primary, up_primary;
2262 vector<int> newup, newacting;
2263 get_osdmap()->pg_to_up_acting_osds(
2264 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2265 child->init_primary_up_acting(
2266 newup,
2267 newacting,
2268 up_primary,
2269 primary);
2270 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2271
2272 // this comparison includes primary rank via pg_shard_t
2273 if (get_primary() != child->get_primary())
2274 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2275
2276 child->info.stats.up = up;
2277 child->info.stats.up_primary = up_primary;
2278 child->info.stats.acting = acting;
2279 child->info.stats.acting_primary = primary;
2280 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2281
2282 // History
2283 child->past_intervals = past_intervals;
2284
2285 _split_into(child_pgid, child, split_bits);
2286
2287 // release all backoffs for simplicity
2288 release_backoffs(hobject_t(), hobject_t::get_max());
2289
2290 child->on_new_interval();
2291
2292 child->dirty_info = true;
2293 child->dirty_big_info = true;
2294 dirty_info = true;
2295 dirty_big_info = true;
2296 }
2297
2298 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2299 {
2300 ConnectionRef con = s->con;
2301 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2302 return;
2303 BackoffRef b(s->have_backoff(info.pgid, begin));
2304 if (b) {
2305 derr << __func__ << " already have backoff for " << s << " begin " << begin
2306 << " " << *b << dendl;
2307 ceph_abort();
2308 }
2309 Mutex::Locker l(backoff_lock);
2310 {
2311 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2312 backoffs[begin].insert(b);
2313 s->add_backoff(b);
2314 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2315 }
2316 con->send_message(
2317 new MOSDBackoff(
2318 info.pgid,
2319 get_osdmap()->get_epoch(),
2320 CEPH_OSD_BACKOFF_OP_BLOCK,
2321 b->id,
2322 begin,
2323 end));
2324 }
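// Rough sketch of the backoff handshake implied above (the client-side
// behaviour is an assumption, it is not shown in this file): the OSD sends an
// MOSDBackoff BLOCK for the given range, and the client is expected to stop
// sending ops that target objects in that range, acknowledge the block, and
// only resume once it receives the matching UNBLOCK (sent from
// release_backoffs() below).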
2325
2326 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2327 {
2328 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2329 vector<BackoffRef> bv;
2330 {
2331 Mutex::Locker l(backoff_lock);
2332 auto p = backoffs.lower_bound(begin);
2333 while (p != backoffs.end()) {
2334 int r = cmp(p->first, end);
2335 dout(20) << __func__ << " ? " << r << " " << p->first
2336 << " " << p->second << dendl;
2337 // note: must still examine begin=end=p->first case
2338 if (r > 0 || (r == 0 && begin < end)) {
2339 break;
2340 }
2341 dout(20) << __func__ << " checking " << p->first
2342 << " " << p->second << dendl;
2343 auto q = p->second.begin();
2344 while (q != p->second.end()) {
2345 dout(20) << __func__ << " checking " << *q << dendl;
2346 int r = cmp((*q)->begin, begin);
2347 if (r == 0 || (r > 0 && (*q)->end < end)) {
2348 bv.push_back(*q);
2349 q = p->second.erase(q);
2350 } else {
2351 ++q;
2352 }
2353 }
2354 if (p->second.empty()) {
2355 p = backoffs.erase(p);
2356 } else {
2357 ++p;
2358 }
2359 }
2360 }
2361 for (auto b : bv) {
2362 Mutex::Locker l(b->lock);
2363 dout(10) << __func__ << " " << *b << dendl;
2364 if (b->session) {
2365 assert(b->pg == this);
2366 ConnectionRef con = b->session->con;
2367 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2368 con->send_message(
2369 new MOSDBackoff(
2370 info.pgid,
2371 get_osdmap()->get_epoch(),
2372 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2373 b->id,
2374 b->begin,
2375 b->end));
2376 }
2377 if (b->is_new()) {
2378 b->state = Backoff::STATE_DELETING;
2379 } else {
2380 b->session->rm_backoff(b);
2381 b->session.reset();
2382 }
2383 b->pg.reset();
2384 }
2385 }
2386 }
2387
2388 void PG::clear_backoffs()
2389 {
2390 dout(10) << __func__ << " " << dendl;
2391 map<hobject_t,set<BackoffRef>> ls;
2392 {
2393 Mutex::Locker l(backoff_lock);
2394 ls.swap(backoffs);
2395 }
2396 for (auto& p : ls) {
2397 for (auto& b : p.second) {
2398 Mutex::Locker l(b->lock);
2399 dout(10) << __func__ << " " << *b << dendl;
2400 if (b->session) {
2401 assert(b->pg == this);
2402 if (b->is_new()) {
2403 b->state = Backoff::STATE_DELETING;
2404 } else {
2405 b->session->rm_backoff(b);
2406 b->session.reset();
2407 }
2408 b->pg.reset();
2409 }
2410 }
2411 }
2412 }
2413
2414 // called by Session::clear_backoffs()
2415 void PG::rm_backoff(BackoffRef b)
2416 {
2417 dout(10) << __func__ << " " << *b << dendl;
2418 Mutex::Locker l(backoff_lock);
2419 assert(b->lock.is_locked_by_me());
2420 assert(b->pg == this);
2421 auto p = backoffs.find(b->begin);
2422 // may race with release_backoffs()
2423 if (p != backoffs.end()) {
2424 auto q = p->second.find(b);
2425 if (q != p->second.end()) {
2426 p->second.erase(q);
2427 if (p->second.empty()) {
2428 backoffs.erase(p);
2429 }
2430 }
2431 }
2432 }
2433
2434 void PG::clear_recovery_state()
2435 {
2436 dout(10) << "clear_recovery_state" << dendl;
2437
2438 pg_log.reset_recovery_pointers();
2439 finish_sync_event = 0;
2440
2441 hobject_t soid;
2442 while (recovery_ops_active > 0) {
2443 #ifdef DEBUG_RECOVERY_OIDS
2444 soid = *recovering_oids.begin();
2445 #endif
2446 finish_recovery_op(soid, true);
2447 }
2448
2449 backfill_targets.clear();
2450 backfill_info.clear();
2451 peer_backfill_info.clear();
2452 waiting_on_backfill.clear();
2453 _clear_recovery_state(); // pg impl specific hook
2454 }
2455
2456 void PG::cancel_recovery()
2457 {
2458 dout(10) << "cancel_recovery" << dendl;
2459 clear_recovery_state();
2460 }
2461
2462
2463 void PG::purge_strays()
2464 {
2465 dout(10) << "purge_strays " << stray_set << dendl;
2466
2467 bool removed = false;
2468 for (set<pg_shard_t>::iterator p = stray_set.begin();
2469 p != stray_set.end();
2470 ++p) {
2471 assert(!is_actingbackfill(*p));
2472 if (get_osdmap()->is_up(p->osd)) {
2473 dout(10) << "sending PGRemove to osd." << *p << dendl;
2474 vector<spg_t> to_remove;
2475 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2476 MOSDPGRemove *m = new MOSDPGRemove(
2477 get_osdmap()->get_epoch(),
2478 to_remove);
2479 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2480 } else {
2481 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2482 }
2483 peer_missing.erase(*p);
2484 peer_info.erase(*p);
2485 peer_purged.insert(*p);
2486 removed = true;
2487 }
2488
2489 // if we removed anyone, update peers (which include peer_info)
2490 if (removed)
2491 update_heartbeat_peers();
2492
2493 stray_set.clear();
2494
2495 // clear _requested maps; we may have to peer() again if we discover
2496 // (more) stray content
2497 peer_log_requested.clear();
2498 peer_missing_requested.clear();
2499 }
2500
2501 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2502 {
2503 Mutex::Locker l(heartbeat_peer_lock);
2504 probe_targets.clear();
2505 for (set<pg_shard_t>::iterator i = probe_set.begin();
2506 i != probe_set.end();
2507 ++i) {
2508 probe_targets.insert(i->osd);
2509 }
2510 }
2511
2512 void PG::clear_probe_targets()
2513 {
2514 Mutex::Locker l(heartbeat_peer_lock);
2515 probe_targets.clear();
2516 }
2517
2518 void PG::update_heartbeat_peers()
2519 {
2520 assert(is_locked());
2521
2522 if (!is_primary())
2523 return;
2524
2525 set<int> new_peers;
2526 for (unsigned i=0; i<acting.size(); i++) {
2527 if (acting[i] != CRUSH_ITEM_NONE)
2528 new_peers.insert(acting[i]);
2529 }
2530 for (unsigned i=0; i<up.size(); i++) {
2531 if (up[i] != CRUSH_ITEM_NONE)
2532 new_peers.insert(up[i]);
2533 }
2534 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2535 p != peer_info.end();
2536 ++p)
2537 new_peers.insert(p->first.osd);
2538
2539 bool need_update = false;
2540 heartbeat_peer_lock.Lock();
2541 if (new_peers == heartbeat_peers) {
2542 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2543 } else {
2544 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2545 heartbeat_peers.swap(new_peers);
2546 need_update = true;
2547 }
2548 heartbeat_peer_lock.Unlock();
2549
2550 if (need_update)
2551 osd->need_heartbeat_peer_update();
2552 }
2553
2554
2555 bool PG::check_in_progress_op(
2556 const osd_reqid_t &r,
2557 eversion_t *version,
2558 version_t *user_version,
2559 int *return_code) const
2560 {
2561 return (
2562 projected_log.get_request(r, version, user_version, return_code) ||
2563 pg_log.get_log().get_request(r, version, user_version, return_code));
2564 }
2565
2566 void PG::_update_calc_stats()
2567 {
2568 info.stats.version = info.last_update;
2569 info.stats.created = info.history.epoch_created;
2570 info.stats.last_scrub = info.history.last_scrub;
2571 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2572 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2573 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2574 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2575 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2576
2577 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2578 info.stats.ondisk_log_size = info.stats.log_size;
2579 info.stats.log_start = pg_log.get_tail();
2580 info.stats.ondisk_log_start = pg_log.get_tail();
2581 info.stats.snaptrimq_len = snap_trimq.size();
2582
2583 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
2584
2585 // In the rare case that upset is too large (usually transient), use it as
2586 // the target for the calculations below.
2587 unsigned target = std::max(num_shards, (unsigned)upset.size());
2588 // Not sure this could ever happen (actingset > upset), and it only matters
2589 // if actingset > num_shards.
2590 unsigned nrep = std::max(actingset.size(), upset.size());
2591 // calc num_object_copies
2592 info.stats.stats.calc_copies(MAX(target, nrep));
2593 info.stats.stats.sum.num_objects_degraded = 0;
2594 info.stats.stats.sum.num_objects_unfound = 0;
2595 info.stats.stats.sum.num_objects_misplaced = 0;
2596 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2597 dout(20) << __func__ << " actingset " << actingset << " upset "
2598 << upset << " actingbackfill " << actingbackfill << dendl;
2599 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
2600
2601 assert(!actingbackfill.empty());
2602
2603 // NOTE: we only generate degraded, misplaced and unfound
2604 // values for the summation, not individual stat categories.
2605 int64_t num_objects = info.stats.stats.sum.num_objects;
2606
2607 // Objects missing from up nodes, sorted by # objects.
2608 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
2609 // Objects missing from nodes not in up, sorted by # objects
2610 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
2611
2612 int64_t missing;
2613
2614 // Primary first
2615 missing = pg_log.get_missing().num_missing();
2616 assert(actingbackfill.count(pg_whoami));
2617 if (upset.count(pg_whoami)) {
2618 missing_target_objects.insert(make_pair(missing, pg_whoami));
2619 } else {
2620 acting_source_objects.insert(make_pair(missing, pg_whoami));
2621 }
2622 info.stats.stats.sum.num_objects_missing_on_primary = missing;
2623
2624 // All other peers
2625 for (auto& peer : peer_info) {
2626 // Ignore other peers until we add code to look at detailed missing
2627 // information. (recovery)
2628 if (!actingbackfill.count(peer.first)) {
2629 continue;
2630 }
2631 missing = 0;
2632 // Backfill targets always track num_objects accurately;
2633 // all other peers track missing accurately.
2634 if (is_backfill_targets(peer.first)) {
2635 missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
2636 } else {
2637 if (peer_missing.count(peer.first)) {
2638 missing = peer_missing[peer.first].num_missing();
2639 } else {
2640 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
2641 }
2642 }
2643 if (upset.count(peer.first)) {
2644 missing_target_objects.insert(make_pair(missing, peer.first));
2645 } else {
2646 acting_source_objects.insert(make_pair(missing, peer.first));
2647 }
2648 peer.second.stats.stats.sum.num_objects_missing = missing;
2649 }
2650
2651 if (pool.info.is_replicated()) {
2652 // Add to missing_target_objects up to target elements (num_objects missing)
2653 assert(target >= missing_target_objects.size());
2654 unsigned needed = target - missing_target_objects.size();
2655 for (; needed; --needed)
2656 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
2657 } else {
2658 for (unsigned i = 0 ; i < num_shards; ++i) {
2659 shard_id_t shard(i);
2660 bool found = false;
2661 for (const auto& t : missing_target_objects) {
2662 if (std::get<1>(t).shard == shard) {
2663 found = true;
2664 break;
2665 }
2666 }
2667 if (!found)
2668 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
2669 }
2670 }
2671
2672 for (const auto& item : missing_target_objects)
2673 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2674 for (const auto& item : acting_source_objects)
2675 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2676
2677 // A misplaced object is not stored on the correct OSD
2678 int64_t misplaced = 0;
2679 // A degraded object has fewer replicas or EC shards than the pool specifies.
2680 int64_t degraded = 0;
2681
2682 for (auto m = missing_target_objects.rbegin();
2683 m != missing_target_objects.rend(); ++m) {
2684
2685 int64_t extra_missing = -1;
2686
2687 if (pool.info.is_replicated()) {
2688 if (!acting_source_objects.empty()) {
2689 auto extra_copy = acting_source_objects.begin();
2690 extra_missing = std::get<0>(*extra_copy);
2691 acting_source_objects.erase(extra_copy);
2692 }
2693 } else { // Erasure coded
2694 // Use corresponding shard
2695 for (const auto& a : acting_source_objects) {
2696 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
2697 extra_missing = std::get<0>(a);
2698 acting_source_objects.erase(a);
2699 break;
2700 }
2701 }
2702 }
2703
2704 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
2705 // We don't know which of the objects on the target
2706 // are part of extra_missing, so assume they are all degraded.
2707 misplaced += std::get<0>(*m) - extra_missing;
2708 degraded += extra_missing;
2709 } else {
2710 // 1. extra_missing == -1: more targets than sources, so degraded
2711 // 2. extra_missing > std::get<0>(*m): we know that some of the objects in
2712 // extra_missing that were previously degraded are now present on the target.
2713 degraded += std::get<0>(*m);
2714 }
2715 }
2716 // If there are still acting shards that haven't been accounted for,
2717 // then they are misplaced
2718 for (const auto& a : acting_source_objects) {
2719 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
2720 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
2721 misplaced += extra_misplaced;
2722 }
2723 dout(20) << __func__ << " degraded " << degraded << dendl;
2724 dout(20) << __func__ << " misplaced " << misplaced << dendl;
2725
2726 info.stats.stats.sum.num_objects_degraded = degraded;
2727 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2728 info.stats.stats.sum.num_objects_misplaced = misplaced;
2729 }
2730 }
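// A worked example of the degraded/misplaced accounting above, with
// hypothetical values: replicated pool, size=3, num_objects=100, one OSD down,
// so up = acting = [0,1] and nothing is missing on the survivors. The absent
// third up shard is padded with a NO_OSD placeholder carrying 100 "missing"
// objects; with no spare acting source to pair it with, all 100 count as
// degraded and none as misplaced (each object lacks one copy, but every
// surviving copy is on the right OSD).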
2731
2732 void PG::_update_blocked_by()
2733 {
2734 // set a max on the number of blocking peers we report. if we go
2735 // over, report a random subset. keep the result sorted.
2736 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2737 unsigned skip = blocked_by.size() - keep;
2738 info.stats.blocked_by.clear();
2739 info.stats.blocked_by.resize(keep);
2740 unsigned pos = 0;
2741 for (set<int>::iterator p = blocked_by.begin();
2742 p != blocked_by.end() && keep > 0;
2743 ++p) {
2744 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2745 --skip;
2746 } else {
2747 info.stats.blocked_by[pos++] = *p;
2748 --keep;
2749 }
2750 }
2751 }
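// For example (hypothetical numbers): with blocked_by = {1,2,3,4,5} and
// osd_max_pg_blocked_by = 3, keep = 3 and skip = 2. Each entry is skipped with
// probability skip/(skip+keep) over the remaining elements, which selects a
// uniformly random 3-element subset while preserving the sorted order of the
// survivors.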
2752
2753 void PG::publish_stats_to_osd()
2754 {
2755 if (!is_primary())
2756 return;
2757
2758 pg_stats_publish_lock.Lock();
2759
2760 if (info.stats.stats.sum.num_scrub_errors)
2761 state_set(PG_STATE_INCONSISTENT);
2762 else
2763 state_clear(PG_STATE_INCONSISTENT);
2764
2765 utime_t now = ceph_clock_now();
2766 if (info.stats.state != state) {
2767 info.stats.last_change = now;
2768 // Optimistic estimation: if we just found out the PG is inactive,
2769 // assume it was active until now.
2770 if (!(state & PG_STATE_ACTIVE) &&
2771 (info.stats.state & PG_STATE_ACTIVE))
2772 info.stats.last_active = now;
2773
2774 if ((state & PG_STATE_ACTIVE) &&
2775 !(info.stats.state & PG_STATE_ACTIVE))
2776 info.stats.last_became_active = now;
2777 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2778 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2779 info.stats.last_became_peered = now;
2780 if (!(state & PG_STATE_CREATING) &&
2781 (info.stats.state & PG_STATE_CREATING)) {
2782 osd->send_pg_created(get_pgid().pgid);
2783 }
2784 info.stats.state = state;
2785 }
2786
2787 _update_calc_stats();
2788 if (info.stats.stats.sum.num_objects_degraded) {
2789 state_set(PG_STATE_DEGRADED);
2790 } else {
2791 state_clear(PG_STATE_DEGRADED);
2792 }
2793 _update_blocked_by();
2794
2795 bool publish = false;
2796 pg_stat_t pre_publish = info.stats;
2797 pre_publish.stats.add(unstable_stats);
2798 utime_t cutoff = now;
2799 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2800 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2801 info.stats.last_fresh > cutoff) {
2802 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2803 << ": no change since " << info.stats.last_fresh << dendl;
2804 } else {
2805 // update our stat summary and timestamps
2806 info.stats.reported_epoch = get_osdmap()->get_epoch();
2807 ++info.stats.reported_seq;
2808
2809 info.stats.last_fresh = now;
2810
2811 if (info.stats.state & PG_STATE_CLEAN)
2812 info.stats.last_clean = now;
2813 if (info.stats.state & PG_STATE_ACTIVE)
2814 info.stats.last_active = now;
2815 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2816 info.stats.last_peered = now;
2817 info.stats.last_unstale = now;
2818 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2819 info.stats.last_undegraded = now;
2820 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2821 info.stats.last_fullsized = now;
2822
2823 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2824 // care of this by sending MMonMgrReport to mon.
2825 publish =
2826 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2827 pg_stats_publish_valid = true;
2828 pg_stats_publish = pre_publish;
2829
2830 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2831 << ":" << pg_stats_publish.reported_seq << dendl;
2832 }
2833 pg_stats_publish_lock.Unlock();
2834
2835 if (publish)
2836 osd->pg_stat_queue_enqueue(this);
2837 }
2838
2839 void PG::clear_publish_stats()
2840 {
2841 dout(15) << "clear_stats" << dendl;
2842 pg_stats_publish_lock.Lock();
2843 pg_stats_publish_valid = false;
2844 pg_stats_publish_lock.Unlock();
2845
2846 osd->pg_stat_queue_dequeue(this);
2847 }
2848
2849 /**
2850 * initialize a newly instantiated pg
2851 *
2852 * Initialize PG state, as when a PG is initially created, or when it
2853 * is first instantiated on the current node.
2854 *
2855 * @param role our role/rank
2856 * @param newup up set
2857 * @param newacting acting set
2858 * @param history pg history
2859 * @param pi past_intervals
2860 * @param backfill true if info should be marked as backfill
2861 * @param t transaction to write out our new state in
2862 */
2863 void PG::init(
2864 int role,
2865 const vector<int>& newup, int new_up_primary,
2866 const vector<int>& newacting, int new_acting_primary,
2867 const pg_history_t& history,
2868 const PastIntervals& pi,
2869 bool backfill,
2870 ObjectStore::Transaction *t)
2871 {
2872 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2873 << " history " << history
2874 << " past_intervals " << pi
2875 << dendl;
2876
2877 set_role(role);
2878 acting = newacting;
2879 up = newup;
2880 init_primary_up_acting(
2881 newup,
2882 newacting,
2883 new_up_primary,
2884 new_acting_primary);
2885
2886 info.history = history;
2887 past_intervals = pi;
2888
2889 info.stats.up = up;
2890 info.stats.up_primary = new_up_primary;
2891 info.stats.acting = acting;
2892 info.stats.acting_primary = new_acting_primary;
2893 info.stats.mapping_epoch = info.history.same_interval_since;
2894
2895 if (backfill) {
2896 dout(10) << __func__ << ": Setting backfill" << dendl;
2897 info.set_last_backfill(hobject_t());
2898 info.last_complete = info.last_update;
2899 pg_log.mark_log_for_rewrite();
2900 }
2901
2902 on_new_interval();
2903
2904 dirty_info = true;
2905 dirty_big_info = true;
2906 write_if_dirty(*t);
2907 }
2908
2909 #pragma GCC diagnostic ignored "-Wpragmas"
2910 #pragma GCC diagnostic push
2911 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2912
2913 void PG::upgrade(ObjectStore *store)
2914 {
2915 assert(info_struct_v <= 10);
2916 ObjectStore::Transaction t;
2917
2918 assert(info_struct_v >= 7);
2919
2920 // 7 -> 8
2921 if (info_struct_v <= 7) {
2922 pg_log.mark_log_for_rewrite();
2923 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2924 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2925 t.remove(coll_t::meta(), log_oid);
2926 t.remove(coll_t::meta(), biginfo_oid);
2927 t.touch(coll, pgmeta_oid);
2928 }
2929
2930 // 8 -> 9
2931 if (info_struct_v <= 8) {
2932 // no special action needed.
2933 }
2934
2935 // 9 -> 10
2936 if (info_struct_v <= 9) {
2937 // previous versions weren't (as) aggressively clearing past_intervals
2938 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2939 dout(20) << __func__ << " clearing past_intervals" << dendl;
2940 past_intervals.clear();
2941 }
2942 }
2943
2944 // update infover_key
2945 if (info_struct_v < cur_struct_v) {
2946 map<string,bufferlist> v;
2947 __u8 ver = cur_struct_v;
2948 ::encode(ver, v[infover_key]);
2949 t.omap_setkeys(coll, pgmeta_oid, v);
2950 }
2951
2952 dirty_info = true;
2953 dirty_big_info = true;
2954 write_if_dirty(t);
2955
2956 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2957 ObjectStore::Sequencer>("upgrade"));
2958 int r = store->apply_transaction(osr.get(), std::move(t));
2959 if (r != 0) {
2960 derr << __func__ << ": apply_transaction returned "
2961 << cpp_strerror(r) << dendl;
2962 ceph_abort();
2963 }
2964 assert(r == 0);
2965
2966 C_SaferCond waiter;
2967 if (!osr->flush_commit(&waiter)) {
2968 waiter.wait();
2969 }
2970 }
2971
2972 #pragma GCC diagnostic pop
2973 #pragma GCC diagnostic warning "-Wpragmas"
2974
2975 int PG::_prepare_write_info(CephContext* cct,
2976 map<string,bufferlist> *km,
2977 epoch_t epoch,
2978 pg_info_t &info, pg_info_t &last_written_info,
2979 PastIntervals &past_intervals,
2980 bool dirty_big_info,
2981 bool dirty_epoch,
2982 bool try_fast_info,
2983 PerfCounters *logger)
2984 {
2985 if (dirty_epoch) {
2986 ::encode(epoch, (*km)[epoch_key]);
2987 }
2988
2989 if (logger)
2990 logger->inc(l_osd_pg_info);
2991
2992 // try to do info efficiently?
2993 if (!dirty_big_info && try_fast_info &&
2994 info.last_update > last_written_info.last_update) {
2995 pg_fast_info_t fast;
2996 fast.populate_from(info);
2997 bool did = fast.try_apply_to(&last_written_info);
2998 assert(did); // we verified last_update increased above
2999 if (info == last_written_info) {
3000 ::encode(fast, (*km)[fastinfo_key]);
3001 if (logger)
3002 logger->inc(l_osd_pg_fastinfo);
3003 return 0;
3004 }
3005 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3006 {
3007 JSONFormatter jf(true);
3008 jf.dump_object("info", info);
3009 jf.flush(*_dout);
3010 }
3011 {
3012 *_dout << "\nlast_written_info:\n";
3013 JSONFormatter jf(true);
3014 jf.dump_object("last_written_info", last_written_info);
3015 jf.flush(*_dout);
3016 }
3017 *_dout << dendl;
3018 }
3019 last_written_info = info;
3020
3021 // info. store purged_snaps separately.
3022 interval_set<snapid_t> purged_snaps;
3023 purged_snaps.swap(info.purged_snaps);
3024 ::encode(info, (*km)[info_key]);
3025 purged_snaps.swap(info.purged_snaps);
3026
3027 if (dirty_big_info) {
3028 // potentially big stuff
3029 bufferlist& bigbl = (*km)[biginfo_key];
3030 ::encode(past_intervals, bigbl);
3031 ::encode(info.purged_snaps, bigbl);
3032 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3033 if (logger)
3034 logger->inc(l_osd_pg_biginfo);
3035 }
3036
3037 return 0;
3038 }
3039
3040 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3041 {
3042 coll_t coll(pgid);
3043 t.create_collection(coll, bits);
3044 }
3045
3046 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3047 {
3048 coll_t coll(pgid);
3049
3050 if (pool) {
3051 // Give a hint to the PG collection
3052 bufferlist hint;
3053 uint32_t pg_num = pool->get_pg_num();
3054 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3055 ::encode(pg_num, hint);
3056 ::encode(expected_num_objects_pg, hint);
3057 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3058 t.collection_hint(coll, hint_type, hint);
3059 }
3060
3061 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3062 t.touch(coll, pgmeta_oid);
3063 map<string,bufferlist> values;
3064 __u8 struct_v = cur_struct_v;
3065 ::encode(struct_v, values[infover_key]);
3066 t.omap_setkeys(coll, pgmeta_oid, values);
3067 }
3068
3069 void PG::prepare_write_info(map<string,bufferlist> *km)
3070 {
3071 info.stats.stats.add(unstable_stats);
3072 unstable_stats.clear();
3073
3074 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3075 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3076 info,
3077 last_written_info,
3078 past_intervals,
3079 dirty_big_info, need_update_epoch,
3080 cct->_conf->osd_fast_info,
3081 osd->logger);
3082 assert(ret == 0);
3083 if (need_update_epoch)
3084 last_epoch = get_osdmap()->get_epoch();
3085 last_persisted_osdmap_ref = osdmap_ref;
3086
3087 dirty_info = false;
3088 dirty_big_info = false;
3089 }
3090
3091 #pragma GCC diagnostic ignored "-Wpragmas"
3092 #pragma GCC diagnostic push
3093 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3094
3095 bool PG::_has_removal_flag(ObjectStore *store,
3096 spg_t pgid)
3097 {
3098 coll_t coll(pgid);
3099 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3100
3101 // first try new way
3102 set<string> keys;
3103 keys.insert("_remove");
3104 map<string,bufferlist> values;
3105 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3106 values.size() == 1)
3107 return true;
3108
3109 return false;
3110 }
3111
3112 int PG::peek_map_epoch(ObjectStore *store,
3113 spg_t pgid,
3114 epoch_t *pepoch,
3115 bufferlist *bl)
3116 {
3117 coll_t coll(pgid);
3118 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3119 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3120 epoch_t cur_epoch = 0;
3121
3122 assert(bl);
3123 {
3124 // validate collection name
3125 assert(coll.is_pg());
3126 }
3127
3128 // try for v8
3129 set<string> keys;
3130 keys.insert(infover_key);
3131 keys.insert(epoch_key);
3132 map<string,bufferlist> values;
3133 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3134 if (r == 0) {
3135 assert(values.size() == 2);
3136
3137 // sanity check version
3138 bufferlist::iterator bp = values[infover_key].begin();
3139 __u8 struct_v = 0;
3140 ::decode(struct_v, bp);
3141 assert(struct_v >= 8);
3142
3143 // get epoch
3144 bp = values[epoch_key].begin();
3145 ::decode(cur_epoch, bp);
3146 } else {
3147 // probably bug 10617; see OSD::load_pgs()
3148 return -1;
3149 }
3150
3151 *pepoch = cur_epoch;
3152 return 0;
3153 }
3154
3155 #pragma GCC diagnostic pop
3156 #pragma GCC diagnostic warning "-Wpragmas"
3157
3158 void PG::write_if_dirty(ObjectStore::Transaction& t)
3159 {
3160 map<string,bufferlist> km;
3161 if (dirty_big_info || dirty_info)
3162 prepare_write_info(&km);
3163 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3164 if (!km.empty())
3165 t.omap_setkeys(coll, pgmeta_oid, km);
3166 }
3167
3168 void PG::trim_log()
3169 {
3170 assert(is_primary());
3171 calc_trim_to();
3172 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3173 if (pg_trim_to != eversion_t()) {
3174 // inform peers to trim log
3175 assert(!actingbackfill.empty());
3176 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3177 i != actingbackfill.end();
3178 ++i) {
3179 if (*i == pg_whoami) continue;
3180 osd->send_message_osd_cluster(
3181 i->osd,
3182 new MOSDPGTrim(
3183 get_osdmap()->get_epoch(),
3184 spg_t(info.pgid.pgid, i->shard),
3185 pg_trim_to),
3186 get_osdmap()->get_epoch());
3187 }
3188
3189 // trim primary as well
3190 pg_log.trim(pg_trim_to, info);
3191 dirty_info = true;
3192 }
3193 }
3194
3195 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3196 {
3197 // raise last_complete only if we were previously up to date
3198 if (info.last_complete == info.last_update)
3199 info.last_complete = e.version;
3200
3201 // raise last_update.
3202 assert(e.version > info.last_update);
3203 info.last_update = e.version;
3204
3205 // raise user_version, if it increased (it may not have been bumped
3206 // by all logged updates)
3207 if (e.user_version > info.last_user_version)
3208 info.last_user_version = e.user_version;
3209
3210 // log mutation
3211 pg_log.add(e, applied);
3212 dout(10) << "add_log_entry " << e << dendl;
3213 }
3214
3215
3216 void PG::append_log(
3217 const vector<pg_log_entry_t>& logv,
3218 eversion_t trim_to,
3219 eversion_t roll_forward_to,
3220 ObjectStore::Transaction &t,
3221 bool transaction_applied)
3222 {
3223 if (transaction_applied)
3224 update_snap_map(logv, t);
3225
3226 /* The primary has sent an info updating the history, but it may not
3227 * have arrived yet. We want to make sure that we cannot remember this
3228 * write without remembering that it happened in an interval which went
3229 * active in epoch history.last_epoch_started.
3230 */
3231 if (info.last_epoch_started != info.history.last_epoch_started) {
3232 info.history.last_epoch_started = info.last_epoch_started;
3233 }
3234 if (info.last_interval_started != info.history.last_interval_started) {
3235 info.history.last_interval_started = info.last_interval_started;
3236 }
3237 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3238
3239 PGLogEntryHandler handler{this, &t};
3240 if (!transaction_applied) {
3241 /* We must be a backfill peer, so it's ok if we apply
3242 * out-of-turn since we won't be considered when
3243 * determining a min possible last_update.
3244 */
3245 pg_log.roll_forward(&handler);
3246 }
3247
3248 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3249 p != logv.end();
3250 ++p) {
3251 add_log_entry(*p, transaction_applied);
3252
3253 /* We don't want to leave the rollforward artifacts around
3254 * here past last_backfill. It's ok for the same reason as
3255 * above */
3256 if (transaction_applied &&
3257 p->soid > info.last_backfill) {
3258 pg_log.roll_forward(&handler);
3259 }
3260 }
3261 auto last = logv.rbegin();
3262 if (is_primary() && last != logv.rend()) {
3263 projected_log.skip_can_rollback_to_to_head();
3264 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3265 }
3266
3267 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3268 pg_log.roll_forward_to(
3269 roll_forward_to,
3270 &handler);
3271 t.register_on_applied(
3272 new C_UpdateLastRollbackInfoTrimmedToApplied(
3273 this,
3274 get_osdmap()->get_epoch(),
3275 roll_forward_to));
3276 }
3277
3278 pg_log.trim(trim_to, info);
3279
3280 // update the local pg, pg log
3281 dirty_info = true;
3282 write_if_dirty(t);
3283 }
3284
3285 bool PG::check_log_for_corruption(ObjectStore *store)
3286 {
3287 /// TODO: this method needs to work with the omap log
3288 return true;
3289 }
3290
3291 //! Get the name we're going to save our corrupt pg log as
3292 std::string PG::get_corrupt_pg_log_name() const
3293 {
3294 const int MAX_BUF = 512;
3295 char buf[MAX_BUF];
3296 struct tm tm_buf;
3297 time_t my_time(time(NULL));
3298 const struct tm *t = localtime_r(&my_time, &tm_buf);
3299 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3300 if (ret == 0) {
3301 dout(0) << "strftime failed" << dendl;
3302 return "corrupt_log_unknown_time";
3303 }
3304 string out(buf);
3305 out += stringify(info.pgid);
3306 return out;
3307 }
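// For example, with a hypothetical pg 1.2a this yields a name of the form
// "corrupt_log_2018-07-17_14:05_1.2a"; on a strftime() failure the fixed
// fallback "corrupt_log_unknown_time" is returned instead.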
3308
3309 int PG::read_info(
3310 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3311 pg_info_t &info, PastIntervals &past_intervals,
3312 __u8 &struct_v)
3313 {
3314 // try for v8 or later
3315 set<string> keys;
3316 keys.insert(infover_key);
3317 keys.insert(info_key);
3318 keys.insert(biginfo_key);
3319 keys.insert(fastinfo_key);
3320 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3321 map<string,bufferlist> values;
3322 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3323 if (r == 0) {
3324 assert(values.size() == 3 ||
3325 values.size() == 4);
3326
3327 bufferlist::iterator p = values[infover_key].begin();
3328 ::decode(struct_v, p);
3329 assert(struct_v >= 8);
3330
3331 p = values[info_key].begin();
3332 ::decode(info, p);
3333
3334 p = values[biginfo_key].begin();
3335 if (struct_v >= 10) {
3336 ::decode(past_intervals, p);
3337 } else {
3338 past_intervals.decode_classic(p);
3339 }
3340 ::decode(info.purged_snaps, p);
3341
3342 p = values[fastinfo_key].begin();
3343 if (!p.end()) {
3344 pg_fast_info_t fast;
3345 ::decode(fast, p);
3346 fast.try_apply_to(&info);
3347 }
3348 return 0;
3349 }
3350
3351 // legacy (ver < 8)
3352 ghobject_t infos_oid(OSD::make_infos_oid());
3353 bufferlist::iterator p = bl.begin();
3354 ::decode(struct_v, p);
3355 assert(struct_v == 7);
3356
3357 // get info out of leveldb
3358 string k = get_info_key(info.pgid);
3359 string bk = get_biginfo_key(info.pgid);
3360 keys.clear();
3361 keys.insert(k);
3362 keys.insert(bk);
3363 values.clear();
3364 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3365 assert(values.size() == 2);
3366
3367 p = values[k].begin();
3368 ::decode(info, p);
3369
3370 p = values[bk].begin();
3371 ::decode(past_intervals, p);
3372 interval_set<snapid_t> snap_collections; // obsolete
3373 ::decode(snap_collections, p);
3374 ::decode(info.purged_snaps, p);
3375 return 0;
3376 }
3377
3378 void PG::read_state(ObjectStore *store, bufferlist &bl)
3379 {
3380 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3381 info_struct_v);
3382 assert(r >= 0);
3383
3384 last_written_info = info;
3385
3386 // if we are upgrading from jewel, we need to force rebuild of
3387 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3388 // (before kraken). persisted missing set was circa
3389 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3390 // v8 was pre-jewel (per-pg meta object).
3391 bool force_rebuild_missing = info_struct_v < 9;
3392 if (force_rebuild_missing) {
3393 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3394 << dendl;
3395 }
3396
3397 ostringstream oss;
3398 pg_log.read_log_and_missing(
3399 store,
3400 coll,
3401 info_struct_v < 8 ? coll_t::meta() : coll,
3402 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3403 info,
3404 force_rebuild_missing,
3405 oss,
3406 cct->_conf->osd_ignore_stale_divergent_priors,
3407 cct->_conf->osd_debug_verify_missing_on_start);
3408 if (oss.tellp())
3409 osd->clog->error() << oss.str();
3410
3411 if (force_rebuild_missing) {
3412 dout(10) << __func__ << " forced rebuild of missing got "
3413 << pg_log.get_missing()
3414 << dendl;
3415 }
3416
3417 // log any weirdness
3418 log_weirdness();
3419 }
3420
3421 void PG::log_weirdness()
3422 {
3423 if (pg_log.get_tail() != info.log_tail)
3424 osd->clog->error() << info.pgid
3425 << " info mismatch, log.tail " << pg_log.get_tail()
3426 << " != info.log_tail " << info.log_tail;
3427 if (pg_log.get_head() != info.last_update)
3428 osd->clog->error() << info.pgid
3429 << " info mismatch, log.head " << pg_log.get_head()
3430 << " != info.last_update " << info.last_update;
3431
3432 if (!pg_log.get_log().empty()) {
3433 // sloppy check
3434 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3435 osd->clog->error() << info.pgid
3436 << " log bound mismatch, info (tail,head] ("
3437 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3438 << " actual ["
3439 << pg_log.get_log().log.begin()->version << ","
3440 << pg_log.get_log().log.rbegin()->version << "]";
3441 }
3442
3443 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3444 osd->clog->error() << info.pgid
3445 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3446 << " > log size " << pg_log.get_log().log.size();
3447 }
3448 }
3449
3450 void PG::update_snap_map(
3451 const vector<pg_log_entry_t> &log_entries,
3452 ObjectStore::Transaction &t)
3453 {
3454 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3455 i != log_entries.end();
3456 ++i) {
3457 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3458 if (i->soid.snap < CEPH_MAXSNAP) {
3459 if (i->is_delete()) {
3460 int r = snap_mapper.remove_oid(
3461 i->soid,
3462 &_t);
3463 assert(r == 0);
3464 } else if (i->is_update()) {
3465 assert(i->snaps.length() > 0);
3466 vector<snapid_t> snaps;
3467 bufferlist snapbl = i->snaps;
3468 bufferlist::iterator p = snapbl.begin();
3469 try {
3470 ::decode(snaps, p);
3471 } catch (...) {
3472 derr << __func__ << " decode snaps failure on " << *i << dendl;
3473 snaps.clear();
3474 }
3475 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3476
3477 if (i->is_clone() || i->is_promote()) {
3478 snap_mapper.add_oid(
3479 i->soid,
3480 _snaps,
3481 &_t);
3482 } else if (i->is_modify()) {
3483 assert(i->is_modify());
3484 int r = snap_mapper.update_snaps(
3485 i->soid,
3486 _snaps,
3487 0,
3488 &_t);
3489 assert(r == 0);
3490 } else {
3491 assert(i->is_clean());
3492 }
3493 }
3494 }
3495 }
3496 }
3497
3498 /**
3499 * filter trimming|trimmed snaps out of snapcontext
3500 */
3501 void PG::filter_snapc(vector<snapid_t> &snaps)
3502 {
3503 // nothing needs trimming; we can return immediately
3504 if(snap_trimq.empty() && info.purged_snaps.empty())
3505 return;
3506
3507 bool filtering = false;
3508 vector<snapid_t> newsnaps;
3509 for (vector<snapid_t>::iterator p = snaps.begin();
3510 p != snaps.end();
3511 ++p) {
3512 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3513 if (!filtering) {
3514 // start building a new vector with what we've seen so far
3515 dout(10) << "filter_snapc filtering " << snaps << dendl;
3516 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3517 filtering = true;
3518 }
3519 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3520 } else {
3521 if (filtering)
3522 newsnaps.push_back(*p); // continue building new vector
3523 }
3524 }
3525 if (filtering) {
3526 snaps.swap(newsnaps);
3527 dout(10) << "filter_snapc result " << snaps << dendl;
3528 }
3529 }
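// A minimal sketch of the behaviour above, with hypothetical values: given
// snaps = [8, 5, 3] and snap 5 present in snap_trimq (or in
// info.purged_snaps), the vector is rewritten in place to [8, 3]; if nothing
// matches, no new vector is built and snaps is left untouched.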
3530
3531 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3532 {
3533 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3534 it != m.end();
3535 ++it)
3536 requeue_ops(it->second);
3537 m.clear();
3538 }
3539
3540 void PG::requeue_op(OpRequestRef op)
3541 {
3542 auto p = waiting_for_map.find(op->get_source());
3543 if (p != waiting_for_map.end()) {
3544 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3545 << dendl;
3546 p->second.push_front(op);
3547 } else {
3548 dout(20) << __func__ << " " << op << dendl;
3549 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3550 }
3551 }
3552
3553 void PG::requeue_ops(list<OpRequestRef> &ls)
3554 {
3555 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3556 i != ls.rend();
3557 ++i) {
3558 auto p = waiting_for_map.find((*i)->get_source());
3559 if (p != waiting_for_map.end()) {
3560 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3561 << ")" << dendl;
3562 p->second.push_front(*i);
3563 } else {
3564 dout(20) << __func__ << " " << *i << dendl;
3565 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3566 }
3567 }
3568 ls.clear();
3569 }
3570
3571 void PG::requeue_map_waiters()
3572 {
3573 epoch_t epoch = get_osdmap()->get_epoch();
3574 auto p = waiting_for_map.begin();
3575 while (p != waiting_for_map.end()) {
3576 if (epoch < p->second.front()->min_epoch) {
3577 dout(20) << __func__ << " " << p->first << " front op "
3578 << p->second.front() << " must still wait, doing nothing"
3579 << dendl;
3580 ++p;
3581 } else {
3582 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3583 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3584 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3585 }
3586 p = waiting_for_map.erase(p);
3587 }
3588 }
3589 }
3590
3591
3592 // ==========================================================================================
3593 // SCRUB
3594
3595 /*
3596 * when holding the PG lock and sched_scrub_lock, the states are:
3597 * scheduling:
3598 * scrubber.reserved = true
3599 * scrubber.reserved_peers includes whoami
3600 * osd->scrub_pending++
3601 * scheduling, replica declined:
3602 * scrubber.reserved = true
3603 * scrubber.reserved_peers includes -1
3604 * osd->scrub_pending++
3605 * pending:
3606 * scrubber.reserved = true
3607 * scrubber.reserved_peers.size() == acting.size();
3608 * pg on scrub_wq
3609 * osd->scrub_pending++
3610 * scrubbing:
3611 * scrubber.reserved = false;
3612 * scrubber.reserved_peers empty
3613 * osd->scrubber.active++
3614 */
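// Typical flow (sketch): the primary reserves locally via
// osd->inc_scrubs_pending(), sends a scrub reserve REQUEST to every other
// member of actingbackfill (MOSDScrubReserve, or the jewel-compat
// MOSDSubOp), and queues the scrub once reserved_peers covers all of
// acting; a single rejection sets scrubber.reserve_failed and sched_scrub()
// tears the reservation back down.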
3615
3616 // returns true if a scrub has been newly kicked off
3617 bool PG::sched_scrub()
3618 {
3619 bool nodeep_scrub = false;
3620 assert(is_locked());
3621 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3622 return false;
3623 }
3624
3625 double deep_scrub_interval = 0;
3626 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3627 if (deep_scrub_interval <= 0) {
3628 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3629 }
3630 bool time_for_deep = ceph_clock_now() >=
3631 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3632
3633 bool deep_coin_flip = false;
3634 // Only add random deep scrubs when NOT user initiated scrub
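// (For example, with osd_deep_scrub_randomize_ratio = 0.15 -- an
// illustrative value, not necessarily the configured one -- roughly 15%
// of regularly scheduled scrubs are promoted to deep scrubs.)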
3635 if (!scrubber.must_scrub)
3636 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3637 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3638
3639 time_for_deep = (time_for_deep || deep_coin_flip);
3640
3641 // NODEEP_SCRUB is set, so ignore time-initiated deep scrubs
3642 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3643 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3644 time_for_deep = false;
3645 nodeep_scrub = true;
3646 }
3647
3648 if (!scrubber.must_scrub) {
3649 assert(!scrubber.must_deep_scrub);
3650
3651 // NOSCRUB so skip regular scrubs
3652 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3653 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3654 if (scrubber.reserved) {
3655 // cancel this scrub while it is still being scheduled,
3656 // so that pgs from other pools where scrubbing is still allowed
3657 // have a chance to go ahead with their scrubs.
3658 clear_scrub_reserved();
3659 scrub_unreserve_replicas();
3660 }
3661 return false;
3662 }
3663 }
3664
3665 if (cct->_conf->osd_scrub_auto_repair
3666 && get_pgbackend()->auto_repair_supported()
3667 && time_for_deep
3668 // respect the command from user, and not do auto-repair
3669 && !scrubber.must_repair
3670 && !scrubber.must_scrub
3671 && !scrubber.must_deep_scrub) {
3672 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3673 scrubber.auto_repair = true;
3674 } else {
3675 // this happens when the user issues a scrub/repair command while
3676 // the scrub/repair is still being scheduled (e.g. while requesting reservations)
3677 scrubber.auto_repair = false;
3678 }
3679
3680 bool ret = true;
3681 if (!scrubber.reserved) {
3682 assert(scrubber.reserved_peers.empty());
3683 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3684 osd->inc_scrubs_pending()) {
3685 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3686 scrubber.reserved = true;
3687 scrubber.reserved_peers.insert(pg_whoami);
3688 scrub_reserve_replicas();
3689 } else {
3690 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3691 ret = false;
3692 }
3693 }
3694 if (scrubber.reserved) {
3695 if (scrubber.reserve_failed) {
3696 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3697 clear_scrub_reserved();
3698 scrub_unreserve_replicas();
3699 ret = false;
3700 } else if (scrubber.reserved_peers.size() == acting.size()) {
3701 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3702 if (time_for_deep) {
3703 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3704 state_set(PG_STATE_DEEP_SCRUB);
3705 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3706 if (!nodeep_scrub) {
3707 osd->clog->info() << "osd." << osd->whoami
3708 << " pg " << info.pgid
3709 << " Deep scrub errors, upgrading scrub to deep-scrub";
3710 state_set(PG_STATE_DEEP_SCRUB);
3711 } else if (!scrubber.must_scrub) {
3712 osd->clog->error() << "osd." << osd->whoami
3713 << " pg " << info.pgid
3714 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3715 clear_scrub_reserved();
3716 scrub_unreserve_replicas();
3717 return false;
3718 } else {
3719 osd->clog->error() << "osd." << osd->whoami
3720 << " pg " << info.pgid
3721 << " Regular scrub request, deep-scrub details will be lost";
3722 }
3723 }
3724 queue_scrub();
3725 } else {
3726 // none declined so far (reserve_failed is not set); keep waiting for grants
3727 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3728 }
3729 }
3730
3731 return ret;
3732 }
3733
3734 void PG::reg_next_scrub()
3735 {
3736 if (!is_primary())
3737 return;
3738
3739 utime_t reg_stamp;
3740 if (scrubber.must_scrub ||
3741 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3742 reg_stamp = ceph_clock_now();
3743 } else {
3744 reg_stamp = info.history.last_scrub_stamp;
3745 }
3746 // note down the sched_time, so we can locate this scrub, and remove it
3747 // later on.
3748 double scrub_min_interval = 0, scrub_max_interval = 0;
3749 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3750 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3751 assert(scrubber.scrub_reg_stamp == utime_t());
3752 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3753 reg_stamp,
3754 scrub_min_interval,
3755 scrub_max_interval,
3756 scrubber.must_scrub);
3757 }
3758
3759 void PG::unreg_next_scrub()
3760 {
3761 if (is_primary()) {
3762 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3763 scrubber.scrub_reg_stamp = utime_t();
3764 }
3765 }
3766
3767 void PG::do_replica_scrub_map(OpRequestRef op)
3768 {
3769 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3770 dout(7) << __func__ << " " << *m << dendl;
3771 if (m->map_epoch < info.history.same_interval_since) {
3772 dout(10) << __func__ << " discarding old from "
3773 << m->map_epoch << " < " << info.history.same_interval_since
3774 << dendl;
3775 return;
3776 }
3777 if (!scrubber.is_chunky_scrub_active()) {
3778 dout(10) << __func__ << " scrub isn't active" << dendl;
3779 return;
3780 }
3781
3782 op->mark_started();
3783
3784 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3785 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3786 dout(10) << "map version is "
3787 << scrubber.received_maps[m->from].valid_through
3788 << dendl;
3789
3790 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
3791 << dendl;
3792 assert(scrubber.waiting_on_whom.count(m->from));
3793 scrubber.waiting_on_whom.erase(m->from);
3794 if (m->preempted) {
3795 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
3796 scrub_preempted = true;
3797 }
3798 if (scrubber.waiting_on_whom.empty()) {
3799 if (ops_blocked_by_scrub()) {
3800 requeue_scrub(true);
3801 } else {
3802 requeue_scrub(false);
3803 }
3804 }
3805 }
3806
3807 void PG::sub_op_scrub_map(OpRequestRef op)
3808 {
3809 // for legacy jewel compatibility only
3810 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3811 assert(m->get_type() == MSG_OSD_SUBOP);
3812 dout(7) << "sub_op_scrub_map" << dendl;
3813
3814 if (m->map_epoch < info.history.same_interval_since) {
3815 dout(10) << "sub_op_scrub discarding old sub_op from "
3816 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3817 return;
3818 }
3819
3820 if (!scrubber.is_chunky_scrub_active()) {
3821 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3822 return;
3823 }
3824
3825 op->mark_started();
3826
3827 dout(10) << " got " << m->from << " scrub map" << dendl;
3828 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3829
3830 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3831 dout(10) << "map version is "
3832 << scrubber.received_maps[m->from].valid_through
3833 << dendl;
3834
3835 scrubber.waiting_on_whom.erase(m->from);
3836
3837 if (scrubber.waiting_on_whom.empty()) {
3838 if (ops_blocked_by_scrub()) {
3839 requeue_scrub(true);
3840 } else {
3841 requeue_scrub(false);
3842 }
3843 }
3844 }
3845
3846 // send scrub v3 messages (chunky scrub)
3847 void PG::_request_scrub_map(
3848 pg_shard_t replica, eversion_t version,
3849 hobject_t start, hobject_t end,
3850 bool deep,
3851 bool allow_preemption)
3852 {
3853 assert(replica != pg_whoami);
3854 dout(10) << "scrub requesting scrubmap from osd." << replica
3855 << " deep " << (int)deep << dendl;
3856 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3857 spg_t(info.pgid.pgid, replica.shard), version,
3858 get_osdmap()->get_epoch(),
3859 get_last_peering_reset(),
3860 start, end, deep,
3861 allow_preemption,
3862 scrubber.priority,
3863 ops_blocked_by_scrub());
3864 // default priority, we want the rep scrub processed prior to any recovery
3865 // or client io messages (we are holding a lock!)
3866 osd->send_message_osd_cluster(
3867 replica.osd, repscrubop, get_osdmap()->get_epoch());
3868 }
3869
3870 void PG::handle_scrub_reserve_request(OpRequestRef op)
3871 {
3872 dout(7) << __func__ << " " << *op->get_req() << dendl;
3873 op->mark_started();
3874 if (scrubber.reserved) {
3875 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3876 << dendl;
3877 return;
3878 }
3879 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3880 osd->inc_scrubs_pending()) {
3881 scrubber.reserved = true;
3882 } else {
3883 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
3884 scrubber.reserved = false;
3885 }
3886 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3887 const MOSDScrubReserve *m =
3888 static_cast<const MOSDScrubReserve*>(op->get_req());
3889 Message *reply = new MOSDScrubReserve(
3890 spg_t(info.pgid.pgid, primary.shard),
3891 m->map_epoch,
3892 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3893 pg_whoami);
3894 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3895 } else {
3896 // for jewel compat only
3897 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3898 assert(req->get_type() == MSG_OSD_SUBOP);
3899 MOSDSubOpReply *reply = new MOSDSubOpReply(
3900 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3901 ::encode(scrubber.reserved, reply->get_data());
3902 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3903 }
3904 }
3905
3906 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3907 {
3908 dout(7) << __func__ << " " << *op->get_req() << dendl;
3909 op->mark_started();
3910 if (!scrubber.reserved) {
3911 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3912 return;
3913 }
3914 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3915 dout(10) << " already had osd." << from << " reserved" << dendl;
3916 } else {
3917 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3918 scrubber.reserved_peers.insert(from);
3919 sched_scrub();
3920 }
3921 }
3922
3923 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3924 {
3925 dout(7) << __func__ << " " << *op->get_req() << dendl;
3926 op->mark_started();
3927 if (!scrubber.reserved) {
3928 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3929 return;
3930 }
3931 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3932 dout(10) << " already had osd." << from << " reserved" << dendl;
3933 } else {
3934 /* One decline stops this pg from being scheduled for scrubbing. */
3935 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3936 scrubber.reserve_failed = true;
3937 sched_scrub();
3938 }
3939 }
3940
3941 void PG::handle_scrub_reserve_release(OpRequestRef op)
3942 {
3943 dout(7) << __func__ << " " << *op->get_req() << dendl;
3944 op->mark_started();
3945 clear_scrub_reserved();
3946 }
3947
3948 void PG::reject_reservation()
3949 {
3950 osd->send_message_osd_cluster(
3951 primary.osd,
3952 new MBackfillReserve(
3953 MBackfillReserve::REJECT,
3954 spg_t(info.pgid.pgid, primary.shard),
3955 get_osdmap()->get_epoch()),
3956 get_osdmap()->get_epoch());
3957 }
3958
3959 void PG::schedule_backfill_retry(float delay)
3960 {
3961 Mutex::Locker lock(osd->recovery_request_lock);
3962 osd->recovery_request_timer.add_event_after(
3963 delay,
3964 new QueuePeeringEvt<RequestBackfill>(
3965 this, get_osdmap()->get_epoch(),
3966 RequestBackfill()));
3967 }
3968
3969 void PG::schedule_recovery_retry(float delay)
3970 {
3971 Mutex::Locker lock(osd->recovery_request_lock);
3972 osd->recovery_request_timer.add_event_after(
3973 delay,
3974 new QueuePeeringEvt<DoRecovery>(
3975 this, get_osdmap()->get_epoch(),
3976 DoRecovery()));
3977 }
3978
3979 void PG::clear_scrub_reserved()
3980 {
3981 scrubber.reserved_peers.clear();
3982 scrubber.reserve_failed = false;
3983
3984 if (scrubber.reserved) {
3985 scrubber.reserved = false;
3986 osd->dec_scrubs_pending();
3987 }
3988 }
3989
3990 void PG::scrub_reserve_replicas()
3991 {
3992 assert(backfill_targets.empty());
3993 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3994 i != actingbackfill.end();
3995 ++i) {
3996 if (*i == pg_whoami) continue;
3997 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3998 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3999 osd->send_message_osd_cluster(
4000 i->osd,
4001 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4002 get_osdmap()->get_epoch(),
4003 MOSDScrubReserve::REQUEST, pg_whoami),
4004 get_osdmap()->get_epoch());
4005 } else {
4006 // for jewel compat only
4007 vector<OSDOp> scrub(1);
4008 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
4009 hobject_t poid;
4010 eversion_t v;
4011 osd_reqid_t reqid;
4012 MOSDSubOp *subop = new MOSDSubOp(
4013 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4014 get_osdmap()->get_epoch(), osd->get_tid(), v);
4015 subop->ops = scrub;
4016 osd->send_message_osd_cluster(
4017 i->osd, subop, get_osdmap()->get_epoch());
4018 }
4019 }
4020 }
4021
4022 void PG::scrub_unreserve_replicas()
4023 {
4024 assert(backfill_targets.empty());
4025 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4026 i != actingbackfill.end();
4027 ++i) {
4028 if (*i == pg_whoami) continue;
4029 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4030 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4031 osd->send_message_osd_cluster(
4032 i->osd,
4033 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4034 get_osdmap()->get_epoch(),
4035 MOSDScrubReserve::RELEASE, pg_whoami),
4036 get_osdmap()->get_epoch());
4037 } else {
4038 // for jewel compat only
4039 vector<OSDOp> scrub(1);
4040 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
4041 hobject_t poid;
4042 eversion_t v;
4043 osd_reqid_t reqid;
4044 MOSDSubOp *subop = new MOSDSubOp(
4045 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4046 get_osdmap()->get_epoch(), osd->get_tid(), v);
4047 subop->ops = scrub;
4048 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
4049 }
4050 }
4051 }
4052
4053 void PG::_scan_rollback_obs(
4054 const vector<ghobject_t> &rollback_obs,
4055 ThreadPool::TPHandle &handle)
4056 {
4057 ObjectStore::Transaction t;
4058 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4059 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4060 i != rollback_obs.end();
4061 ++i) {
4062 if (i->generation < trimmed_to.version) {
4063 osd->clog->error() << "osd." << osd->whoami
4064 << " pg " << info.pgid
4065 << " found obsolete rollback obj "
4066 << *i << " generation < trimmed_to "
4067 << trimmed_to
4068 << "...repaired";
4069 t.remove(coll, *i);
4070 }
4071 }
4072 if (!t.empty()) {
4073 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4074 << dendl;
4075 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4076 }
4077 }
4078
4079 void PG::_scan_snaps(ScrubMap &smap)
4080 {
4081 hobject_t head;
4082 SnapSet snapset;
4083
4084 // The test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4085 // that the caller used clean_meta_map() and that it works properly.
4086 dout(20) << __func__ << " start" << dendl;
4087
4088 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4089 i != smap.objects.rend();
4090 ++i) {
4091 const hobject_t &hoid = i->first;
4092 ScrubMap::object &o = i->second;
4093
4094 dout(20) << __func__ << " " << hoid << dendl;
4095
4096 if (hoid.is_head() || hoid.is_snapdir()) {
4097 // parse the SnapSet
4098 bufferlist bl;
4099 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4100 continue;
4101 }
4102 bl.push_back(o.attrs[SS_ATTR]);
4103 auto p = bl.begin();
4104 try {
4105 ::decode(snapset, p);
4106 } catch(...) {
4107 continue;
4108 }
4109 head = hoid.get_head();
4110 // Make sure head_exists is correct for is_legacy() check
4111 if (hoid.is_head())
4112 snapset.head_exists = true;
4113 continue;
4114 }
4115 if (hoid.snap < CEPH_MAXSNAP) {
4116 // check and if necessary fix snap_mapper
4117 if (hoid.get_head() != head) {
4118 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4119 << dendl;
4120 continue;
4121 }
4122 set<snapid_t> obj_snaps;
4123 if (!snapset.is_legacy()) {
4124 auto p = snapset.clone_snaps.find(hoid.snap);
4125 if (p == snapset.clone_snaps.end()) {
4126 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4127 << dendl;
4128 continue;
4129 }
4130 obj_snaps.insert(p->second.begin(), p->second.end());
4131 } else {
4132 bufferlist bl;
4133 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4134 continue;
4135 }
4136 bl.push_back(o.attrs[OI_ATTR]);
4137 object_info_t oi;
4138 try {
4139 oi.decode(bl);
4140 } catch(...) {
4141 continue;
4142 }
4143 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4144 }
4145 set<snapid_t> cur_snaps;
4146 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4147 if (r != 0 && r != -ENOENT) {
4148 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4149 ceph_abort();
4150 }
4151 if (r == -ENOENT || cur_snaps != obj_snaps) {
4152 ObjectStore::Transaction t;
4153 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4154 if (r == 0) {
4155 r = snap_mapper.remove_oid(hoid, &_t);
4156 if (r != 0) {
4157 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4158 << dendl;
4159 ceph_abort();
4160 }
4161 osd->clog->error() << "osd." << osd->whoami
4162 << " found snap mapper error on pg "
4163 << info.pgid
4164 << " oid " << hoid << " snaps in mapper: "
4165 << cur_snaps << ", oi: "
4166 << obj_snaps
4167 << "...repaired";
4168 } else {
4169 osd->clog->error() << "osd." << osd->whoami
4170 << " found snap mapper error on pg "
4171 << info.pgid
4172 << " oid " << hoid << " snaps missing in mapper"
4173 << ", should be: "
4174 << obj_snaps
4175 << "...repaired";
4176 }
4177 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4178
4179 // wait for repair to apply to avoid confusing other bits of the system.
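// (The block below waits synchronously: C_SafeCond sets `done` and signals
// `my_cond` from the on_applied_sync callback, and we block on the
// condition variable unless apply_transaction itself returned an error.)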
4180 {
4181 Cond my_cond;
4182 Mutex my_lock("PG::_scan_snaps my_lock");
4183 int r = 0;
4184 bool done;
4185 t.register_on_applied_sync(
4186 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4187 r = osd->store->apply_transaction(osr.get(), std::move(t));
4188 if (r != 0) {
4189 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4190 << dendl;
4191 } else {
4192 my_lock.Lock();
4193 while (!done)
4194 my_cond.Wait(my_lock);
4195 my_lock.Unlock();
4196 }
4197 }
4198 }
4199 }
4200 }
4201 }
4202
4203 void PG::_repair_oinfo_oid(ScrubMap &smap)
4204 {
4205 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4206 i != smap.objects.rend();
4207 ++i) {
4208 const hobject_t &hoid = i->first;
4209 ScrubMap::object &o = i->second;
4210
4211 bufferlist bl;
4212 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4213 continue;
4214 }
4215 bl.push_back(o.attrs[OI_ATTR]);
4216 object_info_t oi;
4217 try {
4218 oi.decode(bl);
4219 } catch(...) {
4220 continue;
4221 }
4222 if (oi.soid != hoid) {
4223 ObjectStore::Transaction t;
4224 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4225 osd->clog->error() << "osd." << osd->whoami
4226 << " found object info error on pg "
4227 << info.pgid
4228 << " oid " << hoid << " oid in object info: "
4229 << oi.soid
4230 << "...repaired";
4231 // Fix object info
4232 oi.soid = hoid;
4233 bl.clear();
4234 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4235
4236 bufferptr bp(bl.c_str(), bl.length());
4237 o.attrs[OI_ATTR] = bp;
4238
4239 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4240 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4241 if (r != 0) {
4242 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4243 << dendl;
4244 }
4245 }
4246 }
4247 }
4248 int PG::build_scrub_map_chunk(
4249 ScrubMap &map,
4250 ScrubMapBuilder &pos,
4251 hobject_t start,
4252 hobject_t end,
4253 bool deep,
4254 ThreadPool::TPHandle &handle)
4255 {
4256 dout(10) << __func__ << " [" << start << "," << end << ") "
4257 << " pos " << pos
4258 << dendl;
4259
4260 // start
4261 while (pos.empty()) {
4262 pos.deep = deep;
4263 map.valid_through = info.last_update;
4264 osr->flush();
4265
4266 // objects
4267 vector<ghobject_t> rollback_obs;
4268 pos.ret = get_pgbackend()->objects_list_range(
4269 start,
4270 end,
4271 0,
4272 &pos.ls,
4273 &rollback_obs);
4274 if (pos.ret < 0) {
4275 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4276 return pos.ret;
4277 }
4278 if (pos.ls.empty()) {
4279 break;
4280 }
4281 _scan_rollback_obs(rollback_obs, handle);
4282 pos.pos = 0;
4283 return -EINPROGRESS;
4284 }
4285
4286 // scan objects
4287 while (!pos.done()) {
4288 int r = get_pgbackend()->be_scan_list(map, pos);
4289 if (r == -EINPROGRESS) {
4290 return r;
4291 }
4292 }
4293
4294 // finish
4295 dout(20) << __func__ << " finishing" << dendl;
4296 assert(pos.done());
4297 _repair_oinfo_oid(map);
4298 if (!is_primary()) {
4299 ScrubMap for_meta_scrub;
4300 // In case we restarted smaller chunk, clear old data
4301 scrubber.cleaned_meta_map.clear_from(scrubber.start);
4302 scrubber.cleaned_meta_map.insert(map);
4303 scrubber.clean_meta_map(for_meta_scrub);
4304 _scan_snaps(for_meta_scrub);
4305 }
4306
4307 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
4308 << dendl;
4309 return 0;
4310 }
4311
4312 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4313 if (!store)
4314 return;
4315 struct OnComplete : Context {
4316 std::unique_ptr<Scrub::Store> store;
4317 OnComplete(
4318 std::unique_ptr<Scrub::Store> &&store)
4319 : store(std::move(store)) {}
4320 void finish(int) override {}
4321 };
4322 store->cleanup(t);
4323 t->register_on_complete(new OnComplete(std::move(store)));
4324 assert(!store);
4325 }
4326
4327 void PG::repair_object(
4328 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4329 pg_shard_t bad_peer)
4330 {
4331 list<pg_shard_t> op_shards;
4332 for (auto i : *ok_peers) {
4333 op_shards.push_back(i.second);
4334 }
4335 dout(10) << "repair_object " << soid << " bad_peer osd."
4336 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4337 ScrubMap::object &po = ok_peers->back().first;
4338 eversion_t v;
4339 bufferlist bv;
4340 bv.push_back(po.attrs[OI_ATTR]);
4341 object_info_t oi;
4342 try {
4343 bufferlist::iterator bliter = bv.begin();
4344 ::decode(oi, bliter);
4345 } catch (...) {
4346 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4347 assert(0);
4348 }
4349 if (bad_peer != primary) {
4350 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4351 } else {
4352 // We should only be scrubbing if the PG is clean.
4353 assert(waiting_for_unreadable_object.empty());
4354
4355 pg_log.missing_add(soid, oi.version, eversion_t());
4356
4357 pg_log.set_last_requested(0);
4358 dout(10) << __func__ << ": primary = " << primary << dendl;
4359 }
4360
4361 if (is_ec_pg() || bad_peer == primary) {
4362 // we'd better collect all shards for an EC pg, and prepare good peers as
4363 // the source of the pull in the case of a replicated pg.
4364 missing_loc.add_missing(soid, oi.version, eversion_t());
4365 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4366 for (i = ok_peers->begin();
4367 i != ok_peers->end();
4368 ++i)
4369 missing_loc.add_location(soid, i->second);
4370 }
4371 }
4372
4373 /* replica_scrub
4374 *
4375 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4376 * for pushes to complete in case of recent recovery. Build a single
4377 * scrubmap of objects that are in the range [msg->start, msg->end).
4378 */
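// Note: the scrub map itself is built later, from chunky_scrub()'s
// BUILD_MAP_REPLICA state; this function only records the request in
// `scrubber` (or stashes the op until applies/pushes catch up) and
// requeues the scrub.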
4379 void PG::replica_scrub(
4380 OpRequestRef op,
4381 ThreadPool::TPHandle &handle)
4382 {
4383 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4384 assert(!scrubber.active_rep_scrub);
4385 dout(7) << "replica_scrub" << dendl;
4386
4387 if (msg->map_epoch < info.history.same_interval_since) {
4388 dout(10) << "replica_scrub discarding old replica_scrub from "
4389 << msg->map_epoch << " < " << info.history.same_interval_since
4390 << dendl;
4391 return;
4392 }
4393
4394 assert(msg->chunky);
4395 if (last_update_applied < msg->scrub_to) {
4396 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4397 scrubber.active_rep_scrub = op;
4398 return;
4399 }
4400
4401 if (active_pushes > 0) {
4402 dout(10) << "waiting for active pushes to finish" << dendl;
4403 scrubber.active_rep_scrub = op;
4404 return;
4405 }
4406
4407 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
4408 scrubber.replica_scrub_start = msg->min_epoch;
4409 scrubber.start = msg->start;
4410 scrubber.end = msg->end;
4411 scrubber.max_end = msg->end;
4412 scrubber.deep = msg->deep;
4413 scrubber.epoch_start = info.history.same_interval_since;
4414 if (msg->priority) {
4415 scrubber.priority = msg->priority;
4416 } else {
4417 scrubber.priority = get_scrub_priority();
4418 }
4419
4420 scrub_can_preempt = msg->allow_preemption;
4421 scrub_preempted = false;
4422 scrubber.replica_scrubmap_pos.reset();
4423
4424 requeue_scrub(msg->high_priority);
4425 }
4426
4427 /* Scrub:
4428 * PG_STATE_SCRUBBING is set when the scrub is queued
4429 *
4430 * scrub will be chunky if all OSDs in PG support chunky scrub
4431 * scrub will fail if OSDs are too old.
4432 */
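// Note on throttling: when osd_scrub_sleep is set, the code below does not
// block the op thread; it registers a FunctionContext with
// osd->scrub_sleep_timer and returns, and that callback re-queues the scrub
// once the sleep interval has elapsed.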
4433 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4434 {
4435 if (cct->_conf->osd_scrub_sleep > 0 &&
4436 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4437 scrubber.state == PG::Scrubber::INACTIVE) &&
4438 scrubber.needs_sleep) {
4439 ceph_assert(!scrubber.sleeping);
4440 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4441
4442 // Do an async sleep so we don't block the op queue
4443 OSDService *osds = osd;
4444 spg_t pgid = get_pgid();
4445 int state = scrubber.state;
4446 auto scrub_requeue_callback =
4447 new FunctionContext([osds, pgid, state](int r) {
4448 PG *pg = osds->osd->lookup_lock_pg(pgid);
4449 if (pg == nullptr) {
4450 lgeneric_dout(osds->osd->cct, 20)
4451 << "scrub_requeue_callback: Could not find "
4452 << "PG " << pgid << " can't complete scrub requeue after sleep"
4453 << dendl;
4454 return;
4455 }
4456 pg->scrubber.sleeping = false;
4457 pg->scrubber.needs_sleep = false;
4458 lgeneric_dout(pg->cct, 20)
4459 << "scrub_requeue_callback: slept for "
4460 << ceph_clock_now() - pg->scrubber.sleep_start
4461 << ", re-queuing scrub with state " << state << dendl;
4462 pg->scrub_queued = false;
4463 pg->requeue_scrub();
4464 pg->scrubber.sleep_start = utime_t();
4465 pg->unlock();
4466 });
4467 Mutex::Locker l(osd->scrub_sleep_lock);
4468 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4469 scrub_requeue_callback);
4470 scrubber.sleeping = true;
4471 scrubber.sleep_start = ceph_clock_now();
4472 return;
4473 }
4474 if (pg_has_reset_since(queued)) {
4475 return;
4476 }
4477 assert(scrub_queued);
4478 scrub_queued = false;
4479 scrubber.needs_sleep = true;
4480
4481 // for the replica
4482 if (!is_primary() &&
4483 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
4484 chunky_scrub(handle);
4485 return;
4486 }
4487
4488 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4489 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4490 state_clear(PG_STATE_SCRUBBING);
4491 state_clear(PG_STATE_REPAIR);
4492 state_clear(PG_STATE_DEEP_SCRUB);
4493 publish_stats_to_osd();
4494 return;
4495 }
4496
4497 if (!scrubber.active) {
4498 assert(backfill_targets.empty());
4499
4500 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4501
4502 dout(10) << "starting a new chunky scrub" << dendl;
4503 }
4504
4505 chunky_scrub(handle);
4506 }
4507
4508 /*
4509 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4510 * chunk.
4511 *
4512 * The object store is partitioned into chunks which end on hash boundaries. For
4513 * each chunk, the following logic is performed:
4514 *
4515 * (1) Block writes on the chunk
4516 * (2) Request maps from replicas
4517 * (3) Wait for pushes to be applied (after recovery)
4518 * (4) Wait for writes to flush on the chunk
4519 * (5) Wait for maps from replicas
4520 * (6) Compare / repair all scrub maps
4521 * (7) Wait for digest updates to apply
4522 *
4523 * This logic is encoded in the mostly linear state machine:
4524 *
4525 * +------------------+
4526 * _________v__________ |
4527 * | | |
4528 * | INACTIVE | |
4529 * |____________________| |
4530 * | |
4531 * | +----------+ |
4532 * _________v___v______ | |
4533 * | | | |
4534 * | NEW_CHUNK | | |
4535 * |____________________| | |
4536 * | | |
4537 * _________v__________ | |
4538 * | | | |
4539 * | WAIT_PUSHES | | |
4540 * |____________________| | |
4541 * | | |
4542 * _________v__________ | |
4543 * | | | |
4544 * | WAIT_LAST_UPDATE | | |
4545 * |____________________| | |
4546 * | | |
4547 * _________v__________ | |
4548 * | | | |
4549 * | BUILD_MAP | | |
4550 * |____________________| | |
4551 * | | |
4552 * _________v__________ | |
4553 * | | | |
4554 * | WAIT_REPLICAS | | |
4555 * |____________________| | |
4556 * | | |
4557 * _________v__________ | |
4558 * | | | |
4559 * | COMPARE_MAPS | | |
4560 * |____________________| | |
4561 * | | |
4562 * | | |
4563 * _________v__________ | |
4564 * | | | |
4565 * |WAIT_DIGEST_UPDATES | | |
4566 * |____________________| | |
4567 * | | | |
4568 * | +----------+ |
4569 * _________v__________ |
4570 * | | |
4571 * | FINISH | |
4572 * |____________________| |
4573 * | |
4574 * +------------------+
4575 *
4576 * The primary determines the last update from the subset by walking the log. If
4577 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4578 * to wait until that update is applied before building a scrub map. Both the
4579 * primary and replicas will wait for any active pushes to be applied.
4580 *
4581 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4582 *
4583 * scrubber.state encodes the current state of the scrub (refer to state diagram
4584 * for details).
4585 */
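// Preemption (sketch of the flow implemented below): while a chunk is being
// scrubbed, a blocked client write may set scrub_preempted; WAIT_REPLICAS
// then sends the scrub back to NEW_CHUNK, which doubles preempt_divisor
// (halving the chunk size) and decrements preempt_left until no further
// preemptions are allowed.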
4586 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4587 {
4588 // check for map changes
4589 if (scrubber.is_chunky_scrub_active()) {
4590 if (scrubber.epoch_start != info.history.same_interval_since) {
4591 dout(10) << "scrub pg changed, aborting" << dendl;
4592 scrub_clear_state();
4593 scrub_unreserve_replicas();
4594 return;
4595 }
4596 }
4597
4598 bool done = false;
4599 int ret;
4600
4601 while (!done) {
4602 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4603 << " [" << scrubber.start << "," << scrubber.end << ")"
4604 << " max_end " << scrubber.max_end << dendl;
4605
4606 switch (scrubber.state) {
4607 case PG::Scrubber::INACTIVE:
4608 dout(10) << "scrub start" << dendl;
4609 assert(is_primary());
4610
4611 publish_stats_to_osd();
4612 scrubber.epoch_start = info.history.same_interval_since;
4613 scrubber.active = true;
4614
4615 osd->inc_scrubs_active(scrubber.reserved);
4616 if (scrubber.reserved) {
4617 scrubber.reserved = false;
4618 scrubber.reserved_peers.clear();
4619 }
4620
4621 {
4622 ObjectStore::Transaction t;
4623 scrubber.cleanup_store(&t);
4624 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4625 info.pgid, coll));
4626 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4627 }
4628
4629 // Don't include temporary objects when scrubbing
4630 scrubber.start = info.pgid.pgid.get_hobj_start();
4631 scrubber.state = PG::Scrubber::NEW_CHUNK;
4632
4633 {
4634 bool repair = state_test(PG_STATE_REPAIR);
4635 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4636 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4637 stringstream oss;
4638 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4639 osd->clog->debug(oss);
4640 }
4641
4642 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
4643 "osd_scrub_max_preemptions");
4644 scrubber.preempt_divisor = 1;
4645 break;
4646
4647 case PG::Scrubber::NEW_CHUNK:
4648 scrubber.primary_scrubmap = ScrubMap();
4649 scrubber.received_maps.clear();
4650
4651 // begin (possible) preemption window
4652 if (scrub_preempted) {
4653 scrubber.preempt_left--;
4654 scrubber.preempt_divisor *= 2;
4655 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
4656 << " left" << dendl;
4657 scrub_preempted = false;
4658 }
4659 scrub_can_preempt = scrubber.preempt_left > 0;
4660
4661 {
4662 /* get the start and end of our scrub chunk
4663 *
4664 * Our scrub chunk has an important restriction we're going to need to
4665 * respect. We can't let head or snapdir be start or end.
4666 * Using a half-open interval means that if end == head|snapdir,
4667 * we'd scrub/lock head and the clone right next to head in different
4668 * chunks which would allow us to miss clones created between
4669 * scrubbing that chunk and scrubbing the chunk including head.
4670 * This isn't true for any of the other clones since clones can
4671 * only be created "just to the left of" head. There is one exception
4672 * to this: promotion of clones which always happens to the left of the
4673 * left-most clone, but promote_object checks the scrubber in that
4674 * case, so it should be ok. Also, it's ok to "miss" clones at the
4675 * left end of the range if we are a tier because they may legitimately
4676 * not exist (see _scrub).
4677 */
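/* Rough illustration (hypothetical objects): if the last listed entry is a
 * clone such as foo:4 and candidate_end comes back as foo:head, the loop
 * below walks candidate_end back over foo's clones and then snaps it to
 * foo's object boundary, so foo's head and all of its clones land in the
 * same chunk instead of being split across two chunks.
 */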
4678 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
4679 scrubber.preempt_divisor);
4680 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
4681 scrubber.preempt_divisor);
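// (e.g., with illustrative values osd_scrub_chunk_min = 5 and
// osd_scrub_chunk_max = 25, two preemptions give preempt_divisor = 4,
// shrinking the chunk to between max(3, 5/4) = 3 and 25/4 = 6 objects.)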
4682 hobject_t start = scrubber.start;
4683 hobject_t candidate_end;
4684 vector<hobject_t> objects;
4685 osr->flush();
4686 ret = get_pgbackend()->objects_list_partial(
4687 start,
4688 min,
4689 max,
4690 &objects,
4691 &candidate_end);
4692 assert(ret >= 0);
4693
4694 if (!objects.empty()) {
4695 hobject_t back = objects.back();
4696 while (candidate_end.has_snapset() &&
4697 candidate_end.get_head() == back.get_head()) {
4698 candidate_end = back;
4699 objects.pop_back();
4700 if (objects.empty()) {
4701 assert(0 ==
4702 "Somehow we got more than 2 objects which"
4703 "have the same head but are not clones");
4704 }
4705 back = objects.back();
4706 }
4707 if (candidate_end.has_snapset()) {
4708 assert(candidate_end.get_head() != back.get_head());
4709 candidate_end = candidate_end.get_object_boundary();
4710 }
4711 } else {
4712 assert(candidate_end.is_max());
4713 }
4714
4715 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4716 // we'll be requeued by whatever made us unavailable for scrub
4717 dout(10) << __func__ << ": scrub blocked somewhere in range "
4718 << "[" << scrubber.start << ", " << candidate_end << ")"
4719 << dendl;
4720 done = true;
4721 break;
4722 }
4723 scrubber.end = candidate_end;
4724 if (scrubber.end > scrubber.max_end)
4725 scrubber.max_end = scrubber.end;
4726 }
4727
4728 // walk the log to find the latest update that affects our chunk
4729 scrubber.subset_last_update = eversion_t();
4730 for (auto p = projected_log.log.rbegin();
4731 p != projected_log.log.rend();
4732 ++p) {
4733 if (p->soid >= scrubber.start &&
4734 p->soid < scrubber.end) {
4735 scrubber.subset_last_update = p->version;
4736 break;
4737 }
4738 }
4739 if (scrubber.subset_last_update == eversion_t()) {
4740 for (list<pg_log_entry_t>::const_reverse_iterator p =
4741 pg_log.get_log().log.rbegin();
4742 p != pg_log.get_log().log.rend();
4743 ++p) {
4744 if (p->soid >= scrubber.start &&
4745 p->soid < scrubber.end) {
4746 scrubber.subset_last_update = p->version;
4747 break;
4748 }
4749 }
4750 }
4751
4752 // ask replicas to wait until
4753 // last_update_applied >= scrubber.subset_last_update and then scan
4754 scrubber.waiting_on_whom.insert(pg_whoami);
4755
4756 // request maps from replicas
4757 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4758 i != actingbackfill.end();
4759 ++i) {
4760 if (*i == pg_whoami) continue;
4761 _request_scrub_map(*i, scrubber.subset_last_update,
4762 scrubber.start, scrubber.end, scrubber.deep,
4763 scrubber.preempt_left > 0);
4764 scrubber.waiting_on_whom.insert(*i);
4765 }
4766 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
4767 << dendl;
4768
4769 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4770 break;
4771
4772 case PG::Scrubber::WAIT_PUSHES:
4773 if (active_pushes == 0) {
4774 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4775 } else {
4776 dout(15) << "wait for pushes to apply" << dendl;
4777 done = true;
4778 }
4779 break;
4780
4781 case PG::Scrubber::WAIT_LAST_UPDATE:
4782 if (last_update_applied < scrubber.subset_last_update) {
4783 // will be requeued by op_applied
4784 dout(15) << "wait for writes to flush" << dendl;
4785 done = true;
4786 break;
4787 }
4788
4789 scrubber.state = PG::Scrubber::BUILD_MAP;
4790 scrubber.primary_scrubmap_pos.reset();
4791 break;
4792
4793 case PG::Scrubber::BUILD_MAP:
4794 assert(last_update_applied >= scrubber.subset_last_update);
4795
4796 // build my own scrub map
4797 if (scrub_preempted) {
4798 dout(10) << __func__ << " preempted" << dendl;
4799 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
4800 break;
4801 }
4802 ret = build_scrub_map_chunk(
4803 scrubber.primary_scrubmap,
4804 scrubber.primary_scrubmap_pos,
4805 scrubber.start, scrubber.end,
4806 scrubber.deep,
4807 handle);
4808 if (ret == -EINPROGRESS) {
4809 requeue_scrub();
4810 done = true;
4811 break;
4812 }
4813 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
4814 break;
4815
4816 case PG::Scrubber::BUILD_MAP_DONE:
4817 if (scrubber.primary_scrubmap_pos.ret < 0) {
4818 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
4819 << ", aborting" << dendl;
4820 scrub_clear_state();
4821 scrub_unreserve_replicas();
4822 return;
4823 }
4824 dout(10) << __func__ << " waiting_on_whom was "
4825 << scrubber.waiting_on_whom << dendl;
4826 assert(scrubber.waiting_on_whom.count(pg_whoami));
4827 scrubber.waiting_on_whom.erase(pg_whoami);
4828
4829 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4830 break;
4831
4832 case PG::Scrubber::WAIT_REPLICAS:
4833 if (!scrubber.waiting_on_whom.empty()) {
4834 // will be requeued by sub_op_scrub_map
4835 dout(10) << "wait for replicas to build scrub map" << dendl;
4836 done = true;
4837 break;
4838 }
4839 // end (possible) preemption window
4840 scrub_can_preempt = false;
4841 if (scrub_preempted) {
4842 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
4843 scrubber.state = PG::Scrubber::NEW_CHUNK;
4844 } else {
4845 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4846 }
4847 break;
4848
4849 case PG::Scrubber::COMPARE_MAPS:
4850 assert(last_update_applied >= scrubber.subset_last_update);
4851 assert(scrubber.waiting_on_whom.empty());
4852
4853 scrub_compare_maps();
4854 scrubber.start = scrubber.end;
4855 scrubber.run_callbacks();
4856
4857 // requeue the writes from the chunk that just finished
4858 requeue_ops(waiting_for_scrub);
4859
4860 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4861
4862 // fall-thru
4863
4864 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4865 if (scrubber.num_digest_updates_pending) {
4866 dout(10) << __func__ << " waiting on "
4867 << scrubber.num_digest_updates_pending
4868 << " digest updates" << dendl;
4869 done = true;
4870 break;
4871 }
4872
4873 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
4874 "osd_scrub_max_preemptions");
4875 scrubber.preempt_divisor = 1;
4876
4877 if (!(scrubber.end.is_max())) {
4878 scrubber.state = PG::Scrubber::NEW_CHUNK;
4879 requeue_scrub();
4880 done = true;
4881 } else {
4882 scrubber.state = PG::Scrubber::FINISH;
4883 }
4884
4885 break;
4886
4887 case PG::Scrubber::FINISH:
4888 scrub_finish();
4889 scrubber.state = PG::Scrubber::INACTIVE;
4890 done = true;
4891
4892 if (!snap_trimq.empty()) {
4893 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4894 snap_trimmer_scrub_complete();
4895 }
4896
4897 break;
4898
4899 case PG::Scrubber::BUILD_MAP_REPLICA:
4900 // build my own scrub map
4901 if (scrub_preempted) {
4902 dout(10) << __func__ << " preempted" << dendl;
4903 ret = 0;
4904 } else {
4905 ret = build_scrub_map_chunk(
4906 scrubber.replica_scrubmap,
4907 scrubber.replica_scrubmap_pos,
4908 scrubber.start, scrubber.end,
4909 scrubber.deep,
4910 handle);
4911 }
4912 if (ret == -EINPROGRESS) {
4913 requeue_scrub();
4914 done = true;
4915 break;
4916 }
4917 // reply
4918 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4919 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4920 spg_t(info.pgid.pgid, get_primary().shard),
4921 scrubber.replica_scrub_start,
4922 pg_whoami);
4923 reply->preempted = scrub_preempted;
4924 ::encode(scrubber.replica_scrubmap, reply->get_data());
4925 osd->send_message_osd_cluster(
4926 get_primary().osd, reply,
4927 scrubber.replica_scrub_start);
4928 } else {
4929 // for jewel compatibility
4930 vector<OSDOp> scrub(1);
4931 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4932 hobject_t poid;
4933 eversion_t v;
4934 osd_reqid_t reqid;
4935 MOSDSubOp *subop = new MOSDSubOp(
4936 reqid,
4937 pg_whoami,
4938 spg_t(info.pgid.pgid, get_primary().shard),
4939 poid,
4940 0,
4941 scrubber.replica_scrub_start,
4942 osd->get_tid(),
4943 v);
4944 ::encode(scrubber.replica_scrubmap, subop->get_data());
4945 subop->ops = scrub;
4946 osd->send_message_osd_cluster(
4947 get_primary().osd, subop,
4948 scrubber.replica_scrub_start);
4949 }
4950 scrub_preempted = false;
4951 scrub_can_preempt = false;
4952 scrubber.state = PG::Scrubber::INACTIVE;
4953 scrubber.replica_scrubmap = ScrubMap();
4954 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
4955 scrubber.start = hobject_t();
4956 scrubber.end = hobject_t();
4957 scrubber.max_end = hobject_t();
4958 done = true;
4959 break;
4960
4961 default:
4962 ceph_abort();
4963 }
4964 }
4965 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4966 << " [" << scrubber.start << "," << scrubber.end << ")"
4967 << " max_end " << scrubber.max_end << dendl;
4968 }
4969
4970 bool PG::write_blocked_by_scrub(const hobject_t& soid)
4971 {
4972 if (soid < scrubber.start || soid >= scrubber.end) {
4973 return false;
4974 }
4975 if (scrub_can_preempt) {
4976 if (!scrub_preempted) {
4977 dout(10) << __func__ << " " << soid << " preempted" << dendl;
4978 scrub_preempted = true;
4979 } else {
4980 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
4981 }
4982 return false;
4983 }
4984 return true;
4985 }
4986
4987 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
4988 {
4989 // does [start, end] intersect [scrubber.start, scrubber.max_end)
4990 return (start < scrubber.max_end &&
4991 end >= scrubber.start);
4992 }
4993
4994 void PG::scrub_clear_state()
4995 {
4996 assert(is_locked());
4997 state_clear(PG_STATE_SCRUBBING);
4998 state_clear(PG_STATE_REPAIR);
4999 state_clear(PG_STATE_DEEP_SCRUB);
5000 publish_stats_to_osd();
5001
5002 // active -> nothing.
5003 if (scrubber.active)
5004 osd->dec_scrubs_active();
5005
5006 requeue_ops(waiting_for_scrub);
5007
5008 scrubber.reset();
5009
5010 // type-specific state clear
5011 _scrub_clear_state();
5012 }
5013
5014 void PG::scrub_compare_maps()
5015 {
5016 dout(10) << __func__ << " has maps, analyzing" << dendl;
5017
5018 // construct authoritative scrub map for type specific scrubbing
5019 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5020 map<hobject_t,
5021 pair<boost::optional<uint32_t>,
5022 boost::optional<uint32_t>>> missing_digest;
5023
5024 map<pg_shard_t, ScrubMap *> maps;
5025 maps[pg_whoami] = &scrubber.primary_scrubmap;
5026
5027 for (const auto& i : actingbackfill) {
5028 if (i == pg_whoami) continue;
5029 dout(2) << __func__ << " replica " << i << " has "
5030 << scrubber.received_maps[i].objects.size()
5031 << " items" << dendl;
5032 maps[i] = &scrubber.received_maps[i];
5033 }
5034
5035 set<hobject_t> master_set;
5036
5037 // Construct master set
5038 for (const auto map : maps) {
5039 for (const auto i : map.second->objects) {
5040 master_set.insert(i.first);
5041 }
5042 }
5043
5044 stringstream ss;
5045 get_pgbackend()->be_large_omap_check(maps, master_set,
5046 scrubber.large_omap_objects, ss);
5047 if (!ss.str().empty()) {
5048 osd->clog->warn(ss);
5049 }
5050
5051 if (acting.size() > 1) {
5052 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5053
5054 // Map from object with errors to good peer
5055 map<hobject_t, list<pg_shard_t>> authoritative;
5056
5057 dout(2) << __func__ << " osd." << acting[0] << " has "
5058 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5059
5060 ss.str("");
5061 ss.clear();
5062
5063 get_pgbackend()->be_compare_scrubmaps(
5064 maps,
5065 master_set,
5066 state_test(PG_STATE_REPAIR),
5067 scrubber.missing,
5068 scrubber.inconsistent,
5069 authoritative,
5070 missing_digest,
5071 scrubber.shallow_errors,
5072 scrubber.deep_errors,
5073 scrubber.store.get(),
5074 info.pgid, acting,
5075 ss);
5076 dout(2) << ss.str() << dendl;
5077
5078 if (!ss.str().empty()) {
5079 osd->clog->error(ss);
5080 }
5081
5082 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5083 i != authoritative.end();
5084 ++i) {
5085 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5086 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5087 j != i->second.end();
5088 ++j) {
5089 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5090 }
5091 scrubber.authoritative.insert(
5092 make_pair(
5093 i->first,
5094 good_peers));
5095 }
5096
5097 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5098 i != authoritative.end();
5099 ++i) {
5100 scrubber.cleaned_meta_map.objects.erase(i->first);
5101 scrubber.cleaned_meta_map.objects.insert(
5102 *(maps[i->second.back()]->objects.find(i->first))
5103 );
5104 }
5105 }
5106
5107 ScrubMap for_meta_scrub;
5108 scrubber.clean_meta_map(for_meta_scrub);
5109
5110 // ok, do the pg-type specific scrubbing
5111 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5112 // Called here on the primary so it can use the authoritative map; a replica (which isn't the primary) calls _scan_snaps() from build_scrub_map_chunk() instead
5113 _scan_snaps(for_meta_scrub);
5114 if (!scrubber.store->empty()) {
5115 if (state_test(PG_STATE_REPAIR)) {
5116 dout(10) << __func__ << ": discarding scrub results" << dendl;
5117 scrubber.store->flush(nullptr);
5118 } else {
5119 dout(10) << __func__ << ": updating scrub object" << dendl;
5120 ObjectStore::Transaction t;
5121 scrubber.store->flush(&t);
5122 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
5123 }
5124 }
5125 }
5126
5127 bool PG::scrub_process_inconsistent()
5128 {
5129 dout(10) << __func__ << ": checking authoritative" << dendl;
5130 bool repair = state_test(PG_STATE_REPAIR);
5131 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5132 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5133
5134 // scrubber.authoritative only stores objects which are missing or inconsistent.
5135 if (!scrubber.authoritative.empty()) {
5136 stringstream ss;
5137 ss << info.pgid << " " << mode << " "
5138 << scrubber.missing.size() << " missing, "
5139 << scrubber.inconsistent.size() << " inconsistent objects";
5140 dout(2) << ss.str() << dendl;
5141 osd->clog->error(ss);
5142 if (repair) {
5143 state_clear(PG_STATE_CLEAN);
5144 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5145 scrubber.authoritative.begin();
5146 i != scrubber.authoritative.end();
5147 ++i) {
5148 set<pg_shard_t>::iterator j;
5149
5150 auto missing_entry = scrubber.missing.find(i->first);
5151 if (missing_entry != scrubber.missing.end()) {
5152 for (j = missing_entry->second.begin();
5153 j != missing_entry->second.end();
5154 ++j) {
5155 repair_object(
5156 i->first,
5157 &(i->second),
5158 *j);
5159 ++scrubber.fixed;
5160 }
5161 }
5162 if (scrubber.inconsistent.count(i->first)) {
5163 for (j = scrubber.inconsistent[i->first].begin();
5164 j != scrubber.inconsistent[i->first].end();
5165 ++j) {
5166 repair_object(i->first,
5167 &(i->second),
5168 *j);
5169 ++scrubber.fixed;
5170 }
5171 }
5172 }
5173 }
5174 }
5175 return (!scrubber.authoritative.empty() && repair);
5176 }
5177
5178 bool PG::ops_blocked_by_scrub() const {
5179 return (waiting_for_scrub.size() != 0);
5180 }
5181
5182 // the part that actually finalizes a scrub
5183 void PG::scrub_finish()
5184 {
5185 bool repair = state_test(PG_STATE_REPAIR);
5186 // if the repair request comes from auto-repair and there are too many
5187 // errors, we would like to cancel the auto-repair
5188 if (repair && scrubber.auto_repair
5189 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5190 state_clear(PG_STATE_REPAIR);
5191 repair = false;
5192 }
5193 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5194 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5195
5196 // type-specific finish (can tally more errors)
5197 _scrub_finish();
5198
5199 bool has_error = scrub_process_inconsistent();
5200
5201 {
5202 stringstream oss;
5203 oss << info.pgid.pgid << " " << mode << " ";
5204 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5205 if (total_errors)
5206 oss << total_errors << " errors";
5207 else
5208 oss << "ok";
5209 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5210 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5211 << " remaining deep scrub error details lost)";
5212 if (repair)
5213 oss << ", " << scrubber.fixed << " fixed";
5214 if (total_errors)
5215 osd->clog->error(oss);
5216 else
5217 osd->clog->debug(oss);
5218 }
5219
5220 // finish up
5221 unreg_next_scrub();
5222 utime_t now = ceph_clock_now();
5223 info.history.last_scrub = info.last_update;
5224 info.history.last_scrub_stamp = now;
5225 if (scrubber.deep) {
5226 info.history.last_deep_scrub = info.last_update;
5227 info.history.last_deep_scrub_stamp = now;
5228 }
5229 // Since we don't know which errors were fixed, we can only clear them
5230 // when every one has been fixed.
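// (For example -- hypothetical counts -- 3 shallow + 1 deep errors with
// only 3 of them fixed leaves the counters untouched and instead schedules
// a deep scrub after recovery via scrub_after_recovery.)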
5231 if (repair) {
5232 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5233 assert(deep_scrub);
5234 scrubber.shallow_errors = scrubber.deep_errors = 0;
5235 } else {
5236 // Deep scrub in order to get corrected error counts
5237 scrub_after_recovery = true;
5238 }
5239 }
5240 if (deep_scrub) {
5241 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5242 info.history.last_clean_scrub_stamp = now;
5243 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5244 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5245 info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
5246 } else {
5247 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5248 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5249 // because of deep-scrub errors
5250 if (scrubber.shallow_errors == 0)
5251 info.history.last_clean_scrub_stamp = now;
5252 }
5253 info.stats.stats.sum.num_scrub_errors =
5254 info.stats.stats.sum.num_shallow_scrub_errors +
5255 info.stats.stats.sum.num_deep_scrub_errors;
5256 reg_next_scrub();
5257
5258 {
5259 ObjectStore::Transaction t;
5260 dirty_info = true;
5261 write_if_dirty(t);
5262 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5263 assert(tr == 0);
5264 }
5265
5266
5267 if (has_error) {
5268 queue_peering_event(
5269 CephPeeringEvtRef(
5270 std::make_shared<CephPeeringEvt>(
5271 get_osdmap()->get_epoch(),
5272 get_osdmap()->get_epoch(),
5273 DoRecovery())));
5274 }
5275
5276 scrub_clear_state();
5277 scrub_unreserve_replicas();
5278
5279 if (is_active() && is_primary()) {
5280 share_pg_info();
5281 }
5282 }
5283
5284 void PG::share_pg_info()
5285 {
5286 dout(10) << "share_pg_info" << dendl;
5287
5288 // share new pg_info_t with replicas
5289 assert(!actingbackfill.empty());
5290 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5291 i != actingbackfill.end();
5292 ++i) {
5293 if (*i == pg_whoami) continue;
5294 pg_shard_t peer = *i;
5295 if (peer_info.count(peer)) {
5296 peer_info[peer].last_epoch_started = info.last_epoch_started;
5297 peer_info[peer].last_interval_started = info.last_interval_started;
5298 peer_info[peer].history.merge(info.history);
5299 }
5300 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5301 m->pg_list.push_back(
5302 make_pair(
5303 pg_notify_t(
5304 peer.shard, pg_whoami.shard,
5305 get_osdmap()->get_epoch(),
5306 get_osdmap()->get_epoch(),
5307 info),
5308 PastIntervals()));
5309 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5310 }
5311 }
5312
5313 bool PG::append_log_entries_update_missing(
5314 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5315 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5316 boost::optional<eversion_t> roll_forward_to)
5317 {
5318 assert(!entries.empty());
5319 assert(entries.begin()->version > info.last_update);
5320
5321 PGLogEntryHandler rollbacker{this, &t};
5322 bool invalidate_stats =
5323 pg_log.append_new_log_entries(info.last_backfill,
5324 info.last_backfill_bitwise,
5325 entries,
5326 &rollbacker);
5327
5328 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5329 pg_log.roll_forward(&rollbacker);
5330 }
5331 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5332 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5333 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5334 }
5335
5336 info.last_update = pg_log.get_head();
5337
5338 if (pg_log.get_missing().num_missing() == 0) {
5339 // advance last_complete since nothing else is missing!
5340 info.last_complete = info.last_update;
5341 }
5342 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5343
5344 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5345 if (trim_to)
5346 pg_log.trim(*trim_to, info);
5347 dirty_info = true;
5348 write_if_dirty(t);
5349 return invalidate_stats;
5350 }
5351
5352
5353 void PG::merge_new_log_entries(
5354 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5355 ObjectStore::Transaction &t,
5356 boost::optional<eversion_t> trim_to,
5357 boost::optional<eversion_t> roll_forward_to)
5358 {
5359 dout(10) << __func__ << " " << entries << dendl;
5360 assert(is_primary());
5361
5362 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5363 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5364 i != actingbackfill.end();
5365 ++i) {
5366 pg_shard_t peer(*i);
5367 if (peer == pg_whoami) continue;
5368 assert(peer_missing.count(peer));
5369 assert(peer_info.count(peer));
5370 pg_missing_t& pmissing(peer_missing[peer]);
5371 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5372 pg_info_t& pinfo(peer_info[peer]);
5373 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5374 pinfo.last_backfill,
5375 info.last_backfill_bitwise,
5376 entries,
5377 true,
5378 NULL,
5379 pmissing,
5380 NULL,
5381 this);
5382 pinfo.last_update = info.last_update;
5383 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5384 rebuild_missing = rebuild_missing || invalidate_stats;
5385 }
5386
5387 if (!rebuild_missing) {
5388 return;
5389 }
5390
5391 for (auto &&i: entries) {
5392 missing_loc.rebuild(
5393 i.soid,
5394 pg_whoami,
5395 actingbackfill,
5396 info,
5397 pg_log.get_missing(),
5398 peer_missing,
5399 peer_info);
5400 }
5401 }
5402
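// Merge a history received from a peer into our own. If the merged history
// shows the PG was last clean in (or after) the current interval, the
// past_intervals are no longer interesting and are cleared. Scrub
// registration is refreshed around the update.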
5403 void PG::update_history(const pg_history_t& new_history)
5404 {
5405 unreg_next_scrub();
5406 if (info.history.merge(new_history)) {
5407 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5408 dirty_info = true;
5409 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5410 dout(20) << __func__ << " clearing past_intervals" << dendl;
5411 past_intervals.clear();
5412 dirty_big_info = true;
5413 }
5414 }
5415 reg_next_scrub();
5416 }
5417
5418 void PG::fulfill_info(
5419 pg_shard_t from, const pg_query_t &query,
5420 pair<pg_shard_t, pg_info_t> &notify_info)
5421 {
5422 assert(from == primary);
5423 assert(query.type == pg_query_t::INFO);
5424
5425 // info
5426 dout(10) << "sending info" << dendl;
5427 notify_info = make_pair(from, info);
5428 }
5429
5430 void PG::fulfill_log(
5431 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5432 {
5433 dout(10) << "log request from " << from << dendl;
5434 assert(from == primary);
5435 assert(query.type != pg_query_t::INFO);
5436 ConnectionRef con = osd->get_con_osd_cluster(
5437 from.osd, get_osdmap()->get_epoch());
5438 if (!con) return;
5439
5440 MOSDPGLog *mlog = new MOSDPGLog(
5441 from.shard, pg_whoami.shard,
5442 get_osdmap()->get_epoch(),
5443 info, query_epoch);
5444 mlog->missing = pg_log.get_missing();
5445
5446 // primary -> other, when building master log
5447 if (query.type == pg_query_t::LOG) {
5448 dout(10) << " sending info+missing+log since " << query.since
5449 << dendl;
5450 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5451 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5452 << " when my log.tail is " << pg_log.get_tail()
5453 << ", sending full log instead";
5454 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5455 } else
5456 mlog->log.copy_after(pg_log.get_log(), query.since);
5457 }
5458 else if (query.type == pg_query_t::FULLLOG) {
5459 dout(10) << " sending info+missing+full log" << dendl;
5460 mlog->log = pg_log.get_log();
5461 }
5462
5463 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5464
5465 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5466 osd->send_message_osd_cluster(mlog, con.get());
5467 }
5468
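// Record in last_epoch_marked_full the epoch at which either the cluster
// (CEPH_OSDMAP_FULL) or this PG's pool (FLAG_FULL) transitioned from
// not-full to full.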
5469 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5470 {
5471 bool changed = false;
5472 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5473 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5474 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5475 changed = true;
5476 }
5477 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5478 assert(pi);
5479 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5480 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5481 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5482 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5483 changed = true;
5484 }
5485 }
5486 if (changed) {
5487 info.history.last_epoch_marked_full = osdmap->get_epoch();
5488 dirty_info = true;
5489 }
5490 }
5491
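// Thin wrapper around PastIntervals::is_new_interval(): returns true if the
// new up/acting sets (or their primaries) start a new interval relative to
// lastmap, in which case the caller must restart peering.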
5492 bool PG::should_restart_peering(
5493 int newupprimary,
5494 int newactingprimary,
5495 const vector<int>& newup,
5496 const vector<int>& newacting,
5497 OSDMapRef lastmap,
5498 OSDMapRef osdmap)
5499 {
5500 if (PastIntervals::is_new_interval(
5501 primary.osd,
5502 newactingprimary,
5503 acting,
5504 newacting,
5505 up_primary.osd,
5506 newupprimary,
5507 up,
5508 newup,
5509 osdmap,
5510 lastmap,
5511 info.pgid.pgid)) {
5512 dout(20) << "new interval newup " << newup
5513 << " newacting " << newacting << dendl;
5514 return true;
5515 } else {
5516 return false;
5517 }
5518 }
5519
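// A peering message is considered stale if either the epoch it was sent in
// or the epoch of the query it answers predates our last peering reset.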
5520 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5521 {
5522 if (last_peering_reset > reply_epoch ||
5523 last_peering_reset > query_epoch) {
5524 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5525 << " last_peering_reset " << last_peering_reset
5526 << dendl;
5527 return true;
5528 }
5529 return false;
5530 }
5531
5532 void PG::set_last_peering_reset()
5533 {
5534 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5535 if (last_peering_reset != get_osdmap()->get_epoch()) {
5536 last_peering_reset = get_osdmap()->get_epoch();
5537 reset_interval_flush();
5538 }
5539 }
5540
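// RAII helper used by start_flush(): the last reference is expected to be
// dropped once the flush transaction has been both applied and committed, at
// which point the destructor queues a "flushed" event for the recorded epoch
// unless the PG has been reset since.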
5541 struct FlushState {
5542 PGRef pg;
5543 epoch_t epoch;
5544 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5545 ~FlushState() {
5546 pg->lock();
5547 if (!pg->pg_has_reset_since(epoch))
5548 pg->queue_flushed(epoch);
5549 pg->unlock();
5550 }
5551 };
5552 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5553
5554 void PG::start_flush(ObjectStore::Transaction *t,
5555 list<Context *> *on_applied,
5556 list<Context *> *on_safe)
5557 {
5558 // flush in progress ops
5559 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5560 this, get_osdmap()->get_epoch()));
5561 t->nop();
5562 flushes_in_progress++;
5563 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5564 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5565 }
5566
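// Called from set_last_peering_reset(): drop any blocked outgoing recovery
// messages, then ask the sequencer to flush. If the flush cannot complete
// immediately, block outgoing recovery messages until the queued
// IntervalFlush event is delivered.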
5567 void PG::reset_interval_flush()
5568 {
5569 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5570 recovery_state.clear_blocked_outgoing();
5571
5572 Context *c = new QueuePeeringEvt<IntervalFlush>(
5573 this, get_osdmap()->get_epoch(), IntervalFlush());
5574 if (!osr->flush_commit(c)) {
5575 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5576 recovery_state.begin_block_outgoing();
5577 } else {
5578 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5579 delete c;
5580 }
5581 }
5582
5583 /* Called before initializing peering during advance_map */
5584 void PG::start_peering_interval(
5585 const OSDMapRef lastmap,
5586 const vector<int>& newup, int new_up_primary,
5587 const vector<int>& newacting, int new_acting_primary,
5588 ObjectStore::Transaction *t)
5589 {
5590 const OSDMapRef osdmap = get_osdmap();
5591
5592 set_last_peering_reset();
5593
5594 vector<int> oldacting, oldup;
5595 int oldrole = get_role();
5596
5597 unreg_next_scrub();
5598
5599 pg_shard_t old_acting_primary = get_primary();
5600 pg_shard_t old_up_primary = up_primary;
5601 bool was_old_primary = is_primary();
5602 bool was_old_replica = is_replica();
5603
5604 acting.swap(oldacting);
5605 up.swap(oldup);
5606 init_primary_up_acting(
5607 newup,
5608 newacting,
5609 new_up_primary,
5610 new_acting_primary);
5611
5612 if (info.stats.up != up ||
5613 info.stats.acting != acting ||
5614 info.stats.up_primary != new_up_primary ||
5615 info.stats.acting_primary != new_acting_primary) {
5616 info.stats.up = up;
5617 info.stats.up_primary = new_up_primary;
5618 info.stats.acting = acting;
5619 info.stats.acting_primary = new_acting_primary;
5620 info.stats.mapping_epoch = osdmap->get_epoch();
5621 }
5622
5623 pg_stats_publish_lock.Lock();
5624 pg_stats_publish_valid = false;
5625 pg_stats_publish_lock.Unlock();
5626
5627 // This will now be remapped during a backfill in cases
5628 // where it would not have been before.
5629 if (up != acting)
5630 state_set(PG_STATE_REMAPPED);
5631 else
5632 state_clear(PG_STATE_REMAPPED);
5633
5634 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5635 if (pool.info.is_replicated() || role == pg_whoami.shard)
5636 set_role(role);
5637 else
5638 set_role(-1);
5639
5640 // did acting, up, primary|acker change?
5641 if (!lastmap) {
5642 dout(10) << " no lastmap" << dendl;
5643 dirty_info = true;
5644 dirty_big_info = true;
5645 info.history.same_interval_since = osdmap->get_epoch();
5646 } else {
5647 std::stringstream debug;
5648 assert(info.history.same_interval_since != 0);
5649 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5650 get_is_recoverable_predicate());
5651 bool new_interval = PastIntervals::check_new_interval(
5652 old_acting_primary.osd,
5653 new_acting_primary,
5654 oldacting, newacting,
5655 old_up_primary.osd,
5656 new_up_primary,
5657 oldup, newup,
5658 info.history.same_interval_since,
5659 info.history.last_epoch_clean,
5660 osdmap,
5661 lastmap,
5662 info.pgid.pgid,
5663 recoverable.get(),
5664 &past_intervals,
5665 &debug);
5666 dout(10) << __func__ << ": check_new_interval output: "
5667 << debug.str() << dendl;
5668 if (new_interval) {
5669 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5670 info.history.last_epoch_clean < osdmap->get_epoch()) {
5671 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5672 // our information is incomplete and useless; if osdmaps have been
5673 // trimmed, someone else was clean after everything we know about.
5674 past_intervals.clear();
5675 } else {
5676 dout(10) << " noting past " << past_intervals << dendl;
5677 }
5678 dirty_info = true;
5679 dirty_big_info = true;
5680 info.history.same_interval_since = osdmap->get_epoch();
5681 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5682 osdmap->get_pg_num(info.pgid.pgid.pool()),
5683 nullptr)) {
5684 info.history.last_epoch_split = osdmap->get_epoch();
5685 }
5686 }
5687 }
5688
5689 if (old_up_primary != up_primary ||
5690 oldup != up) {
5691 info.history.same_up_since = osdmap->get_epoch();
5692 }
5693 // this comparison includes primary rank via pg_shard_t
5694 if (old_acting_primary != get_primary()) {
5695 info.history.same_primary_since = osdmap->get_epoch();
5696 }
5697
5698 on_new_interval();
5699
5700 dout(1) << __func__ << " up " << oldup << " -> " << up
5701 << ", acting " << oldacting << " -> " << acting
5702 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5703 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5704 << ", role " << oldrole << " -> " << role
5705 << ", features acting " << acting_features
5706 << " upacting " << upacting_features
5707 << dendl;
5708
5709 // deactivate.
5710 state_clear(PG_STATE_ACTIVE);
5711 state_clear(PG_STATE_PEERED);
5712 state_clear(PG_STATE_DOWN);
5713 state_clear(PG_STATE_RECOVERY_WAIT);
5714 state_clear(PG_STATE_RECOVERY_TOOFULL);
5715 state_clear(PG_STATE_RECOVERING);
5716
5717 peer_purged.clear();
5718 actingbackfill.clear();
5719 scrub_queued = false;
5720
5721 // reset primary/replica state?
5722 if (was_old_primary || is_primary()) {
5723 osd->remove_want_pg_temp(info.pgid.pgid);
5724 } else if (was_old_replica || is_replica()) {
5725 osd->remove_want_pg_temp(info.pgid.pgid);
5726 }
5727 clear_primary_state();
5728
5729
5730 // pg->on_*
5731 on_change(t);
5732
5733 projected_last_update = eversion_t();
5734
5735 assert(!deleting);
5736
5737 // should we tell the primary we are here?
5738 send_notify = !is_primary();
5739
5740 if (role != oldrole ||
5741 was_old_primary != is_primary()) {
5742 // did primary change?
5743 if (was_old_primary != is_primary()) {
5744 state_clear(PG_STATE_CLEAN);
5745 clear_publish_stats();
5746 }
5747
5748 on_role_change();
5749
5750 // take active waiters
5751 requeue_ops(waiting_for_peered);
5752
5753 } else {
5754 // no role change.
5755 // did primary change?
5756 if (get_primary() != old_acting_primary) {
5757 dout(10) << *this << " " << oldacting << " -> " << acting
5758 << ", acting primary "
5759 << old_acting_primary << " -> " << get_primary()
5760 << dendl;
5761 } else {
5762 // primary is the same.
5763 if (is_primary()) {
5764 // i am (still) primary. but my replica set changed.
5765 state_clear(PG_STATE_CLEAN);
5766
5767 dout(10) << oldacting << " -> " << acting
5768 << ", replicas changed" << dendl;
5769 }
5770 }
5771 }
5772 cancel_recovery();
5773
5774 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5775 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5776 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5777 }
5778 }
5779
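// Per-interval (re)initialization: re-register the next scrub and recompute
// the feature bits common to the acting set (acting_features) and to the
// union of up and acting (upacting_features).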
5780 void PG::on_new_interval()
5781 {
5782 const OSDMapRef osdmap = get_osdmap();
5783
5784 reg_next_scrub();
5785
5786 // initialize features
5787 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5788 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5789 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5790 if (*p == CRUSH_ITEM_NONE)
5791 continue;
5792 uint64_t f = osdmap->get_xinfo(*p).features;
5793 acting_features &= f;
5794 upacting_features &= f;
5795 }
5796 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5797 if (*p == CRUSH_ITEM_NONE)
5798 continue;
5799 upacting_features &= osdmap->get_xinfo(*p).features;
5800 }
5801
5802 _on_new_interval();
5803 }
5804
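// Replica-side processing of the primary's pg_info_t: merge its history,
// reset any recorded scrub error counters, and adopt the primary's
// purged_snaps when they differ from ours.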
5805 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5806 {
5807 assert(!is_primary());
5808
5809 update_history(oinfo.history);
5810 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
5811 info.stats.stats.sum.num_scrub_errors = 0;
5812 info.stats.stats.sum.num_shallow_scrub_errors = 0;
5813 info.stats.stats.sum.num_deep_scrub_errors = 0;
5814 dirty_info = true;
5815 }
5816
5817 if (!(info.purged_snaps == oinfo.purged_snaps)) {
5818 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
5819 << dendl;
5820 info.purged_snaps = oinfo.purged_snaps;
5821 dirty_info = true;
5822 dirty_big_info = true;
5823 }
5824 }
5825
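// Render a compact human-readable summary of the PG (up/acting sets, log
// bounds, past intervals, backfill targets, state flags, missing/unfound
// counts) for use in log output.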
5826 ostream& operator<<(ostream& out, const PG& pg)
5827 {
5828 out << "pg[" << pg.info
5829 << " " << pg.up;
5830 if (pg.acting != pg.up)
5831 out << "/" << pg.acting;
5832 if (pg.is_ec_pg())
5833 out << "p" << pg.get_primary();
5834 out << " r=" << pg.get_role();
5835 out << " lpr=" << pg.get_last_peering_reset();
5836
5837 if (!pg.past_intervals.empty()) {
5838 out << " pi=[" << pg.past_intervals.get_bounds()
5839 << ")/" << pg.past_intervals.size();
5840 }
5841
5842 if (pg.is_peered()) {
5843 if (pg.last_update_ondisk != pg.info.last_update)
5844 out << " luod=" << pg.last_update_ondisk;
5845 if (pg.last_update_applied != pg.info.last_update)
5846 out << " lua=" << pg.last_update_applied;
5847 }
5848
5849 if (pg.recovery_ops_active)
5850 out << " rops=" << pg.recovery_ops_active;
5851
5852 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5853 pg.pg_log.get_head() != pg.info.last_update)
5854 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5855
5856 if (!pg.pg_log.get_log().empty()) {
5857 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5858 out << " (log bound mismatch, actual=["
5859 << pg.pg_log.get_log().log.begin()->version << ","
5860 << pg.pg_log.get_log().log.rbegin()->version << "]";
5861 out << ")";
5862 }
5863 }
5864
5865 if (!pg.backfill_targets.empty())
5866 out << " bft=" << pg.backfill_targets;
5867 out << " crt=" << pg.pg_log.get_can_rollback_to();
5868
5869 if (pg.last_complete_ondisk != pg.info.last_complete)
5870 out << " lcod " << pg.last_complete_ondisk;
5871
5872 if (pg.is_primary()) {
5873 out << " mlcod " << pg.min_last_complete_ondisk;
5874 }
5875
5876 out << " " << pg_state_string(pg.get_state());
5877 if (pg.should_send_notify())
5878 out << " NOTIFY";
5879
5880 if (pg.scrubber.must_repair)
5881 out << " MUST_REPAIR";
5882 if (pg.scrubber.auto_repair)
5883 out << " AUTO_REPAIR";
5884 if (pg.scrubber.must_deep_scrub)
5885 out << " MUST_DEEP_SCRUB";
5886 if (pg.scrubber.must_scrub)
5887 out << " MUST_SCRUB";
5888
5889 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5890 if (pg.pg_log.get_missing().num_missing()) {
5891 out << " m=" << pg.pg_log.get_missing().num_missing();
5892 if (pg.is_primary()) {
5893 uint64_t unfound = pg.get_num_unfound();
5894 if (unfound)
5895 out << " u=" << unfound;
5896 }
5897 }
5898 if (pg.snap_trimq.size())
5899 out << " snaptrimq=" << pg.snap_trimq;
5900
5901 out << "]";
5902
5903
5904 return out;
5905 }
5906
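// Decide whether a client op can be dropped outright: ops from disconnected
// sessions, ops sent before the current primary took over, and (depending on
// the client's feature bits) ops predating a forced resend or a PG split.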
5907 bool PG::can_discard_op(OpRequestRef& op)
5908 {
5909 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5910 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5911 dout(20) << " discard " << *m << dendl;
5912 return true;
5913 }
5914
5915 if (m->get_map_epoch() < info.history.same_primary_since) {
5916 dout(7) << " changed after " << m->get_map_epoch()
5917 << ", dropping " << *m << dendl;
5918 return true;
5919 }
5920
5921 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5922 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5923 dout(7) << __func__ << " sent before last_force_op_resend "
5924 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5925 return true;
5926 }
5927 if (m->get_map_epoch() < info.history.last_epoch_split) {
5928 dout(7) << __func__ << " pg split in "
5929 << info.history.last_epoch_split << ", dropping" << dendl;
5930 return true;
5931 }
5932 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5933 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5934 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5935 << pool.info.last_force_op_resend_preluminous
5936 << ", dropping" << *m << dendl;
5937 return true;
5938 }
5939 }
5940
5941 return false;
5942 }
5943
5944 template<typename T, int MSGTYPE>
5945 bool PG::can_discard_replica_op(OpRequestRef& op)
5946 {
5947 const T *m = static_cast<const T *>(op->get_req());
5948 assert(m->get_type() == MSGTYPE);
5949
5950 int from = m->get_source().num();
5951
5952 // if a repop reply arrives after a replica goes down in a new osdmap, and
5953 // before the pg advances to this new osdmap, replies to repops sent before
5954 // this one may have been discarded by that replica OSD, because the primary
5955 // resets the connection to it when handling the new osdmap marking it down,
5956 // and also resets the messenger session when the replica reconnects. to avoid
5957 // out-of-order replies, the messages from that replica should be discarded.
5958 if (osd->get_osdmap()->is_down(from))
5959 return true;
5960 /* Mostly, this overlaps with the old_peering_msg
5961 * condition. An important exception is pushes
5962 * sent by replicas not in the acting set, since
5963 * if such a replica goes down it does not cause
5964 * a new interval. */
5965 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5966 return true;
5967
5968 // same pg?
5969 // if pg changes _at all_, we reset and repeer!
5970 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5971 dout(10) << "can_discard_replica_op pg changed " << info.history
5972 << " after " << m->map_epoch
5973 << ", dropping" << dendl;
5974 return true;
5975 }
5976 return false;
5977 }
5978
5979 bool PG::can_discard_scan(OpRequestRef op)
5980 {
5981 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5982 assert(m->get_type() == MSG_OSD_PG_SCAN);
5983
5984 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5985 dout(10) << " got old scan, ignoring" << dendl;
5986 return true;
5987 }
5988 return false;
5989 }
5990
5991 bool PG::can_discard_backfill(OpRequestRef op)
5992 {
5993 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5994 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5995
5996 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5997 dout(10) << " got old backfill, ignoring" << dendl;
5998 return true;
5999 }
6000
6001 return false;
6002
6003 }
6004
6005 bool PG::can_discard_request(OpRequestRef& op)
6006 {
6007 switch (op->get_req()->get_type()) {
6008 case CEPH_MSG_OSD_OP:
6009 return can_discard_op(op);
6010 case CEPH_MSG_OSD_BACKOFF:
6011 return false; // never discard
6012 case MSG_OSD_SUBOP:
6013 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
6014 case MSG_OSD_REPOP:
6015 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6016 case MSG_OSD_PG_PUSH:
6017 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6018 case MSG_OSD_PG_PULL:
6019 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6020 case MSG_OSD_PG_PUSH_REPLY:
6021 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6022 case MSG_OSD_SUBOPREPLY:
6023 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
6024 case MSG_OSD_REPOPREPLY:
6025 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6026 case MSG_OSD_PG_RECOVERY_DELETE:
6027 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6028
6029 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6030 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6031
6032 case MSG_OSD_EC_WRITE:
6033 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6034 case MSG_OSD_EC_WRITE_REPLY:
6035 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6036 case MSG_OSD_EC_READ:
6037 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6038 case MSG_OSD_EC_READ_REPLY:
6039 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6040 case MSG_OSD_REP_SCRUB:
6041 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6042 case MSG_OSD_SCRUB_RESERVE:
6043 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6044 case MSG_OSD_REP_SCRUBMAP:
6045 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6046 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6047 return can_discard_replica_op<
6048 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6049 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6050 return can_discard_replica_op<
6051 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6052
6053 case MSG_OSD_PG_SCAN:
6054 return can_discard_scan(op);
6055 case MSG_OSD_PG_BACKFILL:
6056 return can_discard_backfill(op);
6057 case MSG_OSD_PG_BACKFILL_REMOVE:
6058 return can_discard_replica_op<MOSDPGBackfillRemove,
6059 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6060 }
6061 return true;
6062 }
6063
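// Requeue any ops that were waiting for a newer map and splice deferred
// peering events back onto the peering queue, scheduling the PG for peering
// once per deferred event.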
6064 void PG::take_waiters()
6065 {
6066 dout(10) << "take_waiters" << dendl;
6067 requeue_map_waiters();
6068 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
6069 i != peering_waiters.end();
6070 ++i) osd->queue_for_peering(this);
6071 peering_queue.splice(peering_queue.begin(), peering_waiters,
6072 peering_waiters.begin(), peering_waiters.end());
6073 }
6074
6075 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
6076 {
6077 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
6078 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
6079 dout(10) << "deferring event " << evt->get_desc() << dendl;
6080 peering_waiters.push_back(evt);
6081 return;
6082 }
6083 if (old_peering_evt(evt))
6084 return;
6085 recovery_state.handle_event(evt, rctx);
6086 }
6087
6088 void PG::queue_peering_event(CephPeeringEvtRef evt)
6089 {
6090 if (old_peering_evt(evt))
6091 return;
6092 peering_queue.push_back(evt);
6093 osd->queue_for_peering(this);
6094 }
6095
6096 void PG::queue_null(epoch_t msg_epoch,
6097 epoch_t query_epoch)
6098 {
6099 dout(10) << "null" << dendl;
6100 queue_peering_event(
6101 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6102 NullEvt())));
6103 }
6104
6105 void PG::queue_flushed(epoch_t e)
6106 {
6107 dout(10) << "flushed" << dendl;
6108 queue_peering_event(
6109 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
6110 FlushedEvt())));
6111 }
6112
6113 void PG::queue_query(epoch_t msg_epoch,
6114 epoch_t query_epoch,
6115 pg_shard_t from, const pg_query_t& q)
6116 {
6117 dout(10) << "handle_query " << q << " from replica " << from << dendl;
6118 queue_peering_event(
6119 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6120 MQuery(from, q, query_epoch))));
6121 }
6122
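// Advance the PG's cached OSDMap by one epoch: update the cached pool info,
// optionally cross-check the cached removed snaps against the map, and feed
// an AdvMap event into the recovery state machine. Pool option changes that
// land in this epoch are applied to the store.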
6123 void PG::handle_advance_map(
6124 OSDMapRef osdmap, OSDMapRef lastmap,
6125 vector<int>& newup, int up_primary,
6126 vector<int>& newacting, int acting_primary,
6127 RecoveryCtx *rctx)
6128 {
6129 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6130 assert(lastmap == osdmap_ref);
6131 dout(10) << "handle_advance_map "
6132 << newup << "/" << newacting
6133 << " -- " << up_primary << "/" << acting_primary
6134 << dendl;
6135 update_osdmap_ref(osdmap);
6136 pool.update(osdmap);
6137 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
6138 if (cct->_conf->osd_debug_verify_cached_snaps) {
6139 interval_set<snapid_t> actual_removed_snaps;
6140 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6141 assert(pi);
6142 pi->build_removed_snaps(actual_removed_snaps);
6143 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
6144 derr << __func__ << ": mismatch between the actual removed snaps "
6145 << actual_removed_snaps
6146 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
6147 << dendl;
6148 }
6149 assert(actual_removed_snaps == pool.cached_removed_snaps);
6150 }
6151 AdvMap evt(
6152 osdmap, lastmap, newup, up_primary,
6153 newacting, acting_primary);
6154 recovery_state.handle_event(evt, rctx);
6155 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6156 on_pool_change();
6157 update_store_with_options();
6158 }
6159 }
6160
6161 void PG::handle_activate_map(RecoveryCtx *rctx)
6162 {
6163 dout(10) << "handle_activate_map " << dendl;
6164 ActMap evt;
6165 recovery_state.handle_event(evt, rctx);
6166 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
6167 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6168 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6169 << last_persisted_osdmap_ref->get_epoch()
6170 << " while current is " << osdmap_ref->get_epoch() << dendl;
6171 dirty_info = true;
6172 } else {
6173 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6174 << last_persisted_osdmap_ref->get_epoch()
6175 << " while current is " << osdmap_ref->get_epoch() << dendl;
6176 }
6177 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
6178 }
6179
6180 void PG::handle_loaded(RecoveryCtx *rctx)
6181 {
6182 dout(10) << "handle_loaded" << dendl;
6183 Load evt;
6184 recovery_state.handle_event(evt, rctx);
6185 }
6186
6187 void PG::handle_create(RecoveryCtx *rctx)
6188 {
6189 dout(10) << "handle_create" << dendl;
6190 rctx->created_pgs.insert(this);
6191 Initialize evt;
6192 recovery_state.handle_event(evt, rctx);
6193 ActMap evt2;
6194 recovery_state.handle_event(evt2, rctx);
6195
6196 rctx->on_applied->add(make_lambda_context([this]() {
6197 update_store_with_options();
6198 }));
6199 }
6200
6201 void PG::handle_query_state(Formatter *f)
6202 {
6203 dout(10) << "handle_query_state" << dendl;
6204 QueryState q(f);
6205 recovery_state.handle_event(q, 0);
6206 }
6207
6208 void PG::update_store_with_options()
6209 {
6210 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
6211 if(r < 0 && r != -EOPNOTSUPP) {
6212 derr << __func__ << " set_collection_opts returns error:" << r << dendl;
6213 }
6214 }
6215
6216 void PG::update_store_on_load()
6217 {
6218 if (osd->store->get_type() == "filestore") {
6219 // legacy filestore didn't store collection bit width; fix.
6220 int bits = osd->store->collection_bits(coll);
6221 if (bits < 0) {
6222 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6223 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
6224 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
6225 ObjectStore::Transaction t;
6226 t.collection_set_bits(coll, bits);
6227 osd->store->apply_transaction(osr.get(), std::move(t));
6228 }
6229 }
6230 }
6231
6232 /*------------ Recovery State Machine----------------*/
6233 #undef dout_prefix
6234 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6235 << "state<" << get_state_name() << ">: ")
6236
6237 /*------Crashed-------*/
6238 PG::RecoveryState::Crashed::Crashed(my_context ctx)
6239 : my_base(ctx),
6240 NamedState(context< RecoveryMachine >().pg, "Crashed")
6241 {
6242 context< RecoveryMachine >().log_enter(state_name);
6243 assert(0 == "we got a bad state machine event");
6244 }
6245
6246
6247 /*------Initial-------*/
6248 PG::RecoveryState::Initial::Initial(my_context ctx)
6249 : my_base(ctx),
6250 NamedState(context< RecoveryMachine >().pg, "Initial")
6251 {
6252 context< RecoveryMachine >().log_enter(state_name);
6253 }
6254
6255 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6256 {
6257 PG *pg = context< RecoveryMachine >().pg;
6258
6259 // do we tell someone we're here?
6260 pg->send_notify = (!pg->is_primary());
6261 pg->update_store_with_options();
6262
6263 pg->update_store_on_load();
6264
6265 return transit< Reset >();
6266 }
6267
6268 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6269 {
6270 PG *pg = context< RecoveryMachine >().pg;
6271 pg->proc_replica_info(
6272 notify.from, notify.notify.info, notify.notify.epoch_sent);
6273 pg->set_last_peering_reset();
6274 return transit< Primary >();
6275 }
6276
6277 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6278 {
6279 PG *pg = context< RecoveryMachine >().pg;
6280 assert(!pg->is_primary());
6281 post_event(i);
6282 return transit< Stray >();
6283 }
6284
6285 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6286 {
6287 PG *pg = context< RecoveryMachine >().pg;
6288 assert(!pg->is_primary());
6289 post_event(i);
6290 return transit< Stray >();
6291 }
6292
6293 void PG::RecoveryState::Initial::exit()
6294 {
6295 context< RecoveryMachine >().log_exit(state_name, enter_time);
6296 PG *pg = context< RecoveryMachine >().pg;
6297 utime_t dur = ceph_clock_now() - enter_time;
6298 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6299 }
6300
6301 /*------Started-------*/
6302 PG::RecoveryState::Started::Started(my_context ctx)
6303 : my_base(ctx),
6304 NamedState(context< RecoveryMachine >().pg, "Started")
6305 {
6306 context< RecoveryMachine >().log_enter(state_name);
6307 }
6308
6309 boost::statechart::result
6310 PG::RecoveryState::Started::react(const IntervalFlush&)
6311 {
6312 PG *pg = context< RecoveryMachine >().pg;
6313 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6314 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6315 return discard_event();
6316 }
6317
6318
6319 boost::statechart::result
6320 PG::RecoveryState::Started::react(const FlushedEvt&)
6321 {
6322 PG *pg = context< RecoveryMachine >().pg;
6323 pg->on_flushed();
6324 return discard_event();
6325 }
6326
6327
6328 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6329 {
6330 PG *pg = context< RecoveryMachine >().pg;
6331 ldout(pg->cct, 10) << "Started advmap" << dendl;
6332 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6333 if (pg->should_restart_peering(
6334 advmap.up_primary,
6335 advmap.acting_primary,
6336 advmap.newup,
6337 advmap.newacting,
6338 advmap.lastmap,
6339 advmap.osdmap)) {
6340 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6341 << dendl;
6342 post_event(advmap);
6343 return transit< Reset >();
6344 }
6345 pg->remove_down_peer_info(advmap.osdmap);
6346 return discard_event();
6347 }
6348
6349 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6350 {
6351 q.f->open_object_section("state");
6352 q.f->dump_string("name", state_name);
6353 q.f->dump_stream("enter_time") << enter_time;
6354 q.f->close_section();
6355 return discard_event();
6356 }
6357
6358 void PG::RecoveryState::Started::exit()
6359 {
6360 context< RecoveryMachine >().log_exit(state_name, enter_time);
6361 PG *pg = context< RecoveryMachine >().pg;
6362 utime_t dur = ceph_clock_now() - enter_time;
6363 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6364 }
6365
6366 /*--------Reset---------*/
6367 PG::RecoveryState::Reset::Reset(my_context ctx)
6368 : my_base(ctx),
6369 NamedState(context< RecoveryMachine >().pg, "Reset")
6370 {
6371 context< RecoveryMachine >().log_enter(state_name);
6372 PG *pg = context< RecoveryMachine >().pg;
6373
6374 pg->flushes_in_progress = 0;
6375 pg->set_last_peering_reset();
6376 }
6377
6378 boost::statechart::result
6379 PG::RecoveryState::Reset::react(const FlushedEvt&)
6380 {
6381 PG *pg = context< RecoveryMachine >().pg;
6382 pg->on_flushed();
6383 return discard_event();
6384 }
6385
6386 boost::statechart::result
6387 PG::RecoveryState::Reset::react(const IntervalFlush&)
6388 {
6389 PG *pg = context< RecoveryMachine >().pg;
6390 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6391 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6392 return discard_event();
6393 }
6394
6395 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6396 {
6397 PG *pg = context< RecoveryMachine >().pg;
6398 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6399
6400 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6401
6402 if (pg->should_restart_peering(
6403 advmap.up_primary,
6404 advmap.acting_primary,
6405 advmap.newup,
6406 advmap.newacting,
6407 advmap.lastmap,
6408 advmap.osdmap)) {
6409 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6410 << dendl;
6411 pg->start_peering_interval(
6412 advmap.lastmap,
6413 advmap.newup, advmap.up_primary,
6414 advmap.newacting, advmap.acting_primary,
6415 context< RecoveryMachine >().get_cur_transaction());
6416 }
6417 pg->remove_down_peer_info(advmap.osdmap);
6418 pg->check_past_interval_bounds();
6419 return discard_event();
6420 }
6421
6422 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6423 {
6424 PG *pg = context< RecoveryMachine >().pg;
6425 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6426 context< RecoveryMachine >().send_notify(
6427 pg->get_primary(),
6428 pg_notify_t(
6429 pg->get_primary().shard, pg->pg_whoami.shard,
6430 pg->get_osdmap()->get_epoch(),
6431 pg->get_osdmap()->get_epoch(),
6432 pg->info),
6433 pg->past_intervals);
6434 }
6435
6436 pg->update_heartbeat_peers();
6437 pg->take_waiters();
6438
6439 return transit< Started >();
6440 }
6441
6442 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6443 {
6444 q.f->open_object_section("state");
6445 q.f->dump_string("name", state_name);
6446 q.f->dump_stream("enter_time") << enter_time;
6447 q.f->close_section();
6448 return discard_event();
6449 }
6450
6451 void PG::RecoveryState::Reset::exit()
6452 {
6453 context< RecoveryMachine >().log_exit(state_name, enter_time);
6454 PG *pg = context< RecoveryMachine >().pg;
6455 utime_t dur = ceph_clock_now() - enter_time;
6456 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6457 }
6458
6459 /*-------Start---------*/
6460 PG::RecoveryState::Start::Start(my_context ctx)
6461 : my_base(ctx),
6462 NamedState(context< RecoveryMachine >().pg, "Start")
6463 {
6464 context< RecoveryMachine >().log_enter(state_name);
6465
6466 PG *pg = context< RecoveryMachine >().pg;
6467 if (pg->is_primary()) {
6468 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6469 post_event(MakePrimary());
6470 } else { //is_stray
6471 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6472 post_event(MakeStray());
6473 }
6474 }
6475
6476 void PG::RecoveryState::Start::exit()
6477 {
6478 context< RecoveryMachine >().log_exit(state_name, enter_time);
6479 PG *pg = context< RecoveryMachine >().pg;
6480 utime_t dur = ceph_clock_now() - enter_time;
6481 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6482 }
6483
6484 /*---------Primary--------*/
6485 PG::RecoveryState::Primary::Primary(my_context ctx)
6486 : my_base(ctx),
6487 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6488 {
6489 context< RecoveryMachine >().log_enter(state_name);
6490 PG *pg = context< RecoveryMachine >().pg;
6491 assert(pg->want_acting.empty());
6492
6493 // set CREATING bit until we have peered for the first time.
6494 if (pg->info.history.last_epoch_started == 0) {
6495 pg->state_set(PG_STATE_CREATING);
6496 // use the history timestamp, which ultimately comes from the
6497 // monitor in the create case.
6498 utime_t t = pg->info.history.last_scrub_stamp;
6499 pg->info.stats.last_fresh = t;
6500 pg->info.stats.last_active = t;
6501 pg->info.stats.last_change = t;
6502 pg->info.stats.last_peered = t;
6503 pg->info.stats.last_clean = t;
6504 pg->info.stats.last_unstale = t;
6505 pg->info.stats.last_undegraded = t;
6506 pg->info.stats.last_fullsized = t;
6507 pg->info.stats.last_scrub_stamp = t;
6508 pg->info.stats.last_deep_scrub_stamp = t;
6509 pg->info.stats.last_clean_scrub_stamp = t;
6510 }
6511 }
6512
6513 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6514 {
6515 PG *pg = context< RecoveryMachine >().pg;
6516 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6517 pg->proc_replica_info(
6518 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6519 return discard_event();
6520 }
6521
6522 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6523 {
6524 PG *pg = context< RecoveryMachine >().pg;
6525 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6526 pg->publish_stats_to_osd();
6527 pg->take_waiters();
6528 return discard_event();
6529 }
6530
6531 void PG::RecoveryState::Primary::exit()
6532 {
6533 context< RecoveryMachine >().log_exit(state_name, enter_time);
6534 PG *pg = context< RecoveryMachine >().pg;
6535 pg->want_acting.clear();
6536 utime_t dur = ceph_clock_now() - enter_time;
6537 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6538 pg->clear_primary_state();
6539 pg->state_clear(PG_STATE_CREATING);
6540 }
6541
6542 /*---------Peering--------*/
6543 PG::RecoveryState::Peering::Peering(my_context ctx)
6544 : my_base(ctx),
6545 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6546 history_les_bound(false)
6547 {
6548 context< RecoveryMachine >().log_enter(state_name);
6549
6550 PG *pg = context< RecoveryMachine >().pg;
6551 assert(!pg->is_peered());
6552 assert(!pg->is_peering());
6553 assert(pg->is_primary());
6554 pg->state_set(PG_STATE_PEERING);
6555 }
6556
6557 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6558 {
6559 PG *pg = context< RecoveryMachine >().pg;
6560 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6561 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6562 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6563 post_event(advmap);
6564 return transit< Reset >();
6565 }
6566
6567 pg->adjust_need_up_thru(advmap.osdmap);
6568
6569 return forward_event();
6570 }
6571
6572 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6573 {
6574 PG *pg = context< RecoveryMachine >().pg;
6575
6576 q.f->open_object_section("state");
6577 q.f->dump_string("name", state_name);
6578 q.f->dump_stream("enter_time") << enter_time;
6579
6580 q.f->open_array_section("past_intervals");
6581 pg->past_intervals.dump(q.f);
6582 q.f->close_section();
6583
6584 q.f->open_array_section("probing_osds");
6585 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6586 p != prior_set.probe.end();
6587 ++p)
6588 q.f->dump_stream("osd") << *p;
6589 q.f->close_section();
6590
6591 if (prior_set.pg_down)
6592 q.f->dump_string("blocked", "peering is blocked due to down osds");
6593
6594 q.f->open_array_section("down_osds_we_would_probe");
6595 for (set<int>::iterator p = prior_set.down.begin();
6596 p != prior_set.down.end();
6597 ++p)
6598 q.f->dump_int("osd", *p);
6599 q.f->close_section();
6600
6601 q.f->open_array_section("peering_blocked_by");
6602 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6603 p != prior_set.blocked_by.end();
6604 ++p) {
6605 q.f->open_object_section("osd");
6606 q.f->dump_int("osd", p->first);
6607 q.f->dump_int("current_lost_at", p->second);
6608 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6609 q.f->close_section();
6610 }
6611 q.f->close_section();
6612
6613 if (history_les_bound) {
6614 q.f->open_array_section("peering_blocked_by_detail");
6615 q.f->open_object_section("item");
6616 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6617 q.f->close_section();
6618 q.f->close_section();
6619 }
6620
6621 q.f->close_section();
6622 return forward_event();
6623 }
6624
6625 void PG::RecoveryState::Peering::exit()
6626 {
6627 PG *pg = context< RecoveryMachine >().pg;
6628 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6629 context< RecoveryMachine >().log_exit(state_name, enter_time);
6630 pg->state_clear(PG_STATE_PEERING);
6631 pg->clear_probe_targets();
6632
6633 utime_t dur = ceph_clock_now() - enter_time;
6634 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6635 }
6636
6637
6638 /*------Backfilling-------*/
6639 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6640 : my_base(ctx),
6641 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6642 {
6643 context< RecoveryMachine >().log_enter(state_name);
6644 PG *pg = context< RecoveryMachine >().pg;
6645 pg->backfill_reserved = true;
6646 pg->queue_recovery();
6647 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6648 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6649 pg->state_set(PG_STATE_BACKFILLING);
6650 pg->publish_stats_to_osd();
6651 }
6652
6653 boost::statechart::result
6654 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6655 {
6656 PG *pg = context< RecoveryMachine >().pg;
6657 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6658 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6659
6660 pg->state_set(PG_STATE_BACKFILL_WAIT);
6661 pg->state_clear(PG_STATE_BACKFILLING);
6662
6663 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6664 it != pg->backfill_targets.end();
6665 ++it) {
6666 assert(*it != pg->pg_whoami);
6667 ConnectionRef con = pg->osd->get_con_osd_cluster(
6668 it->osd, pg->get_osdmap()->get_epoch());
6669 if (con) {
6670 pg->osd->send_message_osd_cluster(
6671 new MBackfillReserve(
6672 MBackfillReserve::REJECT,
6673 spg_t(pg->info.pgid.pgid, it->shard),
6674 pg->get_osdmap()->get_epoch()),
6675 con.get());
6676 }
6677 }
6678
6679
6680 if (!pg->waiting_on_backfill.empty()) {
6681 pg->waiting_on_backfill.clear();
6682 pg->finish_recovery_op(hobject_t::get_max());
6683 }
6684
6685 pg->schedule_backfill_retry(c.delay);
6686 return transit<NotBackfilling>();
6687 }
6688
6689 boost::statechart::result
6690 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
6691 {
6692 PG *pg = context< RecoveryMachine >().pg;
6693 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
6694 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6695
6696 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
6697 pg->state_clear(PG_STATE_BACKFILLING);
6698
6699 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6700 it != pg->backfill_targets.end();
6701 ++it) {
6702 assert(*it != pg->pg_whoami);
6703 ConnectionRef con = pg->osd->get_con_osd_cluster(
6704 it->osd, pg->get_osdmap()->get_epoch());
6705 if (con) {
6706 pg->osd->send_message_osd_cluster(
6707 new MBackfillReserve(
6708 MBackfillReserve::REJECT,
6709 spg_t(pg->info.pgid.pgid, it->shard),
6710 pg->get_osdmap()->get_epoch()),
6711 con.get());
6712 }
6713 }
6714
6715 pg->waiting_on_backfill.clear();
6716
6717 return transit<NotBackfilling>();
6718 }
6719
6720 boost::statechart::result
6721 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6722 {
6723 PG *pg = context< RecoveryMachine >().pg;
6724 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6725 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6726
6727 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6728 it != pg->backfill_targets.end();
6729 ++it) {
6730 assert(*it != pg->pg_whoami);
6731 ConnectionRef con = pg->osd->get_con_osd_cluster(
6732 it->osd, pg->get_osdmap()->get_epoch());
6733 if (con) {
6734 pg->osd->send_message_osd_cluster(
6735 new MBackfillReserve(
6736 MBackfillReserve::REJECT,
6737 spg_t(pg->info.pgid.pgid, it->shard),
6738 pg->get_osdmap()->get_epoch()),
6739 con.get());
6740 }
6741 }
6742
6743 if (!pg->waiting_on_backfill.empty()) {
6744 pg->waiting_on_backfill.clear();
6745 pg->finish_recovery_op(hobject_t::get_max());
6746 }
6747
6748 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6749 return transit<NotBackfilling>();
6750 }
6751
6752 void PG::RecoveryState::Backfilling::exit()
6753 {
6754 context< RecoveryMachine >().log_exit(state_name, enter_time);
6755 PG *pg = context< RecoveryMachine >().pg;
6756 pg->backfill_reserved = false;
6757 pg->backfill_reserving = false;
6758 pg->state_clear(PG_STATE_BACKFILLING);
6759 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6760 utime_t dur = ceph_clock_now() - enter_time;
6761 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6762 }
6763
6764 /*--WaitRemoteBackfillReserved--*/
6765
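// Backfill reservations are requested from the remote shards one at a time:
// the constructor posts an initial RemoteBackfillReserved event to start the
// iteration, each grant advances backfill_osd_it, and AllBackfillsReserved is
// posted once every shard in remote_shards_to_reserve_backfill has granted.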
6766 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6767 : my_base(ctx),
6768 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6769 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6770 {
6771 context< RecoveryMachine >().log_enter(state_name);
6772 PG *pg = context< RecoveryMachine >().pg;
6773 pg->state_set(PG_STATE_BACKFILL_WAIT);
6774 pg->publish_stats_to_osd();
6775 post_event(RemoteBackfillReserved());
6776 }
6777
6778 boost::statechart::result
6779 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6780 {
6781 PG *pg = context< RecoveryMachine >().pg;
6782
6783 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6784 //The primary never backfills itself
6785 assert(*backfill_osd_it != pg->pg_whoami);
6786 ConnectionRef con = pg->osd->get_con_osd_cluster(
6787 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6788 if (con) {
6789 pg->osd->send_message_osd_cluster(
6790 new MBackfillReserve(
6791 MBackfillReserve::REQUEST,
6792 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6793 pg->get_osdmap()->get_epoch(),
6794 pg->get_backfill_priority()),
6795 con.get());
6796 }
6797 ++backfill_osd_it;
6798 } else {
6799 post_event(AllBackfillsReserved());
6800 }
6801 return discard_event();
6802 }
6803
6804 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6805 {
6806 context< RecoveryMachine >().log_exit(state_name, enter_time);
6807 PG *pg = context< RecoveryMachine >().pg;
6808 utime_t dur = ceph_clock_now() - enter_time;
6809 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6810 }
6811
6812 boost::statechart::result
6813 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6814 {
6815 PG *pg = context< RecoveryMachine >().pg;
6816 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6817
6818 // Send REJECT to all previously acquired reservations
6819 set<pg_shard_t>::const_iterator it, begin, end, next;
6820 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6821 end = context< Active >().remote_shards_to_reserve_backfill.end();
6822 assert(begin != end);
6823 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6824 //The primary never backfills itself
6825 assert(*it != pg->pg_whoami);
6826 ConnectionRef con = pg->osd->get_con_osd_cluster(
6827 it->osd, pg->get_osdmap()->get_epoch());
6828 if (con) {
6829 pg->osd->send_message_osd_cluster(
6830 new MBackfillReserve(
6831 MBackfillReserve::REJECT,
6832 spg_t(pg->info.pgid.pgid, it->shard),
6833 pg->get_osdmap()->get_epoch()),
6834 con.get());
6835 }
6836 }
6837
6838 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6839 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6840 pg->publish_stats_to_osd();
6841
6842 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6843
6844 return transit<NotBackfilling>();
6845 }
6846
6847 /*--WaitLocalBackfillReserved--*/
6848 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6849 : my_base(ctx),
6850 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6851 {
6852 context< RecoveryMachine >().log_enter(state_name);
6853 PG *pg = context< RecoveryMachine >().pg;
6854 pg->state_set(PG_STATE_BACKFILL_WAIT);
6855 pg->osd->local_reserver.request_reservation(
6856 pg->info.pgid,
6857 new QueuePeeringEvt<LocalBackfillReserved>(
6858 pg, pg->get_osdmap()->get_epoch(),
6859 LocalBackfillReserved()),
6860 pg->get_backfill_priority(),
6861 new QueuePeeringEvt<DeferBackfill>(
6862 pg, pg->get_osdmap()->get_epoch(),
6863 DeferBackfill(0.0)));
6864 pg->publish_stats_to_osd();
6865 }
6866
6867 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6868 {
6869 context< RecoveryMachine >().log_exit(state_name, enter_time);
6870 PG *pg = context< RecoveryMachine >().pg;
6871 utime_t dur = ceph_clock_now() - enter_time;
6872 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6873 }
6874
6875 /*----NotBackfilling------*/
6876 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6877 : my_base(ctx),
6878 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6879 {
6880 context< RecoveryMachine >().log_enter(state_name);
6881 PG *pg = context< RecoveryMachine >().pg;
6882 pg->publish_stats_to_osd();
6883 }
6884
6885 boost::statechart::result
6886 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6887 {
6888 return discard_event();
6889 }
6890
6891 boost::statechart::result
6892 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6893 {
6894 return discard_event();
6895 }
6896
6897 void PG::RecoveryState::NotBackfilling::exit()
6898 {
6899 context< RecoveryMachine >().log_exit(state_name, enter_time);
6900 PG *pg = context< RecoveryMachine >().pg;
6901 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
6902 utime_t dur = ceph_clock_now() - enter_time;
6903 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6904 }
6905
6906 /*----NotRecovering------*/
6907 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6908 : my_base(ctx),
6909 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6910 {
6911 context< RecoveryMachine >().log_enter(state_name);
6912 PG *pg = context< RecoveryMachine >().pg;
6913 pg->publish_stats_to_osd();
6914 }
6915
6916 void PG::RecoveryState::NotRecovering::exit()
6917 {
6918 context< RecoveryMachine >().log_exit(state_name, enter_time);
6919 PG *pg = context< RecoveryMachine >().pg;
6920 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
6921 utime_t dur = ceph_clock_now() - enter_time;
6922 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6923 }
6924
6925 /*---RepNotRecovering----*/
6926 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6927 : my_base(ctx),
6928 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6929 {
6930 context< RecoveryMachine >().log_enter(state_name);
6931 }
6932
6933 boost::statechart::result
6934 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
6935 {
6936 PG *pg = context< RecoveryMachine >().pg;
6937 pg->reject_reservation();
6938 post_event(RemoteReservationRejected());
6939 return discard_event();
6940 }
6941
6942 void PG::RecoveryState::RepNotRecovering::exit()
6943 {
6944 context< RecoveryMachine >().log_exit(state_name, enter_time);
6945 PG *pg = context< RecoveryMachine >().pg;
6946 utime_t dur = ceph_clock_now() - enter_time;
6947 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6948 }
6949
6950 /*---RepWaitRecoveryReserved--*/
6951 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6952 : my_base(ctx),
6953 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6954 {
6955 context< RecoveryMachine >().log_enter(state_name);
6956 PG *pg = context< RecoveryMachine >().pg;
6957
6958 pg->osd->remote_reserver.request_reservation(
6959 pg->info.pgid,
6960 new QueuePeeringEvt<RemoteRecoveryReserved>(
6961 pg, pg->get_osdmap()->get_epoch(),
6962 RemoteRecoveryReserved()),
6963 pg->get_recovery_priority());
6964 }
6965
6966 boost::statechart::result
6967 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6968 {
6969 PG *pg = context< RecoveryMachine >().pg;
6970 pg->osd->send_message_osd_cluster(
6971 pg->primary.osd,
6972 new MRecoveryReserve(
6973 MRecoveryReserve::GRANT,
6974 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6975 pg->get_osdmap()->get_epoch()),
6976 pg->get_osdmap()->get_epoch());
6977 return transit<RepRecovering>();
6978 }
6979
6980 boost::statechart::result
6981 PG::RecoveryState::RepWaitRecoveryReserved::react(
6982 const RemoteReservationCanceled &evt)
6983 {
6984 PG *pg = context< RecoveryMachine >().pg;
6985 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6986 return transit<RepNotRecovering>();
6987 }
6988
6989 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6990 {
6991 context< RecoveryMachine >().log_exit(state_name, enter_time);
6992 PG *pg = context< RecoveryMachine >().pg;
6993 utime_t dur = ceph_clock_now() - enter_time;
6994 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6995 }
6996
6997 /*-RepWaitBackfillReserved*/
6998 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6999 : my_base(ctx),
7000 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7001 {
7002 context< RecoveryMachine >().log_enter(state_name);
7003 }
7004
7005 boost::statechart::result
7006 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7007 {
7008 PG *pg = context< RecoveryMachine >().pg;
7009 ostringstream ss;
7010
7011 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7012 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7013 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7014 << dendl;
7015 post_event(RejectRemoteReservation());
7016 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7017 pg->osd->check_backfill_full(ss)) {
7018 ldout(pg->cct, 10) << "backfill reservation rejected: "
7019 << ss.str() << dendl;
7020 post_event(RejectRemoteReservation());
7021 } else {
7022 pg->osd->remote_reserver.request_reservation(
7023 pg->info.pgid,
7024 new QueuePeeringEvt<RemoteBackfillReserved>(
7025 pg, pg->get_osdmap()->get_epoch(),
7026 RemoteBackfillReserved()), evt.priority);
7027 }
7028 return transit<RepWaitBackfillReserved>();
7029 }
7030
7031 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7032 {
7033 context< RecoveryMachine >().log_exit(state_name, enter_time);
7034 PG *pg = context< RecoveryMachine >().pg;
7035 utime_t dur = ceph_clock_now() - enter_time;
7036 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
7037 }
7038
7039 boost::statechart::result
7040 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
7041 {
7042 PG *pg = context< RecoveryMachine >().pg;
7043
7044 ostringstream ss;
7045 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7046 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7047 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7048 << "failure injection" << dendl;
7049 post_event(RejectRemoteReservation());
7050 return discard_event();
7051 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7052 pg->osd->check_backfill_full(ss)) {
7053 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7054 << ss.str() << dendl;
7055 post_event(RejectRemoteReservation());
7056 return discard_event();
7057 } else {
7058 pg->osd->send_message_osd_cluster(
7059 pg->primary.osd,
7060 new MBackfillReserve(
7061 MBackfillReserve::GRANT,
7062 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7063 pg->get_osdmap()->get_epoch()),
7064 pg->get_osdmap()->get_epoch());
7065 return transit<RepRecovering>();
7066 }
7067 }
7068
7069 boost::statechart::result
7070 PG::RecoveryState::RepWaitBackfillReserved::react(
7071 const RejectRemoteReservation &evt)
7072 {
7073 PG *pg = context< RecoveryMachine >().pg;
7074 pg->reject_reservation();
7075 post_event(RemoteReservationRejected());
7076 return discard_event();
7077 }
7078
7079 boost::statechart::result
7080 PG::RecoveryState::RepWaitBackfillReserved::react(
7081 const RemoteReservationRejected &evt)
7082 {
7083 PG *pg = context< RecoveryMachine >().pg;
7084 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7085 return transit<RepNotRecovering>();
7086 }
7087
7088 boost::statechart::result
7089 PG::RecoveryState::RepWaitBackfillReserved::react(
7090 const RemoteReservationCanceled &evt)
7091 {
7092 PG *pg = context< RecoveryMachine >().pg;
7093 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7094 return transit<RepNotRecovering>();
7095 }
7096
7097 /*---RepRecovering-------*/
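// While in RepRecovering the replica holds its remote reservation as the
// primary pushes objects.  A BackfillTooFull event (handled below) rejects
// the reservation mid-flight; the reservation itself is released in exit().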
7098 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
7099 : my_base(ctx),
7100 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
7101 {
7102 context< RecoveryMachine >().log_enter(state_name);
7103 }
7104
7105 boost::statechart::result
7106 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
7107 {
7108 PG *pg = context< RecoveryMachine >().pg;
7109 pg->reject_reservation();
7110 return discard_event();
7111 }
7112
7113 void PG::RecoveryState::RepRecovering::exit()
7114 {
7115 context< RecoveryMachine >().log_exit(state_name, enter_time);
7116 PG *pg = context< RecoveryMachine >().pg;
7117 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7118 utime_t dur = ceph_clock_now() - enter_time;
7119 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
7120 }
7121
7122 /*------Activating--------*/
7123 PG::RecoveryState::Activating::Activating(my_context ctx)
7124 : my_base(ctx),
7125 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
7126 {
7127 context< RecoveryMachine >().log_enter(state_name);
7128 }
7129
7130 void PG::RecoveryState::Activating::exit()
7131 {
7132 context< RecoveryMachine >().log_exit(state_name, enter_time);
7133 PG *pg = context< RecoveryMachine >().pg;
7134 utime_t dur = ceph_clock_now() - enter_time;
7135 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
7136 }
7137
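// Primary-side recovery reservation, step 1: grab a slot from the local
// reserver before asking any replicas.  If an OSD in actingbackfill is too
// full (and the debug skip flag is unset) we post RecoveryTooFull instead;
// the second QueuePeeringEvt below queues DeferRecovery, which is used when
// the local reservation is preempted.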
7138 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
7139 : my_base(ctx),
7140 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
7141 {
7142 context< RecoveryMachine >().log_enter(state_name);
7143 PG *pg = context< RecoveryMachine >().pg;
7144
7145 // Make sure all nodes that part of the recovery aren't full
7145 // Make sure all nodes that are part of the recovery aren't full
7146 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
7147 pg->osd->check_osdmap_full(pg->actingbackfill)) {
7148 post_event(RecoveryTooFull());
7149 return;
7150 }
7151
7152 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7153 pg->state_set(PG_STATE_RECOVERY_WAIT);
7154 pg->osd->local_reserver.request_reservation(
7155 pg->info.pgid,
7156 new QueuePeeringEvt<LocalRecoveryReserved>(
7157 pg, pg->get_osdmap()->get_epoch(),
7158 LocalRecoveryReserved()),
7159 pg->get_recovery_priority(),
7160 new QueuePeeringEvt<DeferRecovery>(
7161 pg, pg->get_osdmap()->get_epoch(),
7162 DeferRecovery(0.0)));
7163 pg->publish_stats_to_osd();
7164 }
7165
7166 boost::statechart::result
7167 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
7168 {
7169 PG *pg = context< RecoveryMachine >().pg;
7170 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
7171 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
7172 return transit<NotRecovering>();
7173 }
7174
7175 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7176 {
7177 context< RecoveryMachine >().log_exit(state_name, enter_time);
7178 PG *pg = context< RecoveryMachine >().pg;
7179 utime_t dur = ceph_clock_now() - enter_time;
7180 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
7181 }
7182
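// Primary-side recovery reservation, step 2: reserve each remote shard in
// turn.  The constructor immediately posts RemoteRecoveryReserved to itself;
// each time that event arrives the handler sends MRecoveryReserve::REQUEST to
// the next shard in remote_shards_to_reserve_recovery and waits for the grant
// (which loops back as another RemoteRecoveryReserved).  Once the iterator is
// exhausted we post AllRemotesReserved.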
7183 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
7184 : my_base(ctx),
7185 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7186 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
7187 {
7188 context< RecoveryMachine >().log_enter(state_name);
7189 post_event(RemoteRecoveryReserved());
7190 }
7191
7192 boost::statechart::result
7193 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
7194 PG *pg = context< RecoveryMachine >().pg;
7195
7196 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
7197 assert(*remote_recovery_reservation_it != pg->pg_whoami);
7198 ConnectionRef con = pg->osd->get_con_osd_cluster(
7199 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
7200 if (con) {
7201 pg->osd->send_message_osd_cluster(
7202 new MRecoveryReserve(
7203 MRecoveryReserve::REQUEST,
7204 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
7205 pg->get_osdmap()->get_epoch()),
7206 con.get());
7207 }
7208 ++remote_recovery_reservation_it;
7209 } else {
7210 post_event(AllRemotesReserved());
7211 }
7212 return discard_event();
7213 }
7214
7215 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7216 {
7217 context< RecoveryMachine >().log_exit(state_name, enter_time);
7218 PG *pg = context< RecoveryMachine >().pg;
7219 utime_t dur = ceph_clock_now() - enter_time;
7220 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
7221 }
7222
7223 PG::RecoveryState::Recovering::Recovering(my_context ctx)
7224 : my_base(ctx),
7225 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
7226 {
7227 context< RecoveryMachine >().log_enter(state_name);
7228
7229 PG *pg = context< RecoveryMachine >().pg;
7230 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7231 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7232 pg->state_set(PG_STATE_RECOVERING);
7233 assert(!pg->state_test(PG_STATE_ACTIVATING));
7234 pg->publish_stats_to_osd();
7235 pg->queue_recovery();
7236 }
7237
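// Drop the remote recovery reservations held on other shards.  With
// cancel == false this is only legal once nothing is missing any more (see
// the assert); with cancel == true it is used when recovery is deferred or
// hits unfound objects.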
7238 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7239 {
7240 PG *pg = context< RecoveryMachine >().pg;
7241 assert(cancel || !pg->pg_log.get_missing().have_missing());
7242
7243 // release remote reservations
7244 for (set<pg_shard_t>::const_iterator i =
7245 context< Active >().remote_shards_to_reserve_recovery.begin();
7246 i != context< Active >().remote_shards_to_reserve_recovery.end();
7247 ++i) {
7248 if (*i == pg->pg_whoami) // skip myself
7249 continue;
7250 ConnectionRef con = pg->osd->get_con_osd_cluster(
7251 i->osd, pg->get_osdmap()->get_epoch());
7252 if (con) {
7253 pg->osd->send_message_osd_cluster(
7254 new MRecoveryReserve(
7255 MRecoveryReserve::RELEASE,
7256 spg_t(pg->info.pgid.pgid, i->shard),
7257 pg->get_osdmap()->get_epoch()),
7258 con.get());
7259 }
7260 }
7261 }
7262
7263 boost::statechart::result
7264 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
7265 {
7266 PG *pg = context< RecoveryMachine >().pg;
7267 pg->state_clear(PG_STATE_RECOVERING);
7268 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7269 release_reservations();
7270 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7271 return transit<Recovered>();
7272 }
7273
7274 boost::statechart::result
7275 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
7276 {
7277 PG *pg = context< RecoveryMachine >().pg;
7278 pg->state_clear(PG_STATE_RECOVERING);
7279 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7280 release_reservations();
7281 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7282 // XXX: Is this needed?
7283 pg->publish_stats_to_osd();
7284 return transit<WaitLocalBackfillReserved>();
7285 }
7286
7287 boost::statechart::result
7288 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7289 {
7290 PG *pg = context< RecoveryMachine >().pg;
7291 if (!pg->state_test(PG_STATE_RECOVERING)) {
7292 // we may have finished recovery and have an AllReplicasRecovered
7293 // event queued to move us to the next state.
7294 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
7295 return discard_event();
7296 }
7297 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7298 pg->state_clear(PG_STATE_RECOVERING);
7299 pg->state_set(PG_STATE_RECOVERY_WAIT);
7300 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7301 release_reservations(true);
7302 pg->schedule_recovery_retry(evt.delay);
7303 return transit<NotRecovering>();
7304 }
7305
7306 boost::statechart::result
7307 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
7308 {
7309 PG *pg = context< RecoveryMachine >().pg;
7310 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
7311 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
7312 pg->state_clear(PG_STATE_RECOVERING);
7313 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7314 release_reservations(true);
7315 return transit<NotRecovering>();
7316 }
7317
7318 void PG::RecoveryState::Recovering::exit()
7319 {
7320 context< RecoveryMachine >().log_exit(state_name, enter_time);
7321 PG *pg = context< RecoveryMachine >().pg;
7322 utime_t dur = ceph_clock_now() - enter_time;
7323 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7324 }
7325
7326 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7327 : my_base(ctx),
7328 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7329 {
7330 pg_shard_t auth_log_shard;
7331
7332 context< RecoveryMachine >().log_enter(state_name);
7333
7334 PG *pg = context< RecoveryMachine >().pg;
7335
7336 assert(!pg->needs_recovery());
7337
7338 // if we finished backfill, all acting are active; recheck if
7339 // DEGRADED | UNDERSIZED is appropriate.
7340 assert(!pg->actingbackfill.empty());
7341 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7342 pg->actingbackfill.size()) {
7343 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7344 pg->publish_stats_to_osd();
7345 }
7346
7347 // trim pglog on recovered
7348 pg->trim_log();
7349
7350 // adjust acting set? (e.g. because backfill completed...)
7351 bool history_les_bound = false;
7352 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7353 true, &history_les_bound))
7354 assert(pg->want_acting.size());
7355
7356 if (context< Active >().all_replicas_activated)
7357 post_event(GoClean());
7358 }
7359
7360 void PG::RecoveryState::Recovered::exit()
7361 {
7362 context< RecoveryMachine >().log_exit(state_name, enter_time);
7363 PG *pg = context< RecoveryMachine >().pg;
7364 utime_t dur = ceph_clock_now() - enter_time;
7365 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7366 }
7367
7368 PG::RecoveryState::Clean::Clean(my_context ctx)
7369 : my_base(ctx),
7370 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7371 {
7372 context< RecoveryMachine >().log_enter(state_name);
7373
7374 PG *pg = context< RecoveryMachine >().pg;
7375
7376 if (pg->info.last_complete != pg->info.last_update) {
7377 ceph_abort();
7378 }
7379 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7380
7381 if (pg->is_active()) {
7382 pg->mark_clean();
7383 }
7384
7385 pg->share_pg_info();
7386 pg->publish_stats_to_osd();
7387 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7388 }
7389
7390 void PG::RecoveryState::Clean::exit()
7391 {
7392 context< RecoveryMachine >().log_exit(state_name, enter_time);
7393 PG *pg = context< RecoveryMachine >().pg;
7394 pg->state_clear(PG_STATE_CLEAN);
7395 utime_t dur = ceph_clock_now() - enter_time;
7396 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7397 }
7398
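// Helper: collapse a collection of pg_shard_t down to at most one shard per
// OSD, excluding 'skip' (normally ourselves).  Used below to build the lists
// of remote shards that need recovery/backfill reservations, so the same OSD
// is never asked twice.
// Illustrative example (hypothetical values):
//   skip = (3,0), in = {(3,0),(5,0),(5,1),(7,2)}  ->  out = {(5,0),(7,2)}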
7399 template <typename T>
7400 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7401 {
7402 set<int> osds_found;
7403 set<pg_shard_t> out;
7404 for (typename T::const_iterator i = in.begin();
7405 i != in.end();
7406 ++i) {
7407 if (*i != skip && !osds_found.count(i->osd)) {
7408 osds_found.insert(i->osd);
7409 out.insert(*i);
7410 }
7411 }
7412 return out;
7413 }
7414
7415 /*---------Active---------*/
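// Primary 'Active' state: everything from activation until the next interval
// change.  The constructor precomputes which remote shards will need
// recovery/backfill reservations, calls activate(), and records every other
// actingbackfill shard in blocked_by until it confirms activation (see the
// MInfoRec handler and AllReplicasActivated below).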
7416 PG::RecoveryState::Active::Active(my_context ctx)
7417 : my_base(ctx),
7418 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7419 remote_shards_to_reserve_recovery(
7420 unique_osd_shard_set(
7421 context< RecoveryMachine >().pg->pg_whoami,
7422 context< RecoveryMachine >().pg->actingbackfill)),
7423 remote_shards_to_reserve_backfill(
7424 unique_osd_shard_set(
7425 context< RecoveryMachine >().pg->pg_whoami,
7426 context< RecoveryMachine >().pg->backfill_targets)),
7427 all_replicas_activated(false)
7428 {
7429 context< RecoveryMachine >().log_enter(state_name);
7430
7431 PG *pg = context< RecoveryMachine >().pg;
7432
7433 assert(!pg->backfill_reserving);
7434 assert(!pg->backfill_reserved);
7435 assert(pg->is_primary());
7436 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7437 pg->start_flush(
7438 context< RecoveryMachine >().get_cur_transaction(),
7439 context< RecoveryMachine >().get_on_applied_context_list(),
7440 context< RecoveryMachine >().get_on_safe_context_list());
7441 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7442 pg->get_osdmap()->get_epoch(),
7443 *context< RecoveryMachine >().get_on_safe_context_list(),
7444 *context< RecoveryMachine >().get_query_map(),
7445 context< RecoveryMachine >().get_info_map(),
7446 context< RecoveryMachine >().get_recovery_ctx());
7447
7448 // everyone has to commit/ack before we are truly active
7449 pg->blocked_by.clear();
7450 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7451 p != pg->actingbackfill.end();
7452 ++p) {
7453 if (p->shard != pg->pg_whoami.shard) {
7454 pg->blocked_by.insert(p->shard);
7455 }
7456 }
7457 pg->publish_stats_to_osd();
7458 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7459 }
7460
7461 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7462 {
7463 PG *pg = context< RecoveryMachine >().pg;
7464 ldout(pg->cct, 10) << "Active advmap" << dendl;
7465 if (!pg->pool.newly_removed_snaps.empty()) {
7466 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7467 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7468 pg->dirty_info = true;
7469 pg->dirty_big_info = true;
7470 }
7471
7472 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7473 int osd = pg->want_acting[i];
7474 if (!advmap.osdmap->is_up(osd)) {
7475 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7476 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7477 }
7478 }
7479
7480 bool need_publish = false;
7481 /* Check for changes in pool size (if the acting set changed as a result,
7482 * this does not matter) */
7483 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7484 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7485 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7486 pg->state_clear(PG_STATE_UNDERSIZED);
7487 } else {
7488 pg->state_set(PG_STATE_UNDERSIZED);
7489 }
7490 // degraded changes will be detected by the call to publish_stats_to_osd()
7491 need_publish = true;
7492 }
7493
7494 // if we haven't reported our PG stats in a long time, do so now.
7495 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7496 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7497 << " epochs" << dendl;
7498 need_publish = true;
7499 }
7500
7501 if (need_publish)
7502 pg->publish_stats_to_osd();
7503
7504 return forward_event();
7505 }
7506
7507 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7508 {
7509 PG *pg = context< RecoveryMachine >().pg;
7510 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7511 assert(pg->is_primary());
7512
7513 if (pg->have_unfound()) {
7514 // object may have become unfound
7515 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7516 }
7517
7518 if (pg->cct->_conf->osd_check_for_log_corruption)
7519 pg->check_log_for_corruption(pg->osd->store);
7520
7521 uint64_t unfound = pg->missing_loc.num_unfound();
7522 if (unfound > 0 &&
7523 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7524 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7525 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7526 << " objects unfound and apparently lost, would automatically "
7527 << "mark these objects lost but this feature is not yet implemented "
7528 << "(osd_auto_mark_unfound_lost)";
7529 } else
7530 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7531 << unfound << " objects unfound and apparently lost";
7532 }
7533
7534 if (pg->is_active()) {
7535 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7536 pg->kick_snap_trim();
7537 }
7538
7539 if (pg->is_peered() &&
7540 !pg->is_clean() &&
7541 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7542 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7543 pg->queue_recovery();
7544 }
7545 return forward_event();
7546 }
7547
7548 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7549 {
7550 PG *pg = context< RecoveryMachine >().pg;
7551 assert(pg->is_primary());
7552 if (pg->peer_info.count(notevt.from)) {
7553 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7554 << ", already have info from that osd, ignoring"
7555 << dendl;
7556 } else if (pg->peer_purged.count(notevt.from)) {
7557 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7558 << ", already purged that peer, ignoring"
7559 << dendl;
7560 } else {
7561 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7562 << ", calling proc_replica_info and discover_all_missing"
7563 << dendl;
7564 pg->proc_replica_info(
7565 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7566 if (pg->have_unfound()) {
7567 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7568 }
7569 }
7570 return discard_event();
7571 }
7572
7573 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7574 {
7575 PG *pg = context< RecoveryMachine >().pg;
7576 assert(pg->is_primary());
7577
7578 assert(!pg->actingbackfill.empty());
7579 // don't update history (yet) if we are active and primary; the replica
7580 // may be telling us they have activated (and committed) but we can't
7581 // share that until _everyone_ does the same.
7582 if (pg->is_actingbackfill(infoevt.from)) {
7583 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7584 << " activated and committed" << dendl;
7585 pg->peer_activated.insert(infoevt.from);
7586 pg->blocked_by.erase(infoevt.from.shard);
7587 pg->publish_stats_to_osd();
7588 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7589 pg->all_activated_and_committed();
7590 }
7591 }
7592 return discard_event();
7593 }
7594
7595 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7596 {
7597 PG *pg = context< RecoveryMachine >().pg;
7598 ldout(pg->cct, 10) << "searching osd." << logevt.from
7599 << " log for unfound items" << dendl;
7600 pg->proc_replica_log(
7601 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7602 bool got_missing = pg->search_for_missing(
7603 pg->peer_info[logevt.from],
7604 pg->peer_missing[logevt.from],
7605 logevt.from,
7606 context< RecoveryMachine >().get_recovery_ctx());
7607 // If there are missing AND we are "fully" active then start recovery now
7608 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
7609 post_event(DoRecovery());
7610 }
7611 return discard_event();
7612 }
7613
7614 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7615 {
7616 PG *pg = context< RecoveryMachine >().pg;
7617
7618 q.f->open_object_section("state");
7619 q.f->dump_string("name", state_name);
7620 q.f->dump_stream("enter_time") << enter_time;
7621
7622 {
7623 q.f->open_array_section("might_have_unfound");
7624 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7625 p != pg->might_have_unfound.end();
7626 ++p) {
7627 q.f->open_object_section("osd");
7628 q.f->dump_stream("osd") << *p;
7629 if (pg->peer_missing.count(*p)) {
7630 q.f->dump_string("status", "already probed");
7631 } else if (pg->peer_missing_requested.count(*p)) {
7632 q.f->dump_string("status", "querying");
7633 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7634 q.f->dump_string("status", "osd is down");
7635 } else {
7636 q.f->dump_string("status", "not queried");
7637 }
7638 q.f->close_section();
7639 }
7640 q.f->close_section();
7641 }
7642 {
7643 q.f->open_object_section("recovery_progress");
7644 pg->dump_recovery_info(q.f);
7645 q.f->close_section();
7646 }
7647
7648 {
7649 q.f->open_object_section("scrub");
7650 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7651 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7652 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7653 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7654 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7655 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
7656 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7657 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7658 {
7659 q.f->open_array_section("scrubber.waiting_on_whom");
7660 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7661 p != pg->scrubber.waiting_on_whom.end();
7662 ++p) {
7663 q.f->dump_stream("shard") << *p;
7664 }
7665 q.f->close_section();
7666 }
7667 q.f->close_section();
7668 }
7669
7670 q.f->close_section();
7671 return forward_event();
7672 }
7673
7674 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7675 {
7676 PG *pg = context< RecoveryMachine >().pg;
7677 all_replicas_activated = true;
7678
7679 pg->state_clear(PG_STATE_ACTIVATING);
7680 pg->state_clear(PG_STATE_CREATING);
7681 if (pg->acting.size() >= pg->pool.info.min_size) {
7682 pg->state_set(PG_STATE_ACTIVE);
7683 } else {
7684 pg->state_set(PG_STATE_PEERED);
7685 }
7686
7687 // info.last_epoch_started is set during activate()
7688 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7689 pg->info.history.last_interval_started = pg->info.last_interval_started;
7690 pg->dirty_info = true;
7691
7692 pg->share_pg_info();
7693 pg->publish_stats_to_osd();
7694
7695 pg->check_local();
7696
7697 // waiters
7698 if (pg->flushes_in_progress == 0) {
7699 pg->requeue_ops(pg->waiting_for_peered);
7700 } else if (!pg->waiting_for_peered.empty()) {
7701 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
7702 << pg->waiting_for_peered.size()
7703 << " items to waiting_for_flush"
7704 << dendl;
7705 assert(pg->waiting_for_flush.empty());
7706 pg->waiting_for_flush.swap(pg->waiting_for_peered);
7707 }
7708
7709 pg->on_activate();
7710
7711 return discard_event();
7712 }
7713
7714 void PG::RecoveryState::Active::exit()
7715 {
7716 context< RecoveryMachine >().log_exit(state_name, enter_time);
7717 PG *pg = context< RecoveryMachine >().pg;
7718 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7719
7720 pg->blocked_by.clear();
7721 pg->backfill_reserved = false;
7722 pg->backfill_reserving = false;
7723 pg->state_clear(PG_STATE_ACTIVATING);
7724 pg->state_clear(PG_STATE_DEGRADED);
7725 pg->state_clear(PG_STATE_UNDERSIZED);
7726 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7727 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7728 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7729 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7730 utime_t dur = ceph_clock_now() - enter_time;
7731 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7732 pg->agent_stop();
7733 }
7734
7735 /*------ReplicaActive-----*/
7736 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7737 : my_base(ctx),
7738 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7739 {
7740 context< RecoveryMachine >().log_enter(state_name);
7741
7742 PG *pg = context< RecoveryMachine >().pg;
7743 pg->start_flush(
7744 context< RecoveryMachine >().get_cur_transaction(),
7745 context< RecoveryMachine >().get_on_applied_context_list(),
7746 context< RecoveryMachine >().get_on_safe_context_list());
7747 }
7748
7749
7750 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7751 const Activate& actevt) {
7752 PG *pg = context< RecoveryMachine >().pg;
7753 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7754 map<int, map<spg_t, pg_query_t> > query_map;
7755 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7756 actevt.activation_epoch,
7757 *context< RecoveryMachine >().get_on_safe_context_list(),
7758 query_map, NULL, NULL);
7759 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7760 return discard_event();
7761 }
7762
7763 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7764 {
7765 PG *pg = context< RecoveryMachine >().pg;
7766 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7767 infoevt.info);
7768 return discard_event();
7769 }
7770
7771 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7772 {
7773 PG *pg = context< RecoveryMachine >().pg;
7774 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7775 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7776 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7777 assert(pg->pg_log.get_head() == pg->info.last_update);
7778
7779 return discard_event();
7780 }
7781
7782 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7783 {
7784 PG *pg = context< RecoveryMachine >().pg;
7785 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7786 context< RecoveryMachine >().send_notify(
7787 pg->get_primary(),
7788 pg_notify_t(
7789 pg->get_primary().shard, pg->pg_whoami.shard,
7790 pg->get_osdmap()->get_epoch(),
7791 pg->get_osdmap()->get_epoch(),
7792 pg->info),
7793 pg->past_intervals);
7794 }
7795 pg->take_waiters();
7796 return discard_event();
7797 }
7798
7799 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7800 {
7801 PG *pg = context< RecoveryMachine >().pg;
7802 if (query.query.type == pg_query_t::MISSING) {
7803 pg->update_history(query.query.history);
7804 pg->fulfill_log(query.from, query.query, query.query_epoch);
7805 } // else: from prior to activation, safe to ignore
7806 return discard_event();
7807 }
7808
7809 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7810 {
7811 q.f->open_object_section("state");
7812 q.f->dump_string("name", state_name);
7813 q.f->dump_stream("enter_time") << enter_time;
7814 q.f->close_section();
7815 return forward_event();
7816 }
7817
7818 void PG::RecoveryState::ReplicaActive::exit()
7819 {
7820 context< RecoveryMachine >().log_exit(state_name, enter_time);
7821 PG *pg = context< RecoveryMachine >().pg;
7822 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7823 utime_t dur = ceph_clock_now() - enter_time;
7824 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7825 }
7826
7827 /*-------Stray---*/
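// 'Stray' covers a non-primary PG instance that has not (yet) been told it is
// part of the acting set.  It answers queries and waits: an MLogRec or
// MInfoRec from the primary carries enough state to activate, at which point
// we post Activate and transition to ReplicaActive.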
7828 PG::RecoveryState::Stray::Stray(my_context ctx)
7829 : my_base(ctx),
7830 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7831 {
7832 context< RecoveryMachine >().log_enter(state_name);
7833
7834 PG *pg = context< RecoveryMachine >().pg;
7835 assert(!pg->is_peered());
7836 assert(!pg->is_peering());
7837 assert(!pg->is_primary());
7838 pg->start_flush(
7839 context< RecoveryMachine >().get_cur_transaction(),
7840 context< RecoveryMachine >().get_on_applied_context_list(),
7841 context< RecoveryMachine >().get_on_safe_context_list());
7842 }
7843
7844 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7845 {
7846 PG *pg = context< RecoveryMachine >().pg;
7847 MOSDPGLog *msg = logevt.msg.get();
7848 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7849
7850 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7851 if (msg->info.last_backfill == hobject_t()) {
7852 // restart backfill
7853 pg->unreg_next_scrub();
7854 pg->info = msg->info;
7855 pg->reg_next_scrub();
7856 pg->dirty_info = true;
7857 pg->dirty_big_info = true; // maybe.
7858
7859 PGLogEntryHandler rollbacker{pg, t};
7860 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7861
7862 pg->pg_log.reset_backfill();
7863 } else {
7864 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7865 }
7866
7867 assert(pg->pg_log.get_head() == pg->info.last_update);
7868
7869 post_event(Activate(logevt.msg->info.last_epoch_started));
7870 return transit<ReplicaActive>();
7871 }
7872
7873 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7874 {
7875 PG *pg = context< RecoveryMachine >().pg;
7876 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7877
7878 if (pg->info.last_update > infoevt.info.last_update) {
7879 // rewind divergent log entries
7880 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7881 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7882 pg->info.stats = infoevt.info.stats;
7883 pg->info.hit_set = infoevt.info.hit_set;
7884 }
7885
7886 assert(infoevt.info.last_update == pg->info.last_update);
7887 assert(pg->pg_log.get_head() == pg->info.last_update);
7888
7889 post_event(Activate(infoevt.info.last_epoch_started));
7890 return transit<ReplicaActive>();
7891 }
7892
7893 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7894 {
7895 PG *pg = context< RecoveryMachine >().pg;
7896 if (query.query.type == pg_query_t::INFO) {
7897 pair<pg_shard_t, pg_info_t> notify_info;
7898 pg->update_history(query.query.history);
7899 pg->fulfill_info(query.from, query.query, notify_info);
7900 context< RecoveryMachine >().send_notify(
7901 notify_info.first,
7902 pg_notify_t(
7903 notify_info.first.shard, pg->pg_whoami.shard,
7904 query.query_epoch,
7905 pg->get_osdmap()->get_epoch(),
7906 notify_info.second),
7907 pg->past_intervals);
7908 } else {
7909 pg->fulfill_log(query.from, query.query, query.query_epoch);
7910 }
7911 return discard_event();
7912 }
7913
7914 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7915 {
7916 PG *pg = context< RecoveryMachine >().pg;
7917 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7918 context< RecoveryMachine >().send_notify(
7919 pg->get_primary(),
7920 pg_notify_t(
7921 pg->get_primary().shard, pg->pg_whoami.shard,
7922 pg->get_osdmap()->get_epoch(),
7923 pg->get_osdmap()->get_epoch(),
7924 pg->info),
7925 pg->past_intervals);
7926 }
7927 pg->take_waiters();
7928 return discard_event();
7929 }
7930
7931 void PG::RecoveryState::Stray::exit()
7932 {
7933 context< RecoveryMachine >().log_exit(state_name, enter_time);
7934 PG *pg = context< RecoveryMachine >().pg;
7935 utime_t dur = ceph_clock_now() - enter_time;
7936 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7937 }
7938
7939 /*--------GetInfo---------*/
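// Peering, step 1: build the prior set and collect pg_info_t from every OSD
// we need to probe.  blocked_by records the peers we are still waiting on
// (published with the pg stats); GotInfo is posted once peer_info_requested
// drains, or IsDown if the prior set says the PG cannot peer at all.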
7940 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7941 : my_base(ctx),
7942 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7943 {
7944 context< RecoveryMachine >().log_enter(state_name);
7945
7946 PG *pg = context< RecoveryMachine >().pg;
7947 pg->check_past_interval_bounds();
7948 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7949
7950 assert(pg->blocked_by.empty());
7951
7952 prior_set = pg->build_prior();
7953
7954 pg->reset_min_peer_features();
7955 get_infos();
7956 if (prior_set.pg_down) {
7957 post_event(IsDown());
7958 } else if (peer_info_requested.empty()) {
7959 post_event(GotInfo());
7960 }
7961 }
7962
7963 void PG::RecoveryState::GetInfo::get_infos()
7964 {
7965 PG *pg = context< RecoveryMachine >().pg;
7966 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7967
7968 pg->blocked_by.clear();
7969 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7970 it != prior_set.probe.end();
7971 ++it) {
7972 pg_shard_t peer = *it;
7973 if (peer == pg->pg_whoami) {
7974 continue;
7975 }
7976 if (pg->peer_info.count(peer)) {
7977 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7978 continue;
7979 }
7980 if (peer_info_requested.count(peer)) {
7981 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7982 pg->blocked_by.insert(peer.osd);
7983 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7984 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7985 } else {
7986 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7987 context< RecoveryMachine >().send_query(
7988 peer, pg_query_t(pg_query_t::INFO,
7989 it->shard, pg->pg_whoami.shard,
7990 pg->info.history,
7991 pg->get_osdmap()->get_epoch()));
7992 peer_info_requested.insert(peer);
7993 pg->blocked_by.insert(peer.osd);
7994 }
7995 }
7996
7997 pg->publish_stats_to_osd();
7998 }
7999
8000 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
8001 {
8002 PG *pg = context< RecoveryMachine >().pg;
8003
8004 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
8005 if (p != peer_info_requested.end()) {
8006 peer_info_requested.erase(p);
8007 pg->blocked_by.erase(infoevt.from.osd);
8008 }
8009
8010 epoch_t old_start = pg->info.history.last_epoch_started;
8011 if (pg->proc_replica_info(
8012 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
8013 // we got something new ...
8014 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8015 if (old_start < pg->info.history.last_epoch_started) {
8016 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
8017 prior_set = pg->build_prior();
8018
8019 // filter out any osds that got dropped from the probe set from
8020 // peer_info_requested. this is less expensive than restarting
8021 // peering (which would re-probe everyone).
8022 set<pg_shard_t>::iterator p = peer_info_requested.begin();
8023 while (p != peer_info_requested.end()) {
8024 if (prior_set.probe.count(*p) == 0) {
8025 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
8026 peer_info_requested.erase(p++);
8027 } else {
8028 ++p;
8029 }
8030 }
8031 get_infos();
8032 }
8033 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
8034 << hex << infoevt.features << dec << dendl;
8035 pg->apply_peer_features(infoevt.features);
8036
8037 // are we done getting everything?
8038 if (peer_info_requested.empty() && !prior_set.pg_down) {
8039 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
8040 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
8041 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
8042 post_event(GotInfo());
8043 }
8044 }
8045 return discard_event();
8046 }
8047
8048 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
8049 {
8050 PG *pg = context< RecoveryMachine >().pg;
8051 q.f->open_object_section("state");
8052 q.f->dump_string("name", state_name);
8053 q.f->dump_stream("enter_time") << enter_time;
8054
8055 q.f->open_array_section("requested_info_from");
8056 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
8057 p != peer_info_requested.end();
8058 ++p) {
8059 q.f->open_object_section("osd");
8060 q.f->dump_stream("osd") << *p;
8061 if (pg->peer_info.count(*p)) {
8062 q.f->open_object_section("got_info");
8063 pg->peer_info[*p].dump(q.f);
8064 q.f->close_section();
8065 }
8066 q.f->close_section();
8067 }
8068 q.f->close_section();
8069
8070 q.f->close_section();
8071 return forward_event();
8072 }
8073
8074 void PG::RecoveryState::GetInfo::exit()
8075 {
8076 context< RecoveryMachine >().log_exit(state_name, enter_time);
8077 PG *pg = context< RecoveryMachine >().pg;
8078 utime_t dur = ceph_clock_now() - enter_time;
8079 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
8080 pg->blocked_by.clear();
8081 pg->publish_stats_to_osd();
8082 }
8083
8084 /*------GetLog------------*/
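// Peering, step 2: pick the authoritative log shard via choose_acting() and,
// unless we are that shard ourselves, request its log.  request_log_from is
// pulled back to the oldest peer last_update that falls below our own
// log_tail but still inside the auth shard's log, so the fetched log should
// be long enough to cover those peers as well.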
8085 PG::RecoveryState::GetLog::GetLog(my_context ctx)
8086 : my_base(ctx),
8087 NamedState(
8088 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
8089 msg(0)
8090 {
8091 context< RecoveryMachine >().log_enter(state_name);
8092
8093 PG *pg = context< RecoveryMachine >().pg;
8094
8095 // adjust acting?
8096 if (!pg->choose_acting(auth_log_shard, false,
8097 &context< Peering >().history_les_bound)) {
8098 if (!pg->want_acting.empty()) {
8099 post_event(NeedActingChange());
8100 } else {
8101 post_event(IsIncomplete());
8102 }
8103 return;
8104 }
8105
8106 // am i the best?
8107 if (auth_log_shard == pg->pg_whoami) {
8108 post_event(GotLog());
8109 return;
8110 }
8111
8112 const pg_info_t& best = pg->peer_info[auth_log_shard];
8113
8114 // am i broken?
8115 if (pg->info.last_update < best.log_tail) {
8116 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
8117 post_event(IsIncomplete());
8118 return;
8119 }
8120
8121 // how much log to request?
8122 eversion_t request_log_from = pg->info.last_update;
8123 assert(!pg->actingbackfill.empty());
8124 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
8125 p != pg->actingbackfill.end();
8126 ++p) {
8127 if (*p == pg->pg_whoami) continue;
8128 pg_info_t& ri = pg->peer_info[*p];
8129 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
8130 ri.last_update < request_log_from)
8131 request_log_from = ri.last_update;
8132 }
8133
8134 // how much?
8135 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
8136 context<RecoveryMachine>().send_query(
8137 auth_log_shard,
8138 pg_query_t(
8139 pg_query_t::LOG,
8140 auth_log_shard.shard, pg->pg_whoami.shard,
8141 request_log_from, pg->info.history,
8142 pg->get_osdmap()->get_epoch()));
8143
8144 assert(pg->blocked_by.empty());
8145 pg->blocked_by.insert(auth_log_shard.osd);
8146 pg->publish_stats_to_osd();
8147 }
8148
8149 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
8150 {
8151 PG *pg = context< RecoveryMachine >().pg;
8152 // make sure our log source didn't go down. we need to check
8153 // explicitly because it may not be part of the prior set, which
8154 // means the Peering state check won't catch it going down.
8155 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
8156 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
8157 << auth_log_shard.osd << " went down" << dendl;
8158 post_event(advmap);
8159 return transit< Reset >();
8160 }
8161
8162 // let the Peering state do its checks.
8163 return forward_event();
8164 }
8165
8166 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
8167 {
8168 PG *pg = context< RecoveryMachine >().pg;
8169 assert(!msg);
8170 if (logevt.from != auth_log_shard) {
8171 ldout(pg->cct, 10) << "GetLog: discarding log from "
8172 << "non-auth_log_shard osd." << logevt.from << dendl;
8173 return discard_event();
8174 }
8175 ldout(pg->cct, 10) << "GetLog: received master log from osd"
8176 << logevt.from << dendl;
8177 msg = logevt.msg;
8178 post_event(GotLog());
8179 return discard_event();
8180 }
8181
8182 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
8183 {
8184 PG *pg = context< RecoveryMachine >().pg;
8185 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
8186 if (msg) {
8187 ldout(pg->cct, 10) << "processing master log" << dendl;
8188 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
8189 msg->info, msg->log, msg->missing,
8190 auth_log_shard);
8191 }
8192 pg->start_flush(
8193 context< RecoveryMachine >().get_cur_transaction(),
8194 context< RecoveryMachine >().get_on_applied_context_list(),
8195 context< RecoveryMachine >().get_on_safe_context_list());
8196 return transit< GetMissing >();
8197 }
8198
8199 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
8200 {
8201 q.f->open_object_section("state");
8202 q.f->dump_string("name", state_name);
8203 q.f->dump_stream("enter_time") << enter_time;
8204 q.f->dump_stream("auth_log_shard") << auth_log_shard;
8205 q.f->close_section();
8206 return forward_event();
8207 }
8208
8209 void PG::RecoveryState::GetLog::exit()
8210 {
8211 context< RecoveryMachine >().log_exit(state_name, enter_time);
8212 PG *pg = context< RecoveryMachine >().pg;
8213 utime_t dur = ceph_clock_now() - enter_time;
8214 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
8215 pg->blocked_by.clear();
8216 pg->publish_stats_to_osd();
8217 }
8218
8219 /*------WaitActingChange--------*/
8220 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
8221 : my_base(ctx),
8222 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
8223 {
8224 context< RecoveryMachine >().log_enter(state_name);
8225 }
8226
8227 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
8228 {
8229 PG *pg = context< RecoveryMachine >().pg;
8230 OSDMapRef osdmap = advmap.osdmap;
8231
8232 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
8233 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
8234 if (!osdmap->is_up(*p)) {
8235 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
8236 post_event(advmap);
8237 return transit< Reset >();
8238 }
8239 }
8240 return forward_event();
8241 }
8242
8243 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
8244 {
8245 PG *pg = context< RecoveryMachine >().pg;
8246 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
8247 return discard_event();
8248 }
8249
8250 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
8251 {
8252 PG *pg = context< RecoveryMachine >().pg;
8253 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
8254 return discard_event();
8255 }
8256
8257 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
8258 {
8259 PG *pg = context< RecoveryMachine >().pg;
8260 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
8261 return discard_event();
8262 }
8263
8264 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
8265 {
8266 q.f->open_object_section("state");
8267 q.f->dump_string("name", state_name);
8268 q.f->dump_stream("enter_time") << enter_time;
8269 q.f->dump_string("comment", "waiting for pg acting set to change");
8270 q.f->close_section();
8271 return forward_event();
8272 }
8273
8274 void PG::RecoveryState::WaitActingChange::exit()
8275 {
8276 context< RecoveryMachine >().log_exit(state_name, enter_time);
8277 PG *pg = context< RecoveryMachine >().pg;
8278 utime_t dur = ceph_clock_now() - enter_time;
8279 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
8280 }
8281
8282 /*------Down--------*/
8283 PG::RecoveryState::Down::Down(my_context ctx)
8284 : my_base(ctx),
8285 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
8286 {
8287 context< RecoveryMachine >().log_enter(state_name);
8288 PG *pg = context< RecoveryMachine >().pg;
8289
8290 pg->state_clear(PG_STATE_PEERING);
8291 pg->state_set(PG_STATE_DOWN);
8292
8293 auto &prior_set = context< Peering >().prior_set;
8294 assert(pg->blocked_by.empty());
8295 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8296 pg->publish_stats_to_osd();
8297 }
8298
8299 void PG::RecoveryState::Down::exit()
8300 {
8301 context< RecoveryMachine >().log_exit(state_name, enter_time);
8302 PG *pg = context< RecoveryMachine >().pg;
8303
8304 pg->state_clear(PG_STATE_DOWN);
8305 utime_t dur = ceph_clock_now() - enter_time;
8306 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
8307
8308 pg->blocked_by.clear();
8309 pg->publish_stats_to_osd();
8310 }
8311
8312 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8313 {
8314 q.f->open_object_section("state");
8315 q.f->dump_string("name", state_name);
8316 q.f->dump_stream("enter_time") << enter_time;
8317 q.f->dump_string("comment",
8318 "not enough up instances of this PG to go active");
8319 q.f->close_section();
8320 return forward_event();
8321 }
8322
8323 /*------Incomplete--------*/
8324 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8325 : my_base(ctx),
8326 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8327 {
8328 context< RecoveryMachine >().log_enter(state_name);
8329 PG *pg = context< RecoveryMachine >().pg;
8330
8331 pg->state_clear(PG_STATE_PEERING);
8332 pg->state_set(PG_STATE_INCOMPLETE);
8333
8334 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8335 assert(pg->blocked_by.empty());
8336 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8337 pg->publish_stats_to_osd();
8338 }
8339
8340 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8341 PG *pg = context< RecoveryMachine >().pg;
8342 int64_t poolnum = pg->info.pgid.pool();
8343
8344 // Reset if min_size turned smaller than the previous value; the pg might now be able to go active
8345 if (!advmap.osdmap->have_pg_pool(poolnum) ||
8346 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8347 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8348 post_event(advmap);
8349 return transit< Reset >();
8350 }
8351
8352 return forward_event();
8353 }
8354
8355 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8356 PG *pg = context< RecoveryMachine >().pg;
8357 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8358 if (pg->proc_replica_info(
8359 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8360 // We got something new, try again!
8361 return transit< GetLog >();
8362 } else {
8363 return discard_event();
8364 }
8365 }
8366
8367 boost::statechart::result PG::RecoveryState::Incomplete::react(
8368 const QueryState& q)
8369 {
8370 q.f->open_object_section("state");
8371 q.f->dump_string("name", state_name);
8372 q.f->dump_stream("enter_time") << enter_time;
8373 q.f->dump_string("comment", "not enough complete instances of this PG");
8374 q.f->close_section();
8375 return forward_event();
8376 }
8377
8378 void PG::RecoveryState::Incomplete::exit()
8379 {
8380 context< RecoveryMachine >().log_exit(state_name, enter_time);
8381 PG *pg = context< RecoveryMachine >().pg;
8382
8383 pg->state_clear(PG_STATE_INCOMPLETE);
8384 utime_t dur = ceph_clock_now() - enter_time;
8385 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8386
8387 pg->blocked_by.clear();
8388 pg->publish_stats_to_osd();
8389 }
8390
8391 /*------GetMissing--------*/
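// Peering, step 3: obtain a missing set from every actingbackfill peer.  For
// peers whose missing set can be inferred (empty PG, full backfill pending,
// or a log identical to ours) we just clear peer_missing; everyone else gets
// a LOG (or FULLLOG) query starting at their last_epoch_started.  When no
// requests remain outstanding we post Activate, or NeedUpThru if we still
// need an up_thru bump first.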
8392 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8393 : my_base(ctx),
8394 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8395 {
8396 context< RecoveryMachine >().log_enter(state_name);
8397
8398 PG *pg = context< RecoveryMachine >().pg;
8399 assert(!pg->actingbackfill.empty());
8400 eversion_t since;
8401 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8402 i != pg->actingbackfill.end();
8403 ++i) {
8404 if (*i == pg->get_primary()) continue;
8405 const pg_info_t& pi = pg->peer_info[*i];
8406 // reset this to make sure the pg_missing_t is initialized and
8407 // has the correct semantics even if we don't need to get a
8408 // missing set from a shard. This way later additions due to
8409 // lost+unfound delete work properly.
8410 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8411
8412 if (pi.is_empty())
8413 continue; // no pg data, nothing divergent
8414
8415 if (pi.last_update < pg->pg_log.get_tail()) {
8416 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8417 pg->peer_missing[*i].clear();
8418 continue;
8419 }
8420 if (pi.last_backfill == hobject_t()) {
8421 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8422 pg->peer_missing[*i].clear();
8423 continue;
8424 }
8425
8426 if (pi.last_update == pi.last_complete && // peer has no missing
8427 pi.last_update == pg->info.last_update) { // peer is up to date
8428 // replica has no missing and identical log as us. no need to
8429 // pull anything.
8430 // FIXME: we can do better here. if last_update==last_complete we
8431 // can infer the rest!
8432 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8433 pg->peer_missing[*i].clear();
8434 continue;
8435 }
8436
8437 // We pull the log from the peer's last_epoch_started to ensure we
8438 // get enough log to detect divergent updates.
8439 since.epoch = pi.last_epoch_started;
8440 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8441 if (pi.log_tail <= since) {
8442 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8443 context< RecoveryMachine >().send_query(
8444 *i,
8445 pg_query_t(
8446 pg_query_t::LOG,
8447 i->shard, pg->pg_whoami.shard,
8448 since, pg->info.history,
8449 pg->get_osdmap()->get_epoch()));
8450 } else {
8451 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8452 << " (want since " << since << " < log.tail "
8453 << pi.log_tail << ")" << dendl;
8454 context< RecoveryMachine >().send_query(
8455 *i, pg_query_t(
8456 pg_query_t::FULLLOG,
8457 i->shard, pg->pg_whoami.shard,
8458 pg->info.history, pg->get_osdmap()->get_epoch()));
8459 }
8460 peer_missing_requested.insert(*i);
8461 pg->blocked_by.insert(i->osd);
8462 }
8463
8464 if (peer_missing_requested.empty()) {
8465 if (pg->need_up_thru) {
8466 ldout(pg->cct, 10) << " still need up_thru update before going active"
8467 << dendl;
8468 post_event(NeedUpThru());
8469 return;
8470 }
8471
8472 // all good!
8473 post_event(Activate(pg->get_osdmap()->get_epoch()));
8474 } else {
8475 pg->publish_stats_to_osd();
8476 }
8477 }
8478
8479 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8480 {
8481 PG *pg = context< RecoveryMachine >().pg;
8482
8483 peer_missing_requested.erase(logevt.from);
8484 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8485
8486 if (peer_missing_requested.empty()) {
8487 if (pg->need_up_thru) {
8488 ldout(pg->cct, 10) << " still need up_thru update before going active"
8489 << dendl;
8490 post_event(NeedUpThru());
8491 } else {
8492 ldout(pg->cct, 10) << "Got last missing, don't need missing "
8493 << "posting Activate" << dendl;
8494 post_event(Activate(pg->get_osdmap()->get_epoch()));
8495 }
8496 }
8497 return discard_event();
8498 }
8499
8500 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8501 {
8502 PG *pg = context< RecoveryMachine >().pg;
8503 q.f->open_object_section("state");
8504 q.f->dump_string("name", state_name);
8505 q.f->dump_stream("enter_time") << enter_time;
8506
8507 q.f->open_array_section("peer_missing_requested");
8508 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8509 p != peer_missing_requested.end();
8510 ++p) {
8511 q.f->open_object_section("osd");
8512 q.f->dump_stream("osd") << *p;
8513 if (pg->peer_missing.count(*p)) {
8514 q.f->open_object_section("got_missing");
8515 pg->peer_missing[*p].dump(q.f);
8516 q.f->close_section();
8517 }
8518 q.f->close_section();
8519 }
8520 q.f->close_section();
8521
8522 q.f->close_section();
8523 return forward_event();
8524 }
8525
8526 void PG::RecoveryState::GetMissing::exit()
8527 {
8528 context< RecoveryMachine >().log_exit(state_name, enter_time);
8529 PG *pg = context< RecoveryMachine >().pg;
8530 utime_t dur = ceph_clock_now() - enter_time;
8531 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8532 pg->blocked_by.clear();
8533 pg->publish_stats_to_osd();
8534 }
8535
8536 /*------WaitUpThru--------*/
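// Peering, final wait: we hold off going active until the osdmap records an
// up_thru for this OSD covering the current interval; without it a later
// peering attempt could not tell whether this PG actually went active here.
// Each ActMap re-checks need_up_thru and posts Activate once the monitors
// have published the update.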
8537 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8538 : my_base(ctx),
8539 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8540 {
8541 context< RecoveryMachine >().log_enter(state_name);
8542 }
8543
8544 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8545 {
8546 PG *pg = context< RecoveryMachine >().pg;
8547 if (!pg->need_up_thru) {
8548 post_event(Activate(pg->get_osdmap()->get_epoch()));
8549 }
8550 return forward_event();
8551 }
8552
8553 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8554 {
8555 PG *pg = context< RecoveryMachine >().pg;
8556 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8557 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8558 pg->peer_info[logevt.from] = logevt.msg->info;
8559 return discard_event();
8560 }
8561
8562 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8563 {
8564 q.f->open_object_section("state");
8565 q.f->dump_string("name", state_name);
8566 q.f->dump_stream("enter_time") << enter_time;
8567 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8568 q.f->close_section();
8569 return forward_event();
8570 }
8571
8572 void PG::RecoveryState::WaitUpThru::exit()
8573 {
8574 context< RecoveryMachine >().log_exit(state_name, enter_time);
8575 PG *pg = context< RecoveryMachine >().pg;
8576 utime_t dur = ceph_clock_now() - enter_time;
8577 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8578 }
8579
8580 /*----RecoveryState::RecoveryMachine Methods-----*/
8581 #undef dout_prefix
8582 #define dout_prefix *_dout << pg->gen_prefix()
8583
8584 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8585 {
8586 PG *pg = context< RecoveryMachine >().pg;
8587 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8588 pg->osd->pg_recovery_stats.log_enter(state_name);
8589 }
8590
8591 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8592 {
8593 utime_t dur = ceph_clock_now() - enter_time;
8594 PG *pg = context< RecoveryMachine >().pg;
8595 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8596 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8597 event_count, event_time);
8598 event_count = 0;
8599 event_time = utime_t();
8600 }
8601
8602
8603 /*---------------------------------------------------*/
8604 #undef dout_prefix
8605 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8606
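// RecoveryCtx plumbing: start_handle()/end_handle() bracket every event the
// state machine processes.  begin_block_outgoing() redirects the context at a
// BufferedRecoveryMessages holding area so outgoing notifies/queries/infos
// accumulate there until end_block_outgoing() folds them back into the
// original ctx; clear_blocked_outgoing() simply drops the buffer.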
8607 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8608 assert(!rctx);
8609 assert(!orig_ctx);
8610 orig_ctx = new_ctx;
8611 if (new_ctx) {
8612 if (messages_pending_flush) {
8613 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8614 } else {
8615 rctx = *new_ctx;
8616 }
8617 rctx->start_time = ceph_clock_now();
8618 }
8619 }
8620
8621 void PG::RecoveryState::begin_block_outgoing() {
8622 assert(!messages_pending_flush);
8623 assert(orig_ctx);
8624 assert(rctx);
8625 messages_pending_flush = BufferedRecoveryMessages();
8626 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8627 }
8628
8629 void PG::RecoveryState::clear_blocked_outgoing() {
8630 assert(orig_ctx);
8631 assert(rctx);
8632 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8633 }
8634
8635 void PG::RecoveryState::end_block_outgoing() {
8636 assert(messages_pending_flush);
8637 assert(orig_ctx);
8638 assert(rctx);
8639
8640 rctx = RecoveryCtx(*orig_ctx);
8641 rctx->accept_buffered_messages(*messages_pending_flush);
8642 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8643 }
8644
8645 void PG::RecoveryState::end_handle() {
8646 if (rctx) {
8647 utime_t dur = ceph_clock_now() - rctx->start_time;
8648 machine.event_time += dur;
8649 }
8650
8651 machine.event_count++;
8652 rctx = boost::optional<RecoveryCtx>();
8653 orig_ctx = NULL;
8654 }
8655
8656 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8657 {
8658 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8659 << " " << bi.objects.size() << " objects";
8660 if (!bi.objects.empty())
8661 out << " " << bi.objects;
8662 out << ")";
8663 return out;
8664 }
8665
8666 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8667 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8668
8669 #ifdef PG_DEBUG_REFS
8670 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8671 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8672 #endif