1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDRepOp.h"
54 #include "messages/MOSDRepOpReply.h"
55 #include "messages/MOSDRepScrubMap.h"
56 #include "messages/MOSDPGRecoveryDelete.h"
57 #include "messages/MOSDPGRecoveryDeleteReply.h"
58
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
61
62 #ifdef WITH_LTTNG
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
68 #else
69 #define tracepoint(...)
70 #endif
71
72 #include <sstream>
73
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
76 #undef dout_prefix
77 #define dout_prefix _prefix(_dout, this)
78
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80 // easily skip them
81 const string infover_key("_infover");
82 const string info_key("_info");
83 const string biginfo_key("_biginfo");
84 const string epoch_key("_epoch");
85 const string fastinfo_key("_fastinfo");
86
87 template <class T>
88 static ostream& _prefix(std::ostream *_dout, T *t)
89 {
90 return t->gen_prefix(*_dout);
91 }
92
93 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
94 {
95 // Ignore trimming state machine for now
96 if (::strstr(state, "Trimming") != NULL) {
97 return;
98 } else if (pi != nullptr) {
99 pi->enter_state(entime, state);
100 } else {
101 // Store current state since we can't reliably take the PG lock here
102 if ( tmppi == nullptr) {
103 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
104 }
105
106 thispg = pg;
107 tmppi->enter_state(entime, state);
108 }
109 }
110
111 void PGStateHistory::exit(const char* state) {
112 // Ignore trimming state machine for now
113 // Do nothing if PG is being destroyed!
114 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
115 return;
116 } else {
117 bool ilocked = false;
118 if(!thispg->is_locked()) {
119 thispg->lock();
120 ilocked = true;
121 }
122 if (pi == nullptr) {
123 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
124 pi = buffer.back().get();
125 pi->setepoch(thispg->get_osdmap_epoch());
126 }
127
128 pi->exit_state(ceph_clock_now());
129 if (::strcmp(state, "Reset") == 0) {
130 this->reset();
131 }
132 if(ilocked) {
133 thispg->unlock();
134 }
135 }
136 }
137
138 void PGStateHistory::dump(Formatter* f) const {
139 f->open_array_section("history");
140 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
141 f->open_object_section("states");
142 f->dump_stream("epoch") << (*pi)->this_epoch;
143 for (auto she : (*pi)->state_history) {
144 f->dump_string("state", std::get<2>(she));
145 f->dump_stream("enter") << std::get<0>(she);
146 f->dump_stream("exit") << std::get<1>(she);
147 }
148 f->close_section();
149 }
150 f->close_section();
151 }
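// Illustrative sketch (not part of upstream PG.cc): with one recorded
// PGStateInstance, dump() above emits roughly the following when the
// Formatter renders JSON:
//
//   "history": [
//     { "epoch": 1234,
//       "state": "Started/Primary/Active",
//       "enter": "2020-01-01 00:00:00.000000",
//       "exit":  "2020-01-01 00:00:05.000000" }
//   ]
//
// Additional (enter, exit, state) tuples in state_history repeat the
// state/enter/exit keys inside the same "states" object.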
152
153 void PG::get(const char* tag)
154 {
155 int after = ++ref;
156 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
157 << "tag " << (tag ? tag : "(none") << " "
158 << (after - 1) << " -> " << after << dendl;
159 #ifdef PG_DEBUG_REFS
160 std::lock_guard l(_ref_id_lock);
161 _tag_counts[tag]++;
162 #endif
163 }
164
165 void PG::put(const char* tag)
166 {
167 #ifdef PG_DEBUG_REFS
168 {
169 std::lock_guard l(_ref_id_lock);
170 auto tag_counts_entry = _tag_counts.find(tag);
171 ceph_assert(tag_counts_entry != _tag_counts.end());
172 --tag_counts_entry->second;
173 if (tag_counts_entry->second == 0) {
174 _tag_counts.erase(tag_counts_entry);
175 }
176 }
177 #endif
178 auto local_cct = cct;
179 int after = --ref;
180 lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
181 << "tag " << (tag ? tag : "(none") << " "
182 << (after + 1) << " -> " << after
183 << dendl;
184 if (after == 0)
185 delete this;
186 }
187
188 #ifdef PG_DEBUG_REFS
189 uint64_t PG::get_with_id()
190 {
191 ref++;
192 std::lock_guard l(_ref_id_lock);
193 uint64_t id = ++_ref_id;
194 BackTrace bt(0);
195 stringstream ss;
196 bt.print(ss);
197 lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
198 << " got id " << id << " "
199 << (ref - 1) << " -> " << ref
200 << dendl;
201 ceph_assert(!_live_ids.count(id));
202 _live_ids.insert(make_pair(id, ss.str()));
203 return id;
204 }
205
206 void PG::put_with_id(uint64_t id)
207 {
208 int newref = --ref;
209 lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
210 << " put id " << id << " "
211 << (newref + 1) << " -> " << newref
212 << dendl;
213 {
214 std::lock_guard l(_ref_id_lock);
215 ceph_assert(_live_ids.count(id));
216 _live_ids.erase(id);
217 }
218 if (newref == 0)
219 delete this;
220 }
221
222 void PG::dump_live_ids()
223 {
224 std::lock_guard l(_ref_id_lock);
225 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
226 for (map<uint64_t, string>::iterator i = _live_ids.begin();
227 i != _live_ids.end();
228 ++i) {
229 dout(0) << "\t\tid: " << *i << dendl;
230 }
231 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
232 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
233 i != _tag_counts.end();
234 ++i) {
235 dout(0) << "\t\ttag: " << *i << dendl;
236 }
237 }
238 #endif
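// With PG_DEBUG_REFS enabled, references are tracked two ways: get()/put()
// keep per-tag counts in _tag_counts, while get_with_id()/put_with_id()
// additionally record a unique id per reference together with the backtrace
// captured at acquisition time in _live_ids. dump_live_ids() prints both
// maps, which is what makes a leaked PG reference attributable to the call
// site that took it.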
239
240
241 void PGPool::update(CephContext *cct, OSDMapRef map)
242 {
243 const pg_pool_t *pi = map->get_pg_pool(id);
244 if (!pi) {
245 return; // pool has been deleted
246 }
247 info = *pi;
248 name = map->get_pool_name(id);
249
250 bool updated = false;
251 if ((map->get_epoch() != cached_epoch + 1) ||
252 (pi->get_snap_epoch() == map->get_epoch())) {
253 updated = true;
254 }
255
256 if (map->require_osd_release >= CEPH_RELEASE_MIMIC) {
257 // mimic tracks removed_snaps_queue in the OSDmap and purged_snaps
258 // in the pg_info_t, with deltas for both in each OSDMap. we don't
259 // need to (and can't) track it here.
260 cached_removed_snaps.clear();
261 newly_removed_snaps.clear();
262 } else {
263 // legacy (<= luminous) removed_snaps tracking
264 if (updated) {
265 if (pi->maybe_updated_removed_snaps(cached_removed_snaps)) {
266 pi->build_removed_snaps(newly_removed_snaps);
267 if (cached_removed_snaps.subset_of(newly_removed_snaps)) {
268 interval_set<snapid_t> removed_snaps = newly_removed_snaps;
269 newly_removed_snaps.subtract(cached_removed_snaps);
270 cached_removed_snaps.swap(removed_snaps);
271 } else {
272 lgeneric_subdout(cct, osd, 0) << __func__
273 << " cached_removed_snaps shrank from " << cached_removed_snaps
274 << " to " << newly_removed_snaps << dendl;
275 cached_removed_snaps.swap(newly_removed_snaps);
276 newly_removed_snaps.clear();
277 }
278 } else {
279 newly_removed_snaps.clear();
280 }
281 } else {
282 /* 1) map->get_epoch() == cached_epoch + 1 &&
283 * 2) pi->get_snap_epoch() != map->get_epoch()
284 *
285 * From the if branch, 1 && 2 must be true. From 2, we know that
286 * this map didn't change the set of removed snaps. From 1, we
287 * know that our cached_removed_snaps matches the previous map.
288 * Thus, from 1 && 2, cached_removed snaps matches the current
289 * set of removed snaps and all we have to do is clear
290 * newly_removed_snaps.
291 */
292 newly_removed_snaps.clear();
293 }
294 lgeneric_subdout(cct, osd, 20)
295 << "PGPool::update cached_removed_snaps "
296 << cached_removed_snaps
297 << " newly_removed_snaps "
298 << newly_removed_snaps
299 << " snapc " << snapc
300 << (updated ? " (updated)":" (no change)")
301 << dendl;
302 if (cct->_conf->osd_debug_verify_cached_snaps) {
303 interval_set<snapid_t> actual_removed_snaps;
304 pi->build_removed_snaps(actual_removed_snaps);
305 if (!(actual_removed_snaps == cached_removed_snaps)) {
306 lgeneric_derr(cct) << __func__
307 << ": mismatch between the actual removed snaps "
308 << actual_removed_snaps
309 << " and pool.cached_removed_snaps "
310 << " pool.cached_removed_snaps " << cached_removed_snaps
311 << dendl;
312 }
313 ceph_assert(actual_removed_snaps == cached_removed_snaps);
314 }
315 }
316 if (info.is_pool_snaps_mode() && updated) {
317 snapc = pi->get_snap_context();
318 }
319 cached_epoch = map->get_epoch();
320 }
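// Illustrative sketch (not upstream code) of the legacy (<= luminous)
// removed_snaps delta computed above. Suppose the previous map's pool had
// removed snaps 1-3 and the new map additionally removes snap 7:
//
//   interval_set<snapid_t> cached;  // [1~3]      seen at the previous epoch
//   interval_set<snapid_t> now;     // [1~3,7~1]  built from the new map
//   interval_set<snapid_t> delta = now;
//   delta.subtract(cached);         // delta == [7~1] -> newly_removed_snaps
//   cached.swap(now);               //                -> cached_removed_snaps
//
// The subset_of() check above guards against the cached set shrinking,
// which should not happen and is only logged and recovered from.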
321
322 PG::PG(OSDService *o, OSDMapRef curmap,
323 const PGPool &_pool, spg_t p) :
324 pg_id(p),
325 coll(p),
326 osd(o),
327 cct(o->cct),
328 osdmap_ref(curmap),
329 pool(_pool),
330 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
331 snap_mapper(
332 cct,
333 &osdriver,
334 p.ps(),
335 p.get_split_bits(_pool.info.get_pg_num()),
336 _pool.id,
337 p.shard),
338 last_persisted_osdmap(curmap->get_epoch()),
339 deleting(false),
340 trace_endpoint("0.0.0.0", 0, "PG"),
341 dirty_info(false), dirty_big_info(false),
342 info(p),
343 info_struct_v(0),
344 pg_log(cct),
345 pgmeta_oid(p.make_pgmeta_oid()),
346 missing_loc(this),
347 stat_queue_item(this),
348 scrub_queued(false),
349 recovery_queued(false),
350 recovery_ops_active(0),
351 role(-1),
352 state(0),
353 send_notify(false),
354 pg_whoami(osd->whoami, p.shard),
355 need_up_thru(false),
356 last_peering_reset(0),
357 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
358 backfill_reserved(false),
359 backfill_reserving(false),
360 flushes_in_progress(0),
361 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
362 pg_stats_publish_valid(false),
363 finish_sync_event(NULL),
364 backoff_lock("PG::backoff_lock"),
365 scrub_after_recovery(false),
366 active_pushes(0),
367 recovery_state(this),
368 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
369 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
370 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
371 last_epoch(0),
372 last_require_osd_release(curmap->require_osd_release)
373 {
374 #ifdef PG_DEBUG_REFS
375 osd->add_pgid(p, this);
376 #endif
377 #ifdef WITH_BLKIN
378 std::stringstream ss;
379 ss << "PG " << info.pgid;
380 trace_endpoint.copy_name(ss.str());
381 #endif
382 }
383
384 PG::~PG()
385 {
386 pgstate_history.set_pg_in_destructor();
387 #ifdef PG_DEBUG_REFS
388 osd->remove_pgid(info.pgid, this);
389 #endif
390 }
391
392 void PG::lock(bool no_lockdep) const
393 {
394 _lock.Lock(no_lockdep);
395 // if we have unrecorded dirty state with the lock dropped, there is a bug
396 ceph_assert(!dirty_info);
397 ceph_assert(!dirty_big_info);
398
399 dout(30) << "lock" << dendl;
400 }
401
402 std::ostream& PG::gen_prefix(std::ostream& out) const
403 {
404 OSDMapRef mapref = osdmap_ref;
405 if (_lock.is_locked_by_me()) {
406 out << "osd." << osd->whoami
407 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
408 << " " << *this << " ";
409 } else {
410 out << "osd." << osd->whoami
411 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
412 << " pg[" << info.pgid << "(unlocked)] ";
413 }
414 return out;
415 }
416
417 /********* PG **********/
418
419 void PG::proc_master_log(
420 ObjectStore::Transaction& t, pg_info_t &oinfo,
421 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
422 {
423 dout(10) << "proc_master_log for osd." << from << ": "
424 << olog << " " << omissing << dendl;
425 ceph_assert(!is_peered() && is_primary());
426
427 // merge log into our own log to build master log. no need to
428 // make any adjustments to their missing map; we are taking their
429 // log to be authoritative (i.e., their entries are definitely
430 // non-divergent).
431 merge_log(t, oinfo, olog, from);
432 peer_info[from] = oinfo;
433 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
434 might_have_unfound.insert(from);
435
436 // See doc/dev/osd_internals/last_epoch_started
437 if (oinfo.last_epoch_started > info.last_epoch_started) {
438 info.last_epoch_started = oinfo.last_epoch_started;
439 dirty_info = true;
440 }
441 if (oinfo.last_interval_started > info.last_interval_started) {
442 info.last_interval_started = oinfo.last_interval_started;
443 dirty_info = true;
444 }
445 update_history(oinfo.history);
446 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
447 info.last_epoch_started >= info.history.last_epoch_started);
448
449 peer_missing[from].claim(omissing);
450 }
451
452 void PG::proc_replica_log(
453 pg_info_t &oinfo,
454 const pg_log_t &olog,
455 pg_missing_t& omissing,
456 pg_shard_t from)
457 {
458 dout(10) << "proc_replica_log for osd." << from << ": "
459 << oinfo << " " << olog << " " << omissing << dendl;
460
461 pg_log.proc_replica_log(oinfo, olog, omissing, from);
462
463 peer_info[from] = oinfo;
464 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
465 might_have_unfound.insert(from);
466
467 for (map<hobject_t, pg_missing_item>::const_iterator i =
468 omissing.get_items().begin();
469 i != omissing.get_items().end();
470 ++i) {
471 dout(20) << " after missing " << i->first << " need " << i->second.need
472 << " have " << i->second.have << dendl;
473 }
474 peer_missing[from].claim(omissing);
475 }
476
477 bool PG::proc_replica_info(
478 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
479 {
480 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
481 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
482 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
483 return false;
484 }
485
486 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
487 dout(10) << " got info " << oinfo << " from down osd." << from
488 << " discarding" << dendl;
489 return false;
490 }
491
492 dout(10) << " got osd." << from << " " << oinfo << dendl;
493 ceph_assert(is_primary());
494 peer_info[from] = oinfo;
495 might_have_unfound.insert(from);
496
497 update_history(oinfo.history);
498
499 // stray?
500 if (!is_up(from) && !is_acting(from)) {
501 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
502 stray_set.insert(from);
503 if (is_clean()) {
504 purge_strays();
505 }
506 }
507
508 // was this a new info? if so, update peers!
509 if (p == peer_info.end())
510 update_heartbeat_peers();
511
512 return true;
513 }
514
515 void PG::remove_snap_mapped_object(
516 ObjectStore::Transaction &t, const hobject_t &soid)
517 {
518 t.remove(
519 coll,
520 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
521 clear_object_snap_mapping(&t, soid);
522 }
523
524 void PG::clear_object_snap_mapping(
525 ObjectStore::Transaction *t, const hobject_t &soid)
526 {
527 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
528 if (soid.snap < CEPH_MAXSNAP) {
529 int r = snap_mapper.remove_oid(
530 soid,
531 &_t);
532 if (!(r == 0 || r == -ENOENT)) {
533 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
534 ceph_abort();
535 }
536 }
537 }
538
539 void PG::update_object_snap_mapping(
540 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
541 {
542 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
543 ceph_assert(soid.snap < CEPH_MAXSNAP);
544 int r = snap_mapper.remove_oid(
545 soid,
546 &_t);
547 if (!(r == 0 || r == -ENOENT)) {
548 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
549 ceph_abort();
550 }
551 snap_mapper.add_oid(
552 soid,
553 snaps,
554 &_t);
555 }
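// Both helpers above follow the same pattern: wrap the ObjectStore
// transaction in an OSDriver::OSTransaction so the SnapMapper updates land
// in the same transaction as the object change, remove the old oid->snaps
// mapping (tolerating -ENOENT), and, for update_object_snap_mapping(),
// re-add the new snap set. Only clones (soid.snap < CEPH_MAXSNAP) are
// tracked by the snap mapper; head and snapdir objects are skipped.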
556
557 void PG::merge_log(
558 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
559 {
560 PGLogEntryHandler rollbacker{this, &t};
561 pg_log.merge_log(
562 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
563 }
564
565 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
566 {
567 PGLogEntryHandler rollbacker{this, &t};
568 pg_log.rewind_divergent_log(
569 newhead, info, &rollbacker, dirty_info, dirty_big_info);
570 }
571
572 /*
573 * Process information from a replica to determine if it could have any
574 * objects that I need.
575 *
576 * TODO: if the missing set becomes very large, this could get expensive.
577 * Instead, we probably want to just iterate over our unfound set.
578 */
579 bool PG::search_for_missing(
580 const pg_info_t &oinfo, const pg_missing_t &omissing,
581 pg_shard_t from,
582 RecoveryCtx *ctx)
583 {
584 uint64_t num_unfound_before = missing_loc.num_unfound();
585 bool found_missing = missing_loc.add_source_info(
586 from, oinfo, omissing, ctx->handle);
587 if (found_missing && num_unfound_before != missing_loc.num_unfound())
588 publish_stats_to_osd();
589 // avoid doing this if the peer is empty. This is a bit of paranoia
590 // to avoid doing something rash if add_source_info() above
591 // incorrectly decided we found something new. (if the peer has
592 // last_update=0'0 that's impossible.)
593 if (found_missing &&
594 oinfo.last_update != eversion_t()) {
595 pg_info_t tinfo(oinfo);
596 tinfo.pgid.shard = pg_whoami.shard;
597 (*(ctx->info_map))[from.osd].push_back(
598 make_pair(
599 pg_notify_t(
600 from.shard, pg_whoami.shard,
601 get_osdmap_epoch(),
602 get_osdmap_epoch(),
603 tinfo),
604 past_intervals));
605 }
606 return found_missing;
607 }
608
609
610 // MissingLoc
611
612 bool PG::MissingLoc::readable_with_acting(
613 const hobject_t &hoid,
614 const set<pg_shard_t> &acting) const {
615 if (!needs_recovery(hoid))
616 return true;
617 if (is_deleted(hoid))
618 return false;
619 auto missing_loc_entry = missing_loc.find(hoid);
620 if (missing_loc_entry == missing_loc.end())
621 return false;
622 const set<pg_shard_t> &locs = missing_loc_entry->second;
623 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
624 set<pg_shard_t> have_acting;
625 for (set<pg_shard_t>::const_iterator i = locs.begin();
626 i != locs.end();
627 ++i) {
628 if (acting.count(*i))
629 have_acting.insert(*i);
630 }
631 return (*is_readable)(have_acting);
632 }
633
634 void PG::MissingLoc::add_batch_sources_info(
635 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
636 {
637 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
638 << sources.size() << dendl;
639 unsigned loop = 0;
640 bool sources_updated = false;
641 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
642 i != needs_recovery_map.end();
643 ++i) {
644 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
645 handle->reset_tp_timeout();
646 loop = 0;
647 }
648 if (i->second.is_delete())
649 continue;
650
651 auto p = missing_loc.find(i->first);
652 if (p == missing_loc.end()) {
653 p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
654 } else {
655 _dec_count(p->second);
656 }
657 missing_loc[i->first].insert(sources.begin(), sources.end());
658 _inc_count(p->second);
659
660 if (!sources_updated) {
661 missing_loc_sources.insert(sources.begin(), sources.end());
662 sources_updated = true;
663 }
664 }
665 }
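// Counting invariant used throughout MissingLoc: every mutation of a
// missing_loc[hoid] shard set is bracketed by _dec_count() before the
// change and _inc_count() after it, keeping the cached per-shard counters
// derived from those sets consistent with the map contents.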
666
667 bool PG::MissingLoc::add_source_info(
668 pg_shard_t fromosd,
669 const pg_info_t &oinfo,
670 const pg_missing_t &omissing,
671 ThreadPool::TPHandle* handle)
672 {
673 bool found_missing = false;
674 unsigned loop = 0;
675 bool sources_updated = false;
676 // found items?
677 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
678 p != needs_recovery_map.end();
679 ++p) {
680 const hobject_t &soid(p->first);
681 eversion_t need = p->second.need;
682 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
683 handle->reset_tp_timeout();
684 loop = 0;
685 }
686 if (p->second.is_delete()) {
687 ldout(pg->cct, 10) << __func__ << " " << soid
688 << " delete, ignoring source" << dendl;
689 continue;
690 }
691 if (oinfo.last_update < need) {
692 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
693 << " also missing on osd." << fromosd
694 << " (last_update " << oinfo.last_update
695 << " < needed " << need << ")" << dendl;
696 continue;
697 }
698 if (!oinfo.last_backfill.is_max() &&
699 !oinfo.last_backfill_bitwise) {
700 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
701 << " also missing on osd." << fromosd
702 << " (last_backfill " << oinfo.last_backfill
703 << " but with wrong sort order)"
704 << dendl;
705 continue;
706 }
707 if (p->first >= oinfo.last_backfill) {
708 // FIXME: this is _probably_ true, although it could conceivably
709 // be in the undefined region! Hmm!
710 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
711 << " also missing on osd." << fromosd
712 << " (past last_backfill " << oinfo.last_backfill
713 << ")" << dendl;
714 continue;
715 }
716 if (omissing.is_missing(soid)) {
717 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
718 << " also missing on osd." << fromosd << dendl;
719 continue;
720 }
721
722 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
723 << " is on osd." << fromosd << dendl;
724
725 {
726 auto p = missing_loc.find(soid);
727 if (p == missing_loc.end()) {
728 p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
729 } else {
730 _dec_count(p->second);
731 }
732 p->second.insert(fromosd);
733 _inc_count(p->second);
734 }
735
736 if (!sources_updated) {
737 missing_loc_sources.insert(fromosd);
738 sources_updated = true;
739 }
740 found_missing = true;
741 }
742
743 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
744 << dendl;
745 return found_missing;
746 }
747
748 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
749 {
750 set<pg_shard_t> now_down;
751 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
752 p != missing_loc_sources.end();
753 ) {
754 if (osdmap->is_up(p->osd)) {
755 ++p;
756 continue;
757 }
758 ldout(pg->cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
759 now_down.insert(*p);
760 missing_loc_sources.erase(p++);
761 }
762
763 if (now_down.empty()) {
764 ldout(pg->cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
765 } else {
766 ldout(pg->cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
767 << missing_loc_sources << dendl;
768
769 // filter missing_loc
770 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
771 while (p != missing_loc.end()) {
772 set<pg_shard_t>::iterator q = p->second.begin();
773 bool changed = false;
774 while (q != p->second.end()) {
775 if (now_down.count(*q)) {
776 if (!changed) {
777 changed = true;
778 _dec_count(p->second);
779 }
780 p->second.erase(q++);
781 } else {
782 ++q;
783 }
784 }
785 if (p->second.empty()) {
786 missing_loc.erase(p++);
787 } else {
788 if (changed) {
789 _inc_count(p->second);
790 }
791 ++p;
792 }
793 }
794 }
795 }
796
797 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
798 {
799 auto &missing = pg_log.get_missing();
800 uint64_t unfound = get_num_unfound();
801
802 dout(10) << __func__ << " "
803 << missing.num_missing() << " missing, "
804 << unfound << " unfound"
805 << dendl;
806
807 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
808 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
809 for (; m != mend; ++m) {
810 pg_shard_t peer(*m);
811
812 if (!get_osdmap()->is_up(peer.osd)) {
813 dout(20) << __func__ << " skipping down osd." << peer << dendl;
814 continue;
815 }
816
817 if (peer_purged.count(peer)) {
818 dout(20) << __func__ << " skipping purged osd." << peer << dendl;
819 continue;
820 }
821
822 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
823 if (iter != peer_info.end() &&
824 (iter->second.is_empty() || iter->second.dne())) {
825 // ignore empty peers
826 continue;
827 }
828
829 // If we've requested any of this stuff, the pg_missing_t information
830 // should be on its way.
831 // TODO: coalesce requested_* into a single data structure
832 if (peer_missing.find(peer) != peer_missing.end()) {
833 dout(20) << __func__ << ": osd." << peer
834 << ": we already have pg_missing_t" << dendl;
835 continue;
836 }
837 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
838 dout(20) << __func__ << ": osd." << peer
839 << ": in peer_log_requested" << dendl;
840 continue;
841 }
842 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
843 dout(20) << __func__ << ": osd." << peer
844 << ": in peer_missing_requested" << dendl;
845 continue;
846 }
847
848 // Request missing
849 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
850 << dendl;
851 peer_missing_requested.insert(peer);
852 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
853 pg_query_t(
854 pg_query_t::FULLLOG,
855 peer.shard, pg_whoami.shard,
856 info.history, get_osdmap_epoch());
857 }
858 }
859
860 /******* PG ***********/
861 bool PG::needs_recovery() const
862 {
863 ceph_assert(is_primary());
864
865 auto &missing = pg_log.get_missing();
866
867 if (missing.num_missing()) {
868 dout(10) << __func__ << " primary has " << missing.num_missing()
869 << " missing" << dendl;
870 return true;
871 }
872
873 ceph_assert(!acting_recovery_backfill.empty());
874 set<pg_shard_t>::const_iterator end = acting_recovery_backfill.end();
875 set<pg_shard_t>::const_iterator a = acting_recovery_backfill.begin();
876 for (; a != end; ++a) {
877 if (*a == get_primary()) continue;
878 pg_shard_t peer = *a;
879 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
880 if (pm == peer_missing.end()) {
881 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
882 << dendl;
883 continue;
884 }
885 if (pm->second.num_missing()) {
886 dout(10) << __func__ << " osd." << peer << " has "
887 << pm->second.num_missing() << " missing" << dendl;
888 return true;
889 }
890 }
891
892 dout(10) << __func__ << " is recovered" << dendl;
893 return false;
894 }
895
896 bool PG::needs_backfill() const
897 {
898 ceph_assert(is_primary());
899
900 // We can assume that the only OSDs that could need backfill
901 // are those in backfill_targets.
902 set<pg_shard_t>::const_iterator end = backfill_targets.end();
903 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
904 for (; a != end; ++a) {
905 pg_shard_t peer = *a;
906 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
907 if (!pi->second.last_backfill.is_max()) {
908 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
909 return true;
910 }
911 }
912
913 dout(10) << __func__ << " does not need backfill" << dendl;
914 return false;
915 }
916
917
918 void PG::check_past_interval_bounds() const
919 {
920 auto oldest_epoch = osd->get_superblock().oldest_map;
921 auto rpib = get_required_past_interval_bounds(
922 info,
923 oldest_epoch);
924 if (rpib.first >= rpib.second) {
925 // do not warn if the start bound is dictated by oldest_map; the
926 // past intervals are presumably appropriate given the pg info.
927 if (!past_intervals.empty() &&
928 rpib.first > oldest_epoch) {
929 osd->clog->error() << info.pgid << " required past_interval bounds are"
930 << " empty [" << rpib << ") but past_intervals is not: "
931 << past_intervals;
932 derr << info.pgid << " required past_interval bounds are"
933 << " empty [" << rpib << ") but past_intervals is not: "
934 << past_intervals << dendl;
935 }
936 } else {
937 if (past_intervals.empty()) {
938 osd->clog->error() << info.pgid << " required past_interval bounds are"
939 << " not empty [" << rpib << ") but past_intervals "
940 << past_intervals << " is empty";
941 derr << info.pgid << " required past_interval bounds are"
942 << " not empty [" << rpib << ") but past_intervals "
943 << past_intervals << " is empty" << dendl;
944 ceph_assert(!past_intervals.empty());
945 }
946
947 auto apib = past_intervals.get_bounds();
948 if (apib.first > rpib.first) {
949 osd->clog->error() << info.pgid << " past_intervals [" << apib
950 << ") start interval does not contain the required"
951 << " bound [" << rpib << ") start";
952 derr << info.pgid << " past_intervals [" << apib
953 << ") start interval does not contain the required"
954 << " bound [" << rpib << ") start" << dendl;
955 ceph_abort_msg("past_interval start interval mismatch");
956 }
957 if (apib.second != rpib.second) {
958 osd->clog->error() << info.pgid << " past_interval bound [" << apib
959 << ") end does not match required [" << rpib
960 << ") end";
961 derr << info.pgid << " past_interval bound [" << apib
962 << ") end does not match required [" << rpib
963 << ") end" << dendl;
964 ceph_abort_msg("past_interval end mismatch");
965 }
966 }
967 }
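// Illustrative example (not upstream code) of the bounds compared above:
// get_required_past_interval_bounds() returns a half-open epoch range
// [first, second). If the required bounds are [20, 35) and
// past_intervals.get_bounds() returns [18, 35), the start check passes
// because the recorded intervals begin at or before epoch 20; an actual
// start of 22 would trip the "start interval does not contain the required
// bound" abort, and any end other than 35 trips the end-mismatch abort.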
968
969 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
970 {
971 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
972 if (need_up_thru &&
973 up_thru >= info.history.same_interval_since) {
974 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
975 need_up_thru = false;
976 return true;
977 }
978 return false;
979 }
980
981 void PG::remove_down_peer_info(const OSDMapRef osdmap)
982 {
983 // Remove any downed osds from peer_info
984 bool removed = false;
985 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
986 while (p != peer_info.end()) {
987 if (!osdmap->is_up(p->first.osd)) {
988 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
989 peer_missing.erase(p->first);
990 peer_log_requested.erase(p->first);
991 peer_missing_requested.erase(p->first);
992 peer_purged.erase(p->first); // so we can re-purge if necessary
993 peer_info.erase(p++);
994 removed = true;
995 } else
996 ++p;
997 }
998
999 // if we removed anyone, update peers (which include peer_info)
1000 if (removed)
1001 update_heartbeat_peers();
1002 check_recovery_sources(osdmap);
1003 }
1004
1005 /*
1006 * Returns true unless there is a non-lost OSD in might_have_unfound.
1007 */
1008 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
1009 {
1010 ceph_assert(is_primary());
1011
1012 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
1013 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
1014 for (; peer != mend; ++peer) {
1015 if (peer_missing.count(*peer))
1016 continue;
1017 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
1018 if (iter != peer_info.end() &&
1019 (iter->second.is_empty() || iter->second.dne()))
1020 continue;
1021 if (!osdmap->exists(peer->osd))
1022 continue;
1023 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
1024 if (osd_info.lost_at <= osd_info.up_from) {
1025 // If there is even one OSD in might_have_unfound that isn't lost, we
1026 // still might retrieve our unfound.
1027 return false;
1028 }
1029 }
1030 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
1031 << " have been queried or are marked lost" << dendl;
1032 return true;
1033 }
1034
1035 PastIntervals::PriorSet PG::build_prior()
1036 {
1037 if (1) {
1038 // sanity check
1039 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
1040 it != peer_info.end();
1041 ++it) {
1042 ceph_assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
1043 }
1044 }
1045
1046 const OSDMap &osdmap = *get_osdmap();
1047 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
1048 pool.info.is_erasure(),
1049 info.history.last_epoch_started,
1050 get_pgbackend()->get_is_recoverable_predicate(),
1051 [&](epoch_t start, int osd, epoch_t *lost_at) {
1052 const osd_info_t *pinfo = 0;
1053 if (osdmap.exists(osd)) {
1054 pinfo = &osdmap.get_info(osd);
1055 if (lost_at)
1056 *lost_at = pinfo->lost_at;
1057 }
1058
1059 if (osdmap.is_up(osd)) {
1060 return PastIntervals::UP;
1061 } else if (!pinfo) {
1062 return PastIntervals::DNE;
1063 } else if (pinfo->lost_at > start) {
1064 return PastIntervals::LOST;
1065 } else {
1066 return PastIntervals::DOWN;
1067 }
1068 },
1069 up,
1070 acting,
1071 this);
1072
1073 if (prior.pg_down) {
1074 state_set(PG_STATE_DOWN);
1075 }
1076
1077 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
1078 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1079 << " < same_since " << info.history.same_interval_since
1080 << ", must notify monitor" << dendl;
1081 need_up_thru = true;
1082 } else {
1083 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
1084 << " >= same_since " << info.history.same_interval_since
1085 << ", all is well" << dendl;
1086 need_up_thru = false;
1087 }
1088 set_probe_targets(prior.probe);
1089 return prior;
1090 }
1091
1092 void PG::clear_primary_state()
1093 {
1094 dout(10) << "clear_primary_state" << dendl;
1095
1096 // clear peering state
1097 stray_set.clear();
1098 peer_log_requested.clear();
1099 peer_missing_requested.clear();
1100 peer_info.clear();
1101 peer_bytes.clear();
1102 peer_missing.clear();
1103 need_up_thru = false;
1104 peer_last_complete_ondisk.clear();
1105 peer_activated.clear();
1106 min_last_complete_ondisk = eversion_t();
1107 pg_trim_to = eversion_t();
1108 might_have_unfound.clear();
1109 projected_log = PGLog::IndexedLog();
1110
1111 last_update_ondisk = eversion_t();
1112
1113 snap_trimq.clear();
1114
1115 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1116
1117 missing_loc.clear();
1118
1119 release_pg_backoffs();
1120
1121 pg_log.reset_recovery_pointers();
1122
1123 scrubber.reserved_peers.clear();
1124 scrub_after_recovery = false;
1125
1126 agent_clear();
1127 }
1128
1129 PG::Scrubber::Scrubber()
1130 : local_reserved(false), remote_reserved(false), reserve_failed(false),
1131 epoch_start(0),
1132 active(false),
1133 shallow_errors(0), deep_errors(0), fixed(0),
1134 must_scrub(false), must_deep_scrub(false), must_repair(false),
1135 need_auto(false), time_for_deep(false),
1136 auto_repair(false),
1137 check_repair(false),
1138 deep_scrub_on_error(false),
1139 num_digest_updates_pending(0),
1140 state(INACTIVE),
1141 deep(false)
1142 {}
1143
1144 PG::Scrubber::~Scrubber() {}
1145
1146 /**
1147 * find_best_info
1148 *
1149 * Returns an iterator to the best info in infos sorted by:
1150 * 1) Prefer newer last_update
1151 * 2) Prefer longer tail if it brings another info into contiguity
1152 * 3) Prefer current primary
1153 */
1154 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1155 const map<pg_shard_t, pg_info_t> &infos,
1156 bool restrict_to_up_acting,
1157 bool *history_les_bound) const
1158 {
1159 ceph_assert(history_les_bound);
1160 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1161 * to make changes to this process. Also, make sure to update it
1162 * when you find bugs! */
1163 eversion_t min_last_update_acceptable = eversion_t::max();
1164 epoch_t max_last_epoch_started_found = 0;
1165 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1166 i != infos.end();
1167 ++i) {
1168 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1169 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1170 *history_les_bound = true;
1171 max_last_epoch_started_found = i->second.history.last_epoch_started;
1172 }
1173 if (!i->second.is_incomplete() &&
1174 max_last_epoch_started_found < i->second.last_epoch_started) {
1175 *history_les_bound = false;
1176 max_last_epoch_started_found = i->second.last_epoch_started;
1177 }
1178 }
1179 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1180 i != infos.end();
1181 ++i) {
1182 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1183 if (min_last_update_acceptable > i->second.last_update)
1184 min_last_update_acceptable = i->second.last_update;
1185 }
1186 }
1187 if (min_last_update_acceptable == eversion_t::max())
1188 return infos.end();
1189
1190 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1191 // find osd with newest last_update (oldest for ec_pool).
1192 // if there are multiples, prefer
1193 // - a longer tail, if it brings another peer into log contiguity
1194 // - the current primary
1195 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1196 p != infos.end();
1197 ++p) {
1198 if (restrict_to_up_acting && !is_up(p->first) &&
1199 !is_acting(p->first))
1200 continue;
1201 // Only consider peers with last_update >= min_last_update_acceptable
1202 if (p->second.last_update < min_last_update_acceptable)
1203 continue;
1204 // Disqualify anyone with a too old last_epoch_started
1205 if (p->second.last_epoch_started < max_last_epoch_started_found)
1206 continue;
1207 // Disqualify anyone who is incomplete (not fully backfilled)
1208 if (p->second.is_incomplete())
1209 continue;
1210 if (best == infos.end()) {
1211 best = p;
1212 continue;
1213 }
1214 // Prefer newer last_update
1215 if (pool.info.require_rollback()) {
1216 if (p->second.last_update > best->second.last_update)
1217 continue;
1218 if (p->second.last_update < best->second.last_update) {
1219 best = p;
1220 continue;
1221 }
1222 } else {
1223 if (p->second.last_update < best->second.last_update)
1224 continue;
1225 if (p->second.last_update > best->second.last_update) {
1226 best = p;
1227 continue;
1228 }
1229 }
1230
1231 // Prefer longer tail
1232 if (p->second.log_tail > best->second.log_tail) {
1233 continue;
1234 } else if (p->second.log_tail < best->second.log_tail) {
1235 best = p;
1236 continue;
1237 }
1238
1239 if (!p->second.has_missing() && best->second.has_missing()) {
1240 dout(10) << __func__ << " prefer osd." << p->first
1241 << " because it is complete while best has missing"
1242 << dendl;
1243 best = p;
1244 continue;
1245 } else if (p->second.has_missing() && !best->second.has_missing()) {
1246 dout(10) << __func__ << " skipping osd." << p->first
1247 << " because it has missing while best is complete"
1248 << dendl;
1249 continue;
1250 } else {
1251 // both are complete or have missing
1252 // fall through
1253 }
1254
1255 // prefer current primary (usually the caller), all things being equal
1256 if (p->first == pg_whoami) {
1257 dout(10) << "calc_acting prefer osd." << p->first
1258 << " because it is current primary" << dendl;
1259 best = p;
1260 continue;
1261 }
1262 }
1263 return best;
1264 }
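// Illustrative example (not upstream code) of the ordering implemented
// above for a replicated pool: peer A at last_update 30'105 with log_tail
// 30'40 beats peer B at 30'100 on last_update alone. If A and B had equal
// last_update, the peer with the smaller (older) log_tail, i.e. the longer
// log, would win, then a peer without missing objects is preferred, and
// only then does the current primary break the remaining tie. For
// erasure-coded pools (require_rollback()), the last_update comparison is
// inverted: the *oldest* last_update wins, since divergent EC log entries
// are rolled back rather than replayed.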
1265
1266 void PG::calc_ec_acting(
1267 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1268 unsigned size,
1269 const vector<int> &acting,
1270 const vector<int> &up,
1271 const map<pg_shard_t, pg_info_t> &all_info,
1272 bool restrict_to_up_acting,
1273 vector<int> *_want,
1274 set<pg_shard_t> *backfill,
1275 set<pg_shard_t> *acting_backfill,
1276 ostream &ss)
1277 {
1278 vector<int> want(size, CRUSH_ITEM_NONE);
1279 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1280 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1281 i != all_info.end();
1282 ++i) {
1283 all_info_by_shard[i->first.shard].insert(i->first);
1284 }
1285 for (uint8_t i = 0; i < want.size(); ++i) {
1286 ss << "For position " << (unsigned)i << ": ";
1287 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1288 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1289 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1290 auth_log_shard->second.log_tail) {
1291 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1292 want[i] = up[i];
1293 continue;
1294 }
1295 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1296 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1297 << " and ";
1298 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1299 }
1300
1301 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1302 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1303 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1304 auth_log_shard->second.log_tail) {
1305 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1306 want[i] = acting[i];
1307 } else if (!restrict_to_up_acting) {
1308 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1309 j != all_info_by_shard[shard_id_t(i)].end();
1310 ++j) {
1311 ceph_assert(j->shard == i);
1312 if (!all_info.find(*j)->second.is_incomplete() &&
1313 all_info.find(*j)->second.last_update >=
1314 auth_log_shard->second.log_tail) {
1315 ss << " selecting stray: " << *j << std::endl;
1316 want[i] = j->osd;
1317 break;
1318 }
1319 }
1320 if (want[i] == CRUSH_ITEM_NONE)
1321 ss << " failed to fill position " << (int)i << std::endl;
1322 }
1323 }
1324
1325 for (uint8_t i = 0; i < want.size(); ++i) {
1326 if (want[i] != CRUSH_ITEM_NONE) {
1327 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1328 }
1329 }
1330 acting_backfill->insert(backfill->begin(), backfill->end());
1331 _want->swap(want);
1332 }
1333
1334 /**
1335 * calculate the desired acting set.
1336 *
1337 * Choose an appropriate acting set. Prefer up[0], unless it is
1338 * incomplete, or another osd has a longer tail that allows us to
1339 * bring other up nodes up to date.
1340 */
1341 void PG::calc_replicated_acting(
1342 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1343 uint64_t force_auth_primary_missing_objects,
1344 unsigned size,
1345 const vector<int> &acting,
1346 const vector<int> &up,
1347 pg_shard_t up_primary,
1348 const map<pg_shard_t, pg_info_t> &all_info,
1349 bool restrict_to_up_acting,
1350 vector<int> *want,
1351 set<pg_shard_t> *backfill,
1352 set<pg_shard_t> *acting_backfill,
1353 const OSDMapRef osdmap,
1354 ostream &ss)
1355 {
1356 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1357
1358 ss << __func__ << " newest update on osd." << auth_log_shard_id
1359 << " with " << auth_log_shard->second
1360 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1361
1362 // select primary
1363 auto primary = all_info.find(up_primary);
1364 if (up.size() &&
1365 !primary->second.is_incomplete() &&
1366 primary->second.last_update >=
1367 auth_log_shard->second.log_tail) {
1368 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1369 auto approx_missing_objects =
1370 primary->second.stats.stats.sum.num_objects_missing;
1371 auto auth_version = auth_log_shard->second.last_update.version;
1372 auto primary_version = primary->second.last_update.version;
1373 if (auth_version > primary_version) {
1374 approx_missing_objects += auth_version - primary_version;
1375 } else {
1376 approx_missing_objects += primary_version - auth_version;
1377 }
1378 if ((uint64_t)approx_missing_objects >
1379 force_auth_primary_missing_objects) {
1380 primary = auth_log_shard;
1381 ss << "up_primary: " << up_primary << ") has approximate "
1382 << approx_missing_objects
1383 << "(>" << force_auth_primary_missing_objects <<") "
1384 << "missing objects, osd." << auth_log_shard_id
1385 << " selected as primary instead"
1386 << std::endl;
1387 } else {
1388 ss << "up_primary: " << up_primary << ") selected as primary"
1389 << std::endl;
1390 }
1391 } else {
1392 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1393 }
1394 } else {
1395 ceph_assert(!auth_log_shard->second.is_incomplete());
1396 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1397 << " selected as primary instead" << std::endl;
1398 primary = auth_log_shard;
1399 }
1400
1401 ss << __func__ << " primary is osd." << primary->first
1402 << " with " << primary->second << std::endl;
1403 want->push_back(primary->first.osd);
1404 acting_backfill->insert(primary->first);
1405
1406 /* We include auth_log_shard->second.log_tail because in GetLog,
1407 * we will request logs back to the min last_update over our
1408 * acting_backfill set, which will result in our log being extended
1409 * as far backwards as necessary to pick up any peers which can
1410 * be log recovered by auth_log_shard's log */
1411 eversion_t oldest_auth_log_entry =
1412 std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
1413
1414 // select replicas that have log contiguity with primary.
1415 // prefer up, then acting, then any peer_info osds
1416 for (auto i : up) {
1417 pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
1418 if (up_cand == primary->first)
1419 continue;
1420 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1421 if (cur_info.is_incomplete() ||
1422 cur_info.last_update < oldest_auth_log_entry) {
1423 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1424 backfill->insert(up_cand);
1425 acting_backfill->insert(up_cand);
1426 } else {
1427 want->push_back(i);
1428 acting_backfill->insert(up_cand);
1429 ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
1430 }
1431 }
1432
1433 if (want->size() >= size) {
1434 return;
1435 }
1436
1437 std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
1438 candidate_by_last_update.reserve(acting.size());
1439 // This no longer has backfill OSDs, but they are covered above.
1440 for (auto i : acting) {
1441 pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
1442 // skip up osds we already considered above
1443 if (acting_cand == primary->first)
1444 continue;
1445 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i);
1446 if (up_it != up.end())
1447 continue;
1448
1449 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1450 if (cur_info.is_incomplete() ||
1451 cur_info.last_update < oldest_auth_log_entry) {
1452 ss << " shard " << acting_cand << " (acting) REJECTED "
1453 << cur_info << std::endl;
1454 } else {
1455 candidate_by_last_update.push_back(make_pair(cur_info.last_update, i));
1456 }
1457 }
1458
1459 auto sort_by_eversion =[](const std::pair<eversion_t, int> &lhs,
1460 const std::pair<eversion_t, int> &rhs) {
1461 return lhs.first > rhs.first;
1462 };
1463 // sort by last_update, in descending order.
1464 std::sort(candidate_by_last_update.begin(),
1465 candidate_by_last_update.end(), sort_by_eversion);
1466 for (auto &p: candidate_by_last_update) {
1467 ceph_assert(want->size() < size);
1468 want->push_back(p.second);
1469 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1470 acting_backfill->insert(s);
1471 ss << " shard " << s << " (acting) accepted "
1472 << all_info.find(s)->second << std::endl;
1473 if (want->size() >= size) {
1474 return;
1475 }
1476 }
1477
1478 if (restrict_to_up_acting) {
1479 return;
1480 }
1481 candidate_by_last_update.clear();
1482 candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
1483 // continue to search stray to find more suitable peers
1484 for (auto &i : all_info) {
1485 // skip up osds we already considered above
1486 if (i.first == primary->first)
1487 continue;
1488 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i.first.osd);
1489 if (up_it != up.end())
1490 continue;
1491 vector<int>::const_iterator acting_it = find(
1492 acting.begin(), acting.end(), i.first.osd);
1493 if (acting_it != acting.end())
1494 continue;
1495
1496 if (i.second.is_incomplete() ||
1497 i.second.last_update < oldest_auth_log_entry) {
1498 ss << " shard " << i.first << " (stray) REJECTED " << i.second
1499 << std::endl;
1500 } else {
1501 candidate_by_last_update.push_back(
1502 make_pair(i.second.last_update, i.first.osd));
1503 }
1504 }
1505
1506 if (candidate_by_last_update.empty()) {
1507 // save us some effort
1508 return;
1509 }
1510
1511 // sort by last_update, in descending order.
1512 std::sort(candidate_by_last_update.begin(),
1513 candidate_by_last_update.end(), sort_by_eversion);
1514
1515 for (auto &p: candidate_by_last_update) {
1516 ceph_assert(want->size() < size);
1517 want->push_back(p.second);
1518 pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
1519 acting_backfill->insert(s);
1520 ss << " shard " << s << " (stray) accepted "
1521 << all_info.find(s)->second << std::endl;
1522 if (want->size() >= size) {
1523 return;
1524 }
1525 }
1526 }
1527
1528 bool PG::recoverable_and_ge_min_size(const vector<int> &want) const
1529 {
1530 unsigned num_want_acting = 0;
1531 set<pg_shard_t> have;
1532 for (int i = 0; i < (int)want.size(); ++i) {
1533 if (want[i] != CRUSH_ITEM_NONE) {
1534 ++num_want_acting;
1535 have.insert(
1536 pg_shard_t(
1537 want[i],
1538 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1539 }
1540 }
1541 // We go incomplete if below min_size for ec_pools since backfill
1542 // does not currently maintain rollbackability
1543 // Otherwise, we will go "peered", but not "active"
1544 if (num_want_acting < pool.info.min_size &&
1545 (pool.info.is_erasure() ||
1546 !cct->_conf->osd_allow_recovery_below_min_size)) {
1547 dout(10) << __func__ << " failed, below min size" << dendl;
1548 return false;
1549 }
1550
1551 /* Check whether we have enough acting shards to later perform recovery */
1552 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1553 get_pgbackend()->get_is_recoverable_predicate());
1554 if (!(*recoverable_predicate)(have)) {
1555 dout(10) << __func__ << " failed, not recoverable" << dendl;
1556 return false;
1557 }
1558
1559 return true;
1560 }
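// Illustrative example (not upstream code): for an erasure-coded 2+1 pool
// with min_size=2, a proposed want = [3, CRUSH_ITEM_NONE, 7] yields
// num_want_acting == 2, which meets min_size, and two of the three shards
// are enough for the EC recoverability predicate, so the set is accepted.
// Dropping one more shard makes this function return false, which (per the
// comment above) is what sends an EC PG to incomplete rather than peered.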
1561
1562 void PG::choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1563 const pg_info_t &auth_info,
1564 vector<int> *want,
1565 set<pg_shard_t> *async_recovery,
1566 const OSDMapRef osdmap) const
1567 {
1568 set<pair<int, pg_shard_t> > candidates_by_cost;
1569 for (uint8_t i = 0; i < want->size(); ++i) {
1570 if ((*want)[i] == CRUSH_ITEM_NONE)
1571 continue;
1572
1573 // Considering log entries to recover is accurate enough for
1574 // now. We could use minimum_to_decode_with_cost() later if
1575 // necessary.
1576 pg_shard_t shard_i((*want)[i], shard_id_t(i));
1577 // do not include strays
1578 if (stray_set.find(shard_i) != stray_set.end())
1579 continue;
1580 // Do not include an osd that is not up, since choosing it as
1581 // an async_recovery_target will move it out of the acting set.
1582 // This results in it being identified as a stray during peering,
1583 // because it is no longer in the up or acting set.
1584 if (!is_up(shard_i))
1585 continue;
1586 auto shard_info = all_info.find(shard_i)->second;
1587 // for ec pools we rollback all entries past the authoritative
1588 // last_update *before* activation. This is relatively inexpensive
1589 // compared to recovery, since it is purely local, so treat shards
1590 // past the authoritative last_update the same as those equal to it.
1591 version_t auth_version = auth_info.last_update.version;
1592 version_t candidate_version = shard_info.last_update.version;
1593 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1594 auto approx_missing_objects =
1595 shard_info.stats.stats.sum.num_objects_missing;
1596 if (auth_version > candidate_version) {
1597 approx_missing_objects += auth_version - candidate_version;
1598 }
1599 if (static_cast<uint64_t>(approx_missing_objects) >
1600 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1601 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1602 }
1603 } else {
1604 if (auth_version > candidate_version &&
1605 (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1606 candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
1607 }
1608 }
1609 }
1610
1611 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1612 << dendl;
1613
1614 // take out as many osds as we can for async recovery, in order of cost
1615 for (auto rit = candidates_by_cost.rbegin();
1616 rit != candidates_by_cost.rend(); ++rit) {
1617 pg_shard_t cur_shard = rit->second;
1618 vector<int> candidate_want(*want);
1619 candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
1620 if (recoverable_and_ge_min_size(candidate_want)) {
1621 want->swap(candidate_want);
1622 async_recovery->insert(cur_shard);
1623 }
1624 }
1625 dout(20) << __func__ << " result want=" << *want
1626 << " async_recovery=" << *async_recovery << dendl;
1627 }
1628
1629 void PG::choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1630 const pg_info_t &auth_info,
1631 vector<int> *want,
1632 set<pg_shard_t> *async_recovery,
1633 const OSDMapRef osdmap) const
1634 {
1635 set<pair<int, pg_shard_t> > candidates_by_cost;
1636 for (auto osd_num : *want) {
1637 pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
1638 // do not include strays
1639 if (stray_set.find(shard_i) != stray_set.end())
1640 continue;
1641 // Do not include an osd that is not up, since choosing it as
1642 // an async_recovery_target will move it out of the acting set.
1643 // This results in it being identified as a stray during peering,
1644 // because it is no longer in the up or acting set.
1645 if (!is_up(shard_i))
1646 continue;
1647 auto shard_info = all_info.find(shard_i)->second;
1648 // use the approximate magnitude of the difference in length of
1649 // logs plus historical missing objects as the cost of recovery
1650 version_t auth_version = auth_info.last_update.version;
1651 version_t candidate_version = shard_info.last_update.version;
1652 if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
1653 auto approx_missing_objects =
1654 shard_info.stats.stats.sum.num_objects_missing;
1655 if (auth_version > candidate_version) {
1656 approx_missing_objects += auth_version - candidate_version;
1657 } else {
1658 approx_missing_objects += candidate_version - auth_version;
1659 }
1660 if (static_cast<uint64_t>(approx_missing_objects) >
1661 cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1662 candidates_by_cost.emplace(approx_missing_objects, shard_i);
1663 }
1664 } else {
1665 size_t approx_entries;
1666 if (auth_version > candidate_version) {
1667 approx_entries = auth_version - candidate_version;
1668 } else {
1669 approx_entries = candidate_version - auth_version;
1670 }
1671 if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
1672 candidates_by_cost.insert(make_pair(approx_entries, shard_i));
1673 }
1674 }
1675 }
1676
1677 dout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
1678 << dendl;
1679 // take out as many osds as we can for async recovery, in order of cost
1680 for (auto rit = candidates_by_cost.rbegin();
1681 rit != candidates_by_cost.rend(); ++rit) {
1682 if (want->size() <= pool.info.min_size) {
1683 break;
1684 }
1685 pg_shard_t cur_shard = rit->second;
1686 vector<int> candidate_want(*want);
1687 for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
1688 if (*it == cur_shard.osd) {
1689 candidate_want.erase(it);
1690 want->swap(candidate_want);
1691 async_recovery->insert(cur_shard);
1692 break;
1693 }
1694 }
1695 }
1696 dout(20) << __func__ << " result want=" << *want
1697 << " async_recovery=" << *async_recovery << dendl;
1698 }
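// Illustrative example (not upstream code) of the cost estimate above on a
// cluster where all up OSDs are nautilus or newer: if the authoritative
// last_update is 44'250 and a candidate replica sits at 44'120 with 30
// objects already missing, its approximate cost is (250 - 120) + 30 = 160.
// If that exceeds osd_async_recovery_min_cost, the shard is pulled out of
// the acting set (as long as at least min_size acting OSDs remain) and
// recovered asynchronously instead of holding up activation.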
1699
1700 /**
1701 * choose acting
1702 *
1703 * calculate the desired acting, and request a change with the monitor
1704 * if it differs from the current acting.
1705 *
1706 * if restrict_to_up_acting=true, we filter out anything that's not in
1707 * up/acting. in order to lift this restriction, we need to
1708 * 1) check whether it's worth switching the acting set any time we get
1709 * a new pg info (not just here, when recovery finishes)
1710 * 2) check whether anything in want_acting went down on each new map
1711 * (and, if so, calculate a new want_acting)
1712 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1713 * TODO!
1714 */
1715 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1716 bool restrict_to_up_acting,
1717 bool *history_les_bound)
1718 {
1719 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1720 all_info[pg_whoami] = info;
1721
1722 if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
1723 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1724 p != all_info.end();
1725 ++p) {
1726 dout(10) << __func__ << " all_info osd." << p->first << " " << p->second << dendl;
1727 }
1728 }
1729
1730 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1731 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1732
1733 if (auth_log_shard == all_info.end()) {
1734 if (up != acting) {
1735 dout(10) << __func__ << " no suitable info found (incomplete backfills?),"
1736 << " reverting to up" << dendl;
1737 want_acting = up;
1738 vector<int> empty;
1739 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1740 } else {
1741 dout(10) << __func__ << " failed" << dendl;
1742 ceph_assert(want_acting.empty());
1743 }
1744 return false;
1745 }
1746
1747 ceph_assert(!auth_log_shard->second.is_incomplete());
1748 auth_log_shard_id = auth_log_shard->first;
1749
1750 set<pg_shard_t> want_backfill, want_acting_backfill;
1751 vector<int> want;
1752 stringstream ss;
1753 if (!pool.info.is_erasure())
1754 calc_replicated_acting(
1755 auth_log_shard,
1756 cct->_conf.get_val<uint64_t>(
1757 "osd_force_auth_primary_missing_objects"),
1758 get_osdmap()->get_pg_size(info.pgid.pgid),
1759 acting,
1760 up,
1761 up_primary,
1762 all_info,
1763 restrict_to_up_acting,
1764 &want,
1765 &want_backfill,
1766 &want_acting_backfill,
1767 get_osdmap(),
1768 ss);
1769 else
1770 calc_ec_acting(
1771 auth_log_shard,
1772 get_osdmap()->get_pg_size(info.pgid.pgid),
1773 acting,
1774 up,
1775 all_info,
1776 restrict_to_up_acting,
1777 &want,
1778 &want_backfill,
1779 &want_acting_backfill,
1780 ss);
1781 dout(10) << ss.str() << dendl;
1782
1783 if (!recoverable_and_ge_min_size(want)) {
1784 want_acting.clear();
1785 return false;
1786 }
1787
1788 set<pg_shard_t> want_async_recovery;
1789 if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
1790 if (pool.info.is_erasure()) {
1791 choose_async_recovery_ec(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1792 } else {
1793 choose_async_recovery_replicated(all_info, auth_log_shard->second, &want, &want_async_recovery, get_osdmap());
1794 }
1795 }
1796 while (want.size() > pool.info.size) {
1797 // async recovery should have taken out as many osds as it can.
1798 // if not, then always evict the last peer
1799 // (will get synchronously recovered later)
1800 dout(10) << __func__ << " evicting osd." << want.back()
1801 << " from oversized want " << want << dendl;
1802 want.pop_back();
1803 }
1804 if (want != acting) {
1805 dout(10) << __func__ << " want " << want << " != acting " << acting
1806 << ", requesting pg_temp change" << dendl;
1807 want_acting = want;
1808
1809 if (!cct->_conf->osd_debug_no_acting_change) {
1810 if (want_acting == up) {
1811 // There can't be any pending backfill if
1812 // want is the same as the crush-mapped up OSDs.
1813 ceph_assert(want_backfill.empty());
1814 vector<int> empty;
1815 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1816 } else
1817 osd->queue_want_pg_temp(info.pgid.pgid, want);
1818 }
1819 return false;
1820 }
1821 want_acting.clear();
1822 acting_recovery_backfill = want_acting_backfill;
1823 dout(10) << "acting_recovery_backfill is " << acting_recovery_backfill << dendl;
1824 ceph_assert(backfill_targets.empty() || backfill_targets == want_backfill);
1825 if (backfill_targets.empty()) {
1826 // Caller is GetInfo
1827 backfill_targets = want_backfill;
1828 }
1829 // The !needs_recovery() term lets async_recovery_targets be reset once recovery is complete
1830 ceph_assert(async_recovery_targets.empty() || async_recovery_targets == want_async_recovery || !needs_recovery());
1831 if (async_recovery_targets.empty() || !needs_recovery()) {
1832 async_recovery_targets = want_async_recovery;
1833 }
1834 // Will not change if already set because up would have had to change
1835 // Verify that nothing in backfill is in stray_set
1836 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1837 i != want_backfill.end();
1838 ++i) {
1839 ceph_assert(stray_set.find(*i) == stray_set.end());
1840 }
1841 dout(10) << "choose_acting want=" << want << " backfill_targets="
1842 << want_backfill << " async_recovery_targets="
1843 << async_recovery_targets << dendl;
1844 return true;
1845 }
1846
1847 /* Build the might_have_unfound set.
1848 *
1849 * This is used by the primary OSD during recovery.
1850 *
1851 * This set tracks the OSDs which might have unfound objects that the primary
1852 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1853 * will remove the OSD from the set.
1854 */
1855 void PG::build_might_have_unfound()
1856 {
1857 ceph_assert(might_have_unfound.empty());
1858 ceph_assert(is_primary());
1859
1860 dout(10) << __func__ << dendl;
1861
1862 check_past_interval_bounds();
1863
1864 might_have_unfound = past_intervals.get_might_have_unfound(
1865 pg_whoami,
1866 pool.info.is_erasure());
1867
1868 // include any (stray) peers
1869 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1870 p != peer_info.end();
1871 ++p)
1872 might_have_unfound.insert(p->first);
1873
1874 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1875 }
1876
1877 void PG::activate(ObjectStore::Transaction& t,
1878 epoch_t activation_epoch,
1879 map<int, map<spg_t,pg_query_t> >& query_map,
1880 map<int,
1881 vector<
1882 pair<pg_notify_t,
1883 PastIntervals> > > *activator_map,
1884 RecoveryCtx *ctx)
1885 {
1886 ceph_assert(!is_peered());
1887 ceph_assert(scrubber.callbacks.empty());
1888 ceph_assert(callbacks_for_degraded_object.empty());
1889
1890 // twiddle pg state
1891 state_clear(PG_STATE_DOWN);
1892
1893 send_notify = false;
1894
1895 if (is_primary()) {
1896 // only update primary last_epoch_started if we will go active
1897 if (acting.size() >= pool.info.min_size) {
1898 ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1899 info.last_epoch_started <= activation_epoch);
1900 info.last_epoch_started = activation_epoch;
1901 info.last_interval_started = info.history.same_interval_since;
1902 }
1903 } else if (is_acting(pg_whoami)) {
1904 /* update last_epoch_started on acting replica to whatever the primary sent
1905 * unless it's smaller (could happen if we are going peered rather than
1906 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1907 if (info.last_epoch_started < activation_epoch) {
1908 info.last_epoch_started = activation_epoch;
1909 info.last_interval_started = info.history.same_interval_since;
1910 }
1911 }
1912
1913 auto &missing = pg_log.get_missing();
1914
1915 if (is_primary()) {
1916 last_update_ondisk = info.last_update;
1917 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1918 }
1919 last_update_applied = info.last_update;
1920 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1921
1922 need_up_thru = false;
1923
1924 // write pg info, log
1925 dirty_info = true;
1926 dirty_big_info = true; // maybe
1927
1928 // find out when we commit
1929 t.register_on_complete(
1930 new C_PG_ActivateCommitted(
1931 this,
1932 get_osdmap_epoch(),
1933 activation_epoch));
1934
1935 if (is_primary()) {
1936 // initialize snap_trimq
1937 if (get_osdmap()->require_osd_release < CEPH_RELEASE_MIMIC) {
1938 dout(20) << "activate - purged_snaps " << info.purged_snaps
1939 << " cached_removed_snaps " << pool.cached_removed_snaps
1940 << dendl;
1941 snap_trimq = pool.cached_removed_snaps;
1942 } else {
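// mimic+: build snap_trimq for this pool from the osdmap's removed_snaps_queue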
1943 auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
1944 auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
1945 snap_trimq.clear();
1946 if (p != removed_snaps_queue.end()) {
1947 dout(20) << "activate - purged_snaps " << info.purged_snaps
1948 << " removed_snaps " << p->second
1949 << dendl;
1950 for (auto q : p->second) {
1951 snap_trimq.insert(q.first, q.second);
1952 }
1953 }
1954 }
1955 interval_set<snapid_t> purged;
1956 purged.intersection_of(snap_trimq, info.purged_snaps);
1957 snap_trimq.subtract(purged);
1958
1959 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
1960 // adjust purged_snaps: PG may have been inactive while snaps were pruned
1961 // from the removed_snaps_queue in the osdmap. update local purged_snaps
1962 // to reflect only those snaps that we thought were pruned and were still in
1963 // the queue.
1964 info.purged_snaps.swap(purged);
1965 }
1966 }
1967
1968 // init complete pointer
1969 if (missing.num_missing() == 0) {
1970 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1971 << " -> " << info.last_update << dendl;
1972 info.last_complete = info.last_update;
1973 info.stats.stats.sum.num_objects_missing = 0;
1974 pg_log.reset_recovery_pointers();
1975 } else {
1976 dout(10) << "activate - not complete, " << missing << dendl;
1977 info.stats.stats.sum.num_objects_missing = missing.num_missing();
1978 pg_log.activate_not_complete(info);
1979 }
1980
1981 log_weirdness();
1982
1983 // if primary..
1984 if (is_primary()) {
1985 ceph_assert(ctx);
1986 // start up replicas
1987
1988 ceph_assert(!acting_recovery_backfill.empty());
1989 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1990 i != acting_recovery_backfill.end();
1991 ++i) {
1992 if (*i == pg_whoami) continue;
1993 pg_shard_t peer = *i;
1994 ceph_assert(peer_info.count(peer));
1995 pg_info_t& pi = peer_info[peer];
1996
1997 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1998
1999 MOSDPGLog *m = 0;
2000 ceph_assert(peer_missing.count(peer));
2001 pg_missing_t& pm = peer_missing[peer];
2002
2003 bool needs_past_intervals = pi.dne();
2004
2005 /*
2006 * cover case where peer sort order was different and
2007 * last_backfill cannot be interpreted
2008 */
2009 bool force_restart_backfill =
2010 !pi.last_backfill.is_max() &&
2011 !pi.last_backfill_bitwise;
2012
2013 if (pi.last_update == info.last_update && !force_restart_backfill) {
2014 // empty log
2015 if (!pi.last_backfill.is_max())
2016 osd->clog->info() << info.pgid << " continuing backfill to osd."
2017 << peer
2018 << " from (" << pi.log_tail << "," << pi.last_update
2019 << "] " << pi.last_backfill
2020 << " to " << info.last_update;
2021 if (!pi.is_empty() && activator_map) {
2022 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
2023 (*activator_map)[peer.osd].push_back(
2024 make_pair(
2025 pg_notify_t(
2026 peer.shard, pg_whoami.shard,
2027 get_osdmap_epoch(),
2028 get_osdmap_epoch(),
2029 info),
2030 past_intervals));
2031 } else {
2032 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
2033 m = new MOSDPGLog(
2034 i->shard, pg_whoami.shard,
2035 get_osdmap_epoch(), info,
2036 last_peering_reset);
2037 }
2038 } else if (
2039 pg_log.get_tail() > pi.last_update ||
2040 pi.last_backfill == hobject_t() ||
2041 force_restart_backfill ||
2042 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
2043 /* ^ This last case covers a situation where a replica is not contiguous
2044 * with the auth_log, but is contiguous with this replica. Reshuffling
2045 * the active set to handle this would be tricky, so instead we just go
2046 * ahead and backfill it anyway. This is probably preferable in any
2047 * case since the replica in question would have to be significantly
2048 * behind.
2049 */
2050 // backfill
2051 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
2052 << " from (" << pi.log_tail << "," << pi.last_update
2053 << "] " << pi.last_backfill
2054 << " to " << info.last_update;
2055
2056 pi.last_update = info.last_update;
2057 pi.last_complete = info.last_update;
2058 pi.set_last_backfill(hobject_t());
2059 pi.last_epoch_started = info.last_epoch_started;
2060 pi.last_interval_started = info.last_interval_started;
2061 pi.history = info.history;
2062 pi.hit_set = info.hit_set;
2063 // Save num_bytes for reservation request, can't be negative
2064 peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
2065 pi.stats.stats.clear();
2066
2067 // initialize peer with our purged_snaps.
2068 pi.purged_snaps = info.purged_snaps;
2069
2070 m = new MOSDPGLog(
2071 i->shard, pg_whoami.shard,
2072 get_osdmap_epoch(), pi,
2073 last_peering_reset /* epoch to create pg at */);
2074
2075 // send some recent log, so that op dup detection works well.
2076 m->log.copy_up_to(cct, pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
2077 m->info.log_tail = m->log.tail;
2078 pi.log_tail = m->log.tail; // sigh...
2079
2080 pm.clear();
2081 } else {
2082 // catch up
2083 ceph_assert(pg_log.get_tail() <= pi.last_update);
2084 m = new MOSDPGLog(
2085 i->shard, pg_whoami.shard,
2086 get_osdmap_epoch(), info,
2087 last_peering_reset /* epoch to create pg at */);
2088 // send new stuff to append to replicas log
2089 m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
2090 }
2091
2092 // share past_intervals if we are creating the pg on the replica
2093 // based on whether our info for that peer was dne() *before*
2094 // updating pi.history in the backfill block above.
2095 if (m && needs_past_intervals)
2096 m->past_intervals = past_intervals;
2097
2098 // update local version of peer's missing list!
2099 if (m && pi.last_backfill != hobject_t()) {
2100 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
2101 p != m->log.log.end();
2102 ++p) {
2103 if (p->soid <= pi.last_backfill &&
2104 !p->is_error()) {
2105 if (perform_deletes_during_peering() && p->is_delete()) {
2106 pm.rm(p->soid, p->version);
2107 } else {
2108 pm.add_next_event(*p);
2109 }
2110 }
2111 }
2112 }
2113
2114 if (m) {
2115 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
2116 //m->log.print(cout);
2117 osd->send_message_osd_cluster(peer.osd, m, get_osdmap_epoch());
2118 }
2119
2120 // peer now has
2121 pi.last_update = info.last_update;
2122
2123 // update our missing
2124 if (pm.num_missing() == 0) {
2125 pi.last_complete = pi.last_update;
2126 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
2127 } else {
2128 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
2129 }
2130 }
2131
2132 // Set up missing_loc
2133 set<pg_shard_t> complete_shards;
2134 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2135 i != acting_recovery_backfill.end();
2136 ++i) {
2137 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
2138 if (*i == get_primary()) {
2139 missing_loc.add_active_missing(missing);
2140 if (!missing.have_missing())
2141 complete_shards.insert(*i);
2142 } else {
2143 auto peer_missing_entry = peer_missing.find(*i);
2144 ceph_assert(peer_missing_entry != peer_missing.end());
2145 missing_loc.add_active_missing(peer_missing_entry->second);
2146 if (!peer_missing_entry->second.have_missing() &&
2147 peer_info[*i].last_backfill.is_max())
2148 complete_shards.insert(*i);
2149 }
2150 }
2151
2152 // If necessary, create might_have_unfound to help us find our unfound objects.
2153 // NOTE: It's important that we build might_have_unfound before trimming the
2154 // past intervals.
2155 might_have_unfound.clear();
2156 if (needs_recovery()) {
2157 // If only one shard has missing objects, we add all the other shards as
2158 // recovery sources. This is considered safe since the PGLogs have been
2159 // merged locally, and it covers the vast majority of use cases, e.g. one
2160 // OSD/host being down for a while for hardware repair.
2161 if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
2162 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
2163 } else {
2164 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
2165 ctx->handle);
2166 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
2167 i != acting_recovery_backfill.end();
2168 ++i) {
2169 if (*i == pg_whoami) continue;
2170 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
2171 ceph_assert(peer_missing.count(*i));
2172 ceph_assert(peer_info.count(*i));
2173 missing_loc.add_source_info(
2174 *i,
2175 peer_info[*i],
2176 peer_missing[*i],
2177 ctx->handle);
2178 }
2179 }
2180 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
2181 i != peer_missing.end();
2182 ++i) {
2183 if (is_acting_recovery_backfill(i->first))
2184 continue;
2185 ceph_assert(peer_info.count(i->first));
2186 search_for_missing(
2187 peer_info[i->first],
2188 i->second,
2189 i->first,
2190 ctx);
2191 }
2192
2193 build_might_have_unfound();
2194
2195 // Always call now so _update_calc_stats() will be accurate
2196 discover_all_missing(query_map);
2197 }
2198
2199 // If calculated, num_objects_degraded should reflect this too, unless
2200 // nothing is missing and we are about to go clean.
2201 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
2202 state_set(PG_STATE_UNDERSIZED);
2203 }
2204
2205 state_set(PG_STATE_ACTIVATING);
2206 release_pg_backoffs();
2207 projected_last_update = info.last_update;
2208 }
2209 if (acting.size() >= pool.info.min_size) {
2210 PGLogEntryHandler handler{this, &t};
2211 pg_log.roll_forward(&handler);
2212 }
2213 }
2214
2215 bool PG::op_has_sufficient_caps(OpRequestRef& op)
2216 {
2217 // only check MOSDOp
2218 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
2219 return true;
2220
2221 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
2222
2223 auto priv = req->get_connection()->get_priv();
2224 auto session = static_cast<Session*>(priv.get());
2225 if (!session) {
2226 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
2227 return false;
2228 }
2229 OSDCap& caps = session->caps;
2230 priv.reset();
2231
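// use the object's locator key if one is set, otherwise fall back to the object name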
2232 const string &key = req->get_hobj().get_key().empty() ?
2233 req->get_oid().name :
2234 req->get_hobj().get_key();
2235
2236 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
2237 pool.info.application_metadata,
2238 key,
2239 op->need_read_cap(),
2240 op->need_write_cap(),
2241 op->classes(),
2242 session->get_peer_socket_addr());
2243
2244 dout(20) << "op_has_sufficient_caps "
2245 << "session=" << session
2246 << " pool=" << pool.id << " (" << pool.name
2247 << " " << req->get_hobj().nspace
2248 << ")"
2249 << " pool_app_metadata=" << pool.info.application_metadata
2250 << " need_read_cap=" << op->need_read_cap()
2251 << " need_write_cap=" << op->need_write_cap()
2252 << " classes=" << op->classes()
2253 << " -> " << (cap ? "yes" : "NO")
2254 << dendl;
2255 return cap;
2256 }
2257
2258 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
2259 {
2260 lock();
2261 if (pg_has_reset_since(epoch)) {
2262 dout(10) << "_activate_committed " << epoch
2263 << ", that was an old interval" << dendl;
2264 } else if (is_primary()) {
2265 ceph_assert(!peer_activated.count(pg_whoami));
2266 peer_activated.insert(pg_whoami);
2267 dout(10) << "_activate_committed " << epoch
2268 << " peer_activated now " << peer_activated
2269 << " last_interval_started " << info.history.last_interval_started
2270 << " last_epoch_started " << info.history.last_epoch_started
2271 << " same_interval_since " << info.history.same_interval_since << dendl;
2272 ceph_assert(!acting_recovery_backfill.empty());
2273 if (peer_activated.size() == acting_recovery_backfill.size())
2274 all_activated_and_committed();
2275 } else {
2276 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
2277 MOSDPGInfo *m = new MOSDPGInfo(epoch);
2278 pg_notify_t i = pg_notify_t(
2279 get_primary().shard, pg_whoami.shard,
2280 get_osdmap_epoch(),
2281 get_osdmap_epoch(),
2282 info);
2283
2284 i.info.history.last_epoch_started = activation_epoch;
2285 i.info.history.last_interval_started = i.info.history.same_interval_since;
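// we can go active only if enough of the acting set is present; otherwise just peered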
2286 if (acting.size() >= pool.info.min_size) {
2287 state_set(PG_STATE_ACTIVE);
2288 } else {
2289 state_set(PG_STATE_PEERED);
2290 }
2291
2292 m->pg_list.push_back(make_pair(i, PastIntervals()));
2293 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap_epoch());
2294
2295 // waiters
2296 if (flushes_in_progress == 0) {
2297 requeue_ops(waiting_for_peered);
2298 } else if (!waiting_for_peered.empty()) {
2299 dout(10) << __func__ << " flushes in progress, moving "
2300 << waiting_for_peered.size() << " items to waiting_for_flush"
2301 << dendl;
2302 ceph_assert(waiting_for_flush.empty());
2303 waiting_for_flush.swap(waiting_for_peered);
2304 }
2305 }
2306
2307 ceph_assert(!dirty_info);
2308
2309 unlock();
2310 }
2311
2312 /*
2313 * update info.history.last_epoch_started ONLY after we and all
2314 * replicas have activated AND committed the activate transaction
2315 * (i.e. the peering results are stable on disk).
2316 */
2317 void PG::all_activated_and_committed()
2318 {
2319 dout(10) << "all_activated_and_committed" << dendl;
2320 ceph_assert(is_primary());
2321 ceph_assert(peer_activated.size() == acting_recovery_backfill.size());
2322 ceph_assert(!acting_recovery_backfill.empty());
2323 ceph_assert(blocked_by.empty());
2324
2325 // Degraded?
2326 _update_calc_stats();
2327 if (info.stats.stats.sum.num_objects_degraded) {
2328 state_set(PG_STATE_DEGRADED);
2329 } else {
2330 state_clear(PG_STATE_DEGRADED);
2331 }
2332
2333 queue_peering_event(
2334 PGPeeringEventRef(
2335 std::make_shared<PGPeeringEvent>(
2336 get_osdmap_epoch(),
2337 get_osdmap_epoch(),
2338 AllReplicasActivated())));
2339 }
2340
2341 bool PG::requeue_scrub(bool high_priority)
2342 {
2343 ceph_assert(is_locked());
2344 if (scrub_queued) {
2345 dout(10) << __func__ << ": already queued" << dendl;
2346 return false;
2347 } else {
2348 dout(10) << __func__ << ": queueing" << dendl;
2349 scrub_queued = true;
2350 osd->queue_for_scrub(this, high_priority);
2351 return true;
2352 }
2353 }
2354
2355 void PG::queue_recovery()
2356 {
2357 if (!is_primary() || !is_peered()) {
2358 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
2359 ceph_assert(!recovery_queued);
2360 } else if (recovery_queued) {
2361 dout(10) << "queue_recovery -- already queued" << dendl;
2362 } else {
2363 dout(10) << "queue_recovery -- queuing" << dendl;
2364 recovery_queued = true;
2365 osd->queue_for_recovery(this);
2366 }
2367 }
2368
2369 bool PG::queue_scrub()
2370 {
2371 ceph_assert(is_locked());
2372 if (is_scrubbing()) {
2373 return false;
2374 }
2375 // An interrupted recovery repair could leave this set.
2376 state_clear(PG_STATE_REPAIR);
2377 if (scrubber.need_auto) {
2378 scrubber.must_scrub = true;
2379 scrubber.must_deep_scrub = true;
2380 scrubber.auto_repair = true;
2381 scrubber.need_auto = false;
2382 }
2383 scrubber.priority = scrubber.must_scrub ?
2384 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2385 scrubber.must_scrub = false;
2386 state_set(PG_STATE_SCRUBBING);
2387 if (scrubber.must_deep_scrub) {
2388 state_set(PG_STATE_DEEP_SCRUB);
2389 scrubber.must_deep_scrub = false;
2390 }
2391 if (scrubber.must_repair || scrubber.auto_repair) {
2392 state_set(PG_STATE_REPAIR);
2393 scrubber.must_repair = false;
2394 }
2395 requeue_scrub();
2396 return true;
2397 }
2398
2399 unsigned PG::get_scrub_priority()
2400 {
2401 // a higher value -> a higher priority
2402 int64_t pool_scrub_priority = 0;
2403 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2404 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2405 }
2406
2407 void PG::try_mark_clean()
2408 {
2409 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2410 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2411 state_set(PG_STATE_CLEAN);
2412 info.history.last_epoch_clean = get_osdmap_epoch();
2413 info.history.last_interval_clean = info.history.same_interval_since;
2414 past_intervals.clear();
2415 dirty_big_info = true;
2416 dirty_info = true;
2417 }
2418
2419 if (is_active()) {
2420 kick_snap_trim();
2421 } else if (is_peered()) {
2422 if (is_clean()) {
2423 bool target;
2424 if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
2425 if (target) {
2426 ldout(cct, 10) << "ready to merge (target)" << dendl;
2427 osd->set_ready_to_merge_target(this,
2428 info.last_update,
2429 info.history.last_epoch_started,
2430 info.history.last_epoch_clean);
2431 } else {
2432 ldout(cct, 10) << "ready to merge (source)" << dendl;
2433 osd->set_ready_to_merge_source(this, info.last_update);
2434 }
2435 }
2436 } else {
2437 ldout(cct, 10) << "not clean, not ready to merge" << dendl;
2438 // we should have notified OSD in Active state entry point
2439 }
2440 }
2441
2442 state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
2443
2444 share_pg_info();
2445 publish_stats_to_osd();
2446 requeue_ops(waiting_for_clean_to_primary_repair);
2447 }
2448
2449 bool PG::set_force_recovery(bool b)
2450 {
2451 bool did = false;
2452 if (b) {
2453 if (!(state & PG_STATE_FORCED_RECOVERY) &&
2454 (state & (PG_STATE_DEGRADED |
2455 PG_STATE_RECOVERY_WAIT |
2456 PG_STATE_RECOVERING))) {
2457 dout(20) << __func__ << " set" << dendl;
2458 state_set(PG_STATE_FORCED_RECOVERY);
2459 publish_stats_to_osd();
2460 did = true;
2461 }
2462 } else if (state & PG_STATE_FORCED_RECOVERY) {
2463 dout(20) << __func__ << " clear" << dendl;
2464 state_clear(PG_STATE_FORCED_RECOVERY);
2465 publish_stats_to_osd();
2466 did = true;
2467 }
2468 if (did) {
2469 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2470 osd->local_reserver.update_priority(info.pgid, get_recovery_priority());
2471 }
2472 return did;
2473 }
2474
2475 bool PG::set_force_backfill(bool b)
2476 {
2477 bool did = false;
2478 if (b) {
2479 if (!(state & PG_STATE_FORCED_BACKFILL) &&
2480 (state & (PG_STATE_DEGRADED |
2481 PG_STATE_BACKFILL_WAIT |
2482 PG_STATE_BACKFILLING))) {
2483 dout(10) << __func__ << " set" << dendl;
2484 state_set(PG_STATE_FORCED_BACKFILL);
2485 publish_stats_to_osd();
2486 did = true;
2487 }
2488 } else if (state & PG_STATE_FORCED_BACKFILL) {
2489 dout(10) << __func__ << " clear" << dendl;
2490 state_clear(PG_STATE_FORCED_BACKFILL);
2491 publish_stats_to_osd();
2492 did = true;
2493 }
2494 if (did) {
2495 dout(20) << __func__ << " state " << pgstate_history.get_current_state() << dendl;
2496 osd->local_reserver.update_priority(info.pgid, get_backfill_priority());
2497 }
2498 return did;
2499 }
2500
2501 int PG::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
2502 {
2503 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2504 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2505
2506 ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
2507
2508 // User can't set this too high anymore, but might be a legacy value
2509 if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
2510 pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
2511 if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
2512 pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
2513 // Shift the range from [min, max] to [0, max - min]
2514 pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
2515 ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
2516
2517 priority += pool_recovery_priority;
2518
2519 // Clamp to valid range
2520 if (priority > max) {
2521 return max;
2522 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2523 return OSD_RECOVERY_PRIORITY_MIN;
2524 } else {
2525 return priority;
2526 }
2527 }
2528
2529 unsigned PG::get_recovery_priority()
2530 {
2531 // a higher value -> a higher priority
2532 int ret = OSD_RECOVERY_PRIORITY_BASE;
2533 int base = ret;
2534
2535 if (state & PG_STATE_FORCED_RECOVERY) {
2536 ret = OSD_RECOVERY_PRIORITY_FORCED;
2537 } else {
2538 // XXX: This priority boost isn't so much about inactive, but about data-at-risk
2539 if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
2540 base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
2541 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2542 ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
2543 }
2544
2545 int64_t pool_recovery_priority = 0;
2546 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2547
2548 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2549 }
2550 dout(20) << __func__ << " recovery priority is " << ret << dendl;
2551 return static_cast<unsigned>(ret);
2552 }
2553
2554 unsigned PG::get_backfill_priority()
2555 {
2556 // a higher value -> a higher priority
2557 int ret = OSD_BACKFILL_PRIORITY_BASE;
2558 int base = ret;
2559
2560 if (state & PG_STATE_FORCED_BACKFILL) {
2561 ret = OSD_BACKFILL_PRIORITY_FORCED;
2562 } else {
2563 if (acting.size() < pool.info.min_size) {
2564 base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
2565 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2566 ret = base + (pool.info.min_size - acting.size());
2567
2568 } else if (is_undersized()) {
2569 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2570 ceph_assert(pool.info.size > actingset.size());
2571 base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2572 ret = base + (pool.info.size - actingset.size());
2573
2574 } else if (is_degraded()) {
2575 // degraded: baseline degraded
2576 base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2577 }
2578
2579 // Adjust with pool's recovery priority
2580 int64_t pool_recovery_priority = 0;
2581 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2582
2583 ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
2584 }
2585
2586 dout(20) << __func__ << " backfill priority is " << ret << dendl;
2587 return static_cast<unsigned>(ret);
2588 }
2589
2590 unsigned PG::get_delete_priority()
2591 {
2592 auto state = get_osdmap()->get_state(osd->whoami);
2593 if (state & (CEPH_OSD_BACKFILLFULL |
2594 CEPH_OSD_FULL)) {
2595 return OSD_DELETE_PRIORITY_FULL;
2596 } else if (state & CEPH_OSD_NEARFULL) {
2597 return OSD_DELETE_PRIORITY_FULLISH;
2598 } else {
2599 return OSD_DELETE_PRIORITY_NORMAL;
2600 }
2601 }
2602
2603 Context *PG::finish_recovery()
2604 {
2605 dout(10) << "finish_recovery" << dendl;
2606 ceph_assert(info.last_complete == info.last_update);
2607
2608 clear_recovery_state();
2609
2610 /*
2611 * sync all this before purging strays. but don't block!
2612 */
2613 finish_sync_event = new C_PG_FinishRecovery(this);
2614 return finish_sync_event;
2615 }
2616
2617 void PG::_finish_recovery(Context *c)
2618 {
2619 lock();
2620 // When recovery is initiated by a repair, that flag is left on
2621 state_clear(PG_STATE_REPAIR);
2622 if (deleting) {
2623 unlock();
2624 return;
2625 }
2626 if (c == finish_sync_event) {
2627 dout(10) << "_finish_recovery" << dendl;
2628 finish_sync_event = 0;
2629 purge_strays();
2630
2631 publish_stats_to_osd();
2632
2633 if (scrub_after_recovery) {
2634 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2635 scrub_after_recovery = false;
2636 scrubber.must_deep_scrub = true;
2637 scrubber.check_repair = true;
2638 queue_scrub();
2639 }
2640 } else {
2641 dout(10) << "_finish_recovery -- stale" << dendl;
2642 }
2643 unlock();
2644 }
2645
2646 void PG::start_recovery_op(const hobject_t& soid)
2647 {
2648 dout(10) << "start_recovery_op " << soid
2649 #ifdef DEBUG_RECOVERY_OIDS
2650 << " (" << recovering_oids << ")"
2651 #endif
2652 << dendl;
2653 ceph_assert(recovery_ops_active >= 0);
2654 recovery_ops_active++;
2655 #ifdef DEBUG_RECOVERY_OIDS
2656 recovering_oids.insert(soid);
2657 #endif
2658 osd->start_recovery_op(this, soid);
2659 }
2660
2661 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2662 {
2663 dout(10) << "finish_recovery_op " << soid
2664 #ifdef DEBUG_RECOVERY_OIDS
2665 << " (" << recovering_oids << ")"
2666 #endif
2667 << dendl;
2668 ceph_assert(recovery_ops_active > 0);
2669 recovery_ops_active--;
2670 #ifdef DEBUG_RECOVERY_OIDS
2671 ceph_assert(recovering_oids.count(soid));
2672 recovering_oids.erase(recovering_oids.find(soid));
2673 #endif
2674 osd->finish_recovery_op(this, soid, dequeue);
2675
2676 if (!dequeue) {
2677 queue_recovery();
2678 }
2679 }
2680
2681 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2682 {
2683 child->update_snap_mapper_bits(split_bits);
2684 child->update_osdmap_ref(get_osdmap());
2685
2686 child->pool = pool;
2687
2688 // Log
2689 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2690 child->info.last_complete = info.last_complete;
2691
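// splitting the log may have changed each log's head and tail; refresh info from the logs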
2692 info.last_update = pg_log.get_head();
2693 child->info.last_update = child->pg_log.get_head();
2694
2695 child->info.last_user_version = info.last_user_version;
2696
2697 info.log_tail = pg_log.get_tail();
2698 child->info.log_tail = child->pg_log.get_tail();
2699
2700 // reset last_complete, we might have modified pg_log & missing above
2701 pg_log.reset_complete_to(&info);
2702 child->pg_log.reset_complete_to(&child->info);
2703
2704 // Info
2705 child->info.history = info.history;
2706 child->info.history.epoch_created = get_osdmap_epoch();
2707 child->info.purged_snaps = info.purged_snaps;
2708
2709 if (info.last_backfill.is_max()) {
2710 child->info.set_last_backfill(hobject_t::get_max());
2711 } else {
2712 // restart backfill on parent and child to be safe. we could
2713 // probably do better in the bitwise sort case, but it's more
2714 // fragile (there may be special work to do on backfill completion
2715 // in the future).
2716 info.set_last_backfill(hobject_t());
2717 child->info.set_last_backfill(hobject_t());
2718 // restarting backfill implies that the missing set is empty,
2719 // since it is only used for objects prior to last_backfill
2720 pg_log.reset_backfill();
2721 child->pg_log.reset_backfill();
2722 }
2723
2724 child->info.stats = info.stats;
2725 child->info.stats.parent_split_bits = split_bits;
2726 info.stats.stats_invalid = true;
2727 child->info.stats.stats_invalid = true;
2728 child->info.last_epoch_started = info.last_epoch_started;
2729 child->info.last_interval_started = info.last_interval_started;
2730
2731 child->snap_trimq = snap_trimq;
2732
2733 // There can't be recovery/backfill going on now
2734 int primary, up_primary;
2735 vector<int> newup, newacting;
2736 get_osdmap()->pg_to_up_acting_osds(
2737 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2738 child->init_primary_up_acting(
2739 newup,
2740 newacting,
2741 up_primary,
2742 primary);
2743 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2744
2745 // this comparison includes primary rank via pg_shard_t
2746 if (get_primary() != child->get_primary())
2747 child->info.history.same_primary_since = get_osdmap_epoch();
2748
2749 child->info.stats.up = up;
2750 child->info.stats.up_primary = up_primary;
2751 child->info.stats.acting = acting;
2752 child->info.stats.acting_primary = primary;
2753 child->info.stats.mapping_epoch = get_osdmap_epoch();
2754
2755 // History
2756 child->past_intervals = past_intervals;
2757
2758 _split_into(child_pgid, child, split_bits);
2759
2760 // release all backoffs for simplicity
2761 release_backoffs(hobject_t(), hobject_t::get_max());
2762
2763 child->on_new_interval();
2764
2765 child->send_notify = !child->is_primary();
2766
2767 child->dirty_info = true;
2768 child->dirty_big_info = true;
2769 dirty_info = true;
2770 dirty_big_info = true;
2771 }
2772
2773 void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
2774 {
2775 out->resize(childpgs.size() + 1);
2776 info.stats.stats.sum.split(*out);
2777 }
2778
2779 void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t)
2780 {
2781 info.stats.stats.sum = stats;
2782 write_if_dirty(*t);
2783 }
2784
2785 void PG::merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
2786 unsigned split_bits,
2787 const pg_merge_meta_t& last_pg_merge_meta)
2788 {
2789 dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
2790 << dendl;
2791 bool incomplete = false;
2792 if (info.last_complete != info.last_update ||
2793 info.is_incomplete() ||
2794 info.dne()) {
2795 dout(10) << __func__ << " target incomplete" << dendl;
2796 incomplete = true;
2797 }
2798 if (last_pg_merge_meta.source_pgid != pg_t()) {
2799 if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
2800 dout(10) << __func__ << " target doesn't match expected parent "
2801 << last_pg_merge_meta.source_pgid.get_parent()
2802 << " of source_pgid " << last_pg_merge_meta.source_pgid
2803 << dendl;
2804 incomplete = true;
2805 }
2806 if (info.last_update != last_pg_merge_meta.target_version) {
2807 dout(10) << __func__ << " target version doesn't match expected "
2808 << last_pg_merge_meta.target_version << dendl;
2809 incomplete = true;
2810 }
2811 }
2812
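// roll forward and trim our own log first; each source log gets the same treatment below before the merge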
2813 PGLogEntryHandler handler{this, rctx->transaction};
2814 pg_log.roll_forward(&handler);
2815
2816 info.last_complete = info.last_update; // to fake out trim()
2817 pg_log.reset_recovery_pointers();
2818 pg_log.trim(info.last_update, info);
2819
2820 vector<PGLog*> log_from;
2821 for (auto& i : sources) {
2822 auto& source = i.second;
2823 if (!source) {
2824 dout(10) << __func__ << " source " << i.first << " missing" << dendl;
2825 incomplete = true;
2826 continue;
2827 }
2828 if (source->info.last_complete != source->info.last_update ||
2829 source->info.is_incomplete() ||
2830 source->info.dne()) {
2831 dout(10) << __func__ << " source " << source->pg_id << " incomplete"
2832 << dendl;
2833 incomplete = true;
2834 }
2835 if (last_pg_merge_meta.source_pgid != pg_t()) {
2836 if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
2837 dout(10) << __func__ << " source " << source->info.pgid.pgid
2838 << " doesn't match expected source pgid "
2839 << last_pg_merge_meta.source_pgid << dendl;
2840 incomplete = true;
2841 }
2842 if (source->info.last_update != last_pg_merge_meta.source_version) {
2843 dout(10) << __func__ << " source version doesn't match expected "
2844 << last_pg_merge_meta.source_version << dendl;
2845 incomplete = true;
2846 }
2847 }
2848
2849 // prepare log
2850 PGLogEntryHandler handler{source.get(), rctx->transaction};
2851 source->pg_log.roll_forward(&handler);
2852 source->info.last_complete = source->info.last_update; // to fake out trim()
2853 source->pg_log.reset_recovery_pointers();
2854 source->pg_log.trim(source->info.last_update, source->info);
2855 log_from.push_back(&source->pg_log);
2856
2857 // wipe out source's pgmeta
2858 rctx->transaction->remove(source->coll, source->pgmeta_oid);
2859
2860 // merge (and destroy source collection)
2861 rctx->transaction->merge_collection(source->coll, coll, split_bits);
2862
2863 // combine stats
2864 info.stats.add(source->info.stats);
2865
2866 // pull up last_update
2867 info.last_update = std::max(info.last_update, source->info.last_update);
2868
2869 // adopt source's PastIntervals if target has none. we can do this since
2870 // pgp_num has been reduced prior to the merge, so the OSD mappings for
2871 // the PGs are identical.
2872 if (past_intervals.empty() && !source->past_intervals.empty()) {
2873 dout(10) << __func__ << " taking source's past_intervals" << dendl;
2874 past_intervals = source->past_intervals;
2875 }
2876 }
2877
2878 // merge_collection does this, but maybe all of our sources were missing.
2879 rctx->transaction->collection_set_bits(coll, split_bits);
2880
2881 info.last_complete = info.last_update;
2882 info.log_tail = info.last_update;
2883 if (incomplete) {
2884 info.last_backfill = hobject_t();
2885 }
2886
2887 snap_mapper.update_bits(split_bits);
2888
2889 // merge logs
2890 pg_log.merge_from(log_from, info.last_update);
2891
2892 // make sure we have a meaningful last_epoch_started/clean (if we were a
2893 // placeholder)
2894 if (info.history.epoch_created == 0) {
2895 // start with (a) source's history, since these PGs *should* have been
2896 // remapped in concert with each other...
2897 info.history = sources.begin()->second->info.history;
2898
2899 // we use the last_epoch_{started,clean} we got from
2900 // the caller, which are the epochs that were reported when the PGs were
2901 // found to be ready for merge.
2902 info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
2903 info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2904 info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
2905 dout(10) << __func__
2906 << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
2907 << last_pg_merge_meta.last_epoch_clean
2908 << " from pool last_dec_*, source pg history was "
2909 << sources.begin()->second->info.history
2910 << dendl;
2911
2912 // if the past_intervals start is later than last_epoch_clean, it
2913 // implies the source repeered again but the target didn't, or
2914 // that the source became clean in a later epoch than the target.
2915 // avoid the discrepancy by adjusting the interval start
2916 // backwards to match so that check_past_interval_bounds() will
2917 // not complain.
2918 auto pib = past_intervals.get_bounds();
2919 if (info.history.last_epoch_clean < pib.first) {
2920 dout(10) << __func__ << " last_epoch_clean "
2921 << info.history.last_epoch_clean << " < past_interval start "
2922 << pib.first << ", adjusting start backwards" << dendl;
2923 past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
2924 }
2925
2926 // Similarly, if the same_interval_since value is later than
2927 // last_epoch_clean, the next interval change will result in a
2928 // past_interval start that is later than last_epoch_clean. This
2929 // can happen if we use the pg_history values from the merge
2930 // source. Adjust the same_interval_since value backwards if that
2931 // happens. (We trust the les and lec values more because they came from
2932 // the real target, whereas the history value we stole from the source.)
2933 if (info.history.last_epoch_started < info.history.same_interval_since) {
2934 dout(10) << __func__ << " last_epoch_started "
2935 << info.history.last_epoch_started << " < same_interval_since "
2936 << info.history.same_interval_since
2937 << ", adjusting pg_history backwards" << dendl;
2938 info.history.same_interval_since = info.history.last_epoch_clean;
2939 // make sure same_{up,primary}_since are <= same_interval_since
2940 info.history.same_up_since = std::min(
2941 info.history.same_up_since, info.history.same_interval_since);
2942 info.history.same_primary_since = std::min(
2943 info.history.same_primary_since, info.history.same_interval_since);
2944 }
2945 }
2946
2947 dirty_info = true;
2948 dirty_big_info = true;
2949 }
2950
2951 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2952 {
2953 ConnectionRef con = s->con;
2954 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2955 return;
2956 BackoffRef b(s->have_backoff(info.pgid, begin));
2957 if (b) {
2958 derr << __func__ << " already have backoff for " << s << " begin " << begin
2959 << " " << *b << dendl;
2960 ceph_abort();
2961 }
2962 std::lock_guard l(backoff_lock);
2963 {
2964 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2965 backoffs[begin].insert(b);
2966 s->add_backoff(b);
2967 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2968 }
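// tell the client to back off (block) ops in [begin, end)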
2969 con->send_message(
2970 new MOSDBackoff(
2971 info.pgid,
2972 get_osdmap_epoch(),
2973 CEPH_OSD_BACKOFF_OP_BLOCK,
2974 b->id,
2975 begin,
2976 end));
2977 }
2978
2979 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2980 {
2981 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2982 vector<BackoffRef> bv;
2983 {
2984 std::lock_guard l(backoff_lock);
2985 auto p = backoffs.lower_bound(begin);
2986 while (p != backoffs.end()) {
2987 int r = cmp(p->first, end);
2988 dout(20) << __func__ << " ? " << r << " " << p->first
2989 << " " << p->second << dendl;
2990 // note: must still examine begin=end=p->first case
2991 if (r > 0 || (r == 0 && begin < end)) {
2992 break;
2993 }
2994 dout(20) << __func__ << " checking " << p->first
2995 << " " << p->second << dendl;
2996 auto q = p->second.begin();
2997 while (q != p->second.end()) {
2998 dout(20) << __func__ << " checking " << *q << dendl;
2999 int r = cmp((*q)->begin, begin);
3000 if (r == 0 || (r > 0 && (*q)->end < end)) {
3001 bv.push_back(*q);
3002 q = p->second.erase(q);
3003 } else {
3004 ++q;
3005 }
3006 }
3007 if (p->second.empty()) {
3008 p = backoffs.erase(p);
3009 } else {
3010 ++p;
3011 }
3012 }
3013 }
3014 for (auto b : bv) {
3015 std::lock_guard l(b->lock);
3016 dout(10) << __func__ << " " << *b << dendl;
3017 if (b->session) {
3018 ceph_assert(b->pg == this);
3019 ConnectionRef con = b->session->con;
3020 if (con) { // OSD::ms_handle_reset clears s->con without a lock
3021 con->send_message(
3022 new MOSDBackoff(
3023 info.pgid,
3024 get_osdmap_epoch(),
3025 CEPH_OSD_BACKOFF_OP_UNBLOCK,
3026 b->id,
3027 b->begin,
3028 b->end));
3029 }
3030 if (b->is_new()) {
3031 b->state = Backoff::STATE_DELETING;
3032 } else {
3033 b->session->rm_backoff(b);
3034 b->session.reset();
3035 }
3036 b->pg.reset();
3037 }
3038 }
3039 }
3040
3041 void PG::clear_backoffs()
3042 {
3043 dout(10) << __func__ << " " << dendl;
3044 map<hobject_t,set<BackoffRef>> ls;
3045 {
3046 std::lock_guard l(backoff_lock);
3047 ls.swap(backoffs);
3048 }
3049 for (auto& p : ls) {
3050 for (auto& b : p.second) {
3051 std::lock_guard l(b->lock);
3052 dout(10) << __func__ << " " << *b << dendl;
3053 if (b->session) {
3054 ceph_assert(b->pg == this);
3055 if (b->is_new()) {
3056 b->state = Backoff::STATE_DELETING;
3057 } else {
3058 b->session->rm_backoff(b);
3059 b->session.reset();
3060 }
3061 b->pg.reset();
3062 }
3063 }
3064 }
3065 }
3066
3067 // called by Session::clear_backoffs()
3068 void PG::rm_backoff(BackoffRef b)
3069 {
3070 dout(10) << __func__ << " " << *b << dendl;
3071 std::lock_guard l(backoff_lock);
3072 ceph_assert(b->lock.is_locked_by_me());
3073 ceph_assert(b->pg == this);
3074 auto p = backoffs.find(b->begin);
3075 // may race with release_backoffs()
3076 if (p != backoffs.end()) {
3077 auto q = p->second.find(b);
3078 if (q != p->second.end()) {
3079 p->second.erase(q);
3080 if (p->second.empty()) {
3081 backoffs.erase(p);
3082 }
3083 }
3084 }
3085 }
3086
3087 void PG::clear_recovery_state()
3088 {
3089 dout(10) << "clear_recovery_state" << dendl;
3090
3091 pg_log.reset_recovery_pointers();
3092 finish_sync_event = 0;
3093
3094 hobject_t soid;
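// drain any in-flight recovery ops; dequeue=true so they are not requeued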
3095 while (recovery_ops_active > 0) {
3096 #ifdef DEBUG_RECOVERY_OIDS
3097 soid = *recovering_oids.begin();
3098 #endif
3099 finish_recovery_op(soid, true);
3100 }
3101
3102 async_recovery_targets.clear();
3103 backfill_targets.clear();
3104 backfill_info.clear();
3105 peer_backfill_info.clear();
3106 waiting_on_backfill.clear();
3107 _clear_recovery_state(); // pg impl specific hook
3108 }
3109
3110 void PG::cancel_recovery()
3111 {
3112 dout(10) << "cancel_recovery" << dendl;
3113 clear_recovery_state();
3114 }
3115
3116
3117 void PG::purge_strays()
3118 {
3119 if (is_premerge()) {
3120 dout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
3121 << dendl;
3122 return;
3123 }
3124 if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
3125 return;
3126 }
3127 dout(10) << "purge_strays " << stray_set << dendl;
3128
3129 bool removed = false;
3130 for (set<pg_shard_t>::iterator p = stray_set.begin();
3131 p != stray_set.end();
3132 ++p) {
3133 ceph_assert(!is_acting_recovery_backfill(*p));
3134 if (get_osdmap()->is_up(p->osd)) {
3135 dout(10) << "sending PGRemove to osd." << *p << dendl;
3136 vector<spg_t> to_remove;
3137 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
3138 MOSDPGRemove *m = new MOSDPGRemove(
3139 get_osdmap_epoch(),
3140 to_remove);
3141 osd->send_message_osd_cluster(p->osd, m, get_osdmap_epoch());
3142 } else {
3143 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
3144 }
3145 peer_missing.erase(*p);
3146 peer_info.erase(*p);
3147 peer_purged.insert(*p);
3148 removed = true;
3149 }
3150
3151 // if we removed anyone, update peers (which include peer_info)
3152 if (removed)
3153 update_heartbeat_peers();
3154
3155 stray_set.clear();
3156
3157 // clear _requested maps; we may have to peer() again if we discover
3158 // (more) stray content
3159 peer_log_requested.clear();
3160 peer_missing_requested.clear();
3161 }
3162
3163 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
3164 {
3165 std::lock_guard l(heartbeat_peer_lock);
3166 probe_targets.clear();
3167 for (set<pg_shard_t>::iterator i = probe_set.begin();
3168 i != probe_set.end();
3169 ++i) {
3170 probe_targets.insert(i->osd);
3171 }
3172 }
3173
3174 void PG::clear_probe_targets()
3175 {
3176 std::lock_guard l(heartbeat_peer_lock);
3177 probe_targets.clear();
3178 }
3179
3180 void PG::update_heartbeat_peers()
3181 {
3182 ceph_assert(is_locked());
3183
3184 if (!is_primary())
3185 return;
3186
3187 set<int> new_peers;
3188 for (unsigned i=0; i<acting.size(); i++) {
3189 if (acting[i] != CRUSH_ITEM_NONE)
3190 new_peers.insert(acting[i]);
3191 }
3192 for (unsigned i=0; i<up.size(); i++) {
3193 if (up[i] != CRUSH_ITEM_NONE)
3194 new_peers.insert(up[i]);
3195 }
3196 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
3197 p != peer_info.end();
3198 ++p)
3199 new_peers.insert(p->first.osd);
3200
3201 bool need_update = false;
3202 heartbeat_peer_lock.Lock();
3203 if (new_peers == heartbeat_peers) {
3204 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
3205 } else {
3206 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
3207 heartbeat_peers.swap(new_peers);
3208 need_update = true;
3209 }
3210 heartbeat_peer_lock.Unlock();
3211
3212 if (need_update)
3213 osd->need_heartbeat_peer_update();
3214 }
3215
3216
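// Check the projected (in-flight) log first, then the committed pg log, for
// an entry matching this reqid.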
3217 bool PG::check_in_progress_op(
3218 const osd_reqid_t &r,
3219 eversion_t *version,
3220 version_t *user_version,
3221 int *return_code) const
3222 {
3223 return (
3224 projected_log.get_request(r, version, user_version, return_code) ||
3225 pg_log.get_log().get_request(r, version, user_version, return_code));
3226 }
3227
3228 static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
3229 {
3230 for (auto&p : pgs)
3231 if (p.shard == shard)
3232 return true;
3233 return false;
3234 }
3235
3236 static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
3237 {
3238 for (auto&p : pgs) {
3239 if (p == skip)
3240 continue;
3241 if (p.shard == shard)
3242 return p;
3243 }
3244 return pg_shard_t();
3245 }
3246
3247 void PG::_update_calc_stats()
3248 {
3249 info.stats.version = info.last_update;
3250 info.stats.created = info.history.epoch_created;
3251 info.stats.last_scrub = info.history.last_scrub;
3252 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
3253 info.stats.last_deep_scrub = info.history.last_deep_scrub;
3254 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
3255 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
3256 info.stats.last_epoch_clean = info.history.last_epoch_clean;
3257
3258 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
3259 info.stats.ondisk_log_size = info.stats.log_size;
3260 info.stats.log_start = pg_log.get_tail();
3261 info.stats.ondisk_log_start = pg_log.get_tail();
3262 info.stats.snaptrimq_len = snap_trimq.size();
3263
3264 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
3265
3266 // In the rare case that upset is too large (usually transient), use it as
3267 // the target for the calculations below.
3268 unsigned target = std::max(num_shards, (unsigned)upset.size());
3269 // For undersized PGs the actingset may be larger than upset when OSDs are out
3270 unsigned nrep = std::max(actingset.size(), upset.size());
3271 // calc num_object_copies
3272 info.stats.stats.calc_copies(std::max(target, nrep));
3273 info.stats.stats.sum.num_objects_degraded = 0;
3274 info.stats.stats.sum.num_objects_unfound = 0;
3275 info.stats.stats.sum.num_objects_misplaced = 0;
3276 info.stats.avail_no_missing.clear();
3277 info.stats.object_location_counts.clear();
3278
3279 // We should never hit this condition, but if we end up hitting it,
3280 // make sure to update num_objects and set PG_STATE_INCONSISTENT.
3281 if (info.stats.stats.sum.num_objects < 0) {
3282 dout(0) << __func__ << " negative num_objects = "
3283 << info.stats.stats.sum.num_objects << " setting it to 0 "
3284 << dendl;
3285 info.stats.stats.sum.num_objects = 0;
3286 state_set(PG_STATE_INCONSISTENT);
3287 }
3288
3289 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
3290 dout(20) << __func__ << " actingset " << actingset << " upset "
3291 << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
3292 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
3293
3294 ceph_assert(!acting_recovery_backfill.empty());
3295
3296 bool estimate = false;
3297
3298 // NOTE: we only generate degraded, misplaced and unfound
3299 // values for the summation, not individual stat categories.
3300 int64_t num_objects = info.stats.stats.sum.num_objects;
3301
3302 // Objects missing from up nodes, sorted by # objects.
3303 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
3304 // Objects missing from nodes not in up, sorted by # objects.
3305 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
3306
3307 // Fill missing_target_objects/acting_source_objects
3308
3309 {
3310 int64_t missing;
3311
3312 // Primary first
3313 missing = pg_log.get_missing().num_missing();
3314 ceph_assert(acting_recovery_backfill.count(pg_whoami));
3315 if (upset.count(pg_whoami)) {
3316 missing_target_objects.insert(make_pair(missing, pg_whoami));
3317 } else {
3318 acting_source_objects.insert(make_pair(missing, pg_whoami));
3319 }
3320 info.stats.stats.sum.num_objects_missing_on_primary = missing;
3321 if (missing == 0)
3322 info.stats.avail_no_missing.push_back(pg_whoami);
3323 dout(20) << __func__ << " shard " << pg_whoami
3324 << " primary objects " << num_objects
3325 << " missing " << missing
3326 << dendl;
3327 }
3328
3329 // All other peers
3330 for (auto& peer : peer_info) {
3331 // Primary should not be in the peer_info, skip if it is.
3332 if (peer.first == pg_whoami) continue;
3333 int64_t missing = 0;
3334 int64_t peer_num_objects = peer.second.stats.stats.sum.num_objects;
3335 // Backfill targets always track num_objects accurately;
3336 // all other peers track missing accurately.
3337 if (is_backfill_targets(peer.first)) {
3338 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3339 } else {
3340 if (peer_missing.count(peer.first)) {
3341 missing = peer_missing[peer.first].num_missing();
3342 } else {
3343 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
3344 if (is_recovering()) {
3345 estimate = true;
3346 }
3347 missing = std::max((int64_t)0, num_objects - peer_num_objects);
3348 }
3349 }
3350 if (upset.count(peer.first)) {
3351 missing_target_objects.insert(make_pair(missing, peer.first));
3352 } else if (actingset.count(peer.first)) {
3353 acting_source_objects.insert(make_pair(missing, peer.first));
3354 }
3355 peer.second.stats.stats.sum.num_objects_missing = missing;
3356 if (missing == 0)
3357 info.stats.avail_no_missing.push_back(peer.first);
3358 dout(20) << __func__ << " shard " << peer.first
3359 << " objects " << peer_num_objects
3360 << " missing " << missing
3361 << dendl;
3362 }
3363
3364 // Compute object_location_counts
3365 for (auto& ml: missing_loc.get_missing_locs()) {
3366 info.stats.object_location_counts[ml.second]++;
3367 dout(30) << __func__ << " " << ml.first << " object_location_counts["
3368 << ml.second << "]=" << info.stats.object_location_counts[ml.second]
3369 << dendl;
3370 }
3371 int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
3372 if (not_missing) {
3373 // During recovery we know upset == actingset and it is being populated;
3374 // during backfill we know that all non-missing objects are in the actingset.
3375 info.stats.object_location_counts[actingset] = not_missing;
3376 }
3377 dout(30) << __func__ << " object_location_counts["
3378 << upset << "]=" << info.stats.object_location_counts[upset]
3379 << dendl;
3380 dout(20) << __func__ << " object_location_counts "
3381 << info.stats.object_location_counts << dendl;
3382
3383 // A misplaced object is not stored on the correct OSD
3384 int64_t misplaced = 0;
3385 // a degraded object has fewer replicas or EC shards than the pool specifies.
3386 int64_t degraded = 0;
3387
3388 if (is_recovering()) {
3389 for (auto& sml: missing_loc.get_missing_by_count()) {
3390 for (auto& ml: sml.second) {
3391 int missing_shards;
3392 if (sml.first == shard_id_t::NO_SHARD) {
3393 dout(20) << __func__ << " ml " << ml.second << " upset size " << upset.size() << " up " << ml.first.up << dendl;
3394 missing_shards = (int)upset.size() - ml.first.up;
3395 } else {
3396 // Handle shards not even in upset below
3397 if (!find_shard(upset, sml.first))
3398 continue;
3399 missing_shards = std::max(0, 1 - ml.first.up);
3400 dout(20) << __func__ << " shard " << sml.first << " ml " << ml.second << " missing shards " << missing_shards << dendl;
3401 }
3402 int odegraded = ml.second * missing_shards;
3403 // Copies on other osds but limited to the possible degraded
3404 int more_osds = std::min(missing_shards, ml.first.other);
3405 int omisplaced = ml.second * more_osds;
3406 ceph_assert(omisplaced <= odegraded);
3407 odegraded -= omisplaced;
3408
3409 misplaced += omisplaced;
3410 degraded += odegraded;
3411 }
3412 }
3413
3414 dout(20) << __func__ << " missing based degraded " << degraded << dendl;
3415 dout(20) << __func__ << " missing based misplaced " << misplaced << dendl;
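// Illustrative example (replicated pool with upset.size() == 3): a
// missing_by_count entry covering 10 objects, each currently readable on 1 up
// OSD and 1 other OSD, gives missing_shards = 3 - 1 = 2, odegraded = 10 * 2 =
// 20 and omisplaced = 10 * min(2, 1) = 10, so that entry contributes 10
// misplaced and 20 - 10 = 10 degraded.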
3416
3417 // Handle undersized case
3418 if (pool.info.is_replicated()) {
3419 // Add degraded for missing targets (num_objects missing)
3420 ceph_assert(target >= upset.size());
3421 unsigned needed = target - upset.size();
3422 degraded += num_objects * needed;
3423 } else {
3424 for (unsigned i = 0 ; i < num_shards; ++i) {
3425 shard_id_t shard(i);
3426
3427 if (!find_shard(upset, shard)) {
3428 pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
3429
3430 if (pgs != pg_shard_t()) {
3431 int64_t missing;
3432
3433 if (pgs == pg_whoami)
3434 missing = info.stats.stats.sum.num_objects_missing_on_primary;
3435 else
3436 missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
3437
3438 degraded += missing;
3439 misplaced += std::max((int64_t)0, num_objects - missing);
3440 } else {
3441 // No shard anywhere
3442 degraded += num_objects;
3443 }
3444 }
3445 }
3446 }
3447 goto out;
3448 }
3449
3450 // Handle undersized case
3451 if (pool.info.is_replicated()) {
3452 // Add to missing_target_objects
3453 ceph_assert(target >= missing_target_objects.size());
3454 unsigned needed = target - missing_target_objects.size();
3455 if (needed)
3456 missing_target_objects.insert(make_pair(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD)));
3457 } else {
3458 for (unsigned i = 0 ; i < num_shards; ++i) {
3459 shard_id_t shard(i);
3460 bool found = false;
3461 for (const auto& t : missing_target_objects) {
3462 if (std::get<1>(t).shard == shard) {
3463 found = true;
3464 break;
3465 }
3466 }
3467 if (!found)
3468 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
3469 }
3470 }
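// Undersized placeholders: for replicated pools a single pg_shard_t(NO_OSD)
// entry carries num_objects * needed, while for EC pools each shard with no
// target gets its own NO_OSD entry carrying num_objects. The pairing loop
// below then treats these placeholders like any other target.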
3471
3472 for (const auto& item : missing_target_objects)
3473 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3474 for (const auto& item : acting_source_objects)
3475 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
3476
3477 // Handle all objects not in missing for remapped
3478 // or backfill
3479 for (auto m = missing_target_objects.rbegin();
3480 m != missing_target_objects.rend(); ++m) {
3481
3482 int64_t extra_missing = -1;
3483
3484 if (pool.info.is_replicated()) {
3485 if (!acting_source_objects.empty()) {
3486 auto extra_copy = acting_source_objects.begin();
3487 extra_missing = std::get<0>(*extra_copy);
3488 acting_source_objects.erase(extra_copy);
3489 }
3490 } else { // Erasure coded
3491 // Use corresponding shard
3492 for (const auto& a : acting_source_objects) {
3493 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
3494 extra_missing = std::get<0>(a);
3495 acting_source_objects.erase(a);
3496 break;
3497 }
3498 }
3499 }
3500
3501 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
3502 // We don't know which of the objects on the target
3503 // are part of extra_missing, so assume they are all degraded.
3504 misplaced += std::get<0>(*m) - extra_missing;
3505 degraded += extra_missing;
3506 } else {
3507 // 1. extra_missing == -1: more targets than sources, so the target's objects are degraded
3508 // 2. extra_missing > std::get<0>(*m): the source is missing more than the target,
3509 // i.e. some previously degraded objects are now present on the target.
3510 degraded += std::get<0>(*m);
3511 }
3512 }
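// Illustrative example: a backfill target missing 100 objects paired with an
// acting source missing 10 gives misplaced += 100 - 10 = 90 (objects present
// on the source but not yet on the target) and degraded += 10 (objects the
// source is also missing).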
3513 // If there are still acting that haven't been accounted for
3514 // then they are misplaced
3515 for (const auto& a : acting_source_objects) {
3516 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
3517 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
3518 misplaced += extra_misplaced;
3519 }
3520 out:
3521 // NOTE: Tests use these messages to verify this code
3522 dout(20) << __func__ << " degraded " << degraded << (estimate ? " (est)": "") << dendl;
3523 dout(20) << __func__ << " misplaced " << misplaced << (estimate ? " (est)": "")<< dendl;
3524
3525 info.stats.stats.sum.num_objects_degraded = degraded;
3526 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
3527 info.stats.stats.sum.num_objects_misplaced = misplaced;
3528 }
3529 }
3530
3531 void PG::_update_blocked_by()
3532 {
3533 // set a max on the number of blocking peers we report. if we go
3534 // over, report a random subset. keep the result sorted.
3535 unsigned keep = std::min<unsigned>(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
3536 unsigned skip = blocked_by.size() - keep;
3537 info.stats.blocked_by.clear();
3538 info.stats.blocked_by.resize(keep);
3539 unsigned pos = 0;
3540 for (set<int>::iterator p = blocked_by.begin();
3541 p != blocked_by.end() && keep > 0;
3542 ++p) {
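// This is essentially selection sampling (Knuth's Algorithm S): skip the
// current element with probability skip / (skip + keep), where skip and keep
// are the counts still outstanding. That yields a uniformly random subset of
// the requested size while preserving the sorted order of blocked_by.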
3543 if (skip > 0 && (rand() % (skip + keep) < skip)) {
3544 --skip;
3545 } else {
3546 info.stats.blocked_by[pos++] = *p;
3547 --keep;
3548 }
3549 }
3550 }
3551
3552 void PG::publish_stats_to_osd()
3553 {
3554 if (!is_primary())
3555 return;
3556
3557 pg_stats_publish_lock.Lock();
3558
3559 if (info.stats.stats.sum.num_scrub_errors)
3560 state_set(PG_STATE_INCONSISTENT);
3561 else {
3562 state_clear(PG_STATE_INCONSISTENT);
3563 state_clear(PG_STATE_FAILED_REPAIR);
3564 }
3565
3566 utime_t now = ceph_clock_now();
3567 if (info.stats.state != state) {
3568 info.stats.last_change = now;
3569 // Optimistic estimate: if we just found out the PG is inactive,
3570 // assume it was active until now.
3571 if (!(state & PG_STATE_ACTIVE) &&
3572 (info.stats.state & PG_STATE_ACTIVE))
3573 info.stats.last_active = now;
3574
3575 if ((state & PG_STATE_ACTIVE) &&
3576 !(info.stats.state & PG_STATE_ACTIVE))
3577 info.stats.last_became_active = now;
3578 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
3579 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
3580 info.stats.last_became_peered = now;
3581 info.stats.state = state;
3582 }
3583
3584 _update_calc_stats();
3585 if (info.stats.stats.sum.num_objects_degraded) {
3586 state_set(PG_STATE_DEGRADED);
3587 } else {
3588 state_clear(PG_STATE_DEGRADED);
3589 }
3590 _update_blocked_by();
3591
3592 pg_stat_t pre_publish = info.stats;
3593 pre_publish.stats.add(unstable_stats);
3594 utime_t cutoff = now;
3595 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
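// Stats are re-published below only if something changed since the last
// publish, or if the last publish is older than
// osd_pg_stat_report_interval_max seconds (i.e. last_fresh <= cutoff).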
3596
3597 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_MIMIC) {
3598 // share (some of) our purged_snaps via the pg_stats. limit # of intervals
3599 // because we don't want to make the pg_stat_t structures too expensive.
3600 unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
3601 unsigned num = 0;
3602 auto i = info.purged_snaps.begin();
3603 while (num < max && i != info.purged_snaps.end()) {
3604 pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
3605 ++num;
3606 ++i;
3607 }
3608 dout(20) << __func__ << " reporting purged_snaps "
3609 << pre_publish.purged_snaps << dendl;
3610 }
3611
3612 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
3613 info.stats.last_fresh > cutoff) {
3614 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3615 << ": no change since " << info.stats.last_fresh << dendl;
3616 } else {
3617 // update our stat summary and timestamps
3618 info.stats.reported_epoch = get_osdmap_epoch();
3619 ++info.stats.reported_seq;
3620
3621 info.stats.last_fresh = now;
3622
3623 if (info.stats.state & PG_STATE_CLEAN)
3624 info.stats.last_clean = now;
3625 if (info.stats.state & PG_STATE_ACTIVE)
3626 info.stats.last_active = now;
3627 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
3628 info.stats.last_peered = now;
3629 info.stats.last_unstale = now;
3630 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
3631 info.stats.last_undegraded = now;
3632 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
3633 info.stats.last_fullsized = now;
3634
3635 pg_stats_publish_valid = true;
3636 pg_stats_publish = pre_publish;
3637
3638 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
3639 << ":" << pg_stats_publish.reported_seq << dendl;
3640 }
3641 pg_stats_publish_lock.Unlock();
3642 }
3643
3644 void PG::clear_publish_stats()
3645 {
3646 dout(15) << "clear_stats" << dendl;
3647 pg_stats_publish_lock.Lock();
3648 pg_stats_publish_valid = false;
3649 pg_stats_publish_lock.Unlock();
3650 }
3651
3652 /**
3653 * initialize a newly instantiated pg
3654 *
3655 * Initialize PG state, as when a PG is initially created, or when it
3656 * is first instantiated on the current node.
3657 *
3658 * @param role our role/rank
3659 * @param newup up set
3660 * @param newacting acting set
3661 * @param history pg history
3662 * @param pi past_intervals
3663 * @param backfill true if info should be marked as backfill
3664 * @param t transaction to write out our new state in
3665 */
3666 void PG::init(
3667 int role,
3668 const vector<int>& newup, int new_up_primary,
3669 const vector<int>& newacting, int new_acting_primary,
3670 const pg_history_t& history,
3671 const PastIntervals& pi,
3672 bool backfill,
3673 ObjectStore::Transaction *t)
3674 {
3675 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
3676 << " history " << history
3677 << " past_intervals " << pi
3678 << dendl;
3679
3680 set_role(role);
3681 init_primary_up_acting(
3682 newup,
3683 newacting,
3684 new_up_primary,
3685 new_acting_primary);
3686
3687 info.history = history;
3688 past_intervals = pi;
3689
3690 info.stats.up = up;
3691 info.stats.up_primary = new_up_primary;
3692 info.stats.acting = acting;
3693 info.stats.acting_primary = new_acting_primary;
3694 info.stats.mapping_epoch = info.history.same_interval_since;
3695
3696 if (backfill) {
3697 dout(10) << __func__ << ": Setting backfill" << dendl;
3698 info.set_last_backfill(hobject_t());
3699 info.last_complete = info.last_update;
3700 pg_log.mark_log_for_rewrite();
3701 }
3702
3703 on_new_interval();
3704
3705 dirty_info = true;
3706 dirty_big_info = true;
3707 write_if_dirty(*t);
3708 }
3709
3710 void PG::shutdown()
3711 {
3712 ch->flush();
3713 lock();
3714 on_shutdown();
3715 unlock();
3716 }
3717
3718 #pragma GCC diagnostic ignored "-Wpragmas"
3719 #pragma GCC diagnostic push
3720 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3721
3722 void PG::upgrade(ObjectStore *store)
3723 {
3724 dout(0) << __func__ << " " << info_struct_v << " -> " << latest_struct_v
3725 << dendl;
3726 ceph_assert(info_struct_v <= 10);
3727 ObjectStore::Transaction t;
3728
3729 // <do upgrade steps here>
3730
3731 // finished upgrade!
3732 ceph_assert(info_struct_v == 10);
3733
3734 // update infover_key
3735 if (info_struct_v < latest_struct_v) {
3736 map<string,bufferlist> v;
3737 __u8 ver = latest_struct_v;
3738 encode(ver, v[infover_key]);
3739 t.omap_setkeys(coll, pgmeta_oid, v);
3740 }
3741
3742 dirty_info = true;
3743 dirty_big_info = true;
3744 write_if_dirty(t);
3745
3746 ObjectStore::CollectionHandle ch = store->open_collection(coll);
3747 int r = store->queue_transaction(ch, std::move(t));
3748 if (r != 0) {
3749 derr << __func__ << ": queue_transaction returned "
3750 << cpp_strerror(r) << dendl;
3751 ceph_abort();
3752 }
3753 ceph_assert(r == 0);
3754
3755 C_SaferCond waiter;
3756 if (!ch->flush_commit(&waiter)) {
3757 waiter.wait();
3758 }
3759 }
3760
3761 #pragma GCC diagnostic pop
3762 #pragma GCC diagnostic warning "-Wpragmas"
3763
3764 int PG::_prepare_write_info(CephContext* cct,
3765 map<string,bufferlist> *km,
3766 epoch_t epoch,
3767 pg_info_t &info, pg_info_t &last_written_info,
3768 PastIntervals &past_intervals,
3769 bool dirty_big_info,
3770 bool dirty_epoch,
3771 bool try_fast_info,
3772 PerfCounters *logger)
3773 {
3774 if (dirty_epoch) {
3775 encode(epoch, (*km)[epoch_key]);
3776 }
3777
3778 if (logger)
3779 logger->inc(l_osd_pg_info);
3780
3781 // try to do info efficiently?
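// Fast-info path: if only the fields covered by pg_fast_info_t changed since
// the last write (and nothing "big" is dirty), encode that small delta under
// fastinfo_key instead of re-encoding the whole pg_info_t.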
3782 if (!dirty_big_info && try_fast_info &&
3783 info.last_update > last_written_info.last_update) {
3784 pg_fast_info_t fast;
3785 fast.populate_from(info);
3786 bool did = fast.try_apply_to(&last_written_info);
3787 ceph_assert(did); // we verified last_update increased above
3788 if (info == last_written_info) {
3789 encode(fast, (*km)[fastinfo_key]);
3790 if (logger)
3791 logger->inc(l_osd_pg_fastinfo);
3792 return 0;
3793 }
3794 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3795 {
3796 JSONFormatter jf(true);
3797 jf.dump_object("info", info);
3798 jf.flush(*_dout);
3799 }
3800 {
3801 *_dout << "\nlast_written_info:\n";
3802 JSONFormatter jf(true);
3803 jf.dump_object("last_written_info", last_written_info);
3804 jf.flush(*_dout);
3805 }
3806 *_dout << dendl;
3807 }
3808 last_written_info = info;
3809
3810 // info. store purged_snaps separately.
3811 interval_set<snapid_t> purged_snaps;
3812 purged_snaps.swap(info.purged_snaps);
3813 encode(info, (*km)[info_key]);
3814 purged_snaps.swap(info.purged_snaps);
3815
3816 if (dirty_big_info) {
3817 // potentially big stuff
3818 bufferlist& bigbl = (*km)[biginfo_key];
3819 encode(past_intervals, bigbl);
3820 encode(info.purged_snaps, bigbl);
3821 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3822 if (logger)
3823 logger->inc(l_osd_pg_biginfo);
3824 }
3825
3826 return 0;
3827 }
3828
3829 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3830 {
3831 coll_t coll(pgid);
3832 t.create_collection(coll, bits);
3833 }
3834
3835 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3836 {
3837 coll_t coll(pgid);
3838
3839 if (pool) {
3840 // Give a hint to the PG collection
3841 bufferlist hint;
3842 uint32_t pg_num = pool->get_pg_num();
3843 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3844 encode(pg_num, hint);
3845 encode(expected_num_objects_pg, hint);
3846 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3847 t.collection_hint(coll, hint_type, hint);
3848 }
3849
3850 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3851 t.touch(coll, pgmeta_oid);
3852 map<string,bufferlist> values;
3853 __u8 struct_v = latest_struct_v;
3854 encode(struct_v, values[infover_key]);
3855 t.omap_setkeys(coll, pgmeta_oid, values);
3856 }
3857
3858 void PG::prepare_write_info(map<string,bufferlist> *km)
3859 {
3860 info.stats.stats.add(unstable_stats);
3861 unstable_stats.clear();
3862
3863 bool need_update_epoch = last_epoch < get_osdmap_epoch();
3864 int ret = _prepare_write_info(cct, km, get_osdmap_epoch(),
3865 info,
3866 last_written_info,
3867 past_intervals,
3868 dirty_big_info, need_update_epoch,
3869 cct->_conf->osd_fast_info,
3870 osd->logger);
3871 ceph_assert(ret == 0);
3872 if (need_update_epoch)
3873 last_epoch = get_osdmap_epoch();
3874 last_persisted_osdmap = last_epoch;
3875
3876 dirty_info = false;
3877 dirty_big_info = false;
3878 }
3879
3880 #pragma GCC diagnostic ignored "-Wpragmas"
3881 #pragma GCC diagnostic push
3882 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3883
3884 bool PG::_has_removal_flag(ObjectStore *store,
3885 spg_t pgid)
3886 {
3887 coll_t coll(pgid);
3888 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3889
3890 // first try new way
3891 set<string> keys;
3892 keys.insert("_remove");
3893 map<string,bufferlist> values;
3894 auto ch = store->open_collection(coll);
3895 ceph_assert(ch);
3896 if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
3897 values.size() == 1)
3898 return true;
3899
3900 return false;
3901 }
3902
3903 int PG::peek_map_epoch(ObjectStore *store,
3904 spg_t pgid,
3905 epoch_t *pepoch)
3906 {
3907 coll_t coll(pgid);
3908 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3909 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3910 epoch_t cur_epoch = 0;
3911
3912 // validate collection name
3913 ceph_assert(coll.is_pg());
3914
3915 // try for v8
3916 set<string> keys;
3917 keys.insert(infover_key);
3918 keys.insert(epoch_key);
3919 map<string,bufferlist> values;
3920 auto ch = store->open_collection(coll);
3921 ceph_assert(ch);
3922 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
3923 if (r == 0) {
3924 ceph_assert(values.size() == 2);
3925
3926 // sanity check version
3927 auto bp = values[infover_key].cbegin();
3928 __u8 struct_v = 0;
3929 decode(struct_v, bp);
3930 ceph_assert(struct_v >= 8);
3931
3932 // get epoch
3933 bp = values[epoch_key].begin();
3934 decode(cur_epoch, bp);
3935 } else {
3936 // probably bug 10617; see OSD::load_pgs()
3937 return -1;
3938 }
3939
3940 *pepoch = cur_epoch;
3941 return 0;
3942 }
3943
3944 #pragma GCC diagnostic pop
3945 #pragma GCC diagnostic warning "-Wpragmas"
3946
3947 void PG::write_if_dirty(ObjectStore::Transaction& t)
3948 {
3949 map<string,bufferlist> km;
3950 if (dirty_big_info || dirty_info)
3951 prepare_write_info(&km);
3952 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3953 if (!km.empty())
3954 t.omap_setkeys(coll, pgmeta_oid, km);
3955 }
3956
3957 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3958 {
3959 // raise last_complete only if we were previously up to date
3960 if (info.last_complete == info.last_update)
3961 info.last_complete = e.version;
3962
3963 // raise last_update.
3964 ceph_assert(e.version > info.last_update);
3965 info.last_update = e.version;
3966
3967 // raise user_version, if it increased (it may not have been bumped
3968 // by all logged updates)
3969 if (e.user_version > info.last_user_version)
3970 info.last_user_version = e.user_version;
3971
3972 // log mutation
3973 pg_log.add(e, applied);
3974 dout(10) << "add_log_entry " << e << dendl;
3975 }
3976
3977
3978 void PG::append_log(
3979 const vector<pg_log_entry_t>& logv,
3980 eversion_t trim_to,
3981 eversion_t roll_forward_to,
3982 ObjectStore::Transaction &t,
3983 bool transaction_applied,
3984 bool async)
3985 {
3986 if (transaction_applied)
3987 update_snap_map(logv, t);
3988
3989 /* The primary has sent an info updating the history, but it may not
3990 * have arrived yet. We want to make sure that we cannot remember this
3991 * write without remembering that it happened in an interval which went
3992 * active in epoch history.last_epoch_started.
3993 */
3994 if (info.last_epoch_started != info.history.last_epoch_started) {
3995 info.history.last_epoch_started = info.last_epoch_started;
3996 }
3997 if (info.last_interval_started != info.history.last_interval_started) {
3998 info.history.last_interval_started = info.last_interval_started;
3999 }
4000 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
4001
4002 PGLogEntryHandler handler{this, &t};
4003 if (!transaction_applied) {
4004 /* We must be a backfill or async recovery peer, so it's ok if we apply
4005 * out-of-turn since we won't be considered when
4006 * determining a min possible last_update.
4007 *
4008 * We skip_rollforward() here, which advances the crt, without
4009 * doing an actual rollforward. This avoids cleaning up entries
4010 * from the backend, and we do not end up in a situation where the
4011 * object is deleted before we can _merge_object_divergent_entries().
4012 */
4013 pg_log.skip_rollforward();
4014 }
4015
4016 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
4017 p != logv.end();
4018 ++p) {
4019 add_log_entry(*p, transaction_applied);
4020
4021 /* We don't want to leave the rollforward artifacts around
4022 * here past last_backfill. It's ok for the same reason as
4023 * above */
4024 if (transaction_applied &&
4025 p->soid > info.last_backfill) {
4026 pg_log.roll_forward(&handler);
4027 }
4028 }
4029 auto last = logv.rbegin();
4030 if (is_primary() && last != logv.rend()) {
4031 projected_log.skip_can_rollback_to_to_head();
4032 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
4033 }
4034
4035 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
4036 pg_log.roll_forward_to(
4037 roll_forward_to,
4038 &handler);
4039 last_rollback_info_trimmed_to_applied = roll_forward_to;
4040 }
4041
4042 dout(10) << __func__ << " approx pg log length = "
4043 << pg_log.get_log().approx_size() << dendl;
4044 dout(10) << __func__ << " transaction_applied = "
4045 << transaction_applied << dendl;
4046 if (!transaction_applied || async)
4047 dout(10) << __func__ << " " << pg_whoami
4048 << " is async_recovery or backfill target" << dendl;
4049 pg_log.trim(trim_to, info, transaction_applied, async);
4050
4051 // update the local pg, pg log
4052 dirty_info = true;
4053 write_if_dirty(t);
4054 }
4055
4056 bool PG::check_log_for_corruption(ObjectStore *store)
4057 {
4058 /// TODO: this method needs to work with the omap log
4059 return true;
4060 }
4061
4062 //! Get the name we're going to save our corrupt pg log as
4063 std::string PG::get_corrupt_pg_log_name() const
4064 {
4065 const int MAX_BUF = 512;
4066 char buf[MAX_BUF];
4067 struct tm tm_buf;
4068 time_t my_time(time(NULL));
4069 const struct tm *t = localtime_r(&my_time, &tm_buf);
4070 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
4071 if (ret == 0) {
4072 dout(0) << "strftime failed" << dendl;
4073 return "corrupt_log_unknown_time";
4074 }
4075 string out(buf);
4076 out += stringify(info.pgid);
4077 return out;
4078 }
4079
4080 int PG::read_info(
4081 ObjectStore *store, spg_t pgid, const coll_t &coll,
4082 pg_info_t &info, PastIntervals &past_intervals,
4083 __u8 &struct_v)
4084 {
4085 set<string> keys;
4086 keys.insert(infover_key);
4087 keys.insert(info_key);
4088 keys.insert(biginfo_key);
4089 keys.insert(fastinfo_key);
4090 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
4091 map<string,bufferlist> values;
4092 auto ch = store->open_collection(coll);
4093 ceph_assert(ch);
4094 int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
4095 ceph_assert(r == 0);
4096 ceph_assert(values.size() == 3 ||
4097 values.size() == 4);
4098
4099 auto p = values[infover_key].cbegin();
4100 decode(struct_v, p);
4101 ceph_assert(struct_v >= 10);
4102
4103 p = values[info_key].begin();
4104 decode(info, p);
4105
4106 p = values[biginfo_key].begin();
4107 decode(past_intervals, p);
4108 decode(info.purged_snaps, p);
4109
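// If a fastinfo delta was written (see _prepare_write_info), apply it on top
// of the base info so we end up with the most recent values.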
4110 p = values[fastinfo_key].begin();
4111 if (!p.end()) {
4112 pg_fast_info_t fast;
4113 decode(fast, p);
4114 fast.try_apply_to(&info);
4115 }
4116 return 0;
4117 }
4118
4119 void PG::read_state(ObjectStore *store)
4120 {
4121 int r = read_info(store, pg_id, coll, info, past_intervals,
4122 info_struct_v);
4123 ceph_assert(r >= 0);
4124
4125 if (info_struct_v < compat_struct_v) {
4126 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
4127 << " an older version first." << dendl;
4128 ceph_abort_msg("PG too old to upgrade");
4129 }
4130
4131 last_written_info = info;
4132
4133 ostringstream oss;
4134 pg_log.read_log_and_missing(
4135 store,
4136 ch,
4137 pgmeta_oid,
4138 info,
4139 oss,
4140 cct->_conf->osd_ignore_stale_divergent_priors,
4141 cct->_conf->osd_debug_verify_missing_on_start);
4142 if (oss.tellp())
4143 osd->clog->error() << oss.str();
4144
4145 // log any weirdness
4146 log_weirdness();
4147
4148 if (info_struct_v < latest_struct_v) {
4149 upgrade(store);
4150 }
4151
4152 // initialize current mapping
4153 {
4154 int primary, up_primary;
4155 vector<int> acting, up;
4156 get_osdmap()->pg_to_up_acting_osds(
4157 pg_id.pgid, &up, &up_primary, &acting, &primary);
4158 init_primary_up_acting(
4159 up,
4160 acting,
4161 up_primary,
4162 primary);
4163 int rr = OSDMap::calc_pg_role(osd->whoami, acting);
4164 if (pool.info.is_replicated() || rr == pg_whoami.shard)
4165 set_role(rr);
4166 else
4167 set_role(-1);
4168 }
4169
4170 // init pool options
4171 store->set_collection_opts(ch, pool.info.opts);
4172
4173 PG::RecoveryCtx rctx(0, 0, 0, new ObjectStore::Transaction);
4174 handle_initialize(&rctx);
4175 // note: we don't activate here because we know the OSD will advance maps
4176 // during boot.
4177 write_if_dirty(*rctx.transaction);
4178 store->queue_transaction(ch, std::move(*rctx.transaction));
4179 delete rctx.transaction;
4180 }
4181
4182 void PG::log_weirdness()
4183 {
4184 if (pg_log.get_tail() != info.log_tail)
4185 osd->clog->error() << info.pgid
4186 << " info mismatch, log.tail " << pg_log.get_tail()
4187 << " != info.log_tail " << info.log_tail;
4188 if (pg_log.get_head() != info.last_update)
4189 osd->clog->error() << info.pgid
4190 << " info mismatch, log.head " << pg_log.get_head()
4191 << " != info.last_update " << info.last_update;
4192
4193 if (!pg_log.get_log().empty()) {
4194 // sloppy check
4195 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
4196 osd->clog->error() << info.pgid
4197 << " log bound mismatch, info (tail,head] ("
4198 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
4199 << " actual ["
4200 << pg_log.get_log().log.begin()->version << ","
4201 << pg_log.get_log().log.rbegin()->version << "]";
4202 }
4203
4204 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
4205 osd->clog->error() << info.pgid
4206 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
4207 << " > log size " << pg_log.get_log().log.size();
4208 }
4209 }
4210
4211 void PG::update_snap_map(
4212 const vector<pg_log_entry_t> &log_entries,
4213 ObjectStore::Transaction &t)
4214 {
4215 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
4216 i != log_entries.end();
4217 ++i) {
4218 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4219 if (i->soid.snap < CEPH_MAXSNAP) {
4220 if (i->is_delete()) {
4221 int r = snap_mapper.remove_oid(
4222 i->soid,
4223 &_t);
4224 if (r != 0)
4225 derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
4226 // On removal tolerate missing key corruption
4227 ceph_assert(r == 0 || r == -ENOENT);
4228 } else if (i->is_update()) {
4229 ceph_assert(i->snaps.length() > 0);
4230 vector<snapid_t> snaps;
4231 bufferlist snapbl = i->snaps;
4232 auto p = snapbl.cbegin();
4233 try {
4234 decode(snaps, p);
4235 } catch (...) {
4236 derr << __func__ << " decode snaps failure on " << *i << dendl;
4237 snaps.clear();
4238 }
4239 set<snapid_t> _snaps(snaps.begin(), snaps.end());
4240
4241 if (i->is_clone() || i->is_promote()) {
4242 snap_mapper.add_oid(
4243 i->soid,
4244 _snaps,
4245 &_t);
4246 } else if (i->is_modify()) {
4247 int r = snap_mapper.update_snaps(
4248 i->soid,
4249 _snaps,
4250 0,
4251 &_t);
4252 ceph_assert(r == 0);
4253 } else {
4254 ceph_assert(i->is_clean());
4255 }
4256 }
4257 }
4258 }
4259 }
4260
4261 /**
4262 * filter trimming|trimmed snaps out of snapcontext
4263 */
4264 void PG::filter_snapc(vector<snapid_t> &snaps)
4265 {
4266 // nothing needs to trim, we can return immediately
4267 if (snap_trimq.empty() && info.purged_snaps.empty())
4268 return;
4269
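// For example, with snaps = [8, 5, 2] and snap 5 in snap_trimq or
// purged_snaps, we start copying into newsnaps at the first filtered entry
// and end up with snaps = [8, 2]; if nothing matches we never copy at all.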
4270 bool filtering = false;
4271 vector<snapid_t> newsnaps;
4272 for (vector<snapid_t>::iterator p = snaps.begin();
4273 p != snaps.end();
4274 ++p) {
4275 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
4276 if (!filtering) {
4277 // start building a new vector with what we've seen so far
4278 dout(10) << "filter_snapc filtering " << snaps << dendl;
4279 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
4280 filtering = true;
4281 }
4282 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
4283 } else {
4284 if (filtering)
4285 newsnaps.push_back(*p); // continue building new vector
4286 }
4287 }
4288 if (filtering) {
4289 snaps.swap(newsnaps);
4290 dout(10) << "filter_snapc result " << snaps << dendl;
4291 }
4292 }
4293
4294 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
4295 {
4296 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
4297 it != m.end();
4298 ++it)
4299 requeue_ops(it->second);
4300 m.clear();
4301 }
4302
4303 void PG::requeue_op(OpRequestRef op)
4304 {
4305 auto p = waiting_for_map.find(op->get_source());
4306 if (p != waiting_for_map.end()) {
4307 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
4308 << dendl;
4309 p->second.push_front(op);
4310 } else {
4311 dout(20) << __func__ << " " << op << dendl;
4312 osd->enqueue_front(
4313 OpQueueItem(
4314 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
4315 op->get_req()->get_cost(),
4316 op->get_req()->get_priority(),
4317 op->get_req()->get_recv_stamp(),
4318 op->get_req()->get_source().num(),
4319 get_osdmap_epoch()));
4320 }
4321 }
4322
4323 void PG::requeue_ops(list<OpRequestRef> &ls)
4324 {
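// Walk the list back to front: each requeue_op() pushes to the front of the
// target queue, so iterating in reverse preserves the original ordering of
// the ops once they have all been requeued.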
4325 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
4326 i != ls.rend();
4327 ++i) {
4328 requeue_op(*i);
4329 }
4330 ls.clear();
4331 }
4332
4333 void PG::requeue_map_waiters()
4334 {
4335 epoch_t epoch = get_osdmap_epoch();
4336 auto p = waiting_for_map.begin();
4337 while (p != waiting_for_map.end()) {
4338 if (epoch < p->second.front()->min_epoch) {
4339 dout(20) << __func__ << " " << p->first << " front op "
4340 << p->second.front() << " must still wait, doing nothing"
4341 << dendl;
4342 ++p;
4343 } else {
4344 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
4345 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
4346 auto req = *q;
4347 osd->enqueue_front(OpQueueItem(
4348 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, req)),
4349 req->get_req()->get_cost(),
4350 req->get_req()->get_priority(),
4351 req->get_req()->get_recv_stamp(),
4352 req->get_req()->get_source().num(),
4353 epoch));
4354 }
4355 p = waiting_for_map.erase(p);
4356 }
4357 }
4358 }
4359
4360
4361 // ==========================================================================================
4362 // SCRUB
4363
4364 /*
4365 * when holding pg and sched_scrub_lock, then the states are:
4366 * scheduling:
4367 * scrubber.local_reserved = true
4368 * scrubber.active = false
4369 * scrubber.reserved_peers includes whoami
4370 * osd->scrubs_local++
4371 * scheduling, replica declined:
4372 * scrubber.local_reserved = true
4373 * scrubber.reserved_peers includes -1
4374 * osd->scrub_local++
4375 * pending:
4376 * scrubber.local_reserved = true
4377 * scrubber.active = false
4378 * scrubber.reserved_peers.size() == acting.size();
4379 * pg on scrub_wq
4380 * osd->scrub_local++
4381 * scrubbing:
4382 * scrubber.local_reserved = true;
4383 * scrubber.active = true
4384 * scrubber.reserved_peers empty
4385 */
4386
4387 // returns true if a scrub has been newly kicked off
4388 bool PG::sched_scrub()
4389 {
4390 ceph_assert(is_locked());
4391 ceph_assert(!is_scrubbing());
4392 if (!(is_primary() && is_active() && is_clean())) {
4393 return false;
4394 }
4395
4396 // All processing the first time through commits us to whatever
4397 // choices are made.
4398 if (!scrubber.local_reserved) {
4399 dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
4400
4401 bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
4402 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
4403 bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
4404 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
4405 bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
4406 bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair
4407 && get_pgbackend()->auto_repair_supported());
4408
4409 scrubber.time_for_deep = false;
4410 // Clear these in case user issues the scrub/repair command during
4411 // the scheduling of the scrub/repair (e.g. request reservation)
4412 scrubber.deep_scrub_on_error = false;
4413 scrubber.auto_repair = false;
4414
4415 // All periodic scrub handling goes here because must_scrub is
4416 // always set for must_deep_scrub and must_repair.
4417 if (!scrubber.must_scrub) {
4418 ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair);
4419 // Handle deep scrub determination only if allowed
4420 if (allow_deep_scrub) {
4421 // Initial entry and scheduled scrubs without nodeep_scrub set get here
4422 if (scrubber.need_auto) {
4423 dout(20) << __func__ << ": need repair after scrub errors" << dendl;
4424 scrubber.time_for_deep = true;
4425 } else {
4426 double deep_scrub_interval = 0;
4427 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
4428 if (deep_scrub_interval <= 0) {
4429 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
4430 }
4431 scrubber.time_for_deep = ceph_clock_now() >=
4432 info.history.last_deep_scrub_stamp + deep_scrub_interval;
4433
4434 bool deep_coin_flip = false;
4435 // If we randomized when !allow_scrub && allow_deep_scrub, it would effectively
4436 // guarantee a deep scrub because this function is called often.
4437 if (!scrubber.time_for_deep && allow_scrub)
4438 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
4439 dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
4440
4441 scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip);
4442 }
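// Note: osd_deep_scrub_randomize_ratio acts here as a per-call probability of
// upgrading a regular scrub to a deep scrub, so deep scrubs tend to be spread
// out rather than all landing exactly on the interval boundary.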
4443
4444 if (!scrubber.time_for_deep && has_deep_errors) {
4445 osd->clog->info() << "osd." << osd->whoami
4446 << " pg " << info.pgid
4447 << " Deep scrub errors, upgrading scrub to deep-scrub";
4448 scrubber.time_for_deep = true;
4449 }
4450
4451 if (try_to_auto_repair) {
4452 if (scrubber.time_for_deep) {
4453 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
4454 scrubber.auto_repair = true;
4455 } else if (allow_scrub) {
4456 dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
4457 scrubber.deep_scrub_on_error = true;
4458 }
4459 }
4460 } else { // !allow_deep_scrub
4461 dout(20) << __func__ << ": nodeep_scrub set" << dendl;
4462 if (has_deep_errors) {
4463 osd->clog->error() << "osd." << osd->whoami
4464 << " pg " << info.pgid
4465 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
4466 return false;
4467 }
4468 }
4469
4470 //NOSCRUB so skip regular scrubs
4471 if (!allow_scrub && !scrubber.time_for_deep) {
4472 return false;
4473 }
4474 // scrubber.must_scrub
4475 } else if (!scrubber.must_deep_scrub && has_deep_errors) {
4476 osd->clog->error() << "osd." << osd->whoami
4477 << " pg " << info.pgid
4478 << " Regular scrub request, deep-scrub details will be lost";
4479 }
4480 // Unless precluded, this was handled above
4481 scrubber.need_auto = false;
4482
4483 ceph_assert(scrubber.reserved_peers.empty());
4484 bool allow_scrubbing = cct->_conf->osd_scrub_during_recovery ||
4485 (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) ||
4486 !osd->is_recovery_active();
4487 if (allow_scrubbing &&
4488 osd->inc_scrubs_local()) {
4489 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
4490 scrubber.local_reserved = true;
4491 scrubber.reserved_peers.insert(pg_whoami);
4492 scrub_reserve_replicas();
4493 } else {
4494 dout(20) << __func__ << ": failed to reserve locally" << dendl;
4495 return false;
4496 }
4497 }
4498
4499 if (scrubber.local_reserved) {
4500 if (scrubber.reserve_failed) {
4501 dout(20) << __func__ << ": failed, a peer declined" << dendl;
4502 clear_scrub_reserved();
4503 scrub_unreserve_replicas();
4504 return false;
4505 } else if (scrubber.reserved_peers.size() == actingset.size()) {
4506 dout(20) << __func__ << ": success, reserved self and replicas" << dendl;
4507 if (scrubber.time_for_deep) {
4508 dout(10) << __func__ << ": scrub will be deep" << dendl;
4509 state_set(PG_STATE_DEEP_SCRUB);
4510 scrubber.time_for_deep = false;
4511 }
4512 queue_scrub();
4513 } else {
4514 // none declined yet, since scrubber.reserve_failed is not set
4515 dout(20) << __func__ << ": reserved " << scrubber.reserved_peers
4516 << ", waiting for replicas" << dendl;
4517 }
4518 }
4519 return true;
4520 }
4521
4522 bool PG::is_scrub_registered()
4523 {
4524 return !scrubber.scrub_reg_stamp.is_zero();
4525 }
4526
4527 void PG::reg_next_scrub()
4528 {
4529 if (!is_primary())
4530 return;
4531
4532 utime_t reg_stamp;
4533 bool must = false;
4534 if (scrubber.must_scrub || scrubber.need_auto) {
4535 // Set the smallest time that isn't utime_t()
4536 reg_stamp = Scrubber::scrub_must_stamp();
4537 must = true;
4538 } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
4539 reg_stamp = ceph_clock_now();
4540 must = true;
4541 } else {
4542 reg_stamp = info.history.last_scrub_stamp;
4543 }
4544 // note down the sched_time, so we can locate this scrub, and remove it
4545 // later on.
4546 double scrub_min_interval = 0, scrub_max_interval = 0;
4547 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
4548 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
4549 ceph_assert(!is_scrub_registered());
4550 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
4551 reg_stamp,
4552 scrub_min_interval,
4553 scrub_max_interval,
4554 must);
4555 dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time "
4556 << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl;
4557 }
4558
4559 void PG::unreg_next_scrub()
4560 {
4561 if (is_scrub_registered()) {
4562 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
4563 scrubber.scrub_reg_stamp = utime_t();
4564 }
4565 }
4566
4567 void PG::on_info_history_change()
4568 {
4569 unreg_next_scrub();
4570 reg_next_scrub();
4571 }
4572
4573 void PG::scrub_requested(bool deep, bool repair, bool need_auto)
4574 {
4575 unreg_next_scrub();
4576 if (need_auto) {
4577 scrubber.need_auto = true;
4578 } else {
4579 scrubber.must_scrub = true;
4580 scrubber.must_deep_scrub = deep || repair;
4581 scrubber.must_repair = repair;
4582 // User might intervene, so clear this
4583 scrubber.need_auto = false;
4584 }
4585 reg_next_scrub();
4586 }
4587
4588 void PG::do_replica_scrub_map(OpRequestRef op)
4589 {
4590 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
4591 dout(7) << __func__ << " " << *m << dendl;
4592 if (m->map_epoch < info.history.same_interval_since) {
4593 dout(10) << __func__ << " discarding old from "
4594 << m->map_epoch << " < " << info.history.same_interval_since
4595 << dendl;
4596 return;
4597 }
4598 if (!scrubber.is_chunky_scrub_active()) {
4599 dout(10) << __func__ << " scrub isn't active" << dendl;
4600 return;
4601 }
4602
4603 op->mark_started();
4604
4605 auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
4606 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
4607 dout(10) << "map version is "
4608 << scrubber.received_maps[m->from].valid_through
4609 << dendl;
4610
4611 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
4612 << dendl;
4613 ceph_assert(scrubber.waiting_on_whom.count(m->from));
4614 scrubber.waiting_on_whom.erase(m->from);
4615 if (m->preempted) {
4616 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
4617 scrub_preempted = true;
4618 }
4619 if (scrubber.waiting_on_whom.empty()) {
4620 requeue_scrub(ops_blocked_by_scrub());
4621 }
4622 }
4623
4624 // send scrub v3 messages (chunky scrub)
4625 void PG::_request_scrub_map(
4626 pg_shard_t replica, eversion_t version,
4627 hobject_t start, hobject_t end,
4628 bool deep,
4629 bool allow_preemption)
4630 {
4631 ceph_assert(replica != pg_whoami);
4632 dout(10) << "scrub requesting scrubmap from osd." << replica
4633 << " deep " << (int)deep << dendl;
4634 MOSDRepScrub *repscrubop = new MOSDRepScrub(
4635 spg_t(info.pgid.pgid, replica.shard), version,
4636 get_osdmap_epoch(),
4637 get_last_peering_reset(),
4638 start, end, deep,
4639 allow_preemption,
4640 scrubber.priority,
4641 ops_blocked_by_scrub());
4642 // default priority, we want the rep scrub processed prior to any recovery
4643 // or client io messages (we are holding a lock!)
4644 osd->send_message_osd_cluster(
4645 replica.osd, repscrubop, get_osdmap_epoch());
4646 }
4647
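// Scrub reservation handshake: the primary sends MOSDScrubReserve REQUEST to
// every other shard in the acting set (scrub_reserve_replicas()); each replica
// replies GRANT or REJECT depending on whether it could take a remote scrub
// slot, and the primary either proceeds once every peer has granted or gives
// up on the first rejection and sends RELEASE (scrub_unreserve_replicas()).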
4648 void PG::handle_scrub_reserve_request(OpRequestRef op)
4649 {
4650 dout(7) << __func__ << " " << *op->get_req() << dendl;
4651 op->mark_started();
4652 if (scrubber.local_reserved) {
4653 dout(10) << __func__ << " ignoring reserve request: Already reserved"
4654 << dendl;
4655 return;
4656 }
4657 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
4658 osd->inc_scrubs_remote()) {
4659 scrubber.remote_reserved = true;
4660 } else {
4661 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
4662 scrubber.remote_reserved = false;
4663 }
4664 const MOSDScrubReserve *m =
4665 static_cast<const MOSDScrubReserve*>(op->get_req());
4666 Message *reply = new MOSDScrubReserve(
4667 spg_t(info.pgid.pgid, primary.shard),
4668 m->map_epoch,
4669 scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
4670 pg_whoami);
4671 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
4672 }
4673
4674 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
4675 {
4676 dout(7) << __func__ << " " << *op->get_req() << dendl;
4677 op->mark_started();
4678 if (!scrubber.local_reserved) {
4679 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4680 return;
4681 }
4682 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4683 dout(10) << " already had osd." << from << " reserved" << dendl;
4684 } else {
4685 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
4686 scrubber.reserved_peers.insert(from);
4687 sched_scrub();
4688 }
4689 }
4690
4691 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
4692 {
4693 dout(7) << __func__ << " " << *op->get_req() << dendl;
4694 op->mark_started();
4695 if (!scrubber.local_reserved) {
4696 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
4697 return;
4698 }
4699 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
4700 dout(10) << " already had osd." << from << " reserved" << dendl;
4701 } else {
4702 /* One decline stops this pg from being scheduled for scrubbing. */
4703 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
4704 scrubber.reserve_failed = true;
4705 sched_scrub();
4706 }
4707 }
4708
4709 void PG::handle_scrub_reserve_release(OpRequestRef op)
4710 {
4711 dout(7) << __func__ << " " << *op->get_req() << dendl;
4712 op->mark_started();
4713 clear_scrub_reserved();
4714 }
4715
4716 // We can zero primary_num_bytes with just an atomic store.
4717 // However, setting it above zero reserves space for backfill and requires
4718 // the OSDService::stat_lock, which protects all OSD usage
4719 void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
4720 ceph_assert(osd->stat_lock.is_locked_by_me());
4721 primary_num_bytes.store(primary);
4722 local_num_bytes.store(local);
4723 return;
4724 }
4725
4726 void PG::clear_reserved_num_bytes() {
4727 primary_num_bytes.store(0);
4728 local_num_bytes.store(0);
4729 return;
4730 }
4731
4732 void PG::reject_reservation()
4733 {
4734 clear_reserved_num_bytes();
4735 osd->send_message_osd_cluster(
4736 primary.osd,
4737 new MBackfillReserve(
4738 MBackfillReserve::REJECT_TOOFULL,
4739 spg_t(info.pgid.pgid, primary.shard),
4740 get_osdmap_epoch()),
4741 get_osdmap_epoch());
4742 }
4743
4744 void PG::schedule_backfill_retry(float delay)
4745 {
4746 std::lock_guard lock(osd->recovery_request_lock);
4747 osd->recovery_request_timer.add_event_after(
4748 delay,
4749 new QueuePeeringEvt<RequestBackfill>(
4750 this, get_osdmap_epoch(),
4751 RequestBackfill()));
4752 }
4753
4754 void PG::schedule_recovery_retry(float delay)
4755 {
4756 std::lock_guard lock(osd->recovery_request_lock);
4757 osd->recovery_request_timer.add_event_after(
4758 delay,
4759 new QueuePeeringEvt<DoRecovery>(
4760 this, get_osdmap_epoch(),
4761 DoRecovery()));
4762 }
4763
4764 void PG::clear_scrub_reserved()
4765 {
4766 scrubber.reserved_peers.clear();
4767 scrubber.reserve_failed = false;
4768
4769 if (scrubber.local_reserved) {
4770 scrubber.local_reserved = false;
4771 osd->dec_scrubs_local();
4772 }
4773 if (scrubber.remote_reserved) {
4774 scrubber.remote_reserved = false;
4775 osd->dec_scrubs_remote();
4776 }
4777 }
4778
4779 void PG::scrub_reserve_replicas()
4780 {
4781 ceph_assert(backfill_targets.empty());
4782 for (set<pg_shard_t>::iterator i = actingset.begin();
4783 i != actingset.end();
4784 ++i) {
4785 if (*i == pg_whoami) continue;
4786 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4787 osd->send_message_osd_cluster(
4788 i->osd,
4789 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4790 get_osdmap_epoch(),
4791 MOSDScrubReserve::REQUEST, pg_whoami),
4792 get_osdmap_epoch());
4793 }
4794 }
4795
4796 void PG::scrub_unreserve_replicas()
4797 {
4798 ceph_assert(backfill_targets.empty());
4799 for (set<pg_shard_t>::iterator i = actingset.begin();
4800 i != actingset.end();
4801 ++i) {
4802 if (*i == pg_whoami) continue;
4803 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4804 osd->send_message_osd_cluster(
4805 i->osd,
4806 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4807 get_osdmap_epoch(),
4808 MOSDScrubReserve::RELEASE, pg_whoami),
4809 get_osdmap_epoch());
4810 }
4811 }
4812
4813 void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
4814 {
4815 ObjectStore::Transaction t;
4816 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4817 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4818 i != rollback_obs.end();
4819 ++i) {
4820 if (i->generation < trimmed_to.version) {
4821 dout(10) << __func__ << " osd." << osd->whoami
4822 << " pg " << info.pgid
4823 << " found obsolete rollback obj "
4824 << *i << " generation < trimmed_to "
4825 << trimmed_to
4826 << "...repaired" << dendl;
4827 t.remove(coll, *i);
4828 }
4829 }
4830 if (!t.empty()) {
4831 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4832 << dendl;
4833 osd->store->queue_transaction(ch, std::move(t), NULL);
4834 }
4835 }
4836
4837 void PG::_scan_snaps(ScrubMap &smap)
4838 {
4839 hobject_t head;
4840 SnapSet snapset;
4841
4842 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4843 // that the caller is using clean_meta_map() and that it works properly.
4844 dout(20) << __func__ << " start" << dendl;
4845
4846 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4847 i != smap.objects.rend();
4848 ++i) {
4849 const hobject_t &hoid = i->first;
4850 ScrubMap::object &o = i->second;
4851
4852 dout(20) << __func__ << " " << hoid << dendl;
4853
4854 ceph_assert(!hoid.is_snapdir());
4855 if (hoid.is_head()) {
4856 // parse the SnapSet
4857 bufferlist bl;
4858 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4859 continue;
4860 }
4861 bl.push_back(o.attrs[SS_ATTR]);
4862 auto p = bl.cbegin();
4863 try {
4864 decode(snapset, p);
4865 } catch(...) {
4866 continue;
4867 }
4868 head = hoid.get_head();
4869 continue;
4870 }
4871 if (hoid.snap < CEPH_MAXSNAP) {
4872 // check and if necessary fix snap_mapper
4873 if (hoid.get_head() != head) {
4874 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4875 << dendl;
4876 continue;
4877 }
4878 set<snapid_t> obj_snaps;
4879 auto p = snapset.clone_snaps.find(hoid.snap);
4880 if (p == snapset.clone_snaps.end()) {
4881 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4882 << dendl;
4883 continue;
4884 }
4885 obj_snaps.insert(p->second.begin(), p->second.end());
4886 set<snapid_t> cur_snaps;
4887 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4888 if (r != 0 && r != -ENOENT) {
4889 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4890 ceph_abort();
4891 }
4892 if (r == -ENOENT || cur_snaps != obj_snaps) {
4893 ObjectStore::Transaction t;
4894 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4895 if (r == 0) {
4896 r = snap_mapper.remove_oid(hoid, &_t);
4897 if (r != 0) {
4898 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4899 << dendl;
4900 ceph_abort();
4901 }
4902 osd->clog->error() << "osd." << osd->whoami
4903 << " found snap mapper error on pg "
4904 << info.pgid
4905 << " oid " << hoid << " snaps in mapper: "
4906 << cur_snaps << ", oi: "
4907 << obj_snaps
4908 << "...repaired";
4909 } else {
4910 osd->clog->error() << "osd." << osd->whoami
4911 << " found snap mapper error on pg "
4912 << info.pgid
4913 << " oid " << hoid << " snaps missing in mapper"
4914 << ", should be: "
4915 << obj_snaps
4916 << " was " << cur_snaps << " r " << r
4917 << "...repaired";
4918 }
4919 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4920
4921 // wait for repair to apply to avoid confusing other bits of the system.
4922 {
4923 Cond my_cond;
4924 Mutex my_lock("PG::_scan_snaps my_lock");
4925 int r = 0;
4926 bool done;
4927 t.register_on_applied_sync(
4928 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4929 r = osd->store->queue_transaction(ch, std::move(t));
4930 if (r != 0) {
4931 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4932 << dendl;
4933 } else {
4934 my_lock.Lock();
4935 while (!done)
4936 my_cond.Wait(my_lock);
4937 my_lock.Unlock();
4938 }
4939 }
4940 }
4941 }
4942 }
4943 }
4944
4945 void PG::_repair_oinfo_oid(ScrubMap &smap)
4946 {
4947 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4948 i != smap.objects.rend();
4949 ++i) {
4950 const hobject_t &hoid = i->first;
4951 ScrubMap::object &o = i->second;
4952
4953 bufferlist bl;
4954 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4955 continue;
4956 }
4957 bl.push_back(o.attrs[OI_ATTR]);
4958 object_info_t oi;
4959 try {
4960 oi.decode(bl);
4961 } catch(...) {
4962 continue;
4963 }
4964 if (oi.soid != hoid) {
4965 ObjectStore::Transaction t;
4966 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4967 osd->clog->error() << "osd." << osd->whoami
4968 << " found object info error on pg "
4969 << info.pgid
4970 << " oid " << hoid << " oid in object info: "
4971 << oi.soid
4972 << "...repaired";
4973 // Fix object info
4974 oi.soid = hoid;
4975 bl.clear();
4976 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4977
4978 bufferptr bp(bl.c_str(), bl.length());
4979 o.attrs[OI_ATTR] = bp;
4980
4981 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4982 int r = osd->store->queue_transaction(ch, std::move(t));
4983 if (r != 0) {
4984 derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
4985 << dendl;
4986 }
4987 }
4988 }
4989 }
4990 int PG::build_scrub_map_chunk(
4991 ScrubMap &map,
4992 ScrubMapBuilder &pos,
4993 hobject_t start,
4994 hobject_t end,
4995 bool deep,
4996 ThreadPool::TPHandle &handle)
4997 {
4998 dout(10) << __func__ << " [" << start << "," << end << ") "
4999 << " pos " << pos
5000 << dendl;
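// This builder is re-entrant: the first pass lists the objects in
// [start, end) and returns -EINPROGRESS; subsequent passes scan objects via
// be_scan_list(), returning -EINPROGRESS whenever the backend wants to yield,
// and only once pos.done() do we fall through and finish the map.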
5001
5002 // start
5003 while (pos.empty()) {
5004 pos.deep = deep;
5005 map.valid_through = info.last_update;
5006
5007 // objects
5008 vector<ghobject_t> rollback_obs;
5009 pos.ret = get_pgbackend()->objects_list_range(
5010 start,
5011 end,
5012 &pos.ls,
5013 &rollback_obs);
5014 if (pos.ret < 0) {
5015 dout(5) << "objects_list_range error: " << pos.ret << dendl;
5016 return pos.ret;
5017 }
5018 if (pos.ls.empty()) {
5019 break;
5020 }
5021 _scan_rollback_obs(rollback_obs);
5022 pos.pos = 0;
5023 return -EINPROGRESS;
5024 }
5025
5026 // scan objects
5027 while (!pos.done()) {
5028 int r = get_pgbackend()->be_scan_list(map, pos);
5029 if (r == -EINPROGRESS) {
5030 return r;
5031 }
5032 }
5033
5034 // finish
5035 dout(20) << __func__ << " finishing" << dendl;
5036 ceph_assert(pos.done());
5037 _repair_oinfo_oid(map);
5038 if (!is_primary()) {
5039 ScrubMap for_meta_scrub;
5040 // In case we restarted a smaller chunk, clear old data
5041 scrubber.cleaned_meta_map.clear_from(scrubber.start);
5042 scrubber.cleaned_meta_map.insert(map);
5043 scrubber.clean_meta_map(for_meta_scrub);
5044 _scan_snaps(for_meta_scrub);
5045 }
5046
5047 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
5048 << dendl;
5049 return 0;
5050 }
5051
5052 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
5053 if (!store)
5054 return;
5055 struct OnComplete : Context {
5056 std::unique_ptr<Scrub::Store> store;
5057 explicit OnComplete(
5058 std::unique_ptr<Scrub::Store> &&store)
5059 : store(std::move(store)) {}
5060 void finish(int) override {}
5061 };
5062 store->cleanup(t);
5063 t->register_on_complete(new OnComplete(std::move(store)));
5064 ceph_assert(!store);
5065 }
5066
5067 void PG::repair_object(
5068 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
5069 pg_shard_t bad_peer)
5070 {
5071 list<pg_shard_t> op_shards;
5072 for (auto i : *ok_peers) {
5073 op_shards.push_back(i.second);
5074 }
5075 dout(10) << "repair_object " << soid << " bad_peer osd."
5076 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
5077 ScrubMap::object &po = ok_peers->back().first;
5078 eversion_t v;
5079 bufferlist bv;
5080 bv.push_back(po.attrs[OI_ATTR]);
5081 object_info_t oi;
5082 try {
5083 auto bliter = bv.cbegin();
5084 decode(oi, bliter);
5085 } catch (...) {
5086 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
5087 ceph_abort();
5088 }
5089 if (bad_peer != primary) {
5090 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
5091 } else {
5092 // We should only be scrubbing if the PG is clean.
5093 ceph_assert(waiting_for_unreadable_object.empty());
5094
5095 pg_log.missing_add(soid, oi.version, eversion_t());
5096
5097 pg_log.set_last_requested(0);
5098 dout(10) << __func__ << ": primary = " << primary << dendl;
5099 }
5100
5101 if (is_ec_pg() || bad_peer == primary) {
5102 // we'd better collect all shards for an EC pg, and prepare good peers as the
5103 // source of the pull in the case of a replicated pg.
5104 missing_loc.add_missing(soid, oi.version, eversion_t());
5105 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
5106 for (i = ok_peers->begin();
5107 i != ok_peers->end();
5108 ++i)
5109 missing_loc.add_location(soid, i->second);
5110 }
5111 }
5112
5113 /* replica_scrub
5114 *
5115 * Wait for last_update_applied to match msg->scrub_to as above. Wait
5116 * for pushes to complete in case of recent recovery. Build a single
5117 * scrubmap of objects that are in the range [msg->start, msg->end).
5118 */
5119 void PG::replica_scrub(
5120 OpRequestRef op,
5121 ThreadPool::TPHandle &handle)
5122 {
5123 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
5124 ceph_assert(!scrubber.active_rep_scrub);
5125 dout(7) << "replica_scrub" << dendl;
5126
5127 if (msg->map_epoch < info.history.same_interval_since) {
5128 dout(10) << "replica_scrub discarding old replica_scrub from "
5129 << msg->map_epoch << " < " << info.history.same_interval_since
5130 << dendl;
5131 return;
5132 }
5133
5134 ceph_assert(msg->chunky);
5135 if (active_pushes > 0) {
5136 dout(10) << "waiting for active pushes to finish" << dendl;
5137 scrubber.active_rep_scrub = op;
5138 return;
5139 }
5140
5141 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
5142 scrubber.replica_scrub_start = msg->min_epoch;
5143 scrubber.start = msg->start;
5144 scrubber.end = msg->end;
5145 scrubber.max_end = msg->end;
5146 scrubber.deep = msg->deep;
5147 scrubber.epoch_start = info.history.same_interval_since;
5148 if (msg->priority) {
5149 scrubber.priority = msg->priority;
5150 } else {
5151 scrubber.priority = get_scrub_priority();
5152 }
5153
5154 scrub_can_preempt = msg->allow_preemption;
5155 scrub_preempted = false;
5156 scrubber.replica_scrubmap_pos.reset();
5157
5158 requeue_scrub(msg->high_priority);
5159 }
5160
5161 /* Scrub:
5162 * PG_STATE_SCRUBBING is set when the scrub is queued
5163 *
5164 * scrub will be chunky if all OSDs in PG support chunky scrub
5165 * scrub will fail if OSDs are too old.
5166 */
5167 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
5168 {
5169 if (cct->_conf->osd_scrub_sleep > 0 &&
5170 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
5171 scrubber.state == PG::Scrubber::INACTIVE) &&
5172 scrubber.needs_sleep) {
5173 ceph_assert(!scrubber.sleeping);
5174 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
5175
5176 // Do an async sleep so we don't block the op queue
5177 OSDService *osds = osd;
5178 spg_t pgid = get_pgid();
5179 int state = scrubber.state;
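    // Once the sleep timer fires, the callback below re-takes the PG lock,
    // clears the sleep flags and requeues the scrub so it resumes from the
    // saved state.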
5180 auto scrub_requeue_callback =
5181 new FunctionContext([osds, pgid, state](int r) {
5182 PGRef pg = osds->osd->lookup_lock_pg(pgid);
5183 if (pg == nullptr) {
5184 lgeneric_dout(osds->osd->cct, 20)
5185 << "scrub_requeue_callback: Could not find "
5186 << "PG " << pgid << " can't complete scrub requeue after sleep"
5187 << dendl;
5188 return;
5189 }
5190 pg->scrubber.sleeping = false;
5191 pg->scrubber.needs_sleep = false;
5192 lgeneric_dout(pg->cct, 20)
5193 << "scrub_requeue_callback: slept for "
5194 << ceph_clock_now() - pg->scrubber.sleep_start
5195 << ", re-queuing scrub with state " << state << dendl;
5196 pg->scrub_queued = false;
5197 pg->requeue_scrub();
5198 pg->scrubber.sleep_start = utime_t();
5199 pg->unlock();
5200 });
5201 std::lock_guard l(osd->sleep_lock);
5202 osd->sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
5203 scrub_requeue_callback);
5204 scrubber.sleeping = true;
5205 scrubber.sleep_start = ceph_clock_now();
5206 return;
5207 }
5208 if (pg_has_reset_since(queued)) {
5209 return;
5210 }
5211 ceph_assert(scrub_queued);
5212 scrub_queued = false;
5213 scrubber.needs_sleep = true;
5214
5215 // for the replica
5216 if (!is_primary() &&
5217 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
5218 chunky_scrub(handle);
5219 return;
5220 }
5221
5222 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
5223 dout(10) << "scrub -- not primary or active or not clean" << dendl;
5224 state_clear(PG_STATE_SCRUBBING);
5225 state_clear(PG_STATE_REPAIR);
5226 state_clear(PG_STATE_DEEP_SCRUB);
5227 publish_stats_to_osd();
5228 return;
5229 }
5230
5231 if (!scrubber.active) {
5232 ceph_assert(backfill_targets.empty());
5233
5234 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
5235
5236 dout(10) << "starting a new chunky scrub" << dendl;
5237 }
5238
5239 chunky_scrub(handle);
5240 }
5241
5242 /*
5243 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
5244 * chunk.
5245 *
5246 * The object store is partitioned into chunks which end on hash boundaries. For
5247 * each chunk, the following logic is performed:
5248 *
5249 * (1) Block writes on the chunk
5250 * (2) Request maps from replicas
5251 * (3) Wait for pushes to be applied (after recovery)
5252 * (4) Wait for writes to flush on the chunk
5253 * (5) Wait for maps from replicas
5254 * (6) Compare / repair all scrub maps
5255 * (7) Wait for digest updates to apply
5256 *
5257 * This logic is encoded in the mostly linear state machine:
5258 *
5259 * +------------------+
5260 * _________v__________ |
5261 * | | |
5262 * | INACTIVE | |
5263 * |____________________| |
5264 * | |
5265 * | +----------+ |
5266 * _________v___v______ | |
5267 * | | | |
5268 * | NEW_CHUNK | | |
5269 * |____________________| | |
5270 * | | |
5271 * _________v__________ | |
5272 * | | | |
5273 * | WAIT_PUSHES | | |
5274 * |____________________| | |
5275 * | | |
5276 * _________v__________ | |
5277 * | | | |
5278 * | WAIT_LAST_UPDATE | | |
5279 * |____________________| | |
5280 * | | |
5281 * _________v__________ | |
5282 * | | | |
5283 * | BUILD_MAP | | |
5284 * |____________________| | |
5285 * | | |
5286 * _________v__________ | |
5287 * | | | |
5288 * | WAIT_REPLICAS | | |
5289 * |____________________| | |
5290 * | | |
5291 * _________v__________ | |
5292 * | | | |
5293 * | COMPARE_MAPS | | |
5294 * |____________________| | |
5295 * | | |
5296 * | | |
5297 * _________v__________ | |
5298 * | | | |
5299 * |WAIT_DIGEST_UPDATES | | |
5300 * |____________________| | |
5301 * | | | |
5302 * | +----------+ |
5303 * _________v__________ |
5304 * | | |
5305 * | FINISH | |
5306 * |____________________| |
5307 * | |
5308 * +------------------+
5309 *
5310  * The primary determines the last update affecting the chunk by walking the log.
5311  * If it sees a log entry pertaining to an object in the chunk, it tells the
5312  * replicas to wait until that update is applied before building a scrub map.
5313  * Both the primary and replicas will wait for any active pushes to be applied.
5314 *
5315 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
5316 *
5317 * scrubber.state encodes the current state of the scrub (refer to state diagram
5318 * for details).
5319 */
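// Note: the state diagram above omits the BUILD_MAP_DONE and BUILD_MAP_REPLICA
// states handled in the switch below; BUILD_MAP_DONE sits between BUILD_MAP and
// WAIT_REPLICAS, and BUILD_MAP_REPLICA is the replica-side map build entered via
// replica_scrub().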
5320 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
5321 {
5322 // check for map changes
5323 if (scrubber.is_chunky_scrub_active()) {
5324 if (scrubber.epoch_start != info.history.same_interval_since) {
5325 dout(10) << "scrub pg changed, aborting" << dendl;
5326 scrub_clear_state();
5327 scrub_unreserve_replicas();
5328 return;
5329 }
5330 }
5331
5332 bool done = false;
5333 int ret;
5334
5335 while (!done) {
5336 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
5337 << " [" << scrubber.start << "," << scrubber.end << ")"
5338 << " max_end " << scrubber.max_end << dendl;
5339
5340 switch (scrubber.state) {
5341 case PG::Scrubber::INACTIVE:
5342 dout(10) << "scrub start" << dendl;
5343 ceph_assert(is_primary());
5344
5345 publish_stats_to_osd();
5346 scrubber.epoch_start = info.history.same_interval_since;
5347 scrubber.active = true;
5348
5349 {
5350 ObjectStore::Transaction t;
5351 scrubber.cleanup_store(&t);
5352 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
5353 info.pgid, coll));
5354 osd->store->queue_transaction(ch, std::move(t), nullptr);
5355 }
5356
5357 // Don't include temporary objects when scrubbing
5358 scrubber.start = info.pgid.pgid.get_hobj_start();
5359 scrubber.state = PG::Scrubber::NEW_CHUNK;
5360
5361 {
5362 bool repair = state_test(PG_STATE_REPAIR);
5363 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5364 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5365 stringstream oss;
5366 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
5367 osd->clog->debug(oss);
5368 }
5369
5370 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5371 "osd_scrub_max_preemptions");
5372 scrubber.preempt_divisor = 1;
5373 break;
5374
5375 case PG::Scrubber::NEW_CHUNK:
5376 scrubber.primary_scrubmap = ScrubMap();
5377 scrubber.received_maps.clear();
5378
5379 // begin (possible) preemption window
5380 if (scrub_preempted) {
5381 scrubber.preempt_left--;
5382 scrubber.preempt_divisor *= 2;
5383 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
5384 << " left" << dendl;
5385 scrub_preempted = false;
5386 }
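      // preempt_divisor shrinks subsequent chunks: the min/max chunk sizes
      // computed below are divided by it, so each preemption roughly halves
      // the size of the next chunk.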
5387 scrub_can_preempt = scrubber.preempt_left > 0;
5388
5389 {
5390 /* get the start and end of our scrub chunk
5391 *
5392 * Our scrub chunk has an important restriction we're going to need to
5393 * respect. We can't let head be start or end.
5394 * Using a half-open interval means that if end == head,
5395 * we'd scrub/lock head and the clone right next to head in different
5396 * chunks which would allow us to miss clones created between
5397 * scrubbing that chunk and scrubbing the chunk including head.
5398 * This isn't true for any of the other clones since clones can
5399 * only be created "just to the left of" head. There is one exception
5400 * to this: promotion of clones which always happens to the left of the
5401 * left-most clone, but promote_object checks the scrubber in that
5402 * case, so it should be ok. Also, it's ok to "miss" clones at the
5403 * left end of the range if we are a tier because they may legitimately
5404 * not exist (see _scrub).
5405 */
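        /* Illustrative (hypothetical) example: if candidate_end landed exactly
         * on a head object H, the clone just to its left would be scrubbed in
         * this chunk while H itself fell into the next one; a clone created in
         * between those two scrub passes could then be missed.  The adjustments
         * below therefore move candidate_end so it never equals a head.
         */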
5406 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
5407 scrubber.preempt_divisor);
5408 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
5409 scrubber.preempt_divisor);
5410 hobject_t start = scrubber.start;
5411 hobject_t candidate_end;
5412 vector<hobject_t> objects;
5413 ret = get_pgbackend()->objects_list_partial(
5414 start,
5415 min,
5416 max,
5417 &objects,
5418 &candidate_end);
5419 ceph_assert(ret >= 0);
5420
5421 if (!objects.empty()) {
5422 hobject_t back = objects.back();
5423 while (candidate_end.is_head() &&
5424 candidate_end == back.get_head()) {
5425 candidate_end = back;
5426 objects.pop_back();
5427 if (objects.empty()) {
5428 ceph_assert(0 ==
5429 		  "Somehow we got more than 2 objects which "
5430 "have the same head but are not clones");
5431 }
5432 back = objects.back();
5433 }
5434 if (candidate_end.is_head()) {
5435 ceph_assert(candidate_end != back.get_head());
5436 candidate_end = candidate_end.get_object_boundary();
5437 }
5438 } else {
5439 ceph_assert(candidate_end.is_max());
5440 }
5441
5442 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
5443 // we'll be requeued by whatever made us unavailable for scrub
5444 dout(10) << __func__ << ": scrub blocked somewhere in range "
5445 << "[" << scrubber.start << ", " << candidate_end << ")"
5446 << dendl;
5447 done = true;
5448 break;
5449 }
5450 scrubber.end = candidate_end;
5451 if (scrubber.end > scrubber.max_end)
5452 scrubber.max_end = scrubber.end;
5453 }
5454
5455 // walk the log to find the latest update that affects our chunk
5456 scrubber.subset_last_update = eversion_t();
5457 for (auto p = projected_log.log.rbegin();
5458 p != projected_log.log.rend();
5459 ++p) {
5460 if (p->soid >= scrubber.start &&
5461 p->soid < scrubber.end) {
5462 scrubber.subset_last_update = p->version;
5463 break;
5464 }
5465 }
5466 if (scrubber.subset_last_update == eversion_t()) {
5467 for (list<pg_log_entry_t>::const_reverse_iterator p =
5468 pg_log.get_log().log.rbegin();
5469 p != pg_log.get_log().log.rend();
5470 ++p) {
5471 if (p->soid >= scrubber.start &&
5472 p->soid < scrubber.end) {
5473 scrubber.subset_last_update = p->version;
5474 break;
5475 }
5476 }
5477 }
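      // subset_last_update is now the newest log entry touching [start, end);
      // WAIT_LAST_UPDATE below blocks until it has been applied locally, and
      // the replicas are asked (via _request_scrub_map) to do the same before
      // building their maps.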
5478
5479 scrubber.state = PG::Scrubber::WAIT_PUSHES;
5480 break;
5481
5482 case PG::Scrubber::WAIT_PUSHES:
5483 if (active_pushes == 0) {
5484 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
5485 } else {
5486 dout(15) << "wait for pushes to apply" << dendl;
5487 done = true;
5488 }
5489 break;
5490
5491 case PG::Scrubber::WAIT_LAST_UPDATE:
5492 if (last_update_applied < scrubber.subset_last_update) {
5493 // will be requeued by op_applied
5494 dout(15) << "wait for EC read/modify/writes to queue" << dendl;
5495 done = true;
5496 break;
5497 }
5498
5499 // ask replicas to scan
5500 scrubber.waiting_on_whom.insert(pg_whoami);
5501
5502 // request maps from replicas
5503 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
5504 i != acting_recovery_backfill.end();
5505 ++i) {
5506 if (*i == pg_whoami) continue;
5507 _request_scrub_map(*i, scrubber.subset_last_update,
5508 scrubber.start, scrubber.end, scrubber.deep,
5509 scrubber.preempt_left > 0);
5510 scrubber.waiting_on_whom.insert(*i);
5511 }
5512 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
5513 << dendl;
5514
5515 scrubber.state = PG::Scrubber::BUILD_MAP;
5516 scrubber.primary_scrubmap_pos.reset();
5517 break;
5518
5519 case PG::Scrubber::BUILD_MAP:
5520 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5521
5522 // build my own scrub map
5523 if (scrub_preempted) {
5524 dout(10) << __func__ << " preempted" << dendl;
5525 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5526 break;
5527 }
5528 ret = build_scrub_map_chunk(
5529 scrubber.primary_scrubmap,
5530 scrubber.primary_scrubmap_pos,
5531 scrubber.start, scrubber.end,
5532 scrubber.deep,
5533 handle);
5534 if (ret == -EINPROGRESS) {
5535 requeue_scrub();
5536 done = true;
5537 break;
5538 }
5539 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
5540 break;
5541
5542 case PG::Scrubber::BUILD_MAP_DONE:
5543 if (scrubber.primary_scrubmap_pos.ret < 0) {
5544 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
5545 << ", aborting" << dendl;
5546 scrub_clear_state();
5547 scrub_unreserve_replicas();
5548 return;
5549 }
5550 dout(10) << __func__ << " waiting_on_whom was "
5551 << scrubber.waiting_on_whom << dendl;
5552 ceph_assert(scrubber.waiting_on_whom.count(pg_whoami));
5553 scrubber.waiting_on_whom.erase(pg_whoami);
5554
5555 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
5556 break;
5557
5558 case PG::Scrubber::WAIT_REPLICAS:
5559 if (!scrubber.waiting_on_whom.empty()) {
5560 // will be requeued by sub_op_scrub_map
5561 dout(10) << "wait for replicas to build scrub map" << dendl;
5562 done = true;
5563 break;
5564 }
5565 // end (possible) preemption window
5566 scrub_can_preempt = false;
5567 if (scrub_preempted) {
5568 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
5569 scrubber.state = PG::Scrubber::NEW_CHUNK;
5570 } else {
5571 scrubber.state = PG::Scrubber::COMPARE_MAPS;
5572 }
5573 break;
5574
5575 case PG::Scrubber::COMPARE_MAPS:
5576 ceph_assert(last_update_applied >= scrubber.subset_last_update);
5577 ceph_assert(scrubber.waiting_on_whom.empty());
5578
5579 scrub_compare_maps();
5580 scrubber.start = scrubber.end;
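      // advance the scrub window: the next NEW_CHUNK (if any) starts where
      // this chunk ended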
5581 scrubber.run_callbacks();
5582
5583 // requeue the writes from the chunk that just finished
5584 requeue_ops(waiting_for_scrub);
5585
5586 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
5587
5588 // fall-thru
5589
5590 case PG::Scrubber::WAIT_DIGEST_UPDATES:
5591 if (scrubber.num_digest_updates_pending) {
5592 dout(10) << __func__ << " waiting on "
5593 << scrubber.num_digest_updates_pending
5594 << " digest updates" << dendl;
5595 done = true;
5596 break;
5597 }
5598
5599 scrubber.preempt_left = cct->_conf.get_val<uint64_t>(
5600 "osd_scrub_max_preemptions");
5601 scrubber.preempt_divisor = 1;
5602
5603 if (!(scrubber.end.is_max())) {
5604 scrubber.state = PG::Scrubber::NEW_CHUNK;
5605 requeue_scrub();
5606 done = true;
5607 } else {
5608 scrubber.state = PG::Scrubber::FINISH;
5609 }
5610
5611 break;
5612
5613 case PG::Scrubber::FINISH:
5614 scrub_finish();
5615 scrubber.state = PG::Scrubber::INACTIVE;
5616 done = true;
5617
5618 if (!snap_trimq.empty()) {
5619 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
5620 snap_trimmer_scrub_complete();
5621 }
5622
5623 break;
5624
5625 case PG::Scrubber::BUILD_MAP_REPLICA:
5626 // build my own scrub map
5627 if (scrub_preempted) {
5628 dout(10) << __func__ << " preempted" << dendl;
5629 ret = 0;
5630 } else {
5631 ret = build_scrub_map_chunk(
5632 scrubber.replica_scrubmap,
5633 scrubber.replica_scrubmap_pos,
5634 scrubber.start, scrubber.end,
5635 scrubber.deep,
5636 handle);
5637 }
5638 if (ret == -EINPROGRESS) {
5639 requeue_scrub();
5640 done = true;
5641 break;
5642 }
5643 // reply
5644 {
5645 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
5646 spg_t(info.pgid.pgid, get_primary().shard),
5647 scrubber.replica_scrub_start,
5648 pg_whoami);
5649 reply->preempted = scrub_preempted;
5650 ::encode(scrubber.replica_scrubmap, reply->get_data());
5651 osd->send_message_osd_cluster(
5652 get_primary().osd, reply,
5653 scrubber.replica_scrub_start);
5654 }
5655 scrub_preempted = false;
5656 scrub_can_preempt = false;
5657 scrubber.state = PG::Scrubber::INACTIVE;
5658 scrubber.replica_scrubmap = ScrubMap();
5659 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
5660 scrubber.start = hobject_t();
5661 scrubber.end = hobject_t();
5662 scrubber.max_end = hobject_t();
5663 done = true;
5664 break;
5665
5666 default:
5667 ceph_abort();
5668 }
5669 }
5670 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
5671 << " [" << scrubber.start << "," << scrubber.end << ")"
5672 << " max_end " << scrubber.max_end << dendl;
5673 }
5674
5675 bool PG::write_blocked_by_scrub(const hobject_t& soid)
5676 {
5677 if (soid < scrubber.start || soid >= scrubber.end) {
5678 return false;
5679 }
5680 if (scrub_can_preempt) {
5681 if (!scrub_preempted) {
5682 dout(10) << __func__ << " " << soid << " preempted" << dendl;
5683 scrub_preempted = true;
5684 } else {
5685 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
5686 }
5687 return false;
5688 }
5689 return true;
5690 }
5691
5692 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
5693 {
5694 // does [start, end] intersect [scrubber.start, scrubber.max_end)
5695 return (start < scrubber.max_end &&
5696 end >= scrubber.start);
5697 }
5698
5699 void PG::scrub_clear_state(bool has_error)
5700 {
5701 ceph_assert(is_locked());
5702 state_clear(PG_STATE_SCRUBBING);
5703 if (!has_error)
5704 state_clear(PG_STATE_REPAIR);
5705 state_clear(PG_STATE_DEEP_SCRUB);
5706 publish_stats_to_osd();
5707
5708 // local -> nothing.
5709 if (scrubber.local_reserved) {
5710 osd->dec_scrubs_local();
5711 scrubber.local_reserved = false;
5712 scrubber.reserved_peers.clear();
5713 }
5714
5715 requeue_ops(waiting_for_scrub);
5716
5717 scrubber.reset();
5718
5719 // type-specific state clear
5720 _scrub_clear_state();
5721 }
5722
5723 void PG::scrub_compare_maps()
5724 {
5725 dout(10) << __func__ << " has maps, analyzing" << dendl;
5726
5727 // construct authoritative scrub map for type specific scrubbing
5728 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5729 map<hobject_t,
5730 pair<boost::optional<uint32_t>,
5731 boost::optional<uint32_t>>> missing_digest;
5732
5733 map<pg_shard_t, ScrubMap *> maps;
5734 maps[pg_whoami] = &scrubber.primary_scrubmap;
5735
5736 for (const auto& i : acting_recovery_backfill) {
5737 if (i == pg_whoami) continue;
5738 dout(2) << __func__ << " replica " << i << " has "
5739 << scrubber.received_maps[i].objects.size()
5740 << " items" << dendl;
5741 maps[i] = &scrubber.received_maps[i];
5742 }
5743
5744 set<hobject_t> master_set;
5745
5746 // Construct master set
5747 for (const auto map : maps) {
5748 for (const auto i : map.second->objects) {
5749 master_set.insert(i.first);
5750 }
5751 }
5752
5753 stringstream ss;
5754 get_pgbackend()->be_omap_checks(maps, master_set,
5755 scrubber.omap_stats, ss);
5756
5757 if (!ss.str().empty()) {
5758 osd->clog->warn(ss);
5759 }
5760
5761 if (acting.size() > 1) {
5762 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5763
5764 // Map from object with errors to good peer
5765 map<hobject_t, list<pg_shard_t>> authoritative;
5766
5767 dout(2) << __func__ << " osd." << acting[0] << " has "
5768 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5769
5770 ss.str("");
5771 ss.clear();
5772
5773 get_pgbackend()->be_compare_scrubmaps(
5774 maps,
5775 master_set,
5776 state_test(PG_STATE_REPAIR),
5777 scrubber.missing,
5778 scrubber.inconsistent,
5779 authoritative,
5780 missing_digest,
5781 scrubber.shallow_errors,
5782 scrubber.deep_errors,
5783 scrubber.store.get(),
5784 info.pgid, acting,
5785 ss);
5786 dout(2) << ss.str() << dendl;
5787
5788 if (!ss.str().empty()) {
5789 osd->clog->error(ss);
5790 }
5791
5792 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5793 i != authoritative.end();
5794 ++i) {
5795 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5796 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5797 j != i->second.end();
5798 ++j) {
5799 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5800 }
5801 scrubber.authoritative.insert(
5802 make_pair(
5803 i->first,
5804 good_peers));
5805 }
5806
5807 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5808 i != authoritative.end();
5809 ++i) {
5810 scrubber.cleaned_meta_map.objects.erase(i->first);
5811 scrubber.cleaned_meta_map.objects.insert(
5812 *(maps[i->second.back()]->objects.find(i->first))
5813 );
5814 }
5815 }
5816
5817 ScrubMap for_meta_scrub;
5818 scrubber.clean_meta_map(for_meta_scrub);
5819
5820 // ok, do the pg-type specific scrubbing
5821 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5822   // Called here on the primary; for_meta_scrub may contain authoritative replica copies for objects where the primary's copy was not authoritative
5823 _scan_snaps(for_meta_scrub);
5824 if (!scrubber.store->empty()) {
5825 if (state_test(PG_STATE_REPAIR)) {
5826 dout(10) << __func__ << ": discarding scrub results" << dendl;
5827 scrubber.store->flush(nullptr);
5828 } else {
5829 dout(10) << __func__ << ": updating scrub object" << dendl;
5830 ObjectStore::Transaction t;
5831 scrubber.store->flush(&t);
5832 osd->store->queue_transaction(ch, std::move(t), nullptr);
5833 }
5834 }
5835 }
5836
5837 bool PG::scrub_process_inconsistent()
5838 {
5839 dout(10) << __func__ << ": checking authoritative" << dendl;
5840 bool repair = state_test(PG_STATE_REPAIR);
5841 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5842 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5843
5844   // scrubber.authoritative only stores objects which are missing or inconsistent.
5845 if (!scrubber.authoritative.empty()) {
5846 stringstream ss;
5847 ss << info.pgid << " " << mode << " "
5848 << scrubber.missing.size() << " missing, "
5849 << scrubber.inconsistent.size() << " inconsistent objects";
5850 dout(2) << ss.str() << dendl;
5851 osd->clog->error(ss);
5852 if (repair) {
5853 state_clear(PG_STATE_CLEAN);
5854 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5855 scrubber.authoritative.begin();
5856 i != scrubber.authoritative.end();
5857 ++i) {
5858 set<pg_shard_t>::iterator j;
5859
5860 auto missing_entry = scrubber.missing.find(i->first);
5861 if (missing_entry != scrubber.missing.end()) {
5862 for (j = missing_entry->second.begin();
5863 j != missing_entry->second.end();
5864 ++j) {
5865 repair_object(
5866 i->first,
5867 &(i->second),
5868 *j);
5869 ++scrubber.fixed;
5870 }
5871 }
5872 if (scrubber.inconsistent.count(i->first)) {
5873 for (j = scrubber.inconsistent[i->first].begin();
5874 j != scrubber.inconsistent[i->first].end();
5875 ++j) {
5876 repair_object(i->first,
5877 &(i->second),
5878 *j);
5879 ++scrubber.fixed;
5880 }
5881 }
5882 }
5883 }
5884 }
5885 return (!scrubber.authoritative.empty() && repair);
5886 }
5887
5888 bool PG::ops_blocked_by_scrub() const {
5889 return (waiting_for_scrub.size() != 0);
5890 }
5891
5892 // the part that actually finalizes a scrub
5893 void PG::scrub_finish()
5894 {
5895 dout(20) << __func__ << dendl;
5896 bool repair = state_test(PG_STATE_REPAIR);
5897 bool do_auto_scrub = false;
5898   // if the repair request came from auto-repair and a large number of errors
5899   // were found, cancel the auto-repair
5900 if (repair && scrubber.auto_repair
5901 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5902 state_clear(PG_STATE_REPAIR);
5903 repair = false;
5904 }
5905 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5906 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5907
5908 // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
5909 if (scrubber.deep_scrub_on_error
5910 && scrubber.authoritative.size()
5911 && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
5912 ceph_assert(!deep_scrub);
5913 do_auto_scrub = true;
5914 dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
5915 }
5916 scrubber.deep_scrub_on_error = false;
5917
5918 // type-specific finish (can tally more errors)
5919 _scrub_finish();
5920
5921 bool has_error = scrub_process_inconsistent();
5922
5923 {
5924 stringstream oss;
5925 oss << info.pgid.pgid << " " << mode << " ";
5926 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5927 if (total_errors)
5928 oss << total_errors << " errors";
5929 else
5930 oss << "ok";
5931 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5932 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5933 << " remaining deep scrub error details lost)";
5934 if (repair)
5935 oss << ", " << scrubber.fixed << " fixed";
5936 if (total_errors)
5937 osd->clog->error(oss);
5938 else
5939 osd->clog->debug(oss);
5940 }
5941
5942 // finish up
5943 unreg_next_scrub();
5944 utime_t now = ceph_clock_now();
5945 info.history.last_scrub = info.last_update;
5946 info.history.last_scrub_stamp = now;
5947 if (scrubber.deep) {
5948 info.history.last_deep_scrub = info.last_update;
5949 info.history.last_deep_scrub_stamp = now;
5950 }
5951 // Since we don't know which errors were fixed, we can only clear them
5952 // when every one has been fixed.
5953 if (repair) {
5954 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5955 ceph_assert(deep_scrub);
5956 scrubber.shallow_errors = scrubber.deep_errors = 0;
5957 dout(20) << __func__ << " All may be fixed" << dendl;
5958 } else if (has_error) {
5959 // Deep scrub in order to get corrected error counts
5960 scrub_after_recovery = true;
5961 dout(20) << __func__ << " Set scrub_after_recovery" << dendl;
5962 } else if (scrubber.shallow_errors || scrubber.deep_errors) {
5963 // We have errors but nothing can be fixed, so there is no repair
5964 // possible.
5965 state_set(PG_STATE_FAILED_REPAIR);
5966 dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors)
5967 << " error(s) present with no repair possible" << dendl;
5968 }
5969 }
5970 if (deep_scrub) {
5971 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5972 info.history.last_clean_scrub_stamp = now;
5973 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5974 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5975 info.stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects;
5976 info.stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes;
5977 info.stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys;
5978 dout(25) << __func__ << " shard " << pg_whoami << " num_omap_bytes = "
5979 << info.stats.stats.sum.num_omap_bytes << " num_omap_keys = "
5980 << info.stats.stats.sum.num_omap_keys << dendl;
5981 } else {
5982 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5983 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5984 // because of deep-scrub errors
5985 if (scrubber.shallow_errors == 0)
5986 info.history.last_clean_scrub_stamp = now;
5987 }
5988 info.stats.stats.sum.num_scrub_errors =
5989 info.stats.stats.sum.num_shallow_scrub_errors +
5990 info.stats.stats.sum.num_deep_scrub_errors;
5991 if (scrubber.check_repair) {
5992 scrubber.check_repair = false;
5993 if (info.stats.stats.sum.num_scrub_errors) {
5994 state_set(PG_STATE_FAILED_REPAIR);
5995 dout(10) << __func__ << " " << info.stats.stats.sum.num_scrub_errors
5996 << " error(s) still present after re-scrub" << dendl;
5997 }
5998 }
5999 publish_stats_to_osd();
6000
6001 {
6002 ObjectStore::Transaction t;
6003 dirty_info = true;
6004 write_if_dirty(t);
6005 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
6006 ceph_assert(tr == 0);
6007 }
6008
6009
6010 if (has_error) {
6011 queue_peering_event(
6012 PGPeeringEventRef(
6013 std::make_shared<PGPeeringEvent>(
6014 get_osdmap_epoch(),
6015 get_osdmap_epoch(),
6016 DoRecovery())));
6017 }
6018
6019 scrub_clear_state(has_error);
6020 scrub_unreserve_replicas();
6021
6022 if (do_auto_scrub) {
6023 scrub_requested(false, false, true);
6024 } else {
6025 reg_next_scrub();
6026 }
6027
6028 if (is_active() && is_primary()) {
6029 share_pg_info();
6030 }
6031 }
6032
6033 void PG::share_pg_info()
6034 {
6035 dout(10) << "share_pg_info" << dendl;
6036
6037 // share new pg_info_t with replicas
6038 ceph_assert(!acting_recovery_backfill.empty());
6039 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
6040 i != acting_recovery_backfill.end();
6041 ++i) {
6042 if (*i == pg_whoami) continue;
6043 auto pg_shard = *i;
6044 auto peer = peer_info.find(pg_shard);
6045 if (peer != peer_info.end()) {
6046 peer->second.last_epoch_started = info.last_epoch_started;
6047 peer->second.last_interval_started = info.last_interval_started;
6048 peer->second.history.merge(info.history);
6049 }
6050 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap_epoch());
6051 m->pg_list.push_back(
6052 make_pair(
6053 pg_notify_t(
6054 pg_shard.shard, pg_whoami.shard,
6055 get_osdmap_epoch(),
6056 get_osdmap_epoch(),
6057 info),
6058 past_intervals));
6059 osd->send_message_osd_cluster(pg_shard.osd, m, get_osdmap_epoch());
6060 }
6061 }
6062
6063 bool PG::append_log_entries_update_missing(
6064 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6065 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
6066 boost::optional<eversion_t> roll_forward_to)
6067 {
6068 ceph_assert(!entries.empty());
6069 ceph_assert(entries.begin()->version > info.last_update);
6070
6071 PGLogEntryHandler rollbacker{this, &t};
6072 bool invalidate_stats =
6073 pg_log.append_new_log_entries(info.last_backfill,
6074 info.last_backfill_bitwise,
6075 entries,
6076 &rollbacker);
6077
6078 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
6079 pg_log.roll_forward(&rollbacker);
6080 }
6081 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
6082 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
6083 last_rollback_info_trimmed_to_applied = *roll_forward_to;
6084 }
6085
6086 info.last_update = pg_log.get_head();
6087
6088 if (pg_log.get_missing().num_missing() == 0) {
6089 // advance last_complete since nothing else is missing!
6090 info.last_complete = info.last_update;
6091 }
6092 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
6093
6094 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
6095 if (trim_to)
6096 pg_log.trim(*trim_to, info);
6097 dirty_info = true;
6098 write_if_dirty(t);
6099 return invalidate_stats;
6100 }
6101
6102
6103 void PG::merge_new_log_entries(
6104 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
6105 ObjectStore::Transaction &t,
6106 boost::optional<eversion_t> trim_to,
6107 boost::optional<eversion_t> roll_forward_to)
6108 {
6109 dout(10) << __func__ << " " << entries << dendl;
6110 ceph_assert(is_primary());
6111
6112 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
6113 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
6114 i != acting_recovery_backfill.end();
6115 ++i) {
6116 pg_shard_t peer(*i);
6117 if (peer == pg_whoami) continue;
6118 ceph_assert(peer_missing.count(peer));
6119 ceph_assert(peer_info.count(peer));
6120 pg_missing_t& pmissing(peer_missing[peer]);
6121 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
6122 pg_info_t& pinfo(peer_info[peer]);
6123 bool invalidate_stats = PGLog::append_log_entries_update_missing(
6124 pinfo.last_backfill,
6125 info.last_backfill_bitwise,
6126 entries,
6127 true,
6128 NULL,
6129 pmissing,
6130 NULL,
6131 this);
6132 pinfo.last_update = info.last_update;
6133 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
6134 rebuild_missing = rebuild_missing || invalidate_stats;
6135 }
6136
6137 if (!rebuild_missing) {
6138 return;
6139 }
6140
6141 for (auto &&i: entries) {
6142 missing_loc.rebuild(
6143 i.soid,
6144 pg_whoami,
6145 acting_recovery_backfill,
6146 info,
6147 pg_log.get_missing(),
6148 peer_missing,
6149 peer_info);
6150 }
6151 }
6152
6153 void PG::update_history(const pg_history_t& new_history)
6154 {
6155 if (info.history.merge(new_history)) {
6156 dout(20) << __func__ << " advanced history from " << new_history << dendl;
6157 dirty_info = true;
6158 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
6159 dout(20) << __func__ << " clearing past_intervals" << dendl;
6160 past_intervals.clear();
6161 dirty_big_info = true;
6162 }
6163 }
6164 on_info_history_change();
6165 }
6166
6167 void PG::fulfill_info(
6168 pg_shard_t from, const pg_query_t &query,
6169 pair<pg_shard_t, pg_info_t> &notify_info)
6170 {
6171 ceph_assert(from == primary);
6172 ceph_assert(query.type == pg_query_t::INFO);
6173
6174 // info
6175 dout(10) << "sending info" << dendl;
6176 notify_info = make_pair(from, info);
6177 }
6178
6179 void PG::fulfill_log(
6180 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
6181 {
6182 dout(10) << "log request from " << from << dendl;
6183 ceph_assert(from == primary);
6184 ceph_assert(query.type != pg_query_t::INFO);
6185 ConnectionRef con = osd->get_con_osd_cluster(
6186 from.osd, get_osdmap_epoch());
6187 if (!con) return;
6188
6189 MOSDPGLog *mlog = new MOSDPGLog(
6190 from.shard, pg_whoami.shard,
6191 get_osdmap_epoch(),
6192 info, query_epoch);
6193 mlog->missing = pg_log.get_missing();
6194
6195 // primary -> other, when building master log
6196 if (query.type == pg_query_t::LOG) {
6197 dout(10) << " sending info+missing+log since " << query.since
6198 << dendl;
6199 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
6200 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
6201 << " when my log.tail is " << pg_log.get_tail()
6202 << ", sending full log instead";
6203 mlog->log = pg_log.get_log(); // primary should not have requested this!!
6204 } else
6205 mlog->log.copy_after(cct, pg_log.get_log(), query.since);
6206 }
6207 else if (query.type == pg_query_t::FULLLOG) {
6208 dout(10) << " sending info+missing+full log" << dendl;
6209 mlog->log = pg_log.get_log();
6210 }
6211
6212 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
6213
6214 osd->share_map_peer(from.osd, con.get(), get_osdmap());
6215 osd->send_message_osd_cluster(mlog, con.get());
6216 }
6217
6218 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
6219 {
6220 if (query.query.type == pg_query_t::INFO) {
6221 pair<pg_shard_t, pg_info_t> notify_info;
6222 update_history(query.query.history);
6223 fulfill_info(query.from, query.query, notify_info);
6224 rctx->send_notify(
6225 notify_info.first,
6226 pg_notify_t(
6227 notify_info.first.shard, pg_whoami.shard,
6228 query.query_epoch,
6229 get_osdmap_epoch(),
6230 notify_info.second),
6231 past_intervals);
6232 } else {
6233 update_history(query.query.history);
6234 fulfill_log(query.from, query.query, query.query_epoch);
6235 }
6236 }
6237
6238 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
6239 {
6240 bool changed = false;
6241 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
6242 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
6243 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
6244 changed = true;
6245 }
6246 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6247 if (!pi) {
6248 return; // pool deleted
6249 }
6250 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
6251 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
6252 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
6253 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
6254 changed = true;
6255 }
6256 }
6257 if (changed) {
6258 info.history.last_epoch_marked_full = osdmap->get_epoch();
6259 dirty_info = true;
6260 }
6261 }
6262
6263 bool PG::should_restart_peering(
6264 int newupprimary,
6265 int newactingprimary,
6266 const vector<int>& newup,
6267 const vector<int>& newacting,
6268 OSDMapRef lastmap,
6269 OSDMapRef osdmap)
6270 {
6271 if (PastIntervals::is_new_interval(
6272 primary.osd,
6273 newactingprimary,
6274 acting,
6275 newacting,
6276 up_primary.osd,
6277 newupprimary,
6278 up,
6279 newup,
6280 osdmap,
6281 lastmap,
6282 info.pgid.pgid)) {
6283 dout(20) << "new interval newup " << newup
6284 << " newacting " << newacting << dendl;
6285 return true;
6286 }
6287 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
6288 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
6289 return true;
6290 }
6291 return false;
6292 }
6293
6294 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
6295 {
6296 if (last_peering_reset > reply_epoch ||
6297 last_peering_reset > query_epoch) {
6298 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
6299 << " last_peering_reset " << last_peering_reset
6300 << dendl;
6301 return true;
6302 }
6303 return false;
6304 }
6305
6306 void PG::set_last_peering_reset()
6307 {
6308 dout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
6309 if (last_peering_reset != get_osdmap_epoch()) {
6310 last_peering_reset = get_osdmap_epoch();
6311 reset_interval_flush();
6312 }
6313 }
6314
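// FlushState's destructor fires when the last reference is released; start_flush()
// below hands one reference to an on_applied context and one to an on_commit
// context, so on_flushed() runs only once the flush transaction has both applied
// and committed (and only if the PG has not reset since).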
6315 struct FlushState {
6316 PGRef pg;
6317 epoch_t epoch;
6318 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
6319 ~FlushState() {
6320 pg->lock();
6321 if (!pg->pg_has_reset_since(epoch))
6322 pg->on_flushed();
6323 pg->unlock();
6324 }
6325 };
6326 typedef std::shared_ptr<FlushState> FlushStateRef;
6327
6328 void PG::start_flush(ObjectStore::Transaction *t)
6329 {
6330 // flush in progress ops
6331 FlushStateRef flush_trigger (std::make_shared<FlushState>(
6332 this, get_osdmap_epoch()));
6333 flushes_in_progress++;
6334 t->register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
6335 t->register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
6336 }
6337
6338 void PG::reset_interval_flush()
6339 {
6340 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
6341 recovery_state.clear_blocked_outgoing();
6342
6343 Context *c = new QueuePeeringEvt<IntervalFlush>(
6344 this, get_osdmap_epoch(), IntervalFlush());
6345 if (!ch->flush_commit(c)) {
6346 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
6347 recovery_state.begin_block_outgoing();
6348 } else {
6349 dout(10) << "Not blocking outgoing recovery messages" << dendl;
6350 delete c;
6351 }
6352 }
6353
6354 /* Called before initializing peering during advance_map */
6355 void PG::start_peering_interval(
6356 const OSDMapRef lastmap,
6357 const vector<int>& newup, int new_up_primary,
6358 const vector<int>& newacting, int new_acting_primary,
6359 ObjectStore::Transaction *t)
6360 {
6361 const OSDMapRef osdmap = get_osdmap();
6362
6363 set_last_peering_reset();
6364
6365 vector<int> oldacting, oldup;
6366 int oldrole = get_role();
6367
6368 if (is_primary()) {
6369 osd->clear_ready_to_merge(this);
6370 }
6371
6372 pg_shard_t old_acting_primary = get_primary();
6373 pg_shard_t old_up_primary = up_primary;
6374 bool was_old_primary = is_primary();
6375 bool was_old_replica = is_replica();
6376
6377 acting.swap(oldacting);
6378 up.swap(oldup);
6379 init_primary_up_acting(
6380 newup,
6381 newacting,
6382 new_up_primary,
6383 new_acting_primary);
6384
6385 if (info.stats.up != up ||
6386 info.stats.acting != acting ||
6387 info.stats.up_primary != new_up_primary ||
6388 info.stats.acting_primary != new_acting_primary) {
6389 info.stats.up = up;
6390 info.stats.up_primary = new_up_primary;
6391 info.stats.acting = acting;
6392 info.stats.acting_primary = new_acting_primary;
6393 info.stats.mapping_epoch = osdmap->get_epoch();
6394 }
6395
6396 pg_stats_publish_lock.Lock();
6397 pg_stats_publish_valid = false;
6398 pg_stats_publish_lock.Unlock();
6399
6400   // This will now be remapped during a backfill in cases
6401   // where it would not have been before.
6402 if (up != acting)
6403 state_set(PG_STATE_REMAPPED);
6404 else
6405 state_clear(PG_STATE_REMAPPED);
6406
6407 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
6408 if (pool.info.is_replicated() || role == pg_whoami.shard)
6409 set_role(role);
6410 else
6411 set_role(-1);
6412
6413 // did acting, up, primary|acker change?
6414 if (!lastmap) {
6415 dout(10) << " no lastmap" << dendl;
6416 dirty_info = true;
6417 dirty_big_info = true;
6418 info.history.same_interval_since = osdmap->get_epoch();
6419 } else {
6420 std::stringstream debug;
6421 ceph_assert(info.history.same_interval_since != 0);
6422 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
6423 get_is_recoverable_predicate());
6424 bool new_interval = PastIntervals::check_new_interval(
6425 old_acting_primary.osd,
6426 new_acting_primary,
6427 oldacting, newacting,
6428 old_up_primary.osd,
6429 new_up_primary,
6430 oldup, newup,
6431 info.history.same_interval_since,
6432 info.history.last_epoch_clean,
6433 osdmap,
6434 lastmap,
6435 info.pgid.pgid,
6436 recoverable.get(),
6437 &past_intervals,
6438 &debug);
6439 dout(10) << __func__ << ": check_new_interval output: "
6440 << debug.str() << dendl;
6441 if (new_interval) {
6442 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
6443 info.history.last_epoch_clean < osdmap->get_epoch()) {
6444 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
6445         // our information is incomplete and useless; if osdmaps were trimmed,
6446         // someone else was clean after everything we know about.
6447 past_intervals.clear();
6448 } else {
6449 dout(10) << " noting past " << past_intervals << dendl;
6450 }
6451 dirty_info = true;
6452 dirty_big_info = true;
6453 info.history.same_interval_since = osdmap->get_epoch();
6454 if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
6455 info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
6456 osdmap->get_pg_num(info.pgid.pgid.pool()),
6457 nullptr)) {
6458 info.history.last_epoch_split = osdmap->get_epoch();
6459 }
6460 }
6461 }
6462
6463 if (old_up_primary != up_primary ||
6464 oldup != up) {
6465 info.history.same_up_since = osdmap->get_epoch();
6466 }
6467 // this comparison includes primary rank via pg_shard_t
6468 if (old_acting_primary != get_primary()) {
6469 info.history.same_primary_since = osdmap->get_epoch();
6470 }
6471
6472 on_new_interval();
6473
6474 dout(1) << __func__ << " up " << oldup << " -> " << up
6475 << ", acting " << oldacting << " -> " << acting
6476 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
6477 << ", up_primary " << old_up_primary << " -> " << new_up_primary
6478 << ", role " << oldrole << " -> " << role
6479 << ", features acting " << acting_features
6480 << " upacting " << upacting_features
6481 << dendl;
6482
6483 // deactivate.
6484 state_clear(PG_STATE_ACTIVE);
6485 state_clear(PG_STATE_PEERED);
6486 state_clear(PG_STATE_PREMERGE);
6487 state_clear(PG_STATE_DOWN);
6488 state_clear(PG_STATE_RECOVERY_WAIT);
6489 state_clear(PG_STATE_RECOVERY_TOOFULL);
6490 state_clear(PG_STATE_RECOVERING);
6491
6492 peer_purged.clear();
6493 acting_recovery_backfill.clear();
6494 scrub_queued = false;
6495
6496 // reset primary/replica state?
6497 if (was_old_primary || is_primary()) {
6498 osd->remove_want_pg_temp(info.pgid.pgid);
6499 } else if (was_old_replica || is_replica()) {
6500 osd->remove_want_pg_temp(info.pgid.pgid);
6501 }
6502 clear_primary_state();
6503
6504
6505 // pg->on_*
6506 on_change(t);
6507
6508 projected_last_update = eversion_t();
6509
6510 ceph_assert(!deleting);
6511
6512 // should we tell the primary we are here?
6513 send_notify = !is_primary();
6514
6515 if (role != oldrole ||
6516 was_old_primary != is_primary()) {
6517 // did primary change?
6518 if (was_old_primary != is_primary()) {
6519 state_clear(PG_STATE_CLEAN);
6520 clear_publish_stats();
6521 }
6522
6523 on_role_change();
6524
6525 // take active waiters
6526 requeue_ops(waiting_for_peered);
6527
6528 } else {
6529 // no role change.
6530 // did primary change?
6531 if (get_primary() != old_acting_primary) {
6532 dout(10) << *this << " " << oldacting << " -> " << acting
6533 << ", acting primary "
6534 << old_acting_primary << " -> " << get_primary()
6535 << dendl;
6536 } else {
6537 // primary is the same.
6538 if (is_primary()) {
6539 // i am (still) primary. but my replica set changed.
6540 state_clear(PG_STATE_CLEAN);
6541
6542 dout(10) << oldacting << " -> " << acting
6543 << ", replicas changed" << dendl;
6544 }
6545 }
6546 }
6547 cancel_recovery();
6548
6549 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
6550 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
6551 osd->queue_want_pg_temp(info.pgid.pgid, acting);
6552 }
6553 }
6554
6555 void PG::on_new_interval()
6556 {
6557 const OSDMapRef osdmap = get_osdmap();
6558
6559 on_info_history_change();
6560
6561 // initialize features
6562 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6563 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
6564 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
6565 if (*p == CRUSH_ITEM_NONE)
6566 continue;
6567 uint64_t f = osdmap->get_xinfo(*p).features;
6568 acting_features &= f;
6569 upacting_features &= f;
6570 }
6571 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
6572 if (*p == CRUSH_ITEM_NONE)
6573 continue;
6574 upacting_features &= osdmap->get_xinfo(*p).features;
6575 }
6576
6577 _on_new_interval();
6578 }
6579
6580 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
6581 {
6582 ceph_assert(!is_primary());
6583
6584 update_history(oinfo.history);
6585 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
6586 info.stats.stats.sum.num_scrub_errors = 0;
6587 info.stats.stats.sum.num_shallow_scrub_errors = 0;
6588 info.stats.stats.sum.num_deep_scrub_errors = 0;
6589 dirty_info = true;
6590 }
6591
6592 if (!(info.purged_snaps == oinfo.purged_snaps)) {
6593 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
6594 << dendl;
6595 info.purged_snaps = oinfo.purged_snaps;
6596 dirty_info = true;
6597 dirty_big_info = true;
6598 }
6599 }
6600
6601 ostream& operator<<(ostream& out, const PG& pg)
6602 {
6603 out << "pg[" << pg.info
6604 << " " << pg.up;
6605 if (pg.acting != pg.up)
6606 out << "/" << pg.acting;
6607 if (pg.is_ec_pg())
6608 out << "p" << pg.get_primary();
6609 if (!pg.async_recovery_targets.empty())
6610 out << " async=[" << pg.async_recovery_targets << "]";
6611 if (!pg.backfill_targets.empty())
6612 out << " backfill=[" << pg.backfill_targets << "]";
6613 out << " r=" << pg.get_role();
6614 out << " lpr=" << pg.get_last_peering_reset();
6615
6616 if (pg.deleting)
6617 out << " DELETING";
6618
6619 if (!pg.past_intervals.empty()) {
6620 out << " pi=[" << pg.past_intervals.get_bounds()
6621 << ")/" << pg.past_intervals.size();
6622 }
6623
6624 if (pg.is_peered()) {
6625 if (pg.last_update_ondisk != pg.info.last_update)
6626 out << " luod=" << pg.last_update_ondisk;
6627 if (pg.last_update_applied != pg.info.last_update)
6628 out << " lua=" << pg.last_update_applied;
6629 }
6630
6631 if (pg.recovery_ops_active)
6632 out << " rops=" << pg.recovery_ops_active;
6633
6634 if (pg.pg_log.get_tail() != pg.info.log_tail ||
6635 pg.pg_log.get_head() != pg.info.last_update)
6636 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
6637
6638 if (!pg.pg_log.get_log().empty()) {
6639 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
6640 out << " (log bound mismatch, actual=["
6641 << pg.pg_log.get_log().log.begin()->version << ","
6642 << pg.pg_log.get_log().log.rbegin()->version << "]";
6643 out << ")";
6644 }
6645 }
6646
6647 out << " crt=" << pg.pg_log.get_can_rollback_to();
6648
6649 if (pg.last_complete_ondisk != pg.info.last_complete)
6650 out << " lcod " << pg.last_complete_ondisk;
6651
6652 if (pg.is_primary()) {
6653 out << " mlcod " << pg.min_last_complete_ondisk;
6654 }
6655
6656 out << " " << pg_state_string(pg.get_state());
6657 if (pg.should_send_notify())
6658 out << " NOTIFY";
6659
6660 if (pg.scrubber.must_repair)
6661 out << " MUST_REPAIR";
6662 if (pg.scrubber.auto_repair)
6663 out << " AUTO_REPAIR";
6664 if (pg.scrubber.check_repair)
6665 out << " CHECK_REPAIR";
6666 if (pg.scrubber.deep_scrub_on_error)
6667 out << " DEEP_SCRUB_ON_ERROR";
6668 if (pg.scrubber.must_deep_scrub)
6669 out << " MUST_DEEP_SCRUB";
6670 if (pg.scrubber.must_scrub)
6671 out << " MUST_SCRUB";
6672 if (pg.scrubber.time_for_deep)
6673 out << " TIME_FOR_DEEP";
6674 if (pg.scrubber.need_auto)
6675 out << " NEED_AUTO";
6676
6677 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
6678 if (pg.pg_log.get_missing().num_missing()) {
6679 out << " m=" << pg.pg_log.get_missing().num_missing();
6680 if (pg.is_primary()) {
6681 uint64_t unfound = pg.get_num_unfound();
6682 if (unfound)
6683 out << " u=" << unfound;
6684 }
6685 }
6686 if (!pg.is_clean()) {
6687 out << " mbc=" << pg.missing_loc.get_missing_by_count();
6688 }
6689 if (!pg.snap_trimq.empty()) {
6690 out << " trimq=";
6691 // only show a count if the set is large
6692 if (pg.snap_trimq.num_intervals() > 16) {
6693 out << pg.snap_trimq.size();
6694 } else {
6695 out << pg.snap_trimq;
6696 }
6697 }
6698 if (!pg.info.purged_snaps.empty()) {
6699 out << " ps="; // snap trim queue / purged snaps
6700 if (pg.info.purged_snaps.num_intervals() > 16) {
6701 out << pg.info.purged_snaps.size();
6702 } else {
6703 out << pg.info.purged_snaps;
6704 }
6705 }
6706
6707 out << "]";
6708
6709
6710 return out;
6711 }
6712
6713 bool PG::can_discard_op(OpRequestRef& op)
6714 {
6715 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
6716 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
6717 dout(20) << " discard " << *m << dendl;
6718 return true;
6719 }
6720
6721 if (m->get_map_epoch() < info.history.same_primary_since) {
6722 dout(7) << " changed after " << m->get_map_epoch()
6723 << ", dropping " << *m << dendl;
6724 return true;
6725 }
6726
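  // The remaining checks drop ops that the client is expected to resend on its
  // own, gated on which resend-related features its connection advertises.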
6727 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
6728 // >= luminous client
6729 if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
6730 // >= nautilus client
6731 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
6732 dout(7) << __func__ << " sent before last_force_op_resend "
6733 << pool.info.last_force_op_resend
6734 << ", dropping" << *m << dendl;
6735 return true;
6736 }
6737 } else {
6738       // >= luminous but < nautilus client (luminous or mimic)
6739 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
6740 dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
6741 << pool.info.last_force_op_resend_prenautilus
6742 << ", dropping" << *m << dendl;
6743 return true;
6744 }
6745 }
6746 if (m->get_map_epoch() < info.history.last_epoch_split) {
6747 dout(7) << __func__ << " pg split in "
6748 << info.history.last_epoch_split << ", dropping" << dendl;
6749 return true;
6750 }
6751 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
6752 // < luminous client
6753 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
6754 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
6755 << pool.info.last_force_op_resend_preluminous
6756 << ", dropping" << *m << dendl;
6757 return true;
6758 }
6759 }
6760
6761 return false;
6762 }
6763
6764 template<typename T, int MSGTYPE>
6765 bool PG::can_discard_replica_op(OpRequestRef& op)
6766 {
6767 const T *m = static_cast<const T *>(op->get_req());
6768 ceph_assert(m->get_type() == MSGTYPE);
6769
6770 int from = m->get_source().num();
6771
6772   // if a repop reply arrives after a replica goes down in a new osdmap, and
6773   // before the pg advances to this new osdmap, replies to repops sent before
6774   // this one may have been dropped by that replica OSD, because the primary
6775   // resets the connection to it when handling the new osdmap marking it down,
6776   // and also resets the messenger session when the replica reconnects. to avoid
6777   // such out-of-order replies, the messages from that replica should be discarded.
6778 OSDMapRef next_map = osd->get_next_osdmap();
6779 if (next_map->is_down(from))
6780 return true;
6781 /* Mostly, this overlaps with the old_peering_msg
6782 * condition. An important exception is pushes
6783 * sent by replicas not in the acting set, since
6784 * if such a replica goes down it does not cause
6785 * a new interval. */
6786 if (next_map->get_down_at(from) >= m->map_epoch)
6787 return true;
6788
6789 // same pg?
6790 // if pg changes _at all_, we reset and repeer!
6791 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
6792 dout(10) << "can_discard_replica_op pg changed " << info.history
6793 << " after " << m->map_epoch
6794 << ", dropping" << dendl;
6795 return true;
6796 }
6797 return false;
6798 }
6799
6800 bool PG::can_discard_scan(OpRequestRef op)
6801 {
6802 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6803 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
6804
6805 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6806 dout(10) << " got old scan, ignoring" << dendl;
6807 return true;
6808 }
6809 return false;
6810 }
6811
6812 bool PG::can_discard_backfill(OpRequestRef op)
6813 {
6814 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6815 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6816
6817 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6818 dout(10) << " got old backfill, ignoring" << dendl;
6819 return true;
6820 }
6821
6822 return false;
6823
6824 }
6825
6826 bool PG::can_discard_request(OpRequestRef& op)
6827 {
6828 switch (op->get_req()->get_type()) {
6829 case CEPH_MSG_OSD_OP:
6830 return can_discard_op(op);
6831 case CEPH_MSG_OSD_BACKOFF:
6832 return false; // never discard
6833 case MSG_OSD_REPOP:
6834 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6835 case MSG_OSD_PG_PUSH:
6836 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6837 case MSG_OSD_PG_PULL:
6838 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6839 case MSG_OSD_PG_PUSH_REPLY:
6840 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6841 case MSG_OSD_REPOPREPLY:
6842 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6843 case MSG_OSD_PG_RECOVERY_DELETE:
6844 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6845
6846 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6847 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6848
6849 case MSG_OSD_EC_WRITE:
6850 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6851 case MSG_OSD_EC_WRITE_REPLY:
6852 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6853 case MSG_OSD_EC_READ:
6854 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6855 case MSG_OSD_EC_READ_REPLY:
6856 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6857 case MSG_OSD_REP_SCRUB:
6858 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6859 case MSG_OSD_SCRUB_RESERVE:
6860 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6861 case MSG_OSD_REP_SCRUBMAP:
6862 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6863 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6864 return can_discard_replica_op<
6865 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6866 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6867 return can_discard_replica_op<
6868 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6869
6870 case MSG_OSD_PG_SCAN:
6871 return can_discard_scan(op);
6872 case MSG_OSD_PG_BACKFILL:
6873 return can_discard_backfill(op);
6874 case MSG_OSD_PG_BACKFILL_REMOVE:
6875 return can_discard_replica_op<MOSDPGBackfillRemove,
6876 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6877 }
6878 return true;
6879 }
6880
6881 void PG::take_waiters()
6882 {
6883 dout(10) << "take_waiters" << dendl;
6884 requeue_map_waiters();
6885 }
6886
6887 void PG::do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rctx)
6888 {
6889 dout(10) << __func__ << ": " << evt->get_desc() << dendl;
6890 ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
6891 if (old_peering_evt(evt)) {
6892 dout(10) << "discard old " << evt->get_desc() << dendl;
6893 } else {
6894 recovery_state.handle_event(evt, rctx);
6895 }
6896 // write_if_dirty regardless of path above to ensure we capture any work
6897 // done by OSD::advance_pg().
6898 write_if_dirty(*rctx->transaction);
6899 }
6900
6901 void PG::queue_peering_event(PGPeeringEventRef evt)
6902 {
6903 if (old_peering_evt(evt))
6904 return;
6905 osd->osd->enqueue_peering_evt(info.pgid, evt);
6906 }
6907
6908 void PG::queue_null(epoch_t msg_epoch,
6909 epoch_t query_epoch)
6910 {
6911 dout(10) << "null" << dendl;
6912 queue_peering_event(
6913 PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
6914 NullEvt())));
6915 }
6916
6917 void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx)
6918 {
6919 /*
6920 * if we couldn't start any recovery ops and things are still
6921 * unfound, see if we can discover more missing object locations.
6922 * It may be that our initial locations were bad and we errored
6923 * out while trying to pull.
6924 */
6925 discover_all_missing(*rctx->query_map);
6926 if (rctx->query_map->empty()) {
6927 string action;
6928 if (state_test(PG_STATE_BACKFILLING)) {
6929 auto evt = PGPeeringEventRef(
6930 new PGPeeringEvent(
6931 queued,
6932 queued,
6933 PG::UnfoundBackfill()));
6934 queue_peering_event(evt);
6935 action = "in backfill";
6936 } else if (state_test(PG_STATE_RECOVERING)) {
6937 auto evt = PGPeeringEventRef(
6938 new PGPeeringEvent(
6939 queued,
6940 queued,
6941 PG::UnfoundRecovery()));
6942 queue_peering_event(evt);
6943 action = "in recovery";
6944 } else {
6945 action = "already out of recovery/backfill";
6946 }
6947 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
6948 } else {
6949 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
6950 queue_recovery();
6951 }
6952 }
6953
6954 void PG::handle_advance_map(
6955 OSDMapRef osdmap, OSDMapRef lastmap,
6956 vector<int>& newup, int up_primary,
6957 vector<int>& newacting, int acting_primary,
6958 RecoveryCtx *rctx)
6959 {
6960 ceph_assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6961 ceph_assert(lastmap == osdmap_ref);
6962 dout(10) << "handle_advance_map "
6963 << newup << "/" << newacting
6964 << " -- " << up_primary << "/" << acting_primary
6965 << dendl;
6966 update_osdmap_ref(osdmap);
6967 osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
6968
6969 pool.update(cct, osdmap);
6970
6971 AdvMap evt(
6972 osdmap, lastmap, newup, up_primary,
6973 newacting, acting_primary);
6974 recovery_state.handle_event(evt, rctx);
6975 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6976 on_pool_change();
6977 update_store_with_options();
6978 }
6979 last_require_osd_release = osdmap->require_osd_release;
6980 }
6981
6982 void PG::handle_activate_map(RecoveryCtx *rctx)
6983 {
6984 dout(10) << "handle_activate_map " << dendl;
6985 ActMap evt;
6986 recovery_state.handle_event(evt, rctx);
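  // If the on-disk info has fallen more than osd_pg_epoch_persisted_max_stale
  // epochs behind, mark it dirty so write_if_dirty() below persists a fresher
  // epoch, limiting how far back we must replay maps after a restart.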
6987 if (osdmap_ref->get_epoch() - last_persisted_osdmap >
6988 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6989 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6990 << last_persisted_osdmap
6991 << " while current is " << osdmap_ref->get_epoch() << dendl;
6992 dirty_info = true;
6993 } else {
6994 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6995 << last_persisted_osdmap
6996 << " while current is " << osdmap_ref->get_epoch() << dendl;
6997 }
6998 if (osdmap_ref->check_new_blacklist_entries()) {
6999 check_blacklisted_watchers();
7000 }
7001 write_if_dirty(*rctx->transaction);
7002 }
7003
7004 void PG::handle_initialize(RecoveryCtx *rctx)
7005 {
7006 dout(10) << __func__ << dendl;
7007 Initialize evt;
7008 recovery_state.handle_event(evt, rctx);
7009 }
7010
7011 void PG::handle_query_state(Formatter *f)
7012 {
7013 dout(10) << "handle_query_state" << dendl;
7014 QueryState q(f);
7015 recovery_state.handle_event(q, 0);
7016 }
7017
7018 void PG::init_collection_pool_opts()
7019 {
7020 auto r = osd->store->set_collection_opts(ch, pool.info.opts);
7021 if (r < 0 && r != -EOPNOTSUPP) {
7022 derr << __func__ << " set_collection_opts returns error:" << r << dendl;
7023 }
7024 }
7025
7026 void PG::update_store_with_options()
7027 {
7028 init_collection_pool_opts();
7029 }
7030
7031 struct C_DeleteMore : public Context {
7032 PGRef pg;
7033 epoch_t epoch;
7034 C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
7035 void finish(int r) override {
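      // complete() below is overridden and never calls finish(), so this
      // should be unreachable.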
7036 ceph_abort();
7037 }
7038 void complete(int r) override {
7039 ceph_assert(r == 0);
7040 pg->lock();
7041 if (!pg->pg_has_reset_since(epoch)) {
7042 pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
7043 }
7044 pg->unlock();
7045 delete this;
7046 }
7047 };
7048
7049 void PG::_delete_some(ObjectStore::Transaction *t)
7050 {
7051 dout(10) << __func__ << dendl;
7052
7053 {
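    // Throttle removal: when osd_delete_sleep is configured, re-queue the
    // next delete batch via the sleep timer instead of deleting more now.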
7054 float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
7055 if (osd_delete_sleep > 0 && delete_needs_sleep) {
7056 epoch_t e = get_osdmap()->get_epoch();
7057 PGRef pgref(this);
7058 auto delete_requeue_callback = new FunctionContext([this, pgref, e](int r) {
7059 dout(20) << __func__ << " wake up at "
7060 << ceph_clock_now()
7061 << ", re-queuing delete" << dendl;
7062 lock();
7063 delete_needs_sleep = false;
7064 if (!pg_has_reset_since(e)) {
7065 osd->queue_for_pg_delete(get_pgid(), e);
7066 }
7067 unlock();
7068 });
7069
7070 utime_t delete_schedule_time = ceph_clock_now();
7071 delete_schedule_time += osd_delete_sleep;
7072 Mutex::Locker l(osd->sleep_lock);
7073 osd->sleep_timer.add_event_at(delete_schedule_time,
7074 delete_requeue_callback);
7075 dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
7076 return;
7077 }
7078 }
7079
7080 delete_needs_sleep = true;
7081
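  // Collect the next batch of objects to remove; the batch size is capped by
  // osd_target_transaction_size (and the store's ideal listing size).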
7082 vector<ghobject_t> olist;
7083 int max = std::min(osd->store->get_ideal_list_max(),
7084 (int)cct->_conf->osd_target_transaction_size);
7085 ghobject_t next;
7086 osd->store->collection_list(
7087 ch,
7088 next,
7089 ghobject_t::get_max(),
7090 max,
7091 &olist,
7092 &next);
7093 dout(20) << __func__ << " " << olist << dendl;
7094
7095 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
7096 int64_t num = 0;
7097 for (auto& oid : olist) {
7098 if (oid == pgmeta_oid) {
7099 continue;
7100 }
7101 if (oid.is_pgmeta()) {
7102 osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
7103 << " during PG removal";
7104 }
7105 int r = snap_mapper.remove_oid(oid.hobj, &_t);
7106 if (r != 0 && r != -ENOENT) {
7107 ceph_abort();
7108 }
7109 t->remove(coll, oid);
7110 ++num;
7111 }
7112 if (num) {
7113 dout(20) << __func__ << " deleting " << num << " objects" << dendl;
7114 Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
7115 t->register_on_commit(fin);
7116 } else {
7117 dout(20) << __func__ << " finished" << dendl;
7118 if (cct->_conf->osd_inject_failure_on_pg_removal) {
7119 _exit(1);
7120 }
7121
7122 // final flush here to ensure completions drop refs. Of particular concern
7123 // are the SnapMapper ContainerContexts.
7124 {
7125 PGRef pgref(this);
7126 PGLog::clear_info_log(info.pgid, t);
7127 t->remove_collection(coll);
7128 t->register_on_commit(new ContainerContext<PGRef>(pgref));
7129 t->register_on_applied(new ContainerContext<PGRef>(pgref));
7130 osd->store->queue_transaction(ch, std::move(*t));
7131 }
7132 ch->flush();
7133
7134 if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
7135 dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
7136 ch = osd->store->create_new_collection(coll);
7137 _create(*t,
7138 info.pgid,
7139 info.pgid.get_split_bits(pool.info.get_pg_num()));
7140 _init(*t, info.pgid, &pool.info);
7141 last_epoch = 0; // to ensure pg epoch is also written
7142 dirty_info = true;
7143 dirty_big_info = true;
7144 } else {
7145 deleted = true;
7146
7147 // cancel reserver here, since the PG is about to get deleted and the
7148 // exit() methods don't run when that happens.
7149 osd->local_reserver.cancel_reservation(info.pgid);
7150
7151 osd->logger->dec(l_osd_pg_removing);
7152 }
7153 }
7154 }
7155
7156 // Compute pending backfill data
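// (i.e. how many more bytes the primary holds than we do locally, clamped at zero)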
7157 static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
7158 {
7159 lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
7160 << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
7161 return std::max((int64_t)0, bf_bytes - local_bytes);
7162 }
7163
7164 int PG::pg_stat_adjust(osd_stat_t *ns)
7165 {
7166 osd_stat_t &new_stat = *ns;
7167 if (is_primary()) {
7168 return 0;
7169 }
7170 // Adjust the kb_used by adding pending backfill data
7171 uint64_t reserved_num_bytes = get_reserved_num_bytes();
7172
7173   // For now we don't consider projected space gains here.
7174   // A possible improvement is an optional two-pass backfill that frees up
7175   // space in a first pass; this could be triggered when at nearfull
7176   // or near backfillfull.
7177 if (reserved_num_bytes > 0) {
7178     // TODO: Handle compression by adjusting by the PG's average
7179     // compression percentage.
7180 dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
7181 << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7182 if (new_stat.statfs.available > reserved_num_bytes)
7183 new_stat.statfs.available -= reserved_num_bytes;
7184 else
7185 new_stat.statfs.available = 0;
7186 dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
7187 return 1;
7188 }
7189 return 0;
7190 }
7191
7192
7193 /*------------ Recovery State Machine----------------*/
7194 #undef dout_prefix
7195 #define dout_prefix (context< RecoveryMachine >().pg->gen_prefix(*_dout) \
7196 << "state<" << get_state_name() << ">: ")
7197
7198 /*------Crashed-------*/
7199 PG::RecoveryState::Crashed::Crashed(my_context ctx)
7200 : my_base(ctx),
7201 NamedState(context< RecoveryMachine >().pg, "Crashed")
7202 {
7203 context< RecoveryMachine >().log_enter(state_name);
7204 ceph_abort_msg("we got a bad state machine event");
7205 }
7206
7207
7208 /*------Initial-------*/
7209 PG::RecoveryState::Initial::Initial(my_context ctx)
7210 : my_base(ctx),
7211 NamedState(context< RecoveryMachine >().pg, "Initial")
7212 {
7213 context< RecoveryMachine >().log_enter(state_name);
7214 }
7215
7216 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
7217 {
7218 PG *pg = context< RecoveryMachine >().pg;
7219 pg->proc_replica_info(
7220 notify.from, notify.notify.info, notify.notify.epoch_sent);
7221 pg->set_last_peering_reset();
7222 return transit< Primary >();
7223 }
7224
7225 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
7226 {
7227 PG *pg = context< RecoveryMachine >().pg;
7228 ceph_assert(!pg->is_primary());
7229 post_event(i);
7230 return transit< Stray >();
7231 }
7232
7233 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
7234 {
7235 PG *pg = context< RecoveryMachine >().pg;
7236 ceph_assert(!pg->is_primary());
7237 post_event(i);
7238 return transit< Stray >();
7239 }
7240
7241 void PG::RecoveryState::Initial::exit()
7242 {
7243 context< RecoveryMachine >().log_exit(state_name, enter_time);
7244 PG *pg = context< RecoveryMachine >().pg;
7245 utime_t dur = ceph_clock_now() - enter_time;
7246 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
7247 }
7248
7249 /*------Started-------*/
7250 PG::RecoveryState::Started::Started(my_context ctx)
7251 : my_base(ctx),
7252 NamedState(context< RecoveryMachine >().pg, "Started")
7253 {
7254 context< RecoveryMachine >().log_enter(state_name);
7255 }
7256
7257 boost::statechart::result
7258 PG::RecoveryState::Started::react(const IntervalFlush&)
7259 {
7260 PG *pg = context< RecoveryMachine >().pg;
7261 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7262 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7263 return discard_event();
7264 }
7265
7266 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
7267 {
7268 PG *pg = context< RecoveryMachine >().pg;
7269 ldout(pg->cct, 10) << "Started advmap" << dendl;
7270 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7271 if (pg->should_restart_peering(
7272 advmap.up_primary,
7273 advmap.acting_primary,
7274 advmap.newup,
7275 advmap.newacting,
7276 advmap.lastmap,
7277 advmap.osdmap)) {
7278 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
7279 << dendl;
7280 post_event(advmap);
7281 return transit< Reset >();
7282 }
7283 pg->remove_down_peer_info(advmap.osdmap);
7284 return discard_event();
7285 }
7286
7287 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
7288 {
7289 q.f->open_object_section("state");
7290 q.f->dump_string("name", state_name);
7291 q.f->dump_stream("enter_time") << enter_time;
7292 q.f->close_section();
7293 return discard_event();
7294 }
7295
7296 void PG::RecoveryState::Started::exit()
7297 {
7298 context< RecoveryMachine >().log_exit(state_name, enter_time);
7299 PG *pg = context< RecoveryMachine >().pg;
7300 utime_t dur = ceph_clock_now() - enter_time;
7301 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
7302 }
7303
7304 /*--------Reset---------*/
7305 PG::RecoveryState::Reset::Reset(my_context ctx)
7306 : my_base(ctx),
7307 NamedState(context< RecoveryMachine >().pg, "Reset")
7308 {
7309 context< RecoveryMachine >().log_enter(state_name);
7310 PG *pg = context< RecoveryMachine >().pg;
7311
7312 pg->flushes_in_progress = 0;
7313 pg->set_last_peering_reset();
7314 }
7315
7316 boost::statechart::result
7317 PG::RecoveryState::Reset::react(const IntervalFlush&)
7318 {
7319 PG *pg = context< RecoveryMachine >().pg;
7320 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
7321 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
7322 return discard_event();
7323 }
7324
7325 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
7326 {
7327 PG *pg = context< RecoveryMachine >().pg;
7328 ldout(pg->cct, 10) << "Reset advmap" << dendl;
7329
7330 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
7331
7332 if (pg->should_restart_peering(
7333 advmap.up_primary,
7334 advmap.acting_primary,
7335 advmap.newup,
7336 advmap.newacting,
7337 advmap.lastmap,
7338 advmap.osdmap)) {
7339 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
7340 << dendl;
7341 pg->start_peering_interval(
7342 advmap.lastmap,
7343 advmap.newup, advmap.up_primary,
7344 advmap.newacting, advmap.acting_primary,
7345 context< RecoveryMachine >().get_cur_transaction());
7346 }
7347 pg->remove_down_peer_info(advmap.osdmap);
7348 pg->check_past_interval_bounds();
7349 return discard_event();
7350 }
7351
7352 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
7353 {
7354 PG *pg = context< RecoveryMachine >().pg;
7355 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7356 context< RecoveryMachine >().send_notify(
7357 pg->get_primary(),
7358 pg_notify_t(
7359 pg->get_primary().shard, pg->pg_whoami.shard,
7360 pg->get_osdmap_epoch(),
7361 pg->get_osdmap_epoch(),
7362 pg->info),
7363 pg->past_intervals);
7364 }
7365
7366 pg->update_heartbeat_peers();
7367 pg->take_waiters();
7368
7369 return transit< Started >();
7370 }
7371
7372 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
7373 {
7374 q.f->open_object_section("state");
7375 q.f->dump_string("name", state_name);
7376 q.f->dump_stream("enter_time") << enter_time;
7377 q.f->close_section();
7378 return discard_event();
7379 }
7380
7381 void PG::RecoveryState::Reset::exit()
7382 {
7383 context< RecoveryMachine >().log_exit(state_name, enter_time);
7384 PG *pg = context< RecoveryMachine >().pg;
7385 utime_t dur = ceph_clock_now() - enter_time;
7386 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
7387 }
7388
7389 /*-------Start---------*/
7390 PG::RecoveryState::Start::Start(my_context ctx)
7391 : my_base(ctx),
7392 NamedState(context< RecoveryMachine >().pg, "Start")
7393 {
7394 context< RecoveryMachine >().log_enter(state_name);
7395
7396 PG *pg = context< RecoveryMachine >().pg;
7397 if (pg->is_primary()) {
7398 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
7399 post_event(MakePrimary());
7400 } else { //is_stray
7401 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
7402 post_event(MakeStray());
7403 }
7404 }
7405
7406 void PG::RecoveryState::Start::exit()
7407 {
7408 context< RecoveryMachine >().log_exit(state_name, enter_time);
7409 PG *pg = context< RecoveryMachine >().pg;
7410 utime_t dur = ceph_clock_now() - enter_time;
7411 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
7412 }
7413
7414 /*---------Primary--------*/
7415 PG::RecoveryState::Primary::Primary(my_context ctx)
7416 : my_base(ctx),
7417 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
7418 {
7419 context< RecoveryMachine >().log_enter(state_name);
7420 PG *pg = context< RecoveryMachine >().pg;
7421 ceph_assert(pg->want_acting.empty());
7422
7423 // set CREATING bit until we have peered for the first time.
7424 if (pg->info.history.last_epoch_started == 0) {
7425 pg->state_set(PG_STATE_CREATING);
7426 // use the history timestamp, which ultimately comes from the
7427 // monitor in the create case.
7428 utime_t t = pg->info.history.last_scrub_stamp;
7429 pg->info.stats.last_fresh = t;
7430 pg->info.stats.last_active = t;
7431 pg->info.stats.last_change = t;
7432 pg->info.stats.last_peered = t;
7433 pg->info.stats.last_clean = t;
7434 pg->info.stats.last_unstale = t;
7435 pg->info.stats.last_undegraded = t;
7436 pg->info.stats.last_fullsized = t;
7437 pg->info.stats.last_scrub_stamp = t;
7438 pg->info.stats.last_deep_scrub_stamp = t;
7439 pg->info.stats.last_clean_scrub_stamp = t;
7440 }
7441 }
7442
7443 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
7444 {
7445 PG *pg = context< RecoveryMachine >().pg;
7446 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7447 pg->proc_replica_info(
7448 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7449 return discard_event();
7450 }
7451
7452 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
7453 {
7454 PG *pg = context< RecoveryMachine >().pg;
7455 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
7456 pg->publish_stats_to_osd();
7457 pg->take_waiters();
7458 return discard_event();
7459 }
7460
7461 boost::statechart::result PG::RecoveryState::Primary::react(
7462 const SetForceRecovery&)
7463 {
7464 PG *pg = context< RecoveryMachine >().pg;
7465 pg->set_force_recovery(true);
7466 return discard_event();
7467 }
7468
7469 boost::statechart::result PG::RecoveryState::Primary::react(
7470 const UnsetForceRecovery&)
7471 {
7472 PG *pg = context< RecoveryMachine >().pg;
7473 pg->set_force_recovery(false);
7474 return discard_event();
7475 }
7476
7477 boost::statechart::result PG::RecoveryState::Primary::react(
7478 const RequestScrub& evt)
7479 {
7480 PG *pg = context< RecoveryMachine >().pg;
7481 if (pg->is_primary()) {
7482 pg->scrub_requested(evt.deep, evt.repair);
7483 ldout(pg->cct,10) << "marking for scrub" << dendl;
7484 }
7485 return discard_event();
7486 }
7487
7488 boost::statechart::result PG::RecoveryState::Primary::react(
7489 const SetForceBackfill&)
7490 {
7491 PG *pg = context< RecoveryMachine >().pg;
7492 pg->set_force_backfill(true);
7493 return discard_event();
7494 }
7495
7496 boost::statechart::result PG::RecoveryState::Primary::react(
7497 const UnsetForceBackfill&)
7498 {
7499 PG *pg = context< RecoveryMachine >().pg;
7500 pg->set_force_backfill(false);
7501 return discard_event();
7502 }
7503
7504 void PG::RecoveryState::Primary::exit()
7505 {
7506 context< RecoveryMachine >().log_exit(state_name, enter_time);
7507 PG *pg = context< RecoveryMachine >().pg;
7508 pg->want_acting.clear();
7509 utime_t dur = ceph_clock_now() - enter_time;
7510 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
7511 pg->clear_primary_state();
7512 pg->state_clear(PG_STATE_CREATING);
7513 }
7514
7515 /*---------Peering--------*/
7516 PG::RecoveryState::Peering::Peering(my_context ctx)
7517 : my_base(ctx),
7518 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
7519 history_les_bound(false)
7520 {
7521 context< RecoveryMachine >().log_enter(state_name);
7522
7523 PG *pg = context< RecoveryMachine >().pg;
7524 ceph_assert(!pg->is_peered());
7525 ceph_assert(!pg->is_peering());
7526 ceph_assert(pg->is_primary());
7527 pg->state_set(PG_STATE_PEERING);
7528 }
7529
7530 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
7531 {
7532 PG *pg = context< RecoveryMachine >().pg;
7533 ldout(pg->cct, 10) << "Peering advmap" << dendl;
7534 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
7535 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
7536 post_event(advmap);
7537 return transit< Reset >();
7538 }
7539
7540 pg->adjust_need_up_thru(advmap.osdmap);
7541
7542 return forward_event();
7543 }
7544
7545 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
7546 {
7547 PG *pg = context< RecoveryMachine >().pg;
7548
7549 q.f->open_object_section("state");
7550 q.f->dump_string("name", state_name);
7551 q.f->dump_stream("enter_time") << enter_time;
7552
7553 q.f->open_array_section("past_intervals");
7554 pg->past_intervals.dump(q.f);
7555 q.f->close_section();
7556
7557 q.f->open_array_section("probing_osds");
7558 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
7559 p != prior_set.probe.end();
7560 ++p)
7561 q.f->dump_stream("osd") << *p;
7562 q.f->close_section();
7563
7564 if (prior_set.pg_down)
7565 q.f->dump_string("blocked", "peering is blocked due to down osds");
7566
7567 q.f->open_array_section("down_osds_we_would_probe");
7568 for (set<int>::iterator p = prior_set.down.begin();
7569 p != prior_set.down.end();
7570 ++p)
7571 q.f->dump_int("osd", *p);
7572 q.f->close_section();
7573
7574 q.f->open_array_section("peering_blocked_by");
7575 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
7576 p != prior_set.blocked_by.end();
7577 ++p) {
7578 q.f->open_object_section("osd");
7579 q.f->dump_int("osd", p->first);
7580 q.f->dump_int("current_lost_at", p->second);
7581 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
7582 q.f->close_section();
7583 }
7584 q.f->close_section();
7585
7586 if (history_les_bound) {
7587 q.f->open_array_section("peering_blocked_by_detail");
7588 q.f->open_object_section("item");
7589 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
7590 q.f->close_section();
7591 q.f->close_section();
7592 }
7593
7594 q.f->close_section();
7595 return forward_event();
7596 }
7597
7598 void PG::RecoveryState::Peering::exit()
7599 {
7600 PG *pg = context< RecoveryMachine >().pg;
7601 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
7602 context< RecoveryMachine >().log_exit(state_name, enter_time);
7603 pg->state_clear(PG_STATE_PEERING);
7604 pg->clear_probe_targets();
7605
7606 utime_t dur = ceph_clock_now() - enter_time;
7607 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
7608 }
7609
7610
7611 /*------Backfilling-------*/
7612 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
7613 : my_base(ctx),
7614 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
7615 {
7616 context< RecoveryMachine >().log_enter(state_name);
7617 PG *pg = context< RecoveryMachine >().pg;
7618 pg->backfill_reserved = true;
7619 pg->queue_recovery();
7620 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7621 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7622 pg->state_set(PG_STATE_BACKFILLING);
7623 pg->publish_stats_to_osd();
7624 }
7625
7626 void PG::RecoveryState::Backfilling::backfill_release_reservations()
7627 {
7628 PG *pg = context< RecoveryMachine >().pg;
7629 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7630 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
7631 it != pg->backfill_targets.end();
7632 ++it) {
7633 ceph_assert(*it != pg->pg_whoami);
7634 ConnectionRef con = pg->osd->get_con_osd_cluster(
7635 it->osd, pg->get_osdmap_epoch());
7636 if (con) {
7637 pg->osd->send_message_osd_cluster(
7638 new MBackfillReserve(
7639 MBackfillReserve::RELEASE,
7640 spg_t(pg->info.pgid.pgid, it->shard),
7641 pg->get_osdmap_epoch()),
7642 con.get());
7643 }
7644 }
7645 }
7646
7647 void PG::RecoveryState::Backfilling::cancel_backfill()
7648 {
7649 PG *pg = context< RecoveryMachine >().pg;
7650 backfill_release_reservations();
7651 if (!pg->waiting_on_backfill.empty()) {
7652 pg->waiting_on_backfill.clear();
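      // finish the sentinel recovery op (tracked as hobject_t::get_max()) so
      // recovery-op accounting stays balanced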
7653 pg->finish_recovery_op(hobject_t::get_max());
7654 }
7655 }
7656
7657 boost::statechart::result
7658 PG::RecoveryState::Backfilling::react(const Backfilled &c)
7659 {
7660 backfill_release_reservations();
7661 return transit<Recovered>();
7662 }
7663
7664 boost::statechart::result
7665 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
7666 {
7667 PG *pg = context< RecoveryMachine >().pg;
7668 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
7669 pg->state_set(PG_STATE_BACKFILL_WAIT);
7670 pg->state_clear(PG_STATE_BACKFILLING);
7671 cancel_backfill();
7672 pg->schedule_backfill_retry(c.delay);
7673 return transit<NotBackfilling>();
7674 }
7675
7676 boost::statechart::result
7677 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
7678 {
7679 PG *pg = context< RecoveryMachine >().pg;
7680 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
7681 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
7682 pg->state_clear(PG_STATE_BACKFILLING);
7683 cancel_backfill();
7684 return transit<NotBackfilling>();
7685 }
7686
7687 boost::statechart::result
7688 PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &)
7689 {
7690 PG *pg = context< RecoveryMachine >().pg;
7691 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7692 pg->state_clear(PG_STATE_BACKFILLING);
7693 cancel_backfill();
7694 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7695 return transit<NotBackfilling>();
7696 }
7697
7698 boost::statechart::result
7699 PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &)
7700 {
7701 PG *pg = context< RecoveryMachine >().pg;
7702 pg->state_set(PG_STATE_BACKFILL_WAIT);
7703 cancel_backfill();
7704 if (pg->needs_backfill()) {
7705 return transit<WaitLocalBackfillReserved>();
7706 } else {
7707 // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
7708 return discard_event();
7709 }
7710 }
7711
7712 void PG::RecoveryState::Backfilling::exit()
7713 {
7714 context< RecoveryMachine >().log_exit(state_name, enter_time);
7715 PG *pg = context< RecoveryMachine >().pg;
7716 pg->backfill_reserved = false;
7717 pg->backfill_reserving = false;
7718 pg->state_clear(PG_STATE_BACKFILLING);
7719 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7720 utime_t dur = ceph_clock_now() - enter_time;
7721 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
7722 }
7723
7724 /*--WaitRemoteBackfillReserved--*/
7725
7726 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
7727 : my_base(ctx),
7728 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
7729 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
7730 {
7731 context< RecoveryMachine >().log_enter(state_name);
7732 PG *pg = context< RecoveryMachine >().pg;
7733 pg->state_set(PG_STATE_BACKFILL_WAIT);
7734 pg->publish_stats_to_osd();
7735 post_event(RemoteBackfillReserved());
7736 }
7737
7738 boost::statechart::result
7739 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
7740 {
7741 PG *pg = context< RecoveryMachine >().pg;
7742
7743 int64_t num_bytes = pg->info.stats.stats.sum.num_bytes;
7744 ldout(pg->cct, 10) << __func__ << " num_bytes " << num_bytes << dendl;
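  // Reservations are requested one backfill target per RemoteBackfillReserved
  // event; once the iterator reaches the end, every target has been reserved.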
7745 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
7746 //The primary never backfills itself
7747 ceph_assert(*backfill_osd_it != pg->pg_whoami);
7748 ConnectionRef con = pg->osd->get_con_osd_cluster(
7749 backfill_osd_it->osd, pg->get_osdmap_epoch());
7750 if (con) {
7751 pg->osd->send_message_osd_cluster(
7752 new MBackfillReserve(
7753 MBackfillReserve::REQUEST,
7754 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
7755 pg->get_osdmap_epoch(),
7756 pg->get_backfill_priority(),
7757 num_bytes,
7758 pg->peer_bytes[*backfill_osd_it]),
7759 con.get());
7760 }
7761 ++backfill_osd_it;
7762 } else {
7763 pg->peer_bytes.clear();
7764 post_event(AllBackfillsReserved());
7765 }
7766 return discard_event();
7767 }
7768
7769 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
7770 {
7771 context< RecoveryMachine >().log_exit(state_name, enter_time);
7772 PG *pg = context< RecoveryMachine >().pg;
7773 utime_t dur = ceph_clock_now() - enter_time;
7774 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
7775 }
7776
7777 void PG::RecoveryState::WaitRemoteBackfillReserved::retry()
7778 {
7779 PG *pg = context< RecoveryMachine >().pg;
7780 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7781
7782 // Send CANCEL to all previously acquired reservations
7783 set<pg_shard_t>::const_iterator it, begin, end;
7784 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
7785 end = context< Active >().remote_shards_to_reserve_backfill.end();
7786 ceph_assert(begin != end);
7787 for (it = begin; it != backfill_osd_it; ++it) {
7788 //The primary never backfills itself
7789 ceph_assert(*it != pg->pg_whoami);
7790 ConnectionRef con = pg->osd->get_con_osd_cluster(
7791 it->osd, pg->get_osdmap_epoch());
7792 if (con) {
7793 pg->osd->send_message_osd_cluster(
7794 new MBackfillReserve(
7795 MBackfillReserve::RELEASE,
7796 spg_t(pg->info.pgid.pgid, it->shard),
7797 pg->get_osdmap_epoch()),
7798 con.get());
7799 }
7800 }
7801
7802 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7803 pg->publish_stats_to_osd();
7804
7805 pg->schedule_backfill_retry(pg->cct->_conf->osd_backfill_retry_interval);
7806 }
7807
7808 boost::statechart::result
7809 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
7810 {
7811 PG *pg = context< RecoveryMachine >().pg;
7812 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
7813 retry();
7814 return transit<NotBackfilling>();
7815 }
7816
7817 boost::statechart::result
7818 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
7819 {
7820 retry();
7821 return transit<NotBackfilling>();
7822 }
7823
7824 /*--WaitLocalBackfillReserved--*/
7825 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
7826 : my_base(ctx),
7827 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
7828 {
7829 context< RecoveryMachine >().log_enter(state_name);
7830 PG *pg = context< RecoveryMachine >().pg;
7831 pg->state_set(PG_STATE_BACKFILL_WAIT);
7832 pg->osd->local_reserver.request_reservation(
7833 pg->info.pgid,
7834 new QueuePeeringEvt<LocalBackfillReserved>(
7835 pg, pg->get_osdmap_epoch(),
7836 LocalBackfillReserved()),
7837 pg->get_backfill_priority(),
7838 new QueuePeeringEvt<DeferBackfill>(
7839 pg, pg->get_osdmap_epoch(),
7840 DeferBackfill(0.0)));
7841 pg->publish_stats_to_osd();
7842 }
7843
7844 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
7845 {
7846 context< RecoveryMachine >().log_exit(state_name, enter_time);
7847 PG *pg = context< RecoveryMachine >().pg;
7848 utime_t dur = ceph_clock_now() - enter_time;
7849 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
7850 }
7851
7852 /*----NotBackfilling------*/
7853 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
7854 : my_base(ctx),
7855 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
7856 {
7857 context< RecoveryMachine >().log_enter(state_name);
7858 PG *pg = context< RecoveryMachine >().pg;
7859 pg->state_clear(PG_STATE_REPAIR);
7860 pg->publish_stats_to_osd();
7861 }
7862
7863 boost::statechart::result
7864 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
7865 {
7866 return discard_event();
7867 }
7868
7869 boost::statechart::result
7870 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
7871 {
7872 return discard_event();
7873 }
7874
7875 void PG::RecoveryState::NotBackfilling::exit()
7876 {
7877 context< RecoveryMachine >().log_exit(state_name, enter_time);
7878 PG *pg = context< RecoveryMachine >().pg;
7879 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
7880 utime_t dur = ceph_clock_now() - enter_time;
7881 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
7882 }
7883
7884 /*----NotRecovering------*/
7885 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
7886 : my_base(ctx),
7887 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
7888 {
7889 context< RecoveryMachine >().log_enter(state_name);
7890 PG *pg = context< RecoveryMachine >().pg;
7891 pg->publish_stats_to_osd();
7892 }
7893
7894 void PG::RecoveryState::NotRecovering::exit()
7895 {
7896 context< RecoveryMachine >().log_exit(state_name, enter_time);
7897 PG *pg = context< RecoveryMachine >().pg;
7898 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
7899 utime_t dur = ceph_clock_now() - enter_time;
7900 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
7901 }
7902
7903 /*---RepNotRecovering----*/
7904 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
7905 : my_base(ctx),
7906 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
7907 {
7908 context< RecoveryMachine >().log_enter(state_name);
7909 }
7910
7911 boost::statechart::result
7912 PG::RecoveryState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
7913 {
7914 PG *pg = context< RecoveryMachine >().pg;
7915 pg->reject_reservation();
7916 post_event(RemoteReservationRejectedTooFull());
7917 return discard_event();
7918 }
7919
7920 void PG::RecoveryState::RepNotRecovering::exit()
7921 {
7922 context< RecoveryMachine >().log_exit(state_name, enter_time);
7923 PG *pg = context< RecoveryMachine >().pg;
7924 utime_t dur = ceph_clock_now() - enter_time;
7925 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
7926 }
7927
7928 /*---RepWaitRecoveryReserved--*/
7929 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
7930 : my_base(ctx),
7931 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
7932 {
7933 context< RecoveryMachine >().log_enter(state_name);
7934 }
7935
7936 boost::statechart::result
7937 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
7938 {
7939 PG *pg = context< RecoveryMachine >().pg;
7940 pg->osd->send_message_osd_cluster(
7941 pg->primary.osd,
7942 new MRecoveryReserve(
7943 MRecoveryReserve::GRANT,
7944 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7945 pg->get_osdmap_epoch()),
7946 pg->get_osdmap_epoch());
7947 return transit<RepRecovering>();
7948 }
7949
7950 boost::statechart::result
7951 PG::RecoveryState::RepWaitRecoveryReserved::react(
7952 const RemoteReservationCanceled &evt)
7953 {
7954 PG *pg = context< RecoveryMachine >().pg;
7955 pg->clear_reserved_num_bytes();
7956 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7957 return transit<RepNotRecovering>();
7958 }
7959
7960 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7961 {
7962 context< RecoveryMachine >().log_exit(state_name, enter_time);
7963 PG *pg = context< RecoveryMachine >().pg;
7964 utime_t dur = ceph_clock_now() - enter_time;
7965 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7966 }
7967
7968 /*-RepWaitBackfillReserved*/
7969 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7970 : my_base(ctx),
7971 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7972 {
7973 context< RecoveryMachine >().log_enter(state_name);
7974 }
7975
7976 boost::statechart::result
7977 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7978 {
7979 PG *pg = context< RecoveryMachine >().pg;
7980   // Use tentative_backfill_full() to make sure enough
7981   // space is available to handle the target bytes from the primary.
7982
7983 // TODO: If we passed num_objects from primary we could account for
7984 // an estimate of the metadata overhead.
7985
7986 // TODO: If we had compressed_allocated and compressed_original from primary
7987 // we could compute compression ratio and adjust accordingly.
7988
7989   // XXX: There is no way to get omap overhead and this would only apply
7990   // to whatever (possibly different) partition is storing the database.
7991
7992 // update_osd_stat() from heartbeat will do this on a new
7993 // statfs using pg->primary_num_bytes.
7994 uint64_t pending_adjustment = 0;
7995 int64_t primary_num_bytes = evt.primary_num_bytes;
7996 int64_t local_num_bytes = evt.local_num_bytes;
7997 if (primary_num_bytes) {
7998     // For an erasure coded pool, overestimate by a full stripe per object
7999     // because we don't know how each object was rounded to the nearest stripe
8000 if (pg->pool.info.is_erasure()) {
8001 primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
8002 primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
8003 local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
8004 local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * pg->info.stats.stats.sum.num_objects;
8005 }
8006 pending_adjustment = pending_backfill(pg->cct, primary_num_bytes, local_num_bytes);
8007 ldout(pg->cct, 10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
8008 << " local " << (local_num_bytes >> 10) << "KiB"
8009 << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
8010 << dendl;
8011 }
8012   // This lock protects not only the OSDService stats but also the setting of the PG's primary_num_bytes;
8013   // that's why we don't unlock immediately.
8014 Mutex::Locker l(pg->osd->stat_lock);
8015 osd_stat_t cur_stat = pg->osd->osd_stat;
8016 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
8017 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
8018 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
8019 << dendl;
8020 post_event(RejectTooFullRemoteReservation());
8021 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
8022 pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
8023 ldout(pg->cct, 10) << "backfill reservation rejected: backfill full"
8024 << dendl;
8025 post_event(RejectTooFullRemoteReservation());
8026 } else {
8027 Context *preempt = nullptr;
8028     // Don't reserve space if the reservation check was skipped; this is used
8029     // to test the other backfill-full check AND in case corruption
8030     // of num_bytes requires ignoring that value and attempting the
8031     // backfill anyway.
8032 if (primary_num_bytes && !pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
8033 pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
8034 else
8035 pg->clear_reserved_num_bytes();
8036 // Use un-ec-adjusted bytes for stats.
8037 pg->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
8038 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
8039 // older peers will interpret preemption as TOOFULL
8040 preempt = new QueuePeeringEvt<RemoteBackfillPreempted>(
8041 pg, pg->get_osdmap_epoch(),
8042 RemoteBackfillPreempted());
8043 }
8044 pg->osd->remote_reserver.request_reservation(
8045 pg->info.pgid,
8046 new QueuePeeringEvt<RemoteBackfillReserved>(
8047 pg, pg->get_osdmap_epoch(),
8048 RemoteBackfillReserved()),
8049 evt.priority,
8050 preempt);
8051 }
8052 return transit<RepWaitBackfillReserved>();
8053 }
8054
8055 boost::statechart::result
8056 PG::RecoveryState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
8057 {
8058 PG *pg = context< RecoveryMachine >().pg;
8059
8060   // fall back to a local reckoning of priority if the primary doesn't pass one
8061   // (pre-mimic compat)
8062 int prio = evt.priority ? evt.priority : pg->get_recovery_priority();
8063
8064 Context *preempt = nullptr;
8065 if (HAVE_FEATURE(pg->upacting_features, RECOVERY_RESERVATION_2)) {
8066 // older peers can't handle this
8067 preempt = new QueuePeeringEvt<RemoteRecoveryPreempted>(
8068 pg, pg->get_osdmap_epoch(),
8069 RemoteRecoveryPreempted());
8070 }
8071
8072 pg->osd->remote_reserver.request_reservation(
8073 pg->info.pgid,
8074 new QueuePeeringEvt<RemoteRecoveryReserved>(
8075 pg, pg->get_osdmap_epoch(),
8076 RemoteRecoveryReserved()),
8077 prio,
8078 preempt);
8079 return transit<RepWaitRecoveryReserved>();
8080 }
8081
8082 void PG::RecoveryState::RepWaitBackfillReserved::exit()
8083 {
8084 context< RecoveryMachine >().log_exit(state_name, enter_time);
8085 PG *pg = context< RecoveryMachine >().pg;
8086 utime_t dur = ceph_clock_now() - enter_time;
8087 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
8088 }
8089
8090 boost::statechart::result
8091 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
8092 {
8093 PG *pg = context< RecoveryMachine >().pg;
8094
8095 pg->osd->send_message_osd_cluster(
8096 pg->primary.osd,
8097 new MBackfillReserve(
8098 MBackfillReserve::GRANT,
8099 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8100 pg->get_osdmap_epoch()),
8101 pg->get_osdmap_epoch());
8102 return transit<RepRecovering>();
8103 }
8104
8105 boost::statechart::result
8106 PG::RecoveryState::RepWaitBackfillReserved::react(
8107 const RejectTooFullRemoteReservation &evt)
8108 {
8109 PG *pg = context< RecoveryMachine >().pg;
8110 pg->reject_reservation();
8111 post_event(RemoteReservationRejectedTooFull());
8112 return discard_event();
8113 }
8114
8115 boost::statechart::result
8116 PG::RecoveryState::RepWaitBackfillReserved::react(
8117 const RemoteReservationRejectedTooFull &evt)
8118 {
8119 PG *pg = context< RecoveryMachine >().pg;
8120 pg->clear_reserved_num_bytes();
8121 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8122 return transit<RepNotRecovering>();
8123 }
8124
8125 boost::statechart::result
8126 PG::RecoveryState::RepWaitBackfillReserved::react(
8127 const RemoteReservationCanceled &evt)
8128 {
8129 PG *pg = context< RecoveryMachine >().pg;
8130 pg->clear_reserved_num_bytes();
8131 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8132 return transit<RepNotRecovering>();
8133 }
8134
8135 /*---RepRecovering-------*/
8136 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
8137 : my_base(ctx),
8138 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
8139 {
8140 context< RecoveryMachine >().log_enter(state_name);
8141 }
8142
8143 boost::statechart::result
8144 PG::RecoveryState::RepRecovering::react(const RemoteRecoveryPreempted &)
8145 {
8146 PG *pg = context< RecoveryMachine >().pg;
8147 pg->clear_reserved_num_bytes();
8148 pg->osd->send_message_osd_cluster(
8149 pg->primary.osd,
8150 new MRecoveryReserve(
8151 MRecoveryReserve::REVOKE,
8152 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8153 pg->get_osdmap_epoch()),
8154 pg->get_osdmap_epoch());
8155 return discard_event();
8156 }
8157
8158 boost::statechart::result
8159 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
8160 {
8161 PG *pg = context< RecoveryMachine >().pg;
8162 pg->clear_reserved_num_bytes();
8163 pg->osd->send_message_osd_cluster(
8164 pg->primary.osd,
8165 new MBackfillReserve(
8166 MBackfillReserve::REVOKE_TOOFULL,
8167 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8168 pg->get_osdmap_epoch()),
8169 pg->get_osdmap_epoch());
8170 return discard_event();
8171 }
8172
8173 boost::statechart::result
8174 PG::RecoveryState::RepRecovering::react(const RemoteBackfillPreempted &)
8175 {
8176 PG *pg = context< RecoveryMachine >().pg;
8177 pg->clear_reserved_num_bytes();
8178 pg->osd->send_message_osd_cluster(
8179 pg->primary.osd,
8180 new MBackfillReserve(
8181 MBackfillReserve::REVOKE,
8182 spg_t(pg->info.pgid.pgid, pg->primary.shard),
8183 pg->get_osdmap_epoch()),
8184 pg->get_osdmap_epoch());
8185 return discard_event();
8186 }
8187
8188 void PG::RecoveryState::RepRecovering::exit()
8189 {
8190 context< RecoveryMachine >().log_exit(state_name, enter_time);
8191 PG *pg = context< RecoveryMachine >().pg;
8192 pg->clear_reserved_num_bytes();
8193 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
8194 utime_t dur = ceph_clock_now() - enter_time;
8195 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
8196 }
8197
8198 /*------Activating--------*/
8199 PG::RecoveryState::Activating::Activating(my_context ctx)
8200 : my_base(ctx),
8201 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
8202 {
8203 context< RecoveryMachine >().log_enter(state_name);
8204 }
8205
8206 void PG::RecoveryState::Activating::exit()
8207 {
8208 context< RecoveryMachine >().log_exit(state_name, enter_time);
8209 PG *pg = context< RecoveryMachine >().pg;
8210 utime_t dur = ceph_clock_now() - enter_time;
8211 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
8212 }
8213
8214 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
8215 : my_base(ctx),
8216 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
8217 {
8218 context< RecoveryMachine >().log_enter(state_name);
8219 PG *pg = context< RecoveryMachine >().pg;
8220
8221   // Make sure all nodes that are part of the recovery aren't full
8222 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
8223 pg->osd->check_osdmap_full(pg->acting_recovery_backfill)) {
8224 post_event(RecoveryTooFull());
8225 return;
8226 }
8227
8228 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8229 pg->state_set(PG_STATE_RECOVERY_WAIT);
8230 pg->osd->local_reserver.request_reservation(
8231 pg->info.pgid,
8232 new QueuePeeringEvt<LocalRecoveryReserved>(
8233 pg, pg->get_osdmap_epoch(),
8234 LocalRecoveryReserved()),
8235 pg->get_recovery_priority(),
8236 new QueuePeeringEvt<DeferRecovery>(
8237 pg, pg->get_osdmap_epoch(),
8238 DeferRecovery(0.0)));
8239 pg->publish_stats_to_osd();
8240 }
8241
8242 boost::statechart::result
8243 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
8244 {
8245 PG *pg = context< RecoveryMachine >().pg;
8246 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
8247 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
8248 return transit<NotRecovering>();
8249 }
8250
8251 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
8252 {
8253 context< RecoveryMachine >().log_exit(state_name, enter_time);
8254 PG *pg = context< RecoveryMachine >().pg;
8255 utime_t dur = ceph_clock_now() - enter_time;
8256 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
8257 }
8258
8259 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
8260 : my_base(ctx),
8261 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
8262 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
8263 {
8264 context< RecoveryMachine >().log_enter(state_name);
8265 post_event(RemoteRecoveryReserved());
8266 }
8267
8268 boost::statechart::result
8269 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
8270 PG *pg = context< RecoveryMachine >().pg;
8271
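  // As with backfill, reservations are requested one recovery target per
  // RemoteRecoveryReserved event until all remote shards are reserved.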
8272 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
8273 ceph_assert(*remote_recovery_reservation_it != pg->pg_whoami);
8274 ConnectionRef con = pg->osd->get_con_osd_cluster(
8275 remote_recovery_reservation_it->osd, pg->get_osdmap_epoch());
8276 if (con) {
8277 pg->osd->send_message_osd_cluster(
8278 new MRecoveryReserve(
8279 MRecoveryReserve::REQUEST,
8280 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
8281 pg->get_osdmap_epoch(),
8282 pg->get_recovery_priority()),
8283 con.get());
8284 }
8285 ++remote_recovery_reservation_it;
8286 } else {
8287 post_event(AllRemotesReserved());
8288 }
8289 return discard_event();
8290 }
8291
8292 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
8293 {
8294 context< RecoveryMachine >().log_exit(state_name, enter_time);
8295 PG *pg = context< RecoveryMachine >().pg;
8296 utime_t dur = ceph_clock_now() - enter_time;
8297 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
8298 }
8299
8300 PG::RecoveryState::Recovering::Recovering(my_context ctx)
8301 : my_base(ctx),
8302 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
8303 {
8304 context< RecoveryMachine >().log_enter(state_name);
8305
8306 PG *pg = context< RecoveryMachine >().pg;
8307 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8308 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8309 pg->state_set(PG_STATE_RECOVERING);
8310 ceph_assert(!pg->state_test(PG_STATE_ACTIVATING));
8311 pg->publish_stats_to_osd();
8312 pg->queue_recovery();
8313 }
8314
8315 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
8316 {
8317 PG *pg = context< RecoveryMachine >().pg;
8318 ceph_assert(cancel || !pg->pg_log.get_missing().have_missing());
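  // cancel==true means recovery is being aborted (deferred or unfound), so
  // missing objects may remain; otherwise recovery must have completed.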
8319
8320 // release remote reservations
8321 for (set<pg_shard_t>::const_iterator i =
8322 context< Active >().remote_shards_to_reserve_recovery.begin();
8323 i != context< Active >().remote_shards_to_reserve_recovery.end();
8324 ++i) {
8325 if (*i == pg->pg_whoami) // skip myself
8326 continue;
8327 ConnectionRef con = pg->osd->get_con_osd_cluster(
8328 i->osd, pg->get_osdmap_epoch());
8329 if (con) {
8330 pg->osd->send_message_osd_cluster(
8331 new MRecoveryReserve(
8332 MRecoveryReserve::RELEASE,
8333 spg_t(pg->info.pgid.pgid, i->shard),
8334 pg->get_osdmap_epoch()),
8335 con.get());
8336 }
8337 }
8338 }
8339
8340 boost::statechart::result
8341 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
8342 {
8343 PG *pg = context< RecoveryMachine >().pg;
8344 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8345 release_reservations();
8346 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8347 return transit<Recovered>();
8348 }
8349
8350 boost::statechart::result
8351 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
8352 {
8353 PG *pg = context< RecoveryMachine >().pg;
8354 pg->state_clear(PG_STATE_FORCED_RECOVERY);
8355 release_reservations();
8356 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8357 // XXX: Is this needed?
8358 pg->publish_stats_to_osd();
8359 return transit<WaitLocalBackfillReserved>();
8360 }
8361
8362 boost::statechart::result
8363 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
8364 {
8365 PG *pg = context< RecoveryMachine >().pg;
8366 if (!pg->state_test(PG_STATE_RECOVERING)) {
8367 // we may have finished recovery and have an AllReplicasRecovered
8368 // event queued to move us to the next state.
8369 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
8370 return discard_event();
8371 }
8372 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
8373 pg->state_set(PG_STATE_RECOVERY_WAIT);
8374 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8375 release_reservations(true);
8376 pg->schedule_recovery_retry(evt.delay);
8377 return transit<NotRecovering>();
8378 }
8379
8380 boost::statechart::result
8381 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
8382 {
8383 PG *pg = context< RecoveryMachine >().pg;
8384 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
8385 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
8386 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8387 release_reservations(true);
8388 return transit<NotRecovering>();
8389 }
8390
8391 void PG::RecoveryState::Recovering::exit()
8392 {
8393 context< RecoveryMachine >().log_exit(state_name, enter_time);
8394 PG *pg = context< RecoveryMachine >().pg;
8395 utime_t dur = ceph_clock_now() - enter_time;
8396 pg->state_clear(PG_STATE_RECOVERING);
8397 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
8398 }
8399
8400 PG::RecoveryState::Recovered::Recovered(my_context ctx)
8401 : my_base(ctx),
8402 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
8403 {
8404 pg_shard_t auth_log_shard;
8405
8406 context< RecoveryMachine >().log_enter(state_name);
8407
8408 PG *pg = context< RecoveryMachine >().pg;
8409
8410 ceph_assert(!pg->needs_recovery());
8411
8412 // if we finished backfill, all acting are active; recheck if
8413 // DEGRADED | UNDERSIZED is appropriate.
8414 ceph_assert(!pg->acting_recovery_backfill.empty());
8415 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
8416 pg->acting_recovery_backfill.size()) {
8417 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
8418 pg->publish_stats_to_osd();
8419 }
8420
8421 // adjust acting set? (e.g. because backfill completed...)
8422 bool history_les_bound = false;
8423 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
8424 true, &history_les_bound)) {
8425 ceph_assert(pg->want_acting.size());
8426 } else if (!pg->async_recovery_targets.empty()) {
8427 pg->choose_acting(auth_log_shard, true, &history_les_bound);
8428 }
8429
8430 if (context< Active >().all_replicas_activated &&
8431 pg->async_recovery_targets.empty())
8432 post_event(GoClean());
8433 }
8434
8435 void PG::RecoveryState::Recovered::exit()
8436 {
8437 context< RecoveryMachine >().log_exit(state_name, enter_time);
8438 PG *pg = context< RecoveryMachine >().pg;
8439 utime_t dur = ceph_clock_now() - enter_time;
8440 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
8441 }
8442
8443 PG::RecoveryState::Clean::Clean(my_context ctx)
8444 : my_base(ctx),
8445 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
8446 {
8447 context< RecoveryMachine >().log_enter(state_name);
8448
8449 PG *pg = context< RecoveryMachine >().pg;
8450
8451 if (pg->info.last_complete != pg->info.last_update) {
8452 ceph_abort();
8453 }
8454 Context *c = pg->finish_recovery();
8455 context< RecoveryMachine >().get_cur_transaction()->register_on_commit(c);
8456
8457 pg->try_mark_clean();
8458 }
8459
8460 void PG::RecoveryState::Clean::exit()
8461 {
8462 context< RecoveryMachine >().log_exit(state_name, enter_time);
8463 PG *pg = context< RecoveryMachine >().pg;
8464 pg->state_clear(PG_STATE_CLEAN);
8465 utime_t dur = ceph_clock_now() - enter_time;
8466 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
8467 }
8468
8469 template <typename T>
8470 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
8471 {
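  // Collect at most one pg_shard_t per distinct OSD from 'in', skipping 'skip';
  // this avoids double-reserving an OSD that hosts more than one shard of this PG.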
8472 set<int> osds_found;
8473 set<pg_shard_t> out;
8474 for (typename T::const_iterator i = in.begin();
8475 i != in.end();
8476 ++i) {
8477 if (*i != skip && !osds_found.count(i->osd)) {
8478 osds_found.insert(i->osd);
8479 out.insert(*i);
8480 }
8481 }
8482 return out;
8483 }
8484
8485 /*---------Active---------*/
8486 PG::RecoveryState::Active::Active(my_context ctx)
8487 : my_base(ctx),
8488 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
8489 remote_shards_to_reserve_recovery(
8490 unique_osd_shard_set(
8491 context< RecoveryMachine >().pg->pg_whoami,
8492 context< RecoveryMachine >().pg->acting_recovery_backfill)),
8493 remote_shards_to_reserve_backfill(
8494 unique_osd_shard_set(
8495 context< RecoveryMachine >().pg->pg_whoami,
8496 context< RecoveryMachine >().pg->backfill_targets)),
8497 all_replicas_activated(false)
8498 {
8499 context< RecoveryMachine >().log_enter(state_name);
8500
8501 PG *pg = context< RecoveryMachine >().pg;
8502
8503 ceph_assert(!pg->backfill_reserving);
8504 ceph_assert(!pg->backfill_reserved);
8505 ceph_assert(pg->is_primary());
8506 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
8507 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8508 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8509 pg->get_osdmap_epoch(),
8510 *context< RecoveryMachine >().get_query_map(),
8511 context< RecoveryMachine >().get_info_map(),
8512 context< RecoveryMachine >().get_recovery_ctx());
8513
8514 // everyone has to commit/ack before we are truly active
8515 pg->blocked_by.clear();
8516 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
8517 p != pg->acting_recovery_backfill.end();
8518 ++p) {
8519 if (p->shard != pg->pg_whoami.shard) {
8520 pg->blocked_by.insert(p->shard);
8521 }
8522 }
8523 pg->publish_stats_to_osd();
8524 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8525 }
8526
8527 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
8528 {
8529 PG *pg = context< RecoveryMachine >().pg;
8530 if (pg->should_restart_peering(
8531 advmap.up_primary,
8532 advmap.acting_primary,
8533 advmap.newup,
8534 advmap.newacting,
8535 advmap.lastmap,
8536 advmap.osdmap)) {
8537 ldout(pg->cct, 10) << "Active advmap interval change, fast return" << dendl;
8538 return forward_event();
8539 }
8540 ldout(pg->cct, 10) << "Active advmap" << dendl;
8541 bool need_publish = false;
8542
8543 if (advmap.osdmap->require_osd_release >= CEPH_RELEASE_MIMIC) {
8544 const auto& new_removed_snaps = advmap.osdmap->get_new_removed_snaps();
8545 auto i = new_removed_snaps.find(pg->info.pgid.pool());
8546 if (i != new_removed_snaps.end()) {
8547 bool bad = false;
8548 for (auto j : i->second) {
8549 if (pg->snap_trimq.intersects(j.first, j.second)) {
8550 decltype(pg->snap_trimq) added, overlap;
8551 added.insert(j.first, j.second);
8552 overlap.intersection_of(pg->snap_trimq, added);
8553 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8554 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8555 << overlap << ", but this is the first mimic+ osdmap,"
8556 << " so it's expected" << dendl;
8557 } else {
8558 lderr(pg->cct) << __func__ << " removed_snaps already contains "
8559 << overlap << dendl;
8560 bad = true;
8561 }
8562 pg->snap_trimq.union_of(added);
8563 } else {
8564 pg->snap_trimq.insert(j.first, j.second);
8565 }
8566 }
8567 if (pg->last_require_osd_release < CEPH_RELEASE_MIMIC) {
8568 // at upgrade, we report *all* previously removed snaps as removed in
8569 // the first mimic epoch. remove the ones we previously divined were
8570 // removed (and subsequently purged) from the trimq.
8571 lderr(pg->cct) << __func__ << " first mimic map, filtering purged_snaps"
8572 << " from new removed_snaps" << dendl;
8573 pg->snap_trimq.subtract(pg->info.purged_snaps);
8574 }
8575 ldout(pg->cct,10) << __func__ << " new removed_snaps " << i->second
8576 << ", snap_trimq now " << pg->snap_trimq << dendl;
8577 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8578 pg->dirty_info = true;
8579 pg->dirty_big_info = true;
8580 }
8581
8582 const auto& new_purged_snaps = advmap.osdmap->get_new_purged_snaps();
8583 auto j = new_purged_snaps.find(pg->info.pgid.pool());
8584 if (j != new_purged_snaps.end()) {
8585 bool bad = false;
8586 for (auto k : j->second) {
8587 if (!pg->info.purged_snaps.contains(k.first, k.second)) {
8588 decltype(pg->info.purged_snaps) rm, overlap;
8589 rm.insert(k.first, k.second);
8590 overlap.intersection_of(pg->info.purged_snaps, rm);
8591 lderr(pg->cct) << __func__ << " purged_snaps does not contain "
8592 << rm << ", only " << overlap << dendl;
8593 pg->info.purged_snaps.subtract(overlap);
8594 // This can currently happen in the normal (if unlikely) course of
8595 // events. Because adding snaps to purged_snaps does not increase
8596 // the pg version or add a pg log entry, we don't reliably propagate
8597 // purged_snaps additions to other OSDs.
8598 // One example:
8599 // - purge S
8600 // - primary and replicas update purged_snaps
8601 // - no object updates
8602 // - pg mapping changes, new primary on different node
8603 // - new primary pg version == eversion_t(), so info is not
8604 // propagated.
8605 //bad = true;
8606 } else {
8607 pg->info.purged_snaps.erase(k.first, k.second);
8608 }
8609 }
8610 ldout(pg->cct,10) << __func__ << " new purged_snaps " << j->second
8611 << ", now " << pg->info.purged_snaps << dendl;
8612 ceph_assert(!bad || !pg->cct->_conf->osd_debug_verify_cached_snaps);
8613 pg->dirty_info = true;
8614 pg->dirty_big_info = true;
8615 }
8616 if (pg->dirty_big_info) {
8617 // share updated purged_snaps to mgr/mon so that we (a) stop reporting
8618 // purged snaps and (b) perhaps share more snaps that we have purged
8619 // but didn't fit in pg_stat_t.
8620 need_publish = true;
8621 pg->share_pg_info();
8622 }
8623 } else if (!pg->pool.newly_removed_snaps.empty()) {
8624 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
8625 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
8626 pg->dirty_info = true;
8627 pg->dirty_big_info = true;
8628 }
8629
8630 for (size_t i = 0; i < pg->want_acting.size(); i++) {
8631 int osd = pg->want_acting[i];
8632 if (!advmap.osdmap->is_up(osd)) {
8633 pg_shard_t osd_with_shard(osd, shard_id_t(i));
8634 ceph_assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
8635 }
8636 }
8637
8638 /* Check for changes in pool size (if the acting set changed as a result,
8639 * this does not matter) */
8640 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
8641 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
8642 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
8643 pg->state_clear(PG_STATE_UNDERSIZED);
8644 } else {
8645 pg->state_set(PG_STATE_UNDERSIZED);
8646 }
8647 // degraded changes will be detected by the call to publish_stats_to_osd()
8648 need_publish = true;
8649 }
8650
8651 // if we haven't reported our PG stats in a long time, do so now.
8652 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
8653 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
8654 << " epochs" << dendl;
8655 need_publish = true;
8656 }
8657
8658 if (need_publish)
8659 pg->publish_stats_to_osd();
8660
8661 return forward_event();
8662 }
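// Illustrative sketch (made-up snap ids) of the interval_set operations the
// AdvMap handler above relies on -- insert/intersects/intersection_of/union_of/
// subtract all work on [start, start+len) ranges:
//
//   interval_set<snapid_t> trimq, incoming, overlap;
//   trimq.insert(snapid_t(1), snapid_t(4));      // snaps 1..4
//   incoming.insert(snapid_t(3), snapid_t(4));   // snaps 3..6 newly reported removed
//   if (trimq.intersects(snapid_t(3), snapid_t(4))) {
//     overlap.intersection_of(trimq, incoming);  // overlap == 3..4
//     trimq.union_of(incoming);                  // trimq  == 1..6
//   }
//   trimq.subtract(overlap);                     // same op used above to drop purged snaps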
8663
8664 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
8665 {
8666 PG *pg = context< RecoveryMachine >().pg;
8667 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
8668 ceph_assert(pg->is_primary());
8669
8670 if (pg->have_unfound()) {
8671 // object may have become unfound
8672 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8673 }
8674
8675 if (pg->cct->_conf->osd_check_for_log_corruption)
8676 pg->check_log_for_corruption(pg->osd->store);
8677
8678 uint64_t unfound = pg->missing_loc.num_unfound();
8679 if (unfound > 0 &&
8680 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
8681 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
8682 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
8683 << " objects unfound and apparently lost, would automatically "
8684 << "mark these objects lost but this feature is not yet implemented "
8685 << "(osd_auto_mark_unfound_lost)";
8686 } else
8687 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
8688 << unfound << " objects unfound and apparently lost";
8689 }
8690
8691 if (pg->is_active()) {
8692 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
8693 pg->kick_snap_trim();
8694 }
8695
8696 if (pg->is_peered() &&
8697 !pg->is_clean() &&
8698 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
8699 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
8700 pg->queue_recovery();
8701 }
8702 return forward_event();
8703 }
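// Condensed restatement of the recovery gating above: recovery is queued only
// when all of the following hold --
//   is_peered() && !is_clean()
//   && !NOBACKFILL
//   && (!NOREBALANCE || is_degraded())   // norebalance still allows degraded recovery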
8704
8705 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
8706 {
8707 PG *pg = context< RecoveryMachine >().pg;
8708 ceph_assert(pg->is_primary());
8709 if (pg->peer_info.count(notevt.from)) {
8710 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8711 << ", already have info from that osd, ignoring"
8712 << dendl;
8713 } else if (pg->peer_purged.count(notevt.from)) {
8714 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8715 << ", already purged that peer, ignoring"
8716 << dendl;
8717 } else {
8718 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
8719 << ", calling proc_replica_info and discover_all_missing"
8720 << dendl;
8721 pg->proc_replica_info(
8722 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
8723 if (pg->have_unfound() || (pg->is_degraded() && pg->might_have_unfound.count(notevt.from))) {
8724 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
8725 }
8726 }
8727 return discard_event();
8728 }
8729
8730 boost::statechart::result PG::RecoveryState::Active::react(const MTrim& trim)
8731 {
8732 PG *pg = context< RecoveryMachine >().pg;
8733 ceph_assert(pg->is_primary());
8734
8735 // peer is informing us of their last_complete_ondisk
8736 ldout(pg->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
8737 pg->peer_last_complete_ondisk[pg_shard_t(trim.from, trim.shard)] = trim.trim_to;
8738
8739 // trim log when the pg is recovered
8740 pg->calc_min_last_complete_ondisk();
8741 return discard_event();
8742 }
8743
8744 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
8745 {
8746 PG *pg = context< RecoveryMachine >().pg;
8747 ceph_assert(pg->is_primary());
8748
8749 ceph_assert(!pg->acting_recovery_backfill.empty());
8750 // don't update history (yet) if we are active and primary; the replica
8751 // may be telling us they have activated (and committed) but we can't
8752 // share that until _everyone_ does the same.
8753 if (pg->is_acting_recovery_backfill(infoevt.from) &&
8754 pg->peer_activated.count(infoevt.from) == 0) {
8755 ldout(pg->cct, 10) << " peer osd." << infoevt.from
8756 << " activated and committed" << dendl;
8757 pg->peer_activated.insert(infoevt.from);
8758 pg->blocked_by.erase(infoevt.from.shard);
8759 pg->publish_stats_to_osd();
8760 if (pg->peer_activated.size() == pg->acting_recovery_backfill.size()) {
8761 pg->all_activated_and_committed();
8762 }
8763 }
8764 return discard_event();
8765 }
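// Sketch of the activation handshake tracked above (names as used in this file;
// timing details are simplified): each shard in acting_recovery_backfill sends an
// MInfoRec once it has activated and committed, the primary adds itself when its
// own activation transaction commits, and only when
//   peer_activated.size() == acting_recovery_backfill.size()
// does all_activated_and_committed() fire, which posts AllReplicasActivated.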
8766
8767 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
8768 {
8769 PG *pg = context< RecoveryMachine >().pg;
8770 ldout(pg->cct, 10) << "searching osd." << logevt.from
8771 << " log for unfound items" << dendl;
8772 pg->proc_replica_log(
8773 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8774 bool got_missing = pg->search_for_missing(
8775 pg->peer_info[logevt.from],
8776 pg->peer_missing[logevt.from],
8777 logevt.from,
8778 context< RecoveryMachine >().get_recovery_ctx());
8779 // If there are missing objects AND we are "fully" active then start recovery now
8780 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
8781 post_event(DoRecovery());
8782 }
8783 return discard_event();
8784 }
8785
8786 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
8787 {
8788 PG *pg = context< RecoveryMachine >().pg;
8789
8790 q.f->open_object_section("state");
8791 q.f->dump_string("name", state_name);
8792 q.f->dump_stream("enter_time") << enter_time;
8793
8794 {
8795 q.f->open_array_section("might_have_unfound");
8796 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
8797 p != pg->might_have_unfound.end();
8798 ++p) {
8799 q.f->open_object_section("osd");
8800 q.f->dump_stream("osd") << *p;
8801 if (pg->peer_missing.count(*p)) {
8802 q.f->dump_string("status", "already probed");
8803 } else if (pg->peer_missing_requested.count(*p)) {
8804 q.f->dump_string("status", "querying");
8805 } else if (!pg->get_osdmap()->is_up(p->osd)) {
8806 q.f->dump_string("status", "osd is down");
8807 } else {
8808 q.f->dump_string("status", "not queried");
8809 }
8810 q.f->close_section();
8811 }
8812 q.f->close_section();
8813 }
8814 {
8815 q.f->open_object_section("recovery_progress");
8816 pg->dump_recovery_info(q.f);
8817 q.f->close_section();
8818 }
8819
8820 {
8821 q.f->open_object_section("scrub");
8822 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
8823 q.f->dump_bool("scrubber.active", pg->scrubber.active);
8824 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
8825 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
8826 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
8827 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
8828 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
8829 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
8830 {
8831 q.f->open_array_section("scrubber.waiting_on_whom");
8832 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
8833 p != pg->scrubber.waiting_on_whom.end();
8834 ++p) {
8835 q.f->dump_stream("shard") << *p;
8836 }
8837 q.f->close_section();
8838 }
8839 q.f->close_section();
8840 }
8841
8842 q.f->close_section();
8843 return forward_event();
8844 }
8845
8846 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
8847 {
8848 PG *pg = context< RecoveryMachine >().pg;
8849 pg_t pgid = pg->info.pgid.pgid;
8850
8851 all_replicas_activated = true;
8852
8853 pg->state_clear(PG_STATE_ACTIVATING);
8854 pg->state_clear(PG_STATE_CREATING);
8855 pg->state_clear(PG_STATE_PREMERGE);
8856
8857 bool merge_target;
8858 if (pg->pool.info.is_pending_merge(pgid, &merge_target)) {
8859 pg->state_set(PG_STATE_PEERED);
8860 pg->state_set(PG_STATE_PREMERGE);
8861
8862 if (pg->actingset.size() != pg->get_osdmap()->get_pg_size(pgid)) {
8863 if (merge_target) {
8864 pg_t src = pgid;
8865 src.set_ps(pg->pool.info.get_pg_num_pending());
8866 assert(src.get_parent() == pgid);
8867 pg->osd->set_not_ready_to_merge_target(pgid, src);
8868 } else {
8869 pg->osd->set_not_ready_to_merge_source(pgid);
8870 }
8871 }
8872 } else if (pg->acting.size() < pg->pool.info.min_size) {
8873 pg->state_set(PG_STATE_PEERED);
8874 } else {
8875 pg->state_set(PG_STATE_ACTIVE);
8876 }
8877
8878 if (pg->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
8879 pg->osd->send_pg_created(pgid);
8880 }
8881
8882 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
8883 pg->info.history.last_interval_started = pg->info.last_interval_started;
8884 pg->dirty_info = true;
8885
8886 pg->share_pg_info();
8887 pg->publish_stats_to_osd();
8888
8889 pg->check_local();
8890
8891 // waiters
8892 if (pg->flushes_in_progress == 0) {
8893 pg->requeue_ops(pg->waiting_for_peered);
8894 } else if (!pg->waiting_for_peered.empty()) {
8895 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
8896 << pg->waiting_for_peered.size()
8897 << " items to waiting_for_flush"
8898 << dendl;
8899 ceph_assert(pg->waiting_for_flush.empty());
8900 pg->waiting_for_flush.swap(pg->waiting_for_peered);
8901 }
8902
8903 pg->on_activate();
8904
8905 return discard_event();
8906 }
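// Summary of the state chosen above once every replica has activated:
//   pool merge pending            -> PEERED + PREMERGE (and, if actingset does not
//                                    match the pool's pg size, tell the OSD we are
//                                    not ready to merge as source/target)
//   acting.size() < pool min_size -> PEERED only
//   otherwise                     -> ACTIVE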
8907
8908 void PG::RecoveryState::Active::exit()
8909 {
8910 context< RecoveryMachine >().log_exit(state_name, enter_time);
8911 PG *pg = context< RecoveryMachine >().pg;
8912 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
8913
8914 pg->blocked_by.clear();
8915 pg->backfill_reserved = false;
8916 pg->backfill_reserving = false;
8917 pg->state_clear(PG_STATE_ACTIVATING);
8918 pg->state_clear(PG_STATE_DEGRADED);
8919 pg->state_clear(PG_STATE_UNDERSIZED);
8920 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
8921 pg->state_clear(PG_STATE_BACKFILL_WAIT);
8922 pg->state_clear(PG_STATE_RECOVERY_WAIT);
8923 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
8924 utime_t dur = ceph_clock_now() - enter_time;
8925 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
8926 pg->agent_stop();
8927 }
8928
8929 /*------ReplicaActive-----*/
8930 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
8931 : my_base(ctx),
8932 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
8933 {
8934 context< RecoveryMachine >().log_enter(state_name);
8935
8936 PG *pg = context< RecoveryMachine >().pg;
8937 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
8938 }
8939
8940
8941 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8942 const Activate& actevt) {
8943 PG *pg = context< RecoveryMachine >().pg;
8944 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
8945 map<int, map<spg_t, pg_query_t> > query_map;
8946 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
8947 actevt.activation_epoch,
8948 query_map, NULL, NULL);
8949 ldout(pg->cct, 10) << "Activate Finished" << dendl;
8950 return discard_event();
8951 }
8952
8953 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
8954 {
8955 PG *pg = context< RecoveryMachine >().pg;
8956 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
8957 infoevt.info);
8958 return discard_event();
8959 }
8960
8961 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
8962 {
8963 PG *pg = context< RecoveryMachine >().pg;
8964 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
8965 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
8966 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
8967 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
8968
8969 return discard_event();
8970 }
8971
8972 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MTrim& trim)
8973 {
8974 PG *pg = context< RecoveryMachine >().pg;
8975 // primary is instructing us to trim
8976 pg->pg_log.trim(trim.trim_to, pg->info);
8977 pg->dirty_info = true;
8978 return discard_event();
8979 }
8980
8981 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
8982 {
8983 PG *pg = context< RecoveryMachine >().pg;
8984 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
8985 context< RecoveryMachine >().send_notify(
8986 pg->get_primary(),
8987 pg_notify_t(
8988 pg->get_primary().shard, pg->pg_whoami.shard,
8989 pg->get_osdmap_epoch(),
8990 pg->get_osdmap_epoch(),
8991 pg->info),
8992 pg->past_intervals);
8993 }
8994 pg->take_waiters();
8995 return discard_event();
8996 }
8997
8998 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
8999 const MQuery& query)
9000 {
9001 PG *pg = context< RecoveryMachine >().pg;
9002 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
9003 return discard_event();
9004 }
9005
9006 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
9007 {
9008 q.f->open_object_section("state");
9009 q.f->dump_string("name", state_name);
9010 q.f->dump_stream("enter_time") << enter_time;
9011 q.f->close_section();
9012 return forward_event();
9013 }
9014
9015 void PG::RecoveryState::ReplicaActive::exit()
9016 {
9017 context< RecoveryMachine >().log_exit(state_name, enter_time);
9018 PG *pg = context< RecoveryMachine >().pg;
9019 pg->clear_reserved_num_bytes();
9020 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
9021 utime_t dur = ceph_clock_now() - enter_time;
9022 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
9023 }
9024
9025 /*-------Stray---*/
9026 PG::RecoveryState::Stray::Stray(my_context ctx)
9027 : my_base(ctx),
9028 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
9029 {
9030 context< RecoveryMachine >().log_enter(state_name);
9031
9032 PG *pg = context< RecoveryMachine >().pg;
9033 ceph_assert(!pg->is_peered());
9034 ceph_assert(!pg->is_peering());
9035 ceph_assert(!pg->is_primary());
9036
9037 if (!pg->get_osdmap()->have_pg_pool(pg->get_pgid().pool())) {
9038 ldout(pg->cct,10) << __func__ << " pool is deleted" << dendl;
9039 post_event(DeleteStart());
9040 } else {
9041 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9042 }
9043 }
9044
9045 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
9046 {
9047 PG *pg = context< RecoveryMachine >().pg;
9048 MOSDPGLog *msg = logevt.msg.get();
9049 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
9050
9051 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9052 if (msg->info.last_backfill == hobject_t()) {
9053 // restart backfill
9054 pg->info = msg->info;
9055 pg->on_info_history_change();
9056 pg->dirty_info = true;
9057 pg->dirty_big_info = true; // maybe.
9058
9059 PGLogEntryHandler rollbacker{pg, t};
9060 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
9061
9062 pg->pg_log.reset_backfill();
9063 } else {
9064 pg->merge_log(*t, msg->info, msg->log, logevt.from);
9065 }
9066
9067 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9068
9069 post_event(Activate(logevt.msg->info.last_epoch_started));
9070 return transit<ReplicaActive>();
9071 }
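// Note on the two branches above: a last_backfill of hobject_t() ("nothing
// backfilled yet") means this replica is being (re)backfilled from scratch, so it
// claims the primary's info and log wholesale instead of merging; otherwise the
// incoming log is merged and the log head must match info.last_update.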
9072
9073 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
9074 {
9075 PG *pg = context< RecoveryMachine >().pg;
9076 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
9077
9078 if (pg->info.last_update > infoevt.info.last_update) {
9079 // rewind divergent log entries
9080 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9081 pg->rewind_divergent_log(*t, infoevt.info.last_update);
9082 pg->info.stats = infoevt.info.stats;
9083 pg->info.hit_set = infoevt.info.hit_set;
9084 }
9085
9086 ceph_assert(infoevt.info.last_update == pg->info.last_update);
9087 ceph_assert(pg->pg_log.get_head() == pg->info.last_update);
9088
9089 post_event(Activate(infoevt.info.last_epoch_started));
9090 return transit<ReplicaActive>();
9091 }
9092
9093 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
9094 {
9095 PG *pg = context< RecoveryMachine >().pg;
9096 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
9097 return discard_event();
9098 }
9099
9100 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
9101 {
9102 PG *pg = context< RecoveryMachine >().pg;
9103 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
9104 context< RecoveryMachine >().send_notify(
9105 pg->get_primary(),
9106 pg_notify_t(
9107 pg->get_primary().shard, pg->pg_whoami.shard,
9108 pg->get_osdmap_epoch(),
9109 pg->get_osdmap_epoch(),
9110 pg->info),
9111 pg->past_intervals);
9112 }
9113 pg->take_waiters();
9114 return discard_event();
9115 }
9116
9117 void PG::RecoveryState::Stray::exit()
9118 {
9119 context< RecoveryMachine >().log_exit(state_name, enter_time);
9120 PG *pg = context< RecoveryMachine >().pg;
9121 utime_t dur = ceph_clock_now() - enter_time;
9122 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
9123 }
9124
9125
9126 /*--------ToDelete----------*/
9127 PG::RecoveryState::ToDelete::ToDelete(my_context ctx)
9128 : my_base(ctx),
9129 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete")
9130 {
9131 context< RecoveryMachine >().log_enter(state_name);
9132 PG *pg = context< RecoveryMachine >().pg;
9133 pg->osd->logger->inc(l_osd_pg_removing);
9134 }
9135
9136 void PG::RecoveryState::ToDelete::exit()
9137 {
9138 context< RecoveryMachine >().log_exit(state_name, enter_time);
9139 PG *pg = context< RecoveryMachine >().pg;
9140 // note: on a successful removal, this path doesn't execute. see
9141 // _delete_some().
9142 pg->osd->logger->dec(l_osd_pg_removing);
9143 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9144 }
9145
9146 /*----WaitDeleteReserved----*/
9147 PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
9148 : my_base(ctx),
9149 NamedState(context< RecoveryMachine >().pg,
9150 "Started/ToDelete/WaitDeleteReseved")
9151 {
9152 context< RecoveryMachine >().log_enter(state_name);
9153 PG *pg = context< RecoveryMachine >().pg;
9154 context<ToDelete>().priority = pg->get_delete_priority();
9155 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9156 pg->osd->local_reserver.request_reservation(
9157 pg->info.pgid,
9158 new QueuePeeringEvt<DeleteReserved>(
9159 pg, pg->get_osdmap_epoch(),
9160 DeleteReserved()),
9161 context<ToDelete>().priority,
9162 new QueuePeeringEvt<DeleteInterrupted>(
9163 pg, pg->get_osdmap_epoch(),
9164 DeleteInterrupted()));
9165 }
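// The reservation pattern above, written out as a sketch (the grant/preempt
// callbacks are the QueuePeeringEvt wrappers constructed above):
//
//   local_reserver.cancel_reservation(pgid);   // drop any stale request first
//   local_reserver.request_reservation(
//     pgid,
//     on_grant,     // queues DeleteReserved for this pg at the current epoch
//     priority,     // get_delete_priority(); re-requested if it changes (see ActMap)
//     on_preempt);  // queues DeleteInterrupted if the slot is preempted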
9166
9167 boost::statechart::result PG::RecoveryState::ToDelete::react(
9168 const ActMap& evt)
9169 {
9170 PG *pg = context< RecoveryMachine >().pg;
9171 if (pg->get_delete_priority() != priority) {
9172 ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
9173 << dendl;
9174 return transit<ToDelete>();
9175 }
9176 return discard_event();
9177 }
9178
9179 void PG::RecoveryState::WaitDeleteReserved::exit()
9180 {
9181 context< RecoveryMachine >().log_exit(state_name, enter_time);
9182 }
9183
9184 /*----Deleting-----*/
9185 PG::RecoveryState::Deleting::Deleting(my_context ctx)
9186 : my_base(ctx),
9187 NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting")
9188 {
9189 context< RecoveryMachine >().log_enter(state_name);
9190 PG *pg = context< RecoveryMachine >().pg;
9191 pg->deleting = true;
9192 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
9193 pg->on_removal(t);
9194 t->register_on_commit(new C_DeleteMore(pg, pg->get_osdmap_epoch()));
9195 }
9196
9197 boost::statechart::result PG::RecoveryState::Deleting::react(
9198 const DeleteSome& evt)
9199 {
9200 PG *pg = context< RecoveryMachine >().pg;
9201 pg->_delete_some(context<RecoveryMachine>().get_cur_transaction());
9202 return discard_event();
9203 }
9204
9205 void PG::RecoveryState::Deleting::exit()
9206 {
9207 context< RecoveryMachine >().log_exit(state_name, enter_time);
9208 PG *pg = context< RecoveryMachine >().pg;
9209 pg->deleting = false;
9210 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
9211 }
9212
9213 /*--------GetInfo---------*/
9214 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
9215 : my_base(ctx),
9216 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
9217 {
9218 context< RecoveryMachine >().log_enter(state_name);
9219
9220 PG *pg = context< RecoveryMachine >().pg;
9221 pg->check_past_interval_bounds();
9222 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9223
9224 ceph_assert(pg->blocked_by.empty());
9225
9226 prior_set = pg->build_prior();
9227
9228 pg->reset_min_peer_features();
9229 get_infos();
9230 if (prior_set.pg_down) {
9231 post_event(IsDown());
9232 } else if (peer_info_requested.empty()) {
9233 post_event(GotInfo());
9234 }
9235 }
9236
9237 void PG::RecoveryState::GetInfo::get_infos()
9238 {
9239 PG *pg = context< RecoveryMachine >().pg;
9240 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9241
9242 pg->blocked_by.clear();
9243 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
9244 it != prior_set.probe.end();
9245 ++it) {
9246 pg_shard_t peer = *it;
9247 if (peer == pg->pg_whoami) {
9248 continue;
9249 }
9250 if (pg->peer_info.count(peer)) {
9251 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
9252 continue;
9253 }
9254 if (peer_info_requested.count(peer)) {
9255 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
9256 pg->blocked_by.insert(peer.osd);
9257 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
9258 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
9259 } else {
9260 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
9261 context< RecoveryMachine >().send_query(
9262 peer, pg_query_t(pg_query_t::INFO,
9263 it->shard, pg->pg_whoami.shard,
9264 pg->info.history,
9265 pg->get_osdmap_epoch()));
9266 peer_info_requested.insert(peer);
9267 pg->blocked_by.insert(peer.osd);
9268 }
9269 }
9270
9271 pg->publish_stats_to_osd();
9272 }
9273
9274 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
9275 {
9276 PG *pg = context< RecoveryMachine >().pg;
9277
9278 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
9279 if (p != peer_info_requested.end()) {
9280 peer_info_requested.erase(p);
9281 pg->blocked_by.erase(infoevt.from.osd);
9282 }
9283
9284 epoch_t old_start = pg->info.history.last_epoch_started;
9285 if (pg->proc_replica_info(
9286 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
9287 // we got something new ...
9288 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9289 if (old_start < pg->info.history.last_epoch_started) {
9290 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
9291 prior_set = pg->build_prior();
9292
9293 // filter out of peer_info_requested any osds that were dropped from
9294 // the probe set. this is less expensive than restarting peering
9295 // (which would re-probe everyone).
9296 set<pg_shard_t>::iterator p = peer_info_requested.begin();
9297 while (p != peer_info_requested.end()) {
9298 if (prior_set.probe.count(*p) == 0) {
9299 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
9300 peer_info_requested.erase(p++);
9301 } else {
9302 ++p;
9303 }
9304 }
9305 get_infos();
9306 }
9307 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
9308 << hex << infoevt.features << dec << dendl;
9309 pg->apply_peer_features(infoevt.features);
9310
9311 // are we done getting everything?
9312 if (peer_info_requested.empty() && !prior_set.pg_down) {
9313 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
9314 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
9315 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
9316 post_event(GotInfo());
9317 }
9318 }
9319 return discard_event();
9320 }
9321
9322 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
9323 {
9324 PG *pg = context< RecoveryMachine >().pg;
9325 q.f->open_object_section("state");
9326 q.f->dump_string("name", state_name);
9327 q.f->dump_stream("enter_time") << enter_time;
9328
9329 q.f->open_array_section("requested_info_from");
9330 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
9331 p != peer_info_requested.end();
9332 ++p) {
9333 q.f->open_object_section("osd");
9334 q.f->dump_stream("osd") << *p;
9335 if (pg->peer_info.count(*p)) {
9336 q.f->open_object_section("got_info");
9337 pg->peer_info[*p].dump(q.f);
9338 q.f->close_section();
9339 }
9340 q.f->close_section();
9341 }
9342 q.f->close_section();
9343
9344 q.f->close_section();
9345 return forward_event();
9346 }
9347
9348 void PG::RecoveryState::GetInfo::exit()
9349 {
9350 context< RecoveryMachine >().log_exit(state_name, enter_time);
9351 PG *pg = context< RecoveryMachine >().pg;
9352 utime_t dur = ceph_clock_now() - enter_time;
9353 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
9354 pg->blocked_by.clear();
9355 }
9356
9357 /*------GetLog------------*/
9358 PG::RecoveryState::GetLog::GetLog(my_context ctx)
9359 : my_base(ctx),
9360 NamedState(
9361 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
9362 msg(0)
9363 {
9364 context< RecoveryMachine >().log_enter(state_name);
9365
9366 PG *pg = context< RecoveryMachine >().pg;
9367
9368 // adjust acting?
9369 if (!pg->choose_acting(auth_log_shard, false,
9370 &context< Peering >().history_les_bound)) {
9371 if (!pg->want_acting.empty()) {
9372 post_event(NeedActingChange());
9373 } else {
9374 post_event(IsIncomplete());
9375 }
9376 return;
9377 }
9378
9379 // am i the best?
9380 if (auth_log_shard == pg->pg_whoami) {
9381 post_event(GotLog());
9382 return;
9383 }
9384
9385 const pg_info_t& best = pg->peer_info[auth_log_shard];
9386
9387 // am i broken?
9388 if (pg->info.last_update < best.log_tail) {
9389 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
9390 post_event(IsIncomplete());
9391 return;
9392 }
9393
9394 // how much log to request?
9395 eversion_t request_log_from = pg->info.last_update;
9396 ceph_assert(!pg->acting_recovery_backfill.empty());
9397 for (set<pg_shard_t>::iterator p = pg->acting_recovery_backfill.begin();
9398 p != pg->acting_recovery_backfill.end();
9399 ++p) {
9400 if (*p == pg->pg_whoami) continue;
9401 pg_info_t& ri = pg->peer_info[*p];
9402 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
9403 ri.last_update < request_log_from)
9404 request_log_from = ri.last_update;
9405 }
9406
9407 // request it
9408 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
9409 context<RecoveryMachine>().send_query(
9410 auth_log_shard,
9411 pg_query_t(
9412 pg_query_t::LOG,
9413 auth_log_shard.shard, pg->pg_whoami.shard,
9414 request_log_from, pg->info.history,
9415 pg->get_osdmap_epoch()));
9416
9417 ceph_assert(pg->blocked_by.empty());
9418 pg->blocked_by.insert(auth_log_shard.osd);
9419 pg->publish_stats_to_osd();
9420 }
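// Worked example (invented eversions) of the request_log_from computation above.
// We ask the authoritative shard for log far enough back to also cover any peer
// whose last_update falls below our own log tail but is still within the auth
// shard's log:
//
//   best.log_tail        = 10'5
//   our log_tail         = 10'20, our last_update = 10'40  (initial request point)
//   peer A last_update   = 10'12  -> < our tail, >= best tail: request from 10'12
//   peer B last_update   = 10'2   -> below best.log_tail, ignored here
//   => pg_query_t::LOG asking auth_log_shard for entries since 10'12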
9421
9422 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
9423 {
9424 PG *pg = context< RecoveryMachine >().pg;
9425 // make sure our log source didn't go down. we need to check
9426 // explicitly because it may not be part of the prior set, which
9427 // means the Peering state check won't catch it going down.
9428 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
9429 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
9430 << auth_log_shard.osd << " went down" << dendl;
9431 post_event(advmap);
9432 return transit< Reset >();
9433 }
9434
9435 // let the Peering state do its checks.
9436 return forward_event();
9437 }
9438
9439 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
9440 {
9441 PG *pg = context< RecoveryMachine >().pg;
9442 ceph_assert(!msg);
9443 if (logevt.from != auth_log_shard) {
9444 ldout(pg->cct, 10) << "GetLog: discarding log from "
9445 << "non-auth_log_shard osd." << logevt.from << dendl;
9446 return discard_event();
9447 }
9448 ldout(pg->cct, 10) << "GetLog: received master log from osd"
9449 << logevt.from << dendl;
9450 msg = logevt.msg;
9451 post_event(GotLog());
9452 return discard_event();
9453 }
9454
9455 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
9456 {
9457 PG *pg = context< RecoveryMachine >().pg;
9458 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
9459 if (msg) {
9460 ldout(pg->cct, 10) << "processing master log" << dendl;
9461 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
9462 msg->info, msg->log, msg->missing,
9463 auth_log_shard);
9464 }
9465 pg->start_flush(context< RecoveryMachine >().get_cur_transaction());
9466 return transit< GetMissing >();
9467 }
9468
9469 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
9470 {
9471 q.f->open_object_section("state");
9472 q.f->dump_string("name", state_name);
9473 q.f->dump_stream("enter_time") << enter_time;
9474 q.f->dump_stream("auth_log_shard") << auth_log_shard;
9475 q.f->close_section();
9476 return forward_event();
9477 }
9478
9479 void PG::RecoveryState::GetLog::exit()
9480 {
9481 context< RecoveryMachine >().log_exit(state_name, enter_time);
9482 PG *pg = context< RecoveryMachine >().pg;
9483 utime_t dur = ceph_clock_now() - enter_time;
9484 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
9485 pg->blocked_by.clear();
9486 }
9487
9488 /*------WaitActingChange--------*/
9489 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
9490 : my_base(ctx),
9491 NamedState(context< RecoveryMachine >().pg, "Started/Primary/WaitActingChange")
9492 {
9493 context< RecoveryMachine >().log_enter(state_name);
9494 }
9495
9496 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
9497 {
9498 PG *pg = context< RecoveryMachine >().pg;
9499 OSDMapRef osdmap = advmap.osdmap;
9500
9501 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
9502 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
9503 if (!osdmap->is_up(*p)) {
9504 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
9505 post_event(advmap);
9506 return transit< Reset >();
9507 }
9508 }
9509 return forward_event();
9510 }
9511
9512 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
9513 {
9514 PG *pg = context< RecoveryMachine >().pg;
9515 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
9516 return discard_event();
9517 }
9518
9519 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
9520 {
9521 PG *pg = context< RecoveryMachine >().pg;
9522 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
9523 return discard_event();
9524 }
9525
9526 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
9527 {
9528 PG *pg = context< RecoveryMachine >().pg;
9529 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
9530 return discard_event();
9531 }
9532
9533 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
9534 {
9535 q.f->open_object_section("state");
9536 q.f->dump_string("name", state_name);
9537 q.f->dump_stream("enter_time") << enter_time;
9538 q.f->dump_string("comment", "waiting for pg acting set to change");
9539 q.f->close_section();
9540 return forward_event();
9541 }
9542
9543 void PG::RecoveryState::WaitActingChange::exit()
9544 {
9545 context< RecoveryMachine >().log_exit(state_name, enter_time);
9546 PG *pg = context< RecoveryMachine >().pg;
9547 utime_t dur = ceph_clock_now() - enter_time;
9548 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
9549 }
9550
9551 /*------Down--------*/
9552 PG::RecoveryState::Down::Down(my_context ctx)
9553 : my_base(ctx),
9554 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
9555 {
9556 context< RecoveryMachine >().log_enter(state_name);
9557 PG *pg = context< RecoveryMachine >().pg;
9558
9559 pg->state_clear(PG_STATE_PEERING);
9560 pg->state_set(PG_STATE_DOWN);
9561
9562 auto &prior_set = context< Peering >().prior_set;
9563 ceph_assert(pg->blocked_by.empty());
9564 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9565 pg->publish_stats_to_osd();
9566 }
9567
9568 void PG::RecoveryState::Down::exit()
9569 {
9570 context< RecoveryMachine >().log_exit(state_name, enter_time);
9571 PG *pg = context< RecoveryMachine >().pg;
9572
9573 pg->state_clear(PG_STATE_DOWN);
9574 utime_t dur = ceph_clock_now() - enter_time;
9575 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
9576
9577 pg->blocked_by.clear();
9578 }
9579
9580 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
9581 {
9582 q.f->open_object_section("state");
9583 q.f->dump_string("name", state_name);
9584 q.f->dump_stream("enter_time") << enter_time;
9585 q.f->dump_string("comment",
9586 "not enough up instances of this PG to go active");
9587 q.f->close_section();
9588 return forward_event();
9589 }
9590
9591 boost::statechart::result PG::RecoveryState::Down::react(const MNotifyRec& infoevt)
9592 {
9593 PG *pg = context< RecoveryMachine >().pg;
9594
9595 ceph_assert(pg->is_primary());
9596 epoch_t old_start = pg->info.history.last_epoch_started;
9597 if (!pg->peer_info.count(infoevt.from) &&
9598 pg->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
9599 pg->update_history(infoevt.notify.info.history);
9600 }
9601 // if we got something new that lets the pg escape the down state
9602 if (pg->info.history.last_epoch_started > old_start) {
9603 ldout(pg->cct, 10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
9604 pg->state_clear(PG_STATE_DOWN);
9605 pg->state_set(PG_STATE_PEERING);
9606 return transit< GetInfo >();
9607 }
9608
9609 return discard_event();
9610 }
9611
9612
9613 /*------Incomplete--------*/
9614 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
9615 : my_base(ctx),
9616 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
9617 {
9618 context< RecoveryMachine >().log_enter(state_name);
9619 PG *pg = context< RecoveryMachine >().pg;
9620
9621 pg->state_clear(PG_STATE_PEERING);
9622 pg->state_set(PG_STATE_INCOMPLETE);
9623
9624 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
9625 ceph_assert(pg->blocked_by.empty());
9626 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
9627 pg->publish_stats_to_osd();
9628 }
9629
9630 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
9631 PG *pg = context< RecoveryMachine >().pg;
9632 int64_t poolnum = pg->info.pgid.pool();
9633
9634 // Reset if min_size became smaller than the previous value; the pg might now be able to go active
9635 if (!advmap.osdmap->have_pg_pool(poolnum) ||
9636 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
9637 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
9638 post_event(advmap);
9639 return transit< Reset >();
9640 }
9641
9642 return forward_event();
9643 }
9644
9645 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
9646 PG *pg = context< RecoveryMachine >().pg;
9647 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
9648 if (pg->proc_replica_info(
9649 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
9650 // We got something new, try again!
9651 return transit< GetLog >();
9652 } else {
9653 return discard_event();
9654 }
9655 }
9656
9657 boost::statechart::result PG::RecoveryState::Incomplete::react(
9658 const QueryState& q)
9659 {
9660 q.f->open_object_section("state");
9661 q.f->dump_string("name", state_name);
9662 q.f->dump_stream("enter_time") << enter_time;
9663 q.f->dump_string("comment", "not enough complete instances of this PG");
9664 q.f->close_section();
9665 return forward_event();
9666 }
9667
9668 void PG::RecoveryState::Incomplete::exit()
9669 {
9670 context< RecoveryMachine >().log_exit(state_name, enter_time);
9671 PG *pg = context< RecoveryMachine >().pg;
9672
9673 pg->state_clear(PG_STATE_INCOMPLETE);
9674 utime_t dur = ceph_clock_now() - enter_time;
9675 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
9676
9677 pg->blocked_by.clear();
9678 }
9679
9680 /*------GetMissing--------*/
9681 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
9682 : my_base(ctx),
9683 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
9684 {
9685 context< RecoveryMachine >().log_enter(state_name);
9686
9687 PG *pg = context< RecoveryMachine >().pg;
9688 ceph_assert(!pg->acting_recovery_backfill.empty());
9689 eversion_t since;
9690 for (set<pg_shard_t>::iterator i = pg->acting_recovery_backfill.begin();
9691 i != pg->acting_recovery_backfill.end();
9692 ++i) {
9693 if (*i == pg->get_primary()) continue;
9694 const pg_info_t& pi = pg->peer_info[*i];
9695 // reset this to make sure the pg_missing_t is initialized and
9696 // has the correct semantics even if we don't need to get a
9697 // missing set from a shard. This way later additions due to
9698 // lost+unfound delete work properly.
9699 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
9700
9701 if (pi.is_empty())
9702 continue; // no pg data, nothing divergent
9703
9704 if (pi.last_update < pg->pg_log.get_tail()) {
9705 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
9706 pg->peer_missing[*i].clear();
9707 continue;
9708 }
9709 if (pi.last_backfill == hobject_t()) {
9710 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
9711 pg->peer_missing[*i].clear();
9712 continue;
9713 }
9714
9715 if (pi.last_update == pi.last_complete && // peer has no missing
9716 pi.last_update == pg->info.last_update) { // peer is up to date
9717 // replica has no missing and identical log as us. no need to
9718 // pull anything.
9719 // FIXME: we can do better here. if last_update==last_complete we
9720 // can infer the rest!
9721 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
9722 pg->peer_missing[*i].clear();
9723 continue;
9724 }
9725
9726 // We pull the log from the peer's last_epoch_started to ensure we
9727 // get enough log to detect divergent updates.
9728 since.epoch = pi.last_epoch_started;
9729 ceph_assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
9730 if (pi.log_tail <= since) {
9731 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
9732 context< RecoveryMachine >().send_query(
9733 *i,
9734 pg_query_t(
9735 pg_query_t::LOG,
9736 i->shard, pg->pg_whoami.shard,
9737 since, pg->info.history,
9738 pg->get_osdmap_epoch()));
9739 } else {
9740 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
9741 << " (want since " << since << " < log.tail "
9742 << pi.log_tail << ")" << dendl;
9743 context< RecoveryMachine >().send_query(
9744 *i, pg_query_t(
9745 pg_query_t::FULLLOG,
9746 i->shard, pg->pg_whoami.shard,
9747 pg->info.history, pg->get_osdmap_epoch()));
9748 }
9749 peer_missing_requested.insert(*i);
9750 pg->blocked_by.insert(i->osd);
9751 }
9752
9753 if (peer_missing_requested.empty()) {
9754 if (pg->need_up_thru) {
9755 ldout(pg->cct, 10) << " still need up_thru update before going active"
9756 << dendl;
9757 post_event(NeedUpThru());
9758 return;
9759 }
9760
9761 // all good!
9762 post_event(Activate(pg->get_osdmap_epoch()));
9763 } else {
9764 pg->publish_stats_to_osd();
9765 }
9766 }
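// Condensed per-peer decision table for the loop above:
//   pi.is_empty()                            -> nothing to pull
//   pi.last_update < our log tail            -> not contiguous; backfill will restart
//   pi.last_backfill == hobject_t()          -> full backfill; empty missing set
//   last_update == last_complete
//     && last_update == our last_update      -> identical log, no missing
//   pi.log_tail <= eversion(les, 0)          -> pg_query_t::LOG since that point
//   otherwise                                -> pg_query_t::FULLLOG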
9767
9768 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
9769 {
9770 PG *pg = context< RecoveryMachine >().pg;
9771
9772 peer_missing_requested.erase(logevt.from);
9773 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
9774
9775 if (peer_missing_requested.empty()) {
9776 if (pg->need_up_thru) {
9777 ldout(pg->cct, 10) << " still need up_thru update before going active"
9778 << dendl;
9779 post_event(NeedUpThru());
9780 } else {
9781 ldout(pg->cct, 10) << "Got last missing, don't need missing "
9782 << "posting Activate" << dendl;
9783 post_event(Activate(pg->get_osdmap_epoch()));
9784 }
9785 }
9786 return discard_event();
9787 }
9788
9789 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
9790 {
9791 PG *pg = context< RecoveryMachine >().pg;
9792 q.f->open_object_section("state");
9793 q.f->dump_string("name", state_name);
9794 q.f->dump_stream("enter_time") << enter_time;
9795
9796 q.f->open_array_section("peer_missing_requested");
9797 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
9798 p != peer_missing_requested.end();
9799 ++p) {
9800 q.f->open_object_section("osd");
9801 q.f->dump_stream("osd") << *p;
9802 if (pg->peer_missing.count(*p)) {
9803 q.f->open_object_section("got_missing");
9804 pg->peer_missing[*p].dump(q.f);
9805 q.f->close_section();
9806 }
9807 q.f->close_section();
9808 }
9809 q.f->close_section();
9810
9811 q.f->close_section();
9812 return forward_event();
9813 }
9814
9815 void PG::RecoveryState::GetMissing::exit()
9816 {
9817 context< RecoveryMachine >().log_exit(state_name, enter_time);
9818 PG *pg = context< RecoveryMachine >().pg;
9819 utime_t dur = ceph_clock_now() - enter_time;
9820 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
9821 pg->blocked_by.clear();
9822 }
9823
9824 /*------WaitUpThru--------*/
9825 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
9826 : my_base(ctx),
9827 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
9828 {
9829 context< RecoveryMachine >().log_enter(state_name);
9830 }
9831
9832 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
9833 {
9834 PG *pg = context< RecoveryMachine >().pg;
9835 if (!pg->need_up_thru) {
9836 post_event(Activate(pg->get_osdmap_epoch()));
9837 }
9838 return forward_event();
9839 }
9840
9841 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
9842 {
9843 PG *pg = context< RecoveryMachine >().pg;
9844 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
9845 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
9846 pg->peer_info[logevt.from] = logevt.msg->info;
9847 return discard_event();
9848 }
9849
9850 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
9851 {
9852 q.f->open_object_section("state");
9853 q.f->dump_string("name", state_name);
9854 q.f->dump_stream("enter_time") << enter_time;
9855 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
9856 q.f->close_section();
9857 return forward_event();
9858 }
9859
9860 void PG::RecoveryState::WaitUpThru::exit()
9861 {
9862 context< RecoveryMachine >().log_exit(state_name, enter_time);
9863 PG *pg = context< RecoveryMachine >().pg;
9864 utime_t dur = ceph_clock_now() - enter_time;
9865 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
9866 }
9867
9868 /*----RecoveryState::RecoveryMachine Methods-----*/
9869 #undef dout_prefix
9870 #define dout_prefix pg->gen_prefix(*_dout)
9871
9872 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
9873 {
9874 PG *pg = context< RecoveryMachine >().pg;
9875 ldout(pg->cct, 5) << "enter " << state_name << dendl;
9876 pg->osd->pg_recovery_stats.log_enter(state_name);
9877 }
9878
9879 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
9880 {
9881 utime_t dur = ceph_clock_now() - enter_time;
9882 PG *pg = context< RecoveryMachine >().pg;
9883 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
9884 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
9885 event_count, event_time);
9886 event_count = 0;
9887 event_time = utime_t();
9888 }
9889
9890
9891 /*---------------------------------------------------*/
9892 #undef dout_prefix
9893 #define dout_prefix ((debug_pg ? debug_pg->gen_prefix(*_dout) : *_dout) << " PriorSet: ")
9894
9895 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
9896 ceph_assert(!rctx);
9897 ceph_assert(!orig_ctx);
9898 orig_ctx = new_ctx;
9899 if (new_ctx) {
9900 if (messages_pending_flush) {
9901 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
9902 } else {
9903 rctx = *new_ctx;
9904 }
9905 rctx->start_time = ceph_clock_now();
9906 }
9907 }
9908
9909 void PG::RecoveryState::begin_block_outgoing() {
9910 ceph_assert(!messages_pending_flush);
9911 ceph_assert(orig_ctx);
9912 ceph_assert(rctx);
9913 messages_pending_flush = BufferedRecoveryMessages();
9914 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
9915 }
9916
9917 void PG::RecoveryState::clear_blocked_outgoing() {
9918 ceph_assert(orig_ctx);
9919 ceph_assert(rctx);
9920 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9921 }
9922
9923 void PG::RecoveryState::end_block_outgoing() {
9924 ceph_assert(messages_pending_flush);
9925 ceph_assert(orig_ctx);
9926 ceph_assert(rctx);
9927
9928 rctx = RecoveryCtx(*orig_ctx);
9929 rctx->accept_buffered_messages(*messages_pending_flush);
9930 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
9931 }
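// Sketch of how the three helpers above fit together while the state machine
// must hold back outgoing messages (e.g. while a flush is pending):
//
//   begin_block_outgoing();  // swap in an rctx backed by BufferedRecoveryMessages
//   ... handle events; notifies/queries/infos accumulate in the buffer ...
//   end_block_outgoing();    // restore the original ctx and replay the buffer
//   // clear_blocked_outgoing() instead drops the buffer without replaying it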
9932
9933 void PG::RecoveryState::end_handle() {
9934 if (rctx) {
9935 utime_t dur = ceph_clock_now() - rctx->start_time;
9936 machine.event_time += dur;
9937 }
9938
9939 machine.event_count++;
9940 rctx = boost::optional<RecoveryCtx>();
9941 orig_ctx = NULL;
9942 }
9943
9944 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
9945 {
9946 out << "BackfillInfo(" << bi.begin << "-" << bi.end
9947 << " " << bi.objects.size() << " objects";
9948 if (!bi.objects.empty())
9949 out << " " << bi.objects;
9950 out << ")";
9951 return out;
9952 }
9953
9954 void PG::dump_pgstate_history(Formatter *f)
9955 {
9956 lock();
9957 pgstate_history.dump(f);
9958 unlock();
9959 }
9960
9961 void PG::dump_missing(Formatter *f)
9962 {
9963 for (auto& i : pg_log.get_missing().get_items()) {
9964 f->open_object_section("object");
9965 f->dump_object("oid", i.first);
9966 f->dump_object("missing_info", i.second);
9967 if (missing_loc.needs_recovery(i.first)) {
9968 f->dump_bool("unfound", missing_loc.is_unfound(i.first));
9969 f->open_array_section("locations");
9970 for (auto l : missing_loc.get_locations(i.first)) {
9971 f->dump_object("shard", l);
9972 }
9973 f->close_section();
9974 }
9975 f->close_section();
9976 }
9977 }
9978
9979 void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
9980 {
9981 pg_stats_publish_lock.Lock();
9982 if (pg_stats_publish_valid) {
9983 f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
9984 }
9985 pg_stats_publish_lock.Unlock();
9986 }
9987
9988 void PG::with_heartbeat_peers(std::function<void(int)> f)
9989 {
9990 heartbeat_peer_lock.Lock();
9991 for (auto p : heartbeat_peers) {
9992 f(p);
9993 }
9994 for (auto p : probe_targets) {
9995 f(p);
9996 }
9997 heartbeat_peer_lock.Unlock();
9998 }