1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
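// PGStateHistory records the state-machine transitions a PG goes through.
// enter() may be called without the PG lock held, so the transition is
// first staged in tmppi; exit() (taking the PG lock if needed) moves it
// into the per-PG buffer, stamps the epoch and exit time, and resets the
// history when the PG re-enters Reset.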
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if (tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if (!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if (ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
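// Manual reference counting for PG objects: the PG deletes itself when the
// last reference is dropped.  With PG_DEBUG_REFS defined, per-tag counts
// (and, via get_with_id()/put_with_id(), a backtrace per live reference)
// are kept to help track down reference leaks.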
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179 if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\tid: " << *i << dendl;
224 }
225 }
226 #endif
227
228
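// Refresh the cached pool metadata from a new OSDMap and compute
// newly_removed_snaps, the snapshots removed since the last map we
// processed.  If we skipped an epoch, or this map changed the pool's snaps,
// the removed-snaps set is rebuilt from the map and diffed against the
// cached copy; otherwise nothing changed and the delta is simply cleared.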
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 pi->build_removed_snaps(newly_removed_snaps);
241 interval_set<snapid_t> intersection;
242 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243 if (intersection == cached_removed_snaps) {
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.union_of(newly_removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps = newly_removed_snaps;
251 newly_removed_snaps.clear();
252 }
253 snapc = pi->get_snap_context();
254 } else {
255 /* 1) map->get_epoch() == cached_epoch + 1 &&
256 * 2) pi->get_snap_epoch() != map->get_epoch()
257 *
258 * From the if branch, 1 && 2 must be true. From 2, we know that
259 * this map didn't change the set of removed snaps. From 1, we
260 * know that our cached_removed_snaps matches the previous map.
261 * Thus, from 1 && 2, cached_removed snaps matches the current
262 * set of removed snaps and all we have to do is clear
263 * newly_removed_snaps.
264 */
265 newly_removed_snaps.clear();
266 }
267 cached_epoch = map->get_epoch();
268 lgeneric_subdout(cct, osd, 20)
269 << "PGPool::update cached_removed_snaps "
270 << cached_removed_snaps
271 << " newly_removed_snaps "
272 << newly_removed_snaps
273 << " snapc " << snapc
274 << (updated ? " (updated)":" (no change)")
275 << dendl;
276 }
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279 const PGPool &_pool, spg_t p) :
280 osd(o),
281 cct(o->cct),
282 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283 snap_mapper(
284 cct,
285 &osdriver,
286 p.ps(),
287 p.get_split_bits(curmap->get_pg_num(_pool.id)),
288 _pool.id,
289 p.shard),
290 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291 _lock("PG::_lock"),
292 #ifdef PG_DEBUG_REFS
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294 #endif
295 deleting(false),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
298 info(p),
299 info_struct_v(0),
300 coll(p),
301 pg_log(cct),
302 pgmeta_oid(p.make_pgmeta_oid()),
303 missing_loc(this),
304 past_intervals(
305 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306 *curmap),
307 stat_queue_item(this),
308 scrub_queued(false),
309 recovery_queued(false),
310 recovery_ops_active(0),
311 role(-1),
312 state(0),
313 send_notify(false),
314 pg_whoami(osd->whoami, p.shard),
315 need_up_thru(false),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324 finish_sync_event(NULL),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
327 active_pushes(0),
328 recovery_state(this),
329 pg_id(p),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336 osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339 std::stringstream ss;
340 ss << "PG " << info.pgid;
341 trace_endpoint.copy_name(ss.str());
342 #endif
343 osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348 pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350 osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356 handle.suspend_tp_timeout();
357 lock();
358 handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363 _lock.Lock(no_lockdep);
364 // if we have unrecorded dirty state with the lock dropped, there is a bug
365 assert(!dirty_info);
366 assert(!dirty_big_info);
367
368 dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373 stringstream out;
374 OSDMapRef mapref = osdmap_ref;
375 if (_lock.is_locked_by_me()) {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " " << *this << " ";
379 } else {
380 out << "osd." << osd->whoami
381 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382 << " pg[" << info.pgid << "(unlocked)] ";
383 }
384 return out.str();
385 }
386
387 /********* PG **********/
388
389 void PG::proc_master_log(
390 ObjectStore::Transaction& t, pg_info_t &oinfo,
391 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393 dout(10) << "proc_master_log for osd." << from << ": "
394 << olog << " " << omissing << dendl;
395 assert(!is_peered() && is_primary());
396
397 // merge log into our own log to build master log. no need to
398 // make any adjustments to their missing map; we are taking their
399 // log to be authoritative (i.e., their entries are definitively
400 // non-divergent).
401 merge_log(t, oinfo, olog, from);
402 peer_info[from] = oinfo;
403 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404 might_have_unfound.insert(from);
405
406 // See doc/dev/osd_internals/last_epoch_started
407 if (oinfo.last_epoch_started > info.last_epoch_started) {
408 info.last_epoch_started = oinfo.last_epoch_started;
409 dirty_info = true;
410 }
411 if (oinfo.last_interval_started > info.last_interval_started) {
412 info.last_interval_started = oinfo.last_interval_started;
413 dirty_info = true;
414 }
415 update_history(oinfo.history);
416 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417 info.last_epoch_started >= info.history.last_epoch_started);
418
419 peer_missing[from].claim(omissing);
420 }
421
422 void PG::proc_replica_log(
423 pg_info_t &oinfo,
424 const pg_log_t &olog,
425 pg_missing_t& omissing,
426 pg_shard_t from)
427 {
428 dout(10) << "proc_replica_log for osd." << from << ": "
429 << oinfo << " " << olog << " " << omissing << dendl;
430
431 pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433 peer_info[from] = oinfo;
434 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435 might_have_unfound.insert(from);
436
437 for (map<hobject_t, pg_missing_item>::const_iterator i =
438 omissing.get_items().begin();
439 i != omissing.get_items().end();
440 ++i) {
441 dout(20) << " after missing " << i->first << " need " << i->second.need
442 << " have " << i->second.have << dendl;
443 }
444 peer_missing[from].claim(omissing);
445 }
446
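// Record a peer's pg_info_t during peering.  Duplicate infos and infos from
// OSDs that have not stayed up since send_epoch are ignored; peers that are
// neither up nor acting are added to stray_set, and heartbeat peers are
// refreshed when the info is new.  Returns true if the info was accepted.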
447 bool PG::proc_replica_info(
448 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453 return false;
454 }
455
456 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457 dout(10) << " got info " << oinfo << " from down osd." << from
458 << " discarding" << dendl;
459 return false;
460 }
461
462 dout(10) << " got osd." << from << " " << oinfo << dendl;
463 assert(is_primary());
464 peer_info[from] = oinfo;
465 might_have_unfound.insert(from);
466
467 update_history(oinfo.history);
468
469 // stray?
470 if (!is_up(from) && !is_acting(from)) {
471 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472 stray_set.insert(from);
473 if (is_clean()) {
474 purge_strays();
475 }
476 }
477
478 // was this a new info? if so, update peers!
479 if (p == peer_info.end())
480 update_heartbeat_peers();
481
482 return true;
483 }
484
485 void PG::remove_snap_mapped_object(
486 ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488 t.remove(
489 coll,
490 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491 clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495 ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498 if (soid.snap < CEPH_MAXSNAP) {
499 int r = snap_mapper.remove_oid(
500 soid,
501 &_t);
502 if (!(r == 0 || r == -ENOENT)) {
503 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504 ceph_abort();
505 }
506 }
507 }
508
509 void PG::update_object_snap_mapping(
510 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513 assert(soid.snap < CEPH_MAXSNAP);
514 int r = snap_mapper.remove_oid(
515 soid,
516 &_t);
517 if (!(r == 0 || r == -ENOENT)) {
518 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519 ceph_abort();
520 }
521 snap_mapper.add_oid(
522 soid,
523 snaps,
524 &_t);
525 }
526
527 void PG::merge_log(
528 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530 PGLogEntryHandler rollbacker{this, &t};
531 pg_log.merge_log(
532 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537 PGLogEntryHandler rollbacker{this, &t};
538 pg_log.rewind_divergent_log(
539 newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543 * Process information from a replica to determine if it could have any
544 * objects that we need.
545 *
546 * TODO: if the missing set becomes very large, this could get expensive.
547 * Instead, we probably want to just iterate over our unfound set.
548 */
549 bool PG::search_for_missing(
550 const pg_info_t &oinfo, const pg_missing_t &omissing,
551 pg_shard_t from,
552 RecoveryCtx *ctx)
553 {
554 uint64_t num_unfound_before = missing_loc.num_unfound();
555 bool found_missing = missing_loc.add_source_info(
556 from, oinfo, omissing, ctx->handle);
557 if (found_missing && num_unfound_before != missing_loc.num_unfound())
558 publish_stats_to_osd();
559 // avoid doing this if the peer is empty. This is a bit of paranoia
560 // to avoid doing something rash if add_source_info() above
561 // incorrectly decided we found something new. (if the peer has
562 // last_update=0'0 that's impossible.)
563 if (found_missing &&
564 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
565 CEPH_FEATURE_OSD_ERASURE_CODES) &&
566 oinfo.last_update != eversion_t()) {
567 pg_info_t tinfo(oinfo);
568 tinfo.pgid.shard = pg_whoami.shard;
569 (*(ctx->info_map))[from.osd].push_back(
570 make_pair(
571 pg_notify_t(
572 from.shard, pg_whoami.shard,
573 get_osdmap()->get_epoch(),
574 get_osdmap()->get_epoch(),
575 tinfo),
576 past_intervals));
577 }
578 return found_missing;
579 }
580
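// An object that still needs recovery is readable only if enough of the
// shards known to hold it are in the given acting set, as judged by the
// backend's is_readable predicate; deleted or location-less objects are
// not readable.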
581 bool PG::MissingLoc::readable_with_acting(
582 const hobject_t &hoid,
583 const set<pg_shard_t> &acting) const {
584 if (!needs_recovery(hoid))
585 return true;
586 if (is_deleted(hoid))
587 return false;
588 auto missing_loc_entry = missing_loc.find(hoid);
589 if (missing_loc_entry == missing_loc.end())
590 return false;
591 const set<pg_shard_t> &locs = missing_loc_entry->second;
592 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
593 set<pg_shard_t> have_acting;
594 for (set<pg_shard_t>::const_iterator i = locs.begin();
595 i != locs.end();
596 ++i) {
597 if (acting.count(*i))
598 have_acting.insert(*i);
599 }
600 return (*is_readable)(have_acting);
601 }
602
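// Register every OSD in 'sources' as a location for every object in
// needs_recovery_map (deletes excluded).  This is cheaper than calling
// add_source_info() once per peer and is used when the sources are known
// to hold everything (see the complete_shards shortcut in PG::activate()).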
603 void PG::MissingLoc::add_batch_sources_info(
604 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
605 {
606 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
607 << sources.size() << dendl;
608 unsigned loop = 0;
609 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
610 i != needs_recovery_map.end();
611 ++i) {
612 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
613 handle->reset_tp_timeout();
614 loop = 0;
615 }
616 if (i->second.is_delete())
617 continue;
618 missing_loc[i->first].insert(sources.begin(), sources.end());
619 missing_loc_sources.insert(sources.begin(), sources.end());
620 }
621 }
622
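// Scan needs_recovery_map and record fromosd as a location for each object
// it can provide, skipping deletes and objects that the peer is itself
// missing, has not backfilled yet, or whose needed version is newer than
// the peer's last_update.  Returns true if any new location was found.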
623 bool PG::MissingLoc::add_source_info(
624 pg_shard_t fromosd,
625 const pg_info_t &oinfo,
626 const pg_missing_t &omissing,
627 ThreadPool::TPHandle* handle)
628 {
629 bool found_missing = false;
630 unsigned loop = 0;
631 // found items?
632 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
633 p != needs_recovery_map.end();
634 ++p) {
635 const hobject_t &soid(p->first);
636 eversion_t need = p->second.need;
637 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
638 handle->reset_tp_timeout();
639 loop = 0;
640 }
641 if (p->second.is_delete()) {
642 ldout(pg->cct, 10) << __func__ << " " << soid
643 << " delete, ignoring source" << dendl;
644 continue;
645 }
646 if (oinfo.last_update < need) {
647 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
648 << " also missing on osd." << fromosd
649 << " (last_update " << oinfo.last_update
650 << " < needed " << need << ")" << dendl;
651 continue;
652 }
653 if (!oinfo.last_backfill.is_max() &&
654 !oinfo.last_backfill_bitwise) {
655 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
656 << " also missing on osd." << fromosd
657 << " (last_backfill " << oinfo.last_backfill
658 << " but with wrong sort order)"
659 << dendl;
660 continue;
661 }
662 if (p->first >= oinfo.last_backfill) {
663 // FIXME: this is _probably_ true, although it could conceivably
664 // be in the undefined region! Hmm!
665 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
666 << " also missing on osd." << fromosd
667 << " (past last_backfill " << oinfo.last_backfill
668 << ")" << dendl;
669 continue;
670 }
671 if (oinfo.last_complete < need) {
672 if (omissing.is_missing(soid)) {
673 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
674 << " also missing on osd." << fromosd << dendl;
675 continue;
676 }
677 }
678
679 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
680 << " is on osd." << fromosd << dendl;
681
682 missing_loc[soid].insert(fromosd);
683 missing_loc_sources.insert(fromosd);
684 found_missing = true;
685 }
686
687 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
688 << dendl;
689 return found_missing;
690 }
691
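// Queue a FULLLOG query to every OSD in might_have_unfound that is up,
// not known to be empty, and that we have not already asked for its log or
// missing set, so that unfound objects can be located.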
692 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
693 {
694 auto &missing = pg_log.get_missing();
695 uint64_t unfound = get_num_unfound();
696 assert(unfound > 0);
697
698 dout(10) << __func__ << " "
699 << missing.num_missing() << " missing, "
700 << unfound << " unfound"
701 << dendl;
702
703 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
704 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
705 for (; m != mend; ++m) {
706 pg_shard_t peer(*m);
707
708 if (!get_osdmap()->is_up(peer.osd)) {
709 dout(20) << __func__ << " skipping down osd." << peer << dendl;
710 continue;
711 }
712
713 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
714 if (iter != peer_info.end() &&
715 (iter->second.is_empty() || iter->second.dne())) {
716 // ignore empty peers
717 continue;
718 }
719
720 // If we've requested any of this stuff, the pg_missing_t information
721 // should be on its way.
722 // TODO: coalesce requested_* into a single data structure
723 if (peer_missing.find(peer) != peer_missing.end()) {
724 dout(20) << __func__ << ": osd." << peer
725 << ": we already have pg_missing_t" << dendl;
726 continue;
727 }
728 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
729 dout(20) << __func__ << ": osd." << peer
730 << ": in peer_log_requested" << dendl;
731 continue;
732 }
733 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
734 dout(20) << __func__ << ": osd." << peer
735 << ": in peer_missing_requested" << dendl;
736 continue;
737 }
738
739 // Request missing
740 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
741 << dendl;
742 peer_missing_requested.insert(peer);
743 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
744 pg_query_t(
745 pg_query_t::FULLLOG,
746 peer.shard, pg_whoami.shard,
747 info.history, get_osdmap()->get_epoch());
748 }
749 }
750
751 /******* PG ***********/
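// A PG needs recovery when the primary or any shard in actingbackfill
// still has entries in its missing set; needs_backfill() (below) checks
// whether any backfill target has not yet reached last_backfill == MAX.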
752 bool PG::needs_recovery() const
753 {
754 assert(is_primary());
755
756 auto &missing = pg_log.get_missing();
757
758 if (missing.num_missing()) {
759 dout(10) << __func__ << " primary has " << missing.num_missing()
760 << " missing" << dendl;
761 return true;
762 }
763
764 assert(!actingbackfill.empty());
765 set<pg_shard_t>::const_iterator end = actingbackfill.end();
766 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
767 for (; a != end; ++a) {
768 if (*a == get_primary()) continue;
769 pg_shard_t peer = *a;
770 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
771 if (pm == peer_missing.end()) {
772 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
773 << dendl;
774 continue;
775 }
776 if (pm->second.num_missing()) {
777 dout(10) << __func__ << " osd." << peer << " has "
778 << pm->second.num_missing() << " missing" << dendl;
779 return true;
780 }
781 }
782
783 dout(10) << __func__ << " is recovered" << dendl;
784 return false;
785 }
786
787 bool PG::needs_backfill() const
788 {
789 assert(is_primary());
790
791 // We can assume that the only OSDs that might need backfill
792 // are the ones in backfill_targets.
793 set<pg_shard_t>::const_iterator end = backfill_targets.end();
794 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
795 for (; a != end; ++a) {
796 pg_shard_t peer = *a;
797 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
798 if (!pi->second.last_backfill.is_max()) {
799 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
800 return true;
801 }
802 }
803
804 dout(10) << __func__ << " does not need backfill" << dendl;
805 return false;
806 }
807
808
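// Verify that the past_intervals we have recorded cover exactly the range
// required by get_required_past_interval_bounds(); any mismatch is
// reported to the cluster log, and missing or misaligned intervals assert.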
809 void PG::check_past_interval_bounds() const
810 {
811 auto rpib = get_required_past_interval_bounds(
812 info,
813 osd->get_superblock().oldest_map);
814 if (rpib.first >= rpib.second) {
815 if (!past_intervals.empty()) {
816 osd->clog->error() << info.pgid << " required past_interval bounds are"
817 << " empty [" << rpib << ") but past_intervals is not: "
818 << past_intervals;
819 derr << info.pgid << " required past_interval bounds are"
820 << " empty [" << rpib << ") but past_intervals is not: "
821 << past_intervals << dendl;
822 }
823 } else {
824 if (past_intervals.empty()) {
825 osd->clog->error() << info.pgid << " required past_interval bounds are"
826 << " not empty [" << rpib << ") but past_intervals "
827 << past_intervals << " is empty";
828 derr << info.pgid << " required past_interval bounds are"
829 << " not empty [" << rpib << ") but past_intervals "
830 << past_intervals << " is empty" << dendl;
831 assert(!past_intervals.empty());
832 }
833
834 auto apib = past_intervals.get_bounds();
835 if (apib.first > rpib.first) {
836 osd->clog->error() << info.pgid << " past_intervals [" << apib
837 << ") start interval does not contain the required"
838 << " bound [" << rpib << ") start";
839 derr << info.pgid << " past_intervals [" << apib
840 << ") start interval does not contain the required"
841 << " bound [" << rpib << ") start" << dendl;
842 assert(0 == "past_interval start interval mismatch");
843 }
844 if (apib.second != rpib.second) {
845 osd->clog->error() << info.pgid << " past_interval bound [" << apib
846 << ") end does not match required [" << rpib
847 << ") end";
848 derr << info.pgid << " past_interval bound [" << apib
849 << ") end does not match required [" << rpib
850 << ") end" << dendl;
851 assert(0 == "past_interval end mismatch");
852 }
853 }
854 }
855
856 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
857 {
858 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
859 if (need_up_thru &&
860 up_thru >= info.history.same_interval_since) {
861 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
862 need_up_thru = false;
863 return true;
864 }
865 return false;
866 }
867
868 void PG::remove_down_peer_info(const OSDMapRef osdmap)
869 {
870 // Remove any downed osds from peer_info
871 bool removed = false;
872 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
873 while (p != peer_info.end()) {
874 if (!osdmap->is_up(p->first.osd)) {
875 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
876 peer_missing.erase(p->first);
877 peer_log_requested.erase(p->first);
878 peer_missing_requested.erase(p->first);
879 peer_info.erase(p++);
880 removed = true;
881 } else
882 ++p;
883 }
884
885 // if we removed anyone, update peers (which include peer_info)
886 if (removed)
887 update_heartbeat_peers();
888 check_recovery_sources(osdmap);
889 }
890
891 /*
892 * Returns true unless might_have_unfound contains a non-lost OSD that we have not yet queried.
893 */
894 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
895 {
896 assert(is_primary());
897
898 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
899 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
900 for (; peer != mend; ++peer) {
901 if (peer_missing.count(*peer))
902 continue;
903 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
904 if (iter != peer_info.end() &&
905 (iter->second.is_empty() || iter->second.dne()))
906 continue;
907 if (!osdmap->exists(peer->osd))
908 continue;
909 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
910 if (osd_info.lost_at <= osd_info.up_from) {
911 // If there is even one OSD in might_have_unfound that isn't lost, we
912 // still might retrieve our unfound.
913 return false;
914 }
915 }
916 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
917 << " have been queried or are marked lost" << dendl;
918 return true;
919 }
920
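// Build the prior set for peering: the OSDs from past_intervals that we
// must probe (or see marked lost) before we can conclude peering, using
// the osdmap to classify each candidate as up, down, lost or nonexistent.
// Also decides whether we must publish up_thru to the monitor first.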
921 PastIntervals::PriorSet PG::build_prior()
922 {
923 if (1) {
924 // sanity check
925 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
926 it != peer_info.end();
927 ++it) {
928 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
929 }
930 }
931
932 const OSDMap &osdmap = *get_osdmap();
933 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
934 pool.info.ec_pool(),
935 info.history.last_epoch_started,
936 get_pgbackend()->get_is_recoverable_predicate(),
937 [&](epoch_t start, int osd, epoch_t *lost_at) {
938 const osd_info_t *pinfo = 0;
939 if (osdmap.exists(osd)) {
940 pinfo = &osdmap.get_info(osd);
941 if (lost_at)
942 *lost_at = pinfo->lost_at;
943 }
944
945 if (osdmap.is_up(osd)) {
946 return PastIntervals::UP;
947 } else if (!pinfo) {
948 return PastIntervals::DNE;
949 } else if (pinfo->lost_at > start) {
950 return PastIntervals::LOST;
951 } else {
952 return PastIntervals::DOWN;
953 }
954 },
955 up,
956 acting,
957 this);
958
959 if (prior.pg_down) {
960 state_set(PG_STATE_DOWN);
961 }
962
963 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
964 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
965 << " < same_since " << info.history.same_interval_since
966 << ", must notify monitor" << dendl;
967 need_up_thru = true;
968 } else {
969 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
970 << " >= same_since " << info.history.same_interval_since
971 << ", all is well" << dendl;
972 need_up_thru = false;
973 }
974 set_probe_targets(prior.probe);
975 return prior;
976 }
977
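// Drop all of the peering and recovery bookkeeping that only the primary
// maintains (peer infos, missing sets, missing_loc, scrub reservations,
// etc.); the PG keeps its own log and info.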
978 void PG::clear_primary_state()
979 {
980 dout(10) << "clear_primary_state" << dendl;
981
982 // clear peering state
983 stray_set.clear();
984 peer_log_requested.clear();
985 peer_missing_requested.clear();
986 peer_info.clear();
987 peer_missing.clear();
988 need_up_thru = false;
989 peer_last_complete_ondisk.clear();
990 peer_activated.clear();
991 min_last_complete_ondisk = eversion_t();
992 pg_trim_to = eversion_t();
993 might_have_unfound.clear();
994 projected_log = PGLog::IndexedLog();
995
996 last_update_ondisk = eversion_t();
997
998 snap_trimq.clear();
999
1000 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
1001
1002 missing_loc.clear();
1003
1004 release_pg_backoffs();
1005
1006 pg_log.reset_recovery_pointers();
1007
1008 scrubber.reserved_peers.clear();
1009 scrub_after_recovery = false;
1010
1011 agent_clear();
1012 }
1013
1014 PG::Scrubber::Scrubber()
1015 : reserved(false), reserve_failed(false),
1016 epoch_start(0),
1017 active(false),
1018 shallow_errors(0), deep_errors(0), fixed(0),
1019 must_scrub(false), must_deep_scrub(false), must_repair(false),
1020 auto_repair(false),
1021 num_digest_updates_pending(0),
1022 state(INACTIVE),
1023 deep(false)
1024 {}
1025
1026 PG::Scrubber::~Scrubber() {}
1027
1028 /**
1029 * find_best_info
1030 *
1031 * Returns an iterator to the best info in infos sorted by:
1032 * 1) Prefer newer last_update (oldest wins for pools that require rollback)
1033 * 2) Prefer longer tail if it brings another info into contiguity
1034 * 3) Prefer current primary
1035 */
1036 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1037 const map<pg_shard_t, pg_info_t> &infos,
1038 bool restrict_to_up_acting,
1039 bool *history_les_bound) const
1040 {
1041 assert(history_les_bound);
1042 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1043 * to make changes to this process. Also, make sure to update it
1044 * when you find bugs! */
1045 eversion_t min_last_update_acceptable = eversion_t::max();
1046 epoch_t max_last_epoch_started_found = 0;
1047 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1048 i != infos.end();
1049 ++i) {
1050 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1051 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1052 *history_les_bound = true;
1053 max_last_epoch_started_found = i->second.history.last_epoch_started;
1054 }
1055 if (!i->second.is_incomplete() &&
1056 max_last_epoch_started_found < i->second.last_epoch_started) {
1057 max_last_epoch_started_found = i->second.last_epoch_started;
1058 }
1059 }
1060 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1061 i != infos.end();
1062 ++i) {
1063 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1064 if (min_last_update_acceptable > i->second.last_update)
1065 min_last_update_acceptable = i->second.last_update;
1066 }
1067 }
1068 if (min_last_update_acceptable == eversion_t::max())
1069 return infos.end();
1070
1071 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1072 // find osd with newest last_update (oldest for ec_pool).
1073 // if there are multiples, prefer
1074 // - a longer tail, if it brings another peer into log contiguity
1075 // - the current primary
1076 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1077 p != infos.end();
1078 ++p) {
1079 if (restrict_to_up_acting && !is_up(p->first) &&
1080 !is_acting(p->first))
1081 continue;
1082 // Only consider peers with last_update >= min_last_update_acceptable
1083 if (p->second.last_update < min_last_update_acceptable)
1084 continue;
1085 // Disqualify anyone with a too old last_epoch_started
1086 if (p->second.last_epoch_started < max_last_epoch_started_found)
1087 continue;
1088 // Disqualify anyone who is incomplete (not fully backfilled)
1089 if (p->second.is_incomplete())
1090 continue;
1091 if (best == infos.end()) {
1092 best = p;
1093 continue;
1094 }
1095 // Prefer newer last_update
1096 if (pool.info.require_rollback()) {
1097 if (p->second.last_update > best->second.last_update)
1098 continue;
1099 if (p->second.last_update < best->second.last_update) {
1100 best = p;
1101 continue;
1102 }
1103 } else {
1104 if (p->second.last_update < best->second.last_update)
1105 continue;
1106 if (p->second.last_update > best->second.last_update) {
1107 best = p;
1108 continue;
1109 }
1110 }
1111
1112 // Prefer longer tail
1113 if (p->second.log_tail > best->second.log_tail) {
1114 continue;
1115 } else if (p->second.log_tail < best->second.log_tail) {
1116 best = p;
1117 continue;
1118 }
1119
1120 // prefer current primary (usually the caller), all things being equal
1121 if (p->first == pg_whoami) {
1122 dout(10) << "calc_acting prefer osd." << p->first
1123 << " because it is current primary" << dendl;
1124 best = p;
1125 continue;
1126 }
1127 }
1128 return best;
1129 }
1130
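// Choose the acting set for an erasure-coded PG.  Each position (shard)
// must keep its shard id, so positions are filled independently: prefer
// the up OSD for that position if it is complete and log-contiguous with
// the authoritative shard, otherwise the current acting OSD, otherwise
// (unless restricted to up/acting) any suitable stray; an up OSD that
// cannot be used directly is scheduled for backfill.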
1131 void PG::calc_ec_acting(
1132 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1133 unsigned size,
1134 const vector<int> &acting,
1135 pg_shard_t acting_primary,
1136 const vector<int> &up,
1137 pg_shard_t up_primary,
1138 const map<pg_shard_t, pg_info_t> &all_info,
1139 bool restrict_to_up_acting,
1140 vector<int> *_want,
1141 set<pg_shard_t> *backfill,
1142 set<pg_shard_t> *acting_backfill,
1143 pg_shard_t *want_primary,
1144 ostream &ss)
1145 {
1146 vector<int> want(size, CRUSH_ITEM_NONE);
1147 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1148 unsigned usable = 0;
1149 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1150 i != all_info.end();
1151 ++i) {
1152 all_info_by_shard[i->first.shard].insert(i->first);
1153 }
1154 for (uint8_t i = 0; i < want.size(); ++i) {
1155 ss << "For position " << (unsigned)i << ": ";
1156 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1157 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1158 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1159 auth_log_shard->second.log_tail) {
1160 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1161 want[i] = up[i];
1162 ++usable;
1163 continue;
1164 }
1165 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1166 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1167 << " and ";
1168 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1169 }
1170
1171 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1172 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1173 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1174 auth_log_shard->second.log_tail) {
1175 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1176 want[i] = acting[i];
1177 ++usable;
1178 } else if (!restrict_to_up_acting) {
1179 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1180 j != all_info_by_shard[shard_id_t(i)].end();
1181 ++j) {
1182 assert(j->shard == i);
1183 if (!all_info.find(*j)->second.is_incomplete() &&
1184 all_info.find(*j)->second.last_update >=
1185 auth_log_shard->second.log_tail) {
1186 ss << " selecting stray: " << *j << std::endl;
1187 want[i] = j->osd;
1188 ++usable;
1189 break;
1190 }
1191 }
1192 if (want[i] == CRUSH_ITEM_NONE)
1193 ss << " failed to fill position " << (int)i << std::endl;
1194 }
1195 }
1196
1197 bool found_primary = false;
1198 for (uint8_t i = 0; i < want.size(); ++i) {
1199 if (want[i] != CRUSH_ITEM_NONE) {
1200 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1201 if (!found_primary) {
1202 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1203 found_primary = true;
1204 }
1205 }
1206 }
1207 acting_backfill->insert(backfill->begin(), backfill->end());
1208 _want->swap(want);
1209 }
1210
1211 /**
1212 * calculate the desired acting set.
1213 *
1214 * Choose an appropriate acting set. Prefer up[0], unless it is
1215 * incomplete, or another osd has a longer tail that allows us to
1216 * bring other up nodes up to date.
1217 */
1218 void PG::calc_replicated_acting(
1219 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1220 unsigned size,
1221 const vector<int> &acting,
1222 pg_shard_t acting_primary,
1223 const vector<int> &up,
1224 pg_shard_t up_primary,
1225 const map<pg_shard_t, pg_info_t> &all_info,
1226 bool restrict_to_up_acting,
1227 vector<int> *want,
1228 set<pg_shard_t> *backfill,
1229 set<pg_shard_t> *acting_backfill,
1230 pg_shard_t *want_primary,
1231 ostream &ss)
1232 {
1233 ss << "calc_acting newest update on osd." << auth_log_shard->first
1234 << " with " << auth_log_shard->second
1235 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1236 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1237
1238 // select primary
1239 map<pg_shard_t,pg_info_t>::const_iterator primary;
1240 if (up.size() &&
1241 !all_info.find(up_primary)->second.is_incomplete() &&
1242 all_info.find(up_primary)->second.last_update >=
1243 auth_log_shard->second.log_tail) {
1244 ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
1245 primary = all_info.find(up_primary); // prefer up[0], all things being equal
1246 } else {
1247 assert(!auth_log_shard->second.is_incomplete());
1248 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1249 << " selected as primary instead" << std::endl;
1250 primary = auth_log_shard;
1251 }
1252
1253 ss << "calc_acting primary is osd." << primary->first
1254 << " with " << primary->second << std::endl;
1255 *want_primary = primary->first;
1256 want->push_back(primary->first.osd);
1257 acting_backfill->insert(primary->first);
1258 unsigned usable = 1;
1259
1260 // select replicas that have log contiguity with primary.
1261 // prefer up, then acting, then any peer_info osds
1262 for (vector<int>::const_iterator i = up.begin();
1263 i != up.end();
1264 ++i) {
1265 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1266 if (up_cand == primary->first)
1267 continue;
1268 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1269 if (cur_info.is_incomplete() ||
1270 cur_info.last_update < MIN(
1271 primary->second.log_tail,
1272 auth_log_shard->second.log_tail)) {
1273 /* We include auth_log_shard->second.log_tail because in GetLog,
1274 * we will request logs back to the min last_update over our
1275 * acting_backfill set, which will result in our log being extended
1276 * as far backwards as necessary to pick up any peers which can
1277 * be log recovered by auth_log_shard's log */
1278 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1279 backfill->insert(up_cand);
1280 acting_backfill->insert(up_cand);
1281 } else {
1282 want->push_back(*i);
1283 acting_backfill->insert(up_cand);
1284 usable++;
1285 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1286 }
1287 }
1288
1289 // This loop over acting no longer adds backfill OSDs; those were covered in the up loop above.
1290 for (vector<int>::const_iterator i = acting.begin();
1291 i != acting.end();
1292 ++i) {
1293 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1294 if (usable >= size)
1295 break;
1296
1297 // skip up osds we already considered above
1298 if (acting_cand == primary->first)
1299 continue;
1300 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1301 if (up_it != up.end())
1302 continue;
1303
1304 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1305 if (cur_info.is_incomplete() ||
1306 cur_info.last_update < primary->second.log_tail) {
1307 ss << " shard " << acting_cand << " (stray) REJECTED "
1308 << cur_info << std::endl;
1309 } else {
1310 want->push_back(*i);
1311 acting_backfill->insert(acting_cand);
1312 ss << " shard " << acting_cand << " (stray) accepted "
1313 << cur_info << std::endl;
1314 usable++;
1315 }
1316 }
1317
1318 if (restrict_to_up_acting) {
1319 return;
1320 }
1321 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1322 i != all_info.end();
1323 ++i) {
1324 if (usable >= size)
1325 break;
1326
1327 // skip up osds we already considered above
1328 if (i->first == primary->first)
1329 continue;
1330 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1331 if (up_it != up.end())
1332 continue;
1333 vector<int>::const_iterator acting_it = find(
1334 acting.begin(), acting.end(), i->first.osd);
1335 if (acting_it != acting.end())
1336 continue;
1337
1338 if (i->second.is_incomplete() ||
1339 i->second.last_update < primary->second.log_tail) {
1340 ss << " shard " << i->first << " (stray) REJECTED "
1341 << i->second << std::endl;
1342 } else {
1343 want->push_back(i->first.osd);
1344 acting_backfill->insert(i->first);
1345 ss << " shard " << i->first << " (stray) accepted "
1346 << i->second << std::endl;
1347 usable++;
1348 }
1349 }
1350 }
1351
1352 /**
1353 * choose acting
1354 *
1355 * calculate the desired acting, and request a change with the monitor
1356 * if it differs from the current acting.
1357 *
1358 * if restrict_to_up_acting=true, we filter out anything that's not in
1359 * up/acting. In order to lift this restriction, we need to
1360 * 1) check whether it's worth switching the acting set any time we get
1361 * a new pg info (not just here, when recovery finishes)
1362 * 2) check whether anything in want_acting went down on each new map
1363 * (and, if so, calculate a new want_acting)
1364 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1365 * TODO!
1366 */
1367 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1368 bool restrict_to_up_acting,
1369 bool *history_les_bound)
1370 {
1371 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1372 all_info[pg_whoami] = info;
1373
1374 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1375 p != all_info.end();
1376 ++p) {
1377 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1378 }
1379
1380 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1381 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1382
1383 if (auth_log_shard == all_info.end()) {
1384 if (up != acting) {
1385 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1386 << " reverting to up" << dendl;
1387 want_acting = up;
1388 vector<int> empty;
1389 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1390 } else {
1391 dout(10) << "choose_acting failed" << dendl;
1392 assert(want_acting.empty());
1393 }
1394 return false;
1395 }
1396
1397 assert(!auth_log_shard->second.is_incomplete());
1398 auth_log_shard_id = auth_log_shard->first;
1399
1400 set<pg_shard_t> want_backfill, want_acting_backfill;
1401 vector<int> want;
1402 pg_shard_t want_primary;
1403 stringstream ss;
1404 if (!pool.info.ec_pool())
1405 calc_replicated_acting(
1406 auth_log_shard,
1407 get_osdmap()->get_pg_size(info.pgid.pgid),
1408 acting,
1409 primary,
1410 up,
1411 up_primary,
1412 all_info,
1413 restrict_to_up_acting,
1414 &want,
1415 &want_backfill,
1416 &want_acting_backfill,
1417 &want_primary,
1418 ss);
1419 else
1420 calc_ec_acting(
1421 auth_log_shard,
1422 get_osdmap()->get_pg_size(info.pgid.pgid),
1423 acting,
1424 primary,
1425 up,
1426 up_primary,
1427 all_info,
1428 restrict_to_up_acting,
1429 &want,
1430 &want_backfill,
1431 &want_acting_backfill,
1432 &want_primary,
1433 ss);
1434 dout(10) << ss.str() << dendl;
1435
1436 unsigned num_want_acting = 0;
1437 set<pg_shard_t> have;
1438 for (int i = 0; i < (int)want.size(); ++i) {
1439 if (want[i] != CRUSH_ITEM_NONE) {
1440 ++num_want_acting;
1441 have.insert(
1442 pg_shard_t(
1443 want[i],
1444 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1445 }
1446 }
1447
1448 // We go incomplete if below min_size for ec_pools since backfill
1449 // does not currently maintain rollbackability
1450 // Otherwise, we will go "peered", but not "active"
1451 if (num_want_acting < pool.info.min_size &&
1452 (pool.info.ec_pool() ||
1453 !cct->_conf->osd_allow_recovery_below_min_size)) {
1454 want_acting.clear();
1455 dout(10) << "choose_acting failed, below min size" << dendl;
1456 return false;
1457 }
1458
1459 /* Check whether we have enough acting shards to later perform recovery */
1460 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1461 get_pgbackend()->get_is_recoverable_predicate());
1462 if (!(*recoverable_predicate)(have)) {
1463 want_acting.clear();
1464 dout(10) << "choose_acting failed, not recoverable" << dendl;
1465 return false;
1466 }
1467
1468 if (want != acting) {
1469 dout(10) << "choose_acting want " << want << " != acting " << acting
1470 << ", requesting pg_temp change" << dendl;
1471 want_acting = want;
1472
1473 if (want_acting == up) {
1474 // There can't be any pending backfill if
1475 // want is the same as crush map up OSDs.
1476 assert(want_backfill.empty());
1477 vector<int> empty;
1478 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1479 } else
1480 osd->queue_want_pg_temp(info.pgid.pgid, want);
1481 return false;
1482 }
1483 want_acting.clear();
1484 actingbackfill = want_acting_backfill;
1485 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1486 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1487 if (backfill_targets.empty()) {
1488 // Caller is GetInfo
1489 backfill_targets = want_backfill;
1490 }
1491 // Will not change if already set because up would have had to change
1492 // Verify that nothing in backfill is in stray_set
1493 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1494 i != want_backfill.end();
1495 ++i) {
1496 assert(stray_set.find(*i) == stray_set.end());
1497 }
1498 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1499 << want_backfill << dendl;
1500 return true;
1501 }
1502
1503 /* Build the might_have_unfound set.
1504 *
1505 * This is used by the primary OSD during recovery.
1506 *
1507 * This set tracks the OSDs which might have unfound objects that the primary
1508 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1509 * will remove the OSD from the set.
1510 */
1511 void PG::build_might_have_unfound()
1512 {
1513 assert(might_have_unfound.empty());
1514 assert(is_primary());
1515
1516 dout(10) << __func__ << dendl;
1517
1518 check_past_interval_bounds();
1519
1520 might_have_unfound = past_intervals.get_might_have_unfound(
1521 pg_whoami,
1522 pool.info.ec_pool());
1523
1524 // include any (stray) peers
1525 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1526 p != peer_info.end();
1527 ++p)
1528 might_have_unfound.insert(p->first);
1529
1530 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1531 }
1532
1533 struct C_PG_ActivateCommitted : public Context {
1534 PGRef pg;
1535 epoch_t epoch;
1536 epoch_t activation_epoch;
1537 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1538 : pg(p), epoch(e), activation_epoch(ae) {}
1539 void finish(int r) override {
1540 pg->_activate_committed(epoch, activation_epoch);
1541 }
1542 };
1543
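// Drive the PG to the activating state: update last_epoch_started /
// last_interval_started, register the on-commit callback, initialize
// snap_trimq and the last_complete pointer and, on the primary, bring each
// shard in actingbackfill up to date by sending an MOSDPGLog (full
// backfill, incremental log, or just a notify via activator_map), then
// populate missing_loc and might_have_unfound so recovery can find sources.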
1544 void PG::activate(ObjectStore::Transaction& t,
1545 epoch_t activation_epoch,
1546 list<Context*>& tfin,
1547 map<int, map<spg_t,pg_query_t> >& query_map,
1548 map<int,
1549 vector<
1550 pair<pg_notify_t,
1551 PastIntervals> > > *activator_map,
1552 RecoveryCtx *ctx)
1553 {
1554 assert(!is_peered());
1555 assert(scrubber.callbacks.empty());
1556 assert(callbacks_for_degraded_object.empty());
1557
1558 // twiddle pg state
1559 state_clear(PG_STATE_DOWN);
1560
1561 send_notify = false;
1562
1563 if (is_primary()) {
1564 // only update primary last_epoch_started if we will go active
1565 if (acting.size() >= pool.info.min_size) {
1566 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1567 info.last_epoch_started <= activation_epoch);
1568 info.last_epoch_started = activation_epoch;
1569 info.last_interval_started = info.history.same_interval_since;
1570 }
1571 } else if (is_acting(pg_whoami)) {
1572 /* update last_epoch_started on acting replica to whatever the primary sent
1573 * unless it's smaller (could happen if we are going peered rather than
1574 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1575 if (info.last_epoch_started < activation_epoch) {
1576 info.last_epoch_started = activation_epoch;
1577 info.last_interval_started = info.history.same_interval_since;
1578 }
1579 }
1580
1581 auto &missing = pg_log.get_missing();
1582
1583 if (is_primary()) {
1584 last_update_ondisk = info.last_update;
1585 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1586 }
1587 last_update_applied = info.last_update;
1588 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1589
1590 need_up_thru = false;
1591
1592 // write pg info, log
1593 dirty_info = true;
1594 dirty_big_info = true; // maybe
1595
1596 // find out when we commit
1597 t.register_on_complete(
1598 new C_PG_ActivateCommitted(
1599 this,
1600 get_osdmap()->get_epoch(),
1601 activation_epoch));
1602
1603 // initialize snap_trimq
1604 if (is_primary()) {
1605 dout(20) << "activate - purged_snaps " << info.purged_snaps
1606 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1607 snap_trimq = pool.cached_removed_snaps;
1608 interval_set<snapid_t> intersection;
1609 intersection.intersection_of(snap_trimq, info.purged_snaps);
1610 if (intersection == info.purged_snaps) {
1611 snap_trimq.subtract(info.purged_snaps);
1612 } else {
1613 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1614 << ") is not a subset of pool.cached_removed_snaps ("
1615 << pool.cached_removed_snaps << ")" << dendl;
1616 snap_trimq.subtract(intersection);
1617 }
1618 }
1619
1620 // init complete pointer
1621 if (missing.num_missing() == 0) {
1622 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1623 << " -> " << info.last_update << dendl;
1624 info.last_complete = info.last_update;
1625 pg_log.reset_recovery_pointers();
1626 } else {
1627 dout(10) << "activate - not complete, " << missing << dendl;
1628 pg_log.activate_not_complete(info);
1629 }
1630
1631 log_weirdness();
1632
1633 // if primary..
1634 if (is_primary()) {
1635 assert(ctx);
1636 // start up replicas
1637
1638 assert(!actingbackfill.empty());
1639 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1640 i != actingbackfill.end();
1641 ++i) {
1642 if (*i == pg_whoami) continue;
1643 pg_shard_t peer = *i;
1644 assert(peer_info.count(peer));
1645 pg_info_t& pi = peer_info[peer];
1646
1647 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1648
1649 MOSDPGLog *m = 0;
1650 assert(peer_missing.count(peer));
1651 pg_missing_t& pm = peer_missing[peer];
1652
1653 bool needs_past_intervals = pi.dne();
1654
1655 /*
1656 * cover case where peer sort order was different and
1657 * last_backfill cannot be interpreted
1658 */
1659 bool force_restart_backfill =
1660 !pi.last_backfill.is_max() &&
1661 !pi.last_backfill_bitwise;
1662
1663 if (pi.last_update == info.last_update && !force_restart_backfill) {
1664 // empty log
1665 if (!pi.last_backfill.is_max())
1666 osd->clog->info() << info.pgid << " continuing backfill to osd."
1667 << peer
1668 << " from (" << pi.log_tail << "," << pi.last_update
1669 << "] " << pi.last_backfill
1670 << " to " << info.last_update;
1671 if (!pi.is_empty() && activator_map) {
1672 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1673 (*activator_map)[peer.osd].push_back(
1674 make_pair(
1675 pg_notify_t(
1676 peer.shard, pg_whoami.shard,
1677 get_osdmap()->get_epoch(),
1678 get_osdmap()->get_epoch(),
1679 info),
1680 past_intervals));
1681 } else {
1682 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1683 m = new MOSDPGLog(
1684 i->shard, pg_whoami.shard,
1685 get_osdmap()->get_epoch(), info);
1686 }
1687 } else if (
1688 pg_log.get_tail() > pi.last_update ||
1689 pi.last_backfill == hobject_t() ||
1690 force_restart_backfill ||
1691 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1692 /* ^ This last case covers a situation where a replica is not contiguous
1693 * with the auth_log, but is contiguous with this replica. Reshuffling
1694 * the active set to handle this would be tricky, so instead we just go
1695 * ahead and backfill it anyway. This is probably preferable in any
1696 * case since the replica in question would have to be significantly
1697 * behind.
1698 */
1699 // backfill
1700 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1701 << " from (" << pi.log_tail << "," << pi.last_update
1702 << "] " << pi.last_backfill
1703 << " to " << info.last_update;
1704
1705 pi.last_update = info.last_update;
1706 pi.last_complete = info.last_update;
1707 pi.set_last_backfill(hobject_t());
1708 pi.last_epoch_started = info.last_epoch_started;
1709 pi.last_interval_started = info.last_interval_started;
1710 pi.history = info.history;
1711 pi.hit_set = info.hit_set;
1712 pi.stats.stats.clear();
1713
1714 // initialize peer with our purged_snaps.
1715 pi.purged_snaps = info.purged_snaps;
1716
1717 m = new MOSDPGLog(
1718 i->shard, pg_whoami.shard,
1719 get_osdmap()->get_epoch(), pi);
1720
1721 // send some recent log, so that op dup detection works well.
1722 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1723 m->info.log_tail = m->log.tail;
1724 pi.log_tail = m->log.tail; // sigh...
1725
1726 pm.clear();
1727 } else {
1728 // catch up
1729 assert(pg_log.get_tail() <= pi.last_update);
1730 m = new MOSDPGLog(
1731 i->shard, pg_whoami.shard,
1732 get_osdmap()->get_epoch(), info);
1733 // send new stuff to append to replicas log
1734 m->log.copy_after(pg_log.get_log(), pi.last_update);
1735 }
1736
1737 // share past_intervals if we are creating the pg on the replica
1738 // based on whether our info for that peer was dne() *before*
1739 // updating pi.history in the backfill block above.
1740 if (m && needs_past_intervals)
1741 m->past_intervals = past_intervals;
1742
1743 // update local version of peer's missing list!
1744 if (m && pi.last_backfill != hobject_t()) {
1745 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1746 p != m->log.log.end();
1747 ++p) {
1748 if (p->soid <= pi.last_backfill &&
1749 !p->is_error()) {
1750 if (perform_deletes_during_peering() && p->is_delete()) {
1751 pm.rm(p->soid, p->version);
1752 } else {
1753 pm.add_next_event(*p);
1754 }
1755 }
1756 }
1757 }
1758
1759 if (m) {
1760 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1761 //m->log.print(cout);
1762 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1763 }
1764
1765 // peer now has
1766 pi.last_update = info.last_update;
1767
1768 // update our missing
1769 if (pm.num_missing() == 0) {
1770 pi.last_complete = pi.last_update;
1771 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1772 } else {
1773 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1774 }
1775 }
1776
1777 // Set up missing_loc
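// complete_shards collects shards with nothing missing (and, for replicas, a
// completed backfill); if all but one shard are complete, they can all be
// registered as recovery sources in a single batch below.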
1778 set<pg_shard_t> complete_shards;
1779 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1780 i != actingbackfill.end();
1781 ++i) {
1782 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1783 if (*i == get_primary()) {
1784 missing_loc.add_active_missing(missing);
1785 if (!missing.have_missing())
1786 complete_shards.insert(*i);
1787 } else {
1788 auto peer_missing_entry = peer_missing.find(*i);
1789 assert(peer_missing_entry != peer_missing.end());
1790 missing_loc.add_active_missing(peer_missing_entry->second);
1791 if (!peer_missing_entry->second.have_missing() &&
1792 peer_info[*i].last_backfill.is_max())
1793 complete_shards.insert(*i);
1794 }
1795 }
1796 // If necessary, create might_have_unfound to help us find our unfound objects.
1797 // NOTE: It's important that we build might_have_unfound before trimming the
1798 // past intervals.
1799 might_have_unfound.clear();
1800 if (needs_recovery()) {
1801 // If only one shard has missing objects, we take a shortcut and add all the
1802 // others as recovery sources. This is considered safe since the PG logs have
1803 // been merged locally, and it covers the vast majority of use cases, such as
1804 // one OSD/host being down for a while for hardware repair.
1805 if (complete_shards.size() + 1 == actingbackfill.size()) {
1806 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1807 } else {
1808 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1809 ctx->handle);
1810 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1811 i != actingbackfill.end();
1812 ++i) {
1813 if (*i == pg_whoami) continue;
1814 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1815 assert(peer_missing.count(*i));
1816 assert(peer_info.count(*i));
1817 missing_loc.add_source_info(
1818 *i,
1819 peer_info[*i],
1820 peer_missing[*i],
1821 ctx->handle);
1822 }
1823 }
1824 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1825 i != peer_missing.end();
1826 ++i) {
1827 if (is_actingbackfill(i->first))
1828 continue;
1829 assert(peer_info.count(i->first));
1830 search_for_missing(
1831 peer_info[i->first],
1832 i->second,
1833 i->first,
1834 ctx);
1835 }
1836
1837 build_might_have_unfound();
1838
1839 if (have_unfound())
1840 discover_all_missing(query_map);
1841 }
1842
1843 // num_objects_degraded, if calculated, should reflect this too, unless nothing
1844 // is missing and we are about to go clean.
1845 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1846 state_set(PG_STATE_UNDERSIZED);
1847 }
1848
1849 state_set(PG_STATE_ACTIVATING);
1850 release_pg_backoffs();
1851 projected_last_update = info.last_update;
1852 }
1853 if (acting.size() >= pool.info.min_size) {
1854 PGLogEntryHandler handler{this, &t};
1855 pg_log.roll_forward(&handler);
1856 }
1857 }
1858
1859 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1860 {
1861 // only check MOSDOp
1862 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1863 return true;
1864
1865 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1866
1867 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1868 if (!session) {
1869 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1870 return false;
1871 }
1872 OSDCap& caps = session->caps;
1873 session->put();
1874
1875 const string &key = req->get_hobj().get_key().empty() ?
1876 req->get_oid().name :
1877 req->get_hobj().get_key();
1878
1879 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1880 pool.auid, key,
1881 op->need_read_cap(),
1882 op->need_write_cap(),
1883 op->classes());
1884
1885 dout(20) << "op_has_sufficient_caps "
1886 << "session=" << session
1887 << " pool=" << pool.id << " (" << pool.name
1888 << " " << req->get_hobj().nspace
1889 << ") owner=" << pool.auid
1890 << " need_read_cap=" << op->need_read_cap()
1891 << " need_write_cap=" << op->need_write_cap()
1892 << " classes=" << op->classes()
1893 << " -> " << (cap ? "yes" : "NO")
1894 << dendl;
1895 return cap;
1896 }
1897
1898 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1899 {
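// Called once the activation transaction commits. The primary records itself
// in peer_activated (and finishes activation once every member of
// actingbackfill has committed); a replica reports back to the primary with
// an MOSDPGInfo carrying its updated history.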
1900 lock();
1901 if (pg_has_reset_since(epoch)) {
1902 dout(10) << "_activate_committed " << epoch
1903 << ", that was an old interval" << dendl;
1904 } else if (is_primary()) {
1905 peer_activated.insert(pg_whoami);
1906 dout(10) << "_activate_committed " << epoch
1907 << " peer_activated now " << peer_activated
1908 << " last_interval_started " << info.history.last_interval_started
1909 << " last_epoch_started " << info.history.last_epoch_started
1910 << " same_interval_since " << info.history.same_interval_since << dendl;
1911 assert(!actingbackfill.empty());
1912 if (peer_activated.size() == actingbackfill.size())
1913 all_activated_and_committed();
1914 } else {
1915 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1916 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1917 pg_notify_t i = pg_notify_t(
1918 get_primary().shard, pg_whoami.shard,
1919 get_osdmap()->get_epoch(),
1920 get_osdmap()->get_epoch(),
1921 info);
1922
1923 i.info.history.last_epoch_started = activation_epoch;
1924 i.info.history.last_interval_started = i.info.history.same_interval_since;
1925 if (acting.size() >= pool.info.min_size) {
1926 state_set(PG_STATE_ACTIVE);
1927 } else {
1928 state_set(PG_STATE_PEERED);
1929 }
1930
1931 m->pg_list.push_back(make_pair(i, PastIntervals()));
1932 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1933
1934 // waiters
1935 if (flushes_in_progress == 0) {
1936 requeue_ops(waiting_for_peered);
1937 } else if (!waiting_for_peered.empty()) {
1938 dout(10) << __func__ << " flushes in progress, moving "
1939 << waiting_for_peered.size() << " items to waiting_for_flush"
1940 << dendl;
1941 assert(waiting_for_flush.empty());
1942 waiting_for_flush.swap(waiting_for_peered);
1943 }
1944 }
1945
1946 assert(!dirty_info);
1947
1948 unlock();
1949 }
1950
1951 /*
1952 * update info.history.last_epoch_started ONLY after we and all
1953 * replicas have activated AND committed the activate transaction
1954 * (i.e. the peering results are stable on disk).
1955 */
1956 void PG::all_activated_and_committed()
1957 {
1958 dout(10) << "all_activated_and_committed" << dendl;
1959 assert(is_primary());
1960 assert(peer_activated.size() == actingbackfill.size());
1961 assert(!actingbackfill.empty());
1962 assert(blocked_by.empty());
1963
1964 // Degraded?
1965 _update_calc_stats();
1966 if (info.stats.stats.sum.num_objects_degraded) {
1967 state_set(PG_STATE_DEGRADED);
1968 } else {
1969 state_clear(PG_STATE_DEGRADED);
1970 }
1971
1972 queue_peering_event(
1973 CephPeeringEvtRef(
1974 std::make_shared<CephPeeringEvt>(
1975 get_osdmap()->get_epoch(),
1976 get_osdmap()->get_epoch(),
1977 AllReplicasActivated())));
1978 }
1979
1980 bool PG::requeue_scrub(bool high_priority)
1981 {
1982 assert(is_locked());
1983 if (scrub_queued) {
1984 dout(10) << __func__ << ": already queued" << dendl;
1985 return false;
1986 } else {
1987 dout(10) << __func__ << ": queueing" << dendl;
1988 scrub_queued = true;
1989 osd->queue_for_scrub(this, high_priority);
1990 return true;
1991 }
1992 }
1993
1994 void PG::queue_recovery()
1995 {
1996 if (!is_primary() || !is_peered()) {
1997 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1998 assert(!recovery_queued);
1999 } else if (recovery_queued) {
2000 dout(10) << "queue_recovery -- already queued" << dendl;
2001 } else {
2002 dout(10) << "queue_recovery -- queuing" << dendl;
2003 recovery_queued = true;
2004 osd->queue_for_recovery(this);
2005 }
2006 }
2007
2008 bool PG::queue_scrub()
2009 {
2010 assert(is_locked());
2011 if (is_scrubbing()) {
2012 return false;
2013 }
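// Latch any one-shot must_* requests into PG state flags for this scrub
// (deep scrub / repair), clear them, and queue the scrub.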
2014 scrubber.priority = scrubber.must_scrub ?
2015 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2016 scrubber.must_scrub = false;
2017 state_set(PG_STATE_SCRUBBING);
2018 if (scrubber.must_deep_scrub) {
2019 state_set(PG_STATE_DEEP_SCRUB);
2020 scrubber.must_deep_scrub = false;
2021 }
2022 if (scrubber.must_repair || scrubber.auto_repair) {
2023 state_set(PG_STATE_REPAIR);
2024 scrubber.must_repair = false;
2025 }
2026 requeue_scrub();
2027 return true;
2028 }
2029
2030 unsigned PG::get_scrub_priority()
2031 {
2032 // a higher value -> a higher priority
2033 int pool_scrub_priority = 0;
2034 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2035 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2036 }
2037
2038 struct C_PG_FinishRecovery : public Context {
2039 PGRef pg;
2040 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2041 void finish(int r) override {
2042 pg->_finish_recovery(this);
2043 }
2044 };
2045
2046 void PG::mark_clean()
2047 {
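// Only mark the PG clean when the acting set is at full pool size; once
// clean, past_intervals can be dropped, since peering only needs the
// intervals since the PG was last clean.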
2048 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2049 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2050 state_set(PG_STATE_CLEAN);
2051 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2052 info.history.last_interval_clean = info.history.same_interval_since;
2053 past_intervals.clear();
2054 dirty_big_info = true;
2055 dirty_info = true;
2056 }
2057
2058 kick_snap_trim();
2059 }
2060
2061 void PG::_change_recovery_force_mode(int new_mode, bool clear)
2062 {
2063 if (!deleting) {
2064 // we can't and shouldn't do anything if the PG is being deleted locally
2065 if (clear) {
2066 state_clear(new_mode);
2067 } else {
2068 state_set(new_mode);
2069 }
2070 publish_stats_to_osd();
2071 }
2072 }
2073
2074 inline int PG::clamp_recovery_priority(int priority)
2075 {
2076 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2077 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2078
2079 // Clamp to valid range
2080 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2081 return OSD_RECOVERY_PRIORITY_MAX;
2082 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2083 return OSD_RECOVERY_PRIORITY_MIN;
2084 } else {
2085 return priority;
2086 }
2087 }
2088
2089 unsigned PG::get_recovery_priority()
2090 {
2091 // a higher value -> a higher priority
2092 int ret = 0;
2093
2094 if (state & PG_STATE_FORCED_RECOVERY) {
2095 ret = OSD_RECOVERY_PRIORITY_FORCED;
2096 } else {
2097 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2098 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2099 }
2100 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2101 return static_cast<unsigned>(ret);
2102 }
2103
2104 unsigned PG::get_backfill_priority()
2105 {
2106 // a higher value -> a higher priority
2107 int ret = OSD_BACKFILL_PRIORITY_BASE;
2108 if (state & PG_STATE_FORCED_BACKFILL) {
2109 ret = OSD_RECOVERY_PRIORITY_FORCED;
2110 } else {
2111 if (acting.size() < pool.info.min_size) {
2112 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2113 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2114
2115 } else if (is_undersized()) {
2116 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2117 assert(pool.info.size > actingset.size());
2118 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2119
2120 } else if (is_degraded()) {
2121 // degraded: baseline degraded
2122 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2123 }
2124
2125 // Adjust with pool's recovery priority
2126 int pool_recovery_priority = 0;
2127 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2128
2129 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2130 }
2131
2132 return static_cast<unsigned>(ret);
2133 }
2134
2135 void PG::finish_recovery(list<Context*>& tfin)
2136 {
2137 dout(10) << "finish_recovery" << dendl;
2138 assert(info.last_complete == info.last_update);
2139
2140 clear_recovery_state();
2141
2142 /*
2143 * sync all this before purging strays. but don't block!
2144 */
2145 finish_sync_event = new C_PG_FinishRecovery(this);
2146 tfin.push_back(finish_sync_event);
2147 }
2148
2149 void PG::_finish_recovery(Context *c)
2150 {
2151 lock();
2152 if (deleting) {
2153 unlock();
2154 return;
2155 }
2156 if (c == finish_sync_event) {
2157 dout(10) << "_finish_recovery" << dendl;
2158 finish_sync_event = 0;
2159 purge_strays();
2160
2161 publish_stats_to_osd();
2162
2163 if (scrub_after_recovery) {
2164 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2165 scrub_after_recovery = false;
2166 scrubber.must_deep_scrub = true;
2167 queue_scrub();
2168 }
2169 } else {
2170 dout(10) << "_finish_recovery -- stale" << dendl;
2171 }
2172 unlock();
2173 }
2174
2175 void PG::start_recovery_op(const hobject_t& soid)
2176 {
2177 dout(10) << "start_recovery_op " << soid
2178 #ifdef DEBUG_RECOVERY_OIDS
2179 << " (" << recovering_oids << ")"
2180 #endif
2181 << dendl;
2182 assert(recovery_ops_active >= 0);
2183 recovery_ops_active++;
2184 #ifdef DEBUG_RECOVERY_OIDS
2185 assert(recovering_oids.count(soid) == 0);
2186 recovering_oids.insert(soid);
2187 #endif
2188 osd->start_recovery_op(this, soid);
2189 }
2190
2191 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2192 {
2193 dout(10) << "finish_recovery_op " << soid
2194 #ifdef DEBUG_RECOVERY_OIDS
2195 << " (" << recovering_oids << ")"
2196 #endif
2197 << dendl;
2198 assert(recovery_ops_active > 0);
2199 recovery_ops_active--;
2200 #ifdef DEBUG_RECOVERY_OIDS
2201 assert(recovering_oids.count(soid));
2202 recovering_oids.erase(soid);
2203 #endif
2204 osd->finish_recovery_op(this, soid, dequeue);
2205
2206 if (!dequeue) {
2207 queue_recovery();
2208 }
2209 }
2210
2211 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2212 {
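// Divide the parent's state between parent and child: the pg log is split by
// hash, info/stats are copied or invalidated as appropriate, the child's
// up/acting mapping is recomputed from the current OSDMap, and both PGs are
// marked dirty so the new state gets persisted.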
2213 child->update_snap_mapper_bits(split_bits);
2214 child->update_osdmap_ref(get_osdmap());
2215
2216 child->pool = pool;
2217
2218 // Log
2219 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2220 child->info.last_complete = info.last_complete;
2221
2222 info.last_update = pg_log.get_head();
2223 child->info.last_update = child->pg_log.get_head();
2224
2225 child->info.last_user_version = info.last_user_version;
2226
2227 info.log_tail = pg_log.get_tail();
2228 child->info.log_tail = child->pg_log.get_tail();
2229
2230 if (info.last_complete < pg_log.get_tail())
2231 info.last_complete = pg_log.get_tail();
2232 if (child->info.last_complete < child->pg_log.get_tail())
2233 child->info.last_complete = child->pg_log.get_tail();
2234
2235 // Info
2236 child->info.history = info.history;
2237 child->info.history.epoch_created = get_osdmap()->get_epoch();
2238 child->info.purged_snaps = info.purged_snaps;
2239
2240 if (info.last_backfill.is_max()) {
2241 child->info.set_last_backfill(hobject_t::get_max());
2242 } else {
2243 // restart backfill on parent and child to be safe. we could
2244 // probably do better in the bitwise sort case, but it's more
2245 // fragile (there may be special work to do on backfill completion
2246 // in the future).
2247 info.set_last_backfill(hobject_t());
2248 child->info.set_last_backfill(hobject_t());
2249 // restarting backfill implies that the missing set is empty,
2250 // since it is only used for objects prior to last_backfill
2251 pg_log.reset_backfill();
2252 child->pg_log.reset_backfill();
2253 }
2254
2255 child->info.stats = info.stats;
2256 child->info.stats.parent_split_bits = split_bits;
2257 info.stats.stats_invalid = true;
2258 child->info.stats.stats_invalid = true;
2259 child->info.last_epoch_started = info.last_epoch_started;
2260 child->info.last_interval_started = info.last_interval_started;
2261
2262 child->snap_trimq = snap_trimq;
2263
2264 // There can't be recovery/backfill going on now
2265 int primary, up_primary;
2266 vector<int> newup, newacting;
2267 get_osdmap()->pg_to_up_acting_osds(
2268 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2269 child->init_primary_up_acting(
2270 newup,
2271 newacting,
2272 up_primary,
2273 primary);
2274 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2275
2276 // this comparison includes primary rank via pg_shard_t
2277 if (get_primary() != child->get_primary())
2278 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2279
2280 child->info.stats.up = up;
2281 child->info.stats.up_primary = up_primary;
2282 child->info.stats.acting = acting;
2283 child->info.stats.acting_primary = primary;
2284 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2285
2286 // History
2287 child->past_intervals = past_intervals;
2288
2289 _split_into(child_pgid, child, split_bits);
2290
2291 // release all backoffs for simplicity
2292 release_backoffs(hobject_t(), hobject_t::get_max());
2293
2294 child->on_new_interval();
2295
2296 child->dirty_info = true;
2297 child->dirty_big_info = true;
2298 dirty_info = true;
2299 dirty_big_info = true;
2300 }
2301
2302 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2303 {
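// Register a new backoff for [begin, end) on this session and tell the
// client to stop sending ops for that range until we send an UNBLOCK.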
2304 ConnectionRef con = s->con;
2305 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2306 return;
2307 BackoffRef b(s->have_backoff(info.pgid, begin));
2308 if (b) {
2309 derr << __func__ << " already have backoff for " << s << " begin " << begin
2310 << " " << *b << dendl;
2311 ceph_abort();
2312 }
2313 Mutex::Locker l(backoff_lock);
2314 {
2315 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2316 backoffs[begin].insert(b);
2317 s->add_backoff(b);
2318 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2319 }
2320 con->send_message(
2321 new MOSDBackoff(
2322 info.pgid,
2323 get_osdmap()->get_epoch(),
2324 CEPH_OSD_BACKOFF_OP_BLOCK,
2325 b->id,
2326 begin,
2327 end));
2328 }
2329
2330 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2331 {
2332 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2333 vector<BackoffRef> bv;
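// Collect the matching backoffs while holding backoff_lock, then notify
// the clients after it is released.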
2334 {
2335 Mutex::Locker l(backoff_lock);
2336 auto p = backoffs.lower_bound(begin);
2337 while (p != backoffs.end()) {
2338 int r = cmp(p->first, end);
2339 dout(20) << __func__ << " ? " << r << " " << p->first
2340 << " " << p->second << dendl;
2341 // note: must still examine begin=end=p->first case
2342 if (r > 0 || (r == 0 && begin < end)) {
2343 break;
2344 }
2345 dout(20) << __func__ << " checking " << p->first
2346 << " " << p->second << dendl;
2347 auto q = p->second.begin();
2348 while (q != p->second.end()) {
2349 dout(20) << __func__ << " checking " << *q << dendl;
2350 int r = cmp((*q)->begin, begin);
2351 if (r == 0 || (r > 0 && (*q)->end < end)) {
2352 bv.push_back(*q);
2353 q = p->second.erase(q);
2354 } else {
2355 ++q;
2356 }
2357 }
2358 if (p->second.empty()) {
2359 p = backoffs.erase(p);
2360 } else {
2361 ++p;
2362 }
2363 }
2364 }
2365 for (auto b : bv) {
2366 Mutex::Locker l(b->lock);
2367 dout(10) << __func__ << " " << *b << dendl;
2368 if (b->session) {
2369 assert(b->pg == this);
2370 ConnectionRef con = b->session->con;
2371 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2372 con->send_message(
2373 new MOSDBackoff(
2374 info.pgid,
2375 get_osdmap()->get_epoch(),
2376 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2377 b->id,
2378 b->begin,
2379 b->end));
2380 }
2381 if (b->is_new()) {
2382 b->state = Backoff::STATE_DELETING;
2383 } else {
2384 b->session->rm_backoff(b);
2385 b->session.reset();
2386 }
2387 b->pg.reset();
2388 }
2389 }
2390 }
2391
2392 void PG::clear_backoffs()
2393 {
2394 dout(10) << __func__ << " " << dendl;
2395 map<hobject_t,set<BackoffRef>> ls;
2396 {
2397 Mutex::Locker l(backoff_lock);
2398 ls.swap(backoffs);
2399 }
2400 for (auto& p : ls) {
2401 for (auto& b : p.second) {
2402 Mutex::Locker l(b->lock);
2403 dout(10) << __func__ << " " << *b << dendl;
2404 if (b->session) {
2405 assert(b->pg == this);
2406 if (b->is_new()) {
2407 b->state = Backoff::STATE_DELETING;
2408 } else {
2409 b->session->rm_backoff(b);
2410 b->session.reset();
2411 }
2412 b->pg.reset();
2413 }
2414 }
2415 }
2416 }
2417
2418 // called by Session::clear_backoffs()
2419 void PG::rm_backoff(BackoffRef b)
2420 {
2421 dout(10) << __func__ << " " << *b << dendl;
2422 Mutex::Locker l(backoff_lock);
2423 assert(b->lock.is_locked_by_me());
2424 assert(b->pg == this);
2425 auto p = backoffs.find(b->begin);
2426 // may race with release_backoffs()
2427 if (p != backoffs.end()) {
2428 auto q = p->second.find(b);
2429 if (q != p->second.end()) {
2430 p->second.erase(q);
2431 if (p->second.empty()) {
2432 backoffs.erase(p);
2433 }
2434 }
2435 }
2436 }
2437
2438 void PG::clear_recovery_state()
2439 {
2440 dout(10) << "clear_recovery_state" << dendl;
2441
2442 pg_log.reset_recovery_pointers();
2443 finish_sync_event = 0;
2444
2445 hobject_t soid;
2446 while (recovery_ops_active > 0) {
2447 #ifdef DEBUG_RECOVERY_OIDS
2448 soid = *recovering_oids.begin();
2449 #endif
2450 finish_recovery_op(soid, true);
2451 }
2452
2453 backfill_targets.clear();
2454 backfill_info.clear();
2455 peer_backfill_info.clear();
2456 waiting_on_backfill.clear();
2457 _clear_recovery_state(); // pg impl specific hook
2458 }
2459
2460 void PG::cancel_recovery()
2461 {
2462 dout(10) << "cancel_recovery" << dendl;
2463 clear_recovery_state();
2464 }
2465
2466
2467 void PG::purge_strays()
2468 {
2469 dout(10) << "purge_strays " << stray_set << dendl;
2470
2471 bool removed = false;
2472 for (set<pg_shard_t>::iterator p = stray_set.begin();
2473 p != stray_set.end();
2474 ++p) {
2475 assert(!is_actingbackfill(*p));
2476 if (get_osdmap()->is_up(p->osd)) {
2477 dout(10) << "sending PGRemove to osd." << *p << dendl;
2478 vector<spg_t> to_remove;
2479 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2480 MOSDPGRemove *m = new MOSDPGRemove(
2481 get_osdmap()->get_epoch(),
2482 to_remove);
2483 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2484 } else {
2485 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2486 }
2487 peer_missing.erase(*p);
2488 peer_info.erase(*p);
2489 peer_purged.insert(*p);
2490 removed = true;
2491 }
2492
2493 // if we removed anyone, update peers (which include peer_info)
2494 if (removed)
2495 update_heartbeat_peers();
2496
2497 stray_set.clear();
2498
2499 // clear _requested maps; we may have to peer() again if we discover
2500 // (more) stray content
2501 peer_log_requested.clear();
2502 peer_missing_requested.clear();
2503 }
2504
2505 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2506 {
2507 Mutex::Locker l(heartbeat_peer_lock);
2508 probe_targets.clear();
2509 for (set<pg_shard_t>::iterator i = probe_set.begin();
2510 i != probe_set.end();
2511 ++i) {
2512 probe_targets.insert(i->osd);
2513 }
2514 }
2515
2516 void PG::clear_probe_targets()
2517 {
2518 Mutex::Locker l(heartbeat_peer_lock);
2519 probe_targets.clear();
2520 }
2521
2522 void PG::update_heartbeat_peers()
2523 {
2524 assert(is_locked());
2525
2526 if (!is_primary())
2527 return;
2528
2529 set<int> new_peers;
2530 for (unsigned i=0; i<acting.size(); i++) {
2531 if (acting[i] != CRUSH_ITEM_NONE)
2532 new_peers.insert(acting[i]);
2533 }
2534 for (unsigned i=0; i<up.size(); i++) {
2535 if (up[i] != CRUSH_ITEM_NONE)
2536 new_peers.insert(up[i]);
2537 }
2538 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2539 p != peer_info.end();
2540 ++p)
2541 new_peers.insert(p->first.osd);
2542
2543 bool need_update = false;
2544 heartbeat_peer_lock.Lock();
2545 if (new_peers == heartbeat_peers) {
2546 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2547 } else {
2548 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2549 heartbeat_peers.swap(new_peers);
2550 need_update = true;
2551 }
2552 heartbeat_peer_lock.Unlock();
2553
2554 if (need_update)
2555 osd->need_heartbeat_peer_update();
2556 }
2557
2558
2559 bool PG::check_in_progress_op(
2560 const osd_reqid_t &r,
2561 eversion_t *version,
2562 version_t *user_version,
2563 int *return_code) const
2564 {
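// Used for duplicate-op detection: look the reqid up in the in-flight
// projected log first, then in the persisted pg log.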
2565 return (
2566 projected_log.get_request(r, version, user_version, return_code) ||
2567 pg_log.get_log().get_request(r, version, user_version, return_code));
2568 }
2569
2570 void PG::_update_calc_stats()
2571 {
2572 info.stats.version = info.last_update;
2573 info.stats.created = info.history.epoch_created;
2574 info.stats.last_scrub = info.history.last_scrub;
2575 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2576 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2577 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2578 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2579 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2580
2581 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2582 info.stats.ondisk_log_size = info.stats.log_size;
2583 info.stats.log_start = pg_log.get_tail();
2584 info.stats.ondisk_log_start = pg_log.get_tail();
2585 info.stats.snaptrimq_len = snap_trimq.size();
2586
2587 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
2588
2589 // In the rare case that upset is too large (usually transient), use it as the
2590 // target for the calculations below.
2591 unsigned target = std::max(num_shards, (unsigned)upset.size());
2592 // Not sure this could ever happen (actingset > upset), which only
2593 // matters if actingset > num_shards.
2594 unsigned nrep = std::max(actingset.size(), upset.size());
2595 // calc num_object_copies
2596 info.stats.stats.calc_copies(MAX(target, nrep));
2597 info.stats.stats.sum.num_objects_degraded = 0;
2598 info.stats.stats.sum.num_objects_unfound = 0;
2599 info.stats.stats.sum.num_objects_misplaced = 0;
2600 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2601 dout(20) << __func__ << " actingset " << actingset << " upset "
2602 << upset << " actingbackfill " << actingbackfill << dendl;
2603 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
2604
2605 assert(!actingbackfill.empty());
2606
2607 // NOTE: we only generate degraded, misplaced and unfound
2608 // values for the summation, not individual stat categories.
2609 int64_t num_objects = info.stats.stats.sum.num_objects;
2610
2611 // Objects missing from up nodes, sorted by # objects.
2612 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
2613 // Objects missing from nodes not in up, sorted by # objects
2614 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
2615
2616 int64_t missing;
2617
2618 // Primary first
2619 missing = pg_log.get_missing().num_missing();
2620 assert(actingbackfill.count(pg_whoami));
2621 if (upset.count(pg_whoami)) {
2622 missing_target_objects.insert(make_pair(missing, pg_whoami));
2623 } else {
2624 acting_source_objects.insert(make_pair(missing, pg_whoami));
2625 }
2626 info.stats.stats.sum.num_objects_missing_on_primary = missing;
2627
2628 // All other peers
2629 for (auto& peer : peer_info) {
2630 // Ignore other peers until we add code to look at detailed missing
2631 // information. (recovery)
2632 if (!actingbackfill.count(peer.first)) {
2633 continue;
2634 }
2635 missing = 0;
2636 // Backfill targets always track num_objects accurately;
2637 // all other peers track missing accurately.
2638 if (is_backfill_targets(peer.first)) {
2639 missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
2640 } else {
2641 if (peer_missing.count(peer.first)) {
2642 missing = peer_missing[peer.first].num_missing();
2643 } else {
2644 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
2645 }
2646 }
2647 if (upset.count(peer.first)) {
2648 missing_target_objects.insert(make_pair(missing, peer.first));
2649 } else {
2650 acting_source_objects.insert(make_pair(missing, peer.first));
2651 }
2652 peer.second.stats.stats.sum.num_objects_missing = missing;
2653 }
2654
2655 if (pool.info.is_replicated()) {
2656 // Add to missing_target_objects up to target elements (num_objects missing)
2657 assert(target >= missing_target_objects.size());
2658 unsigned needed = target - missing_target_objects.size();
2659 for (; needed; --needed)
2660 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
2661 } else {
2662 for (unsigned i = 0 ; i < num_shards; ++i) {
2663 shard_id_t shard(i);
2664 bool found = false;
2665 for (const auto& t : missing_target_objects) {
2666 if (std::get<1>(t).shard == shard) {
2667 found = true;
2668 break;
2669 }
2670 }
2671 if (!found)
2672 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
2673 }
2674 }
2675
2676 for (const auto& item : missing_target_objects)
2677 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2678 for (const auto& item : acting_source_objects)
2679 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2680
2681 // A misplaced object is not stored on the correct OSD
2682 int64_t misplaced = 0;
2683 // a degraded object has fewer replicas or EC shards than the pool specifies.
2684 int64_t degraded = 0;
2685
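// Walk the up (target) shards from most to least missing and pair each one
// with an acting-only (source) shard: objects the source still holds are
// counted as misplaced (present, but on the wrong OSD), the rest as degraded
// (fewer copies than the pool wants). For EC pools, sources are matched to
// targets by shard id.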
2686 for (auto m = missing_target_objects.rbegin();
2687 m != missing_target_objects.rend(); ++m) {
2688
2689 int64_t extra_missing = -1;
2690
2691 if (pool.info.is_replicated()) {
2692 if (!acting_source_objects.empty()) {
2693 auto extra_copy = acting_source_objects.begin();
2694 extra_missing = std::get<0>(*extra_copy);
2695 acting_source_objects.erase(extra_copy);
2696 }
2697 } else { // Erasure coded
2698 // Use corresponding shard
2699 for (const auto& a : acting_source_objects) {
2700 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
2701 extra_missing = std::get<0>(a);
2702 acting_source_objects.erase(a);
2703 break;
2704 }
2705 }
2706 }
2707
2708 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
2709 // We don't know which of the objects on the target
2710 // are part of extra_missing, so assume they are all degraded.
2711 misplaced += std::get<0>(*m) - extra_missing;
2712 degraded += extra_missing;
2713 } else {
2714 // 1. extra_missing == -1: more targets than sources, so degraded
2715 // 2. extra_missing > std::get<0>(*m): some objects that were previously
2716 // degraded are now present on the target.
2717 degraded += std::get<0>(*m);
2718 }
2719 }
2720 // If there are still acting shards that haven't been accounted for,
2721 // their objects are misplaced.
2722 for (const auto& a : acting_source_objects) {
2723 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
2724 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
2725 misplaced += extra_misplaced;
2726 }
2727 dout(20) << __func__ << " degraded " << degraded << dendl;
2728 dout(20) << __func__ << " misplaced " << misplaced << dendl;
2729
2730 info.stats.stats.sum.num_objects_degraded = degraded;
2731 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2732 info.stats.stats.sum.num_objects_misplaced = misplaced;
2733 }
2734 }
2735
2736 void PG::_update_blocked_by()
2737 {
2738 // set a max on the number of blocking peers we report. if we go
2739 // over, report a random subset. keep the result sorted.
2740 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2741 unsigned skip = blocked_by.size() - keep;
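// Walk the sorted set, keeping each entry with probability keep/(skip+keep);
// this picks a random subset while preserving the sorted order.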
2742 info.stats.blocked_by.clear();
2743 info.stats.blocked_by.resize(keep);
2744 unsigned pos = 0;
2745 for (set<int>::iterator p = blocked_by.begin();
2746 p != blocked_by.end() && keep > 0;
2747 ++p) {
2748 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2749 --skip;
2750 } else {
2751 info.stats.blocked_by[pos++] = *p;
2752 --keep;
2753 }
2754 }
2755 }
2756
2757 void PG::publish_stats_to_osd()
2758 {
2759 if (!is_primary())
2760 return;
2761
2762 pg_stats_publish_lock.Lock();
2763
2764 if (info.stats.stats.sum.num_scrub_errors)
2765 state_set(PG_STATE_INCONSISTENT);
2766 else
2767 state_clear(PG_STATE_INCONSISTENT);
2768
2769 utime_t now = ceph_clock_now();
2770 if (info.stats.state != state) {
2771 info.stats.last_change = now;
2772 // Optimistic estimate: if we just found out the PG is inactive,
2773 // assume it was active until now.
2774 if (!(state & PG_STATE_ACTIVE) &&
2775 (info.stats.state & PG_STATE_ACTIVE))
2776 info.stats.last_active = now;
2777
2778 if ((state & PG_STATE_ACTIVE) &&
2779 !(info.stats.state & PG_STATE_ACTIVE))
2780 info.stats.last_became_active = now;
2781 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2782 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2783 info.stats.last_became_peered = now;
2784 if (!(state & PG_STATE_CREATING) &&
2785 (info.stats.state & PG_STATE_CREATING)) {
2786 osd->send_pg_created(get_pgid().pgid);
2787 }
2788 info.stats.state = state;
2789 }
2790
2791 _update_calc_stats();
2792 if (info.stats.stats.sum.num_objects_degraded) {
2793 state_set(PG_STATE_DEGRADED);
2794 } else {
2795 state_clear(PG_STATE_DEGRADED);
2796 }
2797 _update_blocked_by();
2798
2799 bool publish = false;
2800 pg_stat_t pre_publish = info.stats;
2801 pre_publish.stats.add(unstable_stats);
2802 utime_t cutoff = now;
2803 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
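// Skip the publish if nothing has changed and the last report is newer than
// osd_pg_stat_report_interval_max seconds.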
2804 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2805 info.stats.last_fresh > cutoff) {
2806 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2807 << ": no change since " << info.stats.last_fresh << dendl;
2808 } else {
2809 // update our stat summary and timestamps
2810 info.stats.reported_epoch = get_osdmap()->get_epoch();
2811 ++info.stats.reported_seq;
2812
2813 info.stats.last_fresh = now;
2814
2815 if (info.stats.state & PG_STATE_CLEAN)
2816 info.stats.last_clean = now;
2817 if (info.stats.state & PG_STATE_ACTIVE)
2818 info.stats.last_active = now;
2819 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2820 info.stats.last_peered = now;
2821 info.stats.last_unstale = now;
2822 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2823 info.stats.last_undegraded = now;
2824 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2825 info.stats.last_fullsized = now;
2826
2827 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2828 // care of this by sending MMonMgrReport to mon.
2829 publish =
2830 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2831 pg_stats_publish_valid = true;
2832 pg_stats_publish = pre_publish;
2833
2834 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2835 << ":" << pg_stats_publish.reported_seq << dendl;
2836 }
2837 pg_stats_publish_lock.Unlock();
2838
2839 if (publish)
2840 osd->pg_stat_queue_enqueue(this);
2841 }
2842
2843 void PG::clear_publish_stats()
2844 {
2845 dout(15) << "clear_stats" << dendl;
2846 pg_stats_publish_lock.Lock();
2847 pg_stats_publish_valid = false;
2848 pg_stats_publish_lock.Unlock();
2849
2850 osd->pg_stat_queue_dequeue(this);
2851 }
2852
2853 /**
2854 * initialize a newly instantiated pg
2855 *
2856 * Initialize PG state, as when a PG is initially created, or when it
2857 * is first instantiated on the current node.
2858 *
2859 * @param role our role/rank
2860 * @param newup up set
2861 * @param newacting acting set
2862 * @param history pg history
2863 * @param pi past_intervals
2864 * @param backfill true if info should be marked as backfill
2865 * @param t transaction to write out our new state in
2866 */
2867 void PG::init(
2868 int role,
2869 const vector<int>& newup, int new_up_primary,
2870 const vector<int>& newacting, int new_acting_primary,
2871 const pg_history_t& history,
2872 const PastIntervals& pi,
2873 bool backfill,
2874 ObjectStore::Transaction *t)
2875 {
2876 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2877 << " history " << history
2878 << " past_intervals " << pi
2879 << dendl;
2880
2881 set_role(role);
2882 acting = newacting;
2883 up = newup;
2884 init_primary_up_acting(
2885 newup,
2886 newacting,
2887 new_up_primary,
2888 new_acting_primary);
2889
2890 info.history = history;
2891 past_intervals = pi;
2892
2893 info.stats.up = up;
2894 info.stats.up_primary = new_up_primary;
2895 info.stats.acting = acting;
2896 info.stats.acting_primary = new_acting_primary;
2897 info.stats.mapping_epoch = info.history.same_interval_since;
2898
2899 if (backfill) {
2900 dout(10) << __func__ << ": Setting backfill" << dendl;
2901 info.set_last_backfill(hobject_t());
2902 info.last_complete = info.last_update;
2903 pg_log.mark_log_for_rewrite();
2904 }
2905
2906 on_new_interval();
2907
2908 dirty_info = true;
2909 dirty_big_info = true;
2910 write_if_dirty(*t);
2911 }
2912
2913 #pragma GCC diagnostic ignored "-Wpragmas"
2914 #pragma GCC diagnostic push
2915 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2916
2917 void PG::upgrade(ObjectStore *store)
2918 {
2919 assert(info_struct_v <= 10);
2920 ObjectStore::Transaction t;
2921
2922 assert(info_struct_v >= 7);
2923
2924 // 7 -> 8
2925 if (info_struct_v <= 7) {
2926 pg_log.mark_log_for_rewrite();
2927 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2928 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2929 t.remove(coll_t::meta(), log_oid);
2930 t.remove(coll_t::meta(), biginfo_oid);
2931 t.touch(coll, pgmeta_oid);
2932 }
2933
2934 // 8 -> 9
2935 if (info_struct_v <= 8) {
2936 // no special action needed.
2937 }
2938
2939 // 9 -> 10
2940 if (info_struct_v <= 9) {
2941 // previous versions weren't (as) aggressively clearing past_intervals
2942 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2943 dout(20) << __func__ << " clearing past_intervals" << dendl;
2944 past_intervals.clear();
2945 }
2946 }
2947
2948 // update infover_key
2949 if (info_struct_v < cur_struct_v) {
2950 map<string,bufferlist> v;
2951 __u8 ver = cur_struct_v;
2952 ::encode(ver, v[infover_key]);
2953 t.omap_setkeys(coll, pgmeta_oid, v);
2954 }
2955
2956 dirty_info = true;
2957 dirty_big_info = true;
2958 write_if_dirty(t);
2959
2960 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2961 ObjectStore::Sequencer>("upgrade"));
2962 int r = store->apply_transaction(osr.get(), std::move(t));
2963 if (r != 0) {
2964 derr << __func__ << ": apply_transaction returned "
2965 << cpp_strerror(r) << dendl;
2966 ceph_abort();
2967 }
2968 assert(r == 0);
2969
2970 C_SaferCond waiter;
2971 if (!osr->flush_commit(&waiter)) {
2972 waiter.wait();
2973 }
2974 }
2975
2976 #pragma GCC diagnostic pop
2977 #pragma GCC diagnostic warning "-Wpragmas"
2978
2979 int PG::_prepare_write_info(CephContext* cct,
2980 map<string,bufferlist> *km,
2981 epoch_t epoch,
2982 pg_info_t &info, pg_info_t &last_written_info,
2983 PastIntervals &past_intervals,
2984 bool dirty_big_info,
2985 bool dirty_epoch,
2986 bool try_fast_info,
2987 PerfCounters *logger)
2988 {
2989 if (dirty_epoch) {
2990 ::encode(epoch, (*km)[epoch_key]);
2991 }
2992
2993 if (logger)
2994 logger->inc(l_osd_pg_info);
2995
2996 // try to write just the fast-changing part of info (pg_fast_info_t)?
2997 if (!dirty_big_info && try_fast_info &&
2998 info.last_update > last_written_info.last_update) {
2999 pg_fast_info_t fast;
3000 fast.populate_from(info);
3001 bool did = fast.try_apply_to(&last_written_info);
3002 assert(did); // we verified last_update increased above
3003 if (info == last_written_info) {
3004 ::encode(fast, (*km)[fastinfo_key]);
3005 if (logger)
3006 logger->inc(l_osd_pg_fastinfo);
3007 return 0;
3008 }
3009 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3010 {
3011 JSONFormatter jf(true);
3012 jf.dump_object("info", info);
3013 jf.flush(*_dout);
3014 }
3015 {
3016 *_dout << "\nlast_written_info:\n";
3017 JSONFormatter jf(true);
3018 jf.dump_object("last_written_info", last_written_info);
3019 jf.flush(*_dout);
3020 }
3021 *_dout << dendl;
3022 }
3023 last_written_info = info;
3024
3025 // info. store purged_snaps separately.
3026 interval_set<snapid_t> purged_snaps;
3027 purged_snaps.swap(info.purged_snaps);
3028 ::encode(info, (*km)[info_key]);
3029 purged_snaps.swap(info.purged_snaps);
3030
3031 if (dirty_big_info) {
3032 // potentially big stuff
3033 bufferlist& bigbl = (*km)[biginfo_key];
3034 ::encode(past_intervals, bigbl);
3035 ::encode(info.purged_snaps, bigbl);
3036 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3037 if (logger)
3038 logger->inc(l_osd_pg_biginfo);
3039 }
3040
3041 return 0;
3042 }
3043
3044 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3045 {
3046 coll_t coll(pgid);
3047 t.create_collection(coll, bits);
3048 }
3049
3050 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3051 {
3052 coll_t coll(pgid);
3053
3054 if (pool) {
3055 // Give a hint to the PG collection
3056 bufferlist hint;
3057 uint32_t pg_num = pool->get_pg_num();
3058 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3059 ::encode(pg_num, hint);
3060 ::encode(expected_num_objects_pg, hint);
3061 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3062 t.collection_hint(coll, hint_type, hint);
3063 }
3064
3065 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3066 t.touch(coll, pgmeta_oid);
3067 map<string,bufferlist> values;
3068 __u8 struct_v = cur_struct_v;
3069 ::encode(struct_v, values[infover_key]);
3070 t.omap_setkeys(coll, pgmeta_oid, values);
3071 }
3072
3073 void PG::prepare_write_info(map<string,bufferlist> *km)
3074 {
3075 info.stats.stats.add(unstable_stats);
3076 unstable_stats.clear();
3077
3078 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3079 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3080 info,
3081 last_written_info,
3082 past_intervals,
3083 dirty_big_info, need_update_epoch,
3084 cct->_conf->osd_fast_info,
3085 osd->logger);
3086 assert(ret == 0);
3087 if (need_update_epoch)
3088 last_epoch = get_osdmap()->get_epoch();
3089 last_persisted_osdmap_ref = osdmap_ref;
3090
3091 dirty_info = false;
3092 dirty_big_info = false;
3093 }
3094
3095 #pragma GCC diagnostic ignored "-Wpragmas"
3096 #pragma GCC diagnostic push
3097 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3098
3099 bool PG::_has_removal_flag(ObjectStore *store,
3100 spg_t pgid)
3101 {
3102 coll_t coll(pgid);
3103 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3104
3105 // first try new way
3106 set<string> keys;
3107 keys.insert("_remove");
3108 map<string,bufferlist> values;
3109 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3110 values.size() == 1)
3111 return true;
3112
3113 return false;
3114 }
3115
3116 int PG::peek_map_epoch(ObjectStore *store,
3117 spg_t pgid,
3118 epoch_t *pepoch,
3119 bufferlist *bl)
3120 {
3121 coll_t coll(pgid);
3122 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3123 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3124 epoch_t cur_epoch = 0;
3125
3126 assert(bl);
3127 {
3128 // validate collection name
3129 assert(coll.is_pg());
3130 }
3131
3132 // try for v8
3133 set<string> keys;
3134 keys.insert(infover_key);
3135 keys.insert(epoch_key);
3136 map<string,bufferlist> values;
3137 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3138 if (r == 0) {
3139 assert(values.size() == 2);
3140
3141 // sanity check version
3142 bufferlist::iterator bp = values[infover_key].begin();
3143 __u8 struct_v = 0;
3144 ::decode(struct_v, bp);
3145 assert(struct_v >= 8);
3146
3147 // get epoch
3148 bp = values[epoch_key].begin();
3149 ::decode(cur_epoch, bp);
3150 } else {
3151 // probably bug 10617; see OSD::load_pgs()
3152 return -1;
3153 }
3154
3155 *pepoch = cur_epoch;
3156 return 0;
3157 }
3158
3159 #pragma GCC diagnostic pop
3160 #pragma GCC diagnostic warning "-Wpragmas"
3161
3162 void PG::write_if_dirty(ObjectStore::Transaction& t)
3163 {
3164 map<string,bufferlist> km;
3165 if (dirty_big_info || dirty_info)
3166 prepare_write_info(&km);
3167 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3168 if (!km.empty())
3169 t.omap_setkeys(coll, pgmeta_oid, km);
3170 }
3171
3172 void PG::trim_log()
3173 {
3174 assert(is_primary());
3175 calc_trim_to();
3176 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3177 if (pg_trim_to != eversion_t()) {
3178 // inform peers to trim log
3179 assert(!actingbackfill.empty());
3180 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3181 i != actingbackfill.end();
3182 ++i) {
3183 if (*i == pg_whoami) continue;
3184 osd->send_message_osd_cluster(
3185 i->osd,
3186 new MOSDPGTrim(
3187 get_osdmap()->get_epoch(),
3188 spg_t(info.pgid.pgid, i->shard),
3189 pg_trim_to),
3190 get_osdmap()->get_epoch());
3191 }
3192
3193 // trim primary as well
3194 pg_log.trim(pg_trim_to, info);
3195 dirty_info = true;
3196 }
3197 }
3198
3199 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3200 {
3201 // raise last_complete only if we were previously up to date
3202 if (info.last_complete == info.last_update)
3203 info.last_complete = e.version;
3204
3205 // raise last_update.
3206 assert(e.version > info.last_update);
3207 info.last_update = e.version;
3208
3209 // raise user_version, if it increased (it may have not get bumped
3210 // by all logged updates)
3211 if (e.user_version > info.last_user_version)
3212 info.last_user_version = e.user_version;
3213
3214 // log mutation
3215 pg_log.add(e, applied);
3216 dout(10) << "add_log_entry " << e << dendl;
3217 }
3218
3219
3220 void PG::append_log(
3221 const vector<pg_log_entry_t>& logv,
3222 eversion_t trim_to,
3223 eversion_t roll_forward_to,
3224 ObjectStore::Transaction &t,
3225 bool transaction_applied)
3226 {
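// Apply a batch of new log entries: update the snap mapper, append to our
// pg log (rolling forward rollback state where required), trim to trim_to,
// and persist whatever became dirty via the provided transaction.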
3227 if (transaction_applied)
3228 update_snap_map(logv, t);
3229
3230 /* The primary has sent an info updating the history, but it may not
3231 * have arrived yet. We want to make sure that we cannot remember this
3232 * write without remembering that it happened in an interval which went
3233 * active in epoch history.last_epoch_started.
3234 */
3235 if (info.last_epoch_started != info.history.last_epoch_started) {
3236 info.history.last_epoch_started = info.last_epoch_started;
3237 }
3238 if (info.last_interval_started != info.history.last_interval_started) {
3239 info.history.last_interval_started = info.last_interval_started;
3240 }
3241 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3242
3243 PGLogEntryHandler handler{this, &t};
3244 if (!transaction_applied) {
3245 /* We must be a backfill peer, so it's ok if we apply
3246 * out-of-turn since we won't be considered when
3247 * determining a min possible last_update.
3248 */
3249 pg_log.roll_forward(&handler);
3250 }
3251
3252 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3253 p != logv.end();
3254 ++p) {
3255 add_log_entry(*p, transaction_applied);
3256
3257 /* We don't want to leave the rollforward artifacts around
3258 * here past last_backfill. It's ok for the same reason as
3259 * above */
3260 if (transaction_applied &&
3261 p->soid > info.last_backfill) {
3262 pg_log.roll_forward(&handler);
3263 }
3264 }
3265 auto last = logv.rbegin();
3266 if (is_primary() && last != logv.rend()) {
3267 projected_log.skip_can_rollback_to_to_head();
3268 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3269 }
3270
3271 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3272 pg_log.roll_forward_to(
3273 roll_forward_to,
3274 &handler);
3275 t.register_on_applied(
3276 new C_UpdateLastRollbackInfoTrimmedToApplied(
3277 this,
3278 get_osdmap()->get_epoch(),
3279 roll_forward_to));
3280 }
3281
3282 pg_log.trim(trim_to, info);
3283
3284 // update the local pg, pg log
3285 dirty_info = true;
3286 write_if_dirty(t);
3287 }
3288
3289 bool PG::check_log_for_corruption(ObjectStore *store)
3290 {
3291 /// TODO: this method needs to work with the omap log
3292 return true;
3293 }
3294
3295 //! Get the name we're going to save our corrupt pg log as
3296 std::string PG::get_corrupt_pg_log_name() const
3297 {
3298 const int MAX_BUF = 512;
3299 char buf[MAX_BUF];
3300 struct tm tm_buf;
3301 time_t my_time(time(NULL));
3302 const struct tm *t = localtime_r(&my_time, &tm_buf);
3303 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3304 if (ret == 0) {
3305 dout(0) << "strftime failed" << dendl;
3306 return "corrupt_log_unknown_time";
3307 }
3308 string out(buf);
3309 out += stringify(info.pgid);
3310 return out;
3311 }
3312
3313 int PG::read_info(
3314 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3315 pg_info_t &info, PastIntervals &past_intervals,
3316 __u8 &struct_v)
3317 {
3318 // try for v8 or later
3319 set<string> keys;
3320 keys.insert(infover_key);
3321 keys.insert(info_key);
3322 keys.insert(biginfo_key);
3323 keys.insert(fastinfo_key);
3324 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3325 map<string,bufferlist> values;
3326 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3327 if (r == 0) {
3328 assert(values.size() == 3 ||
3329 values.size() == 4);
3330
3331 bufferlist::iterator p = values[infover_key].begin();
3332 ::decode(struct_v, p);
3333 assert(struct_v >= 8);
3334
3335 p = values[info_key].begin();
3336 ::decode(info, p);
3337
3338 p = values[biginfo_key].begin();
3339 if (struct_v >= 10) {
3340 ::decode(past_intervals, p);
3341 } else {
3342 past_intervals.decode_classic(p);
3343 }
3344 ::decode(info.purged_snaps, p);
3345
3346 p = values[fastinfo_key].begin();
3347 if (!p.end()) {
3348 pg_fast_info_t fast;
3349 ::decode(fast, p);
3350 fast.try_apply_to(&info);
3351 }
3352 return 0;
3353 }
3354
3355 // legacy (ver < 8)
3356 ghobject_t infos_oid(OSD::make_infos_oid());
3357 bufferlist::iterator p = bl.begin();
3358 ::decode(struct_v, p);
3359 assert(struct_v == 7);
3360
3361 // get info out of leveldb
3362 string k = get_info_key(info.pgid);
3363 string bk = get_biginfo_key(info.pgid);
3364 keys.clear();
3365 keys.insert(k);
3366 keys.insert(bk);
3367 values.clear();
3368 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3369 assert(values.size() == 2);
3370
3371 p = values[k].begin();
3372 ::decode(info, p);
3373
3374 p = values[bk].begin();
3375 ::decode(past_intervals, p);
3376 interval_set<snapid_t> snap_collections; // obsolete
3377 ::decode(snap_collections, p);
3378 ::decode(info.purged_snaps, p);
3379 return 0;
3380 }
3381
3382 void PG::read_state(ObjectStore *store, bufferlist &bl)
3383 {
3384 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3385 info_struct_v);
3386 assert(r >= 0);
3387
3388 last_written_info = info;
3389
3390 // if we are upgrading from jewel, we need to force rebuild of
3391 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3392 // (before kraken). persisted missing set was circa
3393 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3394 // v8 was pre-jewel (per-pg meta object).
3395 bool force_rebuild_missing = info_struct_v < 9;
3396 if (force_rebuild_missing) {
3397 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3398 << dendl;
3399 }
3400
3401 ostringstream oss;
3402 pg_log.read_log_and_missing(
3403 store,
3404 coll,
3405 info_struct_v < 8 ? coll_t::meta() : coll,
3406 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3407 info,
3408 force_rebuild_missing,
3409 oss,
3410 cct->_conf->osd_ignore_stale_divergent_priors,
3411 cct->_conf->osd_debug_verify_missing_on_start);
3412 if (oss.tellp())
3413 osd->clog->error() << oss.str();
3414
3415 if (force_rebuild_missing) {
3416 dout(10) << __func__ << " forced rebuild of missing got "
3417 << pg_log.get_missing()
3418 << dendl;
3419 }
3420
3421 // log any weirdness
3422 log_weirdness();
3423 }
3424
3425 void PG::log_weirdness()
3426 {
3427 if (pg_log.get_tail() != info.log_tail)
3428 osd->clog->error() << info.pgid
3429 << " info mismatch, log.tail " << pg_log.get_tail()
3430 << " != info.log_tail " << info.log_tail;
3431 if (pg_log.get_head() != info.last_update)
3432 osd->clog->error() << info.pgid
3433 << " info mismatch, log.head " << pg_log.get_head()
3434 << " != info.last_update " << info.last_update;
3435
3436 if (!pg_log.get_log().empty()) {
3437 // sloppy check
3438 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3439 osd->clog->error() << info.pgid
3440 << " log bound mismatch, info (tail,head] ("
3441 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3442 << " actual ["
3443 << pg_log.get_log().log.begin()->version << ","
3444 << pg_log.get_log().log.rbegin()->version << "]";
3445 }
3446
3447 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3448 osd->clog->error() << info.pgid
3449 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3450 << " > log size " << pg_log.get_log().log.size();
3451 }
3452 }
3453
3454 void PG::update_snap_map(
3455 const vector<pg_log_entry_t> &log_entries,
3456 ObjectStore::Transaction &t)
3457 {
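// Keep the snap mapper (snap -> object index) in sync with the log entries:
// drop mappings for deletes, add them for clones/promotes, and update them
// for modifies that change an object's snap membership.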
3458 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3459 i != log_entries.end();
3460 ++i) {
3461 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3462 if (i->soid.snap < CEPH_MAXSNAP) {
3463 if (i->is_delete()) {
3464 int r = snap_mapper.remove_oid(
3465 i->soid,
3466 &_t);
3467 assert(r == 0);
3468 } else if (i->is_update()) {
3469 assert(i->snaps.length() > 0);
3470 vector<snapid_t> snaps;
3471 bufferlist snapbl = i->snaps;
3472 bufferlist::iterator p = snapbl.begin();
3473 try {
3474 ::decode(snaps, p);
3475 } catch (...) {
3476 derr << __func__ << " decode snaps failure on " << *i << dendl;
3477 snaps.clear();
3478 }
3479 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3480
3481 if (i->is_clone() || i->is_promote()) {
3482 snap_mapper.add_oid(
3483 i->soid,
3484 _snaps,
3485 &_t);
3486 } else if (i->is_modify()) {
3487 assert(i->is_modify());
3488 int r = snap_mapper.update_snaps(
3489 i->soid,
3490 _snaps,
3491 0,
3492 &_t);
3493 assert(r == 0);
3494 } else {
3495 assert(i->is_clean());
3496 }
3497 }
3498 }
3499 }
3500 }
3501
3502 /**
3503 * filter trimming|trimmed snaps out of snapcontext
3504 */
3505 void PG::filter_snapc(vector<snapid_t> &snaps)
3506 {
3507 // nothing needs trimming; we can return immediately
3508 if(snap_trimq.empty() && info.purged_snaps.empty())
3509 return;
3510
3511 bool filtering = false;
3512 vector<snapid_t> newsnaps;
3513 for (vector<snapid_t>::iterator p = snaps.begin();
3514 p != snaps.end();
3515 ++p) {
3516 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3517 if (!filtering) {
3518 // start building a new vector with what we've seen so far
3519 dout(10) << "filter_snapc filtering " << snaps << dendl;
3520 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3521 filtering = true;
3522 }
3523 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3524 } else {
3525 if (filtering)
3526 newsnaps.push_back(*p); // continue building new vector
3527 }
3528 }
3529 if (filtering) {
3530 snaps.swap(newsnaps);
3531 dout(10) << "filter_snapc result " << snaps << dendl;
3532 }
3533 }
3534
3535 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3536 {
3537 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3538 it != m.end();
3539 ++it)
3540 requeue_ops(it->second);
3541 m.clear();
3542 }
3543
3544 void PG::requeue_op(OpRequestRef op)
3545 {
3546 auto p = waiting_for_map.find(op->get_source());
3547 if (p != waiting_for_map.end()) {
3548 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3549 << dendl;
3550 p->second.push_front(op);
3551 } else {
3552 dout(20) << __func__ << " " << op << dendl;
3553 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3554 }
3555 }
3556
3557 void PG::requeue_ops(list<OpRequestRef> &ls)
3558 {
3559 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3560 i != ls.rend();
3561 ++i) {
3562 auto p = waiting_for_map.find((*i)->get_source());
3563 if (p != waiting_for_map.end()) {
3564 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3565 << ")" << dendl;
3566 p->second.push_front(*i);
3567 } else {
3568 dout(20) << __func__ << " " << *i << dendl;
3569 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3570 }
3571 }
3572 ls.clear();
3573 }
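// Note on ordering: requeue_ops() walks the list in reverse and pushes each
// op to the front (push_front()/enqueue_front()), so after requeueing the ops
// sit at the head of their queue in the same relative order they had before.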
3574
3575 void PG::requeue_map_waiters()
3576 {
3577 epoch_t epoch = get_osdmap()->get_epoch();
3578 auto p = waiting_for_map.begin();
3579 while (p != waiting_for_map.end()) {
3580 if (epoch < p->second.front()->min_epoch) {
3581 dout(20) << __func__ << " " << p->first << " front op "
3582 << p->second.front() << " must still wait, doing nothing"
3583 << dendl;
3584 ++p;
3585 } else {
3586 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3587 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3588 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3589 }
3590 p = waiting_for_map.erase(p);
3591 }
3592 }
3593 }
3594
3595
3596 // ==========================================================================================
3597 // SCRUB
3598
3599 /*
3600 * when holding the pg lock and sched_scrub_lock, the states are:
3601 * scheduling:
3602 * scrubber.reserved = true
3603 * scrubber.reserved_peers includes whoami
3604 * osd->scrub_pending++
3605 * scheduling, replica declined:
3606 * scrubber.reserved = true
3607 * scrubber.reserved_peers includes -1
3608 * osd->scrub_pending++
3609 * pending:
3610 * scrubber.reserved = true
3611 * scrubber.reserved_peers.size() == acting.size();
3612 * pg on scrub_wq
3613 * osd->scrub_pending++
3614 * scrubbing:
3615 * scrubber.reserved = false;
3616 * scrubber.reserved_peers empty
3617 * osd->scrubber.active++
3618 */
3619
3620 // returns true if a scrub has been newly kicked off
3621 bool PG::sched_scrub()
3622 {
3623 bool nodeep_scrub = false;
3624 assert(is_locked());
3625 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3626 return false;
3627 }
3628
3629 double deep_scrub_interval = 0;
3630 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3631 if (deep_scrub_interval <= 0) {
3632 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3633 }
3634 bool time_for_deep = ceph_clock_now() >=
3635 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3636
3637 bool deep_coin_flip = false;
3638 // Only add random deep scrubs when NOT a user-initiated scrub
3639 if (!scrubber.must_scrub)
3640 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3641 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3642
3643 time_for_deep = (time_for_deep || deep_coin_flip);
3644
3645 // NODEEP_SCRUB is set, so ignore time-initiated deep-scrubs
3646 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3647 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3648 time_for_deep = false;
3649 nodeep_scrub = true;
3650 }
3651
3652 if (!scrubber.must_scrub) {
3653 assert(!scrubber.must_deep_scrub);
3654
3655 // NOSCRUB is set, so skip regular scrubs
3656 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3657 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3658 if (scrubber.reserved) {
3659 // cancel the scrub if it is still being scheduled, so PGs from
3660 // other pools where scrubbing is still allowed have a chance
3661 // to go ahead with their scrubs.
3662 clear_scrub_reserved();
3663 scrub_unreserve_replicas();
3664 }
3665 return false;
3666 }
3667 }
3668
3669 if (cct->_conf->osd_scrub_auto_repair
3670 && get_pgbackend()->auto_repair_supported()
3671 && time_for_deep
3672 // respect the user's command, and do not auto-repair
3673 && !scrubber.must_repair
3674 && !scrubber.must_scrub
3675 && !scrubber.must_deep_scrub) {
3676 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3677 scrubber.auto_repair = true;
3678 } else {
3679 // this happens when the user issues a scrub/repair command while
3680 // the scrub/repair is being scheduled (e.g. requesting reservations)
3681 scrubber.auto_repair = false;
3682 }
3683
3684 bool ret = true;
3685 if (!scrubber.reserved) {
3686 assert(scrubber.reserved_peers.empty());
3687 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3688 osd->inc_scrubs_pending()) {
3689 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3690 scrubber.reserved = true;
3691 scrubber.reserved_peers.insert(pg_whoami);
3692 scrub_reserve_replicas();
3693 } else {
3694 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3695 ret = false;
3696 }
3697 }
3698 if (scrubber.reserved) {
3699 if (scrubber.reserve_failed) {
3700 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3701 clear_scrub_reserved();
3702 scrub_unreserve_replicas();
3703 ret = false;
3704 } else if (scrubber.reserved_peers.size() == acting.size()) {
3705 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3706 if (time_for_deep) {
3707 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3708 state_set(PG_STATE_DEEP_SCRUB);
3709 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3710 if (!nodeep_scrub) {
3711 osd->clog->info() << "osd." << osd->whoami
3712 << " pg " << info.pgid
3713 << " Deep scrub errors, upgrading scrub to deep-scrub";
3714 state_set(PG_STATE_DEEP_SCRUB);
3715 } else if (!scrubber.must_scrub) {
3716 osd->clog->error() << "osd." << osd->whoami
3717 << " pg " << info.pgid
3718 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3719 clear_scrub_reserved();
3720 scrub_unreserve_replicas();
3721 return false;
3722 } else {
3723 osd->clog->error() << "osd." << osd->whoami
3724 << " pg " << info.pgid
3725 << " Regular scrub request, deep-scrub details will be lost";
3726 }
3727 }
3728 queue_scrub();
3729 } else {
3730 // none declined, since scrubber.reserved is set
3731 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3732 }
3733 }
3734
3735 return ret;
3736 }
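// Worked example of the deep-scrub decision in sched_scrub() above (the value
// is illustrative, not necessarily the default): with
// osd_deep_scrub_randomize_ratio = 0.15 the coin flip succeeds when
// rand() % 100 < 15, so roughly 15% of scheduled scrubs are upgraded to deep
// even before last_deep_scrub_stamp + deep_scrub_interval has elapsed; a
// user-initiated scrub (scrubber.must_scrub) never takes this random path.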
3737
3738 void PG::reg_next_scrub()
3739 {
3740 if (!is_primary())
3741 return;
3742
3743 utime_t reg_stamp;
3744 if (scrubber.must_scrub ||
3745 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3746 reg_stamp = ceph_clock_now();
3747 } else {
3748 reg_stamp = info.history.last_scrub_stamp;
3749 }
3750 // note down the sched_time, so we can locate this scrub, and remove it
3751 // later on.
3752 double scrub_min_interval = 0, scrub_max_interval = 0;
3753 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3754 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3755 assert(scrubber.scrub_reg_stamp == utime_t());
3756 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3757 reg_stamp,
3758 scrub_min_interval,
3759 scrub_max_interval,
3760 scrubber.must_scrub);
3761 }
3762
3763 void PG::unreg_next_scrub()
3764 {
3765 if (is_primary()) {
3766 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3767 scrubber.scrub_reg_stamp = utime_t();
3768 }
3769 }
3770
3771 void PG::do_replica_scrub_map(OpRequestRef op)
3772 {
3773 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3774 dout(7) << __func__ << " " << *m << dendl;
3775 if (m->map_epoch < info.history.same_interval_since) {
3776 dout(10) << __func__ << " discarding old from "
3777 << m->map_epoch << " < " << info.history.same_interval_since
3778 << dendl;
3779 return;
3780 }
3781 if (!scrubber.is_chunky_scrub_active()) {
3782 dout(10) << __func__ << " scrub isn't active" << dendl;
3783 return;
3784 }
3785
3786 op->mark_started();
3787
3788 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3789 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3790 dout(10) << "map version is "
3791 << scrubber.received_maps[m->from].valid_through
3792 << dendl;
3793
3794 dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom
3795 << dendl;
3796 assert(scrubber.waiting_on_whom.count(m->from));
3797 scrubber.waiting_on_whom.erase(m->from);
3798 if (m->preempted) {
3799 dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
3800 scrub_preempted = true;
3801 }
3802 if (scrubber.waiting_on_whom.empty()) {
3803 if (ops_blocked_by_scrub()) {
3804 requeue_scrub(true);
3805 } else {
3806 requeue_scrub(false);
3807 }
3808 }
3809 }
3810
3811 void PG::sub_op_scrub_map(OpRequestRef op)
3812 {
3813 // for legacy jewel compatibility only
3814 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3815 assert(m->get_type() == MSG_OSD_SUBOP);
3816 dout(7) << "sub_op_scrub_map" << dendl;
3817
3818 if (m->map_epoch < info.history.same_interval_since) {
3819 dout(10) << "sub_op_scrub discarding old sub_op from "
3820 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3821 return;
3822 }
3823
3824 if (!scrubber.is_chunky_scrub_active()) {
3825 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3826 return;
3827 }
3828
3829 op->mark_started();
3830
3831 dout(10) << " got " << m->from << " scrub map" << dendl;
3832 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3833
3834 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3835 dout(10) << "map version is "
3836 << scrubber.received_maps[m->from].valid_through
3837 << dendl;
3838
3839 scrubber.waiting_on_whom.erase(m->from);
3840
3841 if (scrubber.waiting_on_whom.empty()) {
3842 if (ops_blocked_by_scrub()) {
3843 requeue_scrub(true);
3844 } else {
3845 requeue_scrub(false);
3846 }
3847 }
3848 }
3849
3850 // send scrub v3 messages (chunky scrub)
3851 void PG::_request_scrub_map(
3852 pg_shard_t replica, eversion_t version,
3853 hobject_t start, hobject_t end,
3854 bool deep,
3855 bool allow_preemption)
3856 {
3857 assert(replica != pg_whoami);
3858 dout(10) << "scrub requesting scrubmap from osd." << replica
3859 << " deep " << (int)deep << dendl;
3860 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3861 spg_t(info.pgid.pgid, replica.shard), version,
3862 get_osdmap()->get_epoch(),
3863 get_last_peering_reset(),
3864 start, end, deep,
3865 allow_preemption,
3866 scrubber.priority,
3867 ops_blocked_by_scrub());
3868 // default priority, we want the rep scrub processed prior to any recovery
3869 // or client io messages (we are holding a lock!)
3870 osd->send_message_osd_cluster(
3871 replica.osd, repscrubop, get_osdmap()->get_epoch());
3872 }
3873
3874 void PG::handle_scrub_reserve_request(OpRequestRef op)
3875 {
3876 dout(7) << __func__ << " " << *op->get_req() << dendl;
3877 op->mark_started();
3878 if (scrubber.reserved) {
3879 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3880 << dendl;
3881 return;
3882 }
3883 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3884 osd->inc_scrubs_pending()) {
3885 scrubber.reserved = true;
3886 } else {
3887 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
3888 scrubber.reserved = false;
3889 }
3890 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3891 const MOSDScrubReserve *m =
3892 static_cast<const MOSDScrubReserve*>(op->get_req());
3893 Message *reply = new MOSDScrubReserve(
3894 spg_t(info.pgid.pgid, primary.shard),
3895 m->map_epoch,
3896 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3897 pg_whoami);
3898 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3899 } else {
3900 // for jewel compat only
3901 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3902 assert(req->get_type() == MSG_OSD_SUBOP);
3903 MOSDSubOpReply *reply = new MOSDSubOpReply(
3904 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3905 ::encode(scrubber.reserved, reply->get_data());
3906 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3907 }
3908 }
3909
3910 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3911 {
3912 dout(7) << __func__ << " " << *op->get_req() << dendl;
3913 op->mark_started();
3914 if (!scrubber.reserved) {
3915 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3916 return;
3917 }
3918 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3919 dout(10) << " already had osd." << from << " reserved" << dendl;
3920 } else {
3921 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3922 scrubber.reserved_peers.insert(from);
3923 sched_scrub();
3924 }
3925 }
3926
3927 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3928 {
3929 dout(7) << __func__ << " " << *op->get_req() << dendl;
3930 op->mark_started();
3931 if (!scrubber.reserved) {
3932 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3933 return;
3934 }
3935 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3936 dout(10) << " already had osd." << from << " reserved" << dendl;
3937 } else {
3938 /* One decline stops this pg from being scheduled for scrubbing. */
3939 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3940 scrubber.reserve_failed = true;
3941 sched_scrub();
3942 }
3943 }
3944
3945 void PG::handle_scrub_reserve_release(OpRequestRef op)
3946 {
3947 dout(7) << __func__ << " " << *op->get_req() << dendl;
3948 op->mark_started();
3949 clear_scrub_reserved();
3950 }
3951
3952 void PG::reject_reservation()
3953 {
3954 osd->send_message_osd_cluster(
3955 primary.osd,
3956 new MBackfillReserve(
3957 MBackfillReserve::REJECT,
3958 spg_t(info.pgid.pgid, primary.shard),
3959 get_osdmap()->get_epoch()),
3960 get_osdmap()->get_epoch());
3961 }
3962
3963 void PG::schedule_backfill_retry(float delay)
3964 {
3965 Mutex::Locker lock(osd->recovery_request_lock);
3966 osd->recovery_request_timer.add_event_after(
3967 delay,
3968 new QueuePeeringEvt<RequestBackfill>(
3969 this, get_osdmap()->get_epoch(),
3970 RequestBackfill()));
3971 }
3972
3973 void PG::schedule_recovery_retry(float delay)
3974 {
3975 Mutex::Locker lock(osd->recovery_request_lock);
3976 osd->recovery_request_timer.add_event_after(
3977 delay,
3978 new QueuePeeringEvt<DoRecovery>(
3979 this, get_osdmap()->get_epoch(),
3980 DoRecovery()));
3981 }
3982
3983 void PG::clear_scrub_reserved()
3984 {
3985 scrubber.reserved_peers.clear();
3986 scrubber.reserve_failed = false;
3987
3988 if (scrubber.reserved) {
3989 scrubber.reserved = false;
3990 osd->dec_scrubs_pending();
3991 }
3992 }
3993
3994 void PG::scrub_reserve_replicas()
3995 {
3996 assert(backfill_targets.empty());
3997 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3998 i != actingbackfill.end();
3999 ++i) {
4000 if (*i == pg_whoami) continue;
4001 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
4002 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4003 osd->send_message_osd_cluster(
4004 i->osd,
4005 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4006 get_osdmap()->get_epoch(),
4007 MOSDScrubReserve::REQUEST, pg_whoami),
4008 get_osdmap()->get_epoch());
4009 } else {
4010 // for jewel compat only
4011 vector<OSDOp> scrub(1);
4012 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
4013 hobject_t poid;
4014 eversion_t v;
4015 osd_reqid_t reqid;
4016 MOSDSubOp *subop = new MOSDSubOp(
4017 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4018 get_osdmap()->get_epoch(), osd->get_tid(), v);
4019 subop->ops = scrub;
4020 osd->send_message_osd_cluster(
4021 i->osd, subop, get_osdmap()->get_epoch());
4022 }
4023 }
4024 }
4025
4026 void PG::scrub_unreserve_replicas()
4027 {
4028 assert(backfill_targets.empty());
4029 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4030 i != actingbackfill.end();
4031 ++i) {
4032 if (*i == pg_whoami) continue;
4033 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4034 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4035 osd->send_message_osd_cluster(
4036 i->osd,
4037 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4038 get_osdmap()->get_epoch(),
4039 MOSDScrubReserve::RELEASE, pg_whoami),
4040 get_osdmap()->get_epoch());
4041 } else {
4042 // for jewel compat only
4043 vector<OSDOp> scrub(1);
4044 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
4045 hobject_t poid;
4046 eversion_t v;
4047 osd_reqid_t reqid;
4048 MOSDSubOp *subop = new MOSDSubOp(
4049 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4050 get_osdmap()->get_epoch(), osd->get_tid(), v);
4051 subop->ops = scrub;
4052 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
4053 }
4054 }
4055 }
4056
4057 void PG::_scan_rollback_obs(
4058 const vector<ghobject_t> &rollback_obs,
4059 ThreadPool::TPHandle &handle)
4060 {
4061 ObjectStore::Transaction t;
4062 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4063 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4064 i != rollback_obs.end();
4065 ++i) {
4066 if (i->generation < trimmed_to.version) {
4067 osd->clog->error() << "osd." << osd->whoami
4068 << " pg " << info.pgid
4069 << " found obsolete rollback obj "
4070 << *i << " generation < trimmed_to "
4071 << trimmed_to
4072 << "...repaired";
4073 t.remove(coll, *i);
4074 }
4075 }
4076 if (!t.empty()) {
4077 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4078 << dendl;
4079 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4080 }
4081 }
4082
4083 void PG::_scan_snaps(ScrubMap &smap)
4084 {
4085 hobject_t head;
4086 SnapSet snapset;
4087
4088 // The test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4089 // that the caller went through clean_meta_map() and that it works properly.
4090 dout(20) << __func__ << " start" << dendl;
4091
4092 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4093 i != smap.objects.rend();
4094 ++i) {
4095 const hobject_t &hoid = i->first;
4096 ScrubMap::object &o = i->second;
4097
4098 dout(20) << __func__ << " " << hoid << dendl;
4099
4100 if (hoid.is_head() || hoid.is_snapdir()) {
4101 // parse the SnapSet
4102 bufferlist bl;
4103 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4104 continue;
4105 }
4106 bl.push_back(o.attrs[SS_ATTR]);
4107 auto p = bl.begin();
4108 try {
4109 ::decode(snapset, p);
4110 } catch(...) {
4111 continue;
4112 }
4113 head = hoid.get_head();
4114 // Make sure head_exists is correct for is_legacy() check
4115 if (hoid.is_head())
4116 snapset.head_exists = true;
4117 continue;
4118 }
4119 if (hoid.snap < CEPH_MAXSNAP) {
4120 // check and if necessary fix snap_mapper
4121 if (hoid.get_head() != head) {
4122 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4123 << dendl;
4124 continue;
4125 }
4126 set<snapid_t> obj_snaps;
4127 if (!snapset.is_legacy()) {
4128 auto p = snapset.clone_snaps.find(hoid.snap);
4129 if (p == snapset.clone_snaps.end()) {
4130 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4131 << dendl;
4132 continue;
4133 }
4134 obj_snaps.insert(p->second.begin(), p->second.end());
4135 } else {
4136 bufferlist bl;
4137 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4138 continue;
4139 }
4140 bl.push_back(o.attrs[OI_ATTR]);
4141 object_info_t oi;
4142 try {
4143 oi.decode(bl);
4144 } catch(...) {
4145 continue;
4146 }
4147 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4148 }
4149 set<snapid_t> cur_snaps;
4150 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4151 if (r != 0 && r != -ENOENT) {
4152 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4153 ceph_abort();
4154 }
4155 if (r == -ENOENT || cur_snaps != obj_snaps) {
4156 ObjectStore::Transaction t;
4157 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4158 if (r == 0) {
4159 r = snap_mapper.remove_oid(hoid, &_t);
4160 if (r != 0) {
4161 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4162 << dendl;
4163 ceph_abort();
4164 }
4165 osd->clog->error() << "osd." << osd->whoami
4166 << " found snap mapper error on pg "
4167 << info.pgid
4168 << " oid " << hoid << " snaps in mapper: "
4169 << cur_snaps << ", oi: "
4170 << obj_snaps
4171 << "...repaired";
4172 } else {
4173 osd->clog->error() << "osd." << osd->whoami
4174 << " found snap mapper error on pg "
4175 << info.pgid
4176 << " oid " << hoid << " snaps missing in mapper"
4177 << ", should be: "
4178 << obj_snaps
4179 << "...repaired";
4180 }
4181 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4182
4183 // wait for repair to apply to avoid confusing other bits of the system.
4184 {
4185 Cond my_cond;
4186 Mutex my_lock("PG::_scan_snaps my_lock");
4187 int r = 0;
4188 bool done;
4189 t.register_on_applied_sync(
4190 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4191 r = osd->store->apply_transaction(osr.get(), std::move(t));
4192 if (r != 0) {
4193 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4194 << dendl;
4195 } else {
4196 my_lock.Lock();
4197 while (!done)
4198 my_cond.Wait(my_lock);
4199 my_lock.Unlock();
4200 }
4201 }
4202 }
4203 }
4204 }
4205 }
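// Illustrative repair scenario for _scan_snaps() above (object name and snap
// ids are made up): if clone "foo":4 carries clone_snaps {3,4} in its SnapSet
// but snap_mapper.get_snaps() returns only {3}, the stale mapping is removed,
// add_oid("foo":4, {3,4}) is queued in the same transaction, a
// "found snap mapper error ... repaired" cluster log error is emitted, and the
// thread blocks until the repair transaction has been applied.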
4206
4207 void PG::_repair_oinfo_oid(ScrubMap &smap)
4208 {
4209 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4210 i != smap.objects.rend();
4211 ++i) {
4212 const hobject_t &hoid = i->first;
4213 ScrubMap::object &o = i->second;
4214
4215 bufferlist bl;
4216 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4217 continue;
4218 }
4219 bl.push_back(o.attrs[OI_ATTR]);
4220 object_info_t oi;
4221 try {
4222 oi.decode(bl);
4223 } catch(...) {
4224 continue;
4225 }
4226 if (oi.soid != hoid) {
4227 ObjectStore::Transaction t;
4228 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4229 osd->clog->error() << "osd." << osd->whoami
4230 << " found object info error on pg "
4231 << info.pgid
4232 << " oid " << hoid << " oid in object info: "
4233 << oi.soid
4234 << "...repaired";
4235 // Fix object info
4236 oi.soid = hoid;
4237 bl.clear();
4238 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4239
4240 bufferptr bp(bl.c_str(), bl.length());
4241 o.attrs[OI_ATTR] = bp;
4242
4243 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4244 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4245 if (r != 0) {
4246 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4247 << dendl;
4248 }
4249 }
4250 }
4251 }
4252 int PG::build_scrub_map_chunk(
4253 ScrubMap &map,
4254 ScrubMapBuilder &pos,
4255 hobject_t start,
4256 hobject_t end,
4257 bool deep,
4258 ThreadPool::TPHandle &handle)
4259 {
4260 dout(10) << __func__ << " [" << start << "," << end << ") "
4261 << " pos " << pos
4262 << dendl;
4263
4264 // start
4265 while (pos.empty()) {
4266 pos.deep = deep;
4267 map.valid_through = info.last_update;
4268 osr->flush();
4269
4270 // objects
4271 vector<ghobject_t> rollback_obs;
4272 pos.ret = get_pgbackend()->objects_list_range(
4273 start,
4274 end,
4275 0,
4276 &pos.ls,
4277 &rollback_obs);
4278 if (pos.ret < 0) {
4279 dout(5) << "objects_list_range error: " << pos.ret << dendl;
4280 return pos.ret;
4281 }
4282 if (pos.ls.empty()) {
4283 break;
4284 }
4285 _scan_rollback_obs(rollback_obs, handle);
4286 pos.pos = 0;
4287 return -EINPROGRESS;
4288 }
4289
4290 // scan objects
4291 while (!pos.done()) {
4292 int r = get_pgbackend()->be_scan_list(map, pos);
4293 if (r == -EINPROGRESS) {
4294 return r;
4295 }
4296 }
4297
4298 // finish
4299 dout(20) << __func__ << " finishing" << dendl;
4300 assert(pos.done());
4301 _repair_oinfo_oid(map);
4302 if (!is_primary()) {
4303 ScrubMap for_meta_scrub;
4304 // In case we restarted with a smaller chunk, clear old data
4305 scrubber.cleaned_meta_map.clear_from(scrubber.start);
4306 scrubber.cleaned_meta_map.insert(map);
4307 scrubber.clean_meta_map(for_meta_scrub);
4308 _scan_snaps(for_meta_scrub);
4309 }
4310
4311 dout(20) << __func__ << " done, got " << map.objects.size() << " items"
4312 << dendl;
4313 return 0;
4314 }
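// Usage sketch for build_scrub_map_chunk() above (illustrative only; the real
// callers are the BUILD_MAP and BUILD_MAP_REPLICA states in chunky_scrub()):
// the function is resumable -- it returns -EINPROGRESS after listing the
// objects and after each partially scanned batch, and the caller is expected
// to requeue itself (requeue_scrub()) and call again with the same
// ScrubMapBuilder until it returns 0 or a real error.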
4315
4316 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4317 if (!store)
4318 return;
4319 struct OnComplete : Context {
4320 std::unique_ptr<Scrub::Store> store;
4321 OnComplete(
4322 std::unique_ptr<Scrub::Store> &&store)
4323 : store(std::move(store)) {}
4324 void finish(int) override {}
4325 };
4326 store->cleanup(t);
4327 t->register_on_complete(new OnComplete(std::move(store)));
4328 assert(!store);
4329 }
4330
4331 void PG::repair_object(
4332 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4333 pg_shard_t bad_peer)
4334 {
4335 list<pg_shard_t> op_shards;
4336 for (auto i : *ok_peers) {
4337 op_shards.push_back(i.second);
4338 }
4339 dout(10) << "repair_object " << soid << " bad_peer osd."
4340 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4341 ScrubMap::object &po = ok_peers->back().first;
4342 eversion_t v;
4343 bufferlist bv;
4344 bv.push_back(po.attrs[OI_ATTR]);
4345 object_info_t oi;
4346 try {
4347 bufferlist::iterator bliter = bv.begin();
4348 ::decode(oi, bliter);
4349 } catch (...) {
4350 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4351 assert(0);
4352 }
4353 if (bad_peer != primary) {
4354 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4355 } else {
4356 // We should only be scrubbing if the PG is clean.
4357 assert(waiting_for_unreadable_object.empty());
4358
4359 pg_log.missing_add(soid, oi.version, eversion_t());
4360
4361 pg_log.set_last_requested(0);
4362 dout(10) << __func__ << ": primary = " << primary << dendl;
4363 }
4364
4365 if (is_ec_pg() || bad_peer == primary) {
4366 // we'd better collect all shards for an EC pg, and prepare the good peers
4367 // as the source of the pull in the case of a replicated pg.
4368 missing_loc.add_missing(soid, oi.version, eversion_t());
4369 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4370 for (i = ok_peers->begin();
4371 i != ok_peers->end();
4372 ++i)
4373 missing_loc.add_location(soid, i->second);
4374 }
4375 }
4376
4377 /* replica_scrub
4378 *
4379 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4380 * for pushes to complete in case of recent recovery. Build a single
4381 * scrubmap of objects that are in the range [msg->start, msg->end).
4382 */
4383 void PG::replica_scrub(
4384 OpRequestRef op,
4385 ThreadPool::TPHandle &handle)
4386 {
4387 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4388 assert(!scrubber.active_rep_scrub);
4389 dout(7) << "replica_scrub" << dendl;
4390
4391 if (msg->map_epoch < info.history.same_interval_since) {
4392 dout(10) << "replica_scrub discarding old replica_scrub from "
4393 << msg->map_epoch << " < " << info.history.same_interval_since
4394 << dendl;
4395 return;
4396 }
4397
4398 assert(msg->chunky);
4399 if (last_update_applied < msg->scrub_to) {
4400 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4401 scrubber.active_rep_scrub = op;
4402 return;
4403 }
4404
4405 if (active_pushes > 0) {
4406 dout(10) << "waiting for active pushes to finish" << dendl;
4407 scrubber.active_rep_scrub = op;
4408 return;
4409 }
4410
4411 scrubber.state = Scrubber::BUILD_MAP_REPLICA;
4412 scrubber.replica_scrub_start = msg->min_epoch;
4413 scrubber.start = msg->start;
4414 scrubber.end = msg->end;
4415 scrubber.max_end = msg->end;
4416 scrubber.deep = msg->deep;
4417 scrubber.epoch_start = info.history.same_interval_since;
4418 if (msg->priority) {
4419 scrubber.priority = msg->priority;
4420 } else {
4421 scrubber.priority = get_scrub_priority();
4422 }
4423
4424 scrub_can_preempt = msg->allow_preemption;
4425 scrub_preempted = false;
4426 scrubber.replica_scrubmap_pos.reset();
4427
4428 requeue_scrub(msg->high_priority);
4429 }
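// Note: when the replica is not ready (last_update_applied still behind
// msg->scrub_to, or active pushes in flight), the request is parked in
// scrubber.active_rep_scrub above and is expected to be requeued once the
// blocking condition clears, rather than being rejected.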
4430
4431 /* Scrub:
4432 * PG_STATE_SCRUBBING is set when the scrub is queued
4433 *
4434 * scrub will be chunky if all OSDs in PG support chunky scrub
4435 * scrub will fail if OSDs are too old.
4436 */
4437 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4438 {
4439 if (cct->_conf->osd_scrub_sleep > 0 &&
4440 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4441 scrubber.state == PG::Scrubber::INACTIVE) &&
4442 scrubber.needs_sleep) {
4443 ceph_assert(!scrubber.sleeping);
4444 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4445
4446 // Do an async sleep so we don't block the op queue
4447 OSDService *osds = osd;
4448 spg_t pgid = get_pgid();
4449 int state = scrubber.state;
4450 auto scrub_requeue_callback =
4451 new FunctionContext([osds, pgid, state](int r) {
4452 PG *pg = osds->osd->lookup_lock_pg(pgid);
4453 if (pg == nullptr) {
4454 lgeneric_dout(osds->osd->cct, 20)
4455 << "scrub_requeue_callback: Could not find "
4456 << "PG " << pgid << " can't complete scrub requeue after sleep"
4457 << dendl;
4458 return;
4459 }
4460 pg->scrubber.sleeping = false;
4461 pg->scrubber.needs_sleep = false;
4462 lgeneric_dout(pg->cct, 20)
4463 << "scrub_requeue_callback: slept for "
4464 << ceph_clock_now() - pg->scrubber.sleep_start
4465 << ", re-queuing scrub with state " << state << dendl;
4466 pg->scrub_queued = false;
4467 pg->requeue_scrub();
4468 pg->scrubber.sleep_start = utime_t();
4469 pg->unlock();
4470 });
4471 Mutex::Locker l(osd->scrub_sleep_lock);
4472 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4473 scrub_requeue_callback);
4474 scrubber.sleeping = true;
4475 scrubber.sleep_start = ceph_clock_now();
4476 return;
4477 }
4478 if (pg_has_reset_since(queued)) {
4479 return;
4480 }
4481 assert(scrub_queued);
4482 scrub_queued = false;
4483 scrubber.needs_sleep = true;
4484
4485 // for the replica
4486 if (!is_primary() &&
4487 scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) {
4488 chunky_scrub(handle);
4489 return;
4490 }
4491
4492 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4493 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4494 state_clear(PG_STATE_SCRUBBING);
4495 state_clear(PG_STATE_REPAIR);
4496 state_clear(PG_STATE_DEEP_SCRUB);
4497 publish_stats_to_osd();
4498 return;
4499 }
4500
4501 if (!scrubber.active) {
4502 assert(backfill_targets.empty());
4503
4504 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4505
4506 dout(10) << "starting a new chunky scrub" << dendl;
4507 }
4508
4509 chunky_scrub(handle);
4510 }
4511
4512 /*
4513 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4514 * chunk.
4515 *
4516 * The object store is partitioned into chunks which end on hash boundaries. For
4517 * each chunk, the following logic is performed:
4518 *
4519 * (1) Block writes on the chunk
4520 * (2) Request maps from replicas
4521 * (3) Wait for pushes to be applied (after recovery)
4522 * (4) Wait for writes to flush on the chunk
4523 * (5) Wait for maps from replicas
4524 * (6) Compare / repair all scrub maps
4525 * (7) Wait for digest updates to apply
4526 *
4527 * This logic is encoded in the mostly linear state machine:
4528 *
4529 * +------------------+
4530 * _________v__________ |
4531 * | | |
4532 * | INACTIVE | |
4533 * |____________________| |
4534 * | |
4535 * | +----------+ |
4536 * _________v___v______ | |
4537 * | | | |
4538 * | NEW_CHUNK | | |
4539 * |____________________| | |
4540 * | | |
4541 * _________v__________ | |
4542 * | | | |
4543 * | WAIT_PUSHES | | |
4544 * |____________________| | |
4545 * | | |
4546 * _________v__________ | |
4547 * | | | |
4548 * | WAIT_LAST_UPDATE | | |
4549 * |____________________| | |
4550 * | | |
4551 * _________v__________ | |
4552 * | | | |
4553 * | BUILD_MAP | | |
4554 * |____________________| | |
4555 * | | |
4556 * _________v__________ | |
4557 * | | | |
4558 * | WAIT_REPLICAS | | |
4559 * |____________________| | |
4560 * | | |
4561 * _________v__________ | |
4562 * | | | |
4563 * | COMPARE_MAPS | | |
4564 * |____________________| | |
4565 * | | |
4566 * | | |
4567 * _________v__________ | |
4568 * | | | |
4569 * |WAIT_DIGEST_UPDATES | | |
4570 * |____________________| | |
4571 * | | | |
4572 * | +----------+ |
4573 * _________v__________ |
4574 * | | |
4575 * | FINISH | |
4576 * |____________________| |
4577 * | |
4578 * +------------------+
4579 *
4580 * The primary determines the last update affecting the subset by walking the log. If
4581 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4582 * to wait until that update is applied before building a scrub map. Both the
4583 * primary and replicas will wait for any active pushes to be applied.
4584 *
4585 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4586 *
4587 * scrubber.state encodes the current state of the scrub (refer to state diagram
4588 * for details).
4589 */
4590 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4591 {
4592 // check for map changes
4593 if (scrubber.is_chunky_scrub_active()) {
4594 if (scrubber.epoch_start != info.history.same_interval_since) {
4595 dout(10) << "scrub pg changed, aborting" << dendl;
4596 scrub_clear_state();
4597 scrub_unreserve_replicas();
4598 return;
4599 }
4600 }
4601
4602 bool done = false;
4603 int ret;
4604
4605 while (!done) {
4606 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4607 << " [" << scrubber.start << "," << scrubber.end << ")"
4608 << " max_end " << scrubber.max_end << dendl;
4609
4610 switch (scrubber.state) {
4611 case PG::Scrubber::INACTIVE:
4612 dout(10) << "scrub start" << dendl;
4613 assert(is_primary());
4614
4615 publish_stats_to_osd();
4616 scrubber.epoch_start = info.history.same_interval_since;
4617 scrubber.active = true;
4618
4619 osd->inc_scrubs_active(scrubber.reserved);
4620 if (scrubber.reserved) {
4621 scrubber.reserved = false;
4622 scrubber.reserved_peers.clear();
4623 }
4624
4625 {
4626 ObjectStore::Transaction t;
4627 scrubber.cleanup_store(&t);
4628 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4629 info.pgid, coll));
4630 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4631 }
4632
4633 // Don't include temporary objects when scrubbing
4634 scrubber.start = info.pgid.pgid.get_hobj_start();
4635 scrubber.state = PG::Scrubber::NEW_CHUNK;
4636
4637 {
4638 bool repair = state_test(PG_STATE_REPAIR);
4639 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4640 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4641 stringstream oss;
4642 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4643 osd->clog->debug(oss);
4644 }
4645
4646 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
4647 "osd_scrub_max_preemptions");
4648 scrubber.preempt_divisor = 1;
4649 break;
4650
4651 case PG::Scrubber::NEW_CHUNK:
4652 scrubber.primary_scrubmap = ScrubMap();
4653 scrubber.received_maps.clear();
4654
4655 // begin (possible) preemption window
4656 if (scrub_preempted) {
4657 scrubber.preempt_left--;
4658 scrubber.preempt_divisor *= 2;
4659 dout(10) << __func__ << " preempted, " << scrubber.preempt_left
4660 << " left" << dendl;
4661 scrub_preempted = false;
4662 }
4663 scrub_can_preempt = scrubber.preempt_left > 0;
4664
4665 {
4666 /* get the start and end of our scrub chunk
4667 *
4668 * Our scrub chunk has an important restriction we're going to need to
4669 * respect. We can't let head or snapdir be start or end.
4670 * Using a half-open interval means that if end == head|snapdir,
4671 * we'd scrub/lock head and the clone right next to head in different
4672 * chunks which would allow us to miss clones created between
4673 * scrubbing that chunk and scrubbing the chunk including head.
4674 * This isn't true for any of the other clones since clones can
4675 * only be created "just to the left of" head. There is one exception
4676 * to this: promotion of clones which always happens to the left of the
4677 * left-most clone, but promote_object checks the scrubber in that
4678 * case, so it should be ok. Also, it's ok to "miss" clones at the
4679 * left end of the range if we are a tier because they may legitimately
4680 * not exist (see _scrub).
4681 */
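// For example (illustrative object names): if the listing ends with clones of
// "foo" and candidate_end comes back as foo:head, the loop below walks
// candidate_end back onto the last listed clone so that head never becomes the
// chunk end; if candidate_end is still a head/snapdir after that (none of its
// clones were listed), it is moved back to the object boundary
// (get_object_boundary()) so the whole object falls into the next chunk.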
4682 int min = std::max<int64_t>(3, cct->_conf->osd_scrub_chunk_min /
4683 scrubber.preempt_divisor);
4684 int max = std::max<int64_t>(min, cct->_conf->osd_scrub_chunk_max /
4685 scrubber.preempt_divisor);
4686 hobject_t start = scrubber.start;
4687 hobject_t candidate_end;
4688 vector<hobject_t> objects;
4689 osr->flush();
4690 ret = get_pgbackend()->objects_list_partial(
4691 start,
4692 min,
4693 max,
4694 &objects,
4695 &candidate_end);
4696 assert(ret >= 0);
4697
4698 if (!objects.empty()) {
4699 hobject_t back = objects.back();
4700 while (candidate_end.has_snapset() &&
4701 candidate_end.get_head() == back.get_head()) {
4702 candidate_end = back;
4703 objects.pop_back();
4704 if (objects.empty()) {
4705 assert(0 ==
4706 "Somehow we got more than 2 objects which"
4707 "have the same head but are not clones");
4708 }
4709 back = objects.back();
4710 }
4711 if (candidate_end.has_snapset()) {
4712 assert(candidate_end.get_head() != back.get_head());
4713 candidate_end = candidate_end.get_object_boundary();
4714 }
4715 } else {
4716 assert(candidate_end.is_max());
4717 }
4718
4719 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4720 // we'll be requeued by whatever made us unavailable for scrub
4721 dout(10) << __func__ << ": scrub blocked somewhere in range "
4722 << "[" << scrubber.start << ", " << candidate_end << ")"
4723 << dendl;
4724 done = true;
4725 break;
4726 }
4727 scrubber.end = candidate_end;
4728 if (scrubber.end > scrubber.max_end)
4729 scrubber.max_end = scrubber.end;
4730 }
4731
4732 // walk the log to find the latest update that affects our chunk
4733 scrubber.subset_last_update = eversion_t();
4734 for (auto p = projected_log.log.rbegin();
4735 p != projected_log.log.rend();
4736 ++p) {
4737 if (p->soid >= scrubber.start &&
4738 p->soid < scrubber.end) {
4739 scrubber.subset_last_update = p->version;
4740 break;
4741 }
4742 }
4743 if (scrubber.subset_last_update == eversion_t()) {
4744 for (list<pg_log_entry_t>::const_reverse_iterator p =
4745 pg_log.get_log().log.rbegin();
4746 p != pg_log.get_log().log.rend();
4747 ++p) {
4748 if (p->soid >= scrubber.start &&
4749 p->soid < scrubber.end) {
4750 scrubber.subset_last_update = p->version;
4751 break;
4752 }
4753 }
4754 }
4755
4756 // ask replicas to wait until
4757 // last_update_applied >= scrubber.subset_last_update and then scan
4758 scrubber.waiting_on_whom.insert(pg_whoami);
4759
4760 // request maps from replicas
4761 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4762 i != actingbackfill.end();
4763 ++i) {
4764 if (*i == pg_whoami) continue;
4765 _request_scrub_map(*i, scrubber.subset_last_update,
4766 scrubber.start, scrubber.end, scrubber.deep,
4767 scrubber.preempt_left > 0);
4768 scrubber.waiting_on_whom.insert(*i);
4769 }
4770 dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
4771 << dendl;
4772
4773 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4774 break;
4775
4776 case PG::Scrubber::WAIT_PUSHES:
4777 if (active_pushes == 0) {
4778 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4779 } else {
4780 dout(15) << "wait for pushes to apply" << dendl;
4781 done = true;
4782 }
4783 break;
4784
4785 case PG::Scrubber::WAIT_LAST_UPDATE:
4786 if (last_update_applied < scrubber.subset_last_update) {
4787 // will be requeued by op_applied
4788 dout(15) << "wait for writes to flush" << dendl;
4789 done = true;
4790 break;
4791 }
4792
4793 scrubber.state = PG::Scrubber::BUILD_MAP;
4794 scrubber.primary_scrubmap_pos.reset();
4795 break;
4796
4797 case PG::Scrubber::BUILD_MAP:
4798 assert(last_update_applied >= scrubber.subset_last_update);
4799
4800 // build my own scrub map
4801 if (scrub_preempted) {
4802 dout(10) << __func__ << " preempted" << dendl;
4803 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
4804 break;
4805 }
4806 ret = build_scrub_map_chunk(
4807 scrubber.primary_scrubmap,
4808 scrubber.primary_scrubmap_pos,
4809 scrubber.start, scrubber.end,
4810 scrubber.deep,
4811 handle);
4812 if (ret == -EINPROGRESS) {
4813 requeue_scrub();
4814 done = true;
4815 break;
4816 }
4817 scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
4818 break;
4819
4820 case PG::Scrubber::BUILD_MAP_DONE:
4821 if (scrubber.primary_scrubmap_pos.ret < 0) {
4822 dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret
4823 << ", aborting" << dendl;
4824 scrub_clear_state();
4825 scrub_unreserve_replicas();
4826 return;
4827 }
4828 dout(10) << __func__ << " waiting_on_whom was "
4829 << scrubber.waiting_on_whom << dendl;
4830 assert(scrubber.waiting_on_whom.count(pg_whoami));
4831 scrubber.waiting_on_whom.erase(pg_whoami);
4832
4833 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4834 break;
4835
4836 case PG::Scrubber::WAIT_REPLICAS:
4837 if (!scrubber.waiting_on_whom.empty()) {
4838 // will be requeued by sub_op_scrub_map
4839 dout(10) << "wait for replicas to build scrub map" << dendl;
4840 done = true;
4841 break;
4842 }
4843 // end (possible) preemption window
4844 scrub_can_preempt = false;
4845 if (scrub_preempted) {
4846 dout(10) << __func__ << " preempted, restarting chunk" << dendl;
4847 scrubber.state = PG::Scrubber::NEW_CHUNK;
4848 } else {
4849 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4850 }
4851 break;
4852
4853 case PG::Scrubber::COMPARE_MAPS:
4854 assert(last_update_applied >= scrubber.subset_last_update);
4855 assert(scrubber.waiting_on_whom.empty());
4856
4857 scrub_compare_maps();
4858 scrubber.start = scrubber.end;
4859 scrubber.run_callbacks();
4860
4861 // requeue the writes from the chunk that just finished
4862 requeue_ops(waiting_for_scrub);
4863
4864 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4865
4866 // fall-thru
4867
4868 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4869 if (scrubber.num_digest_updates_pending) {
4870 dout(10) << __func__ << " waiting on "
4871 << scrubber.num_digest_updates_pending
4872 << " digest updates" << dendl;
4873 done = true;
4874 break;
4875 }
4876
4877 scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
4878 "osd_scrub_max_preemptions");
4879 scrubber.preempt_divisor = 1;
4880
4881 if (!(scrubber.end.is_max())) {
4882 scrubber.state = PG::Scrubber::NEW_CHUNK;
4883 requeue_scrub();
4884 done = true;
4885 } else {
4886 scrubber.state = PG::Scrubber::FINISH;
4887 }
4888
4889 break;
4890
4891 case PG::Scrubber::FINISH:
4892 scrub_finish();
4893 scrubber.state = PG::Scrubber::INACTIVE;
4894 done = true;
4895
4896 if (!snap_trimq.empty()) {
4897 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4898 snap_trimmer_scrub_complete();
4899 }
4900
4901 break;
4902
4903 case PG::Scrubber::BUILD_MAP_REPLICA:
4904 // build my own scrub map
4905 if (scrub_preempted) {
4906 dout(10) << __func__ << " preempted" << dendl;
4907 ret = 0;
4908 } else {
4909 ret = build_scrub_map_chunk(
4910 scrubber.replica_scrubmap,
4911 scrubber.replica_scrubmap_pos,
4912 scrubber.start, scrubber.end,
4913 scrubber.deep,
4914 handle);
4915 }
4916 if (ret == -EINPROGRESS) {
4917 requeue_scrub();
4918 done = true;
4919 break;
4920 }
4921 // reply
4922 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4923 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4924 spg_t(info.pgid.pgid, get_primary().shard),
4925 scrubber.replica_scrub_start,
4926 pg_whoami);
4927 reply->preempted = scrub_preempted;
4928 ::encode(scrubber.replica_scrubmap, reply->get_data());
4929 osd->send_message_osd_cluster(
4930 get_primary().osd, reply,
4931 scrubber.replica_scrub_start);
4932 } else {
4933 // for jewel compatibility
4934 vector<OSDOp> scrub(1);
4935 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4936 hobject_t poid;
4937 eversion_t v;
4938 osd_reqid_t reqid;
4939 MOSDSubOp *subop = new MOSDSubOp(
4940 reqid,
4941 pg_whoami,
4942 spg_t(info.pgid.pgid, get_primary().shard),
4943 poid,
4944 0,
4945 scrubber.replica_scrub_start,
4946 osd->get_tid(),
4947 v);
4948 ::encode(scrubber.replica_scrubmap, subop->get_data());
4949 subop->ops = scrub;
4950 osd->send_message_osd_cluster(
4951 get_primary().osd, subop,
4952 scrubber.replica_scrub_start);
4953 }
4954 scrub_preempted = false;
4955 scrub_can_preempt = false;
4956 scrubber.state = PG::Scrubber::INACTIVE;
4957 scrubber.replica_scrubmap = ScrubMap();
4958 scrubber.replica_scrubmap_pos = ScrubMapBuilder();
4959 scrubber.start = hobject_t();
4960 scrubber.end = hobject_t();
4961 scrubber.max_end = hobject_t();
4962 done = true;
4963 break;
4964
4965 default:
4966 ceph_abort();
4967 }
4968 }
4969 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4970 << " [" << scrubber.start << "," << scrubber.end << ")"
4971 << " max_end " << scrubber.max_end << dendl;
4972 }
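// Worked example of the preemption back-off in the NEW_CHUNK state above
// (config values are illustrative, not necessarily defaults): with
// osd_scrub_chunk_min = 5 and osd_scrub_chunk_max = 25 the first chunk asks
// for 5..25 objects; after one preemption preempt_divisor becomes 2 (min 3,
// max 12), after a second it becomes 4 (min 3, max 6), so a repeatedly
// preempted scrub retries with progressively smaller chunks until
// preempt_left reaches 0 and further preemption is disallowed.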
4973
4974 bool PG::write_blocked_by_scrub(const hobject_t& soid)
4975 {
4976 if (soid < scrubber.start || soid >= scrubber.end) {
4977 return false;
4978 }
4979 if (scrub_can_preempt) {
4980 if (!scrub_preempted) {
4981 dout(10) << __func__ << " " << soid << " preempted" << dendl;
4982 scrub_preempted = true;
4983 } else {
4984 dout(10) << __func__ << " " << soid << " already preempted" << dendl;
4985 }
4986 return false;
4987 }
4988 return true;
4989 }
4990
4991 bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end)
4992 {
4993 // does [start, end] intersect [scrubber.start, scrubber.max_end)
4994 return (start < scrubber.max_end &&
4995 end >= scrubber.start);
4996 }
4997
4998 void PG::scrub_clear_state()
4999 {
5000 assert(is_locked());
5001 state_clear(PG_STATE_SCRUBBING);
5002 state_clear(PG_STATE_REPAIR);
5003 state_clear(PG_STATE_DEEP_SCRUB);
5004 publish_stats_to_osd();
5005
5006 // active -> nothing.
5007 if (scrubber.active)
5008 osd->dec_scrubs_active();
5009
5010 requeue_ops(waiting_for_scrub);
5011
5012 scrubber.reset();
5013
5014 // type-specific state clear
5015 _scrub_clear_state();
5016 }
5017
5018 void PG::scrub_compare_maps()
5019 {
5020 dout(10) << __func__ << " has maps, analyzing" << dendl;
5021
5022 // construct authoritative scrub map for type specific scrubbing
5023 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
5024 map<hobject_t,
5025 pair<boost::optional<uint32_t>,
5026 boost::optional<uint32_t>>> missing_digest;
5027
5028 map<pg_shard_t, ScrubMap *> maps;
5029 maps[pg_whoami] = &scrubber.primary_scrubmap;
5030
5031 for (const auto& i : actingbackfill) {
5032 if (i == pg_whoami) continue;
5033 dout(2) << __func__ << " replica " << i << " has "
5034 << scrubber.received_maps[i].objects.size()
5035 << " items" << dendl;
5036 maps[i] = &scrubber.received_maps[i];
5037 }
5038
5039 set<hobject_t> master_set;
5040
5041 // Construct master set
5042 for (const auto& map : maps) {
5043 for (const auto& i : map.second->objects) {
5044 master_set.insert(i.first);
5045 }
5046 }
5047
5048 stringstream ss;
5049 get_pgbackend()->be_large_omap_check(maps, master_set,
5050 scrubber.large_omap_objects, ss);
5051 if (!ss.str().empty()) {
5052 osd->clog->warn(ss);
5053 }
5054
5055 if (acting.size() > 1) {
5056 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
5057
5058 // Map from object with errors to good peer
5059 map<hobject_t, list<pg_shard_t>> authoritative;
5060
5061 dout(2) << __func__ << " osd." << acting[0] << " has "
5062 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
5063
5064 ss.str("");
5065 ss.clear();
5066
5067 get_pgbackend()->be_compare_scrubmaps(
5068 maps,
5069 master_set,
5070 state_test(PG_STATE_REPAIR),
5071 scrubber.missing,
5072 scrubber.inconsistent,
5073 authoritative,
5074 missing_digest,
5075 scrubber.shallow_errors,
5076 scrubber.deep_errors,
5077 scrubber.store.get(),
5078 info.pgid, acting,
5079 ss);
5080 dout(2) << ss.str() << dendl;
5081
5082 if (!ss.str().empty()) {
5083 osd->clog->error(ss);
5084 }
5085
5086 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5087 i != authoritative.end();
5088 ++i) {
5089 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
5090 for (list<pg_shard_t>::const_iterator j = i->second.begin();
5091 j != i->second.end();
5092 ++j) {
5093 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
5094 }
5095 scrubber.authoritative.insert(
5096 make_pair(
5097 i->first,
5098 good_peers));
5099 }
5100
5101 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
5102 i != authoritative.end();
5103 ++i) {
5104 scrubber.cleaned_meta_map.objects.erase(i->first);
5105 scrubber.cleaned_meta_map.objects.insert(
5106 *(maps[i->second.back()]->objects.find(i->first))
5107 );
5108 }
5109 }
5110
5111 ScrubMap for_meta_scrub;
5112 scrubber.clean_meta_map(for_meta_scrub);
5113
5114 // ok, do the pg-type specific scrubbing
5115 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
5116 // Called here on the primary; for_meta_scrub may contain authoritative object copies taken from replicas when the primary's own copy was not authoritative
5117 _scan_snaps(for_meta_scrub);
5118 if (!scrubber.store->empty()) {
5119 if (state_test(PG_STATE_REPAIR)) {
5120 dout(10) << __func__ << ": discarding scrub results" << dendl;
5121 scrubber.store->flush(nullptr);
5122 } else {
5123 dout(10) << __func__ << ": updating scrub object" << dendl;
5124 ObjectStore::Transaction t;
5125 scrubber.store->flush(&t);
5126 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
5127 }
5128 }
5129 }
5130
5131 bool PG::scrub_process_inconsistent()
5132 {
5133 dout(10) << __func__ << ": checking authoritative" << dendl;
5134 bool repair = state_test(PG_STATE_REPAIR);
5135 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5136 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5137
5138 // scrubber.authoritative only stores objects which are missing or inconsistent.
5139 if (!scrubber.authoritative.empty()) {
5140 stringstream ss;
5141 ss << info.pgid << " " << mode << " "
5142 << scrubber.missing.size() << " missing, "
5143 << scrubber.inconsistent.size() << " inconsistent objects";
5144 dout(2) << ss.str() << dendl;
5145 osd->clog->error(ss);
5146 if (repair) {
5147 state_clear(PG_STATE_CLEAN);
5148 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
5149 scrubber.authoritative.begin();
5150 i != scrubber.authoritative.end();
5151 ++i) {
5152 set<pg_shard_t>::iterator j;
5153
5154 auto missing_entry = scrubber.missing.find(i->first);
5155 if (missing_entry != scrubber.missing.end()) {
5156 for (j = missing_entry->second.begin();
5157 j != missing_entry->second.end();
5158 ++j) {
5159 repair_object(
5160 i->first,
5161 &(i->second),
5162 *j);
5163 ++scrubber.fixed;
5164 }
5165 }
5166 if (scrubber.inconsistent.count(i->first)) {
5167 for (j = scrubber.inconsistent[i->first].begin();
5168 j != scrubber.inconsistent[i->first].end();
5169 ++j) {
5170 repair_object(i->first,
5171 &(i->second),
5172 *j);
5173 ++scrubber.fixed;
5174 }
5175 }
5176 }
5177 }
5178 }
5179 return (!scrubber.authoritative.empty() && repair);
5180 }
5181
5182 bool PG::ops_blocked_by_scrub() const {
5183 return (waiting_for_scrub.size() != 0);
5184 }
5185
5186 // the part that actually finalizes a scrub
5187 void PG::scrub_finish()
5188 {
5189 bool repair = state_test(PG_STATE_REPAIR);
5190 // if the repair request comes from auto-repair and there are a large number
5191 // of errors, we would like to cancel the auto-repair
5192 if (repair && scrubber.auto_repair
5193 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5194 state_clear(PG_STATE_REPAIR);
5195 repair = false;
5196 }
5197 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5198 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5199
5200 // type-specific finish (can tally more errors)
5201 _scrub_finish();
5202
5203 bool has_error = scrub_process_inconsistent();
5204
5205 {
5206 stringstream oss;
5207 oss << info.pgid.pgid << " " << mode << " ";
5208 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5209 if (total_errors)
5210 oss << total_errors << " errors";
5211 else
5212 oss << "ok";
5213 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5214 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5215 << " remaining deep scrub error details lost)";
5216 if (repair)
5217 oss << ", " << scrubber.fixed << " fixed";
5218 if (total_errors)
5219 osd->clog->error(oss);
5220 else
5221 osd->clog->debug(oss);
5222 }
5223
5224 // finish up
5225 unreg_next_scrub();
5226 utime_t now = ceph_clock_now();
5227 info.history.last_scrub = info.last_update;
5228 info.history.last_scrub_stamp = now;
5229 if (scrubber.deep) {
5230 info.history.last_deep_scrub = info.last_update;
5231 info.history.last_deep_scrub_stamp = now;
5232 }
5233 // Since we don't know which errors were fixed, we can only clear them
5234 // when every one has been fixed.
5235 if (repair) {
5236 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5237 assert(deep_scrub);
5238 scrubber.shallow_errors = scrubber.deep_errors = 0;
5239 } else {
5240 // Deep scrub in order to get corrected error counts
5241 scrub_after_recovery = true;
5242 }
5243 }
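// e.g. (illustrative counts): a repair that fixed only 2 of 3 recorded errors
// leaves the shallow/deep error counters untouched and sets
// scrub_after_recovery, so a later deep scrub can re-derive accurate totals;
// only when fixed == shallow_errors + deep_errors are both counters cleared.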
5244 if (deep_scrub) {
5245 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5246 info.history.last_clean_scrub_stamp = now;
5247 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5248 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5249 info.stats.stats.sum.num_large_omap_objects = scrubber.large_omap_objects;
5250 } else {
5251 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5252 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5253 // because of deep-scrub errors
5254 if (scrubber.shallow_errors == 0)
5255 info.history.last_clean_scrub_stamp = now;
5256 }
5257 info.stats.stats.sum.num_scrub_errors =
5258 info.stats.stats.sum.num_shallow_scrub_errors +
5259 info.stats.stats.sum.num_deep_scrub_errors;
5260 reg_next_scrub();
5261
5262 {
5263 ObjectStore::Transaction t;
5264 dirty_info = true;
5265 write_if_dirty(t);
5266 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5267 assert(tr == 0);
5268 }
5269
5270
5271 if (has_error) {
5272 queue_peering_event(
5273 CephPeeringEvtRef(
5274 std::make_shared<CephPeeringEvt>(
5275 get_osdmap()->get_epoch(),
5276 get_osdmap()->get_epoch(),
5277 DoRecovery())));
5278 }
5279
5280 scrub_clear_state();
5281 scrub_unreserve_replicas();
5282
5283 if (is_active() && is_primary()) {
5284 share_pg_info();
5285 }
5286 }
5287
5288 void PG::share_pg_info()
5289 {
5290 dout(10) << "share_pg_info" << dendl;
5291
5292 // share new pg_info_t with replicas
5293 assert(!actingbackfill.empty());
5294 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5295 i != actingbackfill.end();
5296 ++i) {
5297 if (*i == pg_whoami) continue;
5298 pg_shard_t peer = *i;
5299 if (peer_info.count(peer)) {
5300 peer_info[peer].last_epoch_started = info.last_epoch_started;
5301 peer_info[peer].last_interval_started = info.last_interval_started;
5302 peer_info[peer].history.merge(info.history);
5303 }
5304 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5305 m->pg_list.push_back(
5306 make_pair(
5307 pg_notify_t(
5308 peer.shard, pg_whoami.shard,
5309 get_osdmap()->get_epoch(),
5310 get_osdmap()->get_epoch(),
5311 info),
5312 PastIntervals()));
5313 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5314 }
5315 }
5316
5317 bool PG::append_log_entries_update_missing(
5318 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5319 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5320 boost::optional<eversion_t> roll_forward_to)
5321 {
5322 assert(!entries.empty());
5323 assert(entries.begin()->version > info.last_update);
5324
5325 PGLogEntryHandler rollbacker{this, &t};
5326 bool invalidate_stats =
5327 pg_log.append_new_log_entries(info.last_backfill,
5328 info.last_backfill_bitwise,
5329 entries,
5330 &rollbacker);
5331
5332 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5333 pg_log.roll_forward(&rollbacker);
5334 }
5335 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5336 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5337 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5338 }
5339
5340 info.last_update = pg_log.get_head();
5341
5342 if (pg_log.get_missing().num_missing() == 0) {
5343 // advance last_complete since nothing else is missing!
5344 info.last_complete = info.last_update;
5345 }
5346 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5347
5348 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5349 if (trim_to)
5350 pg_log.trim(*trim_to, info);
5351 dirty_info = true;
5352 write_if_dirty(t);
5353 return invalidate_stats;
5354 }
5355
5356
5357 void PG::merge_new_log_entries(
5358 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5359 ObjectStore::Transaction &t,
5360 boost::optional<eversion_t> trim_to,
5361 boost::optional<eversion_t> roll_forward_to)
5362 {
5363 dout(10) << __func__ << " " << entries << dendl;
5364 assert(is_primary());
5365
5366 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5367 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5368 i != actingbackfill.end();
5369 ++i) {
5370 pg_shard_t peer(*i);
5371 if (peer == pg_whoami) continue;
5372 assert(peer_missing.count(peer));
5373 assert(peer_info.count(peer));
5374 pg_missing_t& pmissing(peer_missing[peer]);
5375 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5376 pg_info_t& pinfo(peer_info[peer]);
5377 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5378 pinfo.last_backfill,
5379 info.last_backfill_bitwise,
5380 entries,
5381 true,
5382 NULL,
5383 pmissing,
5384 NULL,
5385 this);
5386 pinfo.last_update = info.last_update;
5387 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5388 rebuild_missing = rebuild_missing || invalidate_stats;
5389 }
5390
5391 if (!rebuild_missing) {
5392 return;
5393 }
5394
5395 for (auto &&i: entries) {
5396 missing_loc.rebuild(
5397 i.soid,
5398 pg_whoami,
5399 actingbackfill,
5400 info,
5401 pg_log.get_missing(),
5402 peer_missing,
5403 peer_info);
5404 }
5405 }
5406
5407 void PG::update_history(const pg_history_t& new_history)
5408 {
5409 unreg_next_scrub();
5410 if (info.history.merge(new_history)) {
5411 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5412 dirty_info = true;
5413 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5414 dout(20) << __func__ << " clearing past_intervals" << dendl;
5415 past_intervals.clear();
5416 dirty_big_info = true;
5417 }
5418 }
5419 reg_next_scrub();
5420 }
5421
5422 void PG::fulfill_info(
5423 pg_shard_t from, const pg_query_t &query,
5424 pair<pg_shard_t, pg_info_t> &notify_info)
5425 {
5426 assert(from == primary);
5427 assert(query.type == pg_query_t::INFO);
5428
5429 // info
5430 dout(10) << "sending info" << dendl;
5431 notify_info = make_pair(from, info);
5432 }
5433
5434 void PG::fulfill_log(
5435 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5436 {
5437 dout(10) << "log request from " << from << dendl;
5438 assert(from == primary);
5439 assert(query.type != pg_query_t::INFO);
5440 ConnectionRef con = osd->get_con_osd_cluster(
5441 from.osd, get_osdmap()->get_epoch());
5442 if (!con) return;
5443
5444 MOSDPGLog *mlog = new MOSDPGLog(
5445 from.shard, pg_whoami.shard,
5446 get_osdmap()->get_epoch(),
5447 info, query_epoch);
5448 mlog->missing = pg_log.get_missing();
5449
5450 // primary -> other, when building master log
5451 if (query.type == pg_query_t::LOG) {
5452 dout(10) << " sending info+missing+log since " << query.since
5453 << dendl;
5454 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5455 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5456 << " when my log.tail is " << pg_log.get_tail()
5457 << ", sending full log instead";
5458 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5459 } else
5460 mlog->log.copy_after(pg_log.get_log(), query.since);
5461 }
5462 else if (query.type == pg_query_t::FULLLOG) {
5463 dout(10) << " sending info+missing+full log" << dendl;
5464 mlog->log = pg_log.get_log();
5465 }
5466
5467 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5468
5469 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5470 osd->send_message_osd_cluster(mlog, con.get());
5471 }
5472
5473 void PG::fulfill_query(const MQuery& query, RecoveryCtx *rctx)
5474 {
5475 if (query.query.type == pg_query_t::INFO) {
5476 pair<pg_shard_t, pg_info_t> notify_info;
5477 update_history(query.query.history);
5478 fulfill_info(query.from, query.query, notify_info);
5479 rctx->send_notify(
5480 notify_info.first,
5481 pg_notify_t(
5482 notify_info.first.shard, pg_whoami.shard,
5483 query.query_epoch,
5484 get_osdmap()->get_epoch(),
5485 notify_info.second),
5486 past_intervals);
5487 } else {
5488 update_history(query.query.history);
5489 fulfill_log(query.from, query.query, query.query_epoch);
5490 }
5491 }
5492
5493 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5494 {
5495 bool changed = false;
5496 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5497 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5498 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5499 changed = true;
5500 }
5501 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5502 assert(pi);
5503 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5504 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5505 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5506 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5507 changed = true;
5508 }
5509 }
5510 if (changed) {
5511 info.history.last_epoch_marked_full = osdmap->get_epoch();
5512 dirty_info = true;
5513 }
5514 }
5515
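// Peering must restart when the new map starts a new interval for this PG,
// or when this OSD itself transitions from down to up.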
5516 bool PG::should_restart_peering(
5517 int newupprimary,
5518 int newactingprimary,
5519 const vector<int>& newup,
5520 const vector<int>& newacting,
5521 OSDMapRef lastmap,
5522 OSDMapRef osdmap)
5523 {
5524 if (PastIntervals::is_new_interval(
5525 primary.osd,
5526 newactingprimary,
5527 acting,
5528 newacting,
5529 up_primary.osd,
5530 newupprimary,
5531 up,
5532 newup,
5533 osdmap,
5534 lastmap,
5535 info.pgid.pgid)) {
5536 dout(20) << "new interval newup " << newup
5537 << " newacting " << newacting << dendl;
5538 return true;
5539 }
5540 if (!lastmap->is_up(osd->whoami) && osdmap->is_up(osd->whoami)) {
5541 dout(10) << __func__ << " osd transitioned from down -> up" << dendl;
5542 return true;
5543 }
5544 return false;
5545 }
5546
5547 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5548 {
5549 if (last_peering_reset > reply_epoch ||
5550 last_peering_reset > query_epoch) {
5551 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5552 << " last_peering_reset " << last_peering_reset
5553 << dendl;
5554 return true;
5555 }
5556 return false;
5557 }
5558
5559 void PG::set_last_peering_reset()
5560 {
5561 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5562 if (last_peering_reset != get_osdmap()->get_epoch()) {
5563 last_peering_reset = get_osdmap()->get_epoch();
5564 reset_interval_flush();
5565 }
5566 }
5567
5568 struct FlushState {
5569 PGRef pg;
5570 epoch_t epoch;
5571 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5572 ~FlushState() {
5573 pg->lock();
5574 if (!pg->pg_has_reset_since(epoch))
5575 pg->queue_flushed(epoch);
5576 pg->unlock();
5577 }
5578 };
5579 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5580
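// Queue a no-op transaction whose applied/committed callbacks both hold the
// same FlushState; when the last reference drops, ~FlushState queues a
// Flushed event (unless the PG has been reset since).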
5581 void PG::start_flush(ObjectStore::Transaction *t,
5582 list<Context *> *on_applied,
5583 list<Context *> *on_safe)
5584 {
5585 // flush in progress ops
5586 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5587 this, get_osdmap()->get_epoch()));
5588 t->nop();
5589 flushes_in_progress++;
5590 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5591 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5592 }
5593
5594 void PG::reset_interval_flush()
5595 {
5596 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5597 recovery_state.clear_blocked_outgoing();
5598
5599 Context *c = new QueuePeeringEvt<IntervalFlush>(
5600 this, get_osdmap()->get_epoch(), IntervalFlush());
5601 if (!osr->flush_commit(c)) {
5602 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5603 recovery_state.begin_block_outgoing();
5604 } else {
5605 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5606 delete c;
5607 }
5608 }
5609
5610 /* Called before initializing peering during advance_map */
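// Swaps in the new up/acting sets, recomputes role, records interval/history
// boundaries (same_interval_since, same_up_since, same_primary_since), and
// clears per-interval primary state before on_change() runs.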
5611 void PG::start_peering_interval(
5612 const OSDMapRef lastmap,
5613 const vector<int>& newup, int new_up_primary,
5614 const vector<int>& newacting, int new_acting_primary,
5615 ObjectStore::Transaction *t)
5616 {
5617 const OSDMapRef osdmap = get_osdmap();
5618
5619 set_last_peering_reset();
5620
5621 vector<int> oldacting, oldup;
5622 int oldrole = get_role();
5623
5624 unreg_next_scrub();
5625
5626 pg_shard_t old_acting_primary = get_primary();
5627 pg_shard_t old_up_primary = up_primary;
5628 bool was_old_primary = is_primary();
5629 bool was_old_replica = is_replica();
5630
5631 acting.swap(oldacting);
5632 up.swap(oldup);
5633 init_primary_up_acting(
5634 newup,
5635 newacting,
5636 new_up_primary,
5637 new_acting_primary);
5638
5639 if (info.stats.up != up ||
5640 info.stats.acting != acting ||
5641 info.stats.up_primary != new_up_primary ||
5642 info.stats.acting_primary != new_acting_primary) {
5643 info.stats.up = up;
5644 info.stats.up_primary = new_up_primary;
5645 info.stats.acting = acting;
5646 info.stats.acting_primary = new_acting_primary;
5647 info.stats.mapping_epoch = osdmap->get_epoch();
5648 }
5649
5650 pg_stats_publish_lock.Lock();
5651 pg_stats_publish_valid = false;
5652 pg_stats_publish_lock.Unlock();
5653
5654 // This will now be remapped during a backfill in cases
5655 // where it would not have been before.
5656 if (up != acting)
5657 state_set(PG_STATE_REMAPPED);
5658 else
5659 state_clear(PG_STATE_REMAPPED);
5660
5661 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5662 if (pool.info.is_replicated() || role == pg_whoami.shard)
5663 set_role(role);
5664 else
5665 set_role(-1);
5666
5667 // did acting, up, primary|acker change?
5668 if (!lastmap) {
5669 dout(10) << " no lastmap" << dendl;
5670 dirty_info = true;
5671 dirty_big_info = true;
5672 info.history.same_interval_since = osdmap->get_epoch();
5673 } else {
5674 std::stringstream debug;
5675 assert(info.history.same_interval_since != 0);
5676 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5677 get_is_recoverable_predicate());
5678 bool new_interval = PastIntervals::check_new_interval(
5679 old_acting_primary.osd,
5680 new_acting_primary,
5681 oldacting, newacting,
5682 old_up_primary.osd,
5683 new_up_primary,
5684 oldup, newup,
5685 info.history.same_interval_since,
5686 info.history.last_epoch_clean,
5687 osdmap,
5688 lastmap,
5689 info.pgid.pgid,
5690 recoverable.get(),
5691 &past_intervals,
5692 &debug);
5693 dout(10) << __func__ << ": check_new_interval output: "
5694 << debug.str() << dendl;
5695 if (new_interval) {
5696 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5697 info.history.last_epoch_clean < osdmap->get_epoch()) {
5698 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5699 // our information is incomplete and useless; someone else was clean
5700 // after everything we know if osdmaps were trimmed.
5701 past_intervals.clear();
5702 } else {
5703 dout(10) << " noting past " << past_intervals << dendl;
5704 }
5705 dirty_info = true;
5706 dirty_big_info = true;
5707 info.history.same_interval_since = osdmap->get_epoch();
5708 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5709 osdmap->get_pg_num(info.pgid.pgid.pool()),
5710 nullptr)) {
5711 info.history.last_epoch_split = osdmap->get_epoch();
5712 }
5713 }
5714 }
5715
5716 if (old_up_primary != up_primary ||
5717 oldup != up) {
5718 info.history.same_up_since = osdmap->get_epoch();
5719 }
5720 // this comparison includes primary rank via pg_shard_t
5721 if (old_acting_primary != get_primary()) {
5722 info.history.same_primary_since = osdmap->get_epoch();
5723 }
5724
5725 on_new_interval();
5726
5727 dout(1) << __func__ << " up " << oldup << " -> " << up
5728 << ", acting " << oldacting << " -> " << acting
5729 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5730 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5731 << ", role " << oldrole << " -> " << role
5732 << ", features acting " << acting_features
5733 << " upacting " << upacting_features
5734 << dendl;
5735
5736 // deactivate.
5737 state_clear(PG_STATE_ACTIVE);
5738 state_clear(PG_STATE_PEERED);
5739 state_clear(PG_STATE_DOWN);
5740 state_clear(PG_STATE_RECOVERY_WAIT);
5741 state_clear(PG_STATE_RECOVERY_TOOFULL);
5742 state_clear(PG_STATE_RECOVERING);
5743
5744 peer_purged.clear();
5745 actingbackfill.clear();
5746 scrub_queued = false;
5747
5748 // reset primary/replica state?
5749 if (was_old_primary || is_primary()) {
5750 osd->remove_want_pg_temp(info.pgid.pgid);
5751 } else if (was_old_replica || is_replica()) {
5752 osd->remove_want_pg_temp(info.pgid.pgid);
5753 }
5754 clear_primary_state();
5755
5756
5757 // pg->on_*
5758 on_change(t);
5759
5760 projected_last_update = eversion_t();
5761
5762 assert(!deleting);
5763
5764 // should we tell the primary we are here?
5765 send_notify = !is_primary();
5766
5767 if (role != oldrole ||
5768 was_old_primary != is_primary()) {
5769 // did primary change?
5770 if (was_old_primary != is_primary()) {
5771 state_clear(PG_STATE_CLEAN);
5772 clear_publish_stats();
5773 }
5774
5775 on_role_change();
5776
5777 // take active waiters
5778 requeue_ops(waiting_for_peered);
5779
5780 } else {
5781 // no role change.
5782 // did primary change?
5783 if (get_primary() != old_acting_primary) {
5784 dout(10) << *this << " " << oldacting << " -> " << acting
5785 << ", acting primary "
5786 << old_acting_primary << " -> " << get_primary()
5787 << dendl;
5788 } else {
5789 // primary is the same.
5790 if (is_primary()) {
5791 // i am (still) primary. but my replica set changed.
5792 state_clear(PG_STATE_CLEAN);
5793
5794 dout(10) << oldacting << " -> " << acting
5795 << ", replicas changed" << dendl;
5796 }
5797 }
5798 }
5799 cancel_recovery();
5800
5801 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5802 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5803 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5804 }
5805 }
5806
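// Re-register the next scrub, recompute the feature intersection across the
// acting and up sets, then defer to the backend-specific hook.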
5807 void PG::on_new_interval()
5808 {
5809 const OSDMapRef osdmap = get_osdmap();
5810
5811 reg_next_scrub();
5812
5813 // initialize features
5814 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5815 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5816 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5817 if (*p == CRUSH_ITEM_NONE)
5818 continue;
5819 uint64_t f = osdmap->get_xinfo(*p).features;
5820 acting_features &= f;
5821 upacting_features &= f;
5822 }
5823 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5824 if (*p == CRUSH_ITEM_NONE)
5825 continue;
5826 upacting_features &= osdmap->get_xinfo(*p).features;
5827 }
5828
5829 _on_new_interval();
5830 }
5831
5832 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5833 {
5834 assert(!is_primary());
5835
5836 update_history(oinfo.history);
5837 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
5838 info.stats.stats.sum.num_scrub_errors = 0;
5839 info.stats.stats.sum.num_shallow_scrub_errors = 0;
5840 info.stats.stats.sum.num_deep_scrub_errors = 0;
5841 dirty_info = true;
5842 }
5843
5844 if (!(info.purged_snaps == oinfo.purged_snaps)) {
5845 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
5846 << dendl;
5847 info.purged_snaps = oinfo.purged_snaps;
5848 dirty_info = true;
5849 dirty_big_info = true;
5850 }
5851 }
5852
5853 ostream& operator<<(ostream& out, const PG& pg)
5854 {
5855 out << "pg[" << pg.info
5856 << " " << pg.up;
5857 if (pg.acting != pg.up)
5858 out << "/" << pg.acting;
5859 if (pg.is_ec_pg())
5860 out << "p" << pg.get_primary();
5861 out << " r=" << pg.get_role();
5862 out << " lpr=" << pg.get_last_peering_reset();
5863
5864 if (!pg.past_intervals.empty()) {
5865 out << " pi=[" << pg.past_intervals.get_bounds()
5866 << ")/" << pg.past_intervals.size();
5867 }
5868
5869 if (pg.is_peered()) {
5870 if (pg.last_update_ondisk != pg.info.last_update)
5871 out << " luod=" << pg.last_update_ondisk;
5872 if (pg.last_update_applied != pg.info.last_update)
5873 out << " lua=" << pg.last_update_applied;
5874 }
5875
5876 if (pg.recovery_ops_active)
5877 out << " rops=" << pg.recovery_ops_active;
5878
5879 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5880 pg.pg_log.get_head() != pg.info.last_update)
5881 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5882
5883 if (!pg.pg_log.get_log().empty()) {
5884 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5885 out << " (log bound mismatch, actual=["
5886 << pg.pg_log.get_log().log.begin()->version << ","
5887 << pg.pg_log.get_log().log.rbegin()->version << "]";
5888 out << ")";
5889 }
5890 }
5891
5892 if (!pg.backfill_targets.empty())
5893 out << " bft=" << pg.backfill_targets;
5894 out << " crt=" << pg.pg_log.get_can_rollback_to();
5895
5896 if (pg.last_complete_ondisk != pg.info.last_complete)
5897 out << " lcod " << pg.last_complete_ondisk;
5898
5899 if (pg.is_primary()) {
5900 out << " mlcod " << pg.min_last_complete_ondisk;
5901 }
5902
5903 out << " " << pg_state_string(pg.get_state());
5904 if (pg.should_send_notify())
5905 out << " NOTIFY";
5906
5907 if (pg.scrubber.must_repair)
5908 out << " MUST_REPAIR";
5909 if (pg.scrubber.auto_repair)
5910 out << " AUTO_REPAIR";
5911 if (pg.scrubber.must_deep_scrub)
5912 out << " MUST_DEEP_SCRUB";
5913 if (pg.scrubber.must_scrub)
5914 out << " MUST_SCRUB";
5915
5916 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5917 if (pg.pg_log.get_missing().num_missing()) {
5918 out << " m=" << pg.pg_log.get_missing().num_missing();
5919 if (pg.is_primary()) {
5920 uint64_t unfound = pg.get_num_unfound();
5921 if (unfound)
5922 out << " u=" << unfound;
5923 }
5924 }
5925 if (pg.snap_trimq.size())
5926 out << " snaptrimq=" << pg.snap_trimq;
5927
5928 out << "]";
5929
5930
5931 return out;
5932 }
5933
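// A client op can be dropped if it was disconnected (and the config allows
// discarding such ops), predates the current primary, or was sent before a
// forced-resend / split epoch that the client is expected to react to.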
5934 bool PG::can_discard_op(OpRequestRef& op)
5935 {
5936 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5937 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5938 dout(20) << " discard " << *m << dendl;
5939 return true;
5940 }
5941
5942 if (m->get_map_epoch() < info.history.same_primary_since) {
5943 dout(7) << " changed after " << m->get_map_epoch()
5944 << ", dropping " << *m << dendl;
5945 return true;
5946 }
5947
5948 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5949 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5950 dout(7) << __func__ << " sent before last_force_op_resend "
5951 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5952 return true;
5953 }
5954 if (m->get_map_epoch() < info.history.last_epoch_split) {
5955 dout(7) << __func__ << " pg split in "
5956 << info.history.last_epoch_split << ", dropping" << dendl;
5957 return true;
5958 }
5959 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5960 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5961 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5962 << pool.info.last_force_op_resend_preluminous
5963 << ", dropping" << *m << dendl;
5964 return true;
5965 }
5966 }
5967
5968 return false;
5969 }
5970
5971 template<typename T, int MSGTYPE>
5972 bool PG::can_discard_replica_op(OpRequestRef& op)
5973 {
5974 const T *m = static_cast<const T *>(op->get_req());
5975 assert(m->get_type() == MSGTYPE);
5976
5977 int from = m->get_source().num();
5978
5979 // if a repop is replied after a replica goes down in a new osdmap, and
5980 // before the pg advances to this new osdmap, the repop replies before this
5981 // repop can be discarded by that replica OSD, because the primary resets the
5982 // connection to it when handling the new osdmap marking it down, and also
5983 // resets the messenger session when the replica reconnects. To avoid the
5984 // out-of-order replies, the messages from that replica should be discarded.
5985 if (osd->get_osdmap()->is_down(from))
5986 return true;
5987 /* Mostly, this overlaps with the old_peering_msg
5988 * condition. An important exception is pushes
5989 * sent by replicas not in the acting set, since
5990 * if such a replica goes down it does not cause
5991 * a new interval. */
5992 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5993 return true;
5994
5995 // same pg?
5996 // if pg changes _at all_, we reset and repeer!
5997 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5998 dout(10) << "can_discard_replica_op pg changed " << info.history
5999 << " after " << m->map_epoch
6000 << ", dropping" << dendl;
6001 return true;
6002 }
6003 return false;
6004 }
6005
6006 bool PG::can_discard_scan(OpRequestRef op)
6007 {
6008 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
6009 assert(m->get_type() == MSG_OSD_PG_SCAN);
6010
6011 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6012 dout(10) << " got old scan, ignoring" << dendl;
6013 return true;
6014 }
6015 return false;
6016 }
6017
6018 bool PG::can_discard_backfill(OpRequestRef op)
6019 {
6020 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
6021 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
6022
6023 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
6024 dout(10) << " got old backfill, ignoring" << dendl;
6025 return true;
6026 }
6027
6028 return false;
6029
6030 }
6031
6032 bool PG::can_discard_request(OpRequestRef& op)
6033 {
6034 switch (op->get_req()->get_type()) {
6035 case CEPH_MSG_OSD_OP:
6036 return can_discard_op(op);
6037 case CEPH_MSG_OSD_BACKOFF:
6038 return false; // never discard
6039 case MSG_OSD_SUBOP:
6040 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
6041 case MSG_OSD_REPOP:
6042 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
6043 case MSG_OSD_PG_PUSH:
6044 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
6045 case MSG_OSD_PG_PULL:
6046 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
6047 case MSG_OSD_PG_PUSH_REPLY:
6048 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
6049 case MSG_OSD_SUBOPREPLY:
6050 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
6051 case MSG_OSD_REPOPREPLY:
6052 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
6053 case MSG_OSD_PG_RECOVERY_DELETE:
6054 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
6055
6056 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
6057 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
6058
6059 case MSG_OSD_EC_WRITE:
6060 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
6061 case MSG_OSD_EC_WRITE_REPLY:
6062 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
6063 case MSG_OSD_EC_READ:
6064 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
6065 case MSG_OSD_EC_READ_REPLY:
6066 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
6067 case MSG_OSD_REP_SCRUB:
6068 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
6069 case MSG_OSD_SCRUB_RESERVE:
6070 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
6071 case MSG_OSD_REP_SCRUBMAP:
6072 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
6073 case MSG_OSD_PG_UPDATE_LOG_MISSING:
6074 return can_discard_replica_op<
6075 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
6076 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
6077 return can_discard_replica_op<
6078 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
6079
6080 case MSG_OSD_PG_SCAN:
6081 return can_discard_scan(op);
6082 case MSG_OSD_PG_BACKFILL:
6083 return can_discard_backfill(op);
6084 case MSG_OSD_PG_BACKFILL_REMOVE:
6085 return can_discard_replica_op<MOSDPGBackfillRemove,
6086 MSG_OSD_PG_BACKFILL_REMOVE>(op);
6087 }
6088 return true;
6089 }
6090
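// Requeue ops that were waiting for a newer map and splice any deferred
// peering events back onto the front of the peering queue.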
6091 void PG::take_waiters()
6092 {
6093 dout(10) << "take_waiters" << dendl;
6094 requeue_map_waiters();
6095 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
6096 i != peering_waiters.end();
6097 ++i) osd->queue_for_peering(this);
6098 peering_queue.splice(peering_queue.begin(), peering_waiters,
6099 peering_waiters.begin(), peering_waiters.end());
6100 }
6101
6102 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
6103 {
6104 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
6105 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
6106 dout(10) << "deferring event " << evt->get_desc() << dendl;
6107 peering_waiters.push_back(evt);
6108 return;
6109 }
6110 if (old_peering_evt(evt))
6111 return;
6112 recovery_state.handle_event(evt, rctx);
6113 }
6114
6115 void PG::queue_peering_event(CephPeeringEvtRef evt)
6116 {
6117 if (old_peering_evt(evt))
6118 return;
6119 peering_queue.push_back(evt);
6120 osd->queue_for_peering(this);
6121 }
6122
6123 void PG::queue_null(epoch_t msg_epoch,
6124 epoch_t query_epoch)
6125 {
6126 dout(10) << "null" << dendl;
6127 queue_peering_event(
6128 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6129 NullEvt())));
6130 }
6131
6132 void PG::queue_flushed(epoch_t e)
6133 {
6134 dout(10) << "flushed" << dendl;
6135 queue_peering_event(
6136 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
6137 FlushedEvt())));
6138 }
6139
6140 void PG::queue_query(epoch_t msg_epoch,
6141 epoch_t query_epoch,
6142 pg_shard_t from, const pg_query_t& q)
6143 {
6144 dout(10) << "handle_query " << q << " from replica " << from << dendl;
6145 queue_peering_event(
6146 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
6147 MQuery(from, q, query_epoch))));
6148 }
6149
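// Called as the OSD advances this PG one osdmap at a time; updates the cached
// map/pool state and feeds an AdvMap event to the recovery state machine.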
6150 void PG::handle_advance_map(
6151 OSDMapRef osdmap, OSDMapRef lastmap,
6152 vector<int>& newup, int up_primary,
6153 vector<int>& newacting, int acting_primary,
6154 RecoveryCtx *rctx)
6155 {
6156 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
6157 assert(lastmap == osdmap_ref);
6158 dout(10) << "handle_advance_map "
6159 << newup << "/" << newacting
6160 << " -- " << up_primary << "/" << acting_primary
6161 << dendl;
6162 update_osdmap_ref(osdmap);
6163 pool.update(osdmap);
6164 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
6165 if (cct->_conf->osd_debug_verify_cached_snaps) {
6166 interval_set<snapid_t> actual_removed_snaps;
6167 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
6168 assert(pi);
6169 pi->build_removed_snaps(actual_removed_snaps);
6170 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
6171 derr << __func__ << ": mismatch between the actual removed snaps "
6172 << actual_removed_snaps
6173 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
6174 << dendl;
6175 }
6176 assert(actual_removed_snaps == pool.cached_removed_snaps);
6177 }
6178 AdvMap evt(
6179 osdmap, lastmap, newup, up_primary,
6180 newacting, acting_primary);
6181 recovery_state.handle_event(evt, rctx);
6182 if (pool.info.last_change == osdmap_ref->get_epoch()) {
6183 on_pool_change();
6184 update_store_with_options();
6185 }
6186 }
6187
6188 void PG::handle_activate_map(RecoveryCtx *rctx)
6189 {
6190 dout(10) << "handle_activate_map " << dendl;
6191 ActMap evt;
6192 recovery_state.handle_event(evt, rctx);
6193 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
6194 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6195 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6196 << last_persisted_osdmap_ref->get_epoch()
6197 << " while current is " << osdmap_ref->get_epoch() << dendl;
6198 dirty_info = true;
6199 } else {
6200 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6201 << last_persisted_osdmap_ref->get_epoch()
6202 << " while current is " << osdmap_ref->get_epoch() << dendl;
6203 }
6204 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
6205 }
6206
6207 void PG::handle_loaded(RecoveryCtx *rctx)
6208 {
6209 dout(10) << "handle_loaded" << dendl;
6210 Load evt;
6211 recovery_state.handle_event(evt, rctx);
6212 }
6213
6214 void PG::handle_create(RecoveryCtx *rctx)
6215 {
6216 dout(10) << "handle_create" << dendl;
6217 rctx->created_pgs.insert(this);
6218 Initialize evt;
6219 recovery_state.handle_event(evt, rctx);
6220 ActMap evt2;
6221 recovery_state.handle_event(evt2, rctx);
6222
6223 rctx->on_applied->add(make_lambda_context([this]() {
6224 update_store_with_options();
6225 }));
6226 }
6227
6228 void PG::handle_query_state(Formatter *f)
6229 {
6230 dout(10) << "handle_query_state" << dendl;
6231 QueryState q(f);
6232 recovery_state.handle_event(q, 0);
6233 }
6234
6235 void PG::update_store_with_options()
6236 {
6237 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
6238 if (r < 0 && r != -EOPNOTSUPP) {
6239 derr << __func__ << " set_collection_opts returns error: " << r << dendl;
6240 }
6241 }
6242
6243 void PG::update_store_on_load()
6244 {
6245 if (osd->store->get_type() == "filestore") {
6246 // legacy filestore didn't store collection bit width; fix.
6247 int bits = osd->store->collection_bits(coll);
6248 if (bits < 0) {
6249 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6250 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
6251 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
6252 ObjectStore::Transaction t;
6253 t.collection_set_bits(coll, bits);
6254 osd->store->apply_transaction(osr.get(), std::move(t));
6255 }
6256 }
6257 }
6258
6259 /*------------ Recovery State Machine----------------*/
6260 #undef dout_prefix
6261 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6262 << "state<" << get_state_name() << ">: ")
6263
6264 /*------Crashed-------*/
6265 PG::RecoveryState::Crashed::Crashed(my_context ctx)
6266 : my_base(ctx),
6267 NamedState(context< RecoveryMachine >().pg, "Crashed")
6268 {
6269 context< RecoveryMachine >().log_enter(state_name);
6270 assert(0 == "we got a bad state machine event");
6271 }
6272
6273
6274 /*------Initial-------*/
6275 PG::RecoveryState::Initial::Initial(my_context ctx)
6276 : my_base(ctx),
6277 NamedState(context< RecoveryMachine >().pg, "Initial")
6278 {
6279 context< RecoveryMachine >().log_enter(state_name);
6280 }
6281
6282 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6283 {
6284 PG *pg = context< RecoveryMachine >().pg;
6285
6286 // do we tell someone we're here?
6287 pg->send_notify = (!pg->is_primary());
6288 pg->update_store_with_options();
6289
6290 pg->update_store_on_load();
6291
6292 return transit< Reset >();
6293 }
6294
6295 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6296 {
6297 PG *pg = context< RecoveryMachine >().pg;
6298 pg->proc_replica_info(
6299 notify.from, notify.notify.info, notify.notify.epoch_sent);
6300 pg->set_last_peering_reset();
6301 return transit< Primary >();
6302 }
6303
6304 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6305 {
6306 PG *pg = context< RecoveryMachine >().pg;
6307 assert(!pg->is_primary());
6308 post_event(i);
6309 return transit< Stray >();
6310 }
6311
6312 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6313 {
6314 PG *pg = context< RecoveryMachine >().pg;
6315 assert(!pg->is_primary());
6316 post_event(i);
6317 return transit< Stray >();
6318 }
6319
6320 void PG::RecoveryState::Initial::exit()
6321 {
6322 context< RecoveryMachine >().log_exit(state_name, enter_time);
6323 PG *pg = context< RecoveryMachine >().pg;
6324 utime_t dur = ceph_clock_now() - enter_time;
6325 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6326 }
6327
6328 /*------Started-------*/
6329 PG::RecoveryState::Started::Started(my_context ctx)
6330 : my_base(ctx),
6331 NamedState(context< RecoveryMachine >().pg, "Started")
6332 {
6333 context< RecoveryMachine >().log_enter(state_name);
6334 }
6335
6336 boost::statechart::result
6337 PG::RecoveryState::Started::react(const IntervalFlush&)
6338 {
6339 PG *pg = context< RecoveryMachine >().pg;
6340 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6341 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6342 return discard_event();
6343 }
6344
6345
6346 boost::statechart::result
6347 PG::RecoveryState::Started::react(const FlushedEvt&)
6348 {
6349 PG *pg = context< RecoveryMachine >().pg;
6350 pg->on_flushed();
6351 return discard_event();
6352 }
6353
6354
6355 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6356 {
6357 PG *pg = context< RecoveryMachine >().pg;
6358 ldout(pg->cct, 10) << "Started advmap" << dendl;
6359 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6360 if (pg->should_restart_peering(
6361 advmap.up_primary,
6362 advmap.acting_primary,
6363 advmap.newup,
6364 advmap.newacting,
6365 advmap.lastmap,
6366 advmap.osdmap)) {
6367 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6368 << dendl;
6369 post_event(advmap);
6370 return transit< Reset >();
6371 }
6372 pg->remove_down_peer_info(advmap.osdmap);
6373 return discard_event();
6374 }
6375
6376 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6377 {
6378 q.f->open_object_section("state");
6379 q.f->dump_string("name", state_name);
6380 q.f->dump_stream("enter_time") << enter_time;
6381 q.f->close_section();
6382 return discard_event();
6383 }
6384
6385 void PG::RecoveryState::Started::exit()
6386 {
6387 context< RecoveryMachine >().log_exit(state_name, enter_time);
6388 PG *pg = context< RecoveryMachine >().pg;
6389 utime_t dur = ceph_clock_now() - enter_time;
6390 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6391 }
6392
6393 /*--------Reset---------*/
6394 PG::RecoveryState::Reset::Reset(my_context ctx)
6395 : my_base(ctx),
6396 NamedState(context< RecoveryMachine >().pg, "Reset")
6397 {
6398 context< RecoveryMachine >().log_enter(state_name);
6399 PG *pg = context< RecoveryMachine >().pg;
6400
6401 pg->flushes_in_progress = 0;
6402 pg->set_last_peering_reset();
6403 }
6404
6405 boost::statechart::result
6406 PG::RecoveryState::Reset::react(const FlushedEvt&)
6407 {
6408 PG *pg = context< RecoveryMachine >().pg;
6409 pg->on_flushed();
6410 return discard_event();
6411 }
6412
6413 boost::statechart::result
6414 PG::RecoveryState::Reset::react(const IntervalFlush&)
6415 {
6416 PG *pg = context< RecoveryMachine >().pg;
6417 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6418 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6419 return discard_event();
6420 }
6421
6422 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6423 {
6424 PG *pg = context< RecoveryMachine >().pg;
6425 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6426
6427 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6428
6429 if (pg->should_restart_peering(
6430 advmap.up_primary,
6431 advmap.acting_primary,
6432 advmap.newup,
6433 advmap.newacting,
6434 advmap.lastmap,
6435 advmap.osdmap)) {
6436 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6437 << dendl;
6438 pg->start_peering_interval(
6439 advmap.lastmap,
6440 advmap.newup, advmap.up_primary,
6441 advmap.newacting, advmap.acting_primary,
6442 context< RecoveryMachine >().get_cur_transaction());
6443 }
6444 pg->remove_down_peer_info(advmap.osdmap);
6445 pg->check_past_interval_bounds();
6446 return discard_event();
6447 }
6448
6449 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6450 {
6451 PG *pg = context< RecoveryMachine >().pg;
6452 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6453 context< RecoveryMachine >().send_notify(
6454 pg->get_primary(),
6455 pg_notify_t(
6456 pg->get_primary().shard, pg->pg_whoami.shard,
6457 pg->get_osdmap()->get_epoch(),
6458 pg->get_osdmap()->get_epoch(),
6459 pg->info),
6460 pg->past_intervals);
6461 }
6462
6463 pg->update_heartbeat_peers();
6464 pg->take_waiters();
6465
6466 return transit< Started >();
6467 }
6468
6469 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6470 {
6471 q.f->open_object_section("state");
6472 q.f->dump_string("name", state_name);
6473 q.f->dump_stream("enter_time") << enter_time;
6474 q.f->close_section();
6475 return discard_event();
6476 }
6477
6478 void PG::RecoveryState::Reset::exit()
6479 {
6480 context< RecoveryMachine >().log_exit(state_name, enter_time);
6481 PG *pg = context< RecoveryMachine >().pg;
6482 utime_t dur = ceph_clock_now() - enter_time;
6483 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6484 }
6485
6486 /*-------Start---------*/
6487 PG::RecoveryState::Start::Start(my_context ctx)
6488 : my_base(ctx),
6489 NamedState(context< RecoveryMachine >().pg, "Start")
6490 {
6491 context< RecoveryMachine >().log_enter(state_name);
6492
6493 PG *pg = context< RecoveryMachine >().pg;
6494 if (pg->is_primary()) {
6495 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6496 post_event(MakePrimary());
6497 } else { //is_stray
6498 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6499 post_event(MakeStray());
6500 }
6501 }
6502
6503 void PG::RecoveryState::Start::exit()
6504 {
6505 context< RecoveryMachine >().log_exit(state_name, enter_time);
6506 PG *pg = context< RecoveryMachine >().pg;
6507 utime_t dur = ceph_clock_now() - enter_time;
6508 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6509 }
6510
6511 /*---------Primary--------*/
6512 PG::RecoveryState::Primary::Primary(my_context ctx)
6513 : my_base(ctx),
6514 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6515 {
6516 context< RecoveryMachine >().log_enter(state_name);
6517 PG *pg = context< RecoveryMachine >().pg;
6518 assert(pg->want_acting.empty());
6519
6520 // set CREATING bit until we have peered for the first time.
6521 if (pg->info.history.last_epoch_started == 0) {
6522 pg->state_set(PG_STATE_CREATING);
6523 // use the history timestamp, which ultimately comes from the
6524 // monitor in the create case.
6525 utime_t t = pg->info.history.last_scrub_stamp;
6526 pg->info.stats.last_fresh = t;
6527 pg->info.stats.last_active = t;
6528 pg->info.stats.last_change = t;
6529 pg->info.stats.last_peered = t;
6530 pg->info.stats.last_clean = t;
6531 pg->info.stats.last_unstale = t;
6532 pg->info.stats.last_undegraded = t;
6533 pg->info.stats.last_fullsized = t;
6534 pg->info.stats.last_scrub_stamp = t;
6535 pg->info.stats.last_deep_scrub_stamp = t;
6536 pg->info.stats.last_clean_scrub_stamp = t;
6537 }
6538 }
6539
6540 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6541 {
6542 PG *pg = context< RecoveryMachine >().pg;
6543 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6544 pg->proc_replica_info(
6545 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6546 return discard_event();
6547 }
6548
6549 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6550 {
6551 PG *pg = context< RecoveryMachine >().pg;
6552 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6553 pg->publish_stats_to_osd();
6554 pg->take_waiters();
6555 return discard_event();
6556 }
6557
6558 void PG::RecoveryState::Primary::exit()
6559 {
6560 context< RecoveryMachine >().log_exit(state_name, enter_time);
6561 PG *pg = context< RecoveryMachine >().pg;
6562 pg->want_acting.clear();
6563 utime_t dur = ceph_clock_now() - enter_time;
6564 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6565 pg->clear_primary_state();
6566 pg->state_clear(PG_STATE_CREATING);
6567 }
6568
6569 /*---------Peering--------*/
6570 PG::RecoveryState::Peering::Peering(my_context ctx)
6571 : my_base(ctx),
6572 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6573 history_les_bound(false)
6574 {
6575 context< RecoveryMachine >().log_enter(state_name);
6576
6577 PG *pg = context< RecoveryMachine >().pg;
6578 assert(!pg->is_peered());
6579 assert(!pg->is_peering());
6580 assert(pg->is_primary());
6581 pg->state_set(PG_STATE_PEERING);
6582 }
6583
6584 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6585 {
6586 PG *pg = context< RecoveryMachine >().pg;
6587 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6588 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6589 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6590 post_event(advmap);
6591 return transit< Reset >();
6592 }
6593
6594 pg->adjust_need_up_thru(advmap.osdmap);
6595
6596 return forward_event();
6597 }
6598
6599 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6600 {
6601 PG *pg = context< RecoveryMachine >().pg;
6602
6603 q.f->open_object_section("state");
6604 q.f->dump_string("name", state_name);
6605 q.f->dump_stream("enter_time") << enter_time;
6606
6607 q.f->open_array_section("past_intervals");
6608 pg->past_intervals.dump(q.f);
6609 q.f->close_section();
6610
6611 q.f->open_array_section("probing_osds");
6612 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6613 p != prior_set.probe.end();
6614 ++p)
6615 q.f->dump_stream("osd") << *p;
6616 q.f->close_section();
6617
6618 if (prior_set.pg_down)
6619 q.f->dump_string("blocked", "peering is blocked due to down osds");
6620
6621 q.f->open_array_section("down_osds_we_would_probe");
6622 for (set<int>::iterator p = prior_set.down.begin();
6623 p != prior_set.down.end();
6624 ++p)
6625 q.f->dump_int("osd", *p);
6626 q.f->close_section();
6627
6628 q.f->open_array_section("peering_blocked_by");
6629 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6630 p != prior_set.blocked_by.end();
6631 ++p) {
6632 q.f->open_object_section("osd");
6633 q.f->dump_int("osd", p->first);
6634 q.f->dump_int("current_lost_at", p->second);
6635 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6636 q.f->close_section();
6637 }
6638 q.f->close_section();
6639
6640 if (history_les_bound) {
6641 q.f->open_array_section("peering_blocked_by_detail");
6642 q.f->open_object_section("item");
6643 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6644 q.f->close_section();
6645 q.f->close_section();
6646 }
6647
6648 q.f->close_section();
6649 return forward_event();
6650 }
6651
6652 void PG::RecoveryState::Peering::exit()
6653 {
6654 PG *pg = context< RecoveryMachine >().pg;
6655 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6656 context< RecoveryMachine >().log_exit(state_name, enter_time);
6657 pg->state_clear(PG_STATE_PEERING);
6658 pg->clear_probe_targets();
6659
6660 utime_t dur = ceph_clock_now() - enter_time;
6661 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6662 }
6663
6664
6665 /*------Backfilling-------*/
6666 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6667 : my_base(ctx),
6668 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6669 {
6670 context< RecoveryMachine >().log_enter(state_name);
6671 PG *pg = context< RecoveryMachine >().pg;
6672 pg->backfill_reserved = true;
6673 pg->queue_recovery();
6674 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6675 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6676 pg->state_set(PG_STATE_BACKFILLING);
6677 pg->publish_stats_to_osd();
6678 }
6679
6680 boost::statechart::result
6681 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6682 {
6683 PG *pg = context< RecoveryMachine >().pg;
6684 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6685 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6686
6687 pg->state_set(PG_STATE_BACKFILL_WAIT);
6688 pg->state_clear(PG_STATE_BACKFILLING);
6689
6690 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6691 it != pg->backfill_targets.end();
6692 ++it) {
6693 assert(*it != pg->pg_whoami);
6694 ConnectionRef con = pg->osd->get_con_osd_cluster(
6695 it->osd, pg->get_osdmap()->get_epoch());
6696 if (con) {
6697 pg->osd->send_message_osd_cluster(
6698 new MBackfillReserve(
6699 MBackfillReserve::REJECT,
6700 spg_t(pg->info.pgid.pgid, it->shard),
6701 pg->get_osdmap()->get_epoch()),
6702 con.get());
6703 }
6704 }
6705
6706
6707 if (!pg->waiting_on_backfill.empty()) {
6708 pg->waiting_on_backfill.clear();
6709 pg->finish_recovery_op(hobject_t::get_max());
6710 }
6711
6712 pg->schedule_backfill_retry(c.delay);
6713 return transit<NotBackfilling>();
6714 }
6715
6716 boost::statechart::result
6717 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
6718 {
6719 PG *pg = context< RecoveryMachine >().pg;
6720 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
6721 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6722
6723 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
6724 pg->state_clear(PG_STATE_BACKFILLING);
6725
6726 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6727 it != pg->backfill_targets.end();
6728 ++it) {
6729 assert(*it != pg->pg_whoami);
6730 ConnectionRef con = pg->osd->get_con_osd_cluster(
6731 it->osd, pg->get_osdmap()->get_epoch());
6732 if (con) {
6733 pg->osd->send_message_osd_cluster(
6734 new MBackfillReserve(
6735 MBackfillReserve::REJECT,
6736 spg_t(pg->info.pgid.pgid, it->shard),
6737 pg->get_osdmap()->get_epoch()),
6738 con.get());
6739 }
6740 }
6741
6742 pg->waiting_on_backfill.clear();
6743
6744 return transit<NotBackfilling>();
6745 }
6746
6747 boost::statechart::result
6748 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6749 {
6750 PG *pg = context< RecoveryMachine >().pg;
6751 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6752 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6753
6754 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6755 it != pg->backfill_targets.end();
6756 ++it) {
6757 assert(*it != pg->pg_whoami);
6758 ConnectionRef con = pg->osd->get_con_osd_cluster(
6759 it->osd, pg->get_osdmap()->get_epoch());
6760 if (con) {
6761 pg->osd->send_message_osd_cluster(
6762 new MBackfillReserve(
6763 MBackfillReserve::REJECT,
6764 spg_t(pg->info.pgid.pgid, it->shard),
6765 pg->get_osdmap()->get_epoch()),
6766 con.get());
6767 }
6768 }
6769
6770 if (!pg->waiting_on_backfill.empty()) {
6771 pg->waiting_on_backfill.clear();
6772 pg->finish_recovery_op(hobject_t::get_max());
6773 }
6774
6775 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6776 return transit<NotBackfilling>();
6777 }
6778
6779 void PG::RecoveryState::Backfilling::exit()
6780 {
6781 context< RecoveryMachine >().log_exit(state_name, enter_time);
6782 PG *pg = context< RecoveryMachine >().pg;
6783 pg->backfill_reserved = false;
6784 pg->backfill_reserving = false;
6785 pg->state_clear(PG_STATE_BACKFILLING);
6786 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6787 utime_t dur = ceph_clock_now() - enter_time;
6788 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6789 }
6790
6791 /*--WaitRemoteBackfillReserved--*/
6792
6793 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6794 : my_base(ctx),
6795 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6796 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6797 {
6798 context< RecoveryMachine >().log_enter(state_name);
6799 PG *pg = context< RecoveryMachine >().pg;
6800 pg->state_set(PG_STATE_BACKFILL_WAIT);
6801 pg->publish_stats_to_osd();
6802 post_event(RemoteBackfillReserved());
6803 }
6804
6805 boost::statechart::result
6806 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6807 {
6808 PG *pg = context< RecoveryMachine >().pg;
6809
6810 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6811 //The primary never backfills itself
6812 assert(*backfill_osd_it != pg->pg_whoami);
6813 ConnectionRef con = pg->osd->get_con_osd_cluster(
6814 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6815 if (con) {
6816 pg->osd->send_message_osd_cluster(
6817 new MBackfillReserve(
6818 MBackfillReserve::REQUEST,
6819 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6820 pg->get_osdmap()->get_epoch(),
6821 pg->get_backfill_priority()),
6822 con.get());
6823 }
6824 ++backfill_osd_it;
6825 } else {
6826 post_event(AllBackfillsReserved());
6827 }
6828 return discard_event();
6829 }
6830
6831 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6832 {
6833 context< RecoveryMachine >().log_exit(state_name, enter_time);
6834 PG *pg = context< RecoveryMachine >().pg;
6835 utime_t dur = ceph_clock_now() - enter_time;
6836 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6837 }
6838
6839 boost::statechart::result
6840 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6841 {
6842 PG *pg = context< RecoveryMachine >().pg;
6843 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6844
6845 // Send REJECT to all previously acquired reservations
6846 set<pg_shard_t>::const_iterator it, begin, end, next;
6847 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6848 end = context< Active >().remote_shards_to_reserve_backfill.end();
6849 assert(begin != end);
6850 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6851 //The primary never backfills itself
6852 assert(*it != pg->pg_whoami);
6853 ConnectionRef con = pg->osd->get_con_osd_cluster(
6854 it->osd, pg->get_osdmap()->get_epoch());
6855 if (con) {
6856 pg->osd->send_message_osd_cluster(
6857 new MBackfillReserve(
6858 MBackfillReserve::REJECT,
6859 spg_t(pg->info.pgid.pgid, it->shard),
6860 pg->get_osdmap()->get_epoch()),
6861 con.get());
6862 }
6863 }
6864
6865 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6866 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6867 pg->publish_stats_to_osd();
6868
6869 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6870
6871 return transit<NotBackfilling>();
6872 }
6873
6874 /*--WaitLocalBackfillReserved--*/
6875 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6876 : my_base(ctx),
6877 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6878 {
6879 context< RecoveryMachine >().log_enter(state_name);
6880 PG *pg = context< RecoveryMachine >().pg;
6881 pg->state_set(PG_STATE_BACKFILL_WAIT);
6882 pg->osd->local_reserver.request_reservation(
6883 pg->info.pgid,
6884 new QueuePeeringEvt<LocalBackfillReserved>(
6885 pg, pg->get_osdmap()->get_epoch(),
6886 LocalBackfillReserved()),
6887 pg->get_backfill_priority(),
6888 new QueuePeeringEvt<DeferBackfill>(
6889 pg, pg->get_osdmap()->get_epoch(),
6890 DeferBackfill(0.0)));
6891 pg->publish_stats_to_osd();
6892 }
6893
6894 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6895 {
6896 context< RecoveryMachine >().log_exit(state_name, enter_time);
6897 PG *pg = context< RecoveryMachine >().pg;
6898 utime_t dur = ceph_clock_now() - enter_time;
6899 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6900 }
6901
6902 /*----NotBackfilling------*/
6903 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6904 : my_base(ctx),
6905 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6906 {
6907 context< RecoveryMachine >().log_enter(state_name);
6908 PG *pg = context< RecoveryMachine >().pg;
6909 pg->publish_stats_to_osd();
6910 }
6911
6912 boost::statechart::result
6913 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6914 {
6915 return discard_event();
6916 }
6917
6918 boost::statechart::result
6919 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6920 {
6921 return discard_event();
6922 }
6923
6924 void PG::RecoveryState::NotBackfilling::exit()
6925 {
6926 context< RecoveryMachine >().log_exit(state_name, enter_time);
6927 PG *pg = context< RecoveryMachine >().pg;
6928 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
6929 utime_t dur = ceph_clock_now() - enter_time;
6930 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6931 }
6932
6933 /*----NotRecovering------*/
6934 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6935 : my_base(ctx),
6936 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6937 {
6938 context< RecoveryMachine >().log_enter(state_name);
6939 PG *pg = context< RecoveryMachine >().pg;
6940 pg->publish_stats_to_osd();
6941 }
6942
6943 void PG::RecoveryState::NotRecovering::exit()
6944 {
6945 context< RecoveryMachine >().log_exit(state_name, enter_time);
6946 PG *pg = context< RecoveryMachine >().pg;
6947 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
6948 utime_t dur = ceph_clock_now() - enter_time;
6949 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6950 }
6951
6952 /*---RepNotRecovering----*/
6953 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6954 : my_base(ctx),
6955 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6956 {
6957 context< RecoveryMachine >().log_enter(state_name);
6958 }
6959
6960 boost::statechart::result
6961 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
6962 {
6963 PG *pg = context< RecoveryMachine >().pg;
6964 pg->reject_reservation();
6965 post_event(RemoteReservationRejected());
6966 return discard_event();
6967 }
6968
6969 void PG::RecoveryState::RepNotRecovering::exit()
6970 {
6971 context< RecoveryMachine >().log_exit(state_name, enter_time);
6972 PG *pg = context< RecoveryMachine >().pg;
6973 utime_t dur = ceph_clock_now() - enter_time;
6974 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6975 }
6976
6977 /*---RepWaitRecoveryReserved--*/
6978 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6979 : my_base(ctx),
6980 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6981 {
6982 context< RecoveryMachine >().log_enter(state_name);
6983 PG *pg = context< RecoveryMachine >().pg;
6984
6985 pg->osd->remote_reserver.request_reservation(
6986 pg->info.pgid,
6987 new QueuePeeringEvt<RemoteRecoveryReserved>(
6988 pg, pg->get_osdmap()->get_epoch(),
6989 RemoteRecoveryReserved()),
6990 pg->get_recovery_priority());
6991 }
6992
6993 boost::statechart::result
6994 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6995 {
6996 PG *pg = context< RecoveryMachine >().pg;
6997 pg->osd->send_message_osd_cluster(
6998 pg->primary.osd,
6999 new MRecoveryReserve(
7000 MRecoveryReserve::GRANT,
7001 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7002 pg->get_osdmap()->get_epoch()),
7003 pg->get_osdmap()->get_epoch());
7004 return transit<RepRecovering>();
7005 }
7006
7007 boost::statechart::result
7008 PG::RecoveryState::RepWaitRecoveryReserved::react(
7009 const RemoteReservationCanceled &evt)
7010 {
7011 PG *pg = context< RecoveryMachine >().pg;
7012 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7013 return transit<RepNotRecovering>();
7014 }
7015
7016 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
7017 {
7018 context< RecoveryMachine >().log_exit(state_name, enter_time);
7019 PG *pg = context< RecoveryMachine >().pg;
7020 utime_t dur = ceph_clock_now() - enter_time;
7021 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
7022 }
7023
7024 /*-RepWaitBackfillReserved*/
7025 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
7026 : my_base(ctx),
7027 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
7028 {
7029 context< RecoveryMachine >().log_enter(state_name);
7030 }
7031
7032 boost::statechart::result
7033 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
7034 {
7035 PG *pg = context< RecoveryMachine >().pg;
7036 ostringstream ss;
7037
7038 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7039 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7040 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
7041 << dendl;
7042 post_event(RejectRemoteReservation());
7043 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7044 pg->osd->check_backfill_full(ss)) {
7045 ldout(pg->cct, 10) << "backfill reservation rejected: "
7046 << ss.str() << dendl;
7047 post_event(RejectRemoteReservation());
7048 } else {
7049 pg->osd->remote_reserver.request_reservation(
7050 pg->info.pgid,
7051 new QueuePeeringEvt<RemoteBackfillReserved>(
7052 pg, pg->get_osdmap()->get_epoch(),
7053 RemoteBackfillReserved()), evt.priority);
7054 }
7055 return transit<RepWaitBackfillReserved>();
7056 }
7057
7058 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7059 {
7060 context< RecoveryMachine >().log_exit(state_name, enter_time);
7061 PG *pg = context< RecoveryMachine >().pg;
7062 utime_t dur = ceph_clock_now() - enter_time;
7063 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
7064 }
7065
7066 boost::statechart::result
7067 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
7068 {
7069 PG *pg = context< RecoveryMachine >().pg;
7070
7071 ostringstream ss;
7072 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
7073 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
7074 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7075 << "failure injection" << dendl;
7076 post_event(RejectRemoteReservation());
7077 return discard_event();
7078 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
7079 pg->osd->check_backfill_full(ss)) {
7080 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
7081 << ss.str() << dendl;
7082 post_event(RejectRemoteReservation());
7083 return discard_event();
7084 } else {
7085 pg->osd->send_message_osd_cluster(
7086 pg->primary.osd,
7087 new MBackfillReserve(
7088 MBackfillReserve::GRANT,
7089 spg_t(pg->info.pgid.pgid, pg->primary.shard),
7090 pg->get_osdmap()->get_epoch()),
7091 pg->get_osdmap()->get_epoch());
7092 return transit<RepRecovering>();
7093 }
7094 }
7095
7096 boost::statechart::result
7097 PG::RecoveryState::RepWaitBackfillReserved::react(
7098 const RejectRemoteReservation &evt)
7099 {
7100 PG *pg = context< RecoveryMachine >().pg;
7101 pg->reject_reservation();
7102 post_event(RemoteReservationRejected());
7103 return discard_event();
7104 }
7105
7106 boost::statechart::result
7107 PG::RecoveryState::RepWaitBackfillReserved::react(
7108 const RemoteReservationRejected &evt)
7109 {
7110 PG *pg = context< RecoveryMachine >().pg;
7111 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7112 return transit<RepNotRecovering>();
7113 }
7114
7115 boost::statechart::result
7116 PG::RecoveryState::RepWaitBackfillReserved::react(
7117 const RemoteReservationCanceled &evt)
7118 {
7119 PG *pg = context< RecoveryMachine >().pg;
7120 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7121 return transit<RepNotRecovering>();
7122 }
7123
7124 /*---RepRecovering-------*/
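// Replica-side state while the primary recovers or backfills into us.
// BackfillTooFull makes us reject the reservation, and the remote
// reservation is always cancelled on exit.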
7125 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
7126 : my_base(ctx),
7127 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
7128 {
7129 context< RecoveryMachine >().log_enter(state_name);
7130 }
7131
7132 boost::statechart::result
7133 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
7134 {
7135 PG *pg = context< RecoveryMachine >().pg;
7136 pg->reject_reservation();
7137 return discard_event();
7138 }
7139
7140 void PG::RecoveryState::RepRecovering::exit()
7141 {
7142 context< RecoveryMachine >().log_exit(state_name, enter_time);
7143 PG *pg = context< RecoveryMachine >().pg;
7144 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7145 utime_t dur = ceph_clock_now() - enter_time;
7146 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
7147 }
7148
7149 /*------Activating--------*/
7150 PG::RecoveryState::Activating::Activating(my_context ctx)
7151 : my_base(ctx),
7152 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
7153 {
7154 context< RecoveryMachine >().log_enter(state_name);
7155 }
7156
7157 void PG::RecoveryState::Activating::exit()
7158 {
7159 context< RecoveryMachine >().log_exit(state_name, enter_time);
7160 PG *pg = context< RecoveryMachine >().pg;
7161 utime_t dur = ceph_clock_now() - enter_time;
7162 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
7163 }
7164
7165 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
7166 : my_base(ctx),
7167 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
7168 {
7169 context< RecoveryMachine >().log_enter(state_name);
7170 PG *pg = context< RecoveryMachine >().pg;
7171
7172 // Make sure all nodes that are part of the recovery aren't full
7173 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
7174 pg->osd->check_osdmap_full(pg->actingbackfill)) {
7175 post_event(RecoveryTooFull());
7176 return;
7177 }
7178
7179 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7180 pg->state_set(PG_STATE_RECOVERY_WAIT);
7181 pg->osd->local_reserver.request_reservation(
7182 pg->info.pgid,
7183 new QueuePeeringEvt<LocalRecoveryReserved>(
7184 pg, pg->get_osdmap()->get_epoch(),
7185 LocalRecoveryReserved()),
7186 pg->get_recovery_priority(),
7187 new QueuePeeringEvt<DeferRecovery>(
7188 pg, pg->get_osdmap()->get_epoch(),
7189 DeferRecovery(0.0)));
7190 pg->publish_stats_to_osd();
7191 }
7192
7193 boost::statechart::result
7194 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
7195 {
7196 PG *pg = context< RecoveryMachine >().pg;
7197 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
7198 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
7199 return transit<NotRecovering>();
7200 }
7201
7202 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7203 {
7204 context< RecoveryMachine >().log_exit(state_name, enter_time);
7205 PG *pg = context< RecoveryMachine >().pg;
7206 utime_t dur = ceph_clock_now() - enter_time;
7207 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
7208 }
7209
7210 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
7211 : my_base(ctx),
7212 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7213 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
7214 {
7215 context< RecoveryMachine >().log_enter(state_name);
7216 post_event(RemoteRecoveryReserved());
7217 }
7218
7219 boost::statechart::result
7220 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
7221 PG *pg = context< RecoveryMachine >().pg;
7222
7223 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
7224 assert(*remote_recovery_reservation_it != pg->pg_whoami);
7225 ConnectionRef con = pg->osd->get_con_osd_cluster(
7226 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
7227 if (con) {
7228 pg->osd->send_message_osd_cluster(
7229 new MRecoveryReserve(
7230 MRecoveryReserve::REQUEST,
7231 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
7232 pg->get_osdmap()->get_epoch()),
7233 con.get());
7234 }
7235 ++remote_recovery_reservation_it;
7236 } else {
7237 post_event(AllRemotesReserved());
7238 }
7239 return discard_event();
7240 }
7241
7242 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7243 {
7244 context< RecoveryMachine >().log_exit(state_name, enter_time);
7245 PG *pg = context< RecoveryMachine >().pg;
7246 utime_t dur = ceph_clock_now() - enter_time;
7247 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
7248 }
7249
7250 PG::RecoveryState::Recovering::Recovering(my_context ctx)
7251 : my_base(ctx),
7252 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
7253 {
7254 context< RecoveryMachine >().log_enter(state_name);
7255
7256 PG *pg = context< RecoveryMachine >().pg;
7257 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7258 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7259 pg->state_set(PG_STATE_RECOVERING);
7260 assert(!pg->state_test(PG_STATE_ACTIVATING));
7261 pg->publish_stats_to_osd();
7262 pg->queue_recovery();
7263 }
7264
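// Send a RELEASE to every remote shard we reserved for recovery, skipping
// ourselves. With cancel=true the reservations are dropped even though
// objects may still be missing (e.g. recovery was deferred or hit unfound).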
7265 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7266 {
7267 PG *pg = context< RecoveryMachine >().pg;
7268 assert(cancel || !pg->pg_log.get_missing().have_missing());
7269
7270 // release remote reservations
7271 for (set<pg_shard_t>::const_iterator i =
7272 context< Active >().remote_shards_to_reserve_recovery.begin();
7273 i != context< Active >().remote_shards_to_reserve_recovery.end();
7274 ++i) {
7275 if (*i == pg->pg_whoami) // skip myself
7276 continue;
7277 ConnectionRef con = pg->osd->get_con_osd_cluster(
7278 i->osd, pg->get_osdmap()->get_epoch());
7279 if (con) {
7280 pg->osd->send_message_osd_cluster(
7281 new MRecoveryReserve(
7282 MRecoveryReserve::RELEASE,
7283 spg_t(pg->info.pgid.pgid, i->shard),
7284 pg->get_osdmap()->get_epoch()),
7285 con.get());
7286 }
7287 }
7288 }
7289
7290 boost::statechart::result
7291 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
7292 {
7293 PG *pg = context< RecoveryMachine >().pg;
7294 pg->state_clear(PG_STATE_RECOVERING);
7295 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7296 release_reservations();
7297 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7298 return transit<Recovered>();
7299 }
7300
7301 boost::statechart::result
7302 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
7303 {
7304 PG *pg = context< RecoveryMachine >().pg;
7305 pg->state_clear(PG_STATE_RECOVERING);
7306 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7307 release_reservations();
7308 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7309 // XXX: Is this needed?
7310 pg->publish_stats_to_osd();
7311 return transit<WaitLocalBackfillReserved>();
7312 }
7313
7314 boost::statechart::result
7315 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7316 {
7317 PG *pg = context< RecoveryMachine >().pg;
7318 if (!pg->state_test(PG_STATE_RECOVERING)) {
7319 // we may have finished recovery and have an AllReplicasRecovered
7320 // event queued to move us to the next state.
7321 ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl;
7322 return discard_event();
7323 }
7324 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7325 pg->state_clear(PG_STATE_RECOVERING);
7326 pg->state_set(PG_STATE_RECOVERY_WAIT);
7327 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7328 release_reservations(true);
7329 pg->schedule_recovery_retry(evt.delay);
7330 return transit<NotRecovering>();
7331 }
7332
7333 boost::statechart::result
7334 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
7335 {
7336 PG *pg = context< RecoveryMachine >().pg;
7337 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
7338 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
7339 pg->state_clear(PG_STATE_RECOVERING);
7340 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7341 release_reservations(true);
7342 return transit<NotRecovering>();
7343 }
7344
7345 void PG::RecoveryState::Recovering::exit()
7346 {
7347 context< RecoveryMachine >().log_exit(state_name, enter_time);
7348 PG *pg = context< RecoveryMachine >().pg;
7349 utime_t dur = ceph_clock_now() - enter_time;
7350 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7351 }
7352
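// Recovery has finished: clear the forced flags if the acting set is large
// enough, trim the pg log, consider re-choosing the acting set (e.g. after
// backfill), and go Clean once all replicas have activated.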
7353 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7354 : my_base(ctx),
7355 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7356 {
7357 pg_shard_t auth_log_shard;
7358
7359 context< RecoveryMachine >().log_enter(state_name);
7360
7361 PG *pg = context< RecoveryMachine >().pg;
7362
7363 assert(!pg->needs_recovery());
7364
7365 // if we finished backfill, all acting are active; recheck if
7366 // DEGRADED | UNDERSIZED is appropriate.
7367 assert(!pg->actingbackfill.empty());
7368 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7369 pg->actingbackfill.size()) {
7370 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7371 pg->publish_stats_to_osd();
7372 }
7373
7374 // trim pglog on recovered
7375 pg->trim_log();
7376
7377 // adjust acting set? (e.g. because backfill completed...)
7378 bool history_les_bound = false;
7379 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7380 true, &history_les_bound))
7381 assert(pg->want_acting.size());
7382
7383 if (context< Active >().all_replicas_activated)
7384 post_event(GoClean());
7385 }
7386
7387 void PG::RecoveryState::Recovered::exit()
7388 {
7389 context< RecoveryMachine >().log_exit(state_name, enter_time);
7390 PG *pg = context< RecoveryMachine >().pg;
7391 utime_t dur = ceph_clock_now() - enter_time;
7392 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7393 }
7394
7395 PG::RecoveryState::Clean::Clean(my_context ctx)
7396 : my_base(ctx),
7397 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7398 {
7399 context< RecoveryMachine >().log_enter(state_name);
7400
7401 PG *pg = context< RecoveryMachine >().pg;
7402
7403 if (pg->info.last_complete != pg->info.last_update) {
7404 ceph_abort();
7405 }
7406 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7407
7408 if (pg->is_active()) {
7409 pg->mark_clean();
7410 }
7411
7412 pg->share_pg_info();
7413 pg->publish_stats_to_osd();
7414 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7415 }
7416
7417 void PG::RecoveryState::Clean::exit()
7418 {
7419 context< RecoveryMachine >().log_exit(state_name, enter_time);
7420 PG *pg = context< RecoveryMachine >().pg;
7421 pg->state_clear(PG_STATE_CLEAN);
7422 utime_t dur = ceph_clock_now() - enter_time;
7423 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7424 }
7425
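// Collapse a collection of pg_shard_t to at most one entry per OSD,
// skipping 'skip' (ourselves). Used below so each remote OSD is asked for a
// single recovery/backfill reservation even when it hosts more than one
// shard of the PG (as can happen with erasure-coded pools).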
7426 template <typename T>
7427 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7428 {
7429 set<int> osds_found;
7430 set<pg_shard_t> out;
7431 for (typename T::const_iterator i = in.begin();
7432 i != in.end();
7433 ++i) {
7434 if (*i != skip && !osds_found.count(i->osd)) {
7435 osds_found.insert(i->osd);
7436 out.insert(*i);
7437 }
7438 }
7439 return out;
7440 }
7441
7442 /*---------Active---------*/
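// Primary-side Active state: on entry we start the flush, call activate(),
// and record every other actingbackfill shard in blocked_by until it reports
// (MInfoRec) that it has activated and committed; only then does
// AllReplicasActivated mark the PG active (or merely peered below min_size).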
7443 PG::RecoveryState::Active::Active(my_context ctx)
7444 : my_base(ctx),
7445 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7446 remote_shards_to_reserve_recovery(
7447 unique_osd_shard_set(
7448 context< RecoveryMachine >().pg->pg_whoami,
7449 context< RecoveryMachine >().pg->actingbackfill)),
7450 remote_shards_to_reserve_backfill(
7451 unique_osd_shard_set(
7452 context< RecoveryMachine >().pg->pg_whoami,
7453 context< RecoveryMachine >().pg->backfill_targets)),
7454 all_replicas_activated(false)
7455 {
7456 context< RecoveryMachine >().log_enter(state_name);
7457
7458 PG *pg = context< RecoveryMachine >().pg;
7459
7460 assert(!pg->backfill_reserving);
7461 assert(!pg->backfill_reserved);
7462 assert(pg->is_primary());
7463 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7464 pg->start_flush(
7465 context< RecoveryMachine >().get_cur_transaction(),
7466 context< RecoveryMachine >().get_on_applied_context_list(),
7467 context< RecoveryMachine >().get_on_safe_context_list());
7468 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7469 pg->get_osdmap()->get_epoch(),
7470 *context< RecoveryMachine >().get_on_safe_context_list(),
7471 *context< RecoveryMachine >().get_query_map(),
7472 context< RecoveryMachine >().get_info_map(),
7473 context< RecoveryMachine >().get_recovery_ctx());
7474
7475 // everyone has to commit/ack before we are truly active
7476 pg->blocked_by.clear();
7477 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7478 p != pg->actingbackfill.end();
7479 ++p) {
7480 if (p->shard != pg->pg_whoami.shard) {
7481 pg->blocked_by.insert(p->shard);
7482 }
7483 }
7484 pg->publish_stats_to_osd();
7485 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7486 }
7487
7488 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7489 {
7490 PG *pg = context< RecoveryMachine >().pg;
7491 ldout(pg->cct, 10) << "Active advmap" << dendl;
7492 if (!pg->pool.newly_removed_snaps.empty()) {
7493 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7494 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7495 pg->dirty_info = true;
7496 pg->dirty_big_info = true;
7497 }
7498
7499 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7500 int osd = pg->want_acting[i];
7501 if (!advmap.osdmap->is_up(osd)) {
7502 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7503 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7504 }
7505 }
7506
7507 bool need_publish = false;
7508 /* Check for changes in pool size (if the acting set changed as a result,
7509 * this does not matter) */
7510 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7511 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7512 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7513 pg->state_clear(PG_STATE_UNDERSIZED);
7514 } else {
7515 pg->state_set(PG_STATE_UNDERSIZED);
7516 }
7517 // degraded changes will be detected by the call to publish_stats_to_osd()
7518 need_publish = true;
7519 }
7520
7521 // if we haven't reported our PG stats in a long time, do so now.
7522 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7523 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7524 << " epochs" << dendl;
7525 need_publish = true;
7526 }
7527
7528 if (need_publish)
7529 pg->publish_stats_to_osd();
7530
7531 return forward_event();
7532 }
7533
7534 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7535 {
7536 PG *pg = context< RecoveryMachine >().pg;
7537 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7538 assert(pg->is_primary());
7539
7540 if (pg->have_unfound()) {
7541 // object may have become unfound
7542 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7543 }
7544
7545 if (pg->cct->_conf->osd_check_for_log_corruption)
7546 pg->check_log_for_corruption(pg->osd->store);
7547
7548 uint64_t unfound = pg->missing_loc.num_unfound();
7549 if (unfound > 0 &&
7550 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7551 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7552 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7553 << " objects unfound and apparently lost, would automatically "
7554 << "mark these objects lost but this feature is not yet implemented "
7555 << "(osd_auto_mark_unfound_lost)";
7556 } else
7557 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7558 << unfound << " objects unfound and apparently lost";
7559 }
7560
7561 if (pg->is_active()) {
7562 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7563 pg->kick_snap_trim();
7564 }
7565
7566 if (pg->is_peered() &&
7567 !pg->is_clean() &&
7568 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7569 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7570 pg->queue_recovery();
7571 }
7572 return forward_event();
7573 }
7574
7575 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7576 {
7577 PG *pg = context< RecoveryMachine >().pg;
7578 assert(pg->is_primary());
7579 if (pg->peer_info.count(notevt.from)) {
7580 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7581 << ", already have info from that osd, ignoring"
7582 << dendl;
7583 } else if (pg->peer_purged.count(notevt.from)) {
7584 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7585 << ", already purged that peer, ignoring"
7586 << dendl;
7587 } else {
7588 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7589 << ", calling proc_replica_info and discover_all_missing"
7590 << dendl;
7591 pg->proc_replica_info(
7592 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7593 if (pg->have_unfound()) {
7594 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7595 }
7596 }
7597 return discard_event();
7598 }
7599
7600 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7601 {
7602 PG *pg = context< RecoveryMachine >().pg;
7603 assert(pg->is_primary());
7604
7605 assert(!pg->actingbackfill.empty());
7606 // don't update history (yet) if we are active and primary; the replica
7607 // may be telling us they have activated (and committed) but we can't
7608 // share that until _everyone_ does the same.
7609 if (pg->is_actingbackfill(infoevt.from)) {
7610 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7611 << " activated and committed" << dendl;
7612 pg->peer_activated.insert(infoevt.from);
7613 pg->blocked_by.erase(infoevt.from.shard);
7614 pg->publish_stats_to_osd();
7615 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7616 pg->all_activated_and_committed();
7617 }
7618 }
7619 return discard_event();
7620 }
7621
7622 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7623 {
7624 PG *pg = context< RecoveryMachine >().pg;
7625 ldout(pg->cct, 10) << "searching osd." << logevt.from
7626 << " log for unfound items" << dendl;
7627 pg->proc_replica_log(
7628 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7629 bool got_missing = pg->search_for_missing(
7630 pg->peer_info[logevt.from],
7631 pg->peer_missing[logevt.from],
7632 logevt.from,
7633 context< RecoveryMachine >().get_recovery_ctx());
7634 // If there are missing objects AND we are "fully" active then start recovery now
7635 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
7636 post_event(DoRecovery());
7637 }
7638 return discard_event();
7639 }
7640
7641 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7642 {
7643 PG *pg = context< RecoveryMachine >().pg;
7644
7645 q.f->open_object_section("state");
7646 q.f->dump_string("name", state_name);
7647 q.f->dump_stream("enter_time") << enter_time;
7648
7649 {
7650 q.f->open_array_section("might_have_unfound");
7651 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7652 p != pg->might_have_unfound.end();
7653 ++p) {
7654 q.f->open_object_section("osd");
7655 q.f->dump_stream("osd") << *p;
7656 if (pg->peer_missing.count(*p)) {
7657 q.f->dump_string("status", "already probed");
7658 } else if (pg->peer_missing_requested.count(*p)) {
7659 q.f->dump_string("status", "querying");
7660 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7661 q.f->dump_string("status", "osd is down");
7662 } else {
7663 q.f->dump_string("status", "not queried");
7664 }
7665 q.f->close_section();
7666 }
7667 q.f->close_section();
7668 }
7669 {
7670 q.f->open_object_section("recovery_progress");
7671 pg->dump_recovery_info(q.f);
7672 q.f->close_section();
7673 }
7674
7675 {
7676 q.f->open_object_section("scrub");
7677 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7678 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7679 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7680 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7681 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7682 q.f->dump_stream("scrubber.max_end") << pg->scrubber.max_end;
7683 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7684 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7685 {
7686 q.f->open_array_section("scrubber.waiting_on_whom");
7687 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7688 p != pg->scrubber.waiting_on_whom.end();
7689 ++p) {
7690 q.f->dump_stream("shard") << *p;
7691 }
7692 q.f->close_section();
7693 }
7694 q.f->close_section();
7695 }
7696
7697 q.f->close_section();
7698 return forward_event();
7699 }
7700
7701 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7702 {
7703 PG *pg = context< RecoveryMachine >().pg;
7704 all_replicas_activated = true;
7705
7706 pg->state_clear(PG_STATE_ACTIVATING);
7707 pg->state_clear(PG_STATE_CREATING);
7708 if (pg->acting.size() >= pg->pool.info.min_size) {
7709 pg->state_set(PG_STATE_ACTIVE);
7710 } else {
7711 pg->state_set(PG_STATE_PEERED);
7712 }
7713
7714 // info.last_epoch_started is set during activate()
7715 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7716 pg->info.history.last_interval_started = pg->info.last_interval_started;
7717 pg->dirty_info = true;
7718
7719 pg->share_pg_info();
7720 pg->publish_stats_to_osd();
7721
7722 pg->check_local();
7723
7724 // waiters
7725 if (pg->flushes_in_progress == 0) {
7726 pg->requeue_ops(pg->waiting_for_peered);
7727 } else if (!pg->waiting_for_peered.empty()) {
7728 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
7729 << pg->waiting_for_peered.size()
7730 << " items to waiting_for_flush"
7731 << dendl;
7732 assert(pg->waiting_for_flush.empty());
7733 pg->waiting_for_flush.swap(pg->waiting_for_peered);
7734 }
7735
7736 pg->on_activate();
7737
7738 return discard_event();
7739 }
7740
7741 void PG::RecoveryState::Active::exit()
7742 {
7743 context< RecoveryMachine >().log_exit(state_name, enter_time);
7744 PG *pg = context< RecoveryMachine >().pg;
7745 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7746
7747 pg->blocked_by.clear();
7748 pg->backfill_reserved = false;
7749 pg->backfill_reserving = false;
7750 pg->state_clear(PG_STATE_ACTIVATING);
7751 pg->state_clear(PG_STATE_DEGRADED);
7752 pg->state_clear(PG_STATE_UNDERSIZED);
7753 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7754 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7755 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7756 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7757 utime_t dur = ceph_clock_now() - enter_time;
7758 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7759 pg->agent_stop();
7760 }
7761
7762 /*------ReplicaActive-----*/
7763 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7764 : my_base(ctx),
7765 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7766 {
7767 context< RecoveryMachine >().log_enter(state_name);
7768
7769 PG *pg = context< RecoveryMachine >().pg;
7770 pg->start_flush(
7771 context< RecoveryMachine >().get_cur_transaction(),
7772 context< RecoveryMachine >().get_on_applied_context_list(),
7773 context< RecoveryMachine >().get_on_safe_context_list());
7774 }
7775
7776
7777 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7778 const Activate& actevt) {
7779 PG *pg = context< RecoveryMachine >().pg;
7780 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7781 map<int, map<spg_t, pg_query_t> > query_map;
7782 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7783 actevt.activation_epoch,
7784 *context< RecoveryMachine >().get_on_safe_context_list(),
7785 query_map, NULL, NULL);
7786 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7787 return discard_event();
7788 }
7789
7790 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7791 {
7792 PG *pg = context< RecoveryMachine >().pg;
7793 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7794 infoevt.info);
7795 return discard_event();
7796 }
7797
7798 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7799 {
7800 PG *pg = context< RecoveryMachine >().pg;
7801 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7802 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7803 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7804 assert(pg->pg_log.get_head() == pg->info.last_update);
7805
7806 return discard_event();
7807 }
7808
7809 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7810 {
7811 PG *pg = context< RecoveryMachine >().pg;
7812 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7813 context< RecoveryMachine >().send_notify(
7814 pg->get_primary(),
7815 pg_notify_t(
7816 pg->get_primary().shard, pg->pg_whoami.shard,
7817 pg->get_osdmap()->get_epoch(),
7818 pg->get_osdmap()->get_epoch(),
7819 pg->info),
7820 pg->past_intervals);
7821 }
7822 pg->take_waiters();
7823 return discard_event();
7824 }
7825
7826 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7827 const MQuery& query)
7828 {
7829 PG *pg = context< RecoveryMachine >().pg;
7830 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
7831 return discard_event();
7832 }
7833
7834 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7835 {
7836 q.f->open_object_section("state");
7837 q.f->dump_string("name", state_name);
7838 q.f->dump_stream("enter_time") << enter_time;
7839 q.f->close_section();
7840 return forward_event();
7841 }
7842
7843 void PG::RecoveryState::ReplicaActive::exit()
7844 {
7845 context< RecoveryMachine >().log_exit(state_name, enter_time);
7846 PG *pg = context< RecoveryMachine >().pg;
7847 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7848 utime_t dur = ceph_clock_now() - enter_time;
7849 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7850 }
7851
7852 /*-------Stray---*/
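// A PG that is neither primary nor (yet) an active replica. It answers
// queries and waits: an authoritative log (MLogRec) or info (MInfoRec) from
// the primary activates it and moves it to ReplicaActive.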
7853 PG::RecoveryState::Stray::Stray(my_context ctx)
7854 : my_base(ctx),
7855 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7856 {
7857 context< RecoveryMachine >().log_enter(state_name);
7858
7859 PG *pg = context< RecoveryMachine >().pg;
7860 assert(!pg->is_peered());
7861 assert(!pg->is_peering());
7862 assert(!pg->is_primary());
7863 pg->start_flush(
7864 context< RecoveryMachine >().get_cur_transaction(),
7865 context< RecoveryMachine >().get_on_applied_context_list(),
7866 context< RecoveryMachine >().get_on_safe_context_list());
7867 }
7868
7869 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7870 {
7871 PG *pg = context< RecoveryMachine >().pg;
7872 MOSDPGLog *msg = logevt.msg.get();
7873 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7874
7875 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7876 if (msg->info.last_backfill == hobject_t()) {
7877 // restart backfill
7878 pg->unreg_next_scrub();
7879 pg->info = msg->info;
7880 pg->reg_next_scrub();
7881 pg->dirty_info = true;
7882 pg->dirty_big_info = true; // maybe.
7883
7884 PGLogEntryHandler rollbacker{pg, t};
7885 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7886
7887 pg->pg_log.reset_backfill();
7888 } else {
7889 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7890 }
7891
7892 assert(pg->pg_log.get_head() == pg->info.last_update);
7893
7894 post_event(Activate(logevt.msg->info.last_epoch_started));
7895 return transit<ReplicaActive>();
7896 }
7897
7898 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7899 {
7900 PG *pg = context< RecoveryMachine >().pg;
7901 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7902
7903 if (pg->info.last_update > infoevt.info.last_update) {
7904 // rewind divergent log entries
7905 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7906 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7907 pg->info.stats = infoevt.info.stats;
7908 pg->info.hit_set = infoevt.info.hit_set;
7909 }
7910
7911 assert(infoevt.info.last_update == pg->info.last_update);
7912 assert(pg->pg_log.get_head() == pg->info.last_update);
7913
7914 post_event(Activate(infoevt.info.last_epoch_started));
7915 return transit<ReplicaActive>();
7916 }
7917
7918 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7919 {
7920 PG *pg = context< RecoveryMachine >().pg;
7921 pg->fulfill_query(query, context<RecoveryMachine>().get_recovery_ctx());
7922 return discard_event();
7923 }
7924
7925 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7926 {
7927 PG *pg = context< RecoveryMachine >().pg;
7928 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7929 context< RecoveryMachine >().send_notify(
7930 pg->get_primary(),
7931 pg_notify_t(
7932 pg->get_primary().shard, pg->pg_whoami.shard,
7933 pg->get_osdmap()->get_epoch(),
7934 pg->get_osdmap()->get_epoch(),
7935 pg->info),
7936 pg->past_intervals);
7937 }
7938 pg->take_waiters();
7939 return discard_event();
7940 }
7941
7942 void PG::RecoveryState::Stray::exit()
7943 {
7944 context< RecoveryMachine >().log_exit(state_name, enter_time);
7945 PG *pg = context< RecoveryMachine >().pg;
7946 utime_t dur = ceph_clock_now() - enter_time;
7947 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7948 }
7949
7950 /*--------GetInfo---------*/
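// First peering phase on the primary: build the prior set and query
// pg_info_t from every probe target we do not already have info for. When
// all requested infos have arrived (and the prior set is not down) we post
// GotInfo and move on.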
7951 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7952 : my_base(ctx),
7953 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7954 {
7955 context< RecoveryMachine >().log_enter(state_name);
7956
7957 PG *pg = context< RecoveryMachine >().pg;
7958 pg->check_past_interval_bounds();
7959 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7960
7961 assert(pg->blocked_by.empty());
7962
7963 prior_set = pg->build_prior();
7964
7965 pg->reset_min_peer_features();
7966 get_infos();
7967 if (prior_set.pg_down) {
7968 post_event(IsDown());
7969 } else if (peer_info_requested.empty()) {
7970 post_event(GotInfo());
7971 }
7972 }
7973
7974 void PG::RecoveryState::GetInfo::get_infos()
7975 {
7976 PG *pg = context< RecoveryMachine >().pg;
7977 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7978
7979 pg->blocked_by.clear();
7980 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7981 it != prior_set.probe.end();
7982 ++it) {
7983 pg_shard_t peer = *it;
7984 if (peer == pg->pg_whoami) {
7985 continue;
7986 }
7987 if (pg->peer_info.count(peer)) {
7988 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7989 continue;
7990 }
7991 if (peer_info_requested.count(peer)) {
7992 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7993 pg->blocked_by.insert(peer.osd);
7994 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7995 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7996 } else {
7997 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7998 context< RecoveryMachine >().send_query(
7999 peer, pg_query_t(pg_query_t::INFO,
8000 it->shard, pg->pg_whoami.shard,
8001 pg->info.history,
8002 pg->get_osdmap()->get_epoch()));
8003 peer_info_requested.insert(peer);
8004 pg->blocked_by.insert(peer.osd);
8005 }
8006 }
8007
8008 pg->publish_stats_to_osd();
8009 }
8010
8011 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
8012 {
8013 PG *pg = context< RecoveryMachine >().pg;
8014
8015 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
8016 if (p != peer_info_requested.end()) {
8017 peer_info_requested.erase(p);
8018 pg->blocked_by.erase(infoevt.from.osd);
8019 }
8020
8021 epoch_t old_start = pg->info.history.last_epoch_started;
8022 if (pg->proc_replica_info(
8023 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
8024 // we got something new ...
8025 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8026 if (old_start < pg->info.history.last_epoch_started) {
8027 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
8028 prior_set = pg->build_prior();
8029
8030 // drop any osds that are no longer in the probe set from
8031 // peer_info_requested. this is less expensive than restarting
8032 // peering (which would re-probe everyone).
8033 set<pg_shard_t>::iterator p = peer_info_requested.begin();
8034 while (p != peer_info_requested.end()) {
8035 if (prior_set.probe.count(*p) == 0) {
8036 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
8037 peer_info_requested.erase(p++);
8038 } else {
8039 ++p;
8040 }
8041 }
8042 get_infos();
8043 }
8044 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
8045 << hex << infoevt.features << dec << dendl;
8046 pg->apply_peer_features(infoevt.features);
8047
8048 // are we done getting everything?
8049 if (peer_info_requested.empty() && !prior_set.pg_down) {
8050 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
8051 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
8052 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
8053 post_event(GotInfo());
8054 }
8055 }
8056 return discard_event();
8057 }
8058
8059 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
8060 {
8061 PG *pg = context< RecoveryMachine >().pg;
8062 q.f->open_object_section("state");
8063 q.f->dump_string("name", state_name);
8064 q.f->dump_stream("enter_time") << enter_time;
8065
8066 q.f->open_array_section("requested_info_from");
8067 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
8068 p != peer_info_requested.end();
8069 ++p) {
8070 q.f->open_object_section("osd");
8071 q.f->dump_stream("osd") << *p;
8072 if (pg->peer_info.count(*p)) {
8073 q.f->open_object_section("got_info");
8074 pg->peer_info[*p].dump(q.f);
8075 q.f->close_section();
8076 }
8077 q.f->close_section();
8078 }
8079 q.f->close_section();
8080
8081 q.f->close_section();
8082 return forward_event();
8083 }
8084
8085 void PG::RecoveryState::GetInfo::exit()
8086 {
8087 context< RecoveryMachine >().log_exit(state_name, enter_time);
8088 PG *pg = context< RecoveryMachine >().pg;
8089 utime_t dur = ceph_clock_now() - enter_time;
8090 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
8091 pg->blocked_by.clear();
8092 pg->publish_stats_to_osd();
8093 }
8094
8095 /*------GetLog------------*/
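// Second peering phase: choose the acting set and an authoritative log
// shard. If we are not authoritative ourselves, request its log from a
// point (request_log_from) far enough back to cover every actingbackfill
// peer, then process it in react(GotLog) and continue to GetMissing.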
8096 PG::RecoveryState::GetLog::GetLog(my_context ctx)
8097 : my_base(ctx),
8098 NamedState(
8099 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
8100 msg(0)
8101 {
8102 context< RecoveryMachine >().log_enter(state_name);
8103
8104 PG *pg = context< RecoveryMachine >().pg;
8105
8106 // adjust acting?
8107 if (!pg->choose_acting(auth_log_shard, false,
8108 &context< Peering >().history_les_bound)) {
8109 if (!pg->want_acting.empty()) {
8110 post_event(NeedActingChange());
8111 } else {
8112 post_event(IsIncomplete());
8113 }
8114 return;
8115 }
8116
8117 // am i the best?
8118 if (auth_log_shard == pg->pg_whoami) {
8119 post_event(GotLog());
8120 return;
8121 }
8122
8123 const pg_info_t& best = pg->peer_info[auth_log_shard];
8124
8125 // am i broken?
8126 if (pg->info.last_update < best.log_tail) {
8127 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
8128 post_event(IsIncomplete());
8129 return;
8130 }
8131
8132 // how much log to request?
8133 eversion_t request_log_from = pg->info.last_update;
8134 assert(!pg->actingbackfill.empty());
8135 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
8136 p != pg->actingbackfill.end();
8137 ++p) {
8138 if (*p == pg->pg_whoami) continue;
8139 pg_info_t& ri = pg->peer_info[*p];
8140 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
8141 ri.last_update < request_log_from)
8142 request_log_from = ri.last_update;
8143 }
8144
8145 // how much?
8146 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
8147 context<RecoveryMachine>().send_query(
8148 auth_log_shard,
8149 pg_query_t(
8150 pg_query_t::LOG,
8151 auth_log_shard.shard, pg->pg_whoami.shard,
8152 request_log_from, pg->info.history,
8153 pg->get_osdmap()->get_epoch()));
8154
8155 assert(pg->blocked_by.empty());
8156 pg->blocked_by.insert(auth_log_shard.osd);
8157 pg->publish_stats_to_osd();
8158 }
8159
8160 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
8161 {
8162 PG *pg = context< RecoveryMachine >().pg;
8163 // make sure our log source didn't go down. we need to check
8164 // explicitly because it may not be part of the prior set, which
8165 // means the Peering state check won't catch it going down.
8166 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
8167 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
8168 << auth_log_shard.osd << " went down" << dendl;
8169 post_event(advmap);
8170 return transit< Reset >();
8171 }
8172
8173 // let the Peering state do its checks.
8174 return forward_event();
8175 }
8176
8177 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
8178 {
8179 PG *pg = context< RecoveryMachine >().pg;
8180 assert(!msg);
8181 if (logevt.from != auth_log_shard) {
8182 ldout(pg->cct, 10) << "GetLog: discarding log from "
8183 << "non-auth_log_shard osd." << logevt.from << dendl;
8184 return discard_event();
8185 }
8186 ldout(pg->cct, 10) << "GetLog: received master log from osd."
8187 << logevt.from << dendl;
8188 msg = logevt.msg;
8189 post_event(GotLog());
8190 return discard_event();
8191 }
8192
8193 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
8194 {
8195 PG *pg = context< RecoveryMachine >().pg;
8196 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
8197 if (msg) {
8198 ldout(pg->cct, 10) << "processing master log" << dendl;
8199 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
8200 msg->info, msg->log, msg->missing,
8201 auth_log_shard);
8202 }
8203 pg->start_flush(
8204 context< RecoveryMachine >().get_cur_transaction(),
8205 context< RecoveryMachine >().get_on_applied_context_list(),
8206 context< RecoveryMachine >().get_on_safe_context_list());
8207 return transit< GetMissing >();
8208 }
8209
8210 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
8211 {
8212 q.f->open_object_section("state");
8213 q.f->dump_string("name", state_name);
8214 q.f->dump_stream("enter_time") << enter_time;
8215 q.f->dump_stream("auth_log_shard") << auth_log_shard;
8216 q.f->close_section();
8217 return forward_event();
8218 }
8219
8220 void PG::RecoveryState::GetLog::exit()
8221 {
8222 context< RecoveryMachine >().log_exit(state_name, enter_time);
8223 PG *pg = context< RecoveryMachine >().pg;
8224 utime_t dur = ceph_clock_now() - enter_time;
8225 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
8226 pg->blocked_by.clear();
8227 pg->publish_stats_to_osd();
8228 }
8229
8230 /*------WaitActingChange--------*/
8231 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
8232 : my_base(ctx),
8233 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
8234 {
8235 context< RecoveryMachine >().log_enter(state_name);
8236 }
8237
8238 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
8239 {
8240 PG *pg = context< RecoveryMachine >().pg;
8241 OSDMapRef osdmap = advmap.osdmap;
8242
8243 ldout(pg->cct, 10) << "verifying want_acting " << pg->want_acting << " targets didn't go down" << dendl;
8244 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
8245 if (!osdmap->is_up(*p)) {
8246 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
8247 post_event(advmap);
8248 return transit< Reset >();
8249 }
8250 }
8251 return forward_event();
8252 }
8253
8254 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
8255 {
8256 PG *pg = context< RecoveryMachine >().pg;
8257 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
8258 return discard_event();
8259 }
8260
8261 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
8262 {
8263 PG *pg = context< RecoveryMachine >().pg;
8264 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
8265 return discard_event();
8266 }
8267
8268 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
8269 {
8270 PG *pg = context< RecoveryMachine >().pg;
8271 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
8272 return discard_event();
8273 }
8274
8275 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
8276 {
8277 q.f->open_object_section("state");
8278 q.f->dump_string("name", state_name);
8279 q.f->dump_stream("enter_time") << enter_time;
8280 q.f->dump_string("comment", "waiting for pg acting set to change");
8281 q.f->close_section();
8282 return forward_event();
8283 }
8284
8285 void PG::RecoveryState::WaitActingChange::exit()
8286 {
8287 context< RecoveryMachine >().log_exit(state_name, enter_time);
8288 PG *pg = context< RecoveryMachine >().pg;
8289 utime_t dur = ceph_clock_now() - enter_time;
8290 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
8291 }
8292
8293 /*------Down--------*/
8294 PG::RecoveryState::Down::Down(my_context ctx)
8295 : my_base(ctx),
8296 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
8297 {
8298 context< RecoveryMachine >().log_enter(state_name);
8299 PG *pg = context< RecoveryMachine >().pg;
8300
8301 pg->state_clear(PG_STATE_PEERING);
8302 pg->state_set(PG_STATE_DOWN);
8303
8304 auto &prior_set = context< Peering >().prior_set;
8305 assert(pg->blocked_by.empty());
8306 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8307 pg->publish_stats_to_osd();
8308 }
8309
8310 void PG::RecoveryState::Down::exit()
8311 {
8312 context< RecoveryMachine >().log_exit(state_name, enter_time);
8313 PG *pg = context< RecoveryMachine >().pg;
8314
8315 pg->state_clear(PG_STATE_DOWN);
8316 utime_t dur = ceph_clock_now() - enter_time;
8317 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
8318
8319 pg->blocked_by.clear();
8320 pg->publish_stats_to_osd();
8321 }
8322
8323 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8324 {
8325 q.f->open_object_section("state");
8326 q.f->dump_string("name", state_name);
8327 q.f->dump_stream("enter_time") << enter_time;
8328 q.f->dump_string("comment",
8329 "not enough up instances of this PG to go active");
8330 q.f->close_section();
8331 return forward_event();
8332 }
8333
8334 /*------Incomplete--------*/
8335 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8336 : my_base(ctx),
8337 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8338 {
8339 context< RecoveryMachine >().log_enter(state_name);
8340 PG *pg = context< RecoveryMachine >().pg;
8341
8342 pg->state_clear(PG_STATE_PEERING);
8343 pg->state_set(PG_STATE_INCOMPLETE);
8344
8345 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8346 assert(pg->blocked_by.empty());
8347 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8348 pg->publish_stats_to_osd();
8349 }
8350
8351 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8352 PG *pg = context< RecoveryMachine >().pg;
8353 int64_t poolnum = pg->info.pgid.pool();
8354
8355 // Reset if min_size became smaller than its previous value; the pg might now be able to go active
8356 if (!advmap.osdmap->have_pg_pool(poolnum) ||
8357 advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8358 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8359 post_event(advmap);
8360 return transit< Reset >();
8361 }
8362
8363 return forward_event();
8364 }
8365
8366 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8367 PG *pg = context< RecoveryMachine >().pg;
8368 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8369 if (pg->proc_replica_info(
8370 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8371 // We got something new, try again!
8372 return transit< GetLog >();
8373 } else {
8374 return discard_event();
8375 }
8376 }
8377
8378 boost::statechart::result PG::RecoveryState::Incomplete::react(
8379 const QueryState& q)
8380 {
8381 q.f->open_object_section("state");
8382 q.f->dump_string("name", state_name);
8383 q.f->dump_stream("enter_time") << enter_time;
8384 q.f->dump_string("comment", "not enough complete instances of this PG");
8385 q.f->close_section();
8386 return forward_event();
8387 }
8388
8389 void PG::RecoveryState::Incomplete::exit()
8390 {
8391 context< RecoveryMachine >().log_exit(state_name, enter_time);
8392 PG *pg = context< RecoveryMachine >().pg;
8393
8394 pg->state_clear(PG_STATE_INCOMPLETE);
8395 utime_t dur = ceph_clock_now() - enter_time;
8396 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8397
8398 pg->blocked_by.clear();
8399 pg->publish_stats_to_osd();
8400 }
8401
8402 /*------GetMissing--------*/
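// Third peering phase: ask each actingbackfill peer whose log may diverge
// from ours for its log+missing (or the full log if its log tail is newer
// than the point we need). Once every reply is in we either Activate or,
// if we still need an up_thru update, wait in WaitUpThru.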
8403 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8404 : my_base(ctx),
8405 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8406 {
8407 context< RecoveryMachine >().log_enter(state_name);
8408
8409 PG *pg = context< RecoveryMachine >().pg;
8410 assert(!pg->actingbackfill.empty());
8411 eversion_t since;
8412 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8413 i != pg->actingbackfill.end();
8414 ++i) {
8415 if (*i == pg->get_primary()) continue;
8416 const pg_info_t& pi = pg->peer_info[*i];
8417 // reset this to make sure the pg_missing_t is initialized and
8418 // has the correct semantics even if we don't need to get a
8419 // missing set from a shard. This way later additions due to
8420 // lost+unfound delete work properly.
8421 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8422
8423 if (pi.is_empty())
8424 continue; // no pg data, nothing divergent
8425
8426 if (pi.last_update < pg->pg_log.get_tail()) {
8427 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8428 pg->peer_missing[*i].clear();
8429 continue;
8430 }
8431 if (pi.last_backfill == hobject_t()) {
8432 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8433 pg->peer_missing[*i].clear();
8434 continue;
8435 }
8436
8437 if (pi.last_update == pi.last_complete && // peer has no missing
8438 pi.last_update == pg->info.last_update) { // peer is up to date
8439 // replica has no missing and identical log as us. no need to
8440 // pull anything.
8441 // FIXME: we can do better here. if last_update==last_complete we
8442 // can infer the rest!
8443 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8444 pg->peer_missing[*i].clear();
8445 continue;
8446 }
8447
8448 // We pull the log from the peer's last_epoch_started to ensure we
8449 // get enough log to detect divergent updates.
8450 since.epoch = pi.last_epoch_started;
8451 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8452 if (pi.log_tail <= since) {
8453 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8454 context< RecoveryMachine >().send_query(
8455 *i,
8456 pg_query_t(
8457 pg_query_t::LOG,
8458 i->shard, pg->pg_whoami.shard,
8459 since, pg->info.history,
8460 pg->get_osdmap()->get_epoch()));
8461 } else {
8462 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8463 << " (want since " << since << " < log.tail "
8464 << pi.log_tail << ")" << dendl;
8465 context< RecoveryMachine >().send_query(
8466 *i, pg_query_t(
8467 pg_query_t::FULLLOG,
8468 i->shard, pg->pg_whoami.shard,
8469 pg->info.history, pg->get_osdmap()->get_epoch()));
8470 }
8471 peer_missing_requested.insert(*i);
8472 pg->blocked_by.insert(i->osd);
8473 }
8474
8475 if (peer_missing_requested.empty()) {
8476 if (pg->need_up_thru) {
8477 ldout(pg->cct, 10) << " still need up_thru update before going active"
8478 << dendl;
8479 post_event(NeedUpThru());
8480 return;
8481 }
8482
8483 // all good!
8484 post_event(Activate(pg->get_osdmap()->get_epoch()));
8485 } else {
8486 pg->publish_stats_to_osd();
8487 }
8488 }
8489
8490 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8491 {
8492 PG *pg = context< RecoveryMachine >().pg;
8493
8494 peer_missing_requested.erase(logevt.from);
8495 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8496
8497 if (peer_missing_requested.empty()) {
8498 if (pg->need_up_thru) {
8499 ldout(pg->cct, 10) << " still need up_thru update before going active"
8500 << dendl;
8501 post_event(NeedUpThru());
8502 } else {
8503 ldout(pg->cct, 10) << "Got last missing, don't need missing "
8504 << "posting Activate" << dendl;
8505 post_event(Activate(pg->get_osdmap()->get_epoch()));
8506 }
8507 }
8508 return discard_event();
8509 }
8510
8511 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8512 {
8513 PG *pg = context< RecoveryMachine >().pg;
8514 q.f->open_object_section("state");
8515 q.f->dump_string("name", state_name);
8516 q.f->dump_stream("enter_time") << enter_time;
8517
8518 q.f->open_array_section("peer_missing_requested");
8519 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8520 p != peer_missing_requested.end();
8521 ++p) {
8522 q.f->open_object_section("osd");
8523 q.f->dump_stream("osd") << *p;
8524 if (pg->peer_missing.count(*p)) {
8525 q.f->open_object_section("got_missing");
8526 pg->peer_missing[*p].dump(q.f);
8527 q.f->close_section();
8528 }
8529 q.f->close_section();
8530 }
8531 q.f->close_section();
8532
8533 q.f->close_section();
8534 return forward_event();
8535 }
8536
8537 void PG::RecoveryState::GetMissing::exit()
8538 {
8539 context< RecoveryMachine >().log_exit(state_name, enter_time);
8540 PG *pg = context< RecoveryMachine >().pg;
8541 utime_t dur = ceph_clock_now() - enter_time;
8542 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8543 pg->blocked_by.clear();
8544 pg->publish_stats_to_osd();
8545 }
8546
8547 /*------WaitUpThru--------*/
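// Peering is otherwise complete, but the OSDMap does not yet reflect an
// up_thru for this OSD; each ActMap re-checks need_up_thru and posts
// Activate once the map has caught up.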
8548 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8549 : my_base(ctx),
8550 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8551 {
8552 context< RecoveryMachine >().log_enter(state_name);
8553 }
8554
8555 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8556 {
8557 PG *pg = context< RecoveryMachine >().pg;
8558 if (!pg->need_up_thru) {
8559 post_event(Activate(pg->get_osdmap()->get_epoch()));
8560 }
8561 return forward_event();
8562 }
8563
8564 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8565 {
8566 PG *pg = context< RecoveryMachine >().pg;
8567 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8568 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8569 pg->peer_info[logevt.from] = logevt.msg->info;
8570 return discard_event();
8571 }
8572
8573 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8574 {
8575 q.f->open_object_section("state");
8576 q.f->dump_string("name", state_name);
8577 q.f->dump_stream("enter_time") << enter_time;
8578 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8579 q.f->close_section();
8580 return forward_event();
8581 }
8582
8583 void PG::RecoveryState::WaitUpThru::exit()
8584 {
8585 context< RecoveryMachine >().log_exit(state_name, enter_time);
8586 PG *pg = context< RecoveryMachine >().pg;
8587 utime_t dur = ceph_clock_now() - enter_time;
8588 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8589 }
8590
8591 /*----RecoveryState::RecoveryMachine Methods-----*/
8592 #undef dout_prefix
8593 #define dout_prefix *_dout << pg->gen_prefix()
8594
8595 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8596 {
8597 PG *pg = context< RecoveryMachine >().pg;
8598 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8599 pg->osd->pg_recovery_stats.log_enter(state_name);
8600 }
8601
8602 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8603 {
8604 utime_t dur = ceph_clock_now() - enter_time;
8605 PG *pg = context< RecoveryMachine >().pg;
8606 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8607 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8608 event_count, event_time);
8609 event_count = 0;
8610 event_time = utime_t();
8611 }
8612
8613
8614 /*---------------------------------------------------*/
8615 #undef dout_prefix
8616 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8617
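// RecoveryCtx plumbing for event handling: start_handle()/end_handle()
// bracket each delivered event and account its runtime, while
// begin_block_outgoing() redirects outgoing messages into
// messages_pending_flush until end_block_outgoing() re-queues them into the
// real context (clear_blocked_outgoing simply drops the buffer).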
8618 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8619 assert(!rctx);
8620 assert(!orig_ctx);
8621 orig_ctx = new_ctx;
8622 if (new_ctx) {
8623 if (messages_pending_flush) {
8624 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8625 } else {
8626 rctx = *new_ctx;
8627 }
8628 rctx->start_time = ceph_clock_now();
8629 }
8630 }
8631
8632 void PG::RecoveryState::begin_block_outgoing() {
8633 assert(!messages_pending_flush);
8634 assert(orig_ctx);
8635 assert(rctx);
8636 messages_pending_flush = BufferedRecoveryMessages();
8637 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8638 }
8639
8640 void PG::RecoveryState::clear_blocked_outgoing() {
8641 assert(orig_ctx);
8642 assert(rctx);
8643 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8644 }
8645
8646 void PG::RecoveryState::end_block_outgoing() {
8647 assert(messages_pending_flush);
8648 assert(orig_ctx);
8649 assert(rctx);
8650
8651 rctx = RecoveryCtx(*orig_ctx);
8652 rctx->accept_buffered_messages(*messages_pending_flush);
8653 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8654 }
8655
8656 void PG::RecoveryState::end_handle() {
8657 if (rctx) {
8658 utime_t dur = ceph_clock_now() - rctx->start_time;
8659 machine.event_time += dur;
8660 }
8661
8662 machine.event_count++;
8663 rctx = boost::optional<RecoveryCtx>();
8664 orig_ctx = NULL;
8665 }
8666
8667 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8668 {
8669 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8670 << " " << bi.objects.size() << " objects";
8671 if (!bi.objects.empty())
8672 out << " " << bi.objects;
8673 out << ")";
8674 return out;
8675 }
8676
8677 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8678 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8679
8680 #ifdef PG_DEBUG_REFS
8681 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8682 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8683 #endif