ceph.git: ceph/src/osd/PG.cc (blob e753310534bd86bac4155092a9de1d1f30451655)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
61
62 #ifdef WITH_LTTNG
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
68 #else
69 #define tracepoint(...)
70 #endif
71
72 #include <sstream>
73
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
76 #undef dout_prefix
77 #define dout_prefix _prefix(_dout, this)
78
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
80 // easily skip them
81 const string infover_key("_infover");
82 const string info_key("_info");
83 const string biginfo_key("_biginfo");
84 const string epoch_key("_epoch");
85 const string fastinfo_key("_fastinfo");
86
87 template <class T>
88 static ostream& _prefix(std::ostream *_dout, T *t)
89 {
90 return *_dout << t->gen_prefix();
91 }
92
93 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
94
95 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
96 {
97 // Ignore trimming state machine for now
98 if (::strstr(state, "Trimming") != NULL) {
99 return;
100 } else if (pi != nullptr) {
101 pi->enter_state(entime, state);
102 } else {
103 // Store current state since we can't reliably take the PG lock here
104 if (tmppi == nullptr) {
105 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
106 }
107
108 thispg = pg;
109 tmppi->enter_state(entime, state);
110 }
111 }
112
113 void PGStateHistory::exit(const char* state) {
114 // Ignore trimming state machine for now
115 // Do nothing if PG is being destroyed!
116 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
117 return;
118 } else {
119 bool ilocked = false;
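// take the PG lock if the caller does not already hold it, and remember
// that we did so we can drop it again below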
120 if (!thispg->is_locked()) {
121 thispg->lock();
122 ilocked = true;
123 }
124 if (pi == nullptr) {
125 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
126 pi = buffer.back().get();
127 pi->setepoch(thispg->get_osdmap()->get_epoch());
128 }
129
130 pi->exit_state(ceph_clock_now());
131 if (::strcmp(state, "Reset") == 0) {
132 this->reset();
133 }
134 if (ilocked) {
135 thispg->unlock();
136 }
137 }
138 }
139
140 void PGStateHistory::dump(Formatter* f) const {
141 f->open_array_section("history");
142 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
143 f->open_object_section("states");
144 f->dump_stream("epoch") << (*pi)->this_epoch;
145 for (auto she : (*pi)->state_history) {
146 f->dump_string("state", std::get<2>(she));
147 f->dump_stream("enter") << std::get<0>(she);
148 f->dump_stream("exit") << std::get<1>(she);
149 }
150 f->close_section();
151 }
152 f->close_section();
153 }
154
155 void PG::get(const char* tag)
156 {
157 ref++;
158 #ifdef PG_DEBUG_REFS
159 Mutex::Locker l(_ref_id_lock);
160 _tag_counts[tag]++;
161 #endif
162 }
163
164 void PG::put(const char* tag)
165 {
166 #ifdef PG_DEBUG_REFS
167 {
168 Mutex::Locker l(_ref_id_lock);
169 auto tag_counts_entry = _tag_counts.find(tag);
170 assert(tag_counts_entry != _tag_counts.end());
171 --tag_counts_entry->second;
172 if (tag_counts_entry->second == 0) {
173 _tag_counts.erase(tag_counts_entry);
174 }
175 }
176 #endif
177 if (--ref == 0)
178 delete this;
179 }
180
181 #ifdef PG_DEBUG_REFS
182 uint64_t PG::get_with_id()
183 {
184 ref++;
185 Mutex::Locker l(_ref_id_lock);
186 uint64_t id = ++_ref_id;
187 BackTrace bt(0);
188 stringstream ss;
189 bt.print(ss);
190 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
191 assert(!_live_ids.count(id));
192 _live_ids.insert(make_pair(id, ss.str()));
193 return id;
194 }
195
196 void PG::put_with_id(uint64_t id)
197 {
198 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
199 {
200 Mutex::Locker l(_ref_id_lock);
201 assert(_live_ids.count(id));
202 _live_ids.erase(id);
203 }
204 if (--ref == 0)
205 delete this;
206 }
207
208 void PG::dump_live_ids()
209 {
210 Mutex::Locker l(_ref_id_lock);
211 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
212 for (map<uint64_t, string>::iterator i = _live_ids.begin();
213 i != _live_ids.end();
214 ++i) {
215 dout(0) << "\t\tid: " << *i << dendl;
216 }
217 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
218 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
219 i != _tag_counts.end();
220 ++i) {
221 dout(0) << "\t\tid: " << *i << dendl;
222 }
223 }
224 #endif
225
226 void PGPool::update(OSDMapRef map)
227 {
228 const pg_pool_t *pi = map->get_pg_pool(id);
229 assert(pi);
230 info = *pi;
231 auid = pi->auid;
232 name = map->get_pool_name(id);
233 bool updated = false;
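// rebuild the removed-snaps set if we skipped an epoch (the cached set may
// be stale) or if this map's snap_epoch shows the pool's snaps changed in
// this very epoch; otherwise the cache is still valid (see the comment in
// the else branch below)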
234 if ((map->get_epoch() != cached_epoch + 1) ||
235 (pi->get_snap_epoch() == map->get_epoch())) {
236 updated = true;
237 pi->build_removed_snaps(newly_removed_snaps);
238 interval_set<snapid_t> intersection;
239 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
240 if (intersection == cached_removed_snaps) {
241 newly_removed_snaps.subtract(cached_removed_snaps);
242 cached_removed_snaps.union_of(newly_removed_snaps);
243 } else {
244 lgeneric_subdout(cct, osd, 0) << __func__
245 << " cached_removed_snaps shrank from " << cached_removed_snaps
246 << " to " << newly_removed_snaps << dendl;
247 cached_removed_snaps = newly_removed_snaps;
248 newly_removed_snaps.clear();
249 }
250 snapc = pi->get_snap_context();
251 } else {
252 /* 1) map->get_epoch() == cached_epoch + 1 &&
253 * 2) pi->get_snap_epoch() != map->get_epoch()
254 *
255 * Since we did not take the if branch, 1 && 2 must be true. From 2, we know that
256 * this map didn't change the set of removed snaps. From 1, we
257 * know that our cached_removed_snaps matches the previous map.
258 * Thus, from 1 && 2, cached_removed snaps matches the current
259 * set of removed snaps and all we have to do is clear
260 * newly_removed_snaps.
261 */
262 newly_removed_snaps.clear();
263 }
264 cached_epoch = map->get_epoch();
265 lgeneric_subdout(cct, osd, 20)
266 << "PGPool::update cached_removed_snaps "
267 << cached_removed_snaps
268 << " newly_removed_snaps "
269 << newly_removed_snaps
270 << " snapc " << snapc
271 << (updated ? " (updated)":" (no change)")
272 << dendl;
273 }
274
275 PG::PG(OSDService *o, OSDMapRef curmap,
276 const PGPool &_pool, spg_t p) :
277 osd(o),
278 cct(o->cct),
279 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
280 snap_mapper(
281 cct,
282 &osdriver,
283 p.ps(),
284 p.get_split_bits(curmap->get_pg_num(_pool.id)),
285 _pool.id,
286 p.shard),
287 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
288 _lock("PG::_lock"),
289 #ifdef PG_DEBUG_REFS
290 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
291 #endif
292 deleting(false),
293 trace_endpoint("0.0.0.0", 0, "PG"),
294 dirty_info(false), dirty_big_info(false),
295 info(p),
296 info_struct_v(0),
297 coll(p), pg_log(cct),
298 pgmeta_oid(p.make_pgmeta_oid()),
299 missing_loc(this),
300 past_intervals(
301 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
302 *curmap),
303 stat_queue_item(this),
304 scrub_queued(false),
305 recovery_queued(false),
306 recovery_ops_active(0),
307 role(-1),
308 state(0),
309 send_notify(false),
310 pg_whoami(osd->whoami, p.shard),
311 need_up_thru(false),
312 last_peering_reset(0),
313 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
314 backfill_reserved(false),
315 backfill_reserving(false),
316 flushes_in_progress(0),
317 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
318 pg_stats_publish_valid(false),
319 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
320 finish_sync_event(NULL),
321 backoff_lock("PG::backoff_lock"),
322 scrub_after_recovery(false),
323 active_pushes(0),
324 recovery_state(this),
325 pg_id(p),
326 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
327 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
328 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
329 last_epoch(0)
330 {
331 #ifdef PG_DEBUG_REFS
332 osd->add_pgid(p, this);
333 #endif
334 #ifdef WITH_BLKIN
335 std::stringstream ss;
336 ss << "PG " << info.pgid;
337 trace_endpoint.copy_name(ss.str());
338 #endif
339 osr->shard_hint = p;
340 }
341
342 PG::~PG()
343 {
344 pgstate_history.set_pg_in_destructor();
345 #ifdef PG_DEBUG_REFS
346 osd->remove_pgid(info.pgid, this);
347 #endif
348 }
349
350 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
351 {
352 handle.suspend_tp_timeout();
353 lock();
354 handle.reset_tp_timeout();
355 }
356
357 void PG::lock(bool no_lockdep) const
358 {
359 _lock.Lock(no_lockdep);
360 // if we have unrecorded dirty state with the lock dropped, there is a bug
361 assert(!dirty_info);
362 assert(!dirty_big_info);
363
364 dout(30) << "lock" << dendl;
365 }
366
367 std::string PG::gen_prefix() const
368 {
369 stringstream out;
370 OSDMapRef mapref = osdmap_ref;
371 if (_lock.is_locked_by_me()) {
372 out << "osd." << osd->whoami
373 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
374 << " " << *this << " ";
375 } else {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " pg[" << info.pgid << "(unlocked)] ";
379 }
380 return out.str();
381 }
382
383 /********* PG **********/
384
385 void PG::proc_master_log(
386 ObjectStore::Transaction& t, pg_info_t &oinfo,
387 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
388 {
389 dout(10) << "proc_master_log for osd." << from << ": "
390 << olog << " " << omissing << dendl;
391 assert(!is_peered() && is_primary());
392
393 // merge log into our own log to build master log. no need to
394 // make any adjustments to their missing map; we are taking their
395 * log to be authoritative (i.e., their entries are by definition
396 // non-divergent).
397 merge_log(t, oinfo, olog, from);
398 peer_info[from] = oinfo;
399 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
400 might_have_unfound.insert(from);
401
402 // See doc/dev/osd_internals/last_epoch_started
403 if (oinfo.last_epoch_started > info.last_epoch_started) {
404 info.last_epoch_started = oinfo.last_epoch_started;
405 dirty_info = true;
406 }
407 if (oinfo.last_interval_started > info.last_interval_started) {
408 info.last_interval_started = oinfo.last_interval_started;
409 dirty_info = true;
410 }
411 update_history(oinfo.history);
412 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
413 info.last_epoch_started >= info.history.last_epoch_started);
414
415 peer_missing[from].claim(omissing);
416 }
417
418 void PG::proc_replica_log(
419 pg_info_t &oinfo,
420 const pg_log_t &olog,
421 pg_missing_t& omissing,
422 pg_shard_t from)
423 {
424 dout(10) << "proc_replica_log for osd." << from << ": "
425 << oinfo << " " << olog << " " << omissing << dendl;
426
427 pg_log.proc_replica_log(oinfo, olog, omissing, from);
428
429 peer_info[from] = oinfo;
430 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
431 might_have_unfound.insert(from);
432
433 for (map<hobject_t, pg_missing_item>::const_iterator i =
434 omissing.get_items().begin();
435 i != omissing.get_items().end();
436 ++i) {
437 dout(20) << " after missing " << i->first << " need " << i->second.need
438 << " have " << i->second.have << dendl;
439 }
440 peer_missing[from].claim(omissing);
441 }
442
443 bool PG::proc_replica_info(
444 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
445 {
446 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
447 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
448 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
449 return false;
450 }
451
452 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
453 dout(10) << " got info " << oinfo << " from down osd." << from
454 << " discarding" << dendl;
455 return false;
456 }
457
458 dout(10) << " got osd." << from << " " << oinfo << dendl;
459 assert(is_primary());
460 peer_info[from] = oinfo;
461 might_have_unfound.insert(from);
462
463 update_history(oinfo.history);
464
465 // stray?
466 if (!is_up(from) && !is_acting(from)) {
467 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
468 stray_set.insert(from);
469 if (is_clean()) {
470 purge_strays();
471 }
472 }
473
474 // was this a new info? if so, update peers!
475 if (p == peer_info.end())
476 update_heartbeat_peers();
477
478 return true;
479 }
480
481 void PG::remove_snap_mapped_object(
482 ObjectStore::Transaction &t, const hobject_t &soid)
483 {
484 t.remove(
485 coll,
486 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
487 clear_object_snap_mapping(&t, soid);
488 }
489
490 void PG::clear_object_snap_mapping(
491 ObjectStore::Transaction *t, const hobject_t &soid)
492 {
493 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
494 if (soid.snap < CEPH_MAXSNAP) {
495 int r = snap_mapper.remove_oid(
496 soid,
497 &_t);
498 if (!(r == 0 || r == -ENOENT)) {
499 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
500 ceph_abort();
501 }
502 }
503 }
504
505 void PG::update_object_snap_mapping(
506 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
507 {
508 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
509 assert(soid.snap < CEPH_MAXSNAP);
510 int r = snap_mapper.remove_oid(
511 soid,
512 &_t);
513 if (!(r == 0 || r == -ENOENT)) {
514 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
515 ceph_abort();
516 }
517 snap_mapper.add_oid(
518 soid,
519 snaps,
520 &_t);
521 }
522
523 void PG::merge_log(
524 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
525 {
526 PGLogEntryHandler rollbacker{this, &t};
527 pg_log.merge_log(
528 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
529 }
530
531 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
532 {
533 PGLogEntryHandler rollbacker{this, &t};
534 pg_log.rewind_divergent_log(
535 newhead, info, &rollbacker, dirty_info, dirty_big_info);
536 }
537
538 /*
539 * Process information from a replica to determine if it could have any
540 * objects that I need.
541 *
542 * TODO: if the missing set becomes very large, this could get expensive.
543 * Instead, we probably want to just iterate over our unfound set.
544 */
545 bool PG::search_for_missing(
546 const pg_info_t &oinfo, const pg_missing_t &omissing,
547 pg_shard_t from,
548 RecoveryCtx *ctx)
549 {
550 uint64_t num_unfound_before = missing_loc.num_unfound();
551 bool found_missing = missing_loc.add_source_info(
552 from, oinfo, omissing, ctx->handle);
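// only republish stats if the number of unfound objects actually changed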
553 if (found_missing && num_unfound_before != missing_loc.num_unfound())
554 publish_stats_to_osd();
555 if (found_missing &&
556 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
557 CEPH_FEATURE_OSD_ERASURE_CODES)) {
558 pg_info_t tinfo(oinfo);
559 tinfo.pgid.shard = pg_whoami.shard;
560 (*(ctx->info_map))[from.osd].push_back(
561 make_pair(
562 pg_notify_t(
563 from.shard, pg_whoami.shard,
564 get_osdmap()->get_epoch(),
565 get_osdmap()->get_epoch(),
566 tinfo),
567 past_intervals));
568 }
569 return found_missing;
570 }
571
572 bool PG::MissingLoc::readable_with_acting(
573 const hobject_t &hoid,
574 const set<pg_shard_t> &acting) const {
575 if (!needs_recovery(hoid)) return true;
576 auto missing_loc_entry = missing_loc.find(hoid);
577 if (missing_loc_entry == missing_loc.end()) return false;
578 const set<pg_shard_t> &locs = missing_loc_entry->second;
579 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
580 set<pg_shard_t> have_acting;
581 for (set<pg_shard_t>::const_iterator i = locs.begin();
582 i != locs.end();
583 ++i) {
584 if (acting.count(*i))
585 have_acting.insert(*i);
586 }
587 return (*is_readable)(have_acting);
588 }
589
590 void PG::MissingLoc::add_batch_sources_info(
591 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
592 {
593 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
594 << sources.size() << dendl;
595 unsigned loop = 0;
596 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
597 i != needs_recovery_map.end();
598 ++i) {
599 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
600 handle->reset_tp_timeout();
601 loop = 0;
602 }
603 missing_loc[i->first].insert(sources.begin(), sources.end());
604 missing_loc_sources.insert(sources.begin(), sources.end());
605 }
606 }
607
608 bool PG::MissingLoc::add_source_info(
609 pg_shard_t fromosd,
610 const pg_info_t &oinfo,
611 const pg_missing_t &omissing,
612 ThreadPool::TPHandle* handle)
613 {
614 bool found_missing = false;
615 unsigned loop = 0;
616 // found items?
617 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
618 p != needs_recovery_map.end();
619 ++p) {
620 const hobject_t &soid(p->first);
621 eversion_t need = p->second.need;
622 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
623 handle->reset_tp_timeout();
624 loop = 0;
625 }
626 if (oinfo.last_update < need) {
627 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
628 << " also missing on osd." << fromosd
629 << " (last_update " << oinfo.last_update
630 << " < needed " << need << ")" << dendl;
631 continue;
632 }
633 if (!oinfo.last_backfill.is_max() &&
634 !oinfo.last_backfill_bitwise) {
635 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
636 << " also missing on osd." << fromosd
637 << " (last_backfill " << oinfo.last_backfill
638 << " but with wrong sort order)"
639 << dendl;
640 continue;
641 }
642 if (p->first >= oinfo.last_backfill) {
643 // FIXME: this is _probably_ true, although it could conceivably
644 // be in the undefined region! Hmm!
645 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
646 << " also missing on osd." << fromosd
647 << " (past last_backfill " << oinfo.last_backfill
648 << ")" << dendl;
649 continue;
650 }
651 if (oinfo.last_complete < need) {
652 if (omissing.is_missing(soid)) {
653 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
654 << " also missing on osd." << fromosd << dendl;
655 continue;
656 }
657 }
658
659 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
660 << " is on osd." << fromosd << dendl;
661
662 missing_loc[soid].insert(fromosd);
663 missing_loc_sources.insert(fromosd);
664 found_missing = true;
665 }
666
667 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
668 << dendl;
669 return found_missing;
670 }
671
672 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
673 {
674 auto &missing = pg_log.get_missing();
675 uint64_t unfound = get_num_unfound();
676 assert(unfound > 0);
677
678 dout(10) << __func__ << " "
679 << missing.num_missing() << " missing, "
680 << unfound << " unfound"
681 << dendl;
682
683 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
684 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
685 for (; m != mend; ++m) {
686 pg_shard_t peer(*m);
687
688 if (!get_osdmap()->is_up(peer.osd)) {
689 dout(20) << __func__ << " skipping down osd." << peer << dendl;
690 continue;
691 }
692
693 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
694 if (iter != peer_info.end() &&
695 (iter->second.is_empty() || iter->second.dne())) {
696 // ignore empty peers
697 continue;
698 }
699
700 // If we've requested any of this stuff, the pg_missing_t information
701 // should be on its way.
702 // TODO: coalesce requested_* into a single data structure
703 if (peer_missing.find(peer) != peer_missing.end()) {
704 dout(20) << __func__ << ": osd." << peer
705 << ": we already have pg_missing_t" << dendl;
706 continue;
707 }
708 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
709 dout(20) << __func__ << ": osd." << peer
710 << ": in peer_log_requested" << dendl;
711 continue;
712 }
713 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
714 dout(20) << __func__ << ": osd." << peer
715 << ": in peer_missing_requested" << dendl;
716 continue;
717 }
718
719 // Request missing
720 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
721 << dendl;
722 peer_missing_requested.insert(peer);
723 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
724 pg_query_t(
725 pg_query_t::FULLLOG,
726 peer.shard, pg_whoami.shard,
727 info.history, get_osdmap()->get_epoch());
728 }
729 }
730
731 /******* PG ***********/
732 bool PG::needs_recovery() const
733 {
734 assert(is_primary());
735
736 auto &missing = pg_log.get_missing();
737
738 if (missing.num_missing()) {
739 dout(10) << __func__ << " primary has " << missing.num_missing()
740 << " missing" << dendl;
741 return true;
742 }
743
744 assert(!actingbackfill.empty());
745 set<pg_shard_t>::const_iterator end = actingbackfill.end();
746 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
747 for (; a != end; ++a) {
748 if (*a == get_primary()) continue;
749 pg_shard_t peer = *a;
750 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
751 if (pm == peer_missing.end()) {
752 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
753 << dendl;
754 continue;
755 }
756 if (pm->second.num_missing()) {
757 dout(10) << __func__ << " osd." << peer << " has "
758 << pm->second.num_missing() << " missing" << dendl;
759 return true;
760 }
761 }
762
763 dout(10) << __func__ << " is recovered" << dendl;
764 return false;
765 }
766
767 bool PG::needs_backfill() const
768 {
769 assert(is_primary());
770
771 // We can assume that the only OSDs that could possibly need backfill
772 // are those listed in backfill_targets.
773 set<pg_shard_t>::const_iterator end = backfill_targets.end();
774 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
775 for (; a != end; ++a) {
776 pg_shard_t peer = *a;
777 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
778 if (!pi->second.last_backfill.is_max()) {
779 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
780 return true;
781 }
782 }
783
784 dout(10) << __func__ << " does not need backfill" << dendl;
785 return false;
786 }
787
788
789 void PG::check_past_interval_bounds() const
790 {
791 auto rpib = get_required_past_interval_bounds(
792 info,
793 osd->get_superblock().oldest_map);
794 if (rpib.first >= rpib.second) {
795 if (!past_intervals.empty()) {
796 osd->clog->error() << info.pgid << " required past_interval bounds are"
797 << " empty [" << rpib << ") but past_intervals is not: "
798 << past_intervals;
799 derr << info.pgid << " required past_interval bounds are"
800 << " empty [" << rpib << ") but past_intervals is not: "
801 << past_intervals << dendl;
802 }
803 } else {
804 if (past_intervals.empty()) {
805 osd->clog->error() << info.pgid << " required past_interval bounds are"
806 << " not empty [" << rpib << ") but past_intervals "
807 << past_intervals << " is empty";
808 derr << info.pgid << " required past_interval bounds are"
809 << " not empty [" << rpib << ") but past_intervals "
810 << past_intervals << " is empty" << dendl;
811 assert(!past_intervals.empty());
812 }
813
814 auto apib = past_intervals.get_bounds();
815 if (apib.first > rpib.first) {
816 osd->clog->error() << info.pgid << " past_intervals [" << apib
817 << ") start interval does not contain the required"
818 << " bound [" << rpib << ") start";
819 derr << info.pgid << " past_intervals [" << apib
820 << ") start interval does not contain the required"
821 << " bound [" << rpib << ") start" << dendl;
822 assert(0 == "past_interval start interval mismatch");
823 }
824 if (apib.second != rpib.second) {
825 osd->clog->error() << info.pgid << " past_interval bound [" << apib
826 << ") end does not match required [" << rpib
827 << ") end";
828 derr << info.pgid << " past_interval bound [" << apib
829 << ") end does not match required [" << rpib
830 << ") end" << dendl;
831 assert(0 == "past_interval end mismatch");
832 }
833 }
834 }
835
836 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
837 {
838 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
839 if (need_up_thru &&
840 up_thru >= info.history.same_interval_since) {
841 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
842 need_up_thru = false;
843 return true;
844 }
845 return false;
846 }
847
848 void PG::remove_down_peer_info(const OSDMapRef osdmap)
849 {
850 // Remove any downed osds from peer_info
851 bool removed = false;
852 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
853 while (p != peer_info.end()) {
854 if (!osdmap->is_up(p->first.osd)) {
855 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
856 peer_missing.erase(p->first);
857 peer_log_requested.erase(p->first);
858 peer_missing_requested.erase(p->first);
859 peer_info.erase(p++);
860 removed = true;
861 } else
862 ++p;
863 }
864
865 // if we removed anyone, update peers (which include peer_info)
866 if (removed)
867 update_heartbeat_peers();
868 check_recovery_sources(osdmap);
869 }
870
871 /*
872 * Returns true unless there is a non-lost OSD in might_have_unfound.
873 */
874 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
875 {
876 assert(is_primary());
877
878 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
879 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
880 for (; peer != mend; ++peer) {
881 if (peer_missing.count(*peer))
882 continue;
883 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
884 if (iter != peer_info.end() &&
885 (iter->second.is_empty() || iter->second.dne()))
886 continue;
887 if (!osdmap->exists(peer->osd))
888 continue;
889 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
890 if (osd_info.lost_at <= osd_info.up_from) {
891 // If there is even one OSD in might_have_unfound that isn't lost, we
892 // still might retrieve our unfound.
893 return false;
894 }
895 }
896 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
897 << " have been queried or are marked lost" << dendl;
898 return true;
899 }
900
901 PastIntervals::PriorSet PG::build_prior()
902 {
903 if (1) {
904 // sanity check
905 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
906 it != peer_info.end();
907 ++it) {
908 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
909 }
910 }
911
912 const OSDMap &osdmap = *get_osdmap();
913 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
914 pool.info.ec_pool(),
915 info.history.last_epoch_started,
916 get_pgbackend()->get_is_recoverable_predicate(),
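// classify each OSD seen in a prior interval: UP if it is up now, DNE if it
// no longer exists in the map, LOST if it was marked lost after the interval
// started, otherwise DOWN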
917 [&](epoch_t start, int osd, epoch_t *lost_at) {
918 const osd_info_t *pinfo = 0;
919 if (osdmap.exists(osd)) {
920 pinfo = &osdmap.get_info(osd);
921 if (lost_at)
922 *lost_at = pinfo->lost_at;
923 }
924
925 if (osdmap.is_up(osd)) {
926 return PastIntervals::UP;
927 } else if (!pinfo) {
928 return PastIntervals::DNE;
929 } else if (pinfo->lost_at > start) {
930 return PastIntervals::LOST;
931 } else {
932 return PastIntervals::DOWN;
933 }
934 },
935 up,
936 acting,
937 this);
938
939 if (prior.pg_down) {
940 state_set(PG_STATE_DOWN);
941 }
942
943 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
944 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
945 << " < same_since " << info.history.same_interval_since
946 << ", must notify monitor" << dendl;
947 need_up_thru = true;
948 } else {
949 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
950 << " >= same_since " << info.history.same_interval_since
951 << ", all is well" << dendl;
952 need_up_thru = false;
953 }
954 set_probe_targets(prior.probe);
955 return prior;
956 }
957
958 void PG::clear_primary_state()
959 {
960 dout(10) << "clear_primary_state" << dendl;
961
962 // clear peering state
963 stray_set.clear();
964 peer_log_requested.clear();
965 peer_missing_requested.clear();
966 peer_info.clear();
967 peer_missing.clear();
968 need_up_thru = false;
969 peer_last_complete_ondisk.clear();
970 peer_activated.clear();
971 min_last_complete_ondisk = eversion_t();
972 pg_trim_to = eversion_t();
973 might_have_unfound.clear();
974 projected_log = PGLog::IndexedLog();
975
976 last_update_ondisk = eversion_t();
977
978 snap_trimq.clear();
979
980 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
981
982 missing_loc.clear();
983
984 release_pg_backoffs();
985
986 pg_log.reset_recovery_pointers();
987
988 scrubber.reserved_peers.clear();
989 scrub_after_recovery = false;
990
991 agent_clear();
992 }
993
994 PG::Scrubber::Scrubber()
995 : reserved(false), reserve_failed(false),
996 epoch_start(0),
997 active(false), queue_snap_trim(false),
998 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
999 must_scrub(false), must_deep_scrub(false), must_repair(false),
1000 auto_repair(false),
1001 num_digest_updates_pending(0),
1002 state(INACTIVE),
1003 deep(false),
1004 seed(0)
1005 {}
1006
1007 PG::Scrubber::~Scrubber() {}
1008
1009 /**
1010 * find_best_info
1011 *
1012 * Returns an iterator to the best info in infos sorted by:
1013 * 1) Prefer newer last_update
1014 * 2) Prefer longer tail if it brings another info into contiguity
1015 * 3) Prefer current primary
1016 */
1017 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1018 const map<pg_shard_t, pg_info_t> &infos,
1019 bool restrict_to_up_acting,
1020 bool *history_les_bound) const
1021 {
1022 assert(history_les_bound);
1023 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1024 * to make changes to this process. Also, make sure to update it
1025 * when you find bugs! */
1026 eversion_t min_last_update_acceptable = eversion_t::max();
1027 epoch_t max_last_epoch_started_found = 0;
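// first pass: find the newest last_epoch_started across all infos, taking
// history.last_epoch_started into account unless
// osd_find_best_info_ignore_history_les is set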
1028 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1029 i != infos.end();
1030 ++i) {
1031 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1032 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1033 *history_les_bound = true;
1034 max_last_epoch_started_found = i->second.history.last_epoch_started;
1035 }
1036 if (!i->second.is_incomplete() &&
1037 max_last_epoch_started_found < i->second.last_epoch_started) {
1038 max_last_epoch_started_found = i->second.last_epoch_started;
1039 }
1040 }
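// second pass: among infos that reached that last_epoch_started, the oldest
// last_update becomes the minimum acceptable last_update; peers below it are
// skipped in the selection loop further down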
1041 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1042 i != infos.end();
1043 ++i) {
1044 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1045 if (min_last_update_acceptable > i->second.last_update)
1046 min_last_update_acceptable = i->second.last_update;
1047 }
1048 }
1049 if (min_last_update_acceptable == eversion_t::max())
1050 return infos.end();
1051
1052 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1053 // find osd with newest last_update (oldest for ec_pool).
1054 // if there are multiples, prefer
1055 // - a longer tail, if it brings another peer into log contiguity
1056 // - the current primary
1057 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1058 p != infos.end();
1059 ++p) {
1060 if (restrict_to_up_acting && !is_up(p->first) &&
1061 !is_acting(p->first))
1062 continue;
1063 // Only consider peers with last_update >= min_last_update_acceptable
1064 if (p->second.last_update < min_last_update_acceptable)
1065 continue;
1066 // Disqualify anyone with a too old last_epoch_started
1067 if (p->second.last_epoch_started < max_last_epoch_started_found)
1068 continue;
1069 // Disqualify anyone who is incomplete (not fully backfilled)
1070 if (p->second.is_incomplete())
1071 continue;
1072 if (best == infos.end()) {
1073 best = p;
1074 continue;
1075 }
1076 // Prefer newer last_update
1077 if (pool.info.require_rollback()) {
1078 if (p->second.last_update > best->second.last_update)
1079 continue;
1080 if (p->second.last_update < best->second.last_update) {
1081 best = p;
1082 continue;
1083 }
1084 } else {
1085 if (p->second.last_update < best->second.last_update)
1086 continue;
1087 if (p->second.last_update > best->second.last_update) {
1088 best = p;
1089 continue;
1090 }
1091 }
1092
1093 // Prefer longer tail
1094 if (p->second.log_tail > best->second.log_tail) {
1095 continue;
1096 } else if (p->second.log_tail < best->second.log_tail) {
1097 best = p;
1098 continue;
1099 }
1100
1101 // prefer current primary (usually the caller), all things being equal
1102 if (p->first == pg_whoami) {
1103 dout(10) << "calc_acting prefer osd." << p->first
1104 << " because it is current primary" << dendl;
1105 best = p;
1106 continue;
1107 }
1108 }
1109 return best;
1110 }
1111
1112 void PG::calc_ec_acting(
1113 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1114 unsigned size,
1115 const vector<int> &acting,
1116 pg_shard_t acting_primary,
1117 const vector<int> &up,
1118 pg_shard_t up_primary,
1119 const map<pg_shard_t, pg_info_t> &all_info,
1120 bool restrict_to_up_acting,
1121 vector<int> *_want,
1122 set<pg_shard_t> *backfill,
1123 set<pg_shard_t> *acting_backfill,
1124 pg_shard_t *want_primary,
1125 ostream &ss)
1126 {
1127 vector<int> want(size, CRUSH_ITEM_NONE);
1128 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1129 unsigned usable = 0;
1130 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1131 i != all_info.end();
1132 ++i) {
1133 all_info_by_shard[i->first.shard].insert(i->first);
1134 }
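// fill each shard position independently: prefer the up OSD for that shard,
// then the acting OSD, then (unless restrict_to_up_acting) any other peer
// whose log reaches back far enough to be recovered from the auth log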
1135 for (uint8_t i = 0; i < want.size(); ++i) {
1136 ss << "For position " << (unsigned)i << ": ";
1137 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1138 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1139 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1140 auth_log_shard->second.log_tail) {
1141 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1142 want[i] = up[i];
1143 ++usable;
1144 continue;
1145 }
1146 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1147 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1148 << " and ";
1149 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1150 }
1151
1152 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1153 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1154 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1155 auth_log_shard->second.log_tail) {
1156 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1157 want[i] = acting[i];
1158 ++usable;
1159 } else if (!restrict_to_up_acting) {
1160 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1161 j != all_info_by_shard[shard_id_t(i)].end();
1162 ++j) {
1163 assert(j->shard == i);
1164 if (!all_info.find(*j)->second.is_incomplete() &&
1165 all_info.find(*j)->second.last_update >=
1166 auth_log_shard->second.log_tail) {
1167 ss << " selecting stray: " << *j << std::endl;
1168 want[i] = j->osd;
1169 ++usable;
1170 break;
1171 }
1172 }
1173 if (want[i] == CRUSH_ITEM_NONE)
1174 ss << " failed to fill position " << (int)i << std::endl;
1175 }
1176 }
1177
1178 bool found_primary = false;
1179 for (uint8_t i = 0; i < want.size(); ++i) {
1180 if (want[i] != CRUSH_ITEM_NONE) {
1181 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1182 if (!found_primary) {
1183 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1184 found_primary = true;
1185 }
1186 }
1187 }
1188 acting_backfill->insert(backfill->begin(), backfill->end());
1189 _want->swap(want);
1190 }
1191
1192 /**
1193 * calculate the desired acting set.
1194 *
1195 * Choose an appropriate acting set. Prefer up[0], unless it is
1196 * incomplete, or another osd has a longer tail that allows us to
1197 * bring other up nodes up to date.
1198 */
1199 void PG::calc_replicated_acting(
1200 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1201 unsigned size,
1202 const vector<int> &acting,
1203 pg_shard_t acting_primary,
1204 const vector<int> &up,
1205 pg_shard_t up_primary,
1206 const map<pg_shard_t, pg_info_t> &all_info,
1207 bool restrict_to_up_acting,
1208 vector<int> *want,
1209 set<pg_shard_t> *backfill,
1210 set<pg_shard_t> *acting_backfill,
1211 pg_shard_t *want_primary,
1212 ostream &ss)
1213 {
1214 ss << "calc_acting newest update on osd." << auth_log_shard->first
1215 << " with " << auth_log_shard->second
1216 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1217 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1218
1219 // select primary
1220 map<pg_shard_t,pg_info_t>::const_iterator primary;
1221 if (up.size() &&
1222 !all_info.find(up_primary)->second.is_incomplete() &&
1223 all_info.find(up_primary)->second.last_update >=
1224 auth_log_shard->second.log_tail) {
1225 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1226 primary = all_info.find(up_primary); // prefer up[0], all things being equal
1227 } else {
1228 assert(!auth_log_shard->second.is_incomplete());
1229 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1230 << " selected as primary instead" << std::endl;
1231 primary = auth_log_shard;
1232 }
1233
1234 ss << "calc_acting primary is osd." << primary->first
1235 << " with " << primary->second << std::endl;
1236 *want_primary = primary->first;
1237 want->push_back(primary->first.osd);
1238 acting_backfill->insert(primary->first);
1239 unsigned usable = 1;
1240
1241 // select replicas that have log contiguity with primary.
1242 // prefer up, then acting, then any peer_info osds
1243 for (vector<int>::const_iterator i = up.begin();
1244 i != up.end();
1245 ++i) {
1246 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1247 if (up_cand == primary->first)
1248 continue;
1249 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1250 if (cur_info.is_incomplete() ||
1251 cur_info.last_update < MIN(
1252 primary->second.log_tail,
1253 auth_log_shard->second.log_tail)) {
1254 /* We include auth_log_shard->second.log_tail because in GetLog,
1255 * we will request logs back to the min last_update over our
1256 * acting_backfill set, which will result in our log being extended
1257 * as far backwards as necessary to pick up any peers which can
1258 * be log recovered by auth_log_shard's log */
1259 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1260 backfill->insert(up_cand);
1261 acting_backfill->insert(up_cand);
1262 } else {
1263 want->push_back(*i);
1264 acting_backfill->insert(up_cand);
1265 usable++;
1266 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1267 }
1268 }
1269
1270 // This pass over the acting set no longer selects backfill OSDs; those are covered by the up-set pass above.
1271 for (vector<int>::const_iterator i = acting.begin();
1272 i != acting.end();
1273 ++i) {
1274 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1275 if (usable >= size)
1276 break;
1277
1278 // skip up osds we already considered above
1279 if (acting_cand == primary->first)
1280 continue;
1281 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1282 if (up_it != up.end())
1283 continue;
1284
1285 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1286 if (cur_info.is_incomplete() ||
1287 cur_info.last_update < primary->second.log_tail) {
1288 ss << " shard " << acting_cand << " (stray) REJECTED "
1289 << cur_info << std::endl;
1290 } else {
1291 want->push_back(*i);
1292 acting_backfill->insert(acting_cand);
1293 ss << " shard " << acting_cand << " (stray) accepted "
1294 << cur_info << std::endl;
1295 usable++;
1296 }
1297 }
1298
1299 if (restrict_to_up_acting) {
1300 return;
1301 }
1302 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1303 i != all_info.end();
1304 ++i) {
1305 if (usable >= size)
1306 break;
1307
1308 // skip up osds we already considered above
1309 if (i->first == primary->first)
1310 continue;
1311 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1312 if (up_it != up.end())
1313 continue;
1314 vector<int>::const_iterator acting_it = find(
1315 acting.begin(), acting.end(), i->first.osd);
1316 if (acting_it != acting.end())
1317 continue;
1318
1319 if (i->second.is_incomplete() ||
1320 i->second.last_update < primary->second.log_tail) {
1321 ss << " shard " << i->first << " (stray) REJECTED "
1322 << i->second << std::endl;
1323 } else {
1324 want->push_back(i->first.osd);
1325 acting_backfill->insert(i->first);
1326 ss << " shard " << i->first << " (stray) accepted "
1327 << i->second << std::endl;
1328 usable++;
1329 }
1330 }
1331 }
1332
1333 /**
1334 * choose acting
1335 *
1336 * calculate the desired acting, and request a change with the monitor
1337 * if it differs from the current acting.
1338 *
1339 * if restrict_to_up_acting=true, we filter out anything that's not in
1340 * up/acting. in order to lift this restriction, we need to
1341 * 1) check whether it's worth switching the acting set any time we get
1342 * a new pg info (not just here, when recovery finishes)
1343 * 2) check whether anything in want_acting went down on each new map
1344 * (and, if so, calculate a new want_acting)
1345 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1346 * TODO!
1347 */
1348 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1349 bool restrict_to_up_acting,
1350 bool *history_les_bound)
1351 {
1352 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1353 all_info[pg_whoami] = info;
1354
1355 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1356 p != all_info.end();
1357 ++p) {
1358 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1359 }
1360
1361 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1362 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1363
1364 if (auth_log_shard == all_info.end()) {
1365 if (up != acting) {
1366 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1367 << " reverting to up" << dendl;
1368 want_acting = up;
1369 vector<int> empty;
1370 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1371 } else {
1372 dout(10) << "choose_acting failed" << dendl;
1373 assert(want_acting.empty());
1374 }
1375 return false;
1376 }
1377
1378 assert(!auth_log_shard->second.is_incomplete());
1379 auth_log_shard_id = auth_log_shard->first;
1380
1381 set<pg_shard_t> want_backfill, want_acting_backfill;
1382 vector<int> want;
1383 pg_shard_t want_primary;
1384 stringstream ss;
1385 if (!pool.info.ec_pool())
1386 calc_replicated_acting(
1387 auth_log_shard,
1388 get_osdmap()->get_pg_size(info.pgid.pgid),
1389 acting,
1390 primary,
1391 up,
1392 up_primary,
1393 all_info,
1394 restrict_to_up_acting,
1395 &want,
1396 &want_backfill,
1397 &want_acting_backfill,
1398 &want_primary,
1399 ss);
1400 else
1401 calc_ec_acting(
1402 auth_log_shard,
1403 get_osdmap()->get_pg_size(info.pgid.pgid),
1404 acting,
1405 primary,
1406 up,
1407 up_primary,
1408 all_info,
1409 restrict_to_up_acting,
1410 &want,
1411 &want_backfill,
1412 &want_acting_backfill,
1413 &want_primary,
1414 ss);
1415 dout(10) << ss.str() << dendl;
1416
1417 unsigned num_want_acting = 0;
1418 set<pg_shard_t> have;
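// count the shards we actually want and collect them for the min_size and
// recoverability checks below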
1419 for (int i = 0; i < (int)want.size(); ++i) {
1420 if (want[i] != CRUSH_ITEM_NONE) {
1421 ++num_want_acting;
1422 have.insert(
1423 pg_shard_t(
1424 want[i],
1425 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1426 }
1427 }
1428
1429 // We go incomplete if below min_size for ec_pools since backfill
1430 // does not currently maintain rollbackability
1431 // Otherwise, we will go "peered", but not "active"
1432 if (num_want_acting < pool.info.min_size &&
1433 (pool.info.ec_pool() ||
1434 !cct->_conf->osd_allow_recovery_below_min_size)) {
1435 want_acting.clear();
1436 dout(10) << "choose_acting failed, below min size" << dendl;
1437 return false;
1438 }
1439
1440 /* Check whether we have enough acting shards to later perform recovery */
1441 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1442 get_pgbackend()->get_is_recoverable_predicate());
1443 if (!(*recoverable_predicate)(have)) {
1444 want_acting.clear();
1445 dout(10) << "choose_acting failed, not recoverable" << dendl;
1446 return false;
1447 }
1448
1449 if (want != acting) {
1450 dout(10) << "choose_acting want " << want << " != acting " << acting
1451 << ", requesting pg_temp change" << dendl;
1452 want_acting = want;
1453
1454 if (want_acting == up) {
1455 // There can't be any pending backfill if
1456 // want is the same as crush map up OSDs.
1457 assert(want_backfill.empty());
1458 vector<int> empty;
1459 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1460 } else
1461 osd->queue_want_pg_temp(info.pgid.pgid, want);
1462 return false;
1463 }
1464 want_acting.clear();
1465 actingbackfill = want_acting_backfill;
1466 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1467 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1468 if (backfill_targets.empty()) {
1469 // Caller is GetInfo
1470 backfill_targets = want_backfill;
1471 }
1472 // Will not change if already set because up would have had to change
1473 // Verify that nothing in backfill is in stray_set
1474 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1475 i != want_backfill.end();
1476 ++i) {
1477 assert(stray_set.find(*i) == stray_set.end());
1478 }
1479 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1480 << want_backfill << dendl;
1481 return true;
1482 }
1483
1484 /* Build the might_have_unfound set.
1485 *
1486 * This is used by the primary OSD during recovery.
1487 *
1488 * This set tracks the OSDs which might have unfound objects that the primary
1489 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1490 * will remove the OSD from the set.
1491 */
1492 void PG::build_might_have_unfound()
1493 {
1494 assert(might_have_unfound.empty());
1495 assert(is_primary());
1496
1497 dout(10) << __func__ << dendl;
1498
1499 check_past_interval_bounds();
1500
1501 might_have_unfound = past_intervals.get_might_have_unfound(
1502 pg_whoami,
1503 pool.info.ec_pool());
1504
1505 // include any (stray) peers
1506 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1507 p != peer_info.end();
1508 ++p)
1509 might_have_unfound.insert(p->first);
1510
1511 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1512 }
1513
1514 struct C_PG_ActivateCommitted : public Context {
1515 PGRef pg;
1516 epoch_t epoch;
1517 epoch_t activation_epoch;
1518 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1519 : pg(p), epoch(e), activation_epoch(ae) {}
1520 void finish(int r) override {
1521 pg->_activate_committed(epoch, activation_epoch);
1522 }
1523 };
1524
1525 void PG::activate(ObjectStore::Transaction& t,
1526 epoch_t activation_epoch,
1527 list<Context*>& tfin,
1528 map<int, map<spg_t,pg_query_t> >& query_map,
1529 map<int,
1530 vector<
1531 pair<pg_notify_t,
1532 PastIntervals> > > *activator_map,
1533 RecoveryCtx *ctx)
1534 {
1535 assert(!is_peered());
1536 assert(scrubber.callbacks.empty());
1537 assert(callbacks_for_degraded_object.empty());
1538
1539 // twiddle pg state
1540 state_clear(PG_STATE_DOWN);
1541
1542 send_notify = false;
1543
1544 if (is_primary()) {
1545 // only update primary last_epoch_started if we will go active
1546 if (acting.size() >= pool.info.min_size) {
1547 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1548 info.last_epoch_started <= activation_epoch);
1549 info.last_epoch_started = activation_epoch;
1550 info.last_interval_started = info.history.same_interval_since;
1551 }
1552 } else if (is_acting(pg_whoami)) {
1553 /* update last_epoch_started on acting replica to whatever the primary sent
1554 * unless it's smaller (could happen if we are going peered rather than
1555 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1556 if (info.last_epoch_started < activation_epoch) {
1557 info.last_epoch_started = activation_epoch;
1558 info.last_interval_started = info.history.same_interval_since;
1559 }
1560 }
1561
1562 auto &missing = pg_log.get_missing();
1563
1564 if (is_primary()) {
1565 last_update_ondisk = info.last_update;
1566 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1567 }
1568 last_update_applied = info.last_update;
1569 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1570
1571 need_up_thru = false;
1572
1573 // write pg info, log
1574 dirty_info = true;
1575 dirty_big_info = true; // maybe
1576
1577 // find out when we commit
1578 t.register_on_complete(
1579 new C_PG_ActivateCommitted(
1580 this,
1581 get_osdmap()->get_epoch(),
1582 activation_epoch));
1583
1584 // initialize snap_trimq
1585 if (is_primary()) {
1586 dout(20) << "activate - purged_snaps " << info.purged_snaps
1587 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1588 snap_trimq = pool.cached_removed_snaps;
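// purged_snaps should be a subset of the pool's removed snaps; subtract it
// from snap_trimq, warning below if that invariant does not hold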
1589 interval_set<snapid_t> intersection;
1590 intersection.intersection_of(snap_trimq, info.purged_snaps);
1591 if (intersection == info.purged_snaps) {
1592 snap_trimq.subtract(info.purged_snaps);
1593 } else {
1594 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1595 << ") is not a subset of pool.cached_removed_snaps ("
1596 << pool.cached_removed_snaps << ")" << dendl;
1597 snap_trimq.subtract(intersection);
1598 }
1599 }
1600
1601 // init complete pointer
1602 if (missing.num_missing() == 0) {
1603 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1604 << " -> " << info.last_update << dendl;
1605 info.last_complete = info.last_update;
1606 pg_log.reset_recovery_pointers();
1607 } else {
1608 dout(10) << "activate - not complete, " << missing << dendl;
1609 pg_log.activate_not_complete(info);
1610 }
1611
1612 log_weirdness();
1613
1614 // if primary..
1615 if (is_primary()) {
1616 assert(ctx);
1617 // start up replicas
1618
1619 assert(!actingbackfill.empty());
1620 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1621 i != actingbackfill.end();
1622 ++i) {
1623 if (*i == pg_whoami) continue;
1624 pg_shard_t peer = *i;
1625 assert(peer_info.count(peer));
1626 pg_info_t& pi = peer_info[peer];
1627
1628 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1629
1630 MOSDPGLog *m = 0;
1631 pg_missing_t& pm = peer_missing[peer];
1632
1633 bool needs_past_intervals = pi.dne();
1634
1635 /*
1636 * cover case where peer sort order was different and
1637 * last_backfill cannot be interpreted
1638 */
1639 bool force_restart_backfill =
1640 !pi.last_backfill.is_max() &&
1641 !pi.last_backfill_bitwise;
1642
1643 if (pi.last_update == info.last_update && !force_restart_backfill) {
1644 // empty log
1645 if (!pi.last_backfill.is_max())
1646 osd->clog->info() << info.pgid << " continuing backfill to osd."
1647 << peer
1648 << " from (" << pi.log_tail << "," << pi.last_update
1649 << "] " << pi.last_backfill
1650 << " to " << info.last_update;
1651 if (!pi.is_empty() && activator_map) {
1652 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1653 (*activator_map)[peer.osd].push_back(
1654 make_pair(
1655 pg_notify_t(
1656 peer.shard, pg_whoami.shard,
1657 get_osdmap()->get_epoch(),
1658 get_osdmap()->get_epoch(),
1659 info),
1660 past_intervals));
1661 } else {
1662 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1663 m = new MOSDPGLog(
1664 i->shard, pg_whoami.shard,
1665 get_osdmap()->get_epoch(), info);
1666 }
1667 } else if (
1668 pg_log.get_tail() > pi.last_update ||
1669 pi.last_backfill == hobject_t() ||
1670 force_restart_backfill ||
1671 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1672 /* ^ This last case covers a situation where a replica is not contiguous
1673 * with the auth_log, but is contiguous with our log. Reshuffling
1674 * the active set to handle this would be tricky, so instead we just go
1675 * ahead and backfill it anyway. This is probably preferable in any
1676 * case since the replica in question would have to be significantly
1677 * behind.
1678 */
1679 // backfill
1680 osd->clog->info() << info.pgid << " starting backfill to osd." << peer
1681 << " from (" << pi.log_tail << "," << pi.last_update
1682 << "] " << pi.last_backfill
1683 << " to " << info.last_update;
1684
1685 pi.last_update = info.last_update;
1686 pi.last_complete = info.last_update;
1687 pi.set_last_backfill(hobject_t());
1688 pi.last_epoch_started = info.last_epoch_started;
1689 pi.last_interval_started = info.last_interval_started;
1690 pi.history = info.history;
1691 pi.hit_set = info.hit_set;
1692 pi.stats.stats.clear();
1693
1694 // initialize peer with our purged_snaps.
1695 pi.purged_snaps = info.purged_snaps;
1696
1697 m = new MOSDPGLog(
1698 i->shard, pg_whoami.shard,
1699 get_osdmap()->get_epoch(), pi);
1700
1701 // send some recent log, so that op dup detection works well.
1702 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1703 m->info.log_tail = m->log.tail;
1704 pi.log_tail = m->log.tail; // sigh...
1705
1706 pm.clear();
1707 } else {
1708 // catch up
1709 assert(pg_log.get_tail() <= pi.last_update);
1710 m = new MOSDPGLog(
1711 i->shard, pg_whoami.shard,
1712 get_osdmap()->get_epoch(), info);
1713 // send new stuff to append to replicas log
1714 m->log.copy_after(pg_log.get_log(), pi.last_update);
1715 }
1716
1717 // share past_intervals if we are creating the pg on the replica
1718 // based on whether our info for that peer was dne() *before*
1719 // updating pi.history in the backfill block above.
1720 if (m && needs_past_intervals)
1721 m->past_intervals = past_intervals;
1722
1723 // update local version of peer's missing list!
1724 if (m && pi.last_backfill != hobject_t()) {
1725 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1726 p != m->log.log.end();
1727 ++p)
1728 if (p->soid <= pi.last_backfill &&
1729 !p->is_error())
1730 pm.add_next_event(*p);
1731 }
1732
1733 if (m) {
1734 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1735 //m->log.print(cout);
1736 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1737 }
1738
1739 // peer now has
1740 pi.last_update = info.last_update;
1741
1742 // update our missing
1743 if (pm.num_missing() == 0) {
1744 pi.last_complete = pi.last_update;
1745 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1746 } else {
1747 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1748 }
1749 }
1750
1751 // Set up missing_loc
1752 set<pg_shard_t> complete_shards;
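// a shard counts as complete if it has no missing objects and, for
// non-primary shards, is fully backfilled (last_backfill is max)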
1753 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1754 i != actingbackfill.end();
1755 ++i) {
1756 if (*i == get_primary()) {
1757 missing_loc.add_active_missing(missing);
1758 if (!missing.have_missing())
1759 complete_shards.insert(*i);
1760 } else {
1761 auto peer_missing_entry = peer_missing.find(*i);
1762 assert(peer_missing_entry != peer_missing.end());
1763 missing_loc.add_active_missing(peer_missing_entry->second);
1764 if (!peer_missing_entry->second.have_missing() &&
1765 peer_info[*i].last_backfill.is_max())
1766 complete_shards.insert(*i);
1767 }
1768 }
1769 // If necessary, create might_have_unfound to help us find our unfound objects.
1770 // NOTE: It's important that we build might_have_unfound before trimming the
1771 // past intervals.
1772 might_have_unfound.clear();
1773 if (needs_recovery()) {
1774 // If only one shard has missing, we use a trick and add all the others as
1775 // recovery sources; this is considered safe since the PGLogs have been merged
1776 // locally, and it covers the vast majority of use cases, such as one OSD/host
1777 // being down for a while for hardware repair.
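      // Illustrative example (not in the original source): if only the primary
      // has missing objects and every other shard in actingbackfill is complete,
      // then complete_shards.size() + 1 == actingbackfill.size(), so all the
      // complete shards can be registered as recovery sources in a single batch
      // below instead of being added one by one.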
1778 if (complete_shards.size() + 1 == actingbackfill.size()) {
1779 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1780 } else {
1781 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1782 ctx->handle);
1783 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1784 i != actingbackfill.end();
1785 ++i) {
1786 if (*i == pg_whoami) continue;
1787 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1788 assert(peer_missing.count(*i));
1789 assert(peer_info.count(*i));
1790 missing_loc.add_source_info(
1791 *i,
1792 peer_info[*i],
1793 peer_missing[*i],
1794 ctx->handle);
1795 }
1796 }
1797 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1798 i != peer_missing.end();
1799 ++i) {
1800 if (is_actingbackfill(i->first))
1801 continue;
1802 assert(peer_info.count(i->first));
1803 search_for_missing(
1804 peer_info[i->first],
1805 i->second,
1806 i->first,
1807 ctx);
1808 }
1809
1810 build_might_have_unfound();
1811
1812 state_set(PG_STATE_DEGRADED);
1813 if (have_unfound())
1814 discover_all_missing(query_map);
1815 }
1816
1817 // degraded?
1818 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1819 state_set(PG_STATE_DEGRADED);
1820 state_set(PG_STATE_UNDERSIZED);
1821 }
1822
1823 state_set(PG_STATE_ACTIVATING);
1824 release_pg_backoffs();
1825 projected_last_update = info.last_update;
1826 }
1827 if (acting.size() >= pool.info.min_size) {
1828 PGLogEntryHandler handler{this, &t};
1829 pg_log.roll_forward(&handler);
1830 }
1831 }
1832
1833 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1834 {
1835 // only check MOSDOp
1836 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1837 return true;
1838
1839 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1840
1841 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1842 if (!session) {
1843 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1844 return false;
1845 }
1846 OSDCap& caps = session->caps;
1847 session->put();
1848
1849 const string &key = req->get_hobj().get_key().empty() ?
1850 req->get_oid().name :
1851 req->get_hobj().get_key();
1852
1853 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1854 pool.auid, key,
1855 op->need_read_cap(),
1856 op->need_write_cap(),
1857 op->classes());
1858
1859 dout(20) << "op_has_sufficient_caps pool=" << pool.id << " (" << pool.name
1860 << " " << req->get_hobj().nspace
1861 << ") owner=" << pool.auid
1862 << " need_read_cap=" << op->need_read_cap()
1863 << " need_write_cap=" << op->need_write_cap()
1864 << " classes=" << op->classes()
1865 << " -> " << (cap ? "yes" : "NO")
1866 << dendl;
1867 return cap;
1868 }
1869
1870 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1871 {
1872 lock();
1873 if (pg_has_reset_since(epoch)) {
1874 dout(10) << "_activate_committed " << epoch
1875 << ", that was an old interval" << dendl;
1876 } else if (is_primary()) {
1877 peer_activated.insert(pg_whoami);
1878 dout(10) << "_activate_committed " << epoch
1879 << " peer_activated now " << peer_activated
1880 << " last_interval_started " << info.history.last_interval_started
1881 << " last_epoch_started " << info.history.last_epoch_started
1882 << " same_interval_since " << info.history.same_interval_since << dendl;
1883 assert(!actingbackfill.empty());
1884 if (peer_activated.size() == actingbackfill.size())
1885 all_activated_and_committed();
1886 } else {
1887 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1888 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1889 pg_notify_t i = pg_notify_t(
1890 get_primary().shard, pg_whoami.shard,
1891 get_osdmap()->get_epoch(),
1892 get_osdmap()->get_epoch(),
1893 info);
1894
1895 i.info.history.last_epoch_started = activation_epoch;
1896 i.info.history.last_interval_started = i.info.history.same_interval_since;
1897 if (acting.size() >= pool.info.min_size) {
1898 state_set(PG_STATE_ACTIVE);
1899 } else {
1900 state_set(PG_STATE_PEERED);
1901 }
1902
1903 m->pg_list.push_back(make_pair(i, PastIntervals()));
1904 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1905
1906 // waiters
1907 if (flushes_in_progress == 0) {
1908 requeue_ops(waiting_for_peered);
1909 }
1910 }
1911
1912 assert(!dirty_info);
1913
1914 unlock();
1915 }
1916
1917 /*
1918 * update info.history.last_epoch_started ONLY after we and all
1919 * replicas have activated AND committed the activate transaction
1920 * (i.e. the peering results are stable on disk).
1921 */
1922 void PG::all_activated_and_committed()
1923 {
1924 dout(10) << "all_activated_and_committed" << dendl;
1925 assert(is_primary());
1926 assert(peer_activated.size() == actingbackfill.size());
1927 assert(!actingbackfill.empty());
1928 assert(blocked_by.empty());
1929
1930 queue_peering_event(
1931 CephPeeringEvtRef(
1932 std::make_shared<CephPeeringEvt>(
1933 get_osdmap()->get_epoch(),
1934 get_osdmap()->get_epoch(),
1935 AllReplicasActivated())));
1936 }
1937
1938 bool PG::requeue_scrub(bool high_priority)
1939 {
1940 assert(is_locked());
1941 if (scrub_queued) {
1942 dout(10) << __func__ << ": already queued" << dendl;
1943 return false;
1944 } else {
1945 dout(10) << __func__ << ": queueing" << dendl;
1946 scrub_queued = true;
1947 osd->queue_for_scrub(this, high_priority);
1948 return true;
1949 }
1950 }
1951
1952 void PG::queue_recovery(bool front)
1953 {
1954 if (!is_primary() || !is_peered()) {
1955 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1956 assert(!recovery_queued);
1957 } else if (recovery_queued) {
1958 dout(10) << "queue_recovery -- already queued" << dendl;
1959 } else {
1960 dout(10) << "queue_recovery -- queuing" << dendl;
1961 recovery_queued = true;
1962 osd->queue_for_recovery(this, front);
1963 }
1964 }
1965
1966 bool PG::queue_scrub()
1967 {
1968 assert(is_locked());
1969 if (is_scrubbing()) {
1970 return false;
1971 }
1972 scrubber.priority = scrubber.must_scrub ?
1973 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
1974 scrubber.must_scrub = false;
1975 state_set(PG_STATE_SCRUBBING);
1976 if (scrubber.must_deep_scrub) {
1977 state_set(PG_STATE_DEEP_SCRUB);
1978 scrubber.must_deep_scrub = false;
1979 }
1980 if (scrubber.must_repair || scrubber.auto_repair) {
1981 state_set(PG_STATE_REPAIR);
1982 scrubber.must_repair = false;
1983 }
1984 requeue_scrub();
1985 return true;
1986 }
1987
1988 unsigned PG::get_scrub_priority()
1989 {
1990 // a higher value -> a higher priority
1991 int pool_scrub_priority = 0;
1992 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
1993 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
1994 }
1995
1996 struct C_PG_FinishRecovery : public Context {
1997 PGRef pg;
1998 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1999 void finish(int r) override {
2000 pg->_finish_recovery(this);
2001 }
2002 };
2003
2004 void PG::mark_clean()
2005 {
2006 // only mark CLEAN if we have the desired number of replicas AND we
2007 // are not remapped.
2008 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
2009 up == acting)
2010 state_set(PG_STATE_CLEAN);
2011
2012 // NOTE: this is actually a bit premature: we haven't purged the
2013 // strays yet.
2014 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2015 info.history.last_interval_clean = info.history.same_interval_since;
2016
2017 past_intervals.clear();
2018 dirty_big_info = true;
2019
2020 if (is_active()) {
2021 /* The check is needed because if we are below min_size we're not
2022 * actually active */
2023 kick_snap_trim();
2024 }
2025
2026 dirty_info = true;
2027 }
2028
2029 unsigned PG::get_recovery_priority()
2030 {
2031 // a higher value -> a higher priority
2032
2033 int pool_recovery_priority = 0;
2034 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2035
2036 int ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
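    // Illustrative example (not in the original source): a pool with
    // recovery_priority set to 5 yields OSD_RECOVERY_PRIORITY_BASE + 5 here,
    // which the clamp below keeps within
    // [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX].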
2037
2038 // Clamp to valid range
2039 if (ret > OSD_RECOVERY_PRIORITY_MAX) {
2040 ret = OSD_RECOVERY_PRIORITY_MAX;
2041 } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
2042 ret = OSD_RECOVERY_PRIORITY_MIN;
2043 }
2044
2045 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2046 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2047
2048 return static_cast<unsigned>(ret);
2049 }
2050
2051 unsigned PG::get_backfill_priority()
2052 {
2053 // a higher value -> a higher priority
2054
2055 int ret = OSD_BACKFILL_PRIORITY_BASE;
2056 if (acting.size() < pool.info.min_size) {
2057 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2058 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2059
2060 } else if (is_undersized()) {
2061 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2062 assert(pool.info.size > actingset.size());
2063 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2064
2065 } else if (is_degraded()) {
2066 // degraded: baseline degraded
2067 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2068 }
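    // Illustrative example (not in the original source): with pool size 3 and
    // actingset.size() == 2 the PG is undersized, so ret starts at
    // OSD_BACKFILL_DEGRADED_PRIORITY_BASE + 1 before the pool adjustment and
    // clamping below.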
2069
2070 // Adjust with pool's recovery priority
2071 int pool_recovery_priority = 0;
2072 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2073 ret += pool_recovery_priority;
2074
2075 // Clamp to valid range
2076 if (ret > OSD_RECOVERY_PRIORITY_MAX) {
2077 ret = OSD_RECOVERY_PRIORITY_MAX;
2078 } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
2079 ret = OSD_RECOVERY_PRIORITY_MIN;
2080 }
2081
2082 return static_cast<unsigned>(ret);
2083 }
2084
2085 void PG::finish_recovery(list<Context*>& tfin)
2086 {
2087 dout(10) << "finish_recovery" << dendl;
2088 assert(info.last_complete == info.last_update);
2089
2090 clear_recovery_state();
2091
2092 /*
2093 * sync all this before purging strays. but don't block!
2094 */
2095 finish_sync_event = new C_PG_FinishRecovery(this);
2096 tfin.push_back(finish_sync_event);
2097 }
2098
2099 void PG::_finish_recovery(Context *c)
2100 {
2101 lock();
2102 if (deleting) {
2103 unlock();
2104 return;
2105 }
2106 if (c == finish_sync_event) {
2107 dout(10) << "_finish_recovery" << dendl;
2108 finish_sync_event = 0;
2109 purge_strays();
2110
2111 publish_stats_to_osd();
2112
2113 if (scrub_after_recovery) {
2114 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2115 scrub_after_recovery = false;
2116 scrubber.must_deep_scrub = true;
2117 queue_scrub();
2118 }
2119 } else {
2120 dout(10) << "_finish_recovery -- stale" << dendl;
2121 }
2122 unlock();
2123 }
2124
2125 void PG::start_recovery_op(const hobject_t& soid)
2126 {
2127 dout(10) << "start_recovery_op " << soid
2128 #ifdef DEBUG_RECOVERY_OIDS
2129 << " (" << recovering_oids << ")"
2130 #endif
2131 << dendl;
2132 assert(recovery_ops_active >= 0);
2133 recovery_ops_active++;
2134 #ifdef DEBUG_RECOVERY_OIDS
2135 assert(recovering_oids.count(soid) == 0);
2136 recovering_oids.insert(soid);
2137 #endif
2138 osd->start_recovery_op(this, soid);
2139 }
2140
2141 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2142 {
2143 dout(10) << "finish_recovery_op " << soid
2144 #ifdef DEBUG_RECOVERY_OIDS
2145 << " (" << recovering_oids << ")"
2146 #endif
2147 << dendl;
2148 assert(recovery_ops_active > 0);
2149 recovery_ops_active--;
2150 #ifdef DEBUG_RECOVERY_OIDS
2151 assert(recovering_oids.count(soid));
2152 recovering_oids.erase(soid);
2153 #endif
2154 osd->finish_recovery_op(this, soid, dequeue);
2155
2156 if (!dequeue) {
2157 queue_recovery();
2158 }
2159 }
2160
2161 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2162 {
2163 child->update_snap_mapper_bits(split_bits);
2164 child->update_osdmap_ref(get_osdmap());
2165
2166 child->pool = pool;
2167
2168 // Log
2169 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2170 child->info.last_complete = info.last_complete;
2171
2172 info.last_update = pg_log.get_head();
2173 child->info.last_update = child->pg_log.get_head();
2174
2175 child->info.last_user_version = info.last_user_version;
2176
2177 info.log_tail = pg_log.get_tail();
2178 child->info.log_tail = child->pg_log.get_tail();
2179
2180 if (info.last_complete < pg_log.get_tail())
2181 info.last_complete = pg_log.get_tail();
2182 if (child->info.last_complete < child->pg_log.get_tail())
2183 child->info.last_complete = child->pg_log.get_tail();
2184
2185 // Info
2186 child->info.history = info.history;
2187 child->info.history.epoch_created = get_osdmap()->get_epoch();
2188 child->info.purged_snaps = info.purged_snaps;
2189
2190 if (info.last_backfill.is_max()) {
2191 child->info.set_last_backfill(hobject_t::get_max());
2192 } else {
2193 // restart backfill on parent and child to be safe. we could
2194 // probably do better in the bitwise sort case, but it's more
2195 // fragile (there may be special work to do on backfill completion
2196 // in the future).
2197 info.set_last_backfill(hobject_t());
2198 child->info.set_last_backfill(hobject_t());
2199 }
2200
2201 child->info.stats = info.stats;
2202 child->info.stats.parent_split_bits = split_bits;
2203 info.stats.stats_invalid = true;
2204 child->info.stats.stats_invalid = true;
2205 child->info.last_epoch_started = info.last_epoch_started;
2206 child->info.last_interval_started = info.last_interval_started;
2207
2208 child->snap_trimq = snap_trimq;
2209
2210 // There can't be recovery/backfill going on now
2211 int primary, up_primary;
2212 vector<int> newup, newacting;
2213 get_osdmap()->pg_to_up_acting_osds(
2214 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2215 child->init_primary_up_acting(
2216 newup,
2217 newacting,
2218 up_primary,
2219 primary);
2220 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2221
2222 // this comparison includes primary rank via pg_shard_t
2223 if (get_primary() != child->get_primary())
2224 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2225
2226 child->info.stats.up = up;
2227 child->info.stats.up_primary = up_primary;
2228 child->info.stats.acting = acting;
2229 child->info.stats.acting_primary = primary;
2230 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2231
2232 // History
2233 child->past_intervals = past_intervals;
2234
2235 _split_into(child_pgid, child, split_bits);
2236
2237 // release all backoffs for simplicity
2238 release_backoffs(hobject_t(), hobject_t::get_max());
2239
2240 child->on_new_interval();
2241
2242 child->dirty_info = true;
2243 child->dirty_big_info = true;
2244 dirty_info = true;
2245 dirty_big_info = true;
2246 }
2247
2248 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2249 {
2250 ConnectionRef con = s->con;
2251 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2252 return;
2253 BackoffRef b(s->have_backoff(info.pgid, begin));
2254 if (b) {
2255 derr << __func__ << " already have backoff for " << s << " begin " << begin
2256 << " " << *b << dendl;
2257 ceph_abort();
2258 }
2259 Mutex::Locker l(backoff_lock);
2260 {
2261 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2262 backoffs[begin].insert(b);
2263 s->add_backoff(b);
2264 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2265 }
2266 con->send_message(
2267 new MOSDBackoff(
2268 info.pgid,
2269 get_osdmap()->get_epoch(),
2270 CEPH_OSD_BACKOFF_OP_BLOCK,
2271 b->id,
2272 begin,
2273 end));
2274 }
2275
2276 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2277 {
2278 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2279 vector<BackoffRef> bv;
2280 {
2281 Mutex::Locker l(backoff_lock);
2282 auto p = backoffs.lower_bound(begin);
2283 while (p != backoffs.end()) {
2284 int r = cmp(p->first, end);
2285 dout(20) << __func__ << " ? " << r << " " << p->first
2286 << " " << p->second << dendl;
2287 // note: must still examine begin=end=p->first case
2288 if (r > 0 || (r == 0 && begin < end)) {
2289 break;
2290 }
2291 dout(20) << __func__ << " checking " << p->first
2292 << " " << p->second << dendl;
2293 auto q = p->second.begin();
2294 while (q != p->second.end()) {
2295 dout(20) << __func__ << " checking " << *q << dendl;
2296 int r = cmp((*q)->begin, begin);
2297 if (r == 0 || (r > 0 && (*q)->end < end)) {
2298 bv.push_back(*q);
2299 q = p->second.erase(q);
2300 } else {
2301 ++q;
2302 }
2303 }
2304 if (p->second.empty()) {
2305 p = backoffs.erase(p);
2306 } else {
2307 ++p;
2308 }
2309 }
2310 }
2311 for (auto b : bv) {
2312 Mutex::Locker l(b->lock);
2313 dout(10) << __func__ << " " << *b << dendl;
2314 if (b->session) {
2315 assert(b->pg == this);
2316 ConnectionRef con = b->session->con;
2317 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2318 con->send_message(
2319 new MOSDBackoff(
2320 info.pgid,
2321 get_osdmap()->get_epoch(),
2322 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2323 b->id,
2324 b->begin,
2325 b->end));
2326 }
2327 if (b->is_new()) {
2328 b->state = Backoff::STATE_DELETING;
2329 } else {
2330 b->session->rm_backoff(b);
2331 b->session.reset();
2332 }
2333 b->pg.reset();
2334 }
2335 }
2336 }
2337
2338 void PG::clear_backoffs()
2339 {
2340 dout(10) << __func__ << " " << dendl;
2341 map<hobject_t,set<BackoffRef>> ls;
2342 {
2343 Mutex::Locker l(backoff_lock);
2344 ls.swap(backoffs);
2345 }
2346 for (auto& p : ls) {
2347 for (auto& b : p.second) {
2348 Mutex::Locker l(b->lock);
2349 dout(10) << __func__ << " " << *b << dendl;
2350 if (b->session) {
2351 assert(b->pg == this);
2352 if (b->is_new()) {
2353 b->state = Backoff::STATE_DELETING;
2354 } else {
2355 b->session->rm_backoff(b);
2356 b->session.reset();
2357 }
2358 b->pg.reset();
2359 }
2360 }
2361 }
2362 }
2363
2364 // called by Session::clear_backoffs()
2365 void PG::rm_backoff(BackoffRef b)
2366 {
2367 dout(10) << __func__ << " " << *b << dendl;
2368 Mutex::Locker l(backoff_lock);
2369 assert(b->lock.is_locked_by_me());
2370 assert(b->pg == this);
2371 auto p = backoffs.find(b->begin);
2372 // may race with release_backoffs()
2373 if (p != backoffs.end()) {
2374 auto q = p->second.find(b);
2375 if (q != p->second.end()) {
2376 p->second.erase(q);
2377 if (p->second.empty()) {
2378 backoffs.erase(p);
2379 }
2380 }
2381 }
2382 }
2383
2384 void PG::clear_recovery_state()
2385 {
2386 dout(10) << "clear_recovery_state" << dendl;
2387
2388 pg_log.reset_recovery_pointers();
2389 finish_sync_event = 0;
2390
2391 hobject_t soid;
2392 while (recovery_ops_active > 0) {
2393 #ifdef DEBUG_RECOVERY_OIDS
2394 soid = *recovering_oids.begin();
2395 #endif
2396 finish_recovery_op(soid, true);
2397 }
2398
2399 backfill_targets.clear();
2400 backfill_info.clear();
2401 peer_backfill_info.clear();
2402 waiting_on_backfill.clear();
2403 _clear_recovery_state(); // pg impl specific hook
2404 }
2405
2406 void PG::cancel_recovery()
2407 {
2408 dout(10) << "cancel_recovery" << dendl;
2409 clear_recovery_state();
2410 }
2411
2412
2413 void PG::purge_strays()
2414 {
2415 dout(10) << "purge_strays " << stray_set << dendl;
2416
2417 bool removed = false;
2418 for (set<pg_shard_t>::iterator p = stray_set.begin();
2419 p != stray_set.end();
2420 ++p) {
2421 assert(!is_actingbackfill(*p));
2422 if (get_osdmap()->is_up(p->osd)) {
2423 dout(10) << "sending PGRemove to osd." << *p << dendl;
2424 vector<spg_t> to_remove;
2425 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2426 MOSDPGRemove *m = new MOSDPGRemove(
2427 get_osdmap()->get_epoch(),
2428 to_remove);
2429 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2430 } else {
2431 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2432 }
2433 peer_missing.erase(*p);
2434 peer_info.erase(*p);
2435 peer_purged.insert(*p);
2436 removed = true;
2437 }
2438
2439 // if we removed anyone, update heartbeat peers (which are derived in part from peer_info)
2440 if (removed)
2441 update_heartbeat_peers();
2442
2443 stray_set.clear();
2444
2445 // clear _requested maps; we may have to peer() again if we discover
2446 // (more) stray content
2447 peer_log_requested.clear();
2448 peer_missing_requested.clear();
2449 }
2450
2451 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2452 {
2453 Mutex::Locker l(heartbeat_peer_lock);
2454 probe_targets.clear();
2455 for (set<pg_shard_t>::iterator i = probe_set.begin();
2456 i != probe_set.end();
2457 ++i) {
2458 probe_targets.insert(i->osd);
2459 }
2460 }
2461
2462 void PG::clear_probe_targets()
2463 {
2464 Mutex::Locker l(heartbeat_peer_lock);
2465 probe_targets.clear();
2466 }
2467
2468 void PG::update_heartbeat_peers()
2469 {
2470 assert(is_locked());
2471
2472 if (!is_primary())
2473 return;
2474
2475 set<int> new_peers;
2476 for (unsigned i=0; i<acting.size(); i++) {
2477 if (acting[i] != CRUSH_ITEM_NONE)
2478 new_peers.insert(acting[i]);
2479 }
2480 for (unsigned i=0; i<up.size(); i++) {
2481 if (up[i] != CRUSH_ITEM_NONE)
2482 new_peers.insert(up[i]);
2483 }
2484 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2485 p != peer_info.end();
2486 ++p)
2487 new_peers.insert(p->first.osd);
2488
2489 bool need_update = false;
2490 heartbeat_peer_lock.Lock();
2491 if (new_peers == heartbeat_peers) {
2492 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2493 } else {
2494 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2495 heartbeat_peers.swap(new_peers);
2496 need_update = true;
2497 }
2498 heartbeat_peer_lock.Unlock();
2499
2500 if (need_update)
2501 osd->need_heartbeat_peer_update();
2502 }
2503
2504
2505 bool PG::check_in_progress_op(
2506 const osd_reqid_t &r,
2507 eversion_t *version,
2508 version_t *user_version,
2509 int *return_code) const
2510 {
2511 return (
2512 projected_log.get_request(r, version, user_version, return_code) ||
2513 pg_log.get_log().get_request(r, version, user_version, return_code));
2514 }
2515
2516 void PG::_update_calc_stats()
2517 {
2518 info.stats.version = info.last_update;
2519 info.stats.created = info.history.epoch_created;
2520 info.stats.last_scrub = info.history.last_scrub;
2521 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2522 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2523 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2524 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2525 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2526
2527 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2528 info.stats.ondisk_log_size = info.stats.log_size;
2529 info.stats.log_start = pg_log.get_tail();
2530 info.stats.ondisk_log_start = pg_log.get_tail();
2531
2532 // If actingset is larger than upset we will have misplaced objects,
2533 // so we will report based on actingset size.
2534
2535 // If upset is larger, then we will have degraded objects,
2536 // so we will report based on upset size.
2537
2538 // If target is the largest of them all, it will contribute to
2539 // the degraded count because num_object_copies is
2540 // computed using target and eventually used to get the degraded total.
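    // Rough illustrative example (not in the original source): for a 3-replica
    // pool with 100 objects where one acting replica is missing 10 of them and
    // nothing is backfilling, num_object_copies = 300 and object_copies ~= 290,
    // so num_objects_degraded comes out to ~10 and num_objects_misplaced stays 0.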
2541
2542 unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
2543 unsigned nrep = MAX(actingset.size(), upset.size());
2544 // calc num_object_copies
2545 info.stats.stats.calc_copies(MAX(target, nrep));
2546 info.stats.stats.sum.num_objects_degraded = 0;
2547 info.stats.stats.sum.num_objects_unfound = 0;
2548 info.stats.stats.sum.num_objects_misplaced = 0;
2549 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2550 // NOTE: we only generate copies, degraded, misplaced and unfound
2551 // values for the summation, not individual stat categories.
2552 int64_t num_objects = info.stats.stats.sum.num_objects;
2553
2554 // Total sum of all missing
2555 int64_t missing = 0;
2556 // Objects that have already been backfilled to up OSDs (not in acting)
2557 int64_t backfilled = 0;
2558 // A misplaced object is not stored on the correct OSD
2559 int64_t misplaced = 0;
2560 // Total of object copies/shards found
2561 int64_t object_copies = 0;
2562
2563 // num_objects_missing on each peer
2564 for (map<pg_shard_t, pg_info_t>::iterator pi =
2565 peer_info.begin();
2566 pi != peer_info.end();
2567 ++pi) {
2568 map<pg_shard_t, pg_missing_t>::const_iterator pm =
2569 peer_missing.find(pi->first);
2570 if (pm != peer_missing.end()) {
2571 pi->second.stats.stats.sum.num_objects_missing =
2572 pm->second.num_missing();
2573 }
2574 }
2575
2576 assert(!actingbackfill.empty());
2577 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
2578 i != actingbackfill.end();
2579 ++i) {
2580 const pg_shard_t &p = *i;
2581
2582 bool in_up = (upset.find(p) != upset.end());
2583 bool in_acting = (actingset.find(p) != actingset.end());
2584 assert(in_up || in_acting);
2585
2586 // in acting:               compute total objects excluding num_missing
2587 // in acting and not in up: compute misplaced objects excluding num_missing
2588 // in up and not in acting: compute total objects already backfilled
2589 if (in_acting) {
2590 unsigned osd_missing;
2591 // primary handling
2592 if (p == pg_whoami) {
2593 osd_missing = pg_log.get_missing().num_missing();
2594 info.stats.stats.sum.num_objects_missing_on_primary =
2595 osd_missing;
2596 object_copies += num_objects; // My local (primary) count
2597 } else {
2598 assert(peer_missing.count(p));
2599 osd_missing = peer_missing[p].num_missing();
2600 object_copies += peer_info[p].stats.stats.sum.num_objects;
2601 }
2602 missing += osd_missing;
2603 // Count non-missing objects not in up as misplaced
2604 if (!in_up && num_objects > osd_missing)
2605 misplaced += num_objects - osd_missing;
2606 } else {
2607 assert(in_up && !in_acting);
2608
2609 // If this peer has more objects than it should, ignore them
2610 backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
2611 }
2612 }
2613
2614 // Any objects that have been backfilled to up OSDs can be deducted from misplaced
2615 misplaced = MAX(0, misplaced - backfilled);
2616
2617 // Deduct computed total missing on acting nodes
2618 object_copies -= missing;
2619 // Include computed backfilled objects on up nodes
2620 object_copies += backfilled;
2621 // a degraded object has fewer replicas or EC shards than the
2622 // pool specifies. num_object_copies will never be smaller than target * num_copies.
2623 int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
2624
2625 info.stats.stats.sum.num_objects_degraded = degraded;
2626 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2627 info.stats.stats.sum.num_objects_misplaced = misplaced;
2628 }
2629 }
2630
2631 void PG::_update_blocked_by()
2632 {
2633 // set a max on the number of blocking peers we report. if we go
2634 // over, report a random subset. keep the result sorted.
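    // The single pass below skips each element with probability
    // skip/(skip+keep) among the remaining entries (selection sampling), which
    // yields a uniformly random subset of size 'keep' in sorted order.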
2635 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2636 unsigned skip = blocked_by.size() - keep;
2637 info.stats.blocked_by.clear();
2638 info.stats.blocked_by.resize(keep);
2639 unsigned pos = 0;
2640 for (set<int>::iterator p = blocked_by.begin();
2641 p != blocked_by.end() && keep > 0;
2642 ++p) {
2643 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2644 --skip;
2645 } else {
2646 info.stats.blocked_by[pos++] = *p;
2647 --keep;
2648 }
2649 }
2650 }
2651
2652 void PG::publish_stats_to_osd()
2653 {
2654 if (!is_primary())
2655 return;
2656
2657 pg_stats_publish_lock.Lock();
2658
2659 if (info.stats.stats.sum.num_scrub_errors)
2660 state_set(PG_STATE_INCONSISTENT);
2661 else
2662 state_clear(PG_STATE_INCONSISTENT);
2663
2664 utime_t now = ceph_clock_now();
2665 if (info.stats.state != state) {
2666 info.stats.last_change = now;
2667 // Optimistic estimation: if we just found out a PG is inactive,
2668 // assume it was active until now.
2669 if (!(state & PG_STATE_ACTIVE) &&
2670 (info.stats.state & PG_STATE_ACTIVE))
2671 info.stats.last_active = now;
2672
2673 if ((state & PG_STATE_ACTIVE) &&
2674 !(info.stats.state & PG_STATE_ACTIVE))
2675 info.stats.last_became_active = now;
2676 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2677 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2678 info.stats.last_became_peered = now;
2679 if (!(state & PG_STATE_CREATING) &&
2680 (info.stats.state & PG_STATE_CREATING)) {
2681 osd->send_pg_created(get_pgid().pgid);
2682 }
2683 info.stats.state = state;
2684 }
2685
2686 _update_calc_stats();
2687 _update_blocked_by();
2688
2689 bool publish = false;
2690 pg_stat_t pre_publish = info.stats;
2691 pre_publish.stats.add(unstable_stats);
2692 utime_t cutoff = now;
2693 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
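    // Re-publish only if nothing has been published yet, the stats actually
    // changed, or more than osd_pg_stat_report_interval_max seconds have
    // passed since last_fresh; otherwise the condition below skips the update.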
2694 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2695 info.stats.last_fresh > cutoff) {
2696 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2697 << ": no change since " << info.stats.last_fresh << dendl;
2698 } else {
2699 // update our stat summary and timestamps
2700 info.stats.reported_epoch = get_osdmap()->get_epoch();
2701 ++info.stats.reported_seq;
2702
2703 info.stats.last_fresh = now;
2704
2705 if (info.stats.state & PG_STATE_CLEAN)
2706 info.stats.last_clean = now;
2707 if (info.stats.state & PG_STATE_ACTIVE)
2708 info.stats.last_active = now;
2709 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2710 info.stats.last_peered = now;
2711 info.stats.last_unstale = now;
2712 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2713 info.stats.last_undegraded = now;
2714 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2715 info.stats.last_fullsized = now;
2716
2717 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2718 // care of this by sending MMonMgrReport to mon.
2719 publish =
2720 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2721 pg_stats_publish_valid = true;
2722 pg_stats_publish = pre_publish;
2723
2724 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2725 << ":" << pg_stats_publish.reported_seq << dendl;
2726 }
2727 pg_stats_publish_lock.Unlock();
2728
2729 if (publish)
2730 osd->pg_stat_queue_enqueue(this);
2731 }
2732
2733 void PG::clear_publish_stats()
2734 {
2735 dout(15) << "clear_stats" << dendl;
2736 pg_stats_publish_lock.Lock();
2737 pg_stats_publish_valid = false;
2738 pg_stats_publish_lock.Unlock();
2739
2740 osd->pg_stat_queue_dequeue(this);
2741 }
2742
2743 /**
2744 * initialize a newly instantiated pg
2745 *
2746 * Initialize PG state, as when a PG is initially created, or when it
2747 * is first instantiated on the current node.
2748 *
2749 * @param role our role/rank
2750 * @param newup up set
2751 * @param newacting acting set
2752 * @param history pg history
2753 * @param pi past_intervals
2754 * @param backfill true if info should be marked as backfill
2755 * @param t transaction to write out our new state in
2756 */
2757 void PG::init(
2758 int role,
2759 const vector<int>& newup, int new_up_primary,
2760 const vector<int>& newacting, int new_acting_primary,
2761 const pg_history_t& history,
2762 const PastIntervals& pi,
2763 bool backfill,
2764 ObjectStore::Transaction *t)
2765 {
2766 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2767 << " history " << history
2768 << " past_intervals " << pi
2769 << dendl;
2770
2771 set_role(role);
2772 acting = newacting;
2773 up = newup;
2774 init_primary_up_acting(
2775 newup,
2776 newacting,
2777 new_up_primary,
2778 new_acting_primary);
2779
2780 info.history = history;
2781 past_intervals = pi;
2782
2783 info.stats.up = up;
2784 info.stats.up_primary = new_up_primary;
2785 info.stats.acting = acting;
2786 info.stats.acting_primary = new_acting_primary;
2787 info.stats.mapping_epoch = info.history.same_interval_since;
2788
2789 if (backfill) {
2790 dout(10) << __func__ << ": Setting backfill" << dendl;
2791 info.set_last_backfill(hobject_t());
2792 info.last_complete = info.last_update;
2793 pg_log.mark_log_for_rewrite();
2794 }
2795
2796 on_new_interval();
2797
2798 dirty_info = true;
2799 dirty_big_info = true;
2800 write_if_dirty(*t);
2801 }
2802
2803 #pragma GCC diagnostic ignored "-Wpragmas"
2804 #pragma GCC diagnostic push
2805 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2806
2807 void PG::upgrade(ObjectStore *store)
2808 {
2809 assert(info_struct_v <= 10);
2810 ObjectStore::Transaction t;
2811
2812 assert(info_struct_v >= 7);
2813
2814 // 7 -> 8
2815 if (info_struct_v <= 7) {
2816 pg_log.mark_log_for_rewrite();
2817 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2818 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2819 t.remove(coll_t::meta(), log_oid);
2820 t.remove(coll_t::meta(), biginfo_oid);
2821 t.touch(coll, pgmeta_oid);
2822 }
2823
2824 // 8 -> 9
2825 if (info_struct_v <= 8) {
2826 // no special action needed.
2827 }
2828
2829 // 9 -> 10
2830 if (info_struct_v <= 9) {
2831 // previous versions weren't (as) aggressively clearing past_intervals
2832 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2833 dout(20) << __func__ << " clearing past_intervals" << dendl;
2834 past_intervals.clear();
2835 }
2836 }
2837
2838 // update infover_key
2839 if (info_struct_v < cur_struct_v) {
2840 map<string,bufferlist> v;
2841 __u8 ver = cur_struct_v;
2842 ::encode(ver, v[infover_key]);
2843 t.omap_setkeys(coll, pgmeta_oid, v);
2844 }
2845
2846 dirty_info = true;
2847 dirty_big_info = true;
2848 write_if_dirty(t);
2849
2850 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2851 ObjectStore::Sequencer>("upgrade"));
2852 int r = store->apply_transaction(osr.get(), std::move(t));
2853 if (r != 0) {
2854 derr << __func__ << ": apply_transaction returned "
2855 << cpp_strerror(r) << dendl;
2856 ceph_abort();
2857 }
2858 assert(r == 0);
2859
2860 C_SaferCond waiter;
2861 if (!osr->flush_commit(&waiter)) {
2862 waiter.wait();
2863 }
2864 }
2865
2866 #pragma GCC diagnostic pop
2867 #pragma GCC diagnostic warning "-Wpragmas"
2868
2869 int PG::_prepare_write_info(CephContext* cct,
2870 map<string,bufferlist> *km,
2871 epoch_t epoch,
2872 pg_info_t &info, pg_info_t &last_written_info,
2873 PastIntervals &past_intervals,
2874 bool dirty_big_info,
2875 bool dirty_epoch,
2876 bool try_fast_info,
2877 PerfCounters *logger)
2878 {
2879 if (dirty_epoch) {
2880 ::encode(epoch, (*km)[epoch_key]);
2881 }
2882
2883 if (logger)
2884 logger->inc(l_osd_pg_info);
2885
2886 // try to do info efficiently?
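    // Fast path: when only the frequently-changing fields moved (and nothing
    // "big" is dirty), encode a compact pg_fast_info_t delta instead of the
    // full info; if applying that delta to last_written_info does not
    // reproduce the current info, fall through to the full encode below.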
2887 if (!dirty_big_info && try_fast_info &&
2888 info.last_update > last_written_info.last_update) {
2889 pg_fast_info_t fast;
2890 fast.populate_from(info);
2891 bool did = fast.try_apply_to(&last_written_info);
2892 assert(did); // we verified last_update increased above
2893 if (info == last_written_info) {
2894 ::encode(fast, (*km)[fastinfo_key]);
2895 if (logger)
2896 logger->inc(l_osd_pg_fastinfo);
2897 return 0;
2898 }
2899 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
2900 {
2901 JSONFormatter jf(true);
2902 jf.dump_object("info", info);
2903 jf.flush(*_dout);
2904 }
2905 {
2906 *_dout << "\nlast_written_info:\n";
2907 JSONFormatter jf(true);
2908 jf.dump_object("last_written_info", last_written_info);
2909 jf.flush(*_dout);
2910 }
2911 *_dout << dendl;
2912 }
2913 last_written_info = info;
2914
2915 // info. store purged_snaps separately.
2916 interval_set<snapid_t> purged_snaps;
2917 purged_snaps.swap(info.purged_snaps);
2918 ::encode(info, (*km)[info_key]);
2919 purged_snaps.swap(info.purged_snaps);
2920
2921 if (dirty_big_info) {
2922 // potentially big stuff
2923 bufferlist& bigbl = (*km)[biginfo_key];
2924 ::encode(past_intervals, bigbl);
2925 ::encode(info.purged_snaps, bigbl);
2926 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2927 if (logger)
2928 logger->inc(l_osd_pg_biginfo);
2929 }
2930
2931 return 0;
2932 }
2933
2934 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
2935 {
2936 coll_t coll(pgid);
2937 t.create_collection(coll, bits);
2938 }
2939
2940 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
2941 {
2942 coll_t coll(pgid);
2943
2944 if (pool) {
2945 // Give a hint to the PG collection
2946 bufferlist hint;
2947 uint32_t pg_num = pool->get_pg_num();
2948 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
2949 ::encode(pg_num, hint);
2950 ::encode(expected_num_objects_pg, hint);
2951 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
2952 t.collection_hint(coll, hint_type, hint);
2953 }
2954
2955 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2956 t.touch(coll, pgmeta_oid);
2957 map<string,bufferlist> values;
2958 __u8 struct_v = cur_struct_v;
2959 ::encode(struct_v, values[infover_key]);
2960 t.omap_setkeys(coll, pgmeta_oid, values);
2961 }
2962
2963 void PG::prepare_write_info(map<string,bufferlist> *km)
2964 {
2965 info.stats.stats.add(unstable_stats);
2966 unstable_stats.clear();
2967
2968 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
2969 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
2970 info,
2971 last_written_info,
2972 past_intervals,
2973 dirty_big_info, need_update_epoch,
2974 cct->_conf->osd_fast_info,
2975 osd->logger);
2976 assert(ret == 0);
2977 if (need_update_epoch)
2978 last_epoch = get_osdmap()->get_epoch();
2979 last_persisted_osdmap_ref = osdmap_ref;
2980
2981 dirty_info = false;
2982 dirty_big_info = false;
2983 }
2984
2985 #pragma GCC diagnostic ignored "-Wpragmas"
2986 #pragma GCC diagnostic push
2987 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2988
2989 bool PG::_has_removal_flag(ObjectStore *store,
2990 spg_t pgid)
2991 {
2992 coll_t coll(pgid);
2993 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2994
2995 // first try new way
2996 set<string> keys;
2997 keys.insert("_remove");
2998 map<string,bufferlist> values;
2999 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3000 values.size() == 1)
3001 return true;
3002
3003 return false;
3004 }
3005
3006 int PG::peek_map_epoch(ObjectStore *store,
3007 spg_t pgid,
3008 epoch_t *pepoch,
3009 bufferlist *bl)
3010 {
3011 coll_t coll(pgid);
3012 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3013 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3014 epoch_t cur_epoch = 0;
3015
3016 assert(bl);
3017 {
3018 // validate collection name
3019 assert(coll.is_pg());
3020 }
3021
3022 // try for v8
3023 set<string> keys;
3024 keys.insert(infover_key);
3025 keys.insert(epoch_key);
3026 map<string,bufferlist> values;
3027 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3028 if (r == 0) {
3029 assert(values.size() == 2);
3030
3031 // sanity check version
3032 bufferlist::iterator bp = values[infover_key].begin();
3033 __u8 struct_v = 0;
3034 ::decode(struct_v, bp);
3035 assert(struct_v >= 8);
3036
3037 // get epoch
3038 bp = values[epoch_key].begin();
3039 ::decode(cur_epoch, bp);
3040 } else {
3041 // probably bug 10617; see OSD::load_pgs()
3042 return -1;
3043 }
3044
3045 *pepoch = cur_epoch;
3046 return 0;
3047 }
3048
3049 #pragma GCC diagnostic pop
3050 #pragma GCC diagnostic warning "-Wpragmas"
3051
3052 void PG::write_if_dirty(ObjectStore::Transaction& t)
3053 {
3054 map<string,bufferlist> km;
3055 if (dirty_big_info || dirty_info)
3056 prepare_write_info(&km);
3057 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3058 if (!km.empty())
3059 t.omap_setkeys(coll, pgmeta_oid, km);
3060 }
3061
3062 void PG::trim_log()
3063 {
3064 assert(is_primary());
3065 calc_trim_to();
3066 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3067 if (pg_trim_to != eversion_t()) {
3068 // inform peers to trim log
3069 assert(!actingbackfill.empty());
3070 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3071 i != actingbackfill.end();
3072 ++i) {
3073 if (*i == pg_whoami) continue;
3074 osd->send_message_osd_cluster(
3075 i->osd,
3076 new MOSDPGTrim(
3077 get_osdmap()->get_epoch(),
3078 spg_t(info.pgid.pgid, i->shard),
3079 pg_trim_to),
3080 get_osdmap()->get_epoch());
3081 }
3082
3083 // trim primary as well
3084 pg_log.trim(pg_trim_to, info);
3085 dirty_info = true;
3086 }
3087 }
3088
3089 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3090 {
3091 // raise last_complete only if we were previously up to date
3092 if (info.last_complete == info.last_update)
3093 info.last_complete = e.version;
3094
3095 // raise last_update.
3096 assert(e.version > info.last_update);
3097 info.last_update = e.version;
3098
3099 // raise user_version, if it increased (it may not have been bumped
3100 // by all logged updates)
3101 if (e.user_version > info.last_user_version)
3102 info.last_user_version = e.user_version;
3103
3104 // log mutation
3105 pg_log.add(e, applied);
3106 dout(10) << "add_log_entry " << e << dendl;
3107 }
3108
3109
3110 void PG::append_log(
3111 const vector<pg_log_entry_t>& logv,
3112 eversion_t trim_to,
3113 eversion_t roll_forward_to,
3114 ObjectStore::Transaction &t,
3115 bool transaction_applied)
3116 {
3117 if (transaction_applied)
3118 update_snap_map(logv, t);
3119
3120 /* The primary has sent an info updating the history, but it may not
3121 * have arrived yet. We want to make sure that we cannot remember this
3122 * write without remembering that it happened in an interval which went
3123 * active in epoch history.last_epoch_started.
3124 */
3125 if (info.last_epoch_started != info.history.last_epoch_started) {
3126 info.history.last_epoch_started = info.last_epoch_started;
3127 }
3128 if (info.last_interval_started != info.history.last_interval_started) {
3129 info.history.last_interval_started = info.last_interval_started;
3130 }
3131 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3132
3133 PGLogEntryHandler handler{this, &t};
3134 if (!transaction_applied) {
3135 /* We must be a backfill peer, so it's ok if we apply
3136 * out-of-turn since we won't be considered when
3137 * determining a min possible last_update.
3138 */
3139 pg_log.roll_forward(&handler);
3140 }
3141
3142 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3143 p != logv.end();
3144 ++p) {
3145 add_log_entry(*p, transaction_applied);
3146
3147 /* We don't want to leave the rollforward artifacts around
3148 * here past last_backfill. It's ok for the same reason as
3149 * above */
3150 if (transaction_applied &&
3151 p->soid > info.last_backfill) {
3152 pg_log.roll_forward(&handler);
3153 }
3154 }
3155 auto last = logv.rbegin();
3156 if (is_primary() && last != logv.rend()) {
3157 projected_log.skip_can_rollback_to_to_head();
3158 projected_log.trim(cct, last->version, nullptr);
3159 }
3160
3161 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3162 pg_log.roll_forward_to(
3163 roll_forward_to,
3164 &handler);
3165 t.register_on_applied(
3166 new C_UpdateLastRollbackInfoTrimmedToApplied(
3167 this,
3168 get_osdmap()->get_epoch(),
3169 roll_forward_to));
3170 }
3171
3172 pg_log.trim(trim_to, info);
3173
3174 // update the local pg, pg log
3175 dirty_info = true;
3176 write_if_dirty(t);
3177 }
3178
3179 bool PG::check_log_for_corruption(ObjectStore *store)
3180 {
3181 /// TODO: this method needs to work with the omap log
3182 return true;
3183 }
3184
3185 //! Get the name we're going to save our corrupt pg log as
3186 std::string PG::get_corrupt_pg_log_name() const
3187 {
3188 const int MAX_BUF = 512;
3189 char buf[MAX_BUF];
3190 struct tm tm_buf;
3191 time_t my_time(time(NULL));
3192 const struct tm *t = localtime_r(&my_time, &tm_buf);
3193 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3194 if (ret == 0) {
3195 dout(0) << "strftime failed" << dendl;
3196 return "corrupt_log_unknown_time";
3197 }
3198 string out(buf);
3199 out += stringify(info.pgid);
3200 return out;
3201 }
3202
3203 int PG::read_info(
3204 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3205 pg_info_t &info, PastIntervals &past_intervals,
3206 __u8 &struct_v)
3207 {
3208 // try for v8 or later
3209 set<string> keys;
3210 keys.insert(infover_key);
3211 keys.insert(info_key);
3212 keys.insert(biginfo_key);
3213 keys.insert(fastinfo_key);
3214 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3215 map<string,bufferlist> values;
3216 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3217 if (r == 0) {
3218 assert(values.size() == 3 ||
3219 values.size() == 4);
3220
3221 bufferlist::iterator p = values[infover_key].begin();
3222 ::decode(struct_v, p);
3223 assert(struct_v >= 8);
3224
3225 p = values[info_key].begin();
3226 ::decode(info, p);
3227
3228 p = values[biginfo_key].begin();
3229 if (struct_v >= 10) {
3230 ::decode(past_intervals, p);
3231 } else {
3232 past_intervals.decode_classic(p);
3233 }
3234 ::decode(info.purged_snaps, p);
3235
3236 p = values[fastinfo_key].begin();
3237 if (!p.end()) {
3238 pg_fast_info_t fast;
3239 ::decode(fast, p);
3240 fast.try_apply_to(&info);
3241 }
3242 return 0;
3243 }
3244
3245 // legacy (ver < 8)
3246 ghobject_t infos_oid(OSD::make_infos_oid());
3247 bufferlist::iterator p = bl.begin();
3248 ::decode(struct_v, p);
3249 assert(struct_v == 7);
3250
3251 // get info out of leveldb
3252 string k = get_info_key(info.pgid);
3253 string bk = get_biginfo_key(info.pgid);
3254 keys.clear();
3255 keys.insert(k);
3256 keys.insert(bk);
3257 values.clear();
3258 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3259 assert(values.size() == 2);
3260
3261 p = values[k].begin();
3262 ::decode(info, p);
3263
3264 p = values[bk].begin();
3265 ::decode(past_intervals, p);
3266 interval_set<snapid_t> snap_collections; // obsolete
3267 ::decode(snap_collections, p);
3268 ::decode(info.purged_snaps, p);
3269 return 0;
3270 }
3271
3272 void PG::read_state(ObjectStore *store, bufferlist &bl)
3273 {
3274 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3275 info_struct_v);
3276 assert(r >= 0);
3277
3278 last_written_info = info;
3279
3280 ostringstream oss;
3281 pg_log.read_log_and_missing(
3282 store,
3283 coll,
3284 info_struct_v < 8 ? coll_t::meta() : coll,
3285 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3286 info,
3287 oss,
3288 cct->_conf->osd_ignore_stale_divergent_priors,
3289 cct->_conf->osd_debug_verify_missing_on_start);
3290 if (oss.tellp())
3291 osd->clog->error() << oss.rdbuf();
3292
3293 // log any weirdness
3294 log_weirdness();
3295 }
3296
3297 void PG::log_weirdness()
3298 {
3299 if (pg_log.get_tail() != info.log_tail)
3300 osd->clog->error() << info.pgid
3301 << " info mismatch, log.tail " << pg_log.get_tail()
3302 << " != info.log_tail " << info.log_tail;
3303 if (pg_log.get_head() != info.last_update)
3304 osd->clog->error() << info.pgid
3305 << " info mismatch, log.head " << pg_log.get_head()
3306 << " != info.last_update " << info.last_update;
3307
3308 if (!pg_log.get_log().empty()) {
3309 // sloppy check
3310 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3311 osd->clog->error() << info.pgid
3312 << " log bound mismatch, info (" << pg_log.get_tail() << ","
3313 << pg_log.get_head() << "]"
3314 << " actual ["
3315 << pg_log.get_log().log.begin()->version << ","
3316 << pg_log.get_log().log.rbegin()->version << "]";
3317 }
3318
3319 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3320 osd->clog->error() << info.pgid
3321 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3322 << " > log size " << pg_log.get_log().log.size();
3323 }
3324 }
3325
3326 void PG::update_snap_map(
3327 const vector<pg_log_entry_t> &log_entries,
3328 ObjectStore::Transaction &t)
3329 {
3330 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3331 i != log_entries.end();
3332 ++i) {
3333 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3334 if (i->soid.snap < CEPH_MAXSNAP) {
3335 if (i->is_delete()) {
3336 int r = snap_mapper.remove_oid(
3337 i->soid,
3338 &_t);
3339 assert(r == 0);
3340 } else if (i->is_update()) {
3341 assert(i->snaps.length() > 0);
3342 vector<snapid_t> snaps;
3343 bufferlist snapbl = i->snaps;
3344 bufferlist::iterator p = snapbl.begin();
3345 try {
3346 ::decode(snaps, p);
3347 } catch (...) {
3348 snaps.clear();
3349 }
3350 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3351
3352 if (i->is_clone() || i->is_promote()) {
3353 snap_mapper.add_oid(
3354 i->soid,
3355 _snaps,
3356 &_t);
3357 } else if (i->is_modify()) {
3358 assert(i->is_modify());
3359 int r = snap_mapper.update_snaps(
3360 i->soid,
3361 _snaps,
3362 0,
3363 &_t);
3364 assert(r == 0);
3365 } else {
3366 assert(i->is_clean());
3367 }
3368 }
3369 }
3370 }
3371 }
3372
3373 /**
3374 * filter trimming|trimmed snaps out of snapcontext
3375 */
3376 void PG::filter_snapc(vector<snapid_t> &snaps)
3377 {
3378 // nothing needs trimming, we can return immediately
3379 if(snap_trimq.empty() && info.purged_snaps.empty())
3380 return;
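      // Illustrative example (not in the original source): with
      // snaps = [8, 5, 2] and snap 5 present in snap_trimq or purged_snaps,
      // the loop below rebuilds the vector as [8, 2].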
3381
3382 bool filtering = false;
3383 vector<snapid_t> newsnaps;
3384 for (vector<snapid_t>::iterator p = snaps.begin();
3385 p != snaps.end();
3386 ++p) {
3387 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3388 if (!filtering) {
3389 // start building a new vector with what we've seen so far
3390 dout(10) << "filter_snapc filtering " << snaps << dendl;
3391 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3392 filtering = true;
3393 }
3394 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3395 } else {
3396 if (filtering)
3397 newsnaps.push_back(*p); // continue building new vector
3398 }
3399 }
3400 if (filtering) {
3401 snaps.swap(newsnaps);
3402 dout(10) << "filter_snapc result " << snaps << dendl;
3403 }
3404 }
3405
3406 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3407 {
3408 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3409 it != m.end();
3410 ++it)
3411 requeue_ops(it->second);
3412 m.clear();
3413 }
3414
3415 void PG::requeue_op(OpRequestRef op)
3416 {
3417 auto p = waiting_for_map.find(op->get_source());
3418 if (p != waiting_for_map.end()) {
3419 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3420 << dendl;
3421 p->second.push_front(op);
3422 } else {
3423 dout(20) << __func__ << " " << op << dendl;
3424 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3425 }
3426 }
3427
3428 void PG::requeue_ops(list<OpRequestRef> &ls)
3429 {
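      // Iterate in reverse and push to the front so the requeued ops keep
      // their original relative order at the head of the target queue.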
3430 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3431 i != ls.rend();
3432 ++i) {
3433 auto p = waiting_for_map.find((*i)->get_source());
3434 if (p != waiting_for_map.end()) {
3435 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3436 << ")" << dendl;
3437 p->second.push_front(*i);
3438 } else {
3439 dout(20) << __func__ << " " << *i << dendl;
3440 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3441 }
3442 }
3443 ls.clear();
3444 }
3445
3446 void PG::requeue_map_waiters()
3447 {
3448 epoch_t epoch = get_osdmap()->get_epoch();
3449 auto p = waiting_for_map.begin();
3450 while (p != waiting_for_map.end()) {
3451 if (epoch < p->second.front()->min_epoch) {
3452 dout(20) << __func__ << " " << p->first << " front op "
3453 << p->second.front() << " must still wait, doing nothing"
3454 << dendl;
3455 ++p;
3456 } else {
3457 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3458 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3459 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3460 }
3461 p = waiting_for_map.erase(p);
3462 }
3463 }
3464 }
3465
3466
3467 // ==========================================================================================
3468 // SCRUB
3469
3470 /*
3471 * when holding the pg lock and sched_scrub_lock, the states are:
3472 * scheduling:
3473 * scrubber.reserved = true
3474 * scrubber.reserved_peers includes whoami
3475 * osd->scrub_pending++
3476 * scheduling, replica declined:
3477 * scrubber.reserved = true
3478 * scrubber.reserved_peers includes -1
3479 * osd->scrub_pending++
3480 * pending:
3481 * scrubber.reserved = true
3482 * scrubber.reserved_peers.size() == acting.size();
3483 * pg on scrub_wq
3484 * osd->scrub_pending++
3485 * scrubbing:
3486 * scrubber.reserved = false;
3487 * scrubber.reserved_peers empty
3488 * osd->scrubber.active++
3489 */
3490
3491 // returns true if a scrub has been newly kicked off
3492 bool PG::sched_scrub()
3493 {
3494 bool nodeep_scrub = false;
3495 assert(is_locked());
3496 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3497 return false;
3498 }
3499
3500 double deep_scrub_interval = 0;
3501 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3502 if (deep_scrub_interval <= 0) {
3503 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3504 }
3505 bool time_for_deep = ceph_clock_now() >=
3506 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3507
3508 bool deep_coin_flip = false;
3509 // Only add a random deep scrub when the scrub is NOT user initiated
3510 if (!scrubber.must_scrub)
3511 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3512 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3513
3514 time_for_deep = (time_for_deep || deep_coin_flip);
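    // Net effect: a deep scrub happens when the deep-scrub interval has
    // elapsed or when the random coin flip above promotes this scrub,
    // unless the nodeep-scrub flag handling below vetoes it.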
3515
3516 //NODEEP_SCRUB so ignore time initiated deep-scrub
3517 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3518 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3519 time_for_deep = false;
3520 nodeep_scrub = true;
3521 }
3522
3523 if (!scrubber.must_scrub) {
3524 assert(!scrubber.must_deep_scrub);
3525
3526 //NOSCRUB so skip regular scrubs
3527 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3528 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3529 if (scrubber.reserved) {
3530 // cancel the scrub if it is still being scheduled,
3531 // so pgs from other pools where scrubs are still legal
3532 // have a chance to go ahead with scrubbing.
3533 clear_scrub_reserved();
3534 scrub_unreserve_replicas();
3535 }
3536 return false;
3537 }
3538 }
3539
3540 if (cct->_conf->osd_scrub_auto_repair
3541 && get_pgbackend()->auto_repair_supported()
3542 && time_for_deep
3543 // respect the command from user, and not do auto-repair
3544 && !scrubber.must_repair
3545 && !scrubber.must_scrub
3546 && !scrubber.must_deep_scrub) {
3547 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3548 scrubber.auto_repair = true;
3549 } else {
3550 // this happens when the user issues the scrub/repair command while
3551 // the scrub/repair is being scheduled (e.g. requesting the reservation)
3552 scrubber.auto_repair = false;
3553 }
3554
3555 bool ret = true;
3556 if (!scrubber.reserved) {
3557 assert(scrubber.reserved_peers.empty());
3558 if (osd->inc_scrubs_pending()) {
3559 dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
3560 scrubber.reserved = true;
3561 scrubber.reserved_peers.insert(pg_whoami);
3562 scrub_reserve_replicas();
3563 } else {
3564 dout(20) << "sched_scrub: failed to reserve locally" << dendl;
3565 ret = false;
3566 }
3567 }
3568 if (scrubber.reserved) {
3569 if (scrubber.reserve_failed) {
3570 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3571 clear_scrub_reserved();
3572 scrub_unreserve_replicas();
3573 ret = false;
3574 } else if (scrubber.reserved_peers.size() == acting.size()) {
3575 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3576 if (time_for_deep) {
3577 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3578 state_set(PG_STATE_DEEP_SCRUB);
3579 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3580 if (!nodeep_scrub) {
3581 osd->clog->info() << "osd." << osd->whoami
3582 << " pg " << info.pgid
3583 << " Deep scrub errors, upgrading scrub to deep-scrub";
3584 state_set(PG_STATE_DEEP_SCRUB);
3585 } else if (!scrubber.must_scrub) {
3586 osd->clog->error() << "osd." << osd->whoami
3587 << " pg " << info.pgid
3588 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3589 clear_scrub_reserved();
3590 scrub_unreserve_replicas();
3591 return false;
3592 } else {
3593 osd->clog->error() << "osd." << osd->whoami
3594 << " pg " << info.pgid
3595 << " Regular scrub request, losing deep-scrub details";
3596 }
3597 }
3598 queue_scrub();
3599 } else {
3600 // no peer has declined so far (otherwise reserve_failed would be set)
3601 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3602 }
3603 }
3604
3605 return ret;
3606 }
3607
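// reg_next_scrub/unreg_next_scrub keep this PG registered in the OSD's scrub
// scheduling queue.  The registration is keyed by a stamp: user-requested
// scrubs (and PGs with invalid stats, when osd_scrub_invalid_stats is set)
// use the current time so they are picked up as soon as possible; otherwise
// last_scrub_stamp is used together with the pool's min/max scrub intervals.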
3608 void PG::reg_next_scrub()
3609 {
3610 if (!is_primary())
3611 return;
3612
3613 utime_t reg_stamp;
3614 if (scrubber.must_scrub ||
3615 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3616 reg_stamp = ceph_clock_now();
3617 } else {
3618 reg_stamp = info.history.last_scrub_stamp;
3619 }
3620 // note down the sched_time, so we can locate this scrub, and remove it
3621 // later on.
3622 double scrub_min_interval = 0, scrub_max_interval = 0;
3623 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3624 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3625 assert(scrubber.scrub_reg_stamp == utime_t());
3626 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3627 reg_stamp,
3628 scrub_min_interval,
3629 scrub_max_interval,
3630 scrubber.must_scrub);
3631 }
3632
3633 void PG::unreg_next_scrub()
3634 {
3635 if (is_primary()) {
3636 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3637 scrubber.scrub_reg_stamp = utime_t();
3638 }
3639 }
3640
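// Primary-side handler for a replica's scrub map reply (MOSDRepScrubMap,
// luminous and later).  Stale replies from an older interval are dropped;
// otherwise the map is decoded into scrubber.received_maps and, once the
// last outstanding reply arrives (waiting_on reaches 0), the scrub is
// requeued so chunky_scrub() can move on to comparing maps.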
3641 void PG::do_replica_scrub_map(OpRequestRef op)
3642 {
3643 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3644 dout(7) << __func__ << " " << *m << dendl;
3645 if (m->map_epoch < info.history.same_interval_since) {
3646 dout(10) << __func__ << " discarding old from "
3647 << m->map_epoch << " < " << info.history.same_interval_since
3648 << dendl;
3649 return;
3650 }
3651 if (!scrubber.is_chunky_scrub_active()) {
3652 dout(10) << __func__ << " scrub isn't active" << dendl;
3653 return;
3654 }
3655
3656 op->mark_started();
3657
3658 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3659 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3660 dout(10) << "map version is "
3661 << scrubber.received_maps[m->from].valid_through
3662 << dendl;
3663
3664 --scrubber.waiting_on;
3665 scrubber.waiting_on_whom.erase(m->from);
3666 if (scrubber.waiting_on == 0) {
3667 if (ops_blocked_by_scrub()) {
3668 requeue_scrub(true);
3669 } else {
3670 requeue_scrub(false);
3671 }
3672 }
3673 }
3674
3675 void PG::sub_op_scrub_map(OpRequestRef op)
3676 {
3677 // for legacy jewel compatibility only
3678 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3679 assert(m->get_type() == MSG_OSD_SUBOP);
3680 dout(7) << "sub_op_scrub_map" << dendl;
3681
3682 if (m->map_epoch < info.history.same_interval_since) {
3683 dout(10) << "sub_op_scrub discarding old sub_op from "
3684 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3685 return;
3686 }
3687
3688 if (!scrubber.is_chunky_scrub_active()) {
3689 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3690 return;
3691 }
3692
3693 op->mark_started();
3694
3695 dout(10) << " got " << m->from << " scrub map" << dendl;
3696 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3697
3698 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3699 dout(10) << "map version is "
3700 << scrubber.received_maps[m->from].valid_through
3701 << dendl;
3702
3703 --scrubber.waiting_on;
3704 scrubber.waiting_on_whom.erase(m->from);
3705
3706 if (scrubber.waiting_on == 0) {
3707 if (ops_blocked_by_scrub()) {
3708 requeue_scrub(true);
3709 } else {
3710 requeue_scrub(false);
3711 }
3712 }
3713 }
3714
3715 // send scrub v3 messages (chunky scrub)
3716 void PG::_request_scrub_map(
3717 pg_shard_t replica, eversion_t version,
3718 hobject_t start, hobject_t end,
3719 bool deep, uint32_t seed)
3720 {
3721 assert(replica != pg_whoami);
3722 dout(10) << "scrub requesting scrubmap from osd." << replica
3723 << " deep " << (int)deep << " seed " << seed << dendl;
3724 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3725 spg_t(info.pgid.pgid, replica.shard), version,
3726 get_osdmap()->get_epoch(),
3727 get_last_peering_reset(),
3728 start, end, deep, seed);
3729 // default priority, we want the rep scrub processed prior to any recovery
3730 // or client io messages (we are holding a lock!)
3731 osd->send_message_osd_cluster(
3732 replica.osd, repscrubop, get_osdmap()->get_epoch());
3733 }
3734
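// Replica-side handlers for the scrub reservation protocol.  A REQUEST tries
// to take a local scrub slot and answers with GRANT or REJECT; GRANT/REJECT
// replies feed back into the primary's sched_scrub() via
// handle_scrub_reserve_grant/reject; RELEASE drops the local reservation.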
3735 void PG::handle_scrub_reserve_request(OpRequestRef op)
3736 {
3737 dout(7) << __func__ << " " << *op->get_req() << dendl;
3738 op->mark_started();
3739 if (scrubber.reserved) {
3740 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3741 << dendl;
3742 return;
3743 }
3744 scrubber.reserved = osd->inc_scrubs_pending();
3745 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3746 const MOSDScrubReserve *m =
3747 static_cast<const MOSDScrubReserve*>(op->get_req());
3748 Message *reply = new MOSDScrubReserve(
3749 spg_t(info.pgid.pgid, primary.shard),
3750 m->map_epoch,
3751 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3752 pg_whoami);
3753 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3754 } else {
3755 // for jewel compat only
3756 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3757 assert(req->get_type() == MSG_OSD_SUBOP);
3758 MOSDSubOpReply *reply = new MOSDSubOpReply(
3759 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3760 ::encode(scrubber.reserved, reply->get_data());
3761 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3762 }
3763 }
3764
3765 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3766 {
3767 dout(7) << __func__ << " " << *op->get_req() << dendl;
3768 op->mark_started();
3769 if (!scrubber.reserved) {
3770 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3771 return;
3772 }
3773 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3774 dout(10) << " already had osd." << from << " reserved" << dendl;
3775 } else {
3776 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3777 scrubber.reserved_peers.insert(from);
3778 sched_scrub();
3779 }
3780 }
3781
3782 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3783 {
3784 dout(7) << __func__ << " " << *op->get_req() << dendl;
3785 op->mark_started();
3786 if (!scrubber.reserved) {
3787 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3788 return;
3789 }
3790 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3791 dout(10) << " already had osd." << from << " reserved" << dendl;
3792 } else {
3793 /* One decline stops this pg from being scheduled for scrubbing. */
3794 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3795 scrubber.reserve_failed = true;
3796 sched_scrub();
3797 }
3798 }
3799
3800 void PG::handle_scrub_reserve_release(OpRequestRef op)
3801 {
3802 dout(7) << __func__ << " " << *op->get_req() << dendl;
3803 op->mark_started();
3804 clear_scrub_reserved();
3805 }
3806
3807 void PG::reject_reservation()
3808 {
3809 osd->send_message_osd_cluster(
3810 primary.osd,
3811 new MBackfillReserve(
3812 MBackfillReserve::REJECT,
3813 spg_t(info.pgid.pgid, primary.shard),
3814 get_osdmap()->get_epoch()),
3815 get_osdmap()->get_epoch());
3816 }
3817
3818 void PG::schedule_backfill_full_retry()
3819 {
3820 Mutex::Locker lock(osd->recovery_request_lock);
3821 osd->recovery_request_timer.add_event_after(
3822 cct->_conf->osd_backfill_retry_interval,
3823 new QueuePeeringEvt<RequestBackfill>(
3824 this, get_osdmap()->get_epoch(),
3825 RequestBackfill()));
3826 }
3827
3828 void PG::schedule_recovery_full_retry()
3829 {
3830 Mutex::Locker lock(osd->recovery_request_lock);
3831 osd->recovery_request_timer.add_event_after(
3832 cct->_conf->osd_recovery_retry_interval,
3833 new QueuePeeringEvt<DoRecovery>(
3834 this, get_osdmap()->get_epoch(),
3835 DoRecovery()));
3836 }
3837
3838 void PG::clear_scrub_reserved()
3839 {
3840 scrubber.reserved_peers.clear();
3841 scrubber.reserve_failed = false;
3842
3843 if (scrubber.reserved) {
3844 scrubber.reserved = false;
3845 osd->dec_scrubs_pending();
3846 }
3847 }
3848
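// Ask every other shard in the acting set to reserve a scrub slot.  Peers
// that speak luminous get an MOSDScrubReserve REQUEST; older (jewel) peers
// get the legacy CEPH_OSD_OP_SCRUB_RESERVE sub-op instead.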
3849 void PG::scrub_reserve_replicas()
3850 {
3851 assert(backfill_targets.empty());
3852 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3853 i != actingbackfill.end();
3854 ++i) {
3855 if (*i == pg_whoami) continue;
3856 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3857 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3858 osd->send_message_osd_cluster(
3859 i->osd,
3860 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3861 get_osdmap()->get_epoch(),
3862 MOSDScrubReserve::REQUEST, pg_whoami),
3863 get_osdmap()->get_epoch());
3864 } else {
3865 // for jewel compat only
3866 vector<OSDOp> scrub(1);
3867 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
3868 hobject_t poid;
3869 eversion_t v;
3870 osd_reqid_t reqid;
3871 MOSDSubOp *subop = new MOSDSubOp(
3872 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3873 get_osdmap()->get_epoch(), osd->get_tid(), v);
3874 subop->ops = scrub;
3875 osd->send_message_osd_cluster(
3876 i->osd, subop, get_osdmap()->get_epoch());
3877 }
3878 }
3879 }
3880
3881 void PG::scrub_unreserve_replicas()
3882 {
3883 assert(backfill_targets.empty());
3884 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3885 i != actingbackfill.end();
3886 ++i) {
3887 if (*i == pg_whoami) continue;
3888 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
3889 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3890 osd->send_message_osd_cluster(
3891 i->osd,
3892 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3893 get_osdmap()->get_epoch(),
3894 MOSDScrubReserve::RELEASE, pg_whoami),
3895 get_osdmap()->get_epoch());
3896 } else {
3897 // for jewel compat only
3898 vector<OSDOp> scrub(1);
3899 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
3900 hobject_t poid;
3901 eversion_t v;
3902 osd_reqid_t reqid;
3903 MOSDSubOp *subop = new MOSDSubOp(
3904 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3905 get_osdmap()->get_epoch(), osd->get_tid(), v);
3906 subop->ops = scrub;
3907 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
3908 }
3909 }
3910 }
3911
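// Remove rollback objects whose generation is older than the point the
// rollback info has already been trimmed to; such objects are obsolete
// leftovers, so they are deleted in a background transaction and the repair
// is reported to the cluster log.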
3912 void PG::_scan_rollback_obs(
3913 const vector<ghobject_t> &rollback_obs,
3914 ThreadPool::TPHandle &handle)
3915 {
3916 ObjectStore::Transaction t;
3917 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
3918 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
3919 i != rollback_obs.end();
3920 ++i) {
3921 if (i->generation < trimmed_to.version) {
3922 osd->clog->error() << "osd." << osd->whoami
3923 << " pg " << info.pgid
3924 << " found obsolete rollback obj "
3925 << *i << " generation < trimmed_to "
3926 << trimmed_to
3927 << "...repaired";
3928 t.remove(coll, *i);
3929 }
3930 }
3931 if (!t.empty()) {
3932 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
3933 << dendl;
3934 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3935 }
3936 }
3937
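// Cross-check the SnapMapper against the snapshot metadata gathered during
// the scrub.  The scrub map is walked in reverse so that each head/snapdir
// (which carries the SnapSet) is seen before its clones; for every clone the
// snaps recorded in the mapper are compared with the SnapSet (or the legacy
// per-object snaps) and repaired in place if they disagree.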
3938 void PG::_scan_snaps(ScrubMap &smap)
3939 {
3940 hobject_t head;
3941 SnapSet snapset;
3942 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
3943 i != smap.objects.rend();
3944 ++i) {
3945 const hobject_t &hoid = i->first;
3946 ScrubMap::object &o = i->second;
3947
3948 if (hoid.is_head() || hoid.is_snapdir()) {
3949 // parse the SnapSet
3950 bufferlist bl;
3951 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
3952 continue;
3953 }
3954 bl.push_back(o.attrs[SS_ATTR]);
3955 auto p = bl.begin();
3956 try {
3957 ::decode(snapset, p);
3958 } catch(...) {
3959 continue;
3960 }
3961 head = hoid.get_head();
3962 continue;
3963 }
3964 if (hoid.snap < CEPH_MAXSNAP) {
3965 // check and if necessary fix snap_mapper
3966 if (hoid.get_head() != head) {
3967 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
3968 << dendl;
3969 continue;
3970 }
3971 set<snapid_t> obj_snaps;
3972 if (!snapset.is_legacy()) {
3973 auto p = snapset.clone_snaps.find(hoid.snap);
3974 if (p == snapset.clone_snaps.end()) {
3975 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
3976 << dendl;
3977 continue;
3978 }
3979 obj_snaps.insert(p->second.begin(), p->second.end());
3980 } else {
3981 bufferlist bl;
3982 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
3983 continue;
3984 }
3985 bl.push_back(o.attrs[OI_ATTR]);
3986 object_info_t oi;
3987 try {
3988 oi.decode(bl);
3989 } catch(...) {
3990 continue;
3991 }
3992 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
3993 }
3994 set<snapid_t> cur_snaps;
3995 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
3996 if (r != 0 && r != -ENOENT) {
3997 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
3998 ceph_abort();
3999 }
4000 if (r == -ENOENT || cur_snaps != obj_snaps) {
4001 ObjectStore::Transaction t;
4002 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4003 if (r == 0) {
4004 r = snap_mapper.remove_oid(hoid, &_t);
4005 if (r != 0) {
4006 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4007 << dendl;
4008 ceph_abort();
4009 }
4010 osd->clog->error() << "osd." << osd->whoami
4011 << " found snap mapper error on pg "
4012 << info.pgid
4013 << " oid " << hoid << " snaps in mapper: "
4014 << cur_snaps << ", oi: "
4015 << obj_snaps
4016 << "...repaired";
4017 } else {
4018 osd->clog->error() << "osd." << osd->whoami
4019 << " found snap mapper error on pg "
4020 << info.pgid
4021 << " oid " << hoid << " snaps missing in mapper"
4022 << ", should be: "
4023 << obj_snaps
4024 << "...repaired";
4025 }
4026 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4027 r = osd->store->apply_transaction(osr.get(), std::move(t));
4028 if (r != 0) {
4029 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4030 << dendl;
4031 }
4032 }
4033 }
4034 }
4035 }
4036
4037 /*
4038 * build a scrub map over a chunk without releasing the lock
4039 * only used by chunky scrub
4040 */
4041 int PG::build_scrub_map_chunk(
4042 ScrubMap &map,
4043 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4044 ThreadPool::TPHandle &handle)
4045 {
4046 dout(10) << __func__ << " [" << start << "," << end << ") "
4047 << " seed " << seed << dendl;
4048
4049 map.valid_through = info.last_update;
4050
4051 // objects
4052 vector<hobject_t> ls;
4053 vector<ghobject_t> rollback_obs;
4054 int ret = get_pgbackend()->objects_list_range(
4055 start,
4056 end,
4057 0,
4058 &ls,
4059 &rollback_obs);
4060 if (ret < 0) {
4061 dout(5) << "objects_list_range error: " << ret << dendl;
4062 return ret;
4063 }
4064
4065
4066 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4067 _scan_rollback_obs(rollback_obs, handle);
4068 _scan_snaps(map);
4069
4070 dout(20) << __func__ << " done" << dendl;
4071 return 0;
4072 }
4073
4074 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4075 if (!store)
4076 return;
4077 struct OnComplete : Context {
4078 std::unique_ptr<Scrub::Store> store;
4079 OnComplete(
4080 std::unique_ptr<Scrub::Store> &&store)
4081 : store(std::move(store)) {}
4082 void finish(int) override {}
4083 };
4084 store->cleanup(t);
4085 t->register_on_complete(new OnComplete(std::move(store)));
4086 assert(!store);
4087 }
4088
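// Mark a single object for repair: the bad peer (or the primary's own
// missing set, if the primary holds the bad copy) is marked missing at the
// authoritative version so recovery will rewrite it.  For EC pools, or when
// the primary is the bad peer, the good peers are also recorded in
// missing_loc as possible pull sources.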
4089 void PG::repair_object(
4090 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4091 pg_shard_t bad_peer)
4092 {
4093 list<pg_shard_t> op_shards;
4094 for (auto i : *ok_peers) {
4095 op_shards.push_back(i.second);
4096 }
4097 dout(10) << "repair_object " << soid << " bad_peer osd."
4098 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4099 ScrubMap::object &po = ok_peers->back().first;
4100 eversion_t v;
4101 bufferlist bv;
4102 bv.push_back(po.attrs[OI_ATTR]);
4103 object_info_t oi(bv);
4104 if (bad_peer != primary) {
4105 peer_missing[bad_peer].add(soid, oi.version, eversion_t());
4106 } else {
4107 // We should only be scrubbing if the PG is clean.
4108 assert(waiting_for_unreadable_object.empty());
4109
4110 pg_log.missing_add(soid, oi.version, eversion_t());
4111
4112 pg_log.set_last_requested(0);
4113 dout(10) << __func__ << ": primary = " << primary << dendl;
4114 }
4115
4116 if (is_ec_pg() || bad_peer == primary) {
4117 // we'd better collect all shards for an EC pg, and prepare good peers as
4118 // the source of pulls in the case of a replicated pg.
4119 missing_loc.add_missing(soid, oi.version, eversion_t());
4120 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4121 for (i = ok_peers->begin();
4122 i != ok_peers->end();
4123 ++i)
4124 missing_loc.add_location(soid, i->second);
4125 }
4126 }
4127
4128 /* replica_scrub
4129 *
4130 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4131 * for pushes to complete in case of recent recovery. Build a single
4132 * scrubmap of objects that are in the range [msg->start, msg->end).
4133 */
4134 void PG::replica_scrub(
4135 OpRequestRef op,
4136 ThreadPool::TPHandle &handle)
4137 {
4138 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4139 assert(!scrubber.active_rep_scrub);
4140 dout(7) << "replica_scrub" << dendl;
4141
4142 if (msg->map_epoch < info.history.same_interval_since) {
4143 dout(10) << "replica_scrub discarding old replica_scrub from "
4144 << msg->map_epoch << " < " << info.history.same_interval_since
4145 << dendl;
4146 return;
4147 }
4148
4149 ScrubMap map;
4150
4151 assert(msg->chunky);
4152 if (last_update_applied < msg->scrub_to) {
4153 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4154 scrubber.active_rep_scrub = op;
4155 return;
4156 }
4157
4158 if (active_pushes > 0) {
4159 dout(10) << "waiting for active pushes to finish" << dendl;
4160 scrubber.active_rep_scrub = op;
4161 return;
4162 }
4163
4164 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4165 hobject_t start = msg->start;
4166 hobject_t end = msg->end;
4167 if (!start.is_max())
4168 start.pool = info.pgid.pool();
4169 if (!end.is_max())
4170 end.pool = info.pgid.pool();
4171
4172 build_scrub_map_chunk(
4173 map, start, end, msg->deep, msg->seed,
4174 handle);
4175
4176 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4177 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4178 spg_t(info.pgid.pgid, get_primary().shard),
4179 msg->map_epoch,
4180 pg_whoami);
4181 ::encode(map, reply->get_data());
4182 osd->send_message_osd_cluster(reply, msg->get_connection());
4183 } else {
4184 // for jewel compatibility
4185 vector<OSDOp> scrub(1);
4186 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4187 hobject_t poid;
4188 eversion_t v;
4189 osd_reqid_t reqid;
4190 MOSDSubOp *subop = new MOSDSubOp(
4191 reqid,
4192 pg_whoami,
4193 spg_t(info.pgid.pgid, get_primary().shard),
4194 poid,
4195 0,
4196 msg->map_epoch,
4197 osd->get_tid(),
4198 v);
4199 ::encode(map, subop->get_data());
4200 subop->ops = scrub;
4201 osd->send_message_osd_cluster(subop, msg->get_connection());
4202 }
4203 }
4204
4205 /* Scrub:
4206 * PG_STATE_SCRUBBING is set when the scrub is queued
4207 *
4208 * scrub will be chunky if all OSDs in PG support chunky scrub
4209 * scrub will fail if OSDs are too old.
4210 */
4211 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4212 {
4213 if (cct->_conf->osd_scrub_sleep > 0 &&
4214 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4215 scrubber.state == PG::Scrubber::INACTIVE) &&
4216 scrubber.needs_sleep) {
4217 ceph_assert(!scrubber.sleeping);
4218 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4219
4220 // Do an async sleep so we don't block the op queue
4221 OSDService *osds = osd;
4222 spg_t pgid = get_pgid();
4223 int state = scrubber.state;
4224 auto scrub_requeue_callback =
4225 new FunctionContext([osds, pgid, state](int r) {
4226 PG *pg = osds->osd->lookup_lock_pg(pgid);
4227 if (pg == nullptr) {
4228 lgeneric_dout(osds->osd->cct, 20)
4229 << "scrub_requeue_callback: Could not find "
4230 << "PG " << pgid << " can't complete scrub requeue after sleep"
4231 << dendl;
4232 return;
4233 }
4234 pg->scrubber.sleeping = false;
4235 pg->scrubber.needs_sleep = false;
4236 lgeneric_dout(pg->cct, 20)
4237 << "scrub_requeue_callback: slept for "
4238 << ceph_clock_now() - pg->scrubber.sleep_start
4239 << ", re-queuing scrub with state " << state << dendl;
4240 pg->scrub_queued = false;
4241 pg->requeue_scrub();
4242 pg->scrubber.sleep_start = utime_t();
4243 pg->unlock();
4244 });
4245 Mutex::Locker l(osd->scrub_sleep_lock);
4246 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4247 scrub_requeue_callback);
4248 scrubber.sleeping = true;
4249 scrubber.sleep_start = ceph_clock_now();
4250 return;
4251 }
4252 if (pg_has_reset_since(queued)) {
4253 return;
4254 }
4255 assert(scrub_queued);
4256 scrub_queued = false;
4257 scrubber.needs_sleep = true;
4258
4259 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4260 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4261 state_clear(PG_STATE_SCRUBBING);
4262 state_clear(PG_STATE_REPAIR);
4263 state_clear(PG_STATE_DEEP_SCRUB);
4264 publish_stats_to_osd();
4265 return;
4266 }
4267
4268 if (!scrubber.active) {
4269 assert(backfill_targets.empty());
4270
4271 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4272
4273 dout(10) << "starting a new chunky scrub" << dendl;
4274 }
4275
4276 chunky_scrub(handle);
4277 }
4278
4279 /*
4280 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4281 * chunk.
4282 *
4283 * The object store is partitioned into chunks which end on hash boundaries. For
4284 * each chunk, the following logic is performed:
4285 *
4286 * (1) Block writes on the chunk
4287 * (2) Request maps from replicas
4288 * (3) Wait for pushes to be applied (after recovery)
4289 * (4) Wait for writes to flush on the chunk
4290 * (5) Wait for maps from replicas
4291 * (6) Compare / repair all scrub maps
4292 * (7) Wait for digest updates to apply
4293 *
4294 * This logic is encoded in the mostly linear state machine:
4295 *
4296 * +------------------+
4297 * _________v__________ |
4298 * | | |
4299 * | INACTIVE | |
4300 * |____________________| |
4301 * | |
4302 * | +----------+ |
4303 * _________v___v______ | |
4304 * | | | |
4305 * | NEW_CHUNK | | |
4306 * |____________________| | |
4307 * | | |
4308 * _________v__________ | |
4309 * | | | |
4310 * | WAIT_PUSHES | | |
4311 * |____________________| | |
4312 * | | |
4313 * _________v__________ | |
4314 * | | | |
4315 * | WAIT_LAST_UPDATE | | |
4316 * |____________________| | |
4317 * | | |
4318 * _________v__________ | |
4319 * | | | |
4320 * | BUILD_MAP | | |
4321 * |____________________| | |
4322 * | | |
4323 * _________v__________ | |
4324 * | | | |
4325 * | WAIT_REPLICAS | | |
4326 * |____________________| | |
4327 * | | |
4328 * _________v__________ | |
4329 * | | | |
4330 * | COMPARE_MAPS | | |
4331 * |____________________| | |
4332 * | | |
4333 * | | |
4334 * _________v__________ | |
4335 * | | | |
4336 * |WAIT_DIGEST_UPDATES | | |
4337 * |____________________| | |
4338 * | | | |
4339 * | +----------+ |
4340 * _________v__________ |
4341 * | | |
4342 * | FINISH | |
4343 * |____________________| |
4344 * | |
4345 * +------------------+
4346 *
4347 * The primary determines the last update from the subset by walking the log. If
4348 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4349 * to wait until that update is applied before building a scrub map. Both the
4350 * primary and replicas will wait for any active pushes to be applied.
4351 *
4352 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4353 *
4354 * scrubber.state encodes the current state of the scrub (refer to state diagram
4355 * for details).
4356 */
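// Note that chunky_scrub() is re-entered every time the scrub is requeued
// (replica maps arriving, pushes completing, writes flushing, digest updates
// finishing): each invocation resumes from scrubber.state and loops until a
// state has to wait on some external event, at which point `done` is set and
// the function returns.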
4357 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4358 {
4359 // check for map changes
4360 if (scrubber.is_chunky_scrub_active()) {
4361 if (scrubber.epoch_start != info.history.same_interval_since) {
4362 dout(10) << "scrub pg changed, aborting" << dendl;
4363 scrub_clear_state();
4364 scrub_unreserve_replicas();
4365 return;
4366 }
4367 }
4368
4369 bool done = false;
4370 int ret;
4371
4372 while (!done) {
4373 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4374 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4375
4376 switch (scrubber.state) {
4377 case PG::Scrubber::INACTIVE:
4378 dout(10) << "scrub start" << dendl;
4379
4380 publish_stats_to_osd();
4381 scrubber.epoch_start = info.history.same_interval_since;
4382 scrubber.active = true;
4383
4384 osd->inc_scrubs_active(scrubber.reserved);
4385 if (scrubber.reserved) {
4386 scrubber.reserved = false;
4387 scrubber.reserved_peers.clear();
4388 }
4389
4390 {
4391 ObjectStore::Transaction t;
4392 scrubber.cleanup_store(&t);
4393 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4394 info.pgid, coll));
4395 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4396 }
4397
4398 // Don't include temporary objects when scrubbing
4399 scrubber.start = info.pgid.pgid.get_hobj_start();
4400 scrubber.state = PG::Scrubber::NEW_CHUNK;
4401
4402 {
4403 bool repair = state_test(PG_STATE_REPAIR);
4404 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4405 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4406 stringstream oss;
4407 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4408 osd->clog->info(oss);
4409 }
4410
4411 scrubber.seed = -1;
4412
4413 break;
4414
4415 case PG::Scrubber::NEW_CHUNK:
4416 scrubber.primary_scrubmap = ScrubMap();
4417 scrubber.received_maps.clear();
4418
4419 {
4420 /* get the start and end of our scrub chunk
4421 *
4422 * Our scrub chunk has an important restriction we're going to need to
4423 * respect. We can't let head or snapdir be start or end.
4424 * Using a half-open interval means that if end == head|snapdir,
4425 * we'd scrub/lock head and the clone right next to head in different
4426 * chunks which would allow us to miss clones created between
4427 * scrubbing that chunk and scrubbing the chunk including head.
4428 * This isn't true for any of the other clones since clones can
4429 * only be created "just to the left of" head. There is one exception
4430 * to this: promotion of clones which always happens to the left of the
4431 * left-most clone, but promote_object checks the scrubber in that
4432 * case, so it should be ok. Also, it's ok to "miss" clones at the
4433 * left end of the range if we are a tier because they may legitimately
4434 * not exist (see _scrub).
4435 */
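// For example, if the listing would otherwise end right on foo:head, the
// boundary is pulled back onto foo's newest listed clone (or, if none of
// foo's clones were listed, to the object boundary at the start of foo's
// group), so the head and the clone adjacent to it land in the same chunk.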
4436 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4437 hobject_t start = scrubber.start;
4438 hobject_t candidate_end;
4439 vector<hobject_t> objects;
4440 ret = get_pgbackend()->objects_list_partial(
4441 start,
4442 min,
4443 MAX(min, cct->_conf->osd_scrub_chunk_max),
4444 &objects,
4445 &candidate_end);
4446 assert(ret >= 0);
4447
4448 if (!objects.empty()) {
4449 hobject_t back = objects.back();
4450 while (candidate_end.has_snapset() &&
4451 candidate_end.get_head() == back.get_head()) {
4452 candidate_end = back;
4453 objects.pop_back();
4454 if (objects.empty()) {
4455 assert(0 ==
4456 "Somehow we got more than 2 objects which "
4457 "have the same head but are not clones");
4458 }
4459 back = objects.back();
4460 }
4461 if (candidate_end.has_snapset()) {
4462 assert(candidate_end.get_head() != back.get_head());
4463 candidate_end = candidate_end.get_object_boundary();
4464 }
4465 } else {
4466 assert(candidate_end.is_max());
4467 }
4468
4469 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4470 // we'll be requeued by whatever made us unavailable for scrub
4471 dout(10) << __func__ << ": scrub blocked somewhere in range "
4472 << "[" << scrubber.start << ", " << candidate_end << ")"
4473 << dendl;
4474 done = true;
4475 break;
4476 }
4477 scrubber.end = candidate_end;
4478 }
4479
4480 // walk the log to find the latest update that affects our chunk
4481 scrubber.subset_last_update = eversion_t();
4482 for (auto p = projected_log.log.rbegin();
4483 p != projected_log.log.rend();
4484 ++p) {
4485 if (p->soid >= scrubber.start &&
4486 p->soid < scrubber.end) {
4487 scrubber.subset_last_update = p->version;
4488 break;
4489 }
4490 }
4491 if (scrubber.subset_last_update == eversion_t()) {
4492 for (list<pg_log_entry_t>::const_reverse_iterator p =
4493 pg_log.get_log().log.rbegin();
4494 p != pg_log.get_log().log.rend();
4495 ++p) {
4496 if (p->soid >= scrubber.start &&
4497 p->soid < scrubber.end) {
4498 scrubber.subset_last_update = p->version;
4499 break;
4500 }
4501 }
4502 }
4503
4504 // ask replicas to wait until
4505 // last_update_applied >= scrubber.subset_last_update and then scan
4506 scrubber.waiting_on_whom.insert(pg_whoami);
4507 ++scrubber.waiting_on;
4508
4509 // request maps from replicas
4510 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4511 i != actingbackfill.end();
4512 ++i) {
4513 if (*i == pg_whoami) continue;
4514 _request_scrub_map(*i, scrubber.subset_last_update,
4515 scrubber.start, scrubber.end, scrubber.deep,
4516 scrubber.seed);
4517 scrubber.waiting_on_whom.insert(*i);
4518 ++scrubber.waiting_on;
4519 }
4520
4521 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4522
4523 break;
4524
4525 case PG::Scrubber::WAIT_PUSHES:
4526 if (active_pushes == 0) {
4527 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4528 } else {
4529 dout(15) << "wait for pushes to apply" << dendl;
4530 done = true;
4531 }
4532 break;
4533
4534 case PG::Scrubber::WAIT_LAST_UPDATE:
4535 if (last_update_applied >= scrubber.subset_last_update) {
4536 scrubber.state = PG::Scrubber::BUILD_MAP;
4537 } else {
4538 // will be requeued by op_applied
4539 dout(15) << "wait for writes to flush" << dendl;
4540 done = true;
4541 }
4542 break;
4543
4544 case PG::Scrubber::BUILD_MAP:
4545 assert(last_update_applied >= scrubber.subset_last_update);
4546
4547 // build my own scrub map
4548 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4549 scrubber.start, scrubber.end,
4550 scrubber.deep, scrubber.seed,
4551 handle);
4552 if (ret < 0) {
4553 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4554 scrub_clear_state();
4555 scrub_unreserve_replicas();
4556 return;
4557 }
4558
4559 --scrubber.waiting_on;
4560 scrubber.waiting_on_whom.erase(pg_whoami);
4561
4562 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4563 break;
4564
4565 case PG::Scrubber::WAIT_REPLICAS:
4566 if (scrubber.waiting_on > 0) {
4567 // will be requeued by sub_op_scrub_map
4568 dout(10) << "wait for replicas to build scrub map" << dendl;
4569 done = true;
4570 } else {
4571 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4572 }
4573 break;
4574
4575 case PG::Scrubber::COMPARE_MAPS:
4576 assert(last_update_applied >= scrubber.subset_last_update);
4577 assert(scrubber.waiting_on == 0);
4578
4579 scrub_compare_maps();
4580 scrubber.start = scrubber.end;
4581 scrubber.run_callbacks();
4582
4583 // requeue the writes from the chunk that just finished
4584 requeue_ops(waiting_for_scrub);
4585
4586 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4587
4588 // fall-thru
4589
4590 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4591 if (scrubber.num_digest_updates_pending) {
4592 dout(10) << __func__ << " waiting on "
4593 << scrubber.num_digest_updates_pending
4594 << " digest updates" << dendl;
4595 done = true;
4596 break;
4597 }
4598
4599 if (!(scrubber.end.is_max())) {
4600 scrubber.state = PG::Scrubber::NEW_CHUNK;
4601 requeue_scrub();
4602 done = true;
4603 } else {
4604 scrubber.state = PG::Scrubber::FINISH;
4605 }
4606
4607 break;
4608
4609 case PG::Scrubber::FINISH:
4610 scrub_finish();
4611 scrubber.state = PG::Scrubber::INACTIVE;
4612 done = true;
4613
4614 break;
4615
4616 default:
4617 ceph_abort();
4618 }
4619 }
4620 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4621 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4622 }
4623
4624 void PG::scrub_clear_state()
4625 {
4626 assert(is_locked());
4627 state_clear(PG_STATE_SCRUBBING);
4628 state_clear(PG_STATE_REPAIR);
4629 state_clear(PG_STATE_DEEP_SCRUB);
4630 publish_stats_to_osd();
4631
4632 // active -> nothing.
4633 if (scrubber.active)
4634 osd->dec_scrubs_active();
4635
4636 requeue_ops(waiting_for_scrub);
4637
4638 if (scrubber.queue_snap_trim) {
4639 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4640 snap_trimmer_scrub_complete();
4641 }
4642
4643 scrubber.reset();
4644
4645 // type-specific state clear
4646 _scrub_clear_state();
4647 }
4648
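// Compare the primary's scrub map against the replicas' maps.  The backend's
// be_compare_scrubmaps() picks an authoritative copy for every object that is
// missing or inconsistent somewhere, records the errors in the scrub store,
// and fills missing_digest; the cleaned_meta_map then has the bad entries
// replaced with authoritative ones, and complete head+clone groups are handed
// to scrub_snapshot_metadata() for pg-type specific checks.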
4649 void PG::scrub_compare_maps()
4650 {
4651 dout(10) << __func__ << " has maps, analyzing" << dendl;
4652
4653 // construct authoritative scrub map for type specific scrubbing
4654 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4655 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4656
4657 if (acting.size() > 1) {
4658 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4659
4660 stringstream ss;
4661
4662 // Map from object with errors to good peer
4663 map<hobject_t, list<pg_shard_t>> authoritative;
4664 map<pg_shard_t, ScrubMap *> maps;
4665
4666 dout(2) << __func__ << " osd." << acting[0] << " has "
4667 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4668 maps[pg_whoami] = &scrubber.primary_scrubmap;
4669
4670 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4671 i != actingbackfill.end();
4672 ++i) {
4673 if (*i == pg_whoami) continue;
4674 dout(2) << __func__ << " replica " << *i << " has "
4675 << scrubber.received_maps[*i].objects.size()
4676 << " items" << dendl;
4677 maps[*i] = &scrubber.received_maps[*i];
4678 }
4679
4680 get_pgbackend()->be_compare_scrubmaps(
4681 maps,
4682 state_test(PG_STATE_REPAIR),
4683 scrubber.missing,
4684 scrubber.inconsistent,
4685 authoritative,
4686 missing_digest,
4687 scrubber.shallow_errors,
4688 scrubber.deep_errors,
4689 scrubber.store.get(),
4690 info.pgid, acting,
4691 ss);
4692 dout(2) << ss.str() << dendl;
4693
4694 if (!ss.str().empty()) {
4695 osd->clog->error(ss);
4696 }
4697
4698 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4699 i != authoritative.end();
4700 ++i) {
4701 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4702 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4703 j != i->second.end();
4704 ++j) {
4705 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4706 }
4707 scrubber.authoritative.insert(
4708 make_pair(
4709 i->first,
4710 good_peers));
4711 }
4712
4713 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4714 i != authoritative.end();
4715 ++i) {
4716 scrubber.cleaned_meta_map.objects.erase(i->first);
4717 scrubber.cleaned_meta_map.objects.insert(
4718 *(maps[i->second.back()]->objects.find(i->first))
4719 );
4720 }
4721 }
4722
4723 ScrubMap for_meta_scrub;
4724 if (scrubber.end.is_max() ||
4725 scrubber.cleaned_meta_map.objects.empty()) {
4726 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4727 } else {
4728 auto iter = scrubber.cleaned_meta_map.objects.end();
4729 --iter; // not empty, see the if clause above
4730 auto begin = scrubber.cleaned_meta_map.objects.begin();
4731 while (iter != begin) {
4732 auto next = iter--;
4733 if (next->first.get_head() != iter->first.get_head()) {
4734 ++iter;
4735 break;
4736 }
4737 }
4738 for_meta_scrub.objects.insert(begin, iter);
4739 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4740 }
4741
4742 // ok, do the pg-type specific scrubbing
4743 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4744 if (!scrubber.store->empty()) {
4745 if (state_test(PG_STATE_REPAIR)) {
4746 dout(10) << __func__ << ": discarding scrub results" << dendl;
4747 scrubber.store->flush(nullptr);
4748 } else {
4749 dout(10) << __func__ << ": updating scrub object" << dendl;
4750 ObjectStore::Transaction t;
4751 scrubber.store->flush(&t);
4752 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4753 }
4754 }
4755 }
4756
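// Walk scrubber.authoritative and, if we are repairing, mark every missing
// or inconsistent shard for repair via repair_object(), counting the fixes.
// Returns true when errors were found and a repair is in progress, which
// makes scrub_finish() queue a DoRecovery event.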
4757 bool PG::scrub_process_inconsistent()
4758 {
4759 dout(10) << __func__ << ": checking authoritative" << dendl;
4760 bool repair = state_test(PG_STATE_REPAIR);
4761 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4762 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4763
4764 // authoritative only stores objects which are missing or inconsistent.
4765 if (!scrubber.authoritative.empty()) {
4766 stringstream ss;
4767 ss << info.pgid << " " << mode << " "
4768 << scrubber.missing.size() << " missing, "
4769 << scrubber.inconsistent.size() << " inconsistent objects";
4770 dout(2) << ss.str() << dendl;
4771 osd->clog->error(ss);
4772 if (repair) {
4773 state_clear(PG_STATE_CLEAN);
4774 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4775 scrubber.authoritative.begin();
4776 i != scrubber.authoritative.end();
4777 ++i) {
4778 set<pg_shard_t>::iterator j;
4779
4780 auto missing_entry = scrubber.missing.find(i->first);
4781 if (missing_entry != scrubber.missing.end()) {
4782 for (j = missing_entry->second.begin();
4783 j != missing_entry->second.end();
4784 ++j) {
4785 repair_object(
4786 i->first,
4787 &(i->second),
4788 *j);
4789 ++scrubber.fixed;
4790 }
4791 }
4792 if (scrubber.inconsistent.count(i->first)) {
4793 for (j = scrubber.inconsistent[i->first].begin();
4794 j != scrubber.inconsistent[i->first].end();
4795 ++j) {
4796 repair_object(i->first,
4797 &(i->second),
4798 *j);
4799 ++scrubber.fixed;
4800 }
4801 }
4802 }
4803 }
4804 }
4805 return (!scrubber.authoritative.empty() && repair);
4806 }
4807
4808 bool PG::ops_blocked_by_scrub() const {
4809 return (waiting_for_scrub.size() != 0);
4810 }
4811
4812 // the part that actually finalizes a scrub
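// scrub_finish() tallies shallow/deep errors into the pg stats, updates the
// last_scrub/last_deep_scrub stamps, re-registers the next scheduled scrub,
// persists the updated info, and, when a repair found errors, queues a
// DoRecovery peering event so the marked objects get repaired.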
4813 void PG::scrub_finish()
4814 {
4815 bool repair = state_test(PG_STATE_REPAIR);
4816 // if the repair request comes from auto-repair and there is a large
4817 // number of errors, we would like to cancel the auto-repair
4818 if (repair && scrubber.auto_repair
4819 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
4820 state_clear(PG_STATE_REPAIR);
4821 repair = false;
4822 }
4823 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4824 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4825
4826 // type-specific finish (can tally more errors)
4827 _scrub_finish();
4828
4829 bool has_error = scrub_process_inconsistent();
4830
4831 {
4832 stringstream oss;
4833 oss << info.pgid.pgid << " " << mode << " ";
4834 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
4835 if (total_errors)
4836 oss << total_errors << " errors";
4837 else
4838 oss << "ok";
4839 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
4840 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
4841 << " remaining deep scrub error details lost)";
4842 if (repair)
4843 oss << ", " << scrubber.fixed << " fixed";
4844 if (total_errors)
4845 osd->clog->error(oss);
4846 else
4847 osd->clog->info(oss);
4848 }
4849
4850 // finish up
4851 unreg_next_scrub();
4852 utime_t now = ceph_clock_now();
4853 info.history.last_scrub = info.last_update;
4854 info.history.last_scrub_stamp = now;
4855 if (scrubber.deep) {
4856 info.history.last_deep_scrub = info.last_update;
4857 info.history.last_deep_scrub_stamp = now;
4858 }
4859 // Since we don't know which errors were fixed, we can only clear them
4860 // when every one has been fixed.
4861 if (repair) {
4862 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
4863 assert(deep_scrub);
4864 scrubber.shallow_errors = scrubber.deep_errors = 0;
4865 } else {
4866 // Deep scrub in order to get corrected error counts
4867 scrub_after_recovery = true;
4868 }
4869 }
4870 if (deep_scrub) {
4871 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
4872 info.history.last_clean_scrub_stamp = now;
4873 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4874 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
4875 } else {
4876 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4877 // XXX: setting last_clean_scrub_stamp here doesn't mean the pg is
4878 // consistent; there may still be outstanding deep-scrub errors
4879 if (scrubber.shallow_errors == 0)
4880 info.history.last_clean_scrub_stamp = now;
4881 }
4882 info.stats.stats.sum.num_scrub_errors =
4883 info.stats.stats.sum.num_shallow_scrub_errors +
4884 info.stats.stats.sum.num_deep_scrub_errors;
4885 reg_next_scrub();
4886
4887 {
4888 ObjectStore::Transaction t;
4889 dirty_info = true;
4890 write_if_dirty(t);
4891 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4892 assert(tr == 0);
4893 }
4894
4895
4896 if (has_error) {
4897 queue_peering_event(
4898 CephPeeringEvtRef(
4899 std::make_shared<CephPeeringEvt>(
4900 get_osdmap()->get_epoch(),
4901 get_osdmap()->get_epoch(),
4902 DoRecovery())));
4903 }
4904
4905 scrub_clear_state();
4906 scrub_unreserve_replicas();
4907
4908 if (is_active() && is_primary()) {
4909 share_pg_info();
4910 }
4911 }
4912
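// Push the freshly updated pg_info_t (and merged history) to every peer in
// the actingbackfill set via MOSDPGInfo, so replicas learn about the new
// scrub stamps and epoch information.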
4913 void PG::share_pg_info()
4914 {
4915 dout(10) << "share_pg_info" << dendl;
4916
4917 // share new pg_info_t with replicas
4918 assert(!actingbackfill.empty());
4919 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4920 i != actingbackfill.end();
4921 ++i) {
4922 if (*i == pg_whoami) continue;
4923 pg_shard_t peer = *i;
4924 if (peer_info.count(peer)) {
4925 peer_info[peer].last_epoch_started = info.last_epoch_started;
4926 peer_info[peer].last_interval_started = info.last_interval_started;
4927 peer_info[peer].history.merge(info.history);
4928 }
4929 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
4930 m->pg_list.push_back(
4931 make_pair(
4932 pg_notify_t(
4933 peer.shard, pg_whoami.shard,
4934 get_osdmap()->get_epoch(),
4935 get_osdmap()->get_epoch(),
4936 info),
4937 PastIntervals()));
4938 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
4939 }
4940 }
4941
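// Append externally supplied log entries to the local pg log, updating the
// missing set through the rollbacker and advancing last_update (and
// last_complete when nothing is missing).  Returns whether the new entries
// invalidated the stats.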
4942 bool PG::append_log_entries_update_missing(
4943 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4944 ObjectStore::Transaction &t)
4945 {
4946 assert(!entries.empty());
4947 assert(entries.begin()->version > info.last_update);
4948
4949 PGLogEntryHandler rollbacker{this, &t};
4950 bool invalidate_stats =
4951 pg_log.append_new_log_entries(info.last_backfill,
4952 info.last_backfill_bitwise,
4953 entries,
4954 &rollbacker);
4955 info.last_update = pg_log.get_head();
4956
4957 if (pg_log.get_missing().num_missing() == 0) {
4958 // advance last_complete since nothing else is missing!
4959 info.last_complete = info.last_update;
4960 }
4961
4962 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
4963 dirty_info = true;
4964 write_if_dirty(t);
4965 return invalidate_stats;
4966 }
4967
4968
4969 void PG::merge_new_log_entries(
4970 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
4971 ObjectStore::Transaction &t)
4972 {
4973 dout(10) << __func__ << " " << entries << dendl;
4974 assert(is_primary());
4975
4976 bool rebuild_missing = append_log_entries_update_missing(entries, t);
4977 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
4978 i != actingbackfill.end();
4979 ++i) {
4980 pg_shard_t peer(*i);
4981 if (peer == pg_whoami) continue;
4982 assert(peer_missing.count(peer));
4983 assert(peer_info.count(peer));
4984 pg_missing_t& pmissing(peer_missing[peer]);
4985 pg_info_t& pinfo(peer_info[peer]);
4986 bool invalidate_stats = PGLog::append_log_entries_update_missing(
4987 pinfo.last_backfill,
4988 info.last_backfill_bitwise,
4989 entries,
4990 true,
4991 NULL,
4992 pmissing,
4993 NULL,
4994 this);
4995 pinfo.last_update = info.last_update;
4996 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
4997 rebuild_missing = rebuild_missing || invalidate_stats;
4998 }
4999
5000 if (!rebuild_missing) {
5001 return;
5002 }
5003
5004 for (auto &&i: entries) {
5005 missing_loc.rebuild(
5006 i.soid,
5007 pg_whoami,
5008 actingbackfill,
5009 info,
5010 pg_log.get_missing(),
5011 peer_missing,
5012 peer_info);
5013 }
5014 }
5015
5016 void PG::update_history(const pg_history_t& new_history)
5017 {
5018 unreg_next_scrub();
5019 if (info.history.merge(new_history)) {
5020 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5021 dirty_info = true;
5022 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5023 dout(20) << __func__ << " clearing past_intervals" << dendl;
5024 past_intervals.clear();
5025 dirty_big_info = true;
5026 }
5027 }
5028 reg_next_scrub();
5029 }
5030
5031 void PG::fulfill_info(
5032 pg_shard_t from, const pg_query_t &query,
5033 pair<pg_shard_t, pg_info_t> &notify_info)
5034 {
5035 assert(from == primary);
5036 assert(query.type == pg_query_t::INFO);
5037
5038 // info
5039 dout(10) << "sending info" << dendl;
5040 notify_info = make_pair(from, info);
5041 }
5042
5043 void PG::fulfill_log(
5044 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5045 {
5046 dout(10) << "log request from " << from << dendl;
5047 assert(from == primary);
5048 assert(query.type != pg_query_t::INFO);
5049 ConnectionRef con = osd->get_con_osd_cluster(
5050 from.osd, get_osdmap()->get_epoch());
5051 if (!con) return;
5052
5053 MOSDPGLog *mlog = new MOSDPGLog(
5054 from.shard, pg_whoami.shard,
5055 get_osdmap()->get_epoch(),
5056 info, query_epoch);
5057 mlog->missing = pg_log.get_missing();
5058
5059 // primary -> other, when building master log
5060 if (query.type == pg_query_t::LOG) {
5061 dout(10) << " sending info+missing+log since " << query.since
5062 << dendl;
5063 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5064 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5065 << " when my log.tail is " << pg_log.get_tail()
5066 << ", sending full log instead";
5067 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5068 } else
5069 mlog->log.copy_after(pg_log.get_log(), query.since);
5070 }
5071 else if (query.type == pg_query_t::FULLLOG) {
5072 dout(10) << " sending info+missing+full log" << dendl;
5073 mlog->log = pg_log.get_log();
5074 }
5075
5076 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5077
5078 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5079 osd->send_message_osd_cluster(mlog, con.get());
5080 }
5081
5082 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5083 {
5084 bool changed = false;
5085 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5086 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5087 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5088 changed = true;
5089 }
5090 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5091 assert(pi);
5092 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5093 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5094 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5095 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5096 changed = true;
5097 }
5098 }
5099 if (changed) {
5100 info.history.last_epoch_marked_full = osdmap->get_epoch();
5101 dirty_info = true;
5102 }
5103 }
5104
5105 bool PG::should_restart_peering(
5106 int newupprimary,
5107 int newactingprimary,
5108 const vector<int>& newup,
5109 const vector<int>& newacting,
5110 OSDMapRef lastmap,
5111 OSDMapRef osdmap)
5112 {
5113 if (PastIntervals::is_new_interval(
5114 primary.osd,
5115 newactingprimary,
5116 acting,
5117 newacting,
5118 up_primary.osd,
5119 newupprimary,
5120 up,
5121 newup,
5122 osdmap,
5123 lastmap,
5124 info.pgid.pgid)) {
5125 dout(20) << "new interval newup " << newup
5126 << " newacting " << newacting << dendl;
5127 return true;
5128 } else {
5129 return false;
5130 }
5131 }
5132
5133 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5134 {
5135 if (last_peering_reset > reply_epoch ||
5136 last_peering_reset > query_epoch) {
5137 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5138 << " last_peering_reset " << last_peering_reset
5139 << dendl;
5140 return true;
5141 }
5142 return false;
5143 }
5144
5145 void PG::set_last_peering_reset()
5146 {
5147 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5148 if (last_peering_reset != get_osdmap()->get_epoch()) {
5149 last_peering_reset = get_osdmap()->get_epoch();
5150 reset_interval_flush();
5151 }
5152 }
5153
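// FlushState is a small RAII trigger used by start_flush(): the same
// shared_ptr is attached to both the on_applied and on_safe callback lists
// of a transaction, and when the last reference goes away the destructor
// queues a "flushed" event for this interval (unless the PG has been reset
// since).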
5154 struct FlushState {
5155 PGRef pg;
5156 epoch_t epoch;
5157 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5158 ~FlushState() {
5159 pg->lock();
5160 if (!pg->pg_has_reset_since(epoch))
5161 pg->queue_flushed(epoch);
5162 pg->unlock();
5163 }
5164 };
5165 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5166
5167 void PG::start_flush(ObjectStore::Transaction *t,
5168 list<Context *> *on_applied,
5169 list<Context *> *on_safe)
5170 {
5171 // flush in progress ops
5172 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5173 this, get_osdmap()->get_epoch()));
5174 t->nop();
5175 flushes_in_progress++;
5176 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5177 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5178 }
5179
5180 void PG::reset_interval_flush()
5181 {
5182 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5183 recovery_state.clear_blocked_outgoing();
5184
5185 Context *c = new QueuePeeringEvt<IntervalFlush>(
5186 this, get_osdmap()->get_epoch(), IntervalFlush());
5187 if (!osr->flush_commit(c)) {
5188 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5189 recovery_state.begin_block_outgoing();
5190 } else {
5191 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5192 delete c;
5193 }
5194 }
5195
5196 /* Called before initializing peering during advance_map */
5197 void PG::start_peering_interval(
5198 const OSDMapRef lastmap,
5199 const vector<int>& newup, int new_up_primary,
5200 const vector<int>& newacting, int new_acting_primary,
5201 ObjectStore::Transaction *t)
5202 {
5203 const OSDMapRef osdmap = get_osdmap();
5204
5205 set_last_peering_reset();
5206
5207 vector<int> oldacting, oldup;
5208 int oldrole = get_role();
5209
5210 unreg_next_scrub();
5211
5212 pg_shard_t old_acting_primary = get_primary();
5213 pg_shard_t old_up_primary = up_primary;
5214 bool was_old_primary = is_primary();
5215
5216 acting.swap(oldacting);
5217 up.swap(oldup);
5218 init_primary_up_acting(
5219 newup,
5220 newacting,
5221 new_up_primary,
5222 new_acting_primary);
5223
5224 if (info.stats.up != up ||
5225 info.stats.acting != acting ||
5226 info.stats.up_primary != new_up_primary ||
5227 info.stats.acting_primary != new_acting_primary) {
5228 info.stats.up = up;
5229 info.stats.up_primary = new_up_primary;
5230 info.stats.acting = acting;
5231 info.stats.acting_primary = new_acting_primary;
5232 info.stats.mapping_epoch = osdmap->get_epoch();
5233 }
5234
5235 pg_stats_publish_lock.Lock();
5236 pg_stats_publish_valid = false;
5237 pg_stats_publish_lock.Unlock();
5238
5239 // This will now be remapped during a backfill in cases
5240 // where it would not have been before.
5241 if (up != acting)
5242 state_set(PG_STATE_REMAPPED);
5243 else
5244 state_clear(PG_STATE_REMAPPED);
5245
5246 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5247 if (pool.info.is_replicated() || role == pg_whoami.shard)
5248 set_role(role);
5249 else
5250 set_role(-1);
5251
5252 // did acting, up, primary|acker change?
5253 if (!lastmap) {
5254 dout(10) << " no lastmap" << dendl;
5255 dirty_info = true;
5256 dirty_big_info = true;
5257 info.history.same_interval_since = osdmap->get_epoch();
5258 } else {
5259 std::stringstream debug;
5260 assert(info.history.same_interval_since != 0);
5261 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5262 get_is_recoverable_predicate());
5263 bool new_interval = PastIntervals::check_new_interval(
5264 old_acting_primary.osd,
5265 new_acting_primary,
5266 oldacting, newacting,
5267 old_up_primary.osd,
5268 new_up_primary,
5269 oldup, newup,
5270 info.history.same_interval_since,
5271 info.history.last_epoch_clean,
5272 osdmap,
5273 lastmap,
5274 info.pgid.pgid,
5275 recoverable.get(),
5276 &past_intervals,
5277 &debug);
5278 dout(10) << __func__ << ": check_new_interval output: "
5279 << debug.str() << dendl;
5280 if (new_interval) {
5281 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5282 info.history.last_epoch_clean < osdmap->get_epoch()) {
5283 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5284 // our information is incomplete and useless; someone else was clean
5285 // after everything we know about, since osdmaps have been trimmed.
5286 past_intervals.clear();
5287 } else {
5288 dout(10) << " noting past " << past_intervals << dendl;
5289 }
5290 dirty_info = true;
5291 dirty_big_info = true;
5292 info.history.same_interval_since = osdmap->get_epoch();
5293 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5294 osdmap->get_pg_num(info.pgid.pgid.pool()),
5295 nullptr)) {
5296 info.history.last_epoch_split = osdmap->get_epoch();
5297 }
5298 }
5299 }
5300
5301 if (old_up_primary != up_primary ||
5302 oldup != up) {
5303 info.history.same_up_since = osdmap->get_epoch();
5304 }
5305 // this comparison includes primary rank via pg_shard_t
5306 if (old_acting_primary != get_primary()) {
5307 info.history.same_primary_since = osdmap->get_epoch();
5308 }
5309
5310 on_new_interval();
5311
5312 dout(1) << __func__ << " up " << oldup << " -> " << up
5313 << ", acting " << oldacting << " -> " << acting
5314 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5315 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5316 << ", role " << oldrole << " -> " << role
5317 << ", features acting " << acting_features
5318 << " upacting " << upacting_features
5319 << dendl;
5320
5321 // deactivate.
5322 state_clear(PG_STATE_ACTIVE);
5323 state_clear(PG_STATE_PEERED);
5324 state_clear(PG_STATE_DOWN);
5325 state_clear(PG_STATE_RECOVERY_WAIT);
5326 state_clear(PG_STATE_RECOVERY_TOOFULL);
5327 state_clear(PG_STATE_RECOVERING);
5328
5329 peer_purged.clear();
5330 actingbackfill.clear();
5331 scrub_queued = false;
5332
5333 // reset primary state?
5334 if (was_old_primary || is_primary()) {
5335 osd->remove_want_pg_temp(info.pgid.pgid);
5336 }
5337 clear_primary_state();
5338
5339
5340 // pg->on_*
5341 on_change(t);
5342
5343 projected_last_update = eversion_t();
5344
5345 assert(!deleting);
5346
5347 // should we tell the primary we are here?
5348 send_notify = !is_primary();
5349
5350 if (role != oldrole ||
5351 was_old_primary != is_primary()) {
5352 // did primary change?
5353 if (was_old_primary != is_primary()) {
5354 state_clear(PG_STATE_CLEAN);
5355 clear_publish_stats();
5356 }
5357
5358 on_role_change();
5359
5360 // take active waiters
5361 requeue_ops(waiting_for_peered);
5362
5363 } else {
5364 // no role change.
5365 // did primary change?
5366 if (get_primary() != old_acting_primary) {
5367 dout(10) << *this << " " << oldacting << " -> " << acting
5368 << ", acting primary "
5369 << old_acting_primary << " -> " << get_primary()
5370 << dendl;
5371 } else {
5372 // primary is the same.
5373 if (is_primary()) {
5374 // i am (still) primary. but my replica set changed.
5375 state_clear(PG_STATE_CLEAN);
5376
5377 dout(10) << oldacting << " -> " << acting
5378 << ", replicas changed" << dendl;
5379 }
5380 }
5381 }
5382 cancel_recovery();
5383
5384 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5385 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5386 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5387 }
5388 }
5389
5390 void PG::on_new_interval()
5391 {
5392 const OSDMapRef osdmap = get_osdmap();
5393
5394 reg_next_scrub();
5395
5396 // initialize features
5397 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5398 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5399 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5400 if (*p == CRUSH_ITEM_NONE)
5401 continue;
5402 uint64_t f = osdmap->get_xinfo(*p).features;
5403 acting_features &= f;
5404 upacting_features &= f;
5405 }
5406 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5407 if (*p == CRUSH_ITEM_NONE)
5408 continue;
5409 upacting_features &= osdmap->get_xinfo(*p).features;
5410 }
5411
5412 assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
5413
5414 _on_new_interval();
5415 }
5416
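// proc_primary_info: replica-side handling of a pg_info_t received from the
// primary.  Merges the primary's history into ours and, once our
// last_complete_ondisk has reached last_epoch_started, adopts the primary's
// purged_snaps (optionally cross-checking the snap_mapper first when
// osd_debug_verify_snaps_on_info is set) and marks the info dirty.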
5417 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5418 {
5419 assert(!is_primary());
5420
5421 update_history(oinfo.history);
5422
5423 if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
5424 // DEBUG: verify that the snaps are empty in snap_mapper
5425 if (cct->_conf->osd_debug_verify_snaps_on_info) {
5426 interval_set<snapid_t> p;
5427 p.union_of(oinfo.purged_snaps, info.purged_snaps);
5428 p.subtract(info.purged_snaps);
5429 if (!p.empty()) {
5430 for (interval_set<snapid_t>::iterator i = p.begin();
5431 i != p.end();
5432 ++i) {
5433 for (snapid_t snap = i.get_start();
5434 snap != i.get_len() + i.get_start();
5435 ++snap) {
5436 vector<hobject_t> hoids;
5437 int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
5438 if (r != 0 && r != -ENOENT) {
5439 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5440 << cpp_strerror(r) << dendl;
5441 ceph_abort();
5442 } else if (r != -ENOENT) {
5443 assert(!hoids.empty());
5444 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5445 << cpp_strerror(r) << " for object "
5446 << hoids[0] << " on snap " << snap
5447 << " which should have been fully trimmed " << dendl;
5448 ceph_abort();
5449 }
5450 }
5451 }
5452 }
5453 }
5454 info.purged_snaps = oinfo.purged_snaps;
5455 dirty_info = true;
5456 dirty_big_info = true;
5457 }
5458 }
5459
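// Debug formatter: renders the one-line "pg[...]" summary used throughout
// OSD logs -- up/acting sets, role, last_peering_reset, past_interval
// bounds, log bounds, scrub flags, missing/unfound counts and the snap trim
// queue.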
5460 ostream& operator<<(ostream& out, const PG& pg)
5461 {
5462 out << "pg[" << pg.info
5463 << " " << pg.up;
5464 if (pg.acting != pg.up)
5465 out << "/" << pg.acting;
5466 out << " r=" << pg.get_role();
5467 out << " lpr=" << pg.get_last_peering_reset();
5468
5469 if (!pg.past_intervals.empty()) {
5470 out << " pi=[" << pg.past_intervals.get_bounds()
5471 << ")/" << pg.past_intervals.size();
5472 }
5473
5474 if (pg.is_peered()) {
5475 if (pg.last_update_ondisk != pg.info.last_update)
5476 out << " luod=" << pg.last_update_ondisk;
5477 if (pg.last_update_applied != pg.info.last_update)
5478 out << " lua=" << pg.last_update_applied;
5479 }
5480
5481 if (pg.recovery_ops_active)
5482 out << " rops=" << pg.recovery_ops_active;
5483
5484 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5485 pg.pg_log.get_head() != pg.info.last_update)
5486 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5487
5488 if (!pg.pg_log.get_log().empty()) {
5489 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5490 out << " (log bound mismatch, actual=["
5491 << pg.pg_log.get_log().log.begin()->version << ","
5492 << pg.pg_log.get_log().log.rbegin()->version << "]";
5493 out << ")";
5494 }
5495 }
5496
5497 if (!pg.backfill_targets.empty())
5498 out << " bft=" << pg.backfill_targets;
5499 out << " crt=" << pg.pg_log.get_can_rollback_to();
5500
5501 if (pg.last_complete_ondisk != pg.info.last_complete)
5502 out << " lcod " << pg.last_complete_ondisk;
5503
5504 if (pg.is_primary()) {
5505 out << " mlcod " << pg.min_last_complete_ondisk;
5506 }
5507
5508 out << " " << pg_state_string(pg.get_state());
5509 if (pg.should_send_notify())
5510 out << " NOTIFY";
5511
5512 if (pg.scrubber.must_repair)
5513 out << " MUST_REPAIR";
5514 if (pg.scrubber.auto_repair)
5515 out << " AUTO_REPAIR";
5516 if (pg.scrubber.must_deep_scrub)
5517 out << " MUST_DEEP_SCRUB";
5518 if (pg.scrubber.must_scrub)
5519 out << " MUST_SCRUB";
5520
5521 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5522 if (pg.pg_log.get_missing().num_missing()) {
5523 out << " m=" << pg.pg_log.get_missing().num_missing();
5524 if (pg.is_primary()) {
5525 uint64_t unfound = pg.get_num_unfound();
5526 if (unfound)
5527 out << " u=" << unfound;
5528 }
5529 }
5530 if (pg.snap_trimq.size())
5531 out << " snaptrimq=" << pg.snap_trimq;
5532
5533 out << "]";
5534
5535
5536 return out;
5537 }
5538
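// can_discard_op: true if a client MOSDOp can safely be dropped -- the
// client has disconnected, the op predates the current primary interval,
// or (depending on the client's resend feature bits) it predates a
// pool-wide forced resend or a PG split.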
5539 bool PG::can_discard_op(OpRequestRef& op)
5540 {
5541 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5542 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5543 dout(20) << " discard " << *m << dendl;
5544 return true;
5545 }
5546
5547 if (m->get_map_epoch() < info.history.same_primary_since) {
5548 dout(7) << " changed after " << m->get_map_epoch()
5549 << ", dropping " << *m << dendl;
5550 return true;
5551 }
5552
5553 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5554 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5555 dout(7) << __func__ << " sent before last_force_op_resend "
5556 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5557 return true;
5558 }
5559 if (m->get_map_epoch() < info.history.last_epoch_split) {
5560 dout(7) << __func__ << " pg split in "
5561 << info.history.last_epoch_split << ", dropping" << dendl;
5562 return true;
5563 }
5564 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5565 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5566 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5567 << pool.info.last_force_op_resend_preluminous
5568 << ", dropping" << *m << dendl;
5569 return true;
5570 }
5571 }
5572
5573 return false;
5574 }
5575
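// can_discard_replica_op: shared predicate for replica-to-replica traffic
// (pushes, pulls, sub-op replies, ...), instantiated from
// can_discard_request(), e.g. can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op).
// A message is dropped if its sender was marked down at or after the epoch
// it was sent in, or if the PG has since changed intervals.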
5576 template<typename T, int MSGTYPE>
5577 bool PG::can_discard_replica_op(OpRequestRef& op)
5578 {
5579 const T *m = static_cast<const T *>(op->get_req());
5580 assert(m->get_type() == MSGTYPE);
5581
5582 /* Mostly, this overlaps with the old_peering_msg
5583 * condition. An important exception is pushes
5584 * sent by replicas not in the acting set, since
5585 * if such a replica goes down it does not cause
5586 * a new interval. */
5587 int from = m->get_source().num();
5588 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5589 return true;
5590
5591 // same pg?
5592 // if pg changes _at all_, we reset and repeer!
5593 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5594 dout(10) << "can_discard_replica_op pg changed " << info.history
5595 << " after " << m->map_epoch
5596 << ", dropping" << dendl;
5597 return true;
5598 }
5599 return false;
5600 }
5601
5602 bool PG::can_discard_scan(OpRequestRef op)
5603 {
5604 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5605 assert(m->get_type() == MSG_OSD_PG_SCAN);
5606
5607 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5608 dout(10) << " got old scan, ignoring" << dendl;
5609 return true;
5610 }
5611 return false;
5612 }
5613
5614 bool PG::can_discard_backfill(OpRequestRef op)
5615 {
5616 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5617 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5618
5619 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5620 dout(10) << " got old backfill, ignoring" << dendl;
5621 return true;
5622 }
5623
5624 return false;
5625
5626 }
5627
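// can_discard_request: dispatch on message type to the matching
// can_discard_* helper.  Backoffs are never discarded; unrecognized message
// types are always discarded.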
5628 bool PG::can_discard_request(OpRequestRef& op)
5629 {
5630 switch (op->get_req()->get_type()) {
5631 case CEPH_MSG_OSD_OP:
5632 return can_discard_op(op);
5633 case CEPH_MSG_OSD_BACKOFF:
5634 return false; // never discard
5635 case MSG_OSD_SUBOP:
5636 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5637 case MSG_OSD_REPOP:
5638 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5639 case MSG_OSD_PG_PUSH:
5640 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5641 case MSG_OSD_PG_PULL:
5642 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5643 case MSG_OSD_PG_PUSH_REPLY:
5644 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5645 case MSG_OSD_SUBOPREPLY:
5646 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5647 case MSG_OSD_REPOPREPLY:
5648 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5649
5650 case MSG_OSD_EC_WRITE:
5651 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5652 case MSG_OSD_EC_WRITE_REPLY:
5653 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5654 case MSG_OSD_EC_READ:
5655 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5656 case MSG_OSD_EC_READ_REPLY:
5657 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5658 case MSG_OSD_REP_SCRUB:
5659 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5660 case MSG_OSD_SCRUB_RESERVE:
5661 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5662 case MSG_OSD_REP_SCRUBMAP:
5663 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5664 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5665 return can_discard_replica_op<
5666 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5667 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5668 return can_discard_replica_op<
5669 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5670
5671 case MSG_OSD_PG_SCAN:
5672 return can_discard_scan(op);
5673 case MSG_OSD_PG_BACKFILL:
5674 return can_discard_backfill(op);
5675 case MSG_OSD_PG_BACKFILL_REMOVE:
5676 return can_discard_replica_op<MOSDPGBackfillRemove,
5677 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5678 }
5679 return true;
5680 }
5681
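// take_waiters: requeue ops that were waiting for a newer osdmap and splice
// any deferred peering events back onto the front of the peering queue,
// waking the peering work queue once per waiter.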
5682 void PG::take_waiters()
5683 {
5684 dout(10) << "take_waiters" << dendl;
5685 requeue_map_waiters();
5686 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5687 i != peering_waiters.end();
5688 ++i) osd->queue_for_peering(this);
5689 peering_queue.splice(peering_queue.begin(), peering_waiters,
5690 peering_waiters.begin(), peering_waiters.end());
5691 }
5692
5693 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5694 {
5695 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5696 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5697 dout(10) << "deferring event " << evt->get_desc() << dendl;
5698 peering_waiters.push_back(evt);
5699 return;
5700 }
5701 if (old_peering_evt(evt))
5702 return;
5703 recovery_state.handle_event(evt, rctx);
5704 }
5705
5706 void PG::queue_peering_event(CephPeeringEvtRef evt)
5707 {
5708 if (old_peering_evt(evt))
5709 return;
5710 peering_queue.push_back(evt);
5711 osd->queue_for_peering(this);
5712 }
5713
5714 void PG::queue_null(epoch_t msg_epoch,
5715 epoch_t query_epoch)
5716 {
5717 dout(10) << "null" << dendl;
5718 queue_peering_event(
5719 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5720 NullEvt())));
5721 }
5722
5723 void PG::queue_flushed(epoch_t e)
5724 {
5725 dout(10) << "flushed" << dendl;
5726 queue_peering_event(
5727 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5728 FlushedEvt())));
5729 }
5730
5731 void PG::queue_query(epoch_t msg_epoch,
5732 epoch_t query_epoch,
5733 pg_shard_t from, const pg_query_t& q)
5734 {
5735 dout(10) << __func__ << " " << q << " from replica " << from << dendl;
5736 queue_peering_event(
5737 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5738 MQuery(from, q, query_epoch))));
5739 }
5740
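// handle_advance_map: consume one successive osdmap.  Updates the cached
// osdmap ref, pool info and past_intervals type, optionally cross-checks the
// pool's cached removed-snaps set (osd_debug_verify_cached_snaps), feeds an
// AdvMap event to the recovery state machine, and runs the pool-change hooks
// if the pool was modified in this epoch.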
5741 void PG::handle_advance_map(
5742 OSDMapRef osdmap, OSDMapRef lastmap,
5743 vector<int>& newup, int up_primary,
5744 vector<int>& newacting, int acting_primary,
5745 RecoveryCtx *rctx)
5746 {
5747 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5748 assert(lastmap == osdmap_ref);
5749 dout(10) << "handle_advance_map "
5750 << newup << "/" << newacting
5751 << " -- " << up_primary << "/" << acting_primary
5752 << dendl;
5753 update_osdmap_ref(osdmap);
5754 pool.update(osdmap);
5755 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5756 if (cct->_conf->osd_debug_verify_cached_snaps) {
5757 interval_set<snapid_t> actual_removed_snaps;
5758 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5759 assert(pi);
5760 pi->build_removed_snaps(actual_removed_snaps);
5761 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5762 derr << __func__ << ": mismatch between the actual removed snaps "
5763 << actual_removed_snaps << " and pool.cached_removed_snaps "
5764 << pool.cached_removed_snaps
5765 << dendl;
5766 }
5767 assert(actual_removed_snaps == pool.cached_removed_snaps);
5768 }
5769 AdvMap evt(
5770 osdmap, lastmap, newup, up_primary,
5771 newacting, acting_primary);
5772 recovery_state.handle_event(evt, rctx);
5773 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5774 on_pool_change();
5775 update_store_with_options();
5776 }
5777 }
5778
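// handle_activate_map: deliver an ActMap event once the PG has consumed all
// pending maps.  Re-persists the info if the last persisted osdmap epoch has
// fallen more than osd_pg_epoch_persisted_max_stale epochs behind, and
// rechecks watchers against any newly blacklisted clients.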
5779 void PG::handle_activate_map(RecoveryCtx *rctx)
5780 {
5781 dout(10) << "handle_activate_map " << dendl;
5782 ActMap evt;
5783 recovery_state.handle_event(evt, rctx);
5784 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
5785 cct->_conf->osd_pg_epoch_persisted_max_stale) {
5786 dout(20) << __func__ << ": Dirtying info: last_persisted is "
5787 << last_persisted_osdmap_ref->get_epoch()
5788 << " while current is " << osdmap_ref->get_epoch() << dendl;
5789 dirty_info = true;
5790 } else {
5791 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
5792 << last_persisted_osdmap_ref->get_epoch()
5793 << " while current is " << osdmap_ref->get_epoch() << dendl;
5794 }
5795 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
5796 }
5797
5798 void PG::handle_loaded(RecoveryCtx *rctx)
5799 {
5800 dout(10) << "handle_loaded" << dendl;
5801 Load evt;
5802 recovery_state.handle_event(evt, rctx);
5803 }
5804
5805 void PG::handle_create(RecoveryCtx *rctx)
5806 {
5807 dout(10) << "handle_create" << dendl;
5808 rctx->created_pgs.insert(this);
5809 Initialize evt;
5810 recovery_state.handle_event(evt, rctx);
5811 ActMap evt2;
5812 recovery_state.handle_event(evt2, rctx);
5813 }
5814
5815 void PG::handle_query_state(Formatter *f)
5816 {
5817 dout(10) << "handle_query_state" << dendl;
5818 QueryState q(f);
5819 recovery_state.handle_event(q, 0);
5820 }
5821
5822 void PG::update_store_with_options()
5823 {
5824 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
5825 if (r < 0 && r != -EOPNOTSUPP) {
5826 derr << __func__ << " set_collection_opts returns error: " << r << dendl;
5827 }
5828 }
5829
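// update_store_on_load: legacy filestore collections that predate the
// recorded collection bit width get it backfilled here -- 0 for the meta
// collection, otherwise the PG's split bits for the pool's current pg_num.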
5830 void PG::update_store_on_load()
5831 {
5832 if (osd->store->get_type() == "filestore") {
5833 // legacy filestore didn't store collection bit width; fix.
5834 int bits = osd->store->collection_bits(coll);
5835 if (bits < 0) {
5836 if (coll.is_meta())
5837 bits = 0;
5838 else
5839 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
5840 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
5841 ObjectStore::Transaction t;
5842 t.collection_set_bits(coll, bits);
5843 osd->store->apply_transaction(osr.get(), std::move(t));
5844 }
5845 }
5846 }
5847
5848 /*------------ Recovery State Machine----------------*/
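// The peering/recovery logic below is a boost::statechart machine
// (PG::RecoveryState::RecoveryMachine).  Each nested struct is a state: its
// constructor runs on entry, exit() runs on departure, and react() handlers
// consume events such as AdvMap, ActMap or the reservation events, returning
// transit<>, discard_event() or forward_event().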
5849 #undef dout_prefix
5850 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5851 << "state<" << get_state_name() << ">: ")
5852
5853 /*------Crashed-------*/
5854 PG::RecoveryState::Crashed::Crashed(my_context ctx)
5855 : my_base(ctx),
5856 NamedState(context< RecoveryMachine >().pg, "Crashed")
5857 {
5858 context< RecoveryMachine >().log_enter(state_name);
5859 assert(0 == "we got a bad state machine event");
5860 }
5861
5862
5863 /*------Initial-------*/
5864 PG::RecoveryState::Initial::Initial(my_context ctx)
5865 : my_base(ctx),
5866 NamedState(context< RecoveryMachine >().pg, "Initial")
5867 {
5868 context< RecoveryMachine >().log_enter(state_name);
5869 }
5870
5871 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
5872 {
5873 PG *pg = context< RecoveryMachine >().pg;
5874
5875 // do we tell someone we're here?
5876 pg->send_notify = (!pg->is_primary());
5877 pg->update_store_with_options();
5878
5879 pg->update_store_on_load();
5880
5881 return transit< Reset >();
5882 }
5883
5884 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
5885 {
5886 PG *pg = context< RecoveryMachine >().pg;
5887 pg->proc_replica_info(
5888 notify.from, notify.notify.info, notify.notify.epoch_sent);
5889 pg->set_last_peering_reset();
5890 return transit< Primary >();
5891 }
5892
5893 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
5894 {
5895 PG *pg = context< RecoveryMachine >().pg;
5896 assert(!pg->is_primary());
5897 post_event(i);
5898 return transit< Stray >();
5899 }
5900
5901 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
5902 {
5903 PG *pg = context< RecoveryMachine >().pg;
5904 assert(!pg->is_primary());
5905 post_event(i);
5906 return transit< Stray >();
5907 }
5908
5909 void PG::RecoveryState::Initial::exit()
5910 {
5911 context< RecoveryMachine >().log_exit(state_name, enter_time);
5912 PG *pg = context< RecoveryMachine >().pg;
5913 utime_t dur = ceph_clock_now() - enter_time;
5914 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
5915 }
5916
5917 /*------Started-------*/
5918 PG::RecoveryState::Started::Started(my_context ctx)
5919 : my_base(ctx),
5920 NamedState(context< RecoveryMachine >().pg, "Started")
5921 {
5922 context< RecoveryMachine >().log_enter(state_name);
5923 }
5924
5925 boost::statechart::result
5926 PG::RecoveryState::Started::react(const IntervalFlush&)
5927 {
5928 PG *pg = context< RecoveryMachine >().pg;
5929 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
5930 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
5931 return discard_event();
5932 }
5933
5934
5935 boost::statechart::result
5936 PG::RecoveryState::Started::react(const FlushedEvt&)
5937 {
5938 PG *pg = context< RecoveryMachine >().pg;
5939 pg->on_flushed();
5940 return discard_event();
5941 }
5942
5943
5944 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
5945 {
5946 PG *pg = context< RecoveryMachine >().pg;
5947 ldout(pg->cct, 10) << "Started advmap" << dendl;
5948 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
5949 if (pg->should_restart_peering(
5950 advmap.up_primary,
5951 advmap.acting_primary,
5952 advmap.newup,
5953 advmap.newacting,
5954 advmap.lastmap,
5955 advmap.osdmap)) {
5956 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
5957 << dendl;
5958 post_event(advmap);
5959 return transit< Reset >();
5960 }
5961 pg->remove_down_peer_info(advmap.osdmap);
5962 return discard_event();
5963 }
5964
5965 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
5966 {
5967 q.f->open_object_section("state");
5968 q.f->dump_string("name", state_name);
5969 q.f->dump_stream("enter_time") << enter_time;
5970 q.f->close_section();
5971 return discard_event();
5972 }
5973
5974 void PG::RecoveryState::Started::exit()
5975 {
5976 context< RecoveryMachine >().log_exit(state_name, enter_time);
5977 PG *pg = context< RecoveryMachine >().pg;
5978 utime_t dur = ceph_clock_now() - enter_time;
5979 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
5980 }
5981
5982 /*--------Reset---------*/
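// Reset is entered after the initial load and whenever
// should_restart_peering() decides the interval has changed.  It records a
// new last_peering_reset epoch; on ActMap it notifies the current primary
// (if we are not it), requeues waiters and transitions back to Started.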
5983 PG::RecoveryState::Reset::Reset(my_context ctx)
5984 : my_base(ctx),
5985 NamedState(context< RecoveryMachine >().pg, "Reset")
5986 {
5987 context< RecoveryMachine >().log_enter(state_name);
5988 PG *pg = context< RecoveryMachine >().pg;
5989
5990 pg->flushes_in_progress = 0;
5991 pg->set_last_peering_reset();
5992 }
5993
5994 boost::statechart::result
5995 PG::RecoveryState::Reset::react(const FlushedEvt&)
5996 {
5997 PG *pg = context< RecoveryMachine >().pg;
5998 pg->on_flushed();
5999 return discard_event();
6000 }
6001
6002 boost::statechart::result
6003 PG::RecoveryState::Reset::react(const IntervalFlush&)
6004 {
6005 PG *pg = context< RecoveryMachine >().pg;
6006 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6007 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6008 return discard_event();
6009 }
6010
6011 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6012 {
6013 PG *pg = context< RecoveryMachine >().pg;
6014 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6015
6016 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6017
6018 if (pg->should_restart_peering(
6019 advmap.up_primary,
6020 advmap.acting_primary,
6021 advmap.newup,
6022 advmap.newacting,
6023 advmap.lastmap,
6024 advmap.osdmap)) {
6025 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6026 << dendl;
6027 pg->start_peering_interval(
6028 advmap.lastmap,
6029 advmap.newup, advmap.up_primary,
6030 advmap.newacting, advmap.acting_primary,
6031 context< RecoveryMachine >().get_cur_transaction());
6032 }
6033 pg->remove_down_peer_info(advmap.osdmap);
6034 pg->check_past_interval_bounds();
6035 return discard_event();
6036 }
6037
6038 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6039 {
6040 PG *pg = context< RecoveryMachine >().pg;
6041 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6042 context< RecoveryMachine >().send_notify(
6043 pg->get_primary(),
6044 pg_notify_t(
6045 pg->get_primary().shard, pg->pg_whoami.shard,
6046 pg->get_osdmap()->get_epoch(),
6047 pg->get_osdmap()->get_epoch(),
6048 pg->info),
6049 pg->past_intervals);
6050 }
6051
6052 pg->update_heartbeat_peers();
6053 pg->take_waiters();
6054
6055 return transit< Started >();
6056 }
6057
6058 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6059 {
6060 q.f->open_object_section("state");
6061 q.f->dump_string("name", state_name);
6062 q.f->dump_stream("enter_time") << enter_time;
6063 q.f->close_section();
6064 return discard_event();
6065 }
6066
6067 void PG::RecoveryState::Reset::exit()
6068 {
6069 context< RecoveryMachine >().log_exit(state_name, enter_time);
6070 PG *pg = context< RecoveryMachine >().pg;
6071 utime_t dur = ceph_clock_now() - enter_time;
6072 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6073 }
6074
6075 /*-------Start---------*/
6076 PG::RecoveryState::Start::Start(my_context ctx)
6077 : my_base(ctx),
6078 NamedState(context< RecoveryMachine >().pg, "Start")
6079 {
6080 context< RecoveryMachine >().log_enter(state_name);
6081
6082 PG *pg = context< RecoveryMachine >().pg;
6083 if (pg->is_primary()) {
6084 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6085 post_event(MakePrimary());
6086 } else { //is_stray
6087 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6088 post_event(MakeStray());
6089 }
6090 }
6091
6092 void PG::RecoveryState::Start::exit()
6093 {
6094 context< RecoveryMachine >().log_exit(state_name, enter_time);
6095 PG *pg = context< RecoveryMachine >().pg;
6096 utime_t dur = ceph_clock_now() - enter_time;
6097 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6098 }
6099
6100 /*---------Primary--------*/
6101 PG::RecoveryState::Primary::Primary(my_context ctx)
6102 : my_base(ctx),
6103 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6104 {
6105 context< RecoveryMachine >().log_enter(state_name);
6106 PG *pg = context< RecoveryMachine >().pg;
6107 assert(pg->want_acting.empty());
6108
6109 // set CREATING bit until we have peered for the first time.
6110 if (pg->info.history.last_epoch_started == 0) {
6111 pg->state_set(PG_STATE_CREATING);
6112 // use the history timestamp, which ultimately comes from the
6113 // monitor in the create case.
6114 utime_t t = pg->info.history.last_scrub_stamp;
6115 pg->info.stats.last_fresh = t;
6116 pg->info.stats.last_active = t;
6117 pg->info.stats.last_change = t;
6118 pg->info.stats.last_peered = t;
6119 pg->info.stats.last_clean = t;
6120 pg->info.stats.last_unstale = t;
6121 pg->info.stats.last_undegraded = t;
6122 pg->info.stats.last_fullsized = t;
6123 pg->info.stats.last_scrub_stamp = t;
6124 pg->info.stats.last_deep_scrub_stamp = t;
6125 pg->info.stats.last_clean_scrub_stamp = t;
6126 }
6127 }
6128
6129 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6130 {
6131 PG *pg = context< RecoveryMachine >().pg;
6132 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6133 pg->proc_replica_info(
6134 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6135 return discard_event();
6136 }
6137
6138 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6139 {
6140 PG *pg = context< RecoveryMachine >().pg;
6141 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6142 pg->publish_stats_to_osd();
6143 pg->take_waiters();
6144 return discard_event();
6145 }
6146
6147 void PG::RecoveryState::Primary::exit()
6148 {
6149 context< RecoveryMachine >().log_exit(state_name, enter_time);
6150 PG *pg = context< RecoveryMachine >().pg;
6151 pg->want_acting.clear();
6152 utime_t dur = ceph_clock_now() - enter_time;
6153 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6154 pg->clear_primary_state();
6155 pg->state_clear(PG_STATE_CREATING);
6156 }
6157
6158 /*---------Peering--------*/
6159 PG::RecoveryState::Peering::Peering(my_context ctx)
6160 : my_base(ctx),
6161 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6162 history_les_bound(false)
6163 {
6164 context< RecoveryMachine >().log_enter(state_name);
6165
6166 PG *pg = context< RecoveryMachine >().pg;
6167 assert(!pg->is_peered());
6168 assert(!pg->is_peering());
6169 assert(pg->is_primary());
6170 pg->state_set(PG_STATE_PEERING);
6171 }
6172
6173 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6174 {
6175 PG *pg = context< RecoveryMachine >().pg;
6176 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6177 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6178 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6179 post_event(advmap);
6180 return transit< Reset >();
6181 }
6182
6183 pg->adjust_need_up_thru(advmap.osdmap);
6184
6185 return forward_event();
6186 }
6187
6188 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6189 {
6190 PG *pg = context< RecoveryMachine >().pg;
6191
6192 q.f->open_object_section("state");
6193 q.f->dump_string("name", state_name);
6194 q.f->dump_stream("enter_time") << enter_time;
6195
6196 q.f->open_array_section("past_intervals");
6197 pg->past_intervals.dump(q.f);
6198 q.f->close_section();
6199
6200 q.f->open_array_section("probing_osds");
6201 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6202 p != prior_set.probe.end();
6203 ++p)
6204 q.f->dump_stream("osd") << *p;
6205 q.f->close_section();
6206
6207 if (prior_set.pg_down)
6208 q.f->dump_string("blocked", "peering is blocked due to down osds");
6209
6210 q.f->open_array_section("down_osds_we_would_probe");
6211 for (set<int>::iterator p = prior_set.down.begin();
6212 p != prior_set.down.end();
6213 ++p)
6214 q.f->dump_int("osd", *p);
6215 q.f->close_section();
6216
6217 q.f->open_array_section("peering_blocked_by");
6218 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6219 p != prior_set.blocked_by.end();
6220 ++p) {
6221 q.f->open_object_section("osd");
6222 q.f->dump_int("osd", p->first);
6223 q.f->dump_int("current_lost_at", p->second);
6224 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6225 q.f->close_section();
6226 }
6227 q.f->close_section();
6228
6229 if (history_les_bound) {
6230 q.f->open_array_section("peering_blocked_by_detail");
6231 q.f->open_object_section("item");
6232 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6233 q.f->close_section();
6234 q.f->close_section();
6235 }
6236
6237 q.f->close_section();
6238 return forward_event();
6239 }
6240
6241 void PG::RecoveryState::Peering::exit()
6242 {
6243 PG *pg = context< RecoveryMachine >().pg;
6244 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6245 context< RecoveryMachine >().log_exit(state_name, enter_time);
6246 pg->state_clear(PG_STATE_PEERING);
6247 pg->clear_probe_targets();
6248
6249 utime_t dur = ceph_clock_now() - enter_time;
6250 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6251 }
6252
6253
6254 /*------Backfilling-------*/
6255 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6256 : my_base(ctx),
6257 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6258 {
6259 context< RecoveryMachine >().log_enter(state_name);
6260 PG *pg = context< RecoveryMachine >().pg;
6261 pg->backfill_reserved = true;
6262 pg->queue_recovery();
6263 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6264 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6265 pg->state_set(PG_STATE_BACKFILL);
6266 pg->publish_stats_to_osd();
6267 }
6268
6269 boost::statechart::result
6270 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6271 {
6272 PG *pg = context< RecoveryMachine >().pg;
6273 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6274 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6275
6276 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6277 it != pg->backfill_targets.end();
6278 ++it) {
6279 assert(*it != pg->pg_whoami);
6280 ConnectionRef con = pg->osd->get_con_osd_cluster(
6281 it->osd, pg->get_osdmap()->get_epoch());
6282 if (con) {
6283 pg->osd->send_message_osd_cluster(
6284 new MBackfillReserve(
6285 MBackfillReserve::REJECT,
6286 spg_t(pg->info.pgid.pgid, it->shard),
6287 pg->get_osdmap()->get_epoch()),
6288 con.get());
6289 }
6290 }
6291
6292 pg->waiting_on_backfill.clear();
6293 pg->finish_recovery_op(hobject_t::get_max());
6294
6295 pg->schedule_backfill_full_retry();
6296 return transit<NotBackfilling>();
6297 }
6298
6299 void PG::RecoveryState::Backfilling::exit()
6300 {
6301 context< RecoveryMachine >().log_exit(state_name, enter_time);
6302 PG *pg = context< RecoveryMachine >().pg;
6303 pg->backfill_reserved = false;
6304 pg->backfill_reserving = false;
6305 pg->state_clear(PG_STATE_BACKFILL);
6306 utime_t dur = ceph_clock_now() - enter_time;
6307 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6308 }
6309
6310 /*--WaitRemoteBackfillReserved--*/
6311
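// Remote backfill reservations are acquired one shard at a time: the
// constructor posts RemoteBackfillReserved to itself, and each grant
// advances backfill_osd_it and requests the next reservation until all
// remote shards are reserved (AllBackfillsReserved) or one of them rejects.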
6312 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6313 : my_base(ctx),
6314 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6315 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6316 {
6317 context< RecoveryMachine >().log_enter(state_name);
6318 PG *pg = context< RecoveryMachine >().pg;
6319 pg->state_set(PG_STATE_BACKFILL_WAIT);
6320 pg->publish_stats_to_osd();
6321 post_event(RemoteBackfillReserved());
6322 }
6323
6324 boost::statechart::result
6325 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6326 {
6327 PG *pg = context< RecoveryMachine >().pg;
6328
6329 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6330 //The primary never backfills itself
6331 assert(*backfill_osd_it != pg->pg_whoami);
6332 ConnectionRef con = pg->osd->get_con_osd_cluster(
6333 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6334 if (con) {
6335 pg->osd->send_message_osd_cluster(
6336 new MBackfillReserve(
6337 MBackfillReserve::REQUEST,
6338 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6339 pg->get_osdmap()->get_epoch(),
6340 pg->get_backfill_priority()),
6341 con.get());
6342 }
6343 ++backfill_osd_it;
6344 } else {
6345 post_event(AllBackfillsReserved());
6346 }
6347 return discard_event();
6348 }
6349
6350 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6351 {
6352 context< RecoveryMachine >().log_exit(state_name, enter_time);
6353 PG *pg = context< RecoveryMachine >().pg;
6354 utime_t dur = ceph_clock_now() - enter_time;
6355 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6356 }
6357
6358 boost::statechart::result
6359 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6360 {
6361 PG *pg = context< RecoveryMachine >().pg;
6362 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6363
6364 // Send REJECT to all previously acquired reservations
6365 set<pg_shard_t>::const_iterator it, begin, end, next;
6366 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6367 end = context< Active >().remote_shards_to_reserve_backfill.end();
6368 assert(begin != end);
6369 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6370 //The primary never backfills itself
6371 assert(*it != pg->pg_whoami);
6372 ConnectionRef con = pg->osd->get_con_osd_cluster(
6373 it->osd, pg->get_osdmap()->get_epoch());
6374 if (con) {
6375 pg->osd->send_message_osd_cluster(
6376 new MBackfillReserve(
6377 MBackfillReserve::REJECT,
6378 spg_t(pg->info.pgid.pgid, it->shard),
6379 pg->get_osdmap()->get_epoch()),
6380 con.get());
6381 }
6382 }
6383
6384 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6385 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6386 pg->publish_stats_to_osd();
6387
6388 pg->schedule_backfill_full_retry();
6389
6390 return transit<NotBackfilling>();
6391 }
6392
6393 /*--WaitLocalBackfillReserved--*/
6394 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6395 : my_base(ctx),
6396 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6397 {
6398 context< RecoveryMachine >().log_enter(state_name);
6399 PG *pg = context< RecoveryMachine >().pg;
6400 pg->state_set(PG_STATE_BACKFILL_WAIT);
6401 pg->osd->local_reserver.request_reservation(
6402 pg->info.pgid,
6403 new QueuePeeringEvt<LocalBackfillReserved>(
6404 pg, pg->get_osdmap()->get_epoch(),
6405 LocalBackfillReserved()),
6406 pg->get_backfill_priority());
6407 pg->publish_stats_to_osd();
6408 }
6409
6410 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6411 {
6412 context< RecoveryMachine >().log_exit(state_name, enter_time);
6413 PG *pg = context< RecoveryMachine >().pg;
6414 utime_t dur = ceph_clock_now() - enter_time;
6415 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6416 }
6417
6418 /*----NotBackfilling------*/
6419 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6420 : my_base(ctx),
6421 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6422 {
6423 context< RecoveryMachine >().log_enter(state_name);
6424 PG *pg = context< RecoveryMachine >().pg;
6425 pg->publish_stats_to_osd();
6426 }
6427
6428 boost::statechart::result
6429 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6430 {
6431 return discard_event();
6432 }
6433
6434 boost::statechart::result
6435 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6436 {
6437 return discard_event();
6438 }
6439
6440 void PG::RecoveryState::NotBackfilling::exit()
6441 {
6442 context< RecoveryMachine >().log_exit(state_name, enter_time);
6443 PG *pg = context< RecoveryMachine >().pg;
6444 utime_t dur = ceph_clock_now() - enter_time;
6445 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6446 }
6447
6448 /*----NotRecovering------*/
6449 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6450 : my_base(ctx),
6451 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6452 {
6453 context< RecoveryMachine >().log_enter(state_name);
6454 PG *pg = context< RecoveryMachine >().pg;
6455 pg->publish_stats_to_osd();
6456 }
6457
6458 void PG::RecoveryState::NotRecovering::exit()
6459 {
6460 context< RecoveryMachine >().log_exit(state_name, enter_time);
6461 PG *pg = context< RecoveryMachine >().pg;
6462 utime_t dur = ceph_clock_now() - enter_time;
6463 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6464 }
6465
6466 /*---RepNotRecovering----*/
6467 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6468 : my_base(ctx),
6469 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6470 {
6471 context< RecoveryMachine >().log_enter(state_name);
6472 }
6473
6474 void PG::RecoveryState::RepNotRecovering::exit()
6475 {
6476 context< RecoveryMachine >().log_exit(state_name, enter_time);
6477 PG *pg = context< RecoveryMachine >().pg;
6478 utime_t dur = ceph_clock_now() - enter_time;
6479 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6480 }
6481
6482 /*---RepWaitRecoveryReserved--*/
6483 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6484 : my_base(ctx),
6485 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6486 {
6487 context< RecoveryMachine >().log_enter(state_name);
6488 PG *pg = context< RecoveryMachine >().pg;
6489
6490 pg->osd->remote_reserver.request_reservation(
6491 pg->info.pgid,
6492 new QueuePeeringEvt<RemoteRecoveryReserved>(
6493 pg, pg->get_osdmap()->get_epoch(),
6494 RemoteRecoveryReserved()),
6495 pg->get_recovery_priority());
6496 }
6497
6498 boost::statechart::result
6499 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6500 {
6501 PG *pg = context< RecoveryMachine >().pg;
6502 pg->osd->send_message_osd_cluster(
6503 pg->primary.osd,
6504 new MRecoveryReserve(
6505 MRecoveryReserve::GRANT,
6506 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6507 pg->get_osdmap()->get_epoch()),
6508 pg->get_osdmap()->get_epoch());
6509 return transit<RepRecovering>();
6510 }
6511
6512 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6513 {
6514 context< RecoveryMachine >().log_exit(state_name, enter_time);
6515 PG *pg = context< RecoveryMachine >().pg;
6516 utime_t dur = ceph_clock_now() - enter_time;
6517 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6518 }
6519
6520 /*-RepWaitBackfillReserved*/
6521 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6522 : my_base(ctx),
6523 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6524 {
6525 context< RecoveryMachine >().log_enter(state_name);
6526 }
6527
6528 boost::statechart::result
6529 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6530 {
6531 PG *pg = context< RecoveryMachine >().pg;
6532 ostringstream ss;
6533
6534 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6535 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6536 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6537 << dendl;
6538 post_event(RemoteReservationRejected());
6539 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6540 pg->osd->check_backfill_full(ss)) {
6541 ldout(pg->cct, 10) << "backfill reservation rejected: "
6542 << ss.str() << dendl;
6543 post_event(RemoteReservationRejected());
6544 } else {
6545 pg->osd->remote_reserver.request_reservation(
6546 pg->info.pgid,
6547 new QueuePeeringEvt<RemoteBackfillReserved>(
6548 pg, pg->get_osdmap()->get_epoch(),
6549 RemoteBackfillReserved()), evt.priority);
6550 }
6551 return transit<RepWaitBackfillReserved>();
6552 }
6553
6554 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6555 {
6556 context< RecoveryMachine >().log_exit(state_name, enter_time);
6557 PG *pg = context< RecoveryMachine >().pg;
6558 utime_t dur = ceph_clock_now() - enter_time;
6559 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6560 }
6561
6562 boost::statechart::result
6563 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6564 {
6565 PG *pg = context< RecoveryMachine >().pg;
6566
6567 ostringstream ss;
6568 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6569 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6570 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6571 << "failure injection" << dendl;
6572 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6573 post_event(RemoteReservationRejected());
6574 return discard_event();
6575 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6576 pg->osd->check_backfill_full(ss)) {
6577 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6578 << ss.str() << dendl;
6579 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6580 post_event(RemoteReservationRejected());
6581 return discard_event();
6582 } else {
6583 pg->osd->send_message_osd_cluster(
6584 pg->primary.osd,
6585 new MBackfillReserve(
6586 MBackfillReserve::GRANT,
6587 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6588 pg->get_osdmap()->get_epoch()),
6589 pg->get_osdmap()->get_epoch());
6590 return transit<RepRecovering>();
6591 }
6592 }
6593
6594 boost::statechart::result
6595 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
6596 {
6597 PG *pg = context< RecoveryMachine >().pg;
6598 pg->reject_reservation();
6599 return transit<RepNotRecovering>();
6600 }
6601
6602 /*---RepRecovering-------*/
6603 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6604 : my_base(ctx),
6605 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6606 {
6607 context< RecoveryMachine >().log_enter(state_name);
6608 }
6609
6610 boost::statechart::result
6611 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6612 {
6613 PG *pg = context< RecoveryMachine >().pg;
6614 pg->reject_reservation();
6615 return discard_event();
6616 }
6617
6618 void PG::RecoveryState::RepRecovering::exit()
6619 {
6620 context< RecoveryMachine >().log_exit(state_name, enter_time);
6621 PG *pg = context< RecoveryMachine >().pg;
6622 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6623 utime_t dur = ceph_clock_now() - enter_time;
6624 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6625 }
6626
6627 /*------Activating--------*/
6628 PG::RecoveryState::Activating::Activating(my_context ctx)
6629 : my_base(ctx),
6630 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6631 {
6632 context< RecoveryMachine >().log_enter(state_name);
6633 }
6634
6635 void PG::RecoveryState::Activating::exit()
6636 {
6637 context< RecoveryMachine >().log_exit(state_name, enter_time);
6638 PG *pg = context< RecoveryMachine >().pg;
6639 utime_t dur = ceph_clock_now() - enter_time;
6640 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6641 }
6642
6643 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6644 : my_base(ctx),
6645 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6646 {
6647 context< RecoveryMachine >().log_enter(state_name);
6648 PG *pg = context< RecoveryMachine >().pg;
6649
6650 // Make sure all nodes that are part of the recovery aren't full
6651 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6652 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6653 post_event(RecoveryTooFull());
6654 return;
6655 }
6656
6657 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6658 pg->state_set(PG_STATE_RECOVERY_WAIT);
6659 pg->osd->local_reserver.request_reservation(
6660 pg->info.pgid,
6661 new QueuePeeringEvt<LocalRecoveryReserved>(
6662 pg, pg->get_osdmap()->get_epoch(),
6663 LocalRecoveryReserved()),
6664 pg->get_recovery_priority());
6665 pg->publish_stats_to_osd();
6666 }
6667
6668 boost::statechart::result
6669 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
6670 {
6671 PG *pg = context< RecoveryMachine >().pg;
6672 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
6673 pg->schedule_recovery_full_retry();
6674 return transit<NotRecovering>();
6675 }
6676
6677 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6678 {
6679 context< RecoveryMachine >().log_exit(state_name, enter_time);
6680 PG *pg = context< RecoveryMachine >().pg;
6681 utime_t dur = ceph_clock_now() - enter_time;
6682 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
6683 }
6684
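// WaitRemoteRecoveryReserved mirrors the backfill case above: recovery
// reservations are requested from remote shards one at a time, each grant
// re-posting RemoteRecoveryReserved until AllRemotesReserved is posted.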
6685 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
6686 : my_base(ctx),
6687 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6688 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
6689 {
6690 context< RecoveryMachine >().log_enter(state_name);
6691 post_event(RemoteRecoveryReserved());
6692 }
6693
6694 boost::statechart::result
6695 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
6696 PG *pg = context< RecoveryMachine >().pg;
6697
6698 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
6699 assert(*remote_recovery_reservation_it != pg->pg_whoami);
6700 ConnectionRef con = pg->osd->get_con_osd_cluster(
6701 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
6702 if (con) {
6703 pg->osd->send_message_osd_cluster(
6704 new MRecoveryReserve(
6705 MRecoveryReserve::REQUEST,
6706 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
6707 pg->get_osdmap()->get_epoch()),
6708 con.get());
6709 }
6710 ++remote_recovery_reservation_it;
6711 } else {
6712 post_event(AllRemotesReserved());
6713 }
6714 return discard_event();
6715 }
6716
6717 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6718 {
6719 context< RecoveryMachine >().log_exit(state_name, enter_time);
6720 PG *pg = context< RecoveryMachine >().pg;
6721 utime_t dur = ceph_clock_now() - enter_time;
6722 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
6723 }
6724
6725 PG::RecoveryState::Recovering::Recovering(my_context ctx)
6726 : my_base(ctx),
6727 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
6728 {
6729 context< RecoveryMachine >().log_enter(state_name);
6730
6731 PG *pg = context< RecoveryMachine >().pg;
6732 pg->state_clear(PG_STATE_RECOVERY_WAIT);
6733 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6734 pg->state_set(PG_STATE_RECOVERING);
6735 pg->publish_stats_to_osd();
6736 pg->queue_recovery();
6737 }
6738
6739 void PG::RecoveryState::Recovering::release_reservations()
6740 {
6741 PG *pg = context< RecoveryMachine >().pg;
6742 assert(!pg->pg_log.get_missing().have_missing());
6743
6744 // release remote reservations
6745 for (set<pg_shard_t>::const_iterator i =
6746 context< Active >().remote_shards_to_reserve_recovery.begin();
6747 i != context< Active >().remote_shards_to_reserve_recovery.end();
6748 ++i) {
6749 if (*i == pg->pg_whoami) // skip myself
6750 continue;
6751 ConnectionRef con = pg->osd->get_con_osd_cluster(
6752 i->osd, pg->get_osdmap()->get_epoch());
6753 if (con) {
6754 pg->osd->send_message_osd_cluster(
6755 new MRecoveryReserve(
6756 MRecoveryReserve::RELEASE,
6757 spg_t(pg->info.pgid.pgid, i->shard),
6758 pg->get_osdmap()->get_epoch()),
6759 con.get());
6760 }
6761 }
6762 }
6763
6764 boost::statechart::result
6765 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
6766 {
6767 PG *pg = context< RecoveryMachine >().pg;
6768 pg->state_clear(PG_STATE_RECOVERING);
6769 release_reservations();
6770 return transit<Recovered>();
6771 }
6772
6773 boost::statechart::result
6774 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
6775 {
6776 PG *pg = context< RecoveryMachine >().pg;
6777 pg->state_clear(PG_STATE_RECOVERING);
6778 release_reservations();
6779 return transit<WaitRemoteBackfillReserved>();
6780 }
6781
6782 void PG::RecoveryState::Recovering::exit()
6783 {
6784 context< RecoveryMachine >().log_exit(state_name, enter_time);
6785 PG *pg = context< RecoveryMachine >().pg;
6786 utime_t dur = ceph_clock_now() - enter_time;
6787 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
6788 }
6789
6790 PG::RecoveryState::Recovered::Recovered(my_context ctx)
6791 : my_base(ctx),
6792 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
6793 {
6794 pg_shard_t auth_log_shard;
6795
6796 context< RecoveryMachine >().log_enter(state_name);
6797
6798 PG *pg = context< RecoveryMachine >().pg;
6799 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6800
6801 assert(!pg->needs_recovery());
6802
6803 // if we finished backfill, all acting are active; recheck if
6804 // DEGRADED | UNDERSIZED is appropriate.
6805 assert(!pg->actingbackfill.empty());
6806 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
6807 pg->actingbackfill.size()) {
6808 pg->state_clear(PG_STATE_DEGRADED);
6809 pg->publish_stats_to_osd();
6810 }
6811
6812 // trim pglog on recovered
6813 pg->trim_log();
6814
6815 // adjust acting set? (e.g. because backfill completed...)
6816 bool history_les_bound = false;
6817 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
6818 true, &history_les_bound))
6819 assert(pg->want_acting.size());
6820
6821 if (context< Active >().all_replicas_activated)
6822 post_event(GoClean());
6823 }
6824
6825 void PG::RecoveryState::Recovered::exit()
6826 {
6827 context< RecoveryMachine >().log_exit(state_name, enter_time);
6828 PG *pg = context< RecoveryMachine >().pg;
6829 utime_t dur = ceph_clock_now() - enter_time;
6830 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
6831 }
6832
6833 PG::RecoveryState::Clean::Clean(my_context ctx)
6834 : my_base(ctx),
6835 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
6836 {
6837 context< RecoveryMachine >().log_enter(state_name);
6838
6839 PG *pg = context< RecoveryMachine >().pg;
6840
6841 if (pg->info.last_complete != pg->info.last_update) {
6842 ceph_abort();
6843 }
6844 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
6845 pg->mark_clean();
6846
6847 pg->share_pg_info();
6848 pg->publish_stats_to_osd();
6849
6850 }
6851
6852 void PG::RecoveryState::Clean::exit()
6853 {
6854 context< RecoveryMachine >().log_exit(state_name, enter_time);
6855 PG *pg = context< RecoveryMachine >().pg;
6856 pg->state_clear(PG_STATE_CLEAN);
6857 utime_t dur = ceph_clock_now() - enter_time;
6858 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
6859 }
6860
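// unique_osd_shard_set: collapse a collection of pg_shard_t down to at most
// one shard per OSD (skipping 'skip', normally this PG's own shard); used by
// Active to build remote_shards_to_reserve_recovery/backfill so only one
// reservation is requested per remote OSD.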
6861 template <typename T>
6862 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
6863 {
6864 set<int> osds_found;
6865 set<pg_shard_t> out;
6866 for (typename T::const_iterator i = in.begin();
6867 i != in.end();
6868 ++i) {
6869 if (*i != skip && !osds_found.count(i->osd)) {
6870 osds_found.insert(i->osd);
6871 out.insert(*i);
6872 }
6873 }
6874 return out;
6875 }
6876
6877 /*---------Active---------*/
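// Active is entered once peering has settled on an authoritative log and
// acting set.  The constructor kicks off activation (PG::activate) and
// records every other acting/backfill shard in blocked_by until each one
// reports via MInfoRec that it has activated and committed, at which point
// all_activated_and_committed() runs.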
6878 PG::RecoveryState::Active::Active(my_context ctx)
6879 : my_base(ctx),
6880 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
6881 remote_shards_to_reserve_recovery(
6882 unique_osd_shard_set(
6883 context< RecoveryMachine >().pg->pg_whoami,
6884 context< RecoveryMachine >().pg->actingbackfill)),
6885 remote_shards_to_reserve_backfill(
6886 unique_osd_shard_set(
6887 context< RecoveryMachine >().pg->pg_whoami,
6888 context< RecoveryMachine >().pg->backfill_targets)),
6889 all_replicas_activated(false)
6890 {
6891 context< RecoveryMachine >().log_enter(state_name);
6892
6893 PG *pg = context< RecoveryMachine >().pg;
6894
6895 assert(!pg->backfill_reserving);
6896 assert(!pg->backfill_reserved);
6897 assert(pg->is_primary());
6898 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
6899 pg->start_flush(
6900 context< RecoveryMachine >().get_cur_transaction(),
6901 context< RecoveryMachine >().get_on_applied_context_list(),
6902 context< RecoveryMachine >().get_on_safe_context_list());
6903 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
6904 pg->get_osdmap()->get_epoch(),
6905 *context< RecoveryMachine >().get_on_safe_context_list(),
6906 *context< RecoveryMachine >().get_query_map(),
6907 context< RecoveryMachine >().get_info_map(),
6908 context< RecoveryMachine >().get_recovery_ctx());
6909
6910 // everyone has to commit/ack before we are truly active
6911 pg->blocked_by.clear();
6912 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
6913 p != pg->actingbackfill.end();
6914 ++p) {
6915 if (p->shard != pg->pg_whoami.shard) {
6916 pg->blocked_by.insert(p->shard);
6917 }
6918 }
6919 pg->publish_stats_to_osd();
6920 ldout(pg->cct, 10) << "Activate Finished" << dendl;
6921 }
6922
6923 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
6924 {
6925 PG *pg = context< RecoveryMachine >().pg;
6926 ldout(pg->cct, 10) << "Active advmap" << dendl;
6927 if (!pg->pool.newly_removed_snaps.empty()) {
6928 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
6929 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
6930 pg->dirty_info = true;
6931 pg->dirty_big_info = true;
6932 }
6933
6934 for (size_t i = 0; i < pg->want_acting.size(); i++) {
6935 int osd = pg->want_acting[i];
6936 if (!advmap.osdmap->is_up(osd)) {
6937 pg_shard_t osd_with_shard(osd, shard_id_t(i));
6938 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
6939 }
6940 }
6941
6942 bool need_publish = false;
6943 /* Check for changes in pool size (if the acting set changed as a result,
6944 * this does not matter) */
6945 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
6946 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
6947 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
6948 pg->state_clear(PG_STATE_UNDERSIZED);
6949 if (pg->needs_recovery()) {
6950 pg->state_set(PG_STATE_DEGRADED);
6951 } else {
6952 pg->state_clear(PG_STATE_DEGRADED);
6953 }
6954 } else {
6955 pg->state_set(PG_STATE_UNDERSIZED);
6956 pg->state_set(PG_STATE_DEGRADED);
6957 }
6958 need_publish = true; // degraded may have changed
6959 }
6960
6961 // if we haven't reported our PG stats in a long time, do so now.
6962 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
6963 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
6964 << " epochs" << dendl;
6965 need_publish = true;
6966 }
6967
6968 if (need_publish)
6969 pg->publish_stats_to_osd();
6970
6971 return forward_event();
6972 }
6973
6974 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
6975 {
6976 PG *pg = context< RecoveryMachine >().pg;
6977 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
6978 assert(pg->is_primary());
6979
6980 if (pg->have_unfound()) {
6981 // object may have become unfound
6982 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
6983 }
6984
6985 if (pg->cct->_conf->osd_check_for_log_corruption)
6986 pg->check_log_for_corruption(pg->osd->store);
6987
6988 uint64_t unfound = pg->missing_loc.num_unfound();
6989 if (unfound > 0 &&
6990 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
6991 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
6992 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
6993 << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED";
6994 } else
6995 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost";
6996 }
6997
6998 if (pg->is_active()) {
6999 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7000 pg->kick_snap_trim();
7001 }
7002
7003 if (pg->is_peered() &&
7004 !pg->is_clean() &&
7005 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7006 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7007 pg->queue_recovery();
7008 }
7009 return forward_event();
7010 }
7011
7012 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7013 {
7014 PG *pg = context< RecoveryMachine >().pg;
7015 assert(pg->is_primary());
7016 if (pg->peer_info.count(notevt.from)) {
7017 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7018 << ", already have info from that osd, ignoring"
7019 << dendl;
7020 } else if (pg->peer_purged.count(notevt.from)) {
7021 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7022 << ", already purged that peer, ignoring"
7023 << dendl;
7024 } else {
7025 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7026 << ", calling proc_replica_info and discover_all_missing"
7027 << dendl;
7028 pg->proc_replica_info(
7029 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7030 if (pg->have_unfound()) {
7031 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7032 }
7033 }
7034 return discard_event();
7035 }
7036
7037 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7038 {
7039 PG *pg = context< RecoveryMachine >().pg;
7040 assert(pg->is_primary());
7041
7042 assert(!pg->actingbackfill.empty());
7043 // don't update history (yet) if we are active and primary; the replica
7044 // may be telling us it has activated (and committed), but we can't
7045 // share that until _everyone_ does the same.
7046 if (pg->is_actingbackfill(infoevt.from)) {
7047 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7048 << " activated and committed" << dendl;
7049 pg->peer_activated.insert(infoevt.from);
7050 pg->blocked_by.erase(infoevt.from.shard);
7051 pg->publish_stats_to_osd();
7052 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7053 pg->all_activated_and_committed();
7054 }
7055 }
7056 return discard_event();
7057 }
7058
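// A replica's log+missing arriving while Active: merge it via proc_replica_log()
// and search it for locations of objects we still consider missing, queueing
// recovery if the search turned anything up and we are at least peered.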
7059 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7060 {
7061 PG *pg = context< RecoveryMachine >().pg;
7062 ldout(pg->cct, 10) << "searching osd." << logevt.from
7063 << " log for unfound items" << dendl;
7064 pg->proc_replica_log(
7065 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7066 bool got_missing = pg->search_for_missing(
7067 pg->peer_info[logevt.from],
7068 pg->peer_missing[logevt.from],
7069 logevt.from,
7070 context< RecoveryMachine >().get_recovery_ctx());
7071 if (pg->is_peered() &&
7072 got_missing)
7073 pg->queue_recovery();
7074 return discard_event();
7075 }
7076
7077 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7078 {
7079 PG *pg = context< RecoveryMachine >().pg;
7080
7081 q.f->open_object_section("state");
7082 q.f->dump_string("name", state_name);
7083 q.f->dump_stream("enter_time") << enter_time;
7084
7085 {
7086 q.f->open_array_section("might_have_unfound");
7087 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7088 p != pg->might_have_unfound.end();
7089 ++p) {
7090 q.f->open_object_section("osd");
7091 q.f->dump_stream("osd") << *p;
7092 if (pg->peer_missing.count(*p)) {
7093 q.f->dump_string("status", "already probed");
7094 } else if (pg->peer_missing_requested.count(*p)) {
7095 q.f->dump_string("status", "querying");
7096 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7097 q.f->dump_string("status", "osd is down");
7098 } else {
7099 q.f->dump_string("status", "not queried");
7100 }
7101 q.f->close_section();
7102 }
7103 q.f->close_section();
7104 }
7105 {
7106 q.f->open_object_section("recovery_progress");
7107 pg->dump_recovery_info(q.f);
7108 q.f->close_section();
7109 }
7110
7111 {
7112 q.f->open_object_section("scrub");
7113 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7114 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7115 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7116 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7117 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7118 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7119 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7120 q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7121 q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7122 {
7123 q.f->open_array_section("scrubber.waiting_on_whom");
7124 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7125 p != pg->scrubber.waiting_on_whom.end();
7126 ++p) {
7127 q.f->dump_stream("shard") << *p;
7128 }
7129 q.f->close_section();
7130 }
7131 q.f->close_section();
7132 }
7133
7134 q.f->close_section();
7135 return forward_event();
7136 }
7137
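// Every shard in actingbackfill has reported activation. Mark the PG ACTIVE
// (or only PEERED if the acting set is below the pool's min_size), persist the
// new last_epoch_started/last_interval_started in the history, and requeue ops
// that were waiting for peering to finish.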
7138 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7139 {
7140 PG *pg = context< RecoveryMachine >().pg;
7141 all_replicas_activated = true;
7142
7143 pg->state_clear(PG_STATE_ACTIVATING);
7144 pg->state_clear(PG_STATE_CREATING);
7145 if (pg->acting.size() >= pg->pool.info.min_size) {
7146 pg->state_set(PG_STATE_ACTIVE);
7147 } else {
7148 pg->state_set(PG_STATE_PEERED);
7149 }
7150
7151 // info.last_epoch_started is set during activate()
7152 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7153 pg->info.history.last_interval_started = pg->info.last_interval_started;
7154 pg->dirty_info = true;
7155
7156 pg->share_pg_info();
7157 pg->publish_stats_to_osd();
7158
7159 pg->check_local();
7160
7161 // waiters
7162 if (pg->flushes_in_progress == 0) {
7163 pg->requeue_ops(pg->waiting_for_peered);
7164 }
7165
7166 pg->on_activate();
7167
7168 return discard_event();
7169 }
7170
7171 void PG::RecoveryState::Active::exit()
7172 {
7173 context< RecoveryMachine >().log_exit(state_name, enter_time);
7174 PG *pg = context< RecoveryMachine >().pg;
7175 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7176
7177 pg->blocked_by.clear();
7178 pg->backfill_reserved = false;
7179 pg->backfill_reserving = false;
7180 pg->state_clear(PG_STATE_ACTIVATING);
7181 pg->state_clear(PG_STATE_DEGRADED);
7182 pg->state_clear(PG_STATE_UNDERSIZED);
7183 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7184 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7185 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7186 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7187 utime_t dur = ceph_clock_now() - enter_time;
7188 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7189 pg->agent_stop();
7190 }
7191
7192 /*------ReplicaActive-----*/
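// ReplicaActive: we are a replica (or backfill target) for a primary that has
// activated us. We apply whatever the primary sends: Activate events, info and
// log updates, and MISSING queries.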
7193 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7194 : my_base(ctx),
7195 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7196 {
7197 context< RecoveryMachine >().log_enter(state_name);
7198
7199 PG *pg = context< RecoveryMachine >().pg;
7200 pg->start_flush(
7201 context< RecoveryMachine >().get_cur_transaction(),
7202 context< RecoveryMachine >().get_on_applied_context_list(),
7203 context< RecoveryMachine >().get_on_safe_context_list());
7204 }
7205
7206
7207 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7208 const Activate& actevt) {
7209 PG *pg = context< RecoveryMachine >().pg;
7210 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7211 map<int, map<spg_t, pg_query_t> > query_map;
7212 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7213 actevt.activation_epoch,
7214 *context< RecoveryMachine >().get_on_safe_context_list(),
7215 query_map, NULL, NULL);
7216 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7217 return discard_event();
7218 }
7219
7220 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7221 {
7222 PG *pg = context< RecoveryMachine >().pg;
7223 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7224 infoevt.info);
7225 return discard_event();
7226 }
7227
7228 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7229 {
7230 PG *pg = context< RecoveryMachine >().pg;
7231 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7232 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7233 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7234 assert(pg->pg_log.get_head() == pg->info.last_update);
7235
7236 return discard_event();
7237 }
7238
7239 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7240 {
7241 PG *pg = context< RecoveryMachine >().pg;
7242 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7243 context< RecoveryMachine >().send_notify(
7244 pg->get_primary(),
7245 pg_notify_t(
7246 pg->get_primary().shard, pg->pg_whoami.shard,
7247 pg->get_osdmap()->get_epoch(),
7248 pg->get_osdmap()->get_epoch(),
7249 pg->info),
7250 pg->past_intervals);
7251 }
7252 pg->take_waiters();
7253 return discard_event();
7254 }
7255
7256 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7257 {
7258 PG *pg = context< RecoveryMachine >().pg;
7259 if (query.query.type == pg_query_t::MISSING) {
7260 pg->update_history(query.query.history);
7261 pg->fulfill_log(query.from, query.query, query.query_epoch);
7262 } // else: from prior to activation, safe to ignore
7263 return discard_event();
7264 }
7265
7266 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7267 {
7268 q.f->open_object_section("state");
7269 q.f->dump_string("name", state_name);
7270 q.f->dump_stream("enter_time") << enter_time;
7271 q.f->close_section();
7272 return forward_event();
7273 }
7274
7275 void PG::RecoveryState::ReplicaActive::exit()
7276 {
7277 context< RecoveryMachine >().log_exit(state_name, enter_time);
7278 PG *pg = context< RecoveryMachine >().pg;
7279 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7280 utime_t dur = ceph_clock_now() - enter_time;
7281 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7282 }
7283
7284 /*-------Stray---*/
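// Stray: we hold state for this PG but are not peered as primary or replica.
// We answer the primary's queries and wait for an authoritative MInfoRec or
// MLogRec, either of which activates us as a replica (see the handlers below).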
7285 PG::RecoveryState::Stray::Stray(my_context ctx)
7286 : my_base(ctx),
7287 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7288 {
7289 context< RecoveryMachine >().log_enter(state_name);
7290
7291 PG *pg = context< RecoveryMachine >().pg;
7292 assert(!pg->is_peered());
7293 assert(!pg->is_peering());
7294 assert(!pg->is_primary());
7295 pg->start_flush(
7296 context< RecoveryMachine >().get_cur_transaction(),
7297 context< RecoveryMachine >().get_on_applied_context_list(),
7298 context< RecoveryMachine >().get_on_safe_context_list());
7299 }
7300
7301 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7302 {
7303 PG *pg = context< RecoveryMachine >().pg;
7304 MOSDPGLog *msg = logevt.msg.get();
7305 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7306
7307 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7308 if (msg->info.last_backfill == hobject_t()) {
7309 // restart backfill
7310 pg->unreg_next_scrub();
7311 pg->info = msg->info;
7312 pg->reg_next_scrub();
7313 pg->dirty_info = true;
7314 pg->dirty_big_info = true; // possibly not needed, but be safe
7315
7316 PGLogEntryHandler rollbacker{pg, t};
7317 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7318
7319 pg->pg_log.reset_backfill();
7320 } else {
7321 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7322 }
7323
7324 assert(pg->pg_log.get_head() == pg->info.last_update);
7325
7326 post_event(Activate(logevt.msg->info.last_epoch_started));
7327 return transit<ReplicaActive>();
7328 }
7329
7330 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7331 {
7332 PG *pg = context< RecoveryMachine >().pg;
7333 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7334
7335 if (pg->info.last_update > infoevt.info.last_update) {
7336 // rewind divergent log entries
7337 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7338 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7339 pg->info.stats = infoevt.info.stats;
7340 pg->info.hit_set = infoevt.info.hit_set;
7341 }
7342
7343 assert(infoevt.info.last_update == pg->info.last_update);
7344 assert(pg->pg_log.get_head() == pg->info.last_update);
7345
7346 post_event(Activate(infoevt.info.last_epoch_started));
7347 return transit<ReplicaActive>();
7348 }
7349
7350 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7351 {
7352 PG *pg = context< RecoveryMachine >().pg;
7353 if (query.query.type == pg_query_t::INFO) {
7354 pair<pg_shard_t, pg_info_t> notify_info;
7355 pg->update_history(query.query.history);
7356 pg->fulfill_info(query.from, query.query, notify_info);
7357 context< RecoveryMachine >().send_notify(
7358 notify_info.first,
7359 pg_notify_t(
7360 notify_info.first.shard, pg->pg_whoami.shard,
7361 query.query_epoch,
7362 pg->get_osdmap()->get_epoch(),
7363 notify_info.second),
7364 pg->past_intervals);
7365 } else {
7366 pg->fulfill_log(query.from, query.query, query.query_epoch);
7367 }
7368 return discard_event();
7369 }
7370
7371 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7372 {
7373 PG *pg = context< RecoveryMachine >().pg;
7374 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7375 context< RecoveryMachine >().send_notify(
7376 pg->get_primary(),
7377 pg_notify_t(
7378 pg->get_primary().shard, pg->pg_whoami.shard,
7379 pg->get_osdmap()->get_epoch(),
7380 pg->get_osdmap()->get_epoch(),
7381 pg->info),
7382 pg->past_intervals);
7383 }
7384 pg->take_waiters();
7385 return discard_event();
7386 }
7387
7388 void PG::RecoveryState::Stray::exit()
7389 {
7390 context< RecoveryMachine >().log_exit(state_name, enter_time);
7391 PG *pg = context< RecoveryMachine >().pg;
7392 utime_t dur = ceph_clock_now() - enter_time;
7393 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7394 }
7395
7396 /*--------GetInfo---------*/
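// GetInfo: first stage of peering on the primary. Build the prior set and ask
// every OSD that might hold useful data for its pg_info_t; once all requested
// infos arrive (and no needed prior-interval OSD is down) we post GotInfo.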
7397 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7398 : my_base(ctx),
7399 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7400 {
7401 context< RecoveryMachine >().log_enter(state_name);
7402
7403 PG *pg = context< RecoveryMachine >().pg;
7404 pg->check_past_interval_bounds();
7405 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7406
7407 assert(pg->blocked_by.empty());
7408
7409 prior_set = pg->build_prior();
7410
7411 pg->reset_min_peer_features();
7412 get_infos();
7413 if (prior_set.pg_down) {
7414 post_event(IsDown());
7415 } else if (peer_info_requested.empty()) {
7416 post_event(GotInfo());
7417 }
7418 }
7419
7420 void PG::RecoveryState::GetInfo::get_infos()
7421 {
7422 PG *pg = context< RecoveryMachine >().pg;
7423 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7424
7425 pg->blocked_by.clear();
7426 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7427 it != prior_set.probe.end();
7428 ++it) {
7429 pg_shard_t peer = *it;
7430 if (peer == pg->pg_whoami) {
7431 continue;
7432 }
7433 if (pg->peer_info.count(peer)) {
7434 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7435 continue;
7436 }
7437 if (peer_info_requested.count(peer)) {
7438 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7439 pg->blocked_by.insert(peer.osd);
7440 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7441 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7442 } else {
7443 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7444 context< RecoveryMachine >().send_query(
7445 peer, pg_query_t(pg_query_t::INFO,
7446 it->shard, pg->pg_whoami.shard,
7447 pg->info.history,
7448 pg->get_osdmap()->get_epoch()));
7449 peer_info_requested.insert(peer);
7450 pg->blocked_by.insert(peer.osd);
7451 }
7452 }
7453
7454 pg->publish_stats_to_osd();
7455 }
7456
7457 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
7458 {
7459 PG *pg = context< RecoveryMachine >().pg;
7460
7461 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7462 if (p != peer_info_requested.end()) {
7463 peer_info_requested.erase(p);
7464 pg->blocked_by.erase(infoevt.from.osd);
7465 }
7466
7467 epoch_t old_start = pg->info.history.last_epoch_started;
7468 if (pg->proc_replica_info(
7469 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7470 // we got something new ...
7471 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7472 if (old_start < pg->info.history.last_epoch_started) {
7473 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7474 prior_set = pg->build_prior();
7475
7476 // filter out any osds that got dropped from the probe set from
7477 // peer_info_requested. this is less expensive than restarting
7478 // peering (which would re-probe everyone).
7479 set<pg_shard_t>::iterator p = peer_info_requested.begin();
7480 while (p != peer_info_requested.end()) {
7481 if (prior_set.probe.count(*p) == 0) {
7482 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7483 peer_info_requested.erase(p++);
7484 } else {
7485 ++p;
7486 }
7487 }
7488 get_infos();
7489 }
7490 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7491 << hex << infoevt.features << dec << dendl;
7492 pg->apply_peer_features(infoevt.features);
7493
7494 // are we done getting everything?
7495 if (peer_info_requested.empty() && !prior_set.pg_down) {
7496 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7497 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7498 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7499 post_event(GotInfo());
7500 }
7501 }
7502 return discard_event();
7503 }
7504
7505 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7506 {
7507 PG *pg = context< RecoveryMachine >().pg;
7508 q.f->open_object_section("state");
7509 q.f->dump_string("name", state_name);
7510 q.f->dump_stream("enter_time") << enter_time;
7511
7512 q.f->open_array_section("requested_info_from");
7513 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7514 p != peer_info_requested.end();
7515 ++p) {
7516 q.f->open_object_section("osd");
7517 q.f->dump_stream("osd") << *p;
7518 if (pg->peer_info.count(*p)) {
7519 q.f->open_object_section("got_info");
7520 pg->peer_info[*p].dump(q.f);
7521 q.f->close_section();
7522 }
7523 q.f->close_section();
7524 }
7525 q.f->close_section();
7526
7527 q.f->close_section();
7528 return forward_event();
7529 }
7530
7531 void PG::RecoveryState::GetInfo::exit()
7532 {
7533 context< RecoveryMachine >().log_exit(state_name, enter_time);
7534 PG *pg = context< RecoveryMachine >().pg;
7535 utime_t dur = ceph_clock_now() - enter_time;
7536 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7537 pg->blocked_by.clear();
7538 pg->publish_stats_to_osd();
7539 }
7540
7541 /*------GetLog------------*/
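// GetLog: after choose_acting() picks the authoritative log shard, request
// enough of its log to cover the oldest last_update among peers that will need
// log-based recovery. If we are the authoritative shard ourselves, skip
// straight to GotLog.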
7542 PG::RecoveryState::GetLog::GetLog(my_context ctx)
7543 : my_base(ctx),
7544 NamedState(
7545 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7546 msg(0)
7547 {
7548 context< RecoveryMachine >().log_enter(state_name);
7549
7550 PG *pg = context< RecoveryMachine >().pg;
7551
7552 // adjust acting?
7553 if (!pg->choose_acting(auth_log_shard, false,
7554 &context< Peering >().history_les_bound)) {
7555 if (!pg->want_acting.empty()) {
7556 post_event(NeedActingChange());
7557 } else {
7558 post_event(IsIncomplete());
7559 }
7560 return;
7561 }
7562
7563 // am i the best?
7564 if (auth_log_shard == pg->pg_whoami) {
7565 post_event(GotLog());
7566 return;
7567 }
7568
7569 const pg_info_t& best = pg->peer_info[auth_log_shard];
7570
7571 // am i broken?
7572 if (pg->info.last_update < best.log_tail) {
7573 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7574 post_event(IsIncomplete());
7575 return;
7576 }
7577
7578 // how much log to request?
7579 eversion_t request_log_from = pg->info.last_update;
7580 assert(!pg->actingbackfill.empty());
7581 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7582 p != pg->actingbackfill.end();
7583 ++p) {
7584 if (*p == pg->pg_whoami) continue;
7585 pg_info_t& ri = pg->peer_info[*p];
7586 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7587 ri.last_update < request_log_from)
7588 request_log_from = ri.last_update;
7589 }
7590
7591 // how much?
7592 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7593 context<RecoveryMachine>().send_query(
7594 auth_log_shard,
7595 pg_query_t(
7596 pg_query_t::LOG,
7597 auth_log_shard.shard, pg->pg_whoami.shard,
7598 request_log_from, pg->info.history,
7599 pg->get_osdmap()->get_epoch()));
7600
7601 assert(pg->blocked_by.empty());
7602 pg->blocked_by.insert(auth_log_shard.osd);
7603 pg->publish_stats_to_osd();
7604 }
7605
7606 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7607 {
7608 PG *pg = context< RecoveryMachine >().pg;
7609 // make sure our log source didn't go down. we need to check
7610 // explicitly because it may not be part of the prior set, which
7611 // means the Peering state check won't catch it going down.
7612 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7613 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7614 << auth_log_shard.osd << " went down" << dendl;
7615 post_event(advmap);
7616 return transit< Reset >();
7617 }
7618
7619 // let the Peering state do its checks.
7620 return forward_event();
7621 }
7622
7623 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7624 {
7625 PG *pg = context< RecoveryMachine >().pg;
7626 assert(!msg);
7627 if (logevt.from != auth_log_shard) {
7628 ldout(pg->cct, 10) << "GetLog: discarding log from "
7629 << "non-auth_log_shard osd." << logevt.from << dendl;
7630 return discard_event();
7631 }
7632 ldout(pg->cct, 10) << "GetLog: received master log from osd"
7633 << logevt.from << dendl;
7634 msg = logevt.msg;
7635 post_event(GotLog());
7636 return discard_event();
7637 }
7638
7639 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
7640 {
7641 PG *pg = context< RecoveryMachine >().pg;
7642 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
7643 if (msg) {
7644 ldout(pg->cct, 10) << "processing master log" << dendl;
7645 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
7646 msg->info, msg->log, msg->missing,
7647 auth_log_shard);
7648 }
7649 pg->start_flush(
7650 context< RecoveryMachine >().get_cur_transaction(),
7651 context< RecoveryMachine >().get_on_applied_context_list(),
7652 context< RecoveryMachine >().get_on_safe_context_list());
7653 return transit< GetMissing >();
7654 }
7655
7656 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
7657 {
7658 q.f->open_object_section("state");
7659 q.f->dump_string("name", state_name);
7660 q.f->dump_stream("enter_time") << enter_time;
7661 q.f->dump_stream("auth_log_shard") << auth_log_shard;
7662 q.f->close_section();
7663 return forward_event();
7664 }
7665
7666 void PG::RecoveryState::GetLog::exit()
7667 {
7668 context< RecoveryMachine >().log_exit(state_name, enter_time);
7669 PG *pg = context< RecoveryMachine >().pg;
7670 utime_t dur = ceph_clock_now() - enter_time;
7671 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
7672 pg->blocked_by.clear();
7673 pg->publish_stats_to_osd();
7674 }
7675
7676 /*------WaitActingChange--------*/
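// WaitActingChange: choose_acting() asked for a different acting set
// (want_acting); wait for the map to change, ignore peer messages in the
// meantime, and reset if any wanted target goes down.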
7677 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
7678 : my_base(ctx),
7679 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
7680 {
7681 context< RecoveryMachine >().log_enter(state_name);
7682 }
7683
7684 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
7685 {
7686 PG *pg = context< RecoveryMachine >().pg;
7687 OSDMapRef osdmap = advmap.osdmap;
7688
7689 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
7690 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
7691 if (!osdmap->is_up(*p)) {
7692 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7693 post_event(advmap);
7694 return transit< Reset >();
7695 }
7696 }
7697 return forward_event();
7698 }
7699
7700 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
7701 {
7702 PG *pg = context< RecoveryMachine >().pg;
7703 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
7704 return discard_event();
7705 }
7706
7707 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
7708 {
7709 PG *pg = context< RecoveryMachine >().pg;
7710 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7711 return discard_event();
7712 }
7713
7714 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
7715 {
7716 PG *pg = context< RecoveryMachine >().pg;
7717 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7718 return discard_event();
7719 }
7720
7721 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
7722 {
7723 q.f->open_object_section("state");
7724 q.f->dump_string("name", state_name);
7725 q.f->dump_stream("enter_time") << enter_time;
7726 q.f->dump_string("comment", "waiting for pg acting set to change");
7727 q.f->close_section();
7728 return forward_event();
7729 }
7730
7731 void PG::RecoveryState::WaitActingChange::exit()
7732 {
7733 context< RecoveryMachine >().log_exit(state_name, enter_time);
7734 PG *pg = context< RecoveryMachine >().pg;
7735 utime_t dur = ceph_clock_now() - enter_time;
7736 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
7737 }
7738
7739 /*------Down--------*/
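// Down: peering is blocked because OSDs from a prior interval that may hold
// needed data are down. Record them in blocked_by so they are visible in the
// PG's published stats.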
7740 PG::RecoveryState::Down::Down(my_context ctx)
7741 : my_base(ctx),
7742 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
7743 {
7744 context< RecoveryMachine >().log_enter(state_name);
7745 PG *pg = context< RecoveryMachine >().pg;
7746
7747 pg->state_clear(PG_STATE_PEERING);
7748 pg->state_set(PG_STATE_DOWN);
7749
7750 auto &prior_set = context< Peering >().prior_set;
7751 assert(pg->blocked_by.empty());
7752 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7753 pg->publish_stats_to_osd();
7754 }
7755
7756 void PG::RecoveryState::Down::exit()
7757 {
7758 context< RecoveryMachine >().log_exit(state_name, enter_time);
7759 PG *pg = context< RecoveryMachine >().pg;
7760
7761 pg->state_clear(PG_STATE_DOWN);
7762 utime_t dur = ceph_clock_now() - enter_time;
7763 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
7764
7765 pg->blocked_by.clear();
7766 pg->publish_stats_to_osd();
7767 }
7768
7769 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
7770 {
7771 q.f->open_object_section("state");
7772 q.f->dump_string("name", state_name);
7773 q.f->dump_stream("enter_time") << enter_time;
7774 q.f->dump_string("comment",
7775 "not enough up instances of this PG to go active");
7776 q.f->close_section();
7777 return forward_event();
7778 }
7779
7780 /*------Incomplete--------*/
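// Incomplete: we have contacted peers but lack enough complete copies to
// reconstruct an authoritative history. A reduced pool min_size triggers a
// Reset, and new replica info sends us back to GetLog to try again.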
7781 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
7782 : my_base(ctx),
7783 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
7784 {
7785 context< RecoveryMachine >().log_enter(state_name);
7786 PG *pg = context< RecoveryMachine >().pg;
7787
7788 pg->state_clear(PG_STATE_PEERING);
7789 pg->state_set(PG_STATE_INCOMPLETE);
7790
7791 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7792 assert(pg->blocked_by.empty());
7793 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7794 pg->publish_stats_to_osd();
7795 }
7796
7797 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
7798 PG *pg = context< RecoveryMachine >().pg;
7799 int64_t poolnum = pg->info.pgid.pool();
7800
7801 // Reset if min_size became smaller than the previous value; the PG might now be able to go active
7802 if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7803 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7804 post_event(advmap);
7805 return transit< Reset >();
7806 }
7807
7808 return forward_event();
7809 }
7810
7811 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
7812 PG *pg = context< RecoveryMachine >().pg;
7813 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7814 if (pg->proc_replica_info(
7815 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7816 // We got something new, try again!
7817 return transit< GetLog >();
7818 } else {
7819 return discard_event();
7820 }
7821 }
7822
7823 boost::statechart::result PG::RecoveryState::Incomplete::react(
7824 const QueryState& q)
7825 {
7826 q.f->open_object_section("state");
7827 q.f->dump_string("name", state_name);
7828 q.f->dump_stream("enter_time") << enter_time;
7829 q.f->dump_string("comment", "not enough complete instances of this PG");
7830 q.f->close_section();
7831 return forward_event();
7832 }
7833
7834 void PG::RecoveryState::Incomplete::exit()
7835 {
7836 context< RecoveryMachine >().log_exit(state_name, enter_time);
7837 PG *pg = context< RecoveryMachine >().pg;
7838
7839 pg->state_clear(PG_STATE_INCOMPLETE);
7840 utime_t dur = ceph_clock_now() - enter_time;
7841 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
7842
7843 pg->blocked_by.clear();
7844 pg->publish_stats_to_osd();
7845 }
7846
7847 /*------GetMissing--------*/
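// GetMissing: request log+missing from each acting/backfill peer whose log may
// contain entries divergent from ours, going back to the peer's
// last_epoch_started (or the full log if the peer's log tail is too recent).
// Peers that are empty, non-contiguous, or already up to date get an empty
// missing set inferred instead.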
7848 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
7849 : my_base(ctx),
7850 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
7851 {
7852 context< RecoveryMachine >().log_enter(state_name);
7853
7854 PG *pg = context< RecoveryMachine >().pg;
7855 assert(!pg->actingbackfill.empty());
7856 eversion_t since;
7857 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
7858 i != pg->actingbackfill.end();
7859 ++i) {
7860 if (*i == pg->get_primary()) continue;
7861 const pg_info_t& pi = pg->peer_info[*i];
7862
7863 if (pi.is_empty())
7864 continue; // no pg data, nothing divergent
7865
7866 if (pi.last_update < pg->pg_log.get_tail()) {
7867 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
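      // touching the map entry here records an empty missing set for this peer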
7868 pg->peer_missing[*i];
7869 continue;
7870 }
7871 if (pi.last_backfill == hobject_t()) {
7872 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
7873 pg->peer_missing[*i];
7874 continue;
7875 }
7876
7877 if (pi.last_update == pi.last_complete && // peer has no missing
7878 pi.last_update == pg->info.last_update) { // peer is up to date
7879 // replica has no missing and an identical log to ours. no need to
7880 // pull anything.
7881 // FIXME: we can do better here. if last_update==last_complete we
7882 // can infer the rest!
7883 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
7884 pg->peer_missing[*i];
7885 continue;
7886 }
7887
7888 // We pull the log from the peer's last_epoch_started to ensure we
7889 // get enough log to detect divergent updates.
7890 since.epoch = pi.last_epoch_started;
7891 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
7892 if (pi.log_tail <= since) {
7893 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
7894 context< RecoveryMachine >().send_query(
7895 *i,
7896 pg_query_t(
7897 pg_query_t::LOG,
7898 i->shard, pg->pg_whoami.shard,
7899 since, pg->info.history,
7900 pg->get_osdmap()->get_epoch()));
7901 } else {
7902 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
7903 << " (want since " << since << " < log.tail "
7904 << pi.log_tail << ")" << dendl;
7905 context< RecoveryMachine >().send_query(
7906 *i, pg_query_t(
7907 pg_query_t::FULLLOG,
7908 i->shard, pg->pg_whoami.shard,
7909 pg->info.history, pg->get_osdmap()->get_epoch()));
7910 }
7911 peer_missing_requested.insert(*i);
7912 pg->blocked_by.insert(i->osd);
7913 }
7914
7915 if (peer_missing_requested.empty()) {
7916 if (pg->need_up_thru) {
7917 ldout(pg->cct, 10) << " still need up_thru update before going active"
7918 << dendl;
7919 post_event(NeedUpThru());
7920 return;
7921 }
7922
7923 // all good!
7924 post_event(Activate(pg->get_osdmap()->get_epoch()));
7925 } else {
7926 pg->publish_stats_to_osd();
7927 }
7928 }
7929
7930 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
7931 {
7932 PG *pg = context< RecoveryMachine >().pg;
7933
7934 peer_missing_requested.erase(logevt.from);
7935 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7936
7937 if (peer_missing_requested.empty()) {
7938 if (pg->need_up_thru) {
7939 ldout(pg->cct, 10) << " still need up_thru update before going active"
7940 << dendl;
7941 post_event(NeedUpThru());
7942 } else {
7943 ldout(pg->cct, 10) << "Got last missing, don't need missing "
7944 << "posting Activate" << dendl;
7945 post_event(Activate(pg->get_osdmap()->get_epoch()));
7946 }
7947 }
7948 return discard_event();
7949 }
7950
7951 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
7952 {
7953 PG *pg = context< RecoveryMachine >().pg;
7954 q.f->open_object_section("state");
7955 q.f->dump_string("name", state_name);
7956 q.f->dump_stream("enter_time") << enter_time;
7957
7958 q.f->open_array_section("peer_missing_requested");
7959 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
7960 p != peer_missing_requested.end();
7961 ++p) {
7962 q.f->open_object_section("osd");
7963 q.f->dump_stream("osd") << *p;
7964 if (pg->peer_missing.count(*p)) {
7965 q.f->open_object_section("got_missing");
7966 pg->peer_missing[*p].dump(q.f);
7967 q.f->close_section();
7968 }
7969 q.f->close_section();
7970 }
7971 q.f->close_section();
7972
7973 q.f->close_section();
7974 return forward_event();
7975 }
7976
7977 void PG::RecoveryState::GetMissing::exit()
7978 {
7979 context< RecoveryMachine >().log_exit(state_name, enter_time);
7980 PG *pg = context< RecoveryMachine >().pg;
7981 utime_t dur = ceph_clock_now() - enter_time;
7982 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
7983 pg->blocked_by.clear();
7984 pg->publish_stats_to_osd();
7985 }
7986
7987 /*------WaitUpThru--------*/
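// WaitUpThru: we cannot activate until the OSDMap records an up_thru for this
// OSD covering the current interval (need_up_thru); each ActMap re-checks the
// flag and posts Activate once it clears.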
7988 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
7989 : my_base(ctx),
7990 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
7991 {
7992 context< RecoveryMachine >().log_enter(state_name);
7993 }
7994
7995 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
7996 {
7997 PG *pg = context< RecoveryMachine >().pg;
7998 if (!pg->need_up_thru) {
7999 post_event(Activate(pg->get_osdmap()->get_epoch()));
8000 }
8001 return forward_event();
8002 }
8003
8004 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8005 {
8006 PG *pg = context< RecoveryMachine >().pg;
8007 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8008 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8009 pg->peer_info[logevt.from] = logevt.msg->info;
8010 return discard_event();
8011 }
8012
8013 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8014 {
8015 q.f->open_object_section("state");
8016 q.f->dump_string("name", state_name);
8017 q.f->dump_stream("enter_time") << enter_time;
8018 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8019 q.f->close_section();
8020 return forward_event();
8021 }
8022
8023 void PG::RecoveryState::WaitUpThru::exit()
8024 {
8025 context< RecoveryMachine >().log_exit(state_name, enter_time);
8026 PG *pg = context< RecoveryMachine >().pg;
8027 utime_t dur = ceph_clock_now() - enter_time;
8028 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8029 }
8030
8031 /*----RecoveryState::RecoveryMachine Methods-----*/
8032 #undef dout_prefix
8033 #define dout_prefix *_dout << pg->gen_prefix()
8034
8035 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8036 {
8037 PG *pg = context< RecoveryMachine >().pg;
8038 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8039 pg->osd->pg_recovery_stats.log_enter(state_name);
8040 }
8041
8042 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8043 {
8044 utime_t dur = ceph_clock_now() - enter_time;
8045 PG *pg = context< RecoveryMachine >().pg;
8046 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8047 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8048 event_count, event_time);
8049 event_count = 0;
8050 event_time = utime_t();
8051 }
8052
8053
8054 /*---------------------------------------------------*/
8055 #undef dout_prefix
8056 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8057
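// start_handle()/end_handle() bracket every event delivered to the recovery
// state machine. While begin_block_outgoing() is in effect, outgoing messages
// are staged in messages_pending_flush and only merged back into the caller's
// RecoveryCtx by end_block_outgoing(); end_handle() accounts the elapsed time
// and event count used by the per-state statistics in log_exit().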
8058 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8059 assert(!rctx);
8060 assert(!orig_ctx);
8061 orig_ctx = new_ctx;
8062 if (new_ctx) {
8063 if (messages_pending_flush) {
8064 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8065 } else {
8066 rctx = *new_ctx;
8067 }
8068 rctx->start_time = ceph_clock_now();
8069 }
8070 }
8071
8072 void PG::RecoveryState::begin_block_outgoing() {
8073 assert(!messages_pending_flush);
8074 assert(orig_ctx);
8075 assert(rctx);
8076 messages_pending_flush = BufferedRecoveryMessages();
8077 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8078 }
8079
8080 void PG::RecoveryState::clear_blocked_outgoing() {
8081 assert(orig_ctx);
8082 assert(rctx);
8083 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8084 }
8085
8086 void PG::RecoveryState::end_block_outgoing() {
8087 assert(messages_pending_flush);
8088 assert(orig_ctx);
8089 assert(rctx);
8090
8091 rctx = RecoveryCtx(*orig_ctx);
8092 rctx->accept_buffered_messages(*messages_pending_flush);
8093 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8094 }
8095
8096 void PG::RecoveryState::end_handle() {
8097 if (rctx) {
8098 utime_t dur = ceph_clock_now() - rctx->start_time;
8099 machine.event_time += dur;
8100 }
8101
8102 machine.event_count++;
8103 rctx = boost::optional<RecoveryCtx>();
8104 orig_ctx = NULL;
8105 }
8106
8107 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8108 {
8109 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8110 << " " << bi.objects.size() << " objects";
8111 if (!bi.objects.empty())
8112 out << " " << bi.objects;
8113 out << ")";
8114 return out;
8115 }
8116
8117 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8118 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8119
8120 #ifdef PG_DEBUG_REFS
8121 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8122 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8123 #endif