ceph/src/osd/PG.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46
47 #include <atomic>
48 #include <list>
49 #include <memory>
50 #include <stack>
51 #include <string>
52 #include <tuple>
53 using namespace std;
54
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
57
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
59
60 class OSD;
61 class OSDService;
62 class MOSDOp;
63 class MOSDPGScan;
64 class MOSDPGBackfill;
65 class MOSDPGInfo;
66
67 class PG;
68 struct OpRequest;
69 typedef OpRequest::Ref OpRequestRef;
70 class MOSDPGLog;
71 class CephContext;
72
73 namespace Scrub {
74 class Store;
75 }
76
77 void intrusive_ptr_add_ref(PG *pg);
78 void intrusive_ptr_release(PG *pg);
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
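// Illustrative behaviour of the struct above (a sketch, not extra API):
// enter_state() pushes onto the LIFO embedded_states stack, so nested states
// are closed innermost-first; each exit_state() pops the top entry and appends
// an (enter_time, exit_time, state_name) tuple to state_history.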
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 private:
125 bool pg_in_destructor = false;
126 PG* thispg = nullptr;
127 std::unique_ptr<PGStateInstance> tmppi;
128 PGStateInstance* pi = nullptr;
129 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
130
131 };
132
133 #ifdef PG_DEBUG_REFS
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG *pg);
136 void put_with_id(PG *pg, uint64_t id);
137 typedef TrackedIntPtr<PG> PGRef;
138 #else
139 typedef boost::intrusive_ptr<PG> PGRef;
140 #endif
141
142 class PGRecoveryStats {
143 struct per_state_info {
144 uint64_t enter, exit; // enter/exit counts
145 uint64_t events;
146 utime_t event_time; // time spent processing events
147 utime_t total_time; // total time in state
148 utime_t min_time, max_time;
149
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
152 };
153 map<const char *,per_state_info> info;
154 Mutex lock;
155
156 public:
157   PGRecoveryStats() : lock("PGRecoveryStats::lock") {}
158
159 void reset() {
160 Mutex::Locker l(lock);
161 info.clear();
162 }
163 void dump(ostream& out) {
164 Mutex::Locker l(lock);
165 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
166 per_state_info& i = p->second;
167 out << i.enter << "\t" << i.exit << "\t"
168 << i.events << "\t" << i.event_time << "\t"
169 << i.total_time << "\t"
170 << i.min_time << "\t" << i.max_time << "\t"
171 << p->first << "\n";
172 }
173 }
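// Example of a dump() line (tab-separated; values are illustrative only):
//   3	3	17	0.050000	0.550000	0.010000	0.300000	Started/Primary/Active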
174
175 void dump_formatted(Formatter *f) {
176 Mutex::Locker l(lock);
177 f->open_array_section("pg_recovery_stats");
178 for (map<const char *,per_state_info>::iterator p = info.begin();
179 p != info.end(); ++p) {
180 per_state_info& i = p->second;
181 f->open_object_section("recovery_state");
182 f->dump_int("enter", i.enter);
183 f->dump_int("exit", i.exit);
184 f->dump_int("events", i.events);
185 f->dump_stream("event_time") << i.event_time;
186 f->dump_stream("total_time") << i.total_time;
187 f->dump_stream("min_time") << i.min_time;
188 f->dump_stream("max_time") << i.max_time;
189 vector<string> states;
190 get_str_vec(p->first, "/", states);
191 f->open_array_section("nested_states");
192 for (vector<string>::iterator st = states.begin();
193 st != states.end(); ++st) {
194 f->dump_string("state", *st);
195 }
196 f->close_section();
197 f->close_section();
198 }
199 f->close_section();
200 }
201
202 void log_enter(const char *s) {
203 Mutex::Locker l(lock);
204 info[s].enter++;
205 }
206 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
207 Mutex::Locker l(lock);
208 per_state_info &i = info[s];
209 i.exit++;
210 i.total_time += dur;
211 if (dur > i.max_time)
212 i.max_time = dur;
213 if (dur < i.min_time || i.min_time == utime_t())
214 i.min_time = dur;
215 i.events += events;
216 i.event_time += event_dur;
217 }
218 };
219
220 struct PGPool {
221 CephContext* cct;
222 epoch_t cached_epoch;
223 int64_t id;
224 string name;
225 uint64_t auid;
226
227 pg_pool_t info;
228 SnapContext snapc; // the default pool snapc, ready to go.
229
230 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
231 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
232
233 PGPool(CephContext* cct, OSDMapRef map, int64_t i)
234 : cct(cct),
235 cached_epoch(map->get_epoch()),
236 id(i),
237 name(map->get_pool_name(id)),
238 auid(map->get_pg_pool(id)->auid) {
239 const pg_pool_t *pi = map->get_pg_pool(id);
240 assert(pi);
241 info = *pi;
242 snapc = pi->get_snap_context();
243 pi->build_removed_snaps(cached_removed_snaps);
244 }
245
246 void update(OSDMapRef map);
247 };
248
249 /** PG - Replica Placement Group
250 *
251 */
252
253 class PG : public DoutPrefixProvider {
254 protected:
255 OSDService *osd;
256 CephContext *cct;
257 OSDriver osdriver;
258 SnapMapper snap_mapper;
259 bool eio_errors_to_process = false;
260
261 virtual PGBackend *get_pgbackend() = 0;
262 public:
263 std::string gen_prefix() const override;
264 CephContext *get_cct() const override { return cct; }
265 unsigned get_subsys() const override { return ceph_subsys_osd; }
266
267 /*** PG ****/
268 void update_snap_mapper_bits(uint32_t bits) {
269 snap_mapper.update_bits(bits);
270 }
271 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
272 IsPGRecoverablePredicate *get_is_recoverable_predicate() {
273 return get_pgbackend()->get_is_recoverable_predicate();
274 }
275 protected:
276 OSDMapRef osdmap_ref;
277 OSDMapRef last_persisted_osdmap_ref;
278 PGPool pool;
279
280 void requeue_map_waiters();
281
282 void update_osdmap_ref(OSDMapRef newmap) {
283 assert(_lock.is_locked_by_me());
284 osdmap_ref = std::move(newmap);
285 }
286
287 public:
288 OSDMapRef get_osdmap() const {
289 assert(is_locked());
290 assert(osdmap_ref);
291 return osdmap_ref;
292 }
293 protected:
294
295 /** locking and reference counting.
296 * I destroy myself when the reference count hits zero.
297 * lock() should be called before doing anything.
298 * get() should be called on pointer copy (to another thread, etc.).
299 * put() should be called on destruction of some previously copied pointer.
300 * unlock() when done with the current pointer (_most common_).
301 */
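// A minimal usage sketch of the protocol described above (illustrative only;
// "worker" is an arbitrary debug tag, not a defined constant):
//
//   pg->get("worker");   // take a ref before handing the pointer to another thread
//   pg->lock();
//   // ... read or modify pg state ...
//   pg->unlock();
//   pg->put("worker");   // drop the ref; the PG deletes itself when it reaches zero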
302 mutable Mutex _lock;
303 std::atomic_uint ref{0};
304
305 #ifdef PG_DEBUG_REFS
306 Mutex _ref_id_lock;
307 map<uint64_t, string> _live_ids;
308 map<string, uint64_t> _tag_counts;
309 uint64_t _ref_id;
310 #endif
311
312 public:
313   bool deleting;  // true while the PG is being removed or the OSD is shutting down
314
315 ZTracer::Endpoint trace_endpoint;
316
317 void lock_suspend_timeout(ThreadPool::TPHandle &handle);
318 void lock(bool no_lockdep = false) const;
319 void unlock() const {
320 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
321 assert(!dirty_info);
322 assert(!dirty_big_info);
323 _lock.Unlock();
324 }
325
326 bool is_locked() const {
327 return _lock.is_locked();
328 }
329
330 #ifdef PG_DEBUG_REFS
331 uint64_t get_with_id();
332 void put_with_id(uint64_t);
333 void dump_live_ids();
334 #endif
335 void get(const char* tag);
336 void put(const char* tag);
337
338 bool dirty_info, dirty_big_info;
339
340 public:
341 bool is_ec_pg() const {
342 return pool.info.ec_pool();
343 }
344 // pg state
345 pg_info_t info; ///< current pg info
346 pg_info_t last_written_info; ///< last written info
347 __u8 info_struct_v;
348 static const __u8 cur_struct_v = 10;
349 // v10 is the new past_intervals encoding
350 // v9 was fastinfo_key addition
351 // v8 was the move to a per-pg pgmeta object
352 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
353 // (first appeared in cuttlefish).
354 static const __u8 compat_struct_v = 7;
355 bool must_upgrade() {
356 return info_struct_v < cur_struct_v;
357 }
358 bool can_upgrade() {
359 return info_struct_v >= compat_struct_v;
360 }
361 void upgrade(ObjectStore *store);
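// Worked example of the version checks above (illustrative): a PG persisted
// with info_struct_v == 8 has can_upgrade() == true (8 >= compat_struct_v == 7)
// and must_upgrade() == true (8 < cur_struct_v == 10); with info_struct_v == 6,
// can_upgrade() would return false.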
362
363 const coll_t coll;
364 ObjectStore::CollectionHandle ch;
365 PGLog pg_log;
366 static string get_info_key(spg_t pgid) {
367 return stringify(pgid) + "_info";
368 }
369 static string get_biginfo_key(spg_t pgid) {
370 return stringify(pgid) + "_biginfo";
371 }
372 static string get_epoch_key(spg_t pgid) {
373 return stringify(pgid) + "_epoch";
374 }
375 ghobject_t pgmeta_oid;
376
377 class MissingLoc {
378 map<hobject_t, pg_missing_item> needs_recovery_map;
379 map<hobject_t, set<pg_shard_t> > missing_loc;
380 set<pg_shard_t> missing_loc_sources;
381 PG *pg;
382 set<pg_shard_t> empty_set;
383 public:
384 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
385 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
386 explicit MissingLoc(PG *pg)
387 : pg(pg) {}
388 void set_backend_predicates(
389 IsPGReadablePredicate *_is_readable,
390 IsPGRecoverablePredicate *_is_recoverable) {
391 is_readable.reset(_is_readable);
392 is_recoverable.reset(_is_recoverable);
393 }
394 string gen_prefix() const { return pg->gen_prefix(); }
395 bool needs_recovery(
396 const hobject_t &hoid,
397 eversion_t *v = 0) const {
398 map<hobject_t, pg_missing_item>::const_iterator i =
399 needs_recovery_map.find(hoid);
400 if (i == needs_recovery_map.end())
401 return false;
402 if (v)
403 *v = i->second.need;
404 return true;
405 }
406 bool is_deleted(const hobject_t &hoid) const {
407 auto i = needs_recovery_map.find(hoid);
408 if (i == needs_recovery_map.end())
409 return false;
410 return i->second.is_delete();
411 }
412 bool is_unfound(const hobject_t &hoid) const {
413 return needs_recovery(hoid) && !is_deleted(hoid) && (
414 !missing_loc.count(hoid) ||
415 !(*is_recoverable)(missing_loc.find(hoid)->second));
416 }
417 bool readable_with_acting(
418 const hobject_t &hoid,
419 const set<pg_shard_t> &acting) const;
420 uint64_t num_unfound() const {
421 uint64_t ret = 0;
422 for (map<hobject_t, pg_missing_item>::const_iterator i =
423 needs_recovery_map.begin();
424 i != needs_recovery_map.end();
425 ++i) {
426 if (is_unfound(i->first))
427 ++ret;
428 }
429 return ret;
430 }
431
432 bool have_unfound() const {
433 for (map<hobject_t, pg_missing_item>::const_iterator i =
434 needs_recovery_map.begin();
435 i != needs_recovery_map.end();
436 ++i) {
437 if (is_unfound(i->first))
438 return true;
439 }
440 return false;
441 }
442 void clear() {
443 needs_recovery_map.clear();
444 missing_loc.clear();
445 missing_loc_sources.clear();
446 }
447
448 void add_location(const hobject_t &hoid, pg_shard_t location) {
449 missing_loc[hoid].insert(location);
450 }
451 void remove_location(const hobject_t &hoid, pg_shard_t location) {
452 missing_loc[hoid].erase(location);
453 }
454 void add_active_missing(const pg_missing_t &missing) {
455 for (map<hobject_t, pg_missing_item>::const_iterator i =
456 missing.get_items().begin();
457 i != missing.get_items().end();
458 ++i) {
459 map<hobject_t, pg_missing_item>::const_iterator j =
460 needs_recovery_map.find(i->first);
461 if (j == needs_recovery_map.end()) {
462 needs_recovery_map.insert(*i);
463 } else {
464 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
465 << i->first << " have " << j->second
466 << " tried to add " << i->second << dendl;
467 assert(i->second.need == j->second.need);
468 }
469 }
470 }
471
472 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
473 needs_recovery_map[hoid] = pg_missing_item(need, have);
474 }
475 void revise_need(const hobject_t &hoid, eversion_t need) {
476 assert(needs_recovery(hoid));
477 needs_recovery_map[hoid].need = need;
478 }
479
480 /// Adds info about a possible recovery source
481 bool add_source_info(
482 pg_shard_t source, ///< [in] source
483 const pg_info_t &oinfo, ///< [in] info
484 const pg_missing_t &omissing, ///< [in] (optional) missing
485 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
486 ); ///< @return whether a new object location was discovered
487
488 /// Adds recovery sources in batch
489 void add_batch_sources_info(
490       const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
491 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
492 );
493
494 /// Uses osdmap to update structures for now down sources
495 void check_recovery_sources(const OSDMapRef& osdmap);
496
497 /// Call when hoid is no longer missing in acting set
498 void recovered(const hobject_t &hoid) {
499 needs_recovery_map.erase(hoid);
500 missing_loc.erase(hoid);
501 }
502
503 /// Call to update structures for hoid after a change
504 void rebuild(
505 const hobject_t &hoid,
506 pg_shard_t self,
507 const set<pg_shard_t> to_recover,
508 const pg_info_t &info,
509 const pg_missing_t &missing,
510 const map<pg_shard_t, pg_missing_t> &pmissing,
511 const map<pg_shard_t, pg_info_t> &pinfo) {
512 recovered(hoid);
513 boost::optional<pg_missing_item> item;
514 auto miter = missing.get_items().find(hoid);
515 if (miter != missing.get_items().end()) {
516 item = miter->second;
517 } else {
518 for (auto &&i: to_recover) {
519 if (i == self)
520 continue;
521 auto pmiter = pmissing.find(i);
522 assert(pmiter != pmissing.end());
523 miter = pmiter->second.get_items().find(hoid);
524 if (miter != pmiter->second.get_items().end()) {
525 item = miter->second;
526 break;
527 }
528 }
529 }
530 if (!item)
531 return; // recovered!
532
533 needs_recovery_map[hoid] = *item;
534 if (item->is_delete())
535 return;
536 auto mliter =
537 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
538 assert(info.last_backfill.is_max());
539 assert(info.last_update >= item->need);
540 if (!missing.is_missing(hoid))
541 mliter->second.insert(self);
542 for (auto &&i: pmissing) {
543 auto pinfoiter = pinfo.find(i.first);
544 assert(pinfoiter != pinfo.end());
545 if (item->need <= pinfoiter->second.last_update &&
546 hoid <= pinfoiter->second.last_backfill &&
547 !i.second.is_missing(hoid))
548 mliter->second.insert(i.first);
549 }
550 }
551
552 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
553 return missing_loc.count(hoid) ?
554 missing_loc.find(hoid)->second : empty_set;
555 }
556 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
557 return missing_loc;
558 }
559 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
560 return needs_recovery_map;
561 }
562 } missing_loc;
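// Reading the MissingLoc predicates above (sketch): an object counts as
// unfound when it still needs recovery, is not a pending delete, and either
// no shard is known to hold it or the known shards do not satisfy the
// backend's is_recoverable predicate.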
563
564 PastIntervals past_intervals;
565
566 interval_set<snapid_t> snap_trimq;
567
568 /* You should not use these items without taking their respective queue locks
569 * (if they have one) */
570 xlist<PG*>::item stat_queue_item;
571 bool scrub_queued;
572 bool recovery_queued;
573
574 int recovery_ops_active;
575 set<pg_shard_t> waiting_on_backfill;
576 #ifdef DEBUG_RECOVERY_OIDS
577 set<hobject_t> recovering_oids;
578 #endif
579
580 protected:
581   int role;    // 0 = primary, 1 = replica, -1 = none.
582 unsigned state; // PG_STATE_*
583
584 bool send_notify; ///< true if we are non-primary and should notify the primary
585
586 public:
587 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
588 eversion_t last_complete_ondisk; // last_complete that has committed.
589 eversion_t last_update_applied;
590
591
592 struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
593 PGRef pg;
594 epoch_t e;
595 eversion_t v;
596 C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
597 : pg(pg), e(e), v(v) {}
598 void finish(int) override {
599 pg->lock();
600 if (!pg->pg_has_reset_since(e)) {
601 pg->last_rollback_info_trimmed_to_applied = v;
602 }
603 pg->unlock();
604 }
605 };
606 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
607 // and the transaction has applied
608 eversion_t last_rollback_info_trimmed_to_applied;
609
610 // primary state
611 public:
612 pg_shard_t primary;
613 pg_shard_t pg_whoami;
614 pg_shard_t up_primary;
615 vector<int> up, acting, want_acting;
616 set<pg_shard_t> actingbackfill, actingset, upset;
617 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
618 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
619 eversion_t pg_trim_to;
620
621 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
622
623 // [primary only] content recovery state
624
625 public:
626 struct BufferedRecoveryMessages {
627 map<int, map<spg_t, pg_query_t> > query_map;
628 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
629 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
630 };
631
632 struct RecoveryCtx {
633 utime_t start_time;
634 map<int, map<spg_t, pg_query_t> > *query_map;
635 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
636 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
637 set<PGRef> created_pgs;
638 C_Contexts *on_applied;
639 C_Contexts *on_safe;
640 ObjectStore::Transaction *transaction;
641 ThreadPool::TPHandle* handle;
642 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
643 map<int,
644 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
645 map<int,
646 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
647 C_Contexts *on_applied,
648 C_Contexts *on_safe,
649 ObjectStore::Transaction *transaction)
650 : query_map(query_map), info_map(info_map),
651 notify_list(notify_list),
652 on_applied(on_applied),
653 on_safe(on_safe),
654 transaction(transaction),
655 handle(NULL) {}
656
657 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
658 : query_map(&(buf.query_map)),
659 info_map(&(buf.info_map)),
660 notify_list(&(buf.notify_list)),
661 on_applied(rctx.on_applied),
662 on_safe(rctx.on_safe),
663 transaction(rctx.transaction),
664 handle(rctx.handle) {}
665
666 void accept_buffered_messages(BufferedRecoveryMessages &m) {
667 assert(query_map);
668 assert(info_map);
669 assert(notify_list);
670 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
671 i != m.query_map.end();
672 ++i) {
673 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
674 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
675 j != i->second.end();
676 ++j) {
677 omap[j->first] = j->second;
678 }
679 }
680 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
681 = m.info_map.begin();
682 i != m.info_map.end();
683 ++i) {
684 vector<pair<pg_notify_t, PastIntervals> > &ovec =
685 (*info_map)[i->first];
686 ovec.reserve(ovec.size() + i->second.size());
687 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
688 }
689 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
690 = m.notify_list.begin();
691 i != m.notify_list.end();
692 ++i) {
693 vector<pair<pg_notify_t, PastIntervals> > &ovec =
694 (*notify_list)[i->first];
695 ovec.reserve(ovec.size() + i->second.size());
696 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
697 }
698 }
699 };
700
701
702 PGStateHistory pgstate_history;
703
704 struct NamedState {
705 const char *state_name;
706 utime_t enter_time;
707 PG* pg;
708 const char *get_state_name() { return state_name; }
709 NamedState(PG *pg_, const char *state_name_)
710 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
711 pg->pgstate_history.enter(pg, enter_time, state_name);
712 }
713 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
714 };
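// NamedState is the RAII hook used by the recovery-state structs below:
// constructing a state records an enter event in pgstate_history and the
// destructor records the matching exit, so time spent in each boost
// statechart state is tracked automatically.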
715
716
717
718 protected:
719
720 /*
721 * peer_info -- projected (updates _before_ replicas ack)
722 * peer_missing -- committed (updates _after_ replicas ack)
723 */
724
725 bool need_up_thru;
726 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
727 eversion_t oldest_update; // acting: lowest (valid) last_update in active set
728 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
729 set<pg_shard_t> peer_purged; // peers purged
730 map<pg_shard_t, pg_missing_t> peer_missing;
731   set<pg_shard_t>    peer_log_requested;  // logs I've requested (and start stamps)
732 set<pg_shard_t> peer_missing_requested;
733
734   // I deleted these strays; ignore racing PGInfo from them
735 set<pg_shard_t> peer_activated;
736
737 // primary-only, recovery-only state
738 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
739 // which are unfound on the primary
740 epoch_t last_peering_reset;
741
742
743 /* heartbeat peers */
744 void set_probe_targets(const set<pg_shard_t> &probe_set);
745 void clear_probe_targets();
746 public:
747 Mutex heartbeat_peer_lock;
748 set<int> heartbeat_peers;
749 set<int> probe_targets;
750
751 /**
752 * BackfillInterval
753 *
754 * Represents the objects in a range [begin, end)
755 *
756 * Possible states:
757    * 1) begin == end == hobject_t() indicates the interval is unpopulated
758 * 2) Else, objects contains all objects in [begin, end)
759 */
760 struct BackfillInterval {
761 // info about a backfill interval on a peer
762 eversion_t version; /// version at which the scan occurred
763 map<hobject_t,eversion_t> objects;
764 hobject_t begin;
765 hobject_t end;
766
767 /// clear content
768 void clear() {
769 *this = BackfillInterval();
770 }
771
772 /// clear objects list only
773 void clear_objects() {
774 objects.clear();
775 }
776
777 /// reinstantiate with a new start+end position and sort order
778 void reset(hobject_t start) {
779 clear();
780 begin = end = start;
781 }
782
783 /// true if there are no objects in this interval
784 bool empty() const {
785 return objects.empty();
786 }
787
788 /// true if interval extends to the end of the range
789 bool extends_to_end() const {
790 return end.is_max();
791 }
792
793 /// removes items <= soid and adjusts begin to the first object
794 void trim_to(const hobject_t &soid) {
795 trim();
796 while (!objects.empty() &&
797 objects.begin()->first <= soid) {
798 pop_front();
799 }
800 }
801
802 /// Adjusts begin to the first object
803 void trim() {
804 if (!objects.empty())
805 begin = objects.begin()->first;
806 else
807 begin = end;
808 }
809
810 /// drop first entry, and adjust @begin accordingly
811 void pop_front() {
812 assert(!objects.empty());
813 objects.erase(objects.begin());
814 trim();
815 }
816
817 /// dump
818 void dump(Formatter *f) const {
819 f->dump_stream("begin") << begin;
820 f->dump_stream("end") << end;
821 f->open_array_section("objects");
822 for (map<hobject_t, eversion_t>::const_iterator i =
823 objects.begin();
824 i != objects.end();
825 ++i) {
826 f->open_object_section("object");
827 f->dump_stream("object") << i->first;
828 f->dump_stream("version") << i->second;
829 f->close_section();
830 }
831 f->close_section();
832 }
833 };
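// Sketch of the interval lifecycle implied above: after reset(h),
// begin == end == h and the interval is unpopulated; once `objects` holds the
// scanned objects in [begin, end), trim_to(soid) drops every entry <= soid and
// trim() advances `begin` to the first remaining object (or to `end` when none
// remain).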
834
835 protected:
836 BackfillInterval backfill_info;
837 map<pg_shard_t, BackfillInterval> peer_backfill_info;
838 bool backfill_reserved;
839 bool backfill_reserving;
840
841 friend class OSD;
842
843 public:
844 set<pg_shard_t> backfill_targets;
845
846 bool is_backfill_targets(pg_shard_t osd) {
847 return backfill_targets.count(osd);
848 }
849
850 protected:
851
852 /*
853 * blocked request wait hierarchy
854 *
855 * In order to preserve request ordering we need to be careful about the
856 * order in which blocked requests get requeued. Generally speaking, we
857 * push the requests back up to the op_wq in reverse order (most recent
858 * request first) so that they come back out again in the original order.
859 * However, because there are multiple wait queues, we need to requeue
860    * waitlists in order.  Generally speaking, we requeue first the wait lists
861    * that are checked first during request processing.
862 *
863 * Here are the various wait lists, in the order they are used during
864 * request processing, with notes:
865 *
866 * - waiting_for_map
867 * - may start or stop blocking at any time (depending on client epoch)
868 * - waiting_for_peered
869 * - !is_peered() or flushes_in_progress
870 * - only starts blocking on interval change; never restarts
871 * - waiting_for_active
872 * - !is_active()
873 * - only starts blocking on interval change; never restarts
874 * - waiting_for_scrub
875 * - starts and stops blocking for varying intervals during scrub
876 * - waiting_for_unreadable_object
877 * - never restarts once object is readable (* except for EIO?)
878 * - waiting_for_degraded_object
879 * - never restarts once object is writeable (* except for EIO?)
880 * - waiting_for_blocked_object
881 * - starts and stops based on proxied op activity
882 * - obc rwlocks
883 * - starts and stops based on read/write activity
884 *
885 * Notes:
886 *
887    * 1. During an interval change, we requeue *everything* in the above order.
888 *
889 * 2. When an obc rwlock is released, we check for a scrub block and requeue
890 * the op there if it applies. We ignore the unreadable/degraded/blocked
891 * queues because we assume they cannot apply at that time (this is
892 * probably mostly true).
893 *
894 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
895 * it is non-empty.
896 *
897 * These three behaviors are generally sufficient to maintain ordering, with
898 * the possible exception of cases where we make an object degraded or
899 * unreadable that was previously okay, e.g. when scrub or op processing
900 * encounter an unexpected error. FIXME.
901 */
902
903 // pg waiters
904 unsigned flushes_in_progress;
905
906   // ops with newer maps than ours (or blocked behind them)
907 // track these by client, since inter-request ordering doesn't otherwise
908 // matter.
909 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
910
911 // ops waiting on peered
912 list<OpRequestRef> waiting_for_peered;
913
914 // ops waiting on active (require peered as well)
915 list<OpRequestRef> waiting_for_active;
916 list<OpRequestRef> waiting_for_scrub;
917
918 list<OpRequestRef> waiting_for_cache_not_full;
919 list<OpRequestRef> waiting_for_clean_to_primary_repair;
920 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
921 waiting_for_degraded_object,
922 waiting_for_blocked_object;
923
924 set<hobject_t> objects_blocked_on_cache_full;
925 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
926 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
927
928 // Callbacks should assume pg (and nothing else) is locked
929 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
930
931 map<eversion_t,
932 list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
933
934 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
935 void requeue_op(OpRequestRef op);
936 void requeue_ops(list<OpRequestRef> &l);
937
938 // stats that persist lazily
939 object_stat_collection_t unstable_stats;
940
941 // publish stats
942 Mutex pg_stats_publish_lock;
943 bool pg_stats_publish_valid;
944 pg_stat_t pg_stats_publish;
945
946 // for ordering writes
947 ceph::shared_ptr<ObjectStore::Sequencer> osr;
948
949 void _update_calc_stats();
950 void _update_blocked_by();
951 void publish_stats_to_osd();
952 void clear_publish_stats();
953
954 public:
955 void clear_primary_state();
956
957 bool is_actingbackfill(pg_shard_t osd) const {
958 return actingbackfill.count(osd);
959 }
960 bool is_acting(pg_shard_t osd) const {
961 return has_shard(pool.info.ec_pool(), acting, osd);
962 }
963 bool is_up(pg_shard_t osd) const {
964 return has_shard(pool.info.ec_pool(), up, osd);
965 }
966 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
967 if (ec) {
968 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
969 } else {
970 return std::find(v.begin(), v.end(), osd.osd) != v.end();
971 }
972 }
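// Example of the shard check above (illustrative values): for an EC pg with
// acting == [3, 7, 12], has_shard(true, acting, pg_shard_t(7, shard_id_t(1)))
// is true because acting[1] == 7, while for a replicated pg the shard id is
// ignored and only membership of osd 7 in the vector matters.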
973
974 bool needs_recovery() const;
975 bool needs_backfill() const;
976
977 /// clip calculated priority to reasonable range
978 inline int clamp_recovery_priority(int priority);
979 /// get log recovery reservation priority
980 unsigned get_recovery_priority();
981 /// get backfill reservation priority
982 unsigned get_backfill_priority();
983
984 void mark_clean(); ///< mark an active pg clean
985 void _change_recovery_force_mode(int new_mode, bool clear);
986
987 /// return [start,end) bounds for required past_intervals
988 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
989 const pg_info_t &info,
990 epoch_t oldest_map) {
991 epoch_t start = MAX(
992 info.history.last_epoch_clean ? info.history.last_epoch_clean :
993 info.history.epoch_pool_created,
994 oldest_map);
995 epoch_t end = MAX(
996 info.history.same_interval_since,
997 info.history.epoch_pool_created);
998 return make_pair(start, end);
999 }
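// Worked example of the bounds above (illustrative epochs): with
// last_epoch_clean == 100, epoch_pool_created == 10, oldest_map == 80 and
// same_interval_since == 120, the function returns
// [MAX(100, 80), MAX(120, 10)) == [100, 120).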
1000 void check_past_interval_bounds() const;
1001 PastIntervals::PriorSet build_prior();
1002
1003 void remove_down_peer_info(const OSDMapRef osdmap);
1004
1005 bool adjust_need_up_thru(const OSDMapRef osdmap);
1006
1007 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1008 virtual void dump_recovery_info(Formatter *f) const = 0;
1009
1010 bool calc_min_last_complete_ondisk() {
1011 eversion_t min = last_complete_ondisk;
1012 assert(!actingbackfill.empty());
1013 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1014 i != actingbackfill.end();
1015 ++i) {
1016 if (*i == get_primary()) continue;
1017 if (peer_last_complete_ondisk.count(*i) == 0)
1018 return false; // we don't have complete info
1019 eversion_t a = peer_last_complete_ondisk[*i];
1020 if (a < min)
1021 min = a;
1022 }
1023 if (min == min_last_complete_ondisk)
1024 return false;
1025 min_last_complete_ondisk = min;
1026 return true;
1027 }
1028
1029 virtual void calc_trim_to() = 0;
1030
1031 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1032 pg_missing_t& omissing, pg_shard_t from);
1033 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1034 pg_missing_t& omissing, pg_shard_t from);
1035 bool proc_replica_info(
1036 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1037
1038 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1039 PG *pg;
1040 ObjectStore::Transaction *t;
1041 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1042
1043 // LogEntryHandler
1044 void remove(const hobject_t &hoid) override {
1045 pg->get_pgbackend()->remove(hoid, t);
1046 }
1047 void try_stash(const hobject_t &hoid, version_t v) override {
1048 pg->get_pgbackend()->try_stash(hoid, v, t);
1049 }
1050 void rollback(const pg_log_entry_t &entry) override {
1051 assert(entry.can_rollback());
1052 pg->get_pgbackend()->rollback(entry, t);
1053 }
1054 void rollforward(const pg_log_entry_t &entry) override {
1055 pg->get_pgbackend()->rollforward(entry, t);
1056 }
1057 void trim(const pg_log_entry_t &entry) override {
1058 pg->get_pgbackend()->trim(entry, t);
1059 }
1060 };
1061
1062 void update_object_snap_mapping(
1063 ObjectStore::Transaction *t, const hobject_t &soid,
1064 const set<snapid_t> &snaps);
1065 void clear_object_snap_mapping(
1066 ObjectStore::Transaction *t, const hobject_t &soid);
1067 void remove_snap_mapped_object(
1068 ObjectStore::Transaction& t, const hobject_t& soid);
1069 void merge_log(
1070 ObjectStore::Transaction& t, pg_info_t &oinfo,
1071 pg_log_t &olog, pg_shard_t from);
1072 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1073 bool search_for_missing(
1074 const pg_info_t &oinfo, const pg_missing_t &omissing,
1075 pg_shard_t fromosd,
1076 RecoveryCtx*);
1077
1078 void check_for_lost_objects();
1079 void forget_lost_objects();
1080
1081 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1082
1083 void trim_write_ahead();
1084
1085 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1086 const map<pg_shard_t, pg_info_t> &infos,
1087 bool restrict_to_up_acting,
1088 bool *history_les_bound) const;
1089 static void calc_ec_acting(
1090 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1091 unsigned size,
1092 const vector<int> &acting,
1093 pg_shard_t acting_primary,
1094 const vector<int> &up,
1095 pg_shard_t up_primary,
1096 const map<pg_shard_t, pg_info_t> &all_info,
1097 bool restrict_to_up_acting,
1098 vector<int> *want,
1099 set<pg_shard_t> *backfill,
1100 set<pg_shard_t> *acting_backfill,
1101 pg_shard_t *want_primary,
1102 ostream &ss);
1103 static void calc_replicated_acting(
1104 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1105 unsigned size,
1106 const vector<int> &acting,
1107 pg_shard_t acting_primary,
1108 const vector<int> &up,
1109 pg_shard_t up_primary,
1110 const map<pg_shard_t, pg_info_t> &all_info,
1111 bool restrict_to_up_acting,
1112 vector<int> *want,
1113 set<pg_shard_t> *backfill,
1114 set<pg_shard_t> *acting_backfill,
1115 pg_shard_t *want_primary,
1116 ostream &ss);
1117 bool choose_acting(pg_shard_t &auth_log_shard,
1118 bool restrict_to_up_acting,
1119 bool *history_les_bound);
1120 void build_might_have_unfound();
1121 void activate(
1122 ObjectStore::Transaction& t,
1123 epoch_t activation_epoch,
1124 list<Context*>& tfin,
1125 map<int, map<spg_t,pg_query_t> >& query_map,
1126 map<int,
1127 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1128 RecoveryCtx *ctx);
1129 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1130 void all_activated_and_committed();
1131
1132 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1133
1134 bool have_unfound() const {
1135 return missing_loc.have_unfound();
1136 }
1137 uint64_t get_num_unfound() const {
1138 return missing_loc.num_unfound();
1139 }
1140
1141 virtual void check_local() = 0;
1142
1143 /**
1144 * @param ops_begun returns how many recovery ops the function started
1145 * @returns true if any useful work was accomplished; false otherwise
1146 */
1147 virtual bool start_recovery_ops(
1148 uint64_t max,
1149 ThreadPool::TPHandle &handle,
1150 uint64_t *ops_begun) = 0;
1151
1152 void purge_strays();
1153
1154 void update_heartbeat_peers();
1155
1156 Context *finish_sync_event;
1157
1158 void finish_recovery(list<Context*>& tfin);
1159 void _finish_recovery(Context *c);
1160 void cancel_recovery();
1161 void clear_recovery_state();
1162 virtual void _clear_recovery_state() = 0;
1163 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1164 void start_recovery_op(const hobject_t& soid);
1165 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1166
1167 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
1168 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1169
1170 friend class C_OSD_RepModify_Commit;
1171
1172 // -- backoff --
1173 Mutex backoff_lock; // orders inside Backoff::lock
1174 map<hobject_t,set<BackoffRef>> backoffs;
1175
1176 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1177 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1178 void release_backoffs(const hobject_t& o) {
1179 release_backoffs(o, o);
1180 }
1181 void clear_backoffs();
1182
1183 void add_pg_backoff(SessionRef s) {
1184 hobject_t begin = info.pgid.pgid.get_hobj_start();
1185 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1186 add_backoff(s, begin, end);
1187 }
1188 void release_pg_backoffs() {
1189 hobject_t begin = info.pgid.pgid.get_hobj_start();
1190 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1191 release_backoffs(begin, end);
1192 }
1193
1194 void rm_backoff(BackoffRef b);
1195
1196 // -- scrub --
1197 struct Scrubber {
1198 Scrubber();
1199 ~Scrubber();
1200
1201 // metadata
1202 set<pg_shard_t> reserved_peers;
1203 bool reserved, reserve_failed;
1204 epoch_t epoch_start;
1205
1206 // common to both scrubs
1207 bool active;
1208 int waiting_on;
1209 set<pg_shard_t> waiting_on_whom;
1210 int shallow_errors;
1211 int deep_errors;
1212 int fixed;
1213 ScrubMap primary_scrubmap;
1214 map<pg_shard_t, ScrubMap> received_maps;
1215 OpRequestRef active_rep_scrub;
1216 utime_t scrub_reg_stamp; // stamp we registered for
1217
1218 // For async sleep
1219 bool sleeping = false;
1220 bool needs_sleep = true;
1221 utime_t sleep_start;
1222
1223 // flags to indicate explicitly requested scrubs (by admin)
1224 bool must_scrub, must_deep_scrub, must_repair;
1225
1226 // Priority to use for scrub scheduling
1227 unsigned priority;
1228
1229 // this flag indicates whether we would like to do auto-repair of the PG or not
1230 bool auto_repair;
1231
1232 // Maps from objects with errors to missing/inconsistent peers
1233 map<hobject_t, set<pg_shard_t>> missing;
1234 map<hobject_t, set<pg_shard_t>> inconsistent;
1235
1236 // Map from object with errors to good peers
1237 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1238
1239 // Cleaned map pending snap metadata scrub
1240 ScrubMap cleaned_meta_map;
1241
1242 // digest updates which we are waiting on
1243 int num_digest_updates_pending;
1244
1245 // chunky scrub
1246 hobject_t start, end;
1247 eversion_t subset_last_update;
1248
1249 // chunky scrub state
1250 enum State {
1251 INACTIVE,
1252 NEW_CHUNK,
1253 WAIT_PUSHES,
1254 WAIT_LAST_UPDATE,
1255 BUILD_MAP,
1256 WAIT_REPLICAS,
1257 COMPARE_MAPS,
1258 WAIT_DIGEST_UPDATES,
1259 FINISH,
1260 } state;
1261
1262 std::unique_ptr<Scrub::Store> store;
1263 // deep scrub
1264 bool deep;
1265 uint32_t seed;
1266
1267 list<Context*> callbacks;
1268 void add_callback(Context *context) {
1269 callbacks.push_back(context);
1270 }
1271 void run_callbacks() {
1272 list<Context*> to_run;
1273 to_run.swap(callbacks);
1274 for (list<Context*>::iterator i = to_run.begin();
1275 i != to_run.end();
1276 ++i) {
1277 (*i)->complete(0);
1278 }
1279 }
1280
1281 static const char *state_string(const PG::Scrubber::State& state) {
1282 const char *ret = NULL;
1283 switch( state )
1284 {
1285 case INACTIVE: ret = "INACTIVE"; break;
1286 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1287 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1288 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1289 case BUILD_MAP: ret = "BUILD_MAP"; break;
1290 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1291 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1292 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1293 case FINISH: ret = "FINISH"; break;
1294 }
1295 return ret;
1296 }
1297
1298 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1299
1300 // classic (non chunk) scrubs block all writes
1301 // chunky scrubs only block writes to a range
1302 bool write_blocked_by_scrub(const hobject_t &soid) {
1303 return (soid >= start && soid < end);
1304 }
1305
1306 // clear all state
1307 void reset() {
1308 active = false;
1309 waiting_on = 0;
1310 waiting_on_whom.clear();
1311 if (active_rep_scrub) {
1312 active_rep_scrub = OpRequestRef();
1313 }
1314 received_maps.clear();
1315
1316 must_scrub = false;
1317 must_deep_scrub = false;
1318 must_repair = false;
1319 auto_repair = false;
1320
1321 state = PG::Scrubber::INACTIVE;
1322 start = hobject_t();
1323 end = hobject_t();
1324 subset_last_update = eversion_t();
1325 shallow_errors = 0;
1326 deep_errors = 0;
1327 fixed = 0;
1328 deep = false;
1329 seed = 0;
1330 run_callbacks();
1331 inconsistent.clear();
1332 missing.clear();
1333 authoritative.clear();
1334 num_digest_updates_pending = 0;
1335 cleaned_meta_map = ScrubMap();
1336 sleeping = false;
1337 needs_sleep = true;
1338 sleep_start = utime_t();
1339 }
1340
1341 void create_results(const hobject_t& obj);
1342 void cleanup_store(ObjectStore::Transaction *t);
1343 } scrubber;
1344
1345 bool scrub_after_recovery;
1346
1347 int active_pushes;
1348
1349 void repair_object(
1350 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1351 pg_shard_t bad_peer);
1352
1353 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
1354 void chunky_scrub(ThreadPool::TPHandle &handle);
1355 void scrub_compare_maps();
1356 /**
1357 * return true if any inconsistency/missing is repaired, false otherwise
1358 */
1359 bool scrub_process_inconsistent();
1360 bool ops_blocked_by_scrub() const;
1361 void scrub_finish();
1362 void scrub_clear_state();
1363 void _scan_snaps(ScrubMap &map);
1364 void _repair_oinfo_oid(ScrubMap &map);
1365 void _scan_rollback_obs(
1366 const vector<ghobject_t> &rollback_obs,
1367 ThreadPool::TPHandle &handle);
1368 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1369 hobject_t start, hobject_t end, bool deep,
1370 uint32_t seed);
1371 int build_scrub_map_chunk(
1372 ScrubMap &map,
1373 hobject_t start, hobject_t end, bool deep, uint32_t seed,
1374 ThreadPool::TPHandle &handle);
1375 /**
1376 * returns true if [begin, end) is good to scrub at this time
1377 * a false return value obliges the implementer to requeue scrub when the
1378 * condition preventing scrub clears
1379 */
1380 virtual bool _range_available_for_scrub(
1381 const hobject_t &begin, const hobject_t &end) = 0;
1382 virtual void scrub_snapshot_metadata(
1383 ScrubMap &map,
1384 const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
1385 virtual void _scrub_clear_state() { }
1386 virtual void _scrub_finish() { }
1387 virtual void split_colls(
1388 spg_t child,
1389 int split_bits,
1390 int seed,
1391 const pg_pool_t *pool,
1392 ObjectStore::Transaction *t) = 0;
1393 void clear_scrub_reserved();
1394 void scrub_reserve_replicas();
1395 void scrub_unreserve_replicas();
1396 bool scrub_all_replicas_reserved() const;
1397 bool sched_scrub();
1398 void reg_next_scrub();
1399 void unreg_next_scrub();
1400
1401 void replica_scrub(
1402 OpRequestRef op,
1403 ThreadPool::TPHandle &handle);
1404 void do_replica_scrub_map(OpRequestRef op);
1405 void sub_op_scrub_map(OpRequestRef op);
1406
1407 void handle_scrub_reserve_request(OpRequestRef op);
1408 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1409 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1410 void handle_scrub_reserve_release(OpRequestRef op);
1411
1412 void reject_reservation();
1413 void schedule_backfill_full_retry();
1414 void schedule_recovery_full_retry();
1415
1416 // -- recovery state --
1417
1418 template <class EVT>
1419 struct QueuePeeringEvt : Context {
1420 PGRef pg;
1421 epoch_t epoch;
1422 EVT evt;
1423 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1424 pg(pg), epoch(epoch), evt(evt) {}
1425 void finish(int r) override {
1426 pg->lock();
1427 pg->queue_peering_event(PG::CephPeeringEvtRef(
1428 new PG::CephPeeringEvt(
1429 epoch,
1430 epoch,
1431 evt)));
1432 pg->unlock();
1433 }
1434 };
1435
1436 class CephPeeringEvt {
1437 epoch_t epoch_sent;
1438 epoch_t epoch_requested;
1439 boost::intrusive_ptr< const boost::statechart::event_base > evt;
1440 string desc;
1441 public:
1442 MEMPOOL_CLASS_HELPERS();
1443 template <class T>
1444 CephPeeringEvt(epoch_t epoch_sent,
1445 epoch_t epoch_requested,
1446 const T &evt_) :
1447 epoch_sent(epoch_sent), epoch_requested(epoch_requested),
1448 evt(evt_.intrusive_from_this()) {
1449 stringstream out;
1450 out << "epoch_sent: " << epoch_sent
1451 << " epoch_requested: " << epoch_requested << " ";
1452 evt_.print(&out);
1453 desc = out.str();
1454 }
1455 epoch_t get_epoch_sent() { return epoch_sent; }
1456 epoch_t get_epoch_requested() { return epoch_requested; }
1457 const boost::statechart::event_base &get_event() { return *evt; }
1458 string get_desc() { return desc; }
1459 };
1460 typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
1461 list<CephPeeringEvtRef> peering_queue; // op queue
1462 list<CephPeeringEvtRef> peering_waiters;
1463
1464 struct QueryState : boost::statechart::event< QueryState > {
1465 Formatter *f;
1466 explicit QueryState(Formatter *f) : f(f) {}
1467 void print(std::ostream *out) const {
1468 *out << "Query";
1469 }
1470 };
1471
1472 struct MInfoRec : boost::statechart::event< MInfoRec > {
1473 pg_shard_t from;
1474 pg_info_t info;
1475 epoch_t msg_epoch;
1476 MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
1477 from(from), info(info), msg_epoch(msg_epoch) {}
1478 void print(std::ostream *out) const {
1479 *out << "MInfoRec from " << from << " info: " << info;
1480 }
1481 };
1482
1483 struct MLogRec : boost::statechart::event< MLogRec > {
1484 pg_shard_t from;
1485 boost::intrusive_ptr<MOSDPGLog> msg;
1486 MLogRec(pg_shard_t from, MOSDPGLog *msg) :
1487 from(from), msg(msg) {}
1488 void print(std::ostream *out) const {
1489 *out << "MLogRec from " << from;
1490 }
1491 };
1492
1493 struct MNotifyRec : boost::statechart::event< MNotifyRec > {
1494 pg_shard_t from;
1495 pg_notify_t notify;
1496 uint64_t features;
1497 MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
1498 from(from), notify(notify), features(f) {}
1499 void print(std::ostream *out) const {
1500 *out << "MNotifyRec from " << from << " notify: " << notify
1501 << " features: 0x" << hex << features << dec;
1502 }
1503 };
1504
1505 struct MQuery : boost::statechart::event< MQuery > {
1506 pg_shard_t from;
1507 pg_query_t query;
1508 epoch_t query_epoch;
1509 MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
1510 from(from), query(query), query_epoch(query_epoch) {}
1511 void print(std::ostream *out) const {
1512 *out << "MQuery from " << from
1513 << " query_epoch " << query_epoch
1514 << " query: " << query;
1515 }
1516 };
1517
1518 struct AdvMap : boost::statechart::event< AdvMap > {
1519 OSDMapRef osdmap;
1520 OSDMapRef lastmap;
1521 vector<int> newup, newacting;
1522 int up_primary, acting_primary;
1523 AdvMap(
1524 OSDMapRef osdmap, OSDMapRef lastmap,
1525 vector<int>& newup, int up_primary,
1526 vector<int>& newacting, int acting_primary):
1527 osdmap(osdmap), lastmap(lastmap),
1528 newup(newup),
1529 newacting(newacting),
1530 up_primary(up_primary),
1531 acting_primary(acting_primary) {}
1532 void print(std::ostream *out) const {
1533 *out << "AdvMap";
1534 }
1535 };
1536
1537 struct ActMap : boost::statechart::event< ActMap > {
1538 ActMap() : boost::statechart::event< ActMap >() {}
1539 void print(std::ostream *out) const {
1540 *out << "ActMap";
1541 }
1542 };
1543 struct Activate : boost::statechart::event< Activate > {
1544 epoch_t activation_epoch;
1545 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
1546 activation_epoch(q) {}
1547 void print(std::ostream *out) const {
1548 *out << "Activate from " << activation_epoch;
1549 }
1550 };
1551 struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
1552 unsigned priority;
1553 explicit RequestBackfillPrio(unsigned prio) :
1554 boost::statechart::event< RequestBackfillPrio >(),
1555 priority(prio) {}
1556 void print(std::ostream *out) const {
1557 *out << "RequestBackfillPrio: priority " << priority;
1558 }
1559 };
1560 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1561 T() : boost::statechart::event< T >() {} \
1562 void print(std::ostream *out) const { \
1563 *out << #T; \
1564 } \
1565 };
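// For reference, TrivialEvent(Backfilled) below expands to a
// boost::statechart::event subclass named Backfilled whose print() emits the
// literal token "Backfilled" via the #T stringization.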
1566 TrivialEvent(Initialize)
1567 TrivialEvent(Load)
1568 TrivialEvent(GotInfo)
1569 TrivialEvent(NeedUpThru)
1570 TrivialEvent(NullEvt)
1571 TrivialEvent(FlushedEvt)
1572 TrivialEvent(Backfilled)
1573 TrivialEvent(LocalBackfillReserved)
1574 TrivialEvent(RemoteBackfillReserved)
1575 TrivialEvent(RemoteReservationRejected)
1576 TrivialEvent(CancelBackfill)
1577 TrivialEvent(RequestBackfill)
1578 TrivialEvent(RequestRecovery)
1579 TrivialEvent(RecoveryDone)
1580 TrivialEvent(BackfillTooFull)
1581 TrivialEvent(RecoveryTooFull)
1582 TrivialEvent(CancelRecovery)
1583
1584 TrivialEvent(AllReplicasRecovered)
1585 TrivialEvent(DoRecovery)
1586 TrivialEvent(LocalRecoveryReserved)
1587 TrivialEvent(RemoteRecoveryReserved)
1588 TrivialEvent(AllRemotesReserved)
1589 TrivialEvent(AllBackfillsReserved)
1590 TrivialEvent(GoClean)
1591
1592 TrivialEvent(AllReplicasActivated)
1593
1594 TrivialEvent(IntervalFlush)
1595
1596 /* Encapsulates PG recovery process */
1597 class RecoveryState {
1598 void start_handle(RecoveryCtx *new_ctx);
1599 void end_handle();
1600 public:
1601 void begin_block_outgoing();
1602 void end_block_outgoing();
1603 void clear_blocked_outgoing();
1604 private:
1605
1606 /* States */
1607 struct Initial;
1608 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1609 RecoveryState *state;
1610 public:
1611 PG *pg;
1612
1613 utime_t event_time;
1614 uint64_t event_count;
1615
1616 void clear_event_counters() {
1617 event_time = utime_t();
1618 event_count = 0;
1619 }
1620
1621 void log_enter(const char *state_name);
1622 void log_exit(const char *state_name, utime_t duration);
1623
1624 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1625
1626 /* Accessor functions for state methods */
1627 ObjectStore::Transaction* get_cur_transaction() {
1628 assert(state->rctx);
1629 assert(state->rctx->transaction);
1630 return state->rctx->transaction;
1631 }
1632
1633 void send_query(pg_shard_t to, const pg_query_t &query) {
1634 assert(state->rctx);
1635 assert(state->rctx->query_map);
1636 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1637 query;
1638 }
1639
1640 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1641 assert(state->rctx);
1642 assert(state->rctx->query_map);
1643 return state->rctx->query_map;
1644 }
1645
1646 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1647 assert(state->rctx);
1648 assert(state->rctx->info_map);
1649 return state->rctx->info_map;
1650 }
1651
1652 list< Context* > *get_on_safe_context_list() {
1653 assert(state->rctx);
1654 assert(state->rctx->on_safe);
1655 return &(state->rctx->on_safe->contexts);
1656 }
1657
1658 list< Context * > *get_on_applied_context_list() {
1659 assert(state->rctx);
1660 assert(state->rctx->on_applied);
1661 return &(state->rctx->on_applied->contexts);
1662 }
1663
1664 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1665
1666 void send_notify(pg_shard_t to,
1667 const pg_notify_t &info, const PastIntervals &pi) {
1668 assert(state->rctx);
1669 assert(state->rctx->notify_list);
1670 (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
1671 }
1672 };
1673 friend class RecoveryMachine;
1674
1675 /* States */
1676
1677 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1678 explicit Crashed(my_context ctx);
1679 };
1680
1681 struct Reset;
1682
1683 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1684 explicit Initial(my_context ctx);
1685 void exit();
1686
1687 typedef boost::mpl::list <
1688 boost::statechart::transition< Initialize, Reset >,
1689 boost::statechart::custom_reaction< Load >,
1690 boost::statechart::custom_reaction< NullEvt >,
1691 boost::statechart::transition< boost::statechart::event_base, Crashed >
1692 > reactions;
1693
1694 boost::statechart::result react(const Load&);
1695 boost::statechart::result react(const MNotifyRec&);
1696 boost::statechart::result react(const MInfoRec&);
1697 boost::statechart::result react(const MLogRec&);
1698 boost::statechart::result react(const boost::statechart::event_base&) {
1699 return discard_event();
1700 }
1701 };
1702
1703 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1704 explicit Reset(my_context ctx);
1705 void exit();
1706
1707 typedef boost::mpl::list <
1708 boost::statechart::custom_reaction< QueryState >,
1709 boost::statechart::custom_reaction< AdvMap >,
1710 boost::statechart::custom_reaction< ActMap >,
1711 boost::statechart::custom_reaction< NullEvt >,
1712 boost::statechart::custom_reaction< FlushedEvt >,
1713 boost::statechart::custom_reaction< IntervalFlush >,
1714 boost::statechart::transition< boost::statechart::event_base, Crashed >
1715 > reactions;
1716 boost::statechart::result react(const QueryState& q);
1717 boost::statechart::result react(const AdvMap&);
1718 boost::statechart::result react(const ActMap&);
1719 boost::statechart::result react(const FlushedEvt&);
1720 boost::statechart::result react(const IntervalFlush&);
1721 boost::statechart::result react(const boost::statechart::event_base&) {
1722 return discard_event();
1723 }
1724 };
1725
1726 struct Start;
1727
1728 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1729 explicit Started(my_context ctx);
1730 void exit();
1731
1732 typedef boost::mpl::list <
1733 boost::statechart::custom_reaction< QueryState >,
1734 boost::statechart::custom_reaction< AdvMap >,
1735 boost::statechart::custom_reaction< NullEvt >,
1736 boost::statechart::custom_reaction< FlushedEvt >,
1737 boost::statechart::custom_reaction< IntervalFlush >,
1738 boost::statechart::transition< boost::statechart::event_base, Crashed >
1739 > reactions;
1740 boost::statechart::result react(const QueryState& q);
1741 boost::statechart::result react(const AdvMap&);
1742 boost::statechart::result react(const FlushedEvt&);
1743 boost::statechart::result react(const IntervalFlush&);
1744 boost::statechart::result react(const boost::statechart::event_base&) {
1745 return discard_event();
1746 }
1747 };
1748
1749 struct MakePrimary : boost::statechart::event< MakePrimary > {
1750 MakePrimary() : boost::statechart::event< MakePrimary >() {}
1751 };
1752 struct MakeStray : boost::statechart::event< MakeStray > {
1753 MakeStray() : boost::statechart::event< MakeStray >() {}
1754 };
1755 struct Primary;
1756 struct Stray;
1757
1758 struct Start : boost::statechart::state< Start, Started >, NamedState {
1759 explicit Start(my_context ctx);
1760 void exit();
1761
1762 typedef boost::mpl::list <
1763 boost::statechart::transition< MakePrimary, Primary >,
1764 boost::statechart::transition< MakeStray, Stray >
1765 > reactions;
1766 };
1767
1768 struct Peering;
1769 struct WaitActingChange;
1770 struct NeedActingChange : boost::statechart::event< NeedActingChange > {
1771 NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
1772 };
1773 struct Incomplete;
1774 struct IsIncomplete : boost::statechart::event< IsIncomplete > {
1775 IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
1776 };
1777 struct Down;
1778 struct IsDown : boost::statechart::event< IsDown > {
1779 IsDown() : boost::statechart::event< IsDown >() {}
1780 };
1781
1782 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1783 explicit Primary(my_context ctx);
1784 void exit();
1785
1786 typedef boost::mpl::list <
1787 boost::statechart::custom_reaction< ActMap >,
1788 boost::statechart::custom_reaction< MNotifyRec >,
1789 boost::statechart::transition< NeedActingChange, WaitActingChange >
1790 > reactions;
1791 boost::statechart::result react(const ActMap&);
1792 boost::statechart::result react(const MNotifyRec&);
1793 };
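// While in Primary, a NeedActingChange event (raised, e.g., when a
// different acting set has to be requested via pg_temp before recovery or
// backfill can proceed) moves the machine into WaitActingChange; ActMap and
// MNotifyRec are handled in place by the react() overloads above.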
1794
1795 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1796 NamedState {
1797 typedef boost::mpl::list <
1798 boost::statechart::custom_reaction< QueryState >,
1799 boost::statechart::custom_reaction< AdvMap >,
1800 boost::statechart::custom_reaction< MLogRec >,
1801 boost::statechart::custom_reaction< MInfoRec >,
1802 boost::statechart::custom_reaction< MNotifyRec >
1803 > reactions;
1804 explicit WaitActingChange(my_context ctx);
1805 boost::statechart::result react(const QueryState& q);
1806 boost::statechart::result react(const AdvMap&);
1807 boost::statechart::result react(const MLogRec&);
1808 boost::statechart::result react(const MInfoRec&);
1809 boost::statechart::result react(const MNotifyRec&);
1810 void exit();
1811 };
1812
1813 struct GetInfo;
1814 struct Active;
1815
1816 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
1817 PastIntervals::PriorSet prior_set;
1818 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les

1819
1820 explicit Peering(my_context ctx);
1821 void exit();
1822
1823 typedef boost::mpl::list <
1824 boost::statechart::custom_reaction< QueryState >,
1825 boost::statechart::transition< Activate, Active >,
1826 boost::statechart::custom_reaction< AdvMap >
1827 > reactions;
1828 boost::statechart::result react(const QueryState& q);
1829 boost::statechart::result react(const AdvMap &advmap);
1830 };
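// Peering enters at GetInfo (declared with the other substates further
// below) and works through prior_set, the OSDs that may hold authoritative
// state from past intervals; a successful pass ends with Activate, which
// the transition above routes into Active.  history_les_bound is tied to
// the osd_find_best_info_ignore_history_les workaround noted on the member.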
1831
1832 struct WaitLocalRecoveryReserved;
1833 struct Activating;
1834 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1835 explicit Active(my_context ctx);
1836 void exit();
1837
1838 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1839 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1840 bool all_replicas_activated;
1841
1842 typedef boost::mpl::list <
1843 boost::statechart::custom_reaction< QueryState >,
1844 boost::statechart::custom_reaction< ActMap >,
1845 boost::statechart::custom_reaction< AdvMap >,
1846 boost::statechart::custom_reaction< MInfoRec >,
1847 boost::statechart::custom_reaction< MNotifyRec >,
1848 boost::statechart::custom_reaction< MLogRec >,
1849 boost::statechart::custom_reaction< Backfilled >,
1850 boost::statechart::custom_reaction< AllReplicasActivated >
1851 > reactions;
1852 boost::statechart::result react(const QueryState& q);
1853 boost::statechart::result react(const ActMap&);
1854 boost::statechart::result react(const AdvMap&);
1855 boost::statechart::result react(const MInfoRec& infoevt);
1856 boost::statechart::result react(const MNotifyRec& notevt);
1857 boost::statechart::result react(const MLogRec& logevt);
1858 boost::statechart::result react(const Backfilled&) {
1859 return discard_event();
1860 }
1861 boost::statechart::result react(const AllReplicasActivated&);
1862 };
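// The two const remote_shards_to_reserve_* sets are filled in when Active
// is entered; they are the shard lists that the WaitRemote*Reserved
// substates below walk (via backfill_osd_it / remote_recovery_reservation_it)
// while asking peers for recovery and backfill reservations.
// all_replicas_activated is flipped by the AllReplicasActivated reaction.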
1863
1864 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1865 typedef boost::mpl::list<
1866 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1867 > reactions;
1868 explicit Clean(my_context ctx);
1869 void exit();
1870 };
1871
1872 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1873 typedef boost::mpl::list<
1874 boost::statechart::transition< GoClean, Clean >,
1875 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
1876 boost::statechart::custom_reaction< AllReplicasActivated >
1877 > reactions;
1878 explicit Recovered(my_context ctx);
1879 void exit();
1880 boost::statechart::result react(const AllReplicasActivated&) {
1881 post_event(GoClean());
1882 return forward_event();
1883 }
1884 };
1885
1886 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
1887 typedef boost::mpl::list<
1888 boost::statechart::transition< Backfilled, Recovered >,
1889 boost::statechart::custom_reaction< CancelBackfill >,
1890 boost::statechart::custom_reaction< RemoteReservationRejected >
1891 > reactions;
1892 explicit Backfilling(my_context ctx);
1893 boost::statechart::result react(const RemoteReservationRejected& evt);
1894 boost::statechart::result react(const CancelBackfill& evt);
1895 void exit();
1896 };
1897
1898 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
1899 typedef boost::mpl::list<
1900 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1901 boost::statechart::custom_reaction< RemoteReservationRejected >,
1902 boost::statechart::transition< AllBackfillsReserved, Backfilling >
1903 > reactions;
1904 set<pg_shard_t>::const_iterator backfill_osd_it;
1905 explicit WaitRemoteBackfillReserved(my_context ctx);
1906 void exit();
1907 boost::statechart::result react(const RemoteBackfillReserved& evt);
1908 boost::statechart::result react(const RemoteReservationRejected& evt);
1909 };
1910
1911 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
1912 typedef boost::mpl::list<
1913 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
1914 > reactions;
1915 explicit WaitLocalBackfillReserved(my_context ctx);
1916 void exit();
1917 };
1918
1919 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
1920 typedef boost::mpl::list<
1921 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
1922 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1923 boost::statechart::custom_reaction< RemoteReservationRejected >
1924 > reactions;
1925 explicit NotBackfilling(my_context ctx);
1926 void exit();
1927 boost::statechart::result react(const RemoteBackfillReserved& evt);
1928 boost::statechart::result react(const RemoteReservationRejected& evt);
1929 };
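// Taken together, the backfill substates chain as the reaction lists above
// spell out: NotBackfilling --RequestBackfill--> WaitLocalBackfillReserved
// --LocalBackfillReserved--> WaitRemoteBackfillReserved
// --AllBackfillsReserved--> Backfilling --Backfilled--> Recovered, with
// RemoteReservationRejected handled by a custom reaction at each stage.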
1930
1931 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
1932 typedef boost::mpl::list<
1933 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
1934 boost::statechart::custom_reaction< CancelRecovery >
1935 > reactions;
1936 explicit NotRecovering(my_context ctx);
1937 boost::statechart::result react(const CancelRecovery& evt) {
1938 /* no-op */
1939 return discard_event();
1940 }
1941 void exit();
1942 };
1943
1944 struct RepNotRecovering;
1945 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
1946 explicit ReplicaActive(my_context ctx);
1947 void exit();
1948
1949 typedef boost::mpl::list <
1950 boost::statechart::custom_reaction< QueryState >,
1951 boost::statechart::custom_reaction< ActMap >,
1952 boost::statechart::custom_reaction< MQuery >,
1953 boost::statechart::custom_reaction< MInfoRec >,
1954 boost::statechart::custom_reaction< MLogRec >,
1955 boost::statechart::custom_reaction< Activate >
1956 > reactions;
1957 boost::statechart::result react(const QueryState& q);
1958 boost::statechart::result react(const MInfoRec& infoevt);
1959 boost::statechart::result react(const MLogRec& logevt);
1960 boost::statechart::result react(const ActMap&);
1961 boost::statechart::result react(const MQuery&);
1962 boost::statechart::result react(const Activate&);
1963 };
1964
1965 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
1966 typedef boost::mpl::list<
1967 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
1968 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
1969 boost::statechart::custom_reaction< BackfillTooFull >
1970 > reactions;
1971 explicit RepRecovering(my_context ctx);
1972 boost::statechart::result react(const BackfillTooFull &evt);
1973 void exit();
1974 };
1975
1976 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
1977 typedef boost::mpl::list<
1978 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1979 boost::statechart::custom_reaction< RemoteReservationRejected >
1980 > reactions;
1981 explicit RepWaitBackfillReserved(my_context ctx);
1982 void exit();
1983 boost::statechart::result react(const RemoteBackfillReserved &evt);
1984 boost::statechart::result react(const RemoteReservationRejected &evt);
1985 };
1986
1987 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
1988 typedef boost::mpl::list<
1989 boost::statechart::custom_reaction< RemoteRecoveryReserved >
1990 > reactions;
1991 explicit RepWaitRecoveryReserved(my_context ctx);
1992 void exit();
1993 boost::statechart::result react(const RemoteRecoveryReserved &evt);
1994 };
1995
1996 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
1997 typedef boost::mpl::list<
1998 boost::statechart::custom_reaction< RequestBackfillPrio >,
1999 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
2000 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2001 > reactions;
2002 explicit RepNotRecovering(my_context ctx);
2003 boost::statechart::result react(const RequestBackfillPrio &evt);
2004 void exit();
2005 };
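// The Rep* substates are the replica-side half of the reservation protocol:
// RepNotRecovering waits for RequestBackfillPrio or RequestRecovery from
// the primary, RepWaitBackfillReserved / RepWaitRecoveryReserved park until
// the local reservation is granted (RemoteBackfillReserved /
// RemoteRecoveryReserved), and RepRecovering lasts until RecoveryDone or a
// RemoteReservationRejected sends the machine back to RepNotRecovering.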
2006
2007 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2008 typedef boost::mpl::list <
2009 boost::statechart::custom_reaction< AllReplicasRecovered >,
2010 boost::statechart::custom_reaction< CancelRecovery >,
2011 boost::statechart::custom_reaction< RequestBackfill >
2012 > reactions;
2013 explicit Recovering(my_context ctx);
2014 void exit();
2015 void release_reservations(bool cancel = false);
2016 boost::statechart::result react(const AllReplicasRecovered &evt);
2017 boost::statechart::result react(const CancelRecovery& evt);
2018 boost::statechart::result react(const RequestBackfill &evt);
2019 };
2020
2021 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2022 typedef boost::mpl::list <
2023 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2024 boost::statechart::transition< AllRemotesReserved, Recovering >
2025 > reactions;
2026 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2027 explicit WaitRemoteRecoveryReserved(my_context ctx);
2028 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2029 void exit();
2030 };
2031
2032 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2033 typedef boost::mpl::list <
2034 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2035 boost::statechart::custom_reaction< RecoveryTooFull >
2036 > reactions;
2037 explicit WaitLocalRecoveryReserved(my_context ctx);
2038 void exit();
2039 boost::statechart::result react(const RecoveryTooFull &evt);
2040 };
2041
2042 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2043 typedef boost::mpl::list <
2044 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2045 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2046 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2047 > reactions;
2048 explicit Activating(my_context ctx);
2049 void exit();
2050 };
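// Primary-side recovery reservations chain much like backfill: Activating
// --DoRecovery--> WaitLocalRecoveryReserved --LocalRecoveryReserved-->
// WaitRemoteRecoveryReserved --AllRemotesReserved--> Recovering, which then
// reacts to AllReplicasRecovered (on to Recovered) or RequestBackfill (into
// the backfill chain above); RecoveryTooFull and CancelRecovery back out
// through the custom reactions.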
2051
2052 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2053 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2054
2055 explicit Stray(my_context ctx);
2056 void exit();
2057
2058 typedef boost::mpl::list <
2059 boost::statechart::custom_reaction< MQuery >,
2060 boost::statechart::custom_reaction< MLogRec >,
2061 boost::statechart::custom_reaction< MInfoRec >,
2062 boost::statechart::custom_reaction< ActMap >,
2063 boost::statechart::custom_reaction< RecoveryDone >
2064 > reactions;
2065 boost::statechart::result react(const MQuery& query);
2066 boost::statechart::result react(const MLogRec& logevt);
2067 boost::statechart::result react(const MInfoRec& infoevt);
2068 boost::statechart::result react(const ActMap&);
2069 boost::statechart::result react(const RecoveryDone&) {
2070 return discard_event();
2071 }
2072 };
2073
2074 struct GetLog;
2075
2076 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2077 set<pg_shard_t> peer_info_requested;
2078
2079 explicit GetInfo(my_context ctx);
2080 void exit();
2081 void get_infos();
2082
2083 typedef boost::mpl::list <
2084 boost::statechart::custom_reaction< QueryState >,
2085 boost::statechart::transition< GotInfo, GetLog >,
2086 boost::statechart::custom_reaction< MNotifyRec >,
2087 boost::statechart::transition< IsDown, Down >
2088 > reactions;
2089 boost::statechart::result react(const QueryState& q);
2090 boost::statechart::result react(const MNotifyRec& infoevt);
2091 };
2092
2093 struct GotLog : boost::statechart::event< GotLog > {
2094 GotLog() : boost::statechart::event< GotLog >() {}
2095 };
2096
2097 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2098 pg_shard_t auth_log_shard;
2099 boost::intrusive_ptr<MOSDPGLog> msg;
2100
2101 explicit GetLog(my_context ctx);
2102 void exit();
2103
2104 typedef boost::mpl::list <
2105 boost::statechart::custom_reaction< QueryState >,
2106 boost::statechart::custom_reaction< MLogRec >,
2107 boost::statechart::custom_reaction< GotLog >,
2108 boost::statechart::custom_reaction< AdvMap >,
2109 boost::statechart::transition< IsIncomplete, Incomplete >
2110 > reactions;
2111 boost::statechart::result react(const AdvMap&);
2112 boost::statechart::result react(const QueryState& q);
2113 boost::statechart::result react(const MLogRec& logevt);
2114 boost::statechart::result react(const GotLog&);
2115 };
2116
2117 struct WaitUpThru;
2118
2119 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2120 set<pg_shard_t> peer_missing_requested;
2121
2122 explicit GetMissing(my_context ctx);
2123 void exit();
2124
2125 typedef boost::mpl::list <
2126 boost::statechart::custom_reaction< QueryState >,
2127 boost::statechart::custom_reaction< MLogRec >,
2128 boost::statechart::transition< NeedUpThru, WaitUpThru >
2129 > reactions;
2130 boost::statechart::result react(const QueryState& q);
2131 boost::statechart::result react(const MLogRec& logevt);
2132 };
2133
2134 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2135 explicit WaitUpThru(my_context ctx);
2136 void exit();
2137
2138 typedef boost::mpl::list <
2139 boost::statechart::custom_reaction< QueryState >,
2140 boost::statechart::custom_reaction< ActMap >,
2141 boost::statechart::custom_reaction< MLogRec >
2142 > reactions;
2143 boost::statechart::result react(const QueryState& q);
2144 boost::statechart::result react(const ActMap& am);
2145 boost::statechart::result react(const MLogRec& logrec);
2146 };
2147
2148 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2149 explicit Down(my_context ctx);
2150 typedef boost::mpl::list <
2151 boost::statechart::custom_reaction< QueryState >
2152 > reactions;
2153 boost::statechart::result react(const QueryState& infoevt);
2154 void exit();
2155 };
2156
2157 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2158 typedef boost::mpl::list <
2159 boost::statechart::custom_reaction< AdvMap >,
2160 boost::statechart::custom_reaction< MNotifyRec >,
2161 boost::statechart::custom_reaction< QueryState >
2162 > reactions;
2163 explicit Incomplete(my_context ctx);
2164 boost::statechart::result react(const AdvMap &advmap);
2165 boost::statechart::result react(const MNotifyRec& infoevt);
2166 boost::statechart::result react(const QueryState& infoevt);
2167 void exit();
2168 };
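// These Peering substates form the info-gathering pipeline: GetInfo collects
// pg_info_t from the prior set (peer_info_requested), GotInfo moves to
// GetLog, which pulls the authoritative log from auth_log_shard, and
// GetMissing then requests missing sets (peer_missing_requested).  NeedUpThru
// detours through WaitUpThru until an osdmap reflecting our up_thru arrives,
// while IsDown and IsIncomplete park the PG in Down and Incomplete.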
2169
2170
2171 RecoveryMachine machine;
2172 PG *pg;
2173
2174 /// context passed in by state machine caller
2175 RecoveryCtx *orig_ctx;
2176
2177 /// populated if we are buffering messages pending a flush
2178 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2179
2180 /**
2181 * populated between start_handle() and end_handle(); points into
2182 * the message lists of messages_pending_flush while messages are
2183 * being buffered, or into orig_ctx otherwise
2184 */
2185 boost::optional<RecoveryCtx> rctx;
2186
2187 public:
2188 explicit RecoveryState(PG *pg)
2189 : machine(this, pg), pg(pg), orig_ctx(0) {
2190 machine.initiate();
2191 }
2192
2193 void handle_event(const boost::statechart::event_base &evt,
2194 RecoveryCtx *rctx) {
2195 start_handle(rctx);
2196 machine.process_event(evt);
2197 end_handle();
2198 }
2199
2200 void handle_event(CephPeeringEvtRef evt,
2201 RecoveryCtx *rctx) {
2202 start_handle(rctx);
2203 machine.process_event(evt->get_event());
2204 end_handle();
2205 }
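// Both handle_event() overloads bracket process_event() with start_handle()
// and end_handle(): start_handle() installs the caller's RecoveryCtx as
// rctx (or, while an interval flush is pending, points it at
// messages_pending_flush so outgoing messages are buffered), and
// end_handle() tears that back down.  Callers are expected to hold the PG
// lock across the call.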
2206
2207 } recovery_state;
2208
2209
2210 public:
2211 PG(OSDService *o, OSDMapRef curmap,
2212 const PGPool &pool, spg_t p);
2213 ~PG() override;
2214
2215 private:
2216 // Prevent copying
2217 explicit PG(const PG& rhs);
2218 PG& operator=(const PG& rhs);
2219 const spg_t pg_id;
2220 uint64_t peer_features;
2221 uint64_t acting_features;
2222 uint64_t upacting_features;
2223
2224 epoch_t last_epoch;
2225
2226 public:
2227 const spg_t& get_pgid() const { return pg_id; }
2228
2229 void reset_min_peer_features() {
2230 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2231 }
2232 uint64_t get_min_peer_features() const { return peer_features; }
2233 void apply_peer_features(uint64_t f) { peer_features &= f; }
2234
2235 uint64_t get_min_acting_features() const { return acting_features; }
2236 uint64_t get_min_upacting_features() const { return upacting_features; }
2237 bool perform_deletes_during_peering() const {
2238 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2239 }
2240
2241 void init_primary_up_acting(
2242 const vector<int> &newup,
2243 const vector<int> &newacting,
2244 int new_up_primary,
2245 int new_acting_primary) {
2246 actingset.clear();
2247 acting = newacting;
2248 for (uint8_t i = 0; i < acting.size(); ++i) {
2249 if (acting[i] != CRUSH_ITEM_NONE)
2250 actingset.insert(
2251 pg_shard_t(
2252 acting[i],
2253 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2254 }
2255 upset.clear();
2256 up = newup;
2257 for (uint8_t i = 0; i < up.size(); ++i) {
2258 if (up[i] != CRUSH_ITEM_NONE)
2259 upset.insert(
2260 pg_shard_t(
2261 up[i],
2262 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2263 }
2264 if (!pool.info.ec_pool()) {
2265 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2266 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2267 return;
2268 }
2269 up_primary = pg_shard_t();
2270 primary = pg_shard_t();
2271 for (uint8_t i = 0; i < up.size(); ++i) {
2272 if (up[i] == new_up_primary) {
2273 up_primary = pg_shard_t(up[i], shard_id_t(i));
2274 break;
2275 }
2276 }
2277 for (uint8_t i = 0; i < acting.size(); ++i) {
2278 if (acting[i] == new_acting_primary) {
2279 primary = pg_shard_t(acting[i], shard_id_t(i));
2280 break;
2281 }
2282 }
2283 assert(up_primary.osd == new_up_primary);
2284 assert(primary.osd == new_acting_primary);
2285 }
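// A worked example of the mapping above, with illustrative values: for a
// replicated pool with newup = {3,1,2} and new_up_primary = 3, upset holds
// 3, 1 and 2 with shard_id_t::NO_SHARD and up_primary = (3, NO_SHARD).  For
// an EC pool with newacting = {4,7,2} and new_acting_primary = 7, the shard
// id is the vector position, so actingset holds (4,0), (7,1), (2,2) and
// primary = (7, shard 1).  CRUSH_ITEM_NONE slots are simply skipped.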
2286 pg_shard_t get_primary() const { return primary; }
2287
2288 int get_role() const { return role; }
2289 void set_role(int r) { role = r; }
2290
2291 bool is_primary() const { return pg_whoami == primary; }
2292 bool is_replica() const { return role > 0; }
2293
2294 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2295
2296 //int get_state() const { return state; }
2297 bool state_test(int m) const { return (state & m) != 0; }
2298 void state_set(int m) { state |= m; }
2299 void state_clear(int m) { state &= ~m; }
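// These are plain bit operations on the PG_STATE_* mask, e.g. (illustrative
// only):
//   state_set(PG_STATE_ACTIVE | PG_STATE_DEGRADED);
//   state_test(PG_STATE_ACTIVE);     // -> true
//   state_clear(PG_STATE_DEGRADED);  // ACTIVE bit stays set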
2300
2301 bool is_complete() const { return info.last_complete == info.last_update; }
2302 bool should_send_notify() const { return send_notify; }
2303
2304 int get_state() const { return state; }
2305 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2306 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2307 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2308 bool is_down() const { return state_test(PG_STATE_DOWN); }
2309 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2310 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2311 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2312 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2313
2314 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2315 bool is_peered() const {
2316 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2317 }
2318
2319 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2320
2321 void init(
2322 int role,
2323 const vector<int>& up,
2324 int up_primary,
2325 const vector<int>& acting,
2326 int acting_primary,
2327 const pg_history_t& history,
2328 const PastIntervals& pim,
2329 bool backfill,
2330 ObjectStore::Transaction *t);
2331
2332 // pg on-disk state
2333 void do_pending_flush();
2334
2335 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2336 static void _init(ObjectStore::Transaction& t,
2337 spg_t pgid, const pg_pool_t *pool);
2338
2339 private:
2340 void prepare_write_info(map<string,bufferlist> *km);
2341
2342 void update_store_with_options();
2343 void update_store_on_load();
2344
2345 public:
2346 static int _prepare_write_info(
2347 CephContext* cct,
2348 map<string,bufferlist> *km,
2349 epoch_t epoch,
2350 pg_info_t &info,
2351 pg_info_t &last_written_info,
2352 PastIntervals &past_intervals,
2353 bool dirty_big_info,
2354 bool dirty_epoch,
2355 bool try_fast_info,
2356 PerfCounters *logger = nullptr);
2357 void write_if_dirty(ObjectStore::Transaction& t);
2358
2359 PGLog::IndexedLog projected_log;
2360 bool check_in_progress_op(
2361 const osd_reqid_t &r,
2362 eversion_t *version,
2363 version_t *user_version,
2364 int *return_code) const;
2365 eversion_t projected_last_update;
2366 eversion_t get_next_version() const {
2367 eversion_t at_version(
2368 get_osdmap()->get_epoch(),
2369 projected_last_update.version+1);
2370 assert(at_version > info.last_update);
2371 assert(at_version > pg_log.get_head());
2372 assert(at_version > projected_last_update);
2373 return at_version;
2374 }
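// eversion_t orders by (epoch, version), so the next version is stamped
// with the current map epoch and projected_last_update.version + 1; e.g.
// with the osdmap at epoch 40 and projected_last_update = (38, 105) this
// returns (40, 106).  The asserts keep the result strictly ahead of
// info.last_update, the pg log head and the previous projection.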
2375
2376 void add_log_entry(const pg_log_entry_t& e, bool applied);
2377 void append_log(
2378 const vector<pg_log_entry_t>& logv,
2379 eversion_t trim_to,
2380 eversion_t roll_forward_to,
2381 ObjectStore::Transaction &t,
2382 bool transaction_applied = true);
2383 bool check_log_for_corruption(ObjectStore *store);
2384 void trim_log();
2385
2386 std::string get_corrupt_pg_log_name() const;
2387 static int read_info(
2388 ObjectStore *store, spg_t pgid, const coll_t &coll,
2389 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2390 __u8 &);
2391 void read_state(ObjectStore *store, bufferlist &bl);
2392 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2393 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2394 epoch_t *pepoch, bufferlist *bl);
2395 void update_snap_map(
2396 const vector<pg_log_entry_t> &log_entries,
2397 ObjectStore::Transaction& t);
2398
2399 void filter_snapc(vector<snapid_t> &snaps);
2400
2401 void log_weirdness();
2402
2403 virtual void kick_snap_trim() = 0;
2404 virtual void snap_trimmer_scrub_complete() = 0;
2405 bool requeue_scrub(bool high_priority = false);
2406 void queue_recovery();
2407 bool queue_scrub();
2408 unsigned get_scrub_priority();
2409
2410 /// share pg info after a pg is active
2411 void share_pg_info();
2412
2413
2414 bool append_log_entries_update_missing(
2415 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2416 ObjectStore::Transaction &t);
2417
2418 /**
2419 * Merge entries updating missing as necessary on all
2420 * actingbackfill logs and missings (also missing_loc)
2421 */
2422 void merge_new_log_entries(
2423 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2424 ObjectStore::Transaction &t);
2425
2426 void reset_interval_flush();
2427 void start_peering_interval(
2428 const OSDMapRef lastmap,
2429 const vector<int>& newup, int up_primary,
2430 const vector<int>& newacting, int acting_primary,
2431 ObjectStore::Transaction *t);
2432 void on_new_interval();
2433 virtual void _on_new_interval() = 0;
2434 void start_flush(ObjectStore::Transaction *t,
2435 list<Context *> *on_applied,
2436 list<Context *> *on_safe);
2437 void set_last_peering_reset();
2438 bool pg_has_reset_since(epoch_t e) {
2439 assert(is_locked());
2440 return deleting || e < get_last_peering_reset();
2441 }
2442
2443 void update_history(const pg_history_t& history);
2444 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2445 pair<pg_shard_t, pg_info_t> &notify_info);
2446 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2447
2448 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2449
2450 bool should_restart_peering(
2451 int newupprimary,
2452 int newactingprimary,
2453 const vector<int>& newup,
2454 const vector<int>& newacting,
2455 OSDMapRef lastmap,
2456 OSDMapRef osdmap);
2457
2458 // OpRequest queueing
2459 bool can_discard_op(OpRequestRef& op);
2460 bool can_discard_scan(OpRequestRef op);
2461 bool can_discard_backfill(OpRequestRef op);
2462 bool can_discard_request(OpRequestRef& op);
2463
2464 template<typename T, int MSGTYPE>
2465 bool can_discard_replica_op(OpRequestRef& op);
2466
2467 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2468 bool old_peering_evt(CephPeeringEvtRef evt) {
2469 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2470 }
2471 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2472 return e <= cur_epoch;
2473 }
2474 bool have_same_or_newer_map(epoch_t e) {
2475 return e <= get_osdmap()->get_epoch();
2476 }
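// old_peering_msg()/old_peering_evt() act as the staleness filter for
// peering traffic: messages whose send/request epochs predate the last
// peering reset belong to an earlier interval and can be dropped, while
// have_same_or_newer_map() is the cheap check that we already have at least
// the OSDMap epoch a message was stamped with.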
2477
2478 bool op_has_sufficient_caps(OpRequestRef& op);
2479
2480
2481 // recovery bits
2482 void take_waiters();
2483 void queue_peering_event(CephPeeringEvtRef evt);
2484 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2485 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2486 pg_shard_t from, const pg_query_t& q);
2487 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2488 void queue_flushed(epoch_t started_at);
2489 void handle_advance_map(
2490 OSDMapRef osdmap, OSDMapRef lastmap,
2491 vector<int>& newup, int up_primary,
2492 vector<int>& newacting, int acting_primary,
2493 RecoveryCtx *rctx);
2494 void handle_activate_map(RecoveryCtx *rctx);
2495 void handle_create(RecoveryCtx *rctx);
2496 void handle_loaded(RecoveryCtx *rctx);
2497 void handle_query_state(Formatter *f);
2498
2499 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2500
2501
2502 // abstract bits
2503 virtual void do_request(
2504 OpRequestRef& op,
2505 ThreadPool::TPHandle &handle
2506 ) = 0;
2507
2508 virtual void do_op(OpRequestRef& op) = 0;
2509 virtual void do_sub_op(OpRequestRef op) = 0;
2510 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2511 virtual void do_scan(
2512 OpRequestRef op,
2513 ThreadPool::TPHandle &handle
2514 ) = 0;
2515 virtual void do_backfill(OpRequestRef op) = 0;
2516 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2517
2518 virtual int do_command(
2519 cmdmap_t cmdmap,
2520 ostream& ss,
2521 bufferlist& idata,
2522 bufferlist& odata,
2523 ConnectionRef conn,
2524 ceph_tid_t tid) = 0;
2525
2526 virtual void on_role_change() = 0;
2527 virtual void on_pool_change() = 0;
2528 virtual void on_change(ObjectStore::Transaction *t) = 0;
2529 virtual void on_activate() = 0;
2530 virtual void on_flushed() = 0;
2531 virtual void on_shutdown() = 0;
2532 virtual void check_blacklisted_watchers() = 0;
2533 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2534
2535 virtual bool agent_work(int max) = 0;
2536 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2537 virtual void agent_stop() = 0;
2538 virtual void agent_delay() = 0;
2539 virtual void agent_clear() = 0;
2540 virtual void agent_choose_mode_restart() = 0;
2541 };
2542
2543 ostream& operator<<(ostream& out, const PG& pg);
2544
2545 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2546
2547 #endif