ceph/src/osd/PG.h (git blob e93549e9f83fc7f81f3556e407c33f86778e1c68)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46
47 #include <atomic>
48 #include <list>
49 #include <memory>
50 #include <stack>
51 #include <string>
52 #include <tuple>
53 using namespace std;
54
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
57
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
59
60 class OSD;
61 class OSDService;
62 class MOSDOp;
63 class MOSDPGScan;
64 class MOSDPGBackfill;
65 class MOSDPGInfo;
66
67 class PG;
68 struct OpRequest;
69 typedef OpRequest::Ref OpRequestRef;
70 class MOSDPGLog;
71 class CephContext;
72
73 namespace Scrub {
74 class Store;
75 }
76
77 void intrusive_ptr_add_ref(PG *pg);
78 void intrusive_ptr_release(PG *pg);
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 private:
125 bool pg_in_destructor = false;
126 PG* thispg = nullptr;
127 std::unique_ptr<PGStateInstance> tmppi;
128 PGStateInstance* pi = nullptr;
129 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
130
131 };
132
133 #ifdef PG_DEBUG_REFS
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG *pg);
136 void put_with_id(PG *pg, uint64_t id);
137 typedef TrackedIntPtr<PG> PGRef;
138 #else
139 typedef boost::intrusive_ptr<PG> PGRef;
140 #endif
141
142 class PGRecoveryStats {
143 struct per_state_info {
144 uint64_t enter, exit; // enter/exit counts
145 uint64_t events;
146 utime_t event_time; // time spent processing events
147 utime_t total_time; // total time in state
148 utime_t min_time, max_time;
149
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
152 };
153 map<const char *,per_state_info> info;
154 Mutex lock;
155
156 public:
157 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
158
159 void reset() {
160 Mutex::Locker l(lock);
161 info.clear();
162 }
163 void dump(ostream& out) {
164 Mutex::Locker l(lock);
165 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
166 per_state_info& i = p->second;
167 out << i.enter << "\t" << i.exit << "\t"
168 << i.events << "\t" << i.event_time << "\t"
169 << i.total_time << "\t"
170 << i.min_time << "\t" << i.max_time << "\t"
171 << p->first << "\n";
172 }
173 }
174
175 void dump_formatted(Formatter *f) {
176 Mutex::Locker l(lock);
177 f->open_array_section("pg_recovery_stats");
178 for (map<const char *,per_state_info>::iterator p = info.begin();
179 p != info.end(); ++p) {
180 per_state_info& i = p->second;
181 f->open_object_section("recovery_state");
182 f->dump_int("enter", i.enter);
183 f->dump_int("exit", i.exit);
184 f->dump_int("events", i.events);
185 f->dump_stream("event_time") << i.event_time;
186 f->dump_stream("total_time") << i.total_time;
187 f->dump_stream("min_time") << i.min_time;
188 f->dump_stream("max_time") << i.max_time;
189 vector<string> states;
190 get_str_vec(p->first, "/", states);
191 f->open_array_section("nested_states");
192 for (vector<string>::iterator st = states.begin();
193 st != states.end(); ++st) {
194 f->dump_string("state", *st);
195 }
196 f->close_section();
197 f->close_section();
198 }
199 f->close_section();
200 }
201
202 void log_enter(const char *s) {
203 Mutex::Locker l(lock);
204 info[s].enter++;
205 }
206 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
207 Mutex::Locker l(lock);
208 per_state_info &i = info[s];
209 i.exit++;
210 i.total_time += dur;
211 if (dur > i.max_time)
212 i.max_time = dur;
213 if (dur < i.min_time || i.min_time == utime_t())
214 i.min_time = dur;
215 i.events += events;
216 i.event_time += event_dur;
217 }
218 };
219
220 struct PGPool {
221 CephContext* cct;
222 epoch_t cached_epoch;
223 int64_t id;
224 string name;
225 uint64_t auid;
226
227 pg_pool_t info;
228 SnapContext snapc; // the default pool snapc, ready to go.
229
230 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
231 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
232
233 PGPool(CephContext* cct, OSDMapRef map, int64_t i)
234 : cct(cct),
235 cached_epoch(map->get_epoch()),
236 id(i),
237 name(map->get_pool_name(id)),
238 auid(map->get_pg_pool(id)->auid) {
239 const pg_pool_t *pi = map->get_pg_pool(id);
240 assert(pi);
241 info = *pi;
242 snapc = pi->get_snap_context();
243 pi->build_removed_snaps(cached_removed_snaps);
244 }
245
246 void update(OSDMapRef map);
247 };
248
249 /** PG - Replica Placement Group
250 *
251 */
252
253 class PG : public DoutPrefixProvider {
254 protected:
255 OSDService *osd;
256 CephContext *cct;
257 OSDriver osdriver;
258 SnapMapper snap_mapper;
259
260 virtual PGBackend *get_pgbackend() = 0;
261 public:
262 std::string gen_prefix() const override;
263 CephContext *get_cct() const override { return cct; }
264 unsigned get_subsys() const override { return ceph_subsys_osd; }
265
266 /*** PG ****/
267 void update_snap_mapper_bits(uint32_t bits) {
268 snap_mapper.update_bits(bits);
269 }
270 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
271 IsPGRecoverablePredicate *get_is_recoverable_predicate() {
272 return get_pgbackend()->get_is_recoverable_predicate();
273 }
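  // Editor's illustration (not part of the original header): because the caller
  // owns the returned predicate, a call site would typically wrap it, e.g.
  //   std::unique_ptr<IsPGRecoverablePredicate> pred(
  //     pg->get_is_recoverable_predicate());
  //   bool recoverable = (*pred)(have_shards);  // 'have_shards' is a hypothetical
  //                                             // set<pg_shard_t> of live shards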
274 protected:
275 OSDMapRef osdmap_ref;
276 OSDMapRef last_persisted_osdmap_ref;
277 PGPool pool;
278
279 void requeue_map_waiters();
280
281 void update_osdmap_ref(OSDMapRef newmap) {
282 assert(_lock.is_locked_by_me());
283 osdmap_ref = std::move(newmap);
284 }
285
286 public:
287 OSDMapRef get_osdmap() const {
288 assert(is_locked());
289 assert(osdmap_ref);
290 return osdmap_ref;
291 }
292 protected:
293
294 /** locking and reference counting.
295 * I destroy myself when the reference count hits zero.
296 * lock() should be called before doing anything.
297 * get() should be called on pointer copy (to another thread, etc.).
298 * put() should be called on destruction of some previously copied pointer.
299 * unlock() when done with the current pointer (_most common_).
300 */
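  /* Editor's sketch (not in the original source): the intended usage pattern is
   * roughly
   *   pg->get("tag");   // take a ref before copying the pointer to another thread
   *   pg->lock();       // lock before touching PG state
   *   ...               // read/modify the PG
   *   pg->unlock();     // drop the lock when done with this pointer
   *   pg->put("tag");   // release the ref when the copied pointer is destroyed
   */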
301 mutable Mutex _lock;
302 std::atomic_uint ref{0};
303
304 #ifdef PG_DEBUG_REFS
305 Mutex _ref_id_lock;
306 map<uint64_t, string> _live_ids;
307 map<string, uint64_t> _tag_counts;
308 uint64_t _ref_id;
309 #endif
310
311 public:
312 bool deleting; // true while in removing or OSD is shutting down
313
314 ZTracer::Endpoint trace_endpoint;
315
316 void lock_suspend_timeout(ThreadPool::TPHandle &handle);
317 void lock(bool no_lockdep = false) const;
318 void unlock() const {
319 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
320 assert(!dirty_info);
321 assert(!dirty_big_info);
322 _lock.Unlock();
323 }
324
325 bool is_locked() const {
326 return _lock.is_locked();
327 }
328
329 #ifdef PG_DEBUG_REFS
330 uint64_t get_with_id();
331 void put_with_id(uint64_t);
332 void dump_live_ids();
333 #endif
334 void get(const char* tag);
335 void put(const char* tag);
336
337 bool dirty_info, dirty_big_info;
338
339 public:
340 bool is_ec_pg() const {
341 return pool.info.ec_pool();
342 }
343 // pg state
344 pg_info_t info; ///< current pg info
345 pg_info_t last_written_info; ///< last written info
346 __u8 info_struct_v;
347 static const __u8 cur_struct_v = 10;
348 // v10 is the new past_intervals encoding
349 // v9 was fastinfo_key addition
350 // v8 was the move to a per-pg pgmeta object
351 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
352 // (first appeared in cuttlefish).
353 static const __u8 compat_struct_v = 7;
354 bool must_upgrade() {
355 return info_struct_v < cur_struct_v;
356 }
357 bool can_upgrade() {
358 return info_struct_v >= compat_struct_v;
359 }
360 void upgrade(ObjectStore *store);
361
362 const coll_t coll;
363 ObjectStore::CollectionHandle ch;
364 PGLog pg_log;
365 static string get_info_key(spg_t pgid) {
366 return stringify(pgid) + "_info";
367 }
368 static string get_biginfo_key(spg_t pgid) {
369 return stringify(pgid) + "_biginfo";
370 }
371 static string get_epoch_key(spg_t pgid) {
372 return stringify(pgid) + "_epoch";
373 }
374 ghobject_t pgmeta_oid;
375
376 class MissingLoc {
377 map<hobject_t, pg_missing_item> needs_recovery_map;
378 map<hobject_t, set<pg_shard_t> > missing_loc;
379 set<pg_shard_t> missing_loc_sources;
380 PG *pg;
381 set<pg_shard_t> empty_set;
382 public:
383 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
384 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
385 explicit MissingLoc(PG *pg)
386 : pg(pg) {}
387 void set_backend_predicates(
388 IsPGReadablePredicate *_is_readable,
389 IsPGRecoverablePredicate *_is_recoverable) {
390 is_readable.reset(_is_readable);
391 is_recoverable.reset(_is_recoverable);
392 }
393 string gen_prefix() const { return pg->gen_prefix(); }
394 bool needs_recovery(
395 const hobject_t &hoid,
396 eversion_t *v = 0) const {
397 map<hobject_t, pg_missing_item>::const_iterator i =
398 needs_recovery_map.find(hoid);
399 if (i == needs_recovery_map.end())
400 return false;
401 if (v)
402 *v = i->second.need;
403 return true;
404 }
405 bool is_unfound(const hobject_t &hoid) const {
406 return needs_recovery(hoid) && (
407 !missing_loc.count(hoid) ||
408 !(*is_recoverable)(missing_loc.find(hoid)->second));
409 }
410 bool readable_with_acting(
411 const hobject_t &hoid,
412 const set<pg_shard_t> &acting) const;
413 uint64_t num_unfound() const {
414 uint64_t ret = 0;
415 for (map<hobject_t, pg_missing_item>::const_iterator i =
416 needs_recovery_map.begin();
417 i != needs_recovery_map.end();
418 ++i) {
419 if (is_unfound(i->first))
420 ++ret;
421 }
422 return ret;
423 }
424
425 bool have_unfound() const {
426 for (map<hobject_t, pg_missing_item>::const_iterator i =
427 needs_recovery_map.begin();
428 i != needs_recovery_map.end();
429 ++i) {
430 if (is_unfound(i->first))
431 return true;
432 }
433 return false;
434 }
435 void clear() {
436 needs_recovery_map.clear();
437 missing_loc.clear();
438 missing_loc_sources.clear();
439 }
440
441 void add_location(const hobject_t &hoid, pg_shard_t location) {
442 missing_loc[hoid].insert(location);
443 }
444 void remove_location(const hobject_t &hoid, pg_shard_t location) {
445 missing_loc[hoid].erase(location);
446 }
447 void add_active_missing(const pg_missing_t &missing) {
448 for (map<hobject_t, pg_missing_item>::const_iterator i =
449 missing.get_items().begin();
450 i != missing.get_items().end();
451 ++i) {
452 map<hobject_t, pg_missing_item>::const_iterator j =
453 needs_recovery_map.find(i->first);
454 if (j == needs_recovery_map.end()) {
455 needs_recovery_map.insert(*i);
456 } else {
457 assert(i->second.need == j->second.need);
458 }
459 }
460 }
461
462 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
463 needs_recovery_map[hoid] = pg_missing_item(need, have);
464 }
465 void revise_need(const hobject_t &hoid, eversion_t need) {
466 assert(needs_recovery(hoid));
467 needs_recovery_map[hoid].need = need;
468 }
469
470 /// Adds info about a possible recovery source
471 bool add_source_info(
472 pg_shard_t source, ///< [in] source
473 const pg_info_t &oinfo, ///< [in] info
474 const pg_missing_t &omissing, ///< [in] (optional) missing
475 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
476 ); ///< @return whether a new object location was discovered
477
478 /// Adds recovery sources in batch
479 void add_batch_sources_info(
480 const set<pg_shard_t> &sources, ///< [in] a set of sources which can be used for all objects
481 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
482 );
483
484 /// Uses osdmap to update structures for now down sources
485 void check_recovery_sources(const OSDMapRef& osdmap);
486
487 /// Call when hoid is no longer missing in acting set
488 void recovered(const hobject_t &hoid) {
489 needs_recovery_map.erase(hoid);
490 missing_loc.erase(hoid);
491 }
492
493 /// Call to update structures for hoid after a change
494 void rebuild(
495 const hobject_t &hoid,
496 pg_shard_t self,
497 const set<pg_shard_t> to_recover,
498 const pg_info_t &info,
499 const pg_missing_t &missing,
500 const map<pg_shard_t, pg_missing_t> &pmissing,
501 const map<pg_shard_t, pg_info_t> &pinfo) {
502 recovered(hoid);
503 boost::optional<pg_missing_item> item;
504 auto miter = missing.get_items().find(hoid);
505 if (miter != missing.get_items().end()) {
506 item = miter->second;
507 } else {
508 for (auto &&i: to_recover) {
509 if (i == self)
510 continue;
511 auto pmiter = pmissing.find(i);
512 assert(pmiter != pmissing.end());
513 miter = pmiter->second.get_items().find(hoid);
514 if (miter != pmiter->second.get_items().end()) {
515 item = miter->second;
516 break;
517 }
518 }
519 }
520 if (!item)
521 return; // recovered!
522
523 needs_recovery_map[hoid] = *item;
524 auto mliter =
525 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
526 assert(info.last_backfill.is_max());
527 assert(info.last_update >= item->need);
528 if (!missing.is_missing(hoid))
529 mliter->second.insert(self);
530 for (auto &&i: pmissing) {
531 auto pinfoiter = pinfo.find(i.first);
532 assert(pinfoiter != pinfo.end());
533 if (item->need <= pinfoiter->second.last_update &&
534 hoid <= pinfoiter->second.last_backfill &&
535 !i.second.is_missing(hoid))
536 mliter->second.insert(i.first);
537 }
538 }
539
540 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
541 return missing_loc.count(hoid) ?
542 missing_loc.find(hoid)->second : empty_set;
543 }
544 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
545 return missing_loc;
546 }
547 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
548 return needs_recovery_map;
549 }
550 } missing_loc;
551
552 PastIntervals past_intervals;
553
554 interval_set<snapid_t> snap_trimq;
555
556 /* You should not use these items without taking their respective queue locks
557 * (if they have one) */
558 xlist<PG*>::item stat_queue_item;
559 bool scrub_queued;
560 bool recovery_queued;
561
562 int recovery_ops_active;
563 set<pg_shard_t> waiting_on_backfill;
564 #ifdef DEBUG_RECOVERY_OIDS
565 set<hobject_t> recovering_oids;
566 #endif
567
568 protected:
569 int role; // 0 = primary, 1 = replica, -1=none.
570 unsigned state; // PG_STATE_*
571
572 bool send_notify; ///< true if we are non-primary and should notify the primary
573
574 public:
575 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
576 eversion_t last_complete_ondisk; // last_complete that has committed.
577 eversion_t last_update_applied;
578
579
580 struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
581 PGRef pg;
582 epoch_t e;
583 eversion_t v;
584 C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
585 : pg(pg), e(e), v(v) {}
586 void finish(int) override {
587 pg->lock();
588 if (!pg->pg_has_reset_since(e)) {
589 pg->last_rollback_info_trimmed_to_applied = v;
590 }
591 pg->unlock();
592 }
593 };
594 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
595 // and the transaction has applied
596 eversion_t last_rollback_info_trimmed_to_applied;
597
598 // primary state
599 public:
600 pg_shard_t primary;
601 pg_shard_t pg_whoami;
602 pg_shard_t up_primary;
603 vector<int> up, acting, want_acting;
604 set<pg_shard_t> actingbackfill, actingset, upset;
605 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
606 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
607 eversion_t pg_trim_to;
608
609 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
610
611 // [primary only] content recovery state
612
613 public:
614 struct BufferedRecoveryMessages {
615 map<int, map<spg_t, pg_query_t> > query_map;
616 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
617 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
618 };
619
620 struct RecoveryCtx {
621 utime_t start_time;
622 map<int, map<spg_t, pg_query_t> > *query_map;
623 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
624 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
625 set<PGRef> created_pgs;
626 C_Contexts *on_applied;
627 C_Contexts *on_safe;
628 ObjectStore::Transaction *transaction;
629 ThreadPool::TPHandle* handle;
630 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
631 map<int,
632 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
633 map<int,
634 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
635 C_Contexts *on_applied,
636 C_Contexts *on_safe,
637 ObjectStore::Transaction *transaction)
638 : query_map(query_map), info_map(info_map),
639 notify_list(notify_list),
640 on_applied(on_applied),
641 on_safe(on_safe),
642 transaction(transaction),
643 handle(NULL) {}
644
645 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
646 : query_map(&(buf.query_map)),
647 info_map(&(buf.info_map)),
648 notify_list(&(buf.notify_list)),
649 on_applied(rctx.on_applied),
650 on_safe(rctx.on_safe),
651 transaction(rctx.transaction),
652 handle(rctx.handle) {}
653
654 void accept_buffered_messages(BufferedRecoveryMessages &m) {
655 assert(query_map);
656 assert(info_map);
657 assert(notify_list);
658 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
659 i != m.query_map.end();
660 ++i) {
661 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
662 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
663 j != i->second.end();
664 ++j) {
665 omap[j->first] = j->second;
666 }
667 }
668 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
669 = m.info_map.begin();
670 i != m.info_map.end();
671 ++i) {
672 vector<pair<pg_notify_t, PastIntervals> > &ovec =
673 (*info_map)[i->first];
674 ovec.reserve(ovec.size() + i->second.size());
675 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
676 }
677 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
678 = m.notify_list.begin();
679 i != m.notify_list.end();
680 ++i) {
681 vector<pair<pg_notify_t, PastIntervals> > &ovec =
682 (*notify_list)[i->first];
683 ovec.reserve(ovec.size() + i->second.size());
684 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
685 }
686 }
687 };
688
689
690 PGStateHistory pgstate_history;
691
692 struct NamedState {
693 const char *state_name;
694 utime_t enter_time;
695 PG* pg;
696 const char *get_state_name() { return state_name; }
697 NamedState(PG *pg_, const char *state_name_)
698 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
699 pg->pgstate_history.enter(pg, enter_time, state_name);
700 }
701 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
702 };
703
704
705
706 protected:
707
708 /*
709 * peer_info -- projected (updates _before_ replicas ack)
710 * peer_missing -- committed (updates _after_ replicas ack)
711 */
712
713 bool need_up_thru;
714 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
715 eversion_t oldest_update; // acting: lowest (valid) last_update in active set
716 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
717 set<pg_shard_t> peer_purged; // peers purged
718 map<pg_shard_t, pg_missing_t> peer_missing;
719 set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
720 set<pg_shard_t> peer_missing_requested;
721
722 // i deleted these strays; ignore racing PGInfo from them
723 set<pg_shard_t> peer_activated;
724
725 // primary-only, recovery-only state
726 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
727 // which are unfound on the primary
728 epoch_t last_peering_reset;
729
730
731 /* heartbeat peers */
732 void set_probe_targets(const set<pg_shard_t> &probe_set);
733 void clear_probe_targets();
734 public:
735 Mutex heartbeat_peer_lock;
736 set<int> heartbeat_peers;
737 set<int> probe_targets;
738
739 /**
740 * BackfillInterval
741 *
742 * Represents the objects in a range [begin, end)
743 *
744 * Possible states:
745 * 1) begin == end == hobject_t() indicates the interval is unpopulated
746 * 2) Else, objects contains all objects in [begin, end)
747 */
748 struct BackfillInterval {
749 // info about a backfill interval on a peer
750 eversion_t version; /// version at which the scan occurred
751 map<hobject_t,eversion_t> objects;
752 hobject_t begin;
753 hobject_t end;
754
755 /// clear content
756 void clear() {
757 *this = BackfillInterval();
758 }
759
760 /// clear objects list only
761 void clear_objects() {
762 objects.clear();
763 }
764
765 /// reinstantiate with a new start+end position and sort order
766 void reset(hobject_t start) {
767 clear();
768 begin = end = start;
769 }
770
771 /// true if there are no objects in this interval
772 bool empty() const {
773 return objects.empty();
774 }
775
776 /// true if interval extends to the end of the range
777 bool extends_to_end() const {
778 return end.is_max();
779 }
780
781 /// removes items <= soid and adjusts begin to the first object
782 void trim_to(const hobject_t &soid) {
783 trim();
784 while (!objects.empty() &&
785 objects.begin()->first <= soid) {
786 pop_front();
787 }
788 }
789
790 /// Adjusts begin to the first object
791 void trim() {
792 if (!objects.empty())
793 begin = objects.begin()->first;
794 else
795 begin = end;
796 }
797
798 /// drop first entry, and adjust @begin accordingly
799 void pop_front() {
800 assert(!objects.empty());
801 objects.erase(objects.begin());
802 trim();
803 }
804
805 /// dump
806 void dump(Formatter *f) const {
807 f->dump_stream("begin") << begin;
808 f->dump_stream("end") << end;
809 f->open_array_section("objects");
810 for (map<hobject_t, eversion_t>::const_iterator i =
811 objects.begin();
812 i != objects.end();
813 ++i) {
814 f->open_object_section("object");
815 f->dump_stream("object") << i->first;
816 f->dump_stream("version") << i->second;
817 f->close_section();
818 }
819 f->close_section();
820 }
821 };
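  // Editor's note (illustrative only, not in the original source): an interval is
  // typically repositioned and then consumed from the front, e.g.
  //   bi.reset(scan_start);     // 'scan_start' is a hypothetical hobject_t cursor
  //   ...                       // a scan fills bi.objects and advances bi.end
  //   bi.trim_to(last_sent);    // 'last_sent' hypothetical; drops entries <= it
  //   while (!bi.empty())
  //     bi.pop_front();         // each pop advances bi.begin to the next object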
822
823 protected:
824 BackfillInterval backfill_info;
825 map<pg_shard_t, BackfillInterval> peer_backfill_info;
826 bool backfill_reserved;
827 bool backfill_reserving;
828
829 friend class OSD;
830
831 public:
832 set<pg_shard_t> backfill_targets;
833
834 bool is_backfill_targets(pg_shard_t osd) {
835 return backfill_targets.count(osd);
836 }
837
838 protected:
839
840 /*
841 * blocked request wait hierarchy
842 *
843 * In order to preserve request ordering we need to be careful about the
844 * order in which blocked requests get requeued. Generally speaking, we
845 * push the requests back up to the op_wq in reverse order (most recent
846 * request first) so that they come back out again in the original order.
847 * However, because there are multiple wait queues, we need to requeue
848 * waitlists in order. Generally speaking, we requeue the wait lists
849 * that are checked first.
850 *
851 * Here are the various wait lists, in the order they are used during
852 * request processing, with notes:
853 *
854 * - waiting_for_map
855 * - may start or stop blocking at any time (depending on client epoch)
856 * - waiting_for_peered
857 * - !is_peered() or flushes_in_progress
858 * - only starts blocking on interval change; never restarts
859 * - waiting_for_active
860 * - !is_active()
861 * - only starts blocking on interval change; never restarts
862 * - waiting_for_scrub
863 * - starts and stops blocking for varying intervals during scrub
864 * - waiting_for_unreadable_object
865 * - never restarts once object is readable (* except for EIO?)
866 * - waiting_for_degraded_object
867 * - never restarts once object is writeable (* except for EIO?)
868 * - waiting_for_blocked_object
869 * - starts and stops based on proxied op activity
870 * - obc rwlocks
871 * - starts and stops based on read/write activity
872 *
873 * Notes:
874 *
875 * 1. During an interval change, we requeue *everything* in the above order.
876 *
877 * 2. When an obc rwlock is released, we check for a scrub block and requeue
878 * the op there if it applies. We ignore the unreadable/degraded/blocked
879 * queues because we assume they cannot apply at that time (this is
880 * probably mostly true).
881 *
882 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
883 * it is non-empty.
884 *
885 * These three behaviors are generally sufficient to maintain ordering, with
886 * the possible exception of cases where we make an object degraded or
887 * unreadable that was previously okay, e.g. when scrub or op processing
888 * encounter an unexpected error. FIXME.
889 */
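  /* Editor's sketch (illustrative, not part of the original source): conceptually,
   * the interval-change requeue drains each wait list back into the op_wq, most
   * recent op first within each list, so the ops re-emerge in their original
   * arrival order, e.g.
   *   requeue_ops(waiting_for_scrub);
   *   requeue_ops(waiting_for_active);
   *   requeue_ops(waiting_for_peered);
   *   requeue_map_waiters();
   * (the exact call order lives in the interval-change path, not here)
   */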
890
891 // pg waiters
892 unsigned flushes_in_progress;
893
894 // ops with newer maps than our (or blocked behind them)
895 // track these by client, since inter-request ordering doesn't otherwise
896 // matter.
897 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
898
899 // ops waiting on peered
900 list<OpRequestRef> waiting_for_peered;
901
902 // ops waiting on active (require peered as well)
903 list<OpRequestRef> waiting_for_active;
904 list<OpRequestRef> waiting_for_scrub;
905
906 list<OpRequestRef> waiting_for_cache_not_full;
907 list<OpRequestRef> waiting_for_all_missing;
908 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
909 waiting_for_degraded_object,
910 waiting_for_blocked_object;
911
912 set<hobject_t> objects_blocked_on_cache_full;
913 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
914 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
915
916 // Callbacks should assume pg (and nothing else) is locked
917 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
918
919 map<eversion_t,
920 list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
921
922 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
923 void requeue_op(OpRequestRef op);
924 void requeue_ops(list<OpRequestRef> &l);
925
926 // stats that persist lazily
927 object_stat_collection_t unstable_stats;
928
929 // publish stats
930 Mutex pg_stats_publish_lock;
931 bool pg_stats_publish_valid;
932 pg_stat_t pg_stats_publish;
933
934 // for ordering writes
935 ceph::shared_ptr<ObjectStore::Sequencer> osr;
936
937 void _update_calc_stats();
938 void _update_blocked_by();
939 void publish_stats_to_osd();
940 void clear_publish_stats();
941
942 public:
943 void clear_primary_state();
944
945 bool is_actingbackfill(pg_shard_t osd) const {
946 return actingbackfill.count(osd);
947 }
948 bool is_acting(pg_shard_t osd) const {
949 return has_shard(pool.info.ec_pool(), acting, osd);
950 }
951 bool is_up(pg_shard_t osd) const {
952 return has_shard(pool.info.ec_pool(), up, osd);
953 }
954 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
955 if (ec) {
956 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
957 } else {
958 return std::find(v.begin(), v.end(), osd.osd) != v.end();
959 }
960 }
961
962 bool needs_recovery() const;
963 bool needs_backfill() const;
964
965 /// get log recovery reservation priority
966 unsigned get_recovery_priority();
967 /// get backfill reservation priority
968 unsigned get_backfill_priority();
969
970 void mark_clean(); ///< mark an active pg clean
971
972 /// return [start,end) bounds for required past_intervals
973 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
974 const pg_info_t &info,
975 epoch_t oldest_map) {
976 epoch_t start = MAX(
977 info.history.last_epoch_clean ? info.history.last_epoch_clean :
978 info.history.epoch_pool_created,
979 oldest_map);
980 epoch_t end = MAX(
981 info.history.same_interval_since,
982 info.history.epoch_pool_created);
983 return make_pair(start, end);
984 }
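  // Worked example (editor's illustration, not in the original source): with
  // last_epoch_clean = 550, epoch_pool_created = 200, oldest_map = 600 and
  // same_interval_since = 700, start = MAX(550, 600) = 600 and
  // end = MAX(700, 200) = 700, i.e. the required bounds are [600, 700).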
985 void check_past_interval_bounds() const;
986 PastIntervals::PriorSet build_prior();
987
988 void remove_down_peer_info(const OSDMapRef osdmap);
989
990 bool adjust_need_up_thru(const OSDMapRef osdmap);
991
992 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
993 virtual void dump_recovery_info(Formatter *f) const = 0;
994
995 bool calc_min_last_complete_ondisk() {
996 eversion_t min = last_complete_ondisk;
997 assert(!actingbackfill.empty());
998 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
999 i != actingbackfill.end();
1000 ++i) {
1001 if (*i == get_primary()) continue;
1002 if (peer_last_complete_ondisk.count(*i) == 0)
1003 return false; // we don't have complete info
1004 eversion_t a = peer_last_complete_ondisk[*i];
1005 if (a < min)
1006 min = a;
1007 }
1008 if (min == min_last_complete_ondisk)
1009 return false;
1010 min_last_complete_ondisk = min;
1011 return true;
1012 }
1013
1014 virtual void calc_trim_to() = 0;
1015
1016 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1017 pg_missing_t& omissing, pg_shard_t from);
1018 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1019 pg_missing_t& omissing, pg_shard_t from);
1020 bool proc_replica_info(
1021 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1022
1023 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1024 PG *pg;
1025 ObjectStore::Transaction *t;
1026 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1027
1028 // LogEntryHandler
1029 void remove(const hobject_t &hoid) override {
1030 pg->get_pgbackend()->remove(hoid, t);
1031 }
1032 void try_stash(const hobject_t &hoid, version_t v) override {
1033 pg->get_pgbackend()->try_stash(hoid, v, t);
1034 }
1035 void rollback(const pg_log_entry_t &entry) override {
1036 assert(entry.can_rollback());
1037 pg->get_pgbackend()->rollback(entry, t);
1038 }
1039 void rollforward(const pg_log_entry_t &entry) override {
1040 pg->get_pgbackend()->rollforward(entry, t);
1041 }
1042 void trim(const pg_log_entry_t &entry) override {
1043 pg->get_pgbackend()->trim(entry, t);
1044 }
1045 };
1046
1047 void update_object_snap_mapping(
1048 ObjectStore::Transaction *t, const hobject_t &soid,
1049 const set<snapid_t> &snaps);
1050 void clear_object_snap_mapping(
1051 ObjectStore::Transaction *t, const hobject_t &soid);
1052 void remove_snap_mapped_object(
1053 ObjectStore::Transaction& t, const hobject_t& soid);
1054 void merge_log(
1055 ObjectStore::Transaction& t, pg_info_t &oinfo,
1056 pg_log_t &olog, pg_shard_t from);
1057 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1058 bool search_for_missing(
1059 const pg_info_t &oinfo, const pg_missing_t &omissing,
1060 pg_shard_t fromosd,
1061 RecoveryCtx*);
1062
1063 void check_for_lost_objects();
1064 void forget_lost_objects();
1065
1066 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1067
1068 void trim_write_ahead();
1069
1070 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1071 const map<pg_shard_t, pg_info_t> &infos,
1072 bool restrict_to_up_acting,
1073 bool *history_les_bound) const;
1074 static void calc_ec_acting(
1075 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1076 unsigned size,
1077 const vector<int> &acting,
1078 pg_shard_t acting_primary,
1079 const vector<int> &up,
1080 pg_shard_t up_primary,
1081 const map<pg_shard_t, pg_info_t> &all_info,
1082 bool restrict_to_up_acting,
1083 vector<int> *want,
1084 set<pg_shard_t> *backfill,
1085 set<pg_shard_t> *acting_backfill,
1086 pg_shard_t *want_primary,
1087 ostream &ss);
1088 static void calc_replicated_acting(
1089 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1090 unsigned size,
1091 const vector<int> &acting,
1092 pg_shard_t acting_primary,
1093 const vector<int> &up,
1094 pg_shard_t up_primary,
1095 const map<pg_shard_t, pg_info_t> &all_info,
1096 bool restrict_to_up_acting,
1097 vector<int> *want,
1098 set<pg_shard_t> *backfill,
1099 set<pg_shard_t> *acting_backfill,
1100 pg_shard_t *want_primary,
1101 ostream &ss);
1102 bool choose_acting(pg_shard_t &auth_log_shard,
1103 bool restrict_to_up_acting,
1104 bool *history_les_bound);
1105 void build_might_have_unfound();
1106 void activate(
1107 ObjectStore::Transaction& t,
1108 epoch_t activation_epoch,
1109 list<Context*>& tfin,
1110 map<int, map<spg_t,pg_query_t> >& query_map,
1111 map<int,
1112 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1113 RecoveryCtx *ctx);
1114 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1115 void all_activated_and_committed();
1116
1117 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1118
1119 bool have_unfound() const {
1120 return missing_loc.have_unfound();
1121 }
1122 uint64_t get_num_unfound() const {
1123 return missing_loc.num_unfound();
1124 }
1125
1126 virtual void check_local() = 0;
1127
1128 /**
1129 * @param ops_begun returns how many recovery ops the function started
1130 * @returns true if any useful work was accomplished; false otherwise
1131 */
1132 virtual bool start_recovery_ops(
1133 uint64_t max,
1134 ThreadPool::TPHandle &handle,
1135 uint64_t *ops_begun) = 0;
1136
1137 void purge_strays();
1138
1139 void update_heartbeat_peers();
1140
1141 Context *finish_sync_event;
1142
1143 void finish_recovery(list<Context*>& tfin);
1144 void _finish_recovery(Context *c);
1145 void cancel_recovery();
1146 void clear_recovery_state();
1147 virtual void _clear_recovery_state() = 0;
1148 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1149 void start_recovery_op(const hobject_t& soid);
1150 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1151
1152 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
1153 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1154
1155 friend class C_OSD_RepModify_Commit;
1156
1157 // -- backoff --
1158 Mutex backoff_lock; // orders inside Backoff::lock
1159 map<hobject_t,set<BackoffRef>> backoffs;
1160
1161 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1162 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1163 void release_backoffs(const hobject_t& o) {
1164 release_backoffs(o, o);
1165 }
1166 void clear_backoffs();
1167
1168 void add_pg_backoff(SessionRef s) {
1169 hobject_t begin = info.pgid.pgid.get_hobj_start();
1170 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1171 add_backoff(s, begin, end);
1172 }
1173 void release_pg_backoffs() {
1174 hobject_t begin = info.pgid.pgid.get_hobj_start();
1175 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1176 release_backoffs(begin, end);
1177 }
1178
1179 void rm_backoff(BackoffRef b);
1180
1181 // -- scrub --
1182 struct Scrubber {
1183 Scrubber();
1184 ~Scrubber();
1185
1186 // metadata
1187 set<pg_shard_t> reserved_peers;
1188 bool reserved, reserve_failed;
1189 epoch_t epoch_start;
1190
1191 // common to both scrubs
1192 bool active;
1193 bool queue_snap_trim;
1194 int waiting_on;
1195 set<pg_shard_t> waiting_on_whom;
1196 int shallow_errors;
1197 int deep_errors;
1198 int fixed;
1199 ScrubMap primary_scrubmap;
1200 map<pg_shard_t, ScrubMap> received_maps;
1201 OpRequestRef active_rep_scrub;
1202 utime_t scrub_reg_stamp; // stamp we registered for
1203
1204 // For async sleep
1205 bool sleeping = false;
1206 bool needs_sleep = true;
1207 utime_t sleep_start;
1208
1209 // flags to indicate explicitly requested scrubs (by admin)
1210 bool must_scrub, must_deep_scrub, must_repair;
1211
1212 // Priority to use for scrub scheduling
1213 unsigned priority;
1214
1215 // this flag indicates whether we would like to do auto-repair of the PG or not
1216 bool auto_repair;
1217
1218 // Maps from objects with errors to missing/inconsistent peers
1219 map<hobject_t, set<pg_shard_t>> missing;
1220 map<hobject_t, set<pg_shard_t>> inconsistent;
1221
1222 // Map from object with errors to good peers
1223 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1224
1225 // Cleaned map pending snap metadata scrub
1226 ScrubMap cleaned_meta_map;
1227
1228 // digest updates which we are waiting on
1229 int num_digest_updates_pending;
1230
1231 // chunky scrub
1232 hobject_t start, end;
1233 eversion_t subset_last_update;
1234
1235 // chunky scrub state
1236 enum State {
1237 INACTIVE,
1238 NEW_CHUNK,
1239 WAIT_PUSHES,
1240 WAIT_LAST_UPDATE,
1241 BUILD_MAP,
1242 WAIT_REPLICAS,
1243 COMPARE_MAPS,
1244 WAIT_DIGEST_UPDATES,
1245 FINISH,
1246 } state;
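  // Editor's note (illustrative, not part of the original source): a chunky scrub
  // normally cycles NEW_CHUNK -> WAIT_PUSHES -> WAIT_LAST_UPDATE -> BUILD_MAP ->
  // WAIT_REPLICAS -> COMPARE_MAPS -> WAIT_DIGEST_UPDATES for each [start, end)
  // chunk, returning to NEW_CHUNK until the whole PG is covered, then FINISH.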
1247
1248 std::unique_ptr<Scrub::Store> store;
1249 // deep scrub
1250 bool deep;
1251 uint32_t seed;
1252
1253 list<Context*> callbacks;
1254 void add_callback(Context *context) {
1255 callbacks.push_back(context);
1256 }
1257 void run_callbacks() {
1258 list<Context*> to_run;
1259 to_run.swap(callbacks);
1260 for (list<Context*>::iterator i = to_run.begin();
1261 i != to_run.end();
1262 ++i) {
1263 (*i)->complete(0);
1264 }
1265 }
1266
1267 static const char *state_string(const PG::Scrubber::State& state) {
1268 const char *ret = NULL;
1269 switch( state )
1270 {
1271 case INACTIVE: ret = "INACTIVE"; break;
1272 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1273 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1274 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1275 case BUILD_MAP: ret = "BUILD_MAP"; break;
1276 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1277 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1278 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1279 case FINISH: ret = "FINISH"; break;
1280 }
1281 return ret;
1282 }
1283
1284 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1285
1286 // classic (non chunk) scrubs block all writes
1287 // chunky scrubs only block writes to a range
1288 bool write_blocked_by_scrub(const hobject_t &soid) {
1289 return (soid >= start && soid < end);
1290 }
1291
1292 // clear all state
1293 void reset() {
1294 active = false;
1295 queue_snap_trim = false;
1296 waiting_on = 0;
1297 waiting_on_whom.clear();
1298 if (active_rep_scrub) {
1299 active_rep_scrub = OpRequestRef();
1300 }
1301 received_maps.clear();
1302
1303 must_scrub = false;
1304 must_deep_scrub = false;
1305 must_repair = false;
1306 auto_repair = false;
1307
1308 state = PG::Scrubber::INACTIVE;
1309 start = hobject_t();
1310 end = hobject_t();
1311 subset_last_update = eversion_t();
1312 shallow_errors = 0;
1313 deep_errors = 0;
1314 fixed = 0;
1315 deep = false;
1316 seed = 0;
1317 run_callbacks();
1318 inconsistent.clear();
1319 missing.clear();
1320 authoritative.clear();
1321 num_digest_updates_pending = 0;
1322 cleaned_meta_map = ScrubMap();
1323 sleeping = false;
1324 needs_sleep = true;
1325 sleep_start = utime_t();
1326 }
1327
1328 void create_results(const hobject_t& obj);
1329 void cleanup_store(ObjectStore::Transaction *t);
1330 } scrubber;
1331
1332 bool scrub_after_recovery;
1333
1334 int active_pushes;
1335
1336 void repair_object(
1337 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1338 pg_shard_t bad_peer);
1339
1340 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
1341 void chunky_scrub(ThreadPool::TPHandle &handle);
1342 void scrub_compare_maps();
1343 /**
1344 * return true if any inconsistency/missing is repaired, false otherwise
1345 */
1346 bool scrub_process_inconsistent();
1347 bool ops_blocked_by_scrub() const;
1348 void scrub_finish();
1349 void scrub_clear_state();
1350 void _scan_snaps(ScrubMap &map);
1351 void _scan_rollback_obs(
1352 const vector<ghobject_t> &rollback_obs,
1353 ThreadPool::TPHandle &handle);
1354 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1355 hobject_t start, hobject_t end, bool deep,
1356 uint32_t seed);
1357 int build_scrub_map_chunk(
1358 ScrubMap &map,
1359 hobject_t start, hobject_t end, bool deep, uint32_t seed,
1360 ThreadPool::TPHandle &handle);
1361 /**
1362 * returns true if [begin, end) is good to scrub at this time
1363 * a false return value obliges the implementer to requeue scrub when the
1364 * condition preventing scrub clears
1365 */
1366 virtual bool _range_available_for_scrub(
1367 const hobject_t &begin, const hobject_t &end) = 0;
1368 virtual void scrub_snapshot_metadata(
1369 ScrubMap &map,
1370 const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
1371 virtual void _scrub_clear_state() { }
1372 virtual void _scrub_finish() { }
1373 virtual void split_colls(
1374 spg_t child,
1375 int split_bits,
1376 int seed,
1377 const pg_pool_t *pool,
1378 ObjectStore::Transaction *t) = 0;
1379 void clear_scrub_reserved();
1380 void scrub_reserve_replicas();
1381 void scrub_unreserve_replicas();
1382 bool scrub_all_replicas_reserved() const;
1383 bool sched_scrub();
1384 void reg_next_scrub();
1385 void unreg_next_scrub();
1386
1387 void replica_scrub(
1388 OpRequestRef op,
1389 ThreadPool::TPHandle &handle);
1390 void do_replica_scrub_map(OpRequestRef op);
1391 void sub_op_scrub_map(OpRequestRef op);
1392
1393 void handle_scrub_reserve_request(OpRequestRef op);
1394 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1395 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1396 void handle_scrub_reserve_release(OpRequestRef op);
1397
1398 void reject_reservation();
1399 void schedule_backfill_full_retry();
1400 void schedule_recovery_full_retry();
1401
1402 // -- recovery state --
1403
1404 template <class EVT>
1405 struct QueuePeeringEvt : Context {
1406 PGRef pg;
1407 epoch_t epoch;
1408 EVT evt;
1409 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1410 pg(pg), epoch(epoch), evt(evt) {}
1411 void finish(int r) override {
1412 pg->lock();
1413 pg->queue_peering_event(PG::CephPeeringEvtRef(
1414 new PG::CephPeeringEvt(
1415 epoch,
1416 epoch,
1417 evt)));
1418 pg->unlock();
1419 }
1420 };
1421
1422 class CephPeeringEvt {
1423 epoch_t epoch_sent;
1424 epoch_t epoch_requested;
1425 boost::intrusive_ptr< const boost::statechart::event_base > evt;
1426 string desc;
1427 public:
1428 MEMPOOL_CLASS_HELPERS();
1429 template <class T>
1430 CephPeeringEvt(epoch_t epoch_sent,
1431 epoch_t epoch_requested,
1432 const T &evt_) :
1433 epoch_sent(epoch_sent), epoch_requested(epoch_requested),
1434 evt(evt_.intrusive_from_this()) {
1435 stringstream out;
1436 out << "epoch_sent: " << epoch_sent
1437 << " epoch_requested: " << epoch_requested << " ";
1438 evt_.print(&out);
1439 desc = out.str();
1440 }
1441 epoch_t get_epoch_sent() { return epoch_sent; }
1442 epoch_t get_epoch_requested() { return epoch_requested; }
1443 const boost::statechart::event_base &get_event() { return *evt; }
1444 string get_desc() { return desc; }
1445 };
1446 typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
1447 list<CephPeeringEvtRef> peering_queue; // op queue
1448 list<CephPeeringEvtRef> peering_waiters;
1449
1450 struct QueryState : boost::statechart::event< QueryState > {
1451 Formatter *f;
1452 explicit QueryState(Formatter *f) : f(f) {}
1453 void print(std::ostream *out) const {
1454 *out << "Query";
1455 }
1456 };
1457
1458 struct MInfoRec : boost::statechart::event< MInfoRec > {
1459 pg_shard_t from;
1460 pg_info_t info;
1461 epoch_t msg_epoch;
1462 MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
1463 from(from), info(info), msg_epoch(msg_epoch) {}
1464 void print(std::ostream *out) const {
1465 *out << "MInfoRec from " << from << " info: " << info;
1466 }
1467 };
1468
1469 struct MLogRec : boost::statechart::event< MLogRec > {
1470 pg_shard_t from;
1471 boost::intrusive_ptr<MOSDPGLog> msg;
1472 MLogRec(pg_shard_t from, MOSDPGLog *msg) :
1473 from(from), msg(msg) {}
1474 void print(std::ostream *out) const {
1475 *out << "MLogRec from " << from;
1476 }
1477 };
1478
1479 struct MNotifyRec : boost::statechart::event< MNotifyRec > {
1480 pg_shard_t from;
1481 pg_notify_t notify;
1482 uint64_t features;
1483 MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
1484 from(from), notify(notify), features(f) {}
1485 void print(std::ostream *out) const {
1486 *out << "MNotifyRec from " << from << " notify: " << notify
1487 << " features: 0x" << hex << features << dec;
1488 }
1489 };
1490
1491 struct MQuery : boost::statechart::event< MQuery > {
1492 pg_shard_t from;
1493 pg_query_t query;
1494 epoch_t query_epoch;
1495 MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
1496 from(from), query(query), query_epoch(query_epoch) {}
1497 void print(std::ostream *out) const {
1498 *out << "MQuery from " << from
1499 << " query_epoch " << query_epoch
1500 << " query: " << query;
1501 }
1502 };
1503
1504 struct AdvMap : boost::statechart::event< AdvMap > {
1505 OSDMapRef osdmap;
1506 OSDMapRef lastmap;
1507 vector<int> newup, newacting;
1508 int up_primary, acting_primary;
1509 AdvMap(
1510 OSDMapRef osdmap, OSDMapRef lastmap,
1511 vector<int>& newup, int up_primary,
1512 vector<int>& newacting, int acting_primary):
1513 osdmap(osdmap), lastmap(lastmap),
1514 newup(newup),
1515 newacting(newacting),
1516 up_primary(up_primary),
1517 acting_primary(acting_primary) {}
1518 void print(std::ostream *out) const {
1519 *out << "AdvMap";
1520 }
1521 };
1522
1523 struct ActMap : boost::statechart::event< ActMap > {
1524 ActMap() : boost::statechart::event< ActMap >() {}
1525 void print(std::ostream *out) const {
1526 *out << "ActMap";
1527 }
1528 };
1529 struct Activate : boost::statechart::event< Activate > {
1530 epoch_t activation_epoch;
1531 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
1532 activation_epoch(q) {}
1533 void print(std::ostream *out) const {
1534 *out << "Activate from " << activation_epoch;
1535 }
1536 };
1537 struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
1538 unsigned priority;
1539 explicit RequestBackfillPrio(unsigned prio) :
1540 boost::statechart::event< RequestBackfillPrio >(),
1541 priority(prio) {}
1542 void print(std::ostream *out) const {
1543 *out << "RequestBackfillPrio: priority " << priority;
1544 }
1545 };
1546 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1547 T() : boost::statechart::event< T >() {} \
1548 void print(std::ostream *out) const { \
1549 *out << #T; \
1550 } \
1551 };
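  // For illustration (editor's note, not part of the original source),
  // TrivialEvent(Backfilled) expands to roughly:
  //   struct Backfilled : boost::statechart::event< Backfilled > {
  //     Backfilled() : boost::statechart::event< Backfilled >() {}
  //     void print(std::ostream *out) const {
  //       *out << "Backfilled";
  //     }
  //   };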
1552 TrivialEvent(Initialize)
1553 TrivialEvent(Load)
1554 TrivialEvent(GotInfo)
1555 TrivialEvent(NeedUpThru)
1556 TrivialEvent(NullEvt)
1557 TrivialEvent(FlushedEvt)
1558 TrivialEvent(Backfilled)
1559 TrivialEvent(LocalBackfillReserved)
1560 TrivialEvent(RemoteBackfillReserved)
1561 TrivialEvent(RemoteReservationRejected)
1562 TrivialEvent(RequestBackfill)
1563 TrivialEvent(RequestRecovery)
1564 TrivialEvent(RecoveryDone)
1565 TrivialEvent(BackfillTooFull)
1566 TrivialEvent(RecoveryTooFull)
1567
1568 TrivialEvent(AllReplicasRecovered)
1569 TrivialEvent(DoRecovery)
1570 TrivialEvent(LocalRecoveryReserved)
1571 TrivialEvent(RemoteRecoveryReserved)
1572 TrivialEvent(AllRemotesReserved)
1573 TrivialEvent(AllBackfillsReserved)
1574 TrivialEvent(GoClean)
1575
1576 TrivialEvent(AllReplicasActivated)
1577
1578 TrivialEvent(IntervalFlush)
1579
1580 /* Encapsulates PG recovery process */
1581 class RecoveryState {
1582 void start_handle(RecoveryCtx *new_ctx);
1583 void end_handle();
1584 public:
1585 void begin_block_outgoing();
1586 void end_block_outgoing();
1587 void clear_blocked_outgoing();
1588 private:
1589
1590 /* States */
1591 struct Initial;
1592 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1593 RecoveryState *state;
1594 public:
1595 PG *pg;
1596
1597 utime_t event_time;
1598 uint64_t event_count;
1599
1600 void clear_event_counters() {
1601 event_time = utime_t();
1602 event_count = 0;
1603 }
1604
1605 void log_enter(const char *state_name);
1606 void log_exit(const char *state_name, utime_t duration);
1607
1608 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1609
1610 /* Accessor functions for state methods */
1611 ObjectStore::Transaction* get_cur_transaction() {
1612 assert(state->rctx);
1613 assert(state->rctx->transaction);
1614 return state->rctx->transaction;
1615 }
1616
1617 void send_query(pg_shard_t to, const pg_query_t &query) {
1618 assert(state->rctx);
1619 assert(state->rctx->query_map);
1620 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1621 query;
1622 }
1623
1624 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1625 assert(state->rctx);
1626 assert(state->rctx->query_map);
1627 return state->rctx->query_map;
1628 }
1629
1630 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1631 assert(state->rctx);
1632 assert(state->rctx->info_map);
1633 return state->rctx->info_map;
1634 }
1635
1636 list< Context* > *get_on_safe_context_list() {
1637 assert(state->rctx);
1638 assert(state->rctx->on_safe);
1639 return &(state->rctx->on_safe->contexts);
1640 }
1641
1642 list< Context * > *get_on_applied_context_list() {
1643 assert(state->rctx);
1644 assert(state->rctx->on_applied);
1645 return &(state->rctx->on_applied->contexts);
1646 }
1647
1648 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1649
1650 void send_notify(pg_shard_t to,
1651 const pg_notify_t &info, const PastIntervals &pi) {
1652 assert(state->rctx);
1653 assert(state->rctx->notify_list);
1654 (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
1655 }
1656 };
1657 friend class RecoveryMachine;
1658
1659 /* States */
1660
1661 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1662 explicit Crashed(my_context ctx);
1663 };
1664
1665 struct Reset;
1666
1667 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1668 explicit Initial(my_context ctx);
1669 void exit();
1670
1671 typedef boost::mpl::list <
1672 boost::statechart::transition< Initialize, Reset >,
1673 boost::statechart::custom_reaction< Load >,
1674 boost::statechart::custom_reaction< NullEvt >,
1675 boost::statechart::transition< boost::statechart::event_base, Crashed >
1676 > reactions;
1677
1678 boost::statechart::result react(const Load&);
1679 boost::statechart::result react(const MNotifyRec&);
1680 boost::statechart::result react(const MInfoRec&);
1681 boost::statechart::result react(const MLogRec&);
1682 boost::statechart::result react(const boost::statechart::event_base&) {
1683 return discard_event();
1684 }
1685 };
1686
1687 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1688 explicit Reset(my_context ctx);
1689 void exit();
1690
1691 typedef boost::mpl::list <
1692 boost::statechart::custom_reaction< QueryState >,
1693 boost::statechart::custom_reaction< AdvMap >,
1694 boost::statechart::custom_reaction< ActMap >,
1695 boost::statechart::custom_reaction< NullEvt >,
1696 boost::statechart::custom_reaction< FlushedEvt >,
1697 boost::statechart::custom_reaction< IntervalFlush >,
1698 boost::statechart::transition< boost::statechart::event_base, Crashed >
1699 > reactions;
1700 boost::statechart::result react(const QueryState& q);
1701 boost::statechart::result react(const AdvMap&);
1702 boost::statechart::result react(const ActMap&);
1703 boost::statechart::result react(const FlushedEvt&);
1704 boost::statechart::result react(const IntervalFlush&);
1705 boost::statechart::result react(const boost::statechart::event_base&) {
1706 return discard_event();
1707 }
1708 };
1709
1710 struct Start;
1711
1712 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1713 explicit Started(my_context ctx);
1714 void exit();
1715
1716 typedef boost::mpl::list <
1717 boost::statechart::custom_reaction< QueryState >,
1718 boost::statechart::custom_reaction< AdvMap >,
1719 boost::statechart::custom_reaction< NullEvt >,
1720 boost::statechart::custom_reaction< FlushedEvt >,
1721 boost::statechart::custom_reaction< IntervalFlush >,
1722 boost::statechart::transition< boost::statechart::event_base, Crashed >
1723 > reactions;
1724 boost::statechart::result react(const QueryState& q);
1725 boost::statechart::result react(const AdvMap&);
1726 boost::statechart::result react(const FlushedEvt&);
1727 boost::statechart::result react(const IntervalFlush&);
1728 boost::statechart::result react(const boost::statechart::event_base&) {
1729 return discard_event();
1730 }
1731 };
1732
1733 struct MakePrimary : boost::statechart::event< MakePrimary > {
1734 MakePrimary() : boost::statechart::event< MakePrimary >() {}
1735 };
1736 struct MakeStray : boost::statechart::event< MakeStray > {
1737 MakeStray() : boost::statechart::event< MakeStray >() {}
1738 };
1739 struct Primary;
1740 struct Stray;
1741
1742 struct Start : boost::statechart::state< Start, Started >, NamedState {
1743 explicit Start(my_context ctx);
1744 void exit();
1745
1746 typedef boost::mpl::list <
1747 boost::statechart::transition< MakePrimary, Primary >,
1748 boost::statechart::transition< MakeStray, Stray >
1749 > reactions;
1750 };
1751
1752 struct Peering;
1753 struct WaitActingChange;
1754 struct NeedActingChange : boost::statechart::event< NeedActingChange > {
1755 NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
1756 };
1757 struct Incomplete;
1758 struct IsIncomplete : boost::statechart::event< IsIncomplete > {
1759 IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
1760 };
1761 struct Down;
1762 struct IsDown : boost::statechart::event< IsDown > {
1763 IsDown() : boost::statechart::event< IsDown >() {}
1764 };
1765
1766 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1767 explicit Primary(my_context ctx);
1768 void exit();
1769
1770 typedef boost::mpl::list <
1771 boost::statechart::custom_reaction< ActMap >,
1772 boost::statechart::custom_reaction< MNotifyRec >,
1773 boost::statechart::transition< NeedActingChange, WaitActingChange >
1774 > reactions;
1775 boost::statechart::result react(const ActMap&);
1776 boost::statechart::result react(const MNotifyRec&);
1777 };
1778
1779 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1780 NamedState {
1781 typedef boost::mpl::list <
1782 boost::statechart::custom_reaction< QueryState >,
1783 boost::statechart::custom_reaction< AdvMap >,
1784 boost::statechart::custom_reaction< MLogRec >,
1785 boost::statechart::custom_reaction< MInfoRec >,
1786 boost::statechart::custom_reaction< MNotifyRec >
1787 > reactions;
1788 explicit WaitActingChange(my_context ctx);
1789 boost::statechart::result react(const QueryState& q);
1790 boost::statechart::result react(const AdvMap&);
1791 boost::statechart::result react(const MLogRec&);
1792 boost::statechart::result react(const MInfoRec&);
1793 boost::statechart::result react(const MNotifyRec&);
1794 void exit();
1795 };
1796
1797 struct GetInfo;
1798 struct Active;
1799
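// Peering: gather authoritative info, logs and missing sets from the OSDs
// in prior_set so an up-to-date log and missing set can be built.
// Substates: GetInfo -> GetLog -> GetMissing -> WaitUpThru (plus Down and
// Incomplete). An Activate event transitions straight to Active.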
1800 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
1801 PastIntervals::PriorSet prior_set;
1802 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
1803
1804 explicit Peering(my_context ctx);
1805 void exit();
1806
1807 typedef boost::mpl::list <
1808 boost::statechart::custom_reaction< QueryState >,
1809 boost::statechart::transition< Activate, Active >,
1810 boost::statechart::custom_reaction< AdvMap >
1811 > reactions;
1812 boost::statechart::result react(const QueryState& q);
1813 boost::statechart::result react(const AdvMap &advmap);
1814 };
1815
1816 struct WaitLocalRecoveryReserved;
1817 struct Activating;
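// Active: peering has completed and the PG is active; the substates below
// (Activating, the recovery/backfill reservation states, Recovering,
// Backfilling, Recovered, Clean, NotRecovering, NotBackfilling) track
// recovery and backfill progress. The remote_shards_to_reserve_* sets are
// const, so they are fixed when the state is entered; they name the shards
// from which remote recovery/backfill reservations are requested.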
1818 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1819 explicit Active(my_context ctx);
1820 void exit();
1821
1822 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1823 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1824 bool all_replicas_activated;
1825
1826 typedef boost::mpl::list <
1827 boost::statechart::custom_reaction< QueryState >,
1828 boost::statechart::custom_reaction< ActMap >,
1829 boost::statechart::custom_reaction< AdvMap >,
1830 boost::statechart::custom_reaction< MInfoRec >,
1831 boost::statechart::custom_reaction< MNotifyRec >,
1832 boost::statechart::custom_reaction< MLogRec >,
1833 boost::statechart::custom_reaction< Backfilled >,
1834 boost::statechart::custom_reaction< AllReplicasActivated >
1835 > reactions;
1836 boost::statechart::result react(const QueryState& q);
1837 boost::statechart::result react(const ActMap&);
1838 boost::statechart::result react(const AdvMap&);
1839 boost::statechart::result react(const MInfoRec& infoevt);
1840 boost::statechart::result react(const MNotifyRec& notevt);
1841 boost::statechart::result react(const MLogRec& logevt);
1842 boost::statechart::result react(const Backfilled&) {
1843 return discard_event();
1844 }
1845 boost::statechart::result react(const AllReplicasActivated&);
1846 };
1847
1848 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1849 typedef boost::mpl::list<
1850 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1851 > reactions;
1852 explicit Clean(my_context ctx);
1853 void exit();
1854 };
1855
1856 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1857 typedef boost::mpl::list<
1858 boost::statechart::transition< GoClean, Clean >,
1859 boost::statechart::custom_reaction< AllReplicasActivated >
1860 > reactions;
1861 explicit Recovered(my_context ctx);
1862 void exit();
1863 boost::statechart::result react(const AllReplicasActivated&) {
1864 post_event(GoClean());
1865 return forward_event();
1866 }
1867 };
1868
1869 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
1870 typedef boost::mpl::list<
1871 boost::statechart::transition< Backfilled, Recovered >,
1872 boost::statechart::custom_reaction< RemoteReservationRejected >
1873 > reactions;
1874 explicit Backfilling(my_context ctx);
1875 boost::statechart::result react(const RemoteReservationRejected& evt);
1876 void exit();
1877 };
1878
1879 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
1880 typedef boost::mpl::list<
1881 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1882 boost::statechart::custom_reaction< RemoteReservationRejected >,
1883 boost::statechart::transition< AllBackfillsReserved, Backfilling >
1884 > reactions;
1885 set<pg_shard_t>::const_iterator backfill_osd_it;
1886 explicit WaitRemoteBackfillReserved(my_context ctx);
1887 void exit();
1888 boost::statechart::result react(const RemoteBackfillReserved& evt);
1889 boost::statechart::result react(const RemoteReservationRejected& evt);
1890 };
1891
1892 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
1893 typedef boost::mpl::list<
1894 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
1895 > reactions;
1896 explicit WaitLocalBackfillReserved(my_context ctx);
1897 void exit();
1898 };
1899
1900 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
1901 typedef boost::mpl::list<
1902 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
1903 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1904 boost::statechart::custom_reaction< RemoteReservationRejected >
1905 > reactions;
1906 explicit NotBackfilling(my_context ctx);
1907 void exit();
1908 boost::statechart::result react(const RemoteBackfillReserved& evt);
1909 boost::statechart::result react(const RemoteReservationRejected& evt);
1910 };
1911
1912 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
1913 typedef boost::mpl::list<
1914 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1915 > reactions;
1916 explicit NotRecovering(my_context ctx);
1917 void exit();
1918 };
1919
1920 struct RepNotRecovering;
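// ReplicaActive: this OSD is an active replica for the PG. The Rep*
// substates below manage the recovery/backfill reservations the primary
// requests from this replica; the initial substate is RepNotRecovering.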
1921 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
1922 explicit ReplicaActive(my_context ctx);
1923 void exit();
1924
1925 typedef boost::mpl::list <
1926 boost::statechart::custom_reaction< QueryState >,
1927 boost::statechart::custom_reaction< ActMap >,
1928 boost::statechart::custom_reaction< MQuery >,
1929 boost::statechart::custom_reaction< MInfoRec >,
1930 boost::statechart::custom_reaction< MLogRec >,
1931 boost::statechart::custom_reaction< Activate >
1932 > reactions;
1933 boost::statechart::result react(const QueryState& q);
1934 boost::statechart::result react(const MInfoRec& infoevt);
1935 boost::statechart::result react(const MLogRec& logevt);
1936 boost::statechart::result react(const ActMap&);
1937 boost::statechart::result react(const MQuery&);
1938 boost::statechart::result react(const Activate&);
1939 };
1940
1941 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
1942 typedef boost::mpl::list<
1943 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
1944 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
1945 boost::statechart::custom_reaction< BackfillTooFull >
1946 > reactions;
1947 explicit RepRecovering(my_context ctx);
1948 boost::statechart::result react(const BackfillTooFull &evt);
1949 void exit();
1950 };
1951
1952 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
1953 typedef boost::mpl::list<
1954 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1955 boost::statechart::custom_reaction< RemoteReservationRejected >
1956 > reactions;
1957 explicit RepWaitBackfillReserved(my_context ctx);
1958 void exit();
1959 boost::statechart::result react(const RemoteBackfillReserved &evt);
1960 boost::statechart::result react(const RemoteReservationRejected &evt);
1961 };
1962
1963 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
1964 typedef boost::mpl::list<
1965 boost::statechart::custom_reaction< RemoteRecoveryReserved >
1966 > reactions;
1967 explicit RepWaitRecoveryReserved(my_context ctx);
1968 void exit();
1969 boost::statechart::result react(const RemoteRecoveryReserved &evt);
1970 };
1971
1972 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
1973 typedef boost::mpl::list<
1974 boost::statechart::custom_reaction< RequestBackfillPrio >,
1975 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
1976 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
1977 > reactions;
1978 explicit RepNotRecovering(my_context ctx);
1979 boost::statechart::result react(const RequestBackfillPrio &evt);
1980 void exit();
1981 };
1982
1983 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
1984 typedef boost::mpl::list <
1985 boost::statechart::custom_reaction< AllReplicasRecovered >,
1986 boost::statechart::custom_reaction< RequestBackfill >
1987 > reactions;
1988 explicit Recovering(my_context ctx);
1989 void exit();
1990 void release_reservations();
1991 boost::statechart::result react(const AllReplicasRecovered &evt);
1992 boost::statechart::result react(const RequestBackfill &evt);
1993 };
1994
1995 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
1996 typedef boost::mpl::list <
1997 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
1998 boost::statechart::transition< AllRemotesReserved, Recovering >
1999 > reactions;
2000 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2001 explicit WaitRemoteRecoveryReserved(my_context ctx);
2002 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2003 void exit();
2004 };
2005
2006 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2007 typedef boost::mpl::list <
2008 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2009 boost::statechart::custom_reaction< RecoveryTooFull >
2010 > reactions;
2011 explicit WaitLocalRecoveryReserved(my_context ctx);
2012 void exit();
2013 boost::statechart::result react(const RecoveryTooFull &evt);
2014 };
2015
2016 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2017 typedef boost::mpl::list <
2018 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2019 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2020 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2021 > reactions;
2022 explicit Activating(my_context ctx);
2023 void exit();
2024 };
2025
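// Stray: this OSD holds the PG but is not its primary in the current
// interval (entered from Start via MakeStray). It responds to queries and
// info/log messages from the primary until it either becomes a replica or
// the PG is removed from this OSD.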
2026 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2027 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2028
2029 explicit Stray(my_context ctx);
2030 void exit();
2031
2032 typedef boost::mpl::list <
2033 boost::statechart::custom_reaction< MQuery >,
2034 boost::statechart::custom_reaction< MLogRec >,
2035 boost::statechart::custom_reaction< MInfoRec >,
2036 boost::statechart::custom_reaction< ActMap >,
2037 boost::statechart::custom_reaction< RecoveryDone >
2038 > reactions;
2039 boost::statechart::result react(const MQuery& query);
2040 boost::statechart::result react(const MLogRec& logevt);
2041 boost::statechart::result react(const MInfoRec& infoevt);
2042 boost::statechart::result react(const ActMap&);
2043 boost::statechart::result react(const RecoveryDone&) {
2044 return discard_event();
2045 }
2046 };
2047
2048 struct GetLog;
2049
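// GetInfo: first peering phase; request pg_info_t from the peers in the
// prior set (peer_info_requested tracks the outstanding requests). GotInfo
// advances to GetLog; IsDown (posted when peering cannot proceed because
// needed OSDs are down) moves the PG to Down.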
2050 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2051 set<pg_shard_t> peer_info_requested;
2052
2053 explicit GetInfo(my_context ctx);
2054 void exit();
2055 void get_infos();
2056
2057 typedef boost::mpl::list <
2058 boost::statechart::custom_reaction< QueryState >,
2059 boost::statechart::transition< GotInfo, GetLog >,
2060 boost::statechart::custom_reaction< MNotifyRec >,
2061 boost::statechart::transition< IsDown, Down >
2062 > reactions;
2063 boost::statechart::result react(const QueryState& q);
2064 boost::statechart::result react(const MNotifyRec& infoevt);
2065 };
2066
2067 struct GotLog : boost::statechart::event< GotLog > {
2068 GotLog() : boost::statechart::event< GotLog >() {}
2069 };
2070
2071 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2072 pg_shard_t auth_log_shard;
2073 boost::intrusive_ptr<MOSDPGLog> msg;
2074
2075 explicit GetLog(my_context ctx);
2076 void exit();
2077
2078 typedef boost::mpl::list <
2079 boost::statechart::custom_reaction< QueryState >,
2080 boost::statechart::custom_reaction< MLogRec >,
2081 boost::statechart::custom_reaction< GotLog >,
2082 boost::statechart::custom_reaction< AdvMap >,
2083 boost::statechart::transition< IsIncomplete, Incomplete >
2084 > reactions;
2085 boost::statechart::result react(const AdvMap&);
2086 boost::statechart::result react(const QueryState& q);
2087 boost::statechart::result react(const MLogRec& logevt);
2088 boost::statechart::result react(const GotLog&);
2089 };
2090
2091 struct WaitUpThru;
2092
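// GetMissing: request whatever log segments and missing sets are still
// needed from peers (peer_missing_requested tracks the outstanding
// requests). If the PG must wait for its up_thru to be reflected in the
// osdmap, the NeedUpThru event moves it to WaitUpThru.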
2093 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2094 set<pg_shard_t> peer_missing_requested;
2095
2096 explicit GetMissing(my_context ctx);
2097 void exit();
2098
2099 typedef boost::mpl::list <
2100 boost::statechart::custom_reaction< QueryState >,
2101 boost::statechart::custom_reaction< MLogRec >,
2102 boost::statechart::transition< NeedUpThru, WaitUpThru >
2103 > reactions;
2104 boost::statechart::result react(const QueryState& q);
2105 boost::statechart::result react(const MLogRec& logevt);
2106 };
2107
2108 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2109 explicit WaitUpThru(my_context ctx);
2110 void exit();
2111
2112 typedef boost::mpl::list <
2113 boost::statechart::custom_reaction< QueryState >,
2114 boost::statechart::custom_reaction< ActMap >,
2115 boost::statechart::custom_reaction< MLogRec >
2116 > reactions;
2117 boost::statechart::result react(const QueryState& q);
2118 boost::statechart::result react(const ActMap& am);
2119 boost::statechart::result react(const MLogRec& logrec);
2120 };
2121
2122 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2123 explicit Down(my_context ctx);
2124 typedef boost::mpl::list <
2125 boost::statechart::custom_reaction< QueryState >
2126 > reactions;
2127 boost::statechart::result react(const QueryState& infoevt);
2128 void exit();
2129 };
2130
2131 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2132 typedef boost::mpl::list <
2133 boost::statechart::custom_reaction< AdvMap >,
2134 boost::statechart::custom_reaction< MNotifyRec >,
2135 boost::statechart::custom_reaction< QueryState >
2136 > reactions;
2137 explicit Incomplete(my_context ctx);
2138 boost::statechart::result react(const AdvMap &advmap);
2139 boost::statechart::result react(const MNotifyRec& infoevt);
2140 boost::statechart::result react(const QueryState& infoevt);
2141 void exit();
2142 };
2143
2144
2145 RecoveryMachine machine;
2146 PG *pg;
2147
2148 /// context passed in by state machine caller
2149 RecoveryCtx *orig_ctx;
2150
2151 /// populated if we are buffering messages pending a flush
2152 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2153
2154 /**
2155 * Populated between start_handle() and end_handle(); points into the
2156 * message lists of messages_pending_flush while messages are being
2157 * buffered for a flush, or into orig_ctx otherwise.
2158 */
2159 boost::optional<RecoveryCtx> rctx;
2160
2161 public:
2162 explicit RecoveryState(PG *pg)
2163 : machine(this, pg), pg(pg), orig_ctx(0) {
2164 machine.initiate();
2165 }
2166
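// Both handle_event() overloads wrap machine.process_event() with
// start_handle()/end_handle(), so messages generated while the event is
// processed land in the caller-supplied RecoveryCtx (or are buffered in
// messages_pending_flush when a flush is pending). Illustrative use from
// within PG (see handle_peering_event() further below), assuming the OSD
// has assembled a RecoveryCtx:
//
//   recovery_state.handle_event(evt, rctx);  // evt is a CephPeeringEvtRef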
2167 void handle_event(const boost::statechart::event_base &evt,
2168 RecoveryCtx *rctx) {
2169 start_handle(rctx);
2170 machine.process_event(evt);
2171 end_handle();
2172 }
2173
2174 void handle_event(CephPeeringEvtRef evt,
2175 RecoveryCtx *rctx) {
2176 start_handle(rctx);
2177 machine.process_event(evt->get_event());
2178 end_handle();
2179 }
2180
2181 } recovery_state;
2182
2183
2184 public:
2185 PG(OSDService *o, OSDMapRef curmap,
2186 const PGPool &pool, spg_t p);
2187 ~PG() override;
2188
2189 private:
2190 // Prevent copying
2191 explicit PG(const PG& rhs);
2192 PG& operator=(const PG& rhs);
2193 const spg_t pg_id;
2194 uint64_t peer_features;
2195 uint64_t acting_features;
2196 uint64_t upacting_features;
2197
2198 epoch_t last_epoch;
2199
2200 public:
2201 const spg_t& get_pgid() const { return pg_id; }
2202
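// Minimum-feature tracking: peer_features is reset to
// CEPH_FEATURES_SUPPORTED_DEFAULT and then AND-ed with each peer's feature
// bits, so get_min_peer_features() yields the intersection of everything
// the participating OSDs support.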
2203 void reset_min_peer_features() {
2204 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2205 }
2206 uint64_t get_min_peer_features() const { return peer_features; }
2207 void apply_peer_features(uint64_t f) { peer_features &= f; }
2208
2209 uint64_t get_min_acting_features() const { return acting_features; }
2210 uint64_t get_min_upacting_features() const { return upacting_features; }
2211
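// Rebuild up/acting and the corresponding pg_shard_t sets from the lists
// provided by the osdmap. For erasure-coded pools the vector position
// becomes the shard id; replicated pools use NO_SHARD and construct the
// primaries directly from the osd ids. Illustrative example (values
// invented): for an EC pool with newacting = {3, 1, 4} and
// new_acting_primary = 3, the loop below yields
// primary = pg_shard_t(3, shard_id_t(0)).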
2212 void init_primary_up_acting(
2213 const vector<int> &newup,
2214 const vector<int> &newacting,
2215 int new_up_primary,
2216 int new_acting_primary) {
2217 actingset.clear();
2218 acting = newacting;
2219 for (uint8_t i = 0; i < acting.size(); ++i) {
2220 if (acting[i] != CRUSH_ITEM_NONE)
2221 actingset.insert(
2222 pg_shard_t(
2223 acting[i],
2224 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2225 }
2226 upset.clear();
2227 up = newup;
2228 for (uint8_t i = 0; i < up.size(); ++i) {
2229 if (up[i] != CRUSH_ITEM_NONE)
2230 upset.insert(
2231 pg_shard_t(
2232 up[i],
2233 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2234 }
2235 if (!pool.info.ec_pool()) {
2236 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2237 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2238 return;
2239 }
2240 up_primary = pg_shard_t();
2241 primary = pg_shard_t();
2242 for (uint8_t i = 0; i < up.size(); ++i) {
2243 if (up[i] == new_up_primary) {
2244 up_primary = pg_shard_t(up[i], shard_id_t(i));
2245 break;
2246 }
2247 }
2248 for (uint8_t i = 0; i < acting.size(); ++i) {
2249 if (acting[i] == new_acting_primary) {
2250 primary = pg_shard_t(acting[i], shard_id_t(i));
2251 break;
2252 }
2253 }
2254 assert(up_primary.osd == new_up_primary);
2255 assert(primary.osd == new_acting_primary);
2256 }
2257 pg_shard_t get_primary() const { return primary; }
2258
2259 int get_role() const { return role; }
2260 void set_role(int r) { role = r; }
2261
2262 bool is_primary() const { return pg_whoami == primary; }
2263 bool is_replica() const { return role > 0; }
2264
2265 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2266
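// 'state' is a bitmask of PG_STATE_* flags; state_test/state_set/
// state_clear manipulate individual bits, and the is_*() accessors below
// wrap the common tests.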
2267 //int get_state() const { return state; }
2268 bool state_test(int m) const { return (state & m) != 0; }
2269 void state_set(int m) { state |= m; }
2270 void state_clear(int m) { state &= ~m; }
2271
2272 bool is_complete() const { return info.last_complete == info.last_update; }
2273 bool should_send_notify() const { return send_notify; }
2274
2275 int get_state() const { return state; }
2276 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2277 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2278 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2279 bool is_down() const { return state_test(PG_STATE_DOWN); }
2280 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2281 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2282 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2283 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2284
2285 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2286 bool is_peered() const {
2287 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2288 }
2289
2290 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2291
2292 void init(
2293 int role,
2294 const vector<int>& up,
2295 int up_primary,
2296 const vector<int>& acting,
2297 int acting_primary,
2298 const pg_history_t& history,
2299 const PastIntervals& pim,
2300 bool backfill,
2301 ObjectStore::Transaction *t);
2302
2303 // pg on-disk state
2304 void do_pending_flush();
2305
2306 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2307 static void _init(ObjectStore::Transaction& t,
2308 spg_t pgid, const pg_pool_t *pool);
2309
2310 private:
2311 void prepare_write_info(map<string,bufferlist> *km);
2312
2313 void update_store_with_options();
2314 void update_store_on_load();
2315
2316 public:
2317 static int _prepare_write_info(
2318 CephContext* cct,
2319 map<string,bufferlist> *km,
2320 epoch_t epoch,
2321 pg_info_t &info,
2322 pg_info_t &last_written_info,
2323 PastIntervals &past_intervals,
2324 bool dirty_big_info,
2325 bool dirty_epoch,
2326 bool try_fast_info,
2327 PerfCounters *logger = nullptr);
2328 void write_if_dirty(ObjectStore::Transaction& t);
2329
2330 PGLog::IndexedLog projected_log;
2331 bool check_in_progress_op(
2332 const osd_reqid_t &r,
2333 eversion_t *version,
2334 version_t *user_version,
2335 int *return_code) const;
2336 eversion_t projected_last_update;
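// Version to assign to the next log entry: the current osdmap epoch paired
// with projected_last_update.version + 1. The asserts guarantee it is
// strictly newer than info.last_update, the pg log head, and
// projected_last_update.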
2337 eversion_t get_next_version() const {
2338 eversion_t at_version(
2339 get_osdmap()->get_epoch(),
2340 projected_last_update.version+1);
2341 assert(at_version > info.last_update);
2342 assert(at_version > pg_log.get_head());
2343 assert(at_version > projected_last_update);
2344 return at_version;
2345 }
2346
2347 void add_log_entry(const pg_log_entry_t& e, bool applied);
2348 void append_log(
2349 const vector<pg_log_entry_t>& logv,
2350 eversion_t trim_to,
2351 eversion_t roll_forward_to,
2352 ObjectStore::Transaction &t,
2353 bool transaction_applied = true);
2354 bool check_log_for_corruption(ObjectStore *store);
2355 void trim_log();
2356
2357 std::string get_corrupt_pg_log_name() const;
2358 static int read_info(
2359 ObjectStore *store, spg_t pgid, const coll_t &coll,
2360 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2361 __u8 &);
2362 void read_state(ObjectStore *store, bufferlist &bl);
2363 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2364 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2365 epoch_t *pepoch, bufferlist *bl);
2366 void update_snap_map(
2367 const vector<pg_log_entry_t> &log_entries,
2368 ObjectStore::Transaction& t);
2369
2370 void filter_snapc(vector<snapid_t> &snaps);
2371
2372 void log_weirdness();
2373
2374 virtual void kick_snap_trim() = 0;
2375 virtual void snap_trimmer_scrub_complete() = 0;
2376 bool requeue_scrub(bool high_priority = false);
2377 void queue_recovery(bool front = false);
2378 bool queue_scrub();
2379 unsigned get_scrub_priority();
2380
2381 /// share pg info after a pg is active
2382 void share_pg_info();
2383
2384
2385 bool append_log_entries_update_missing(
2386 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2387 ObjectStore::Transaction &t);
2388
2389 /**
2390 * Merge the new log entries into all actingbackfill logs and missing
2391 * sets, updating missing_loc as necessary.
2392 */
2393 void merge_new_log_entries(
2394 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2395 ObjectStore::Transaction &t);
2396
2397 void reset_interval_flush();
2398 void start_peering_interval(
2399 const OSDMapRef lastmap,
2400 const vector<int>& newup, int up_primary,
2401 const vector<int>& newacting, int acting_primary,
2402 ObjectStore::Transaction *t);
2403 void on_new_interval();
2404 virtual void _on_new_interval() = 0;
2405 void start_flush(ObjectStore::Transaction *t,
2406 list<Context *> *on_applied,
2407 list<Context *> *on_safe);
2408 void set_last_peering_reset();
2409 bool pg_has_reset_since(epoch_t e) {
2410 assert(is_locked());
2411 return deleting || e < get_last_peering_reset();
2412 }
2413
2414 void update_history(const pg_history_t& history);
2415 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2416 pair<pg_shard_t, pg_info_t> &notify_info);
2417 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2418
2419 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2420
2421 bool should_restart_peering(
2422 int newupprimary,
2423 int newactingprimary,
2424 const vector<int>& newup,
2425 const vector<int>& newacting,
2426 OSDMapRef lastmap,
2427 OSDMapRef osdmap);
2428
2429 // OpRequest queueing
2430 bool can_discard_op(OpRequestRef& op);
2431 bool can_discard_scan(OpRequestRef op);
2432 bool can_discard_backfill(OpRequestRef op);
2433 bool can_discard_request(OpRequestRef& op);
2434
2435 template<typename T, int MSGTYPE>
2436 bool can_discard_replica_op(OpRequestRef& op);
2437
2438 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2439 bool old_peering_evt(CephPeeringEvtRef evt) {
2440 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2441 }
2442 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2443 return e <= cur_epoch;
2444 }
2445 bool have_same_or_newer_map(epoch_t e) {
2446 return e <= get_osdmap()->get_epoch();
2447 }
2448
2449 bool op_has_sufficient_caps(OpRequestRef& op);
2450
2451
2452 // recovery bits
2453 void take_waiters();
2454 void queue_peering_event(CephPeeringEvtRef evt);
2455 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2456 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2457 pg_shard_t from, const pg_query_t& q);
2458 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2459 void queue_flushed(epoch_t started_at);
2460 void handle_advance_map(
2461 OSDMapRef osdmap, OSDMapRef lastmap,
2462 vector<int>& newup, int up_primary,
2463 vector<int>& newacting, int acting_primary,
2464 RecoveryCtx *rctx);
2465 void handle_activate_map(RecoveryCtx *rctx);
2466 void handle_create(RecoveryCtx *rctx);
2467 void handle_loaded(RecoveryCtx *rctx);
2468 void handle_query_state(Formatter *f);
2469
2470 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2471
2472
2473 // abstract bits
2474 virtual void do_request(
2475 OpRequestRef& op,
2476 ThreadPool::TPHandle &handle
2477 ) = 0;
2478
2479 virtual void do_op(OpRequestRef& op) = 0;
2480 virtual void do_sub_op(OpRequestRef op) = 0;
2481 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2482 virtual void do_scan(
2483 OpRequestRef op,
2484 ThreadPool::TPHandle &handle
2485 ) = 0;
2486 virtual void do_backfill(OpRequestRef op) = 0;
2487 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2488
2489 virtual int do_command(
2490 cmdmap_t cmdmap,
2491 ostream& ss,
2492 bufferlist& idata,
2493 bufferlist& odata,
2494 ConnectionRef conn,
2495 ceph_tid_t tid) = 0;
2496
2497 virtual void on_role_change() = 0;
2498 virtual void on_pool_change() = 0;
2499 virtual void on_change(ObjectStore::Transaction *t) = 0;
2500 virtual void on_activate() = 0;
2501 virtual void on_flushed() = 0;
2502 virtual void on_shutdown() = 0;
2503 virtual void check_blacklisted_watchers() = 0;
2504 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2505
2506 virtual bool agent_work(int max) = 0;
2507 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2508 virtual void agent_stop() = 0;
2509 virtual void agent_delay() = 0;
2510 virtual void agent_clear() = 0;
2511 virtual void agent_choose_mode_restart() = 0;
2512 };
2513
2514 ostream& operator<<(ostream& out, const PG& pg);
2515
2516 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2517
2518 #endif