ceph/src/osd/PG.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46
47 #include <atomic>
48 #include <list>
49 #include <memory>
50 #include <stack>
51 #include <string>
52 #include <tuple>
53 using namespace std;
54
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
57
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
59
60 class OSD;
61 class OSDService;
62 class MOSDOp;
63 class MOSDPGScan;
64 class MOSDPGBackfill;
65 class MOSDPGInfo;
66
67 class PG;
68 struct OpRequest;
69 typedef OpRequest::Ref OpRequestRef;
70 class MOSDPGLog;
71 class CephContext;
72
73 namespace Scrub {
74 class Store;
75 }
76
77 void intrusive_ptr_add_ref(PG *pg);
78 void intrusive_ptr_release(PG *pg);
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
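// Hedged usage sketch (names and timestamps are illustrative only): the
// struct records nested state dwell times LIFO-style, so exit_state()
// always closes the most recently entered state:
//
//   PGStateInstance psi;
//   psi.setepoch(42);
//   psi.enter_state(t0, "Started");          // outer state
//   psi.enter_state(t1, "Started/Primary");  // nested state
//   psi.exit_state(t2);  // records {t1, t2, "Started/Primary"}
//   psi.exit_state(t3);  // records {t0, t3, "Started"}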
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 private:
125 bool pg_in_destructor = false;
126 PG* thispg = nullptr;
127 std::unique_ptr<PGStateInstance> tmppi;
128 PGStateInstance* pi = nullptr;
129 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
130
131 };
132
133 #ifdef PG_DEBUG_REFS
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG *pg);
136 void put_with_id(PG *pg, uint64_t id);
137 typedef TrackedIntPtr<PG> PGRef;
138 #else
139 typedef boost::intrusive_ptr<PG> PGRef;
140 #endif
141
142 class PGRecoveryStats {
143 struct per_state_info {
144 uint64_t enter, exit; // enter/exit counts
145 uint64_t events;
146 utime_t event_time; // time spent processing events
147 utime_t total_time; // total time in state
148 utime_t min_time, max_time;
149
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
152 };
153 map<const char *,per_state_info> info;
154 Mutex lock;
155
156 public:
157 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
158
159 void reset() {
160 Mutex::Locker l(lock);
161 info.clear();
162 }
163 void dump(ostream& out) {
164 Mutex::Locker l(lock);
165 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
166 per_state_info& i = p->second;
167 out << i.enter << "\t" << i.exit << "\t"
168 << i.events << "\t" << i.event_time << "\t"
169 << i.total_time << "\t"
170 << i.min_time << "\t" << i.max_time << "\t"
171 << p->first << "\n";
172 }
173 }
174
175 void dump_formatted(Formatter *f) {
176 Mutex::Locker l(lock);
177 f->open_array_section("pg_recovery_stats");
178 for (map<const char *,per_state_info>::iterator p = info.begin();
179 p != info.end(); ++p) {
180 per_state_info& i = p->second;
181 f->open_object_section("recovery_state");
182 f->dump_int("enter", i.enter);
183 f->dump_int("exit", i.exit);
184 f->dump_int("events", i.events);
185 f->dump_stream("event_time") << i.event_time;
186 f->dump_stream("total_time") << i.total_time;
187 f->dump_stream("min_time") << i.min_time;
188 f->dump_stream("max_time") << i.max_time;
189 vector<string> states;
190 get_str_vec(p->first, "/", states);
191 f->open_array_section("nested_states");
192 for (vector<string>::iterator st = states.begin();
193 st != states.end(); ++st) {
194 f->dump_string("state", *st);
195 }
196 f->close_section();
197 f->close_section();
198 }
199 f->close_section();
200 }
201
202 void log_enter(const char *s) {
203 Mutex::Locker l(lock);
204 info[s].enter++;
205 }
206 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
207 Mutex::Locker l(lock);
208 per_state_info &i = info[s];
209 i.exit++;
210 i.total_time += dur;
211 if (dur > i.max_time)
212 i.max_time = dur;
213 if (dur < i.min_time || i.min_time == utime_t())
214 i.min_time = dur;
215 i.events += events;
216 i.event_time += event_dur;
217 }
218 };
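// Hedged sketch of the intended call pattern (the state name and durations
// are placeholders); the recovery state machine brackets each statechart
// state with these calls:
//
//   PGRecoveryStats stats;
//   stats.log_enter("Started/Primary/Active");
//   // ... the state handles events, accumulating event_count/event_dur ...
//   stats.log_exit("Started/Primary/Active", time_in_state, event_count, event_dur);
//   stats.dump(std::cout);   // one tab-separated line per state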
219
220 struct PGPool {
221 CephContext* cct;
222 epoch_t cached_epoch;
223 int64_t id;
224 string name;
225 uint64_t auid;
226
227 pg_pool_t info;
228 SnapContext snapc; // the default pool snapc, ready to go.
229
230 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
231 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
232
233 PGPool(CephContext* cct, OSDMapRef map, int64_t i)
234 : cct(cct),
235 cached_epoch(map->get_epoch()),
236 id(i),
237 name(map->get_pool_name(id)),
238 auid(map->get_pg_pool(id)->auid) {
239 const pg_pool_t *pi = map->get_pg_pool(id);
240 assert(pi);
241 info = *pi;
242 snapc = pi->get_snap_context();
243 pi->build_removed_snaps(cached_removed_snaps);
244 }
245
246 void update(OSDMapRef map);
247 };
248
249 /** PG - Replica Placement Group
250 *
251 */
252
253 class PG : public DoutPrefixProvider {
254 protected:
255 OSDService *osd;
256 CephContext *cct;
257 OSDriver osdriver;
258 SnapMapper snap_mapper;
259
260 virtual PGBackend *get_pgbackend() = 0;
261 public:
262 std::string gen_prefix() const override;
263 CephContext *get_cct() const override { return cct; }
264 unsigned get_subsys() const override { return ceph_subsys_osd; }
265
266 /*** PG ****/
267 void update_snap_mapper_bits(uint32_t bits) {
268 snap_mapper.update_bits(bits);
269 }
270 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
271 IsPGRecoverablePredicate *get_is_recoverable_predicate() {
272 return get_pgbackend()->get_is_recoverable_predicate();
273 }
274 protected:
275 OSDMapRef osdmap_ref;
276 OSDMapRef last_persisted_osdmap_ref;
277 PGPool pool;
278
279 void requeue_map_waiters();
280
281 void update_osdmap_ref(OSDMapRef newmap) {
282 assert(_lock.is_locked_by_me());
283 osdmap_ref = std::move(newmap);
284 }
285
286 public:
287 OSDMapRef get_osdmap() const {
288 assert(is_locked());
289 assert(osdmap_ref);
290 return osdmap_ref;
291 }
292 protected:
293
294 /** locking and reference counting.
295 * I destroy myself when the reference count hits zero.
296 * lock() should be called before doing anything.
297 * get() should be called on pointer copy (to another thread, etc.).
298 * put() should be called on destruction of some previously copied pointer.
299 * unlock() when done with the current pointer (_most common_).
300 */
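  // A minimal usage sketch of the pattern described above (the tag string is
  // illustrative only):
  //
  //   pg->get("example");   // take a ref before copying the pointer elsewhere
  //   pg->lock();           // take the PG lock before touching any state
  //   // ... inspect or mutate PG state ...
  //   pg->unlock();         // done with this use of the pointer
  //   pg->put("example");   // drop the ref when the copied pointer goes away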
301 mutable Mutex _lock;
302 std::atomic_uint ref{0};
303
304 #ifdef PG_DEBUG_REFS
305 Mutex _ref_id_lock;
306 map<uint64_t, string> _live_ids;
307 map<string, uint64_t> _tag_counts;
308 uint64_t _ref_id;
309 #endif
310
311 public:
312   bool deleting;  // true while being removed or while the OSD is shutting down
313
314 ZTracer::Endpoint trace_endpoint;
315
316 void lock_suspend_timeout(ThreadPool::TPHandle &handle);
317 void lock(bool no_lockdep = false) const;
318 void unlock() const {
319 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
320 assert(!dirty_info);
321 assert(!dirty_big_info);
322 _lock.Unlock();
323 }
324
325 bool is_locked() const {
326 return _lock.is_locked();
327 }
328
329 #ifdef PG_DEBUG_REFS
330 uint64_t get_with_id();
331 void put_with_id(uint64_t);
332 void dump_live_ids();
333 #endif
334 void get(const char* tag);
335 void put(const char* tag);
336
337 bool dirty_info, dirty_big_info;
338
339 public:
340 bool is_ec_pg() const {
341 return pool.info.ec_pool();
342 }
343 // pg state
344 pg_info_t info; ///< current pg info
345 pg_info_t last_written_info; ///< last written info
346 __u8 info_struct_v;
347 static const __u8 cur_struct_v = 10;
348 // v10 is the new past_intervals encoding
349 // v9 was fastinfo_key addition
350 // v8 was the move to a per-pg pgmeta object
351 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
352 // (first appeared in cuttlefish).
353 static const __u8 compat_struct_v = 7;
354 bool must_upgrade() {
355 return info_struct_v < cur_struct_v;
356 }
357 bool can_upgrade() {
358 return info_struct_v >= compat_struct_v;
359 }
360 void upgrade(ObjectStore *store);
361
362 const coll_t coll;
363 ObjectStore::CollectionHandle ch;
364 PGLog pg_log;
365 static string get_info_key(spg_t pgid) {
366 return stringify(pgid) + "_info";
367 }
368 static string get_biginfo_key(spg_t pgid) {
369 return stringify(pgid) + "_biginfo";
370 }
371 static string get_epoch_key(spg_t pgid) {
372 return stringify(pgid) + "_epoch";
373 }
374 ghobject_t pgmeta_oid;
375
376 class MissingLoc {
377 map<hobject_t, pg_missing_item> needs_recovery_map;
378 map<hobject_t, set<pg_shard_t> > missing_loc;
379 set<pg_shard_t> missing_loc_sources;
380 PG *pg;
381 set<pg_shard_t> empty_set;
382 public:
383 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
384 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
385 explicit MissingLoc(PG *pg)
386 : pg(pg) {}
387 void set_backend_predicates(
388 IsPGReadablePredicate *_is_readable,
389 IsPGRecoverablePredicate *_is_recoverable) {
390 is_readable.reset(_is_readable);
391 is_recoverable.reset(_is_recoverable);
392 }
393 string gen_prefix() const { return pg->gen_prefix(); }
394 bool needs_recovery(
395 const hobject_t &hoid,
396 eversion_t *v = 0) const {
397 map<hobject_t, pg_missing_item>::const_iterator i =
398 needs_recovery_map.find(hoid);
399 if (i == needs_recovery_map.end())
400 return false;
401 if (v)
402 *v = i->second.need;
403 return true;
404 }
405 bool is_unfound(const hobject_t &hoid) const {
406 return needs_recovery(hoid) && (
407 !missing_loc.count(hoid) ||
408 !(*is_recoverable)(missing_loc.find(hoid)->second));
409 }
410 bool readable_with_acting(
411 const hobject_t &hoid,
412 const set<pg_shard_t> &acting) const;
413 uint64_t num_unfound() const {
414 uint64_t ret = 0;
415 for (map<hobject_t, pg_missing_item>::const_iterator i =
416 needs_recovery_map.begin();
417 i != needs_recovery_map.end();
418 ++i) {
419 if (is_unfound(i->first))
420 ++ret;
421 }
422 return ret;
423 }
424
425 bool have_unfound() const {
426 for (map<hobject_t, pg_missing_item>::const_iterator i =
427 needs_recovery_map.begin();
428 i != needs_recovery_map.end();
429 ++i) {
430 if (is_unfound(i->first))
431 return true;
432 }
433 return false;
434 }
435 void clear() {
436 needs_recovery_map.clear();
437 missing_loc.clear();
438 missing_loc_sources.clear();
439 }
440
441 void add_location(const hobject_t &hoid, pg_shard_t location) {
442 missing_loc[hoid].insert(location);
443 }
444 void remove_location(const hobject_t &hoid, pg_shard_t location) {
445 missing_loc[hoid].erase(location);
446 }
447 void add_active_missing(const pg_missing_t &missing) {
448 for (map<hobject_t, pg_missing_item>::const_iterator i =
449 missing.get_items().begin();
450 i != missing.get_items().end();
451 ++i) {
452 map<hobject_t, pg_missing_item>::const_iterator j =
453 needs_recovery_map.find(i->first);
454 if (j == needs_recovery_map.end()) {
455 needs_recovery_map.insert(*i);
456 } else {
457 assert(i->second.need == j->second.need);
458 }
459 }
460 }
461
462 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
463 needs_recovery_map[hoid] = pg_missing_item(need, have);
464 }
465 void revise_need(const hobject_t &hoid, eversion_t need) {
466 assert(needs_recovery(hoid));
467 needs_recovery_map[hoid].need = need;
468 }
469
470 /// Adds info about a possible recovery source
471 bool add_source_info(
472 pg_shard_t source, ///< [in] source
473 const pg_info_t &oinfo, ///< [in] info
474 const pg_missing_t &omissing, ///< [in] (optional) missing
475 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
476 ); ///< @return whether a new object location was discovered
477
478 /// Adds recovery sources in batch
479 void add_batch_sources_info(
480       const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
481 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
482 );
483
484     /// Uses osdmap to update structures for sources that are now down
485 void check_recovery_sources(const OSDMapRef& osdmap);
486
487 /// Call when hoid is no longer missing in acting set
488 void recovered(const hobject_t &hoid) {
489 needs_recovery_map.erase(hoid);
490 missing_loc.erase(hoid);
491 }
492
493 /// Call to update structures for hoid after a change
494 void rebuild(
495 const hobject_t &hoid,
496 pg_shard_t self,
497 const set<pg_shard_t> to_recover,
498 const pg_info_t &info,
499 const pg_missing_t &missing,
500 const map<pg_shard_t, pg_missing_t> &pmissing,
501 const map<pg_shard_t, pg_info_t> &pinfo) {
502 recovered(hoid);
503 boost::optional<pg_missing_item> item;
504 auto miter = missing.get_items().find(hoid);
505 if (miter != missing.get_items().end()) {
506 item = miter->second;
507 } else {
508 for (auto &&i: to_recover) {
509 if (i == self)
510 continue;
511 auto pmiter = pmissing.find(i);
512 assert(pmiter != pmissing.end());
513 miter = pmiter->second.get_items().find(hoid);
514 if (miter != pmiter->second.get_items().end()) {
515 item = miter->second;
516 break;
517 }
518 }
519 }
520 if (!item)
521 return; // recovered!
522
523 needs_recovery_map[hoid] = *item;
524 auto mliter =
525 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
526 assert(info.last_backfill.is_max());
527 assert(info.last_update >= item->need);
528 if (!missing.is_missing(hoid))
529 mliter->second.insert(self);
530 for (auto &&i: pmissing) {
531 auto pinfoiter = pinfo.find(i.first);
532 assert(pinfoiter != pinfo.end());
533 if (item->need <= pinfoiter->second.last_update &&
534 hoid <= pinfoiter->second.last_backfill &&
535 !i.second.is_missing(hoid))
536 mliter->second.insert(i.first);
537 }
538 }
539
540 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
541 return missing_loc.count(hoid) ?
542 missing_loc.find(hoid)->second : empty_set;
543 }
544 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
545 return missing_loc;
546 }
547 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
548 return needs_recovery_map;
549 }
550 } missing_loc;
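  // Hedged illustration of the intended semantics (object names and shards
  // are placeholders): an object counts as unfound until enough locations
  // are known to satisfy the backend's IsPGRecoverablePredicate.
  //
  //   missing_loc.add_missing(oid, need, have);    // primary needs recovery of oid
  //   bool unfound = missing_loc.is_unfound(oid);  // true: no known locations yet
  //   missing_loc.add_location(oid, pg_shard_t(3, shard_id_t::NO_SHARD));
  //   unfound = missing_loc.is_unfound(oid);       // false once the predicate is satisfied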
551
552 PastIntervals past_intervals;
553
554 interval_set<snapid_t> snap_trimq;
555
556 /* You should not use these items without taking their respective queue locks
557 * (if they have one) */
558 xlist<PG*>::item stat_queue_item;
559 bool scrub_queued;
560 bool recovery_queued;
561
562 int recovery_ops_active;
563 set<pg_shard_t> waiting_on_backfill;
564 #ifdef DEBUG_RECOVERY_OIDS
565 set<hobject_t> recovering_oids;
566 #endif
567
568 protected:
569   int role;    // 0 = primary, 1 = replica, -1 = none
570 unsigned state; // PG_STATE_*
571
572 bool send_notify; ///< true if we are non-primary and should notify the primary
573
574 public:
575 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
576 eversion_t last_complete_ondisk; // last_complete that has committed.
577 eversion_t last_update_applied;
578
579
580 struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
581 PGRef pg;
582 epoch_t e;
583 eversion_t v;
584 C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
585 : pg(pg), e(e), v(v) {}
586 void finish(int) override {
587 pg->lock();
588 if (!pg->pg_has_reset_since(e)) {
589 pg->last_rollback_info_trimmed_to_applied = v;
590 }
591 pg->unlock();
592 }
593 };
594 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
595 // and the transaction has applied
596 eversion_t last_rollback_info_trimmed_to_applied;
597
598 // primary state
599 public:
600 pg_shard_t primary;
601 pg_shard_t pg_whoami;
602 pg_shard_t up_primary;
603 vector<int> up, acting, want_acting;
604 set<pg_shard_t> actingbackfill, actingset, upset;
605 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
606 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
607 eversion_t pg_trim_to;
608
609 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
610
611 // [primary only] content recovery state
612
613 public:
614 struct BufferedRecoveryMessages {
615 map<int, map<spg_t, pg_query_t> > query_map;
616 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
617 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
618 };
619
620 struct RecoveryCtx {
621 utime_t start_time;
622 map<int, map<spg_t, pg_query_t> > *query_map;
623 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
624 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
625 set<PGRef> created_pgs;
626 C_Contexts *on_applied;
627 C_Contexts *on_safe;
628 ObjectStore::Transaction *transaction;
629 ThreadPool::TPHandle* handle;
630 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
631 map<int,
632 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
633 map<int,
634 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
635 C_Contexts *on_applied,
636 C_Contexts *on_safe,
637 ObjectStore::Transaction *transaction)
638 : query_map(query_map), info_map(info_map),
639 notify_list(notify_list),
640 on_applied(on_applied),
641 on_safe(on_safe),
642 transaction(transaction),
643 handle(NULL) {}
644
645 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
646 : query_map(&(buf.query_map)),
647 info_map(&(buf.info_map)),
648 notify_list(&(buf.notify_list)),
649 on_applied(rctx.on_applied),
650 on_safe(rctx.on_safe),
651 transaction(rctx.transaction),
652 handle(rctx.handle) {}
653
654 void accept_buffered_messages(BufferedRecoveryMessages &m) {
655 assert(query_map);
656 assert(info_map);
657 assert(notify_list);
658 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
659 i != m.query_map.end();
660 ++i) {
661 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
662 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
663 j != i->second.end();
664 ++j) {
665 omap[j->first] = j->second;
666 }
667 }
668 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
669 = m.info_map.begin();
670 i != m.info_map.end();
671 ++i) {
672 vector<pair<pg_notify_t, PastIntervals> > &ovec =
673 (*info_map)[i->first];
674 ovec.reserve(ovec.size() + i->second.size());
675 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
676 }
677 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
678 = m.notify_list.begin();
679 i != m.notify_list.end();
680 ++i) {
681 vector<pair<pg_notify_t, PastIntervals> > &ovec =
682 (*notify_list)[i->first];
683 ovec.reserve(ovec.size() + i->second.size());
684 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
685 }
686 }
687 };
688
689
690 PGStateHistory pgstate_history;
691
692 struct NamedState {
693 const char *state_name;
694 utime_t enter_time;
695 PG* pg;
696 const char *get_state_name() { return state_name; }
697 NamedState(PG *pg_, const char *state_name_)
698 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
699 pg->pgstate_history.enter(pg, enter_time, state_name);
700 }
701 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
702 };
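  // Hedged sketch: recovery-machine states derive from NamedState, so their
  // enter and exit times are recorded automatically via RAII (the state name
  // below is illustrative):
  //
  //   struct ExampleState : NamedState {
  //     explicit ExampleState(PG *pg) : NamedState(pg, "ExampleState") {}
  //     // ~NamedState() records the exit when the state object is destroyed
  //   };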
703
704
705
706 protected:
707
708 /*
709 * peer_info -- projected (updates _before_ replicas ack)
710 * peer_missing -- committed (updates _after_ replicas ack)
711 */
712
713 bool need_up_thru;
714 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
715 eversion_t oldest_update; // acting: lowest (valid) last_update in active set
716 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
717 set<pg_shard_t> peer_purged; // peers purged
718 map<pg_shard_t, pg_missing_t> peer_missing;
719   set<pg_shard_t>    peer_log_requested;  // logs I've requested (and start stamps)
720 set<pg_shard_t> peer_missing_requested;
721
722   // I deleted these strays; ignore racing PGInfo from them
723 set<pg_shard_t> peer_activated;
724
725 // primary-only, recovery-only state
726 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
727 // which are unfound on the primary
728 epoch_t last_peering_reset;
729
730
731 /* heartbeat peers */
732 void set_probe_targets(const set<pg_shard_t> &probe_set);
733 void clear_probe_targets();
734 public:
735 Mutex heartbeat_peer_lock;
736 set<int> heartbeat_peers;
737 set<int> probe_targets;
738
739 /**
740 * BackfillInterval
741 *
742 * Represents the objects in a range [begin, end)
743 *
744 * Possible states:
745    * 1) begin == end == hobject_t() indicates the interval is unpopulated
746 * 2) Else, objects contains all objects in [begin, end)
747 */
748 struct BackfillInterval {
749 // info about a backfill interval on a peer
750 eversion_t version; /// version at which the scan occurred
751 map<hobject_t,eversion_t> objects;
752 hobject_t begin;
753 hobject_t end;
754
755 /// clear content
756 void clear() {
757 *this = BackfillInterval();
758 }
759
760 /// clear objects list only
761 void clear_objects() {
762 objects.clear();
763 }
764
765 /// reinstantiate with a new start+end position and sort order
766 void reset(hobject_t start) {
767 clear();
768 begin = end = start;
769 }
770
771 /// true if there are no objects in this interval
772 bool empty() const {
773 return objects.empty();
774 }
775
776 /// true if interval extends to the end of the range
777 bool extends_to_end() const {
778 return end.is_max();
779 }
780
781 /// removes items <= soid and adjusts begin to the first object
782 void trim_to(const hobject_t &soid) {
783 trim();
784 while (!objects.empty() &&
785 objects.begin()->first <= soid) {
786 pop_front();
787 }
788 }
789
790 /// Adjusts begin to the first object
791 void trim() {
792 if (!objects.empty())
793 begin = objects.begin()->first;
794 else
795 begin = end;
796 }
797
798 /// drop first entry, and adjust @begin accordingly
799 void pop_front() {
800 assert(!objects.empty());
801 objects.erase(objects.begin());
802 trim();
803 }
804
805 /// dump
806 void dump(Formatter *f) const {
807 f->dump_stream("begin") << begin;
808 f->dump_stream("end") << end;
809 f->open_array_section("objects");
810 for (map<hobject_t, eversion_t>::const_iterator i =
811 objects.begin();
812 i != objects.end();
813 ++i) {
814 f->open_object_section("object");
815 f->dump_stream("object") << i->first;
816 f->dump_stream("version") << i->second;
817 f->close_section();
818 }
819 f->close_section();
820 }
821 };
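  // Hedged illustration of trim_to()/pop_front() semantics (objects and
  // versions are placeholders): begin always tracks the first remaining
  // object, and falls back to end once the interval is drained.
  //
  //   bi.reset(hobject_t());        // begin == end == hobject_t(): unpopulated
  //   // ... a scan fills bi.objects and bi.end ...
  //   bi.trim_to(last_backfilled);  // drop all objects <= last_backfilled
  //   if (bi.empty() && bi.extends_to_end()) { /* this peer's backfill is done */ }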
822
823 protected:
824 BackfillInterval backfill_info;
825 map<pg_shard_t, BackfillInterval> peer_backfill_info;
826 bool backfill_reserved;
827 bool backfill_reserving;
828
829 friend class OSD;
830
831 public:
832 set<pg_shard_t> backfill_targets;
833
834 bool is_backfill_targets(pg_shard_t osd) {
835 return backfill_targets.count(osd);
836 }
837
838 protected:
839
840 /*
841 * blocked request wait hierarchy
842 *
843 * In order to preserve request ordering we need to be careful about the
844 * order in which blocked requests get requeued. Generally speaking, we
845 * push the requests back up to the op_wq in reverse order (most recent
846 * request first) so that they come back out again in the original order.
847 * However, because there are multiple wait queues, we need to requeue
848 * waitlists in order. Generally speaking, we requeue the wait lists
849    * in the order in which they are checked during request processing.
850 *
851 * Here are the various wait lists, in the order they are used during
852 * request processing, with notes:
853 *
854 * - waiting_for_map
855 * - may start or stop blocking at any time (depending on client epoch)
856 * - waiting_for_peered
857 * - !is_peered() or flushes_in_progress
858 * - only starts blocking on interval change; never restarts
859 * - waiting_for_active
860 * - !is_active()
861 * - only starts blocking on interval change; never restarts
862 * - waiting_for_scrub
863 * - starts and stops blocking for varying intervals during scrub
864 * - waiting_for_unreadable_object
865 * - never restarts once object is readable (* except for EIO?)
866 * - waiting_for_degraded_object
867 * - never restarts once object is writeable (* except for EIO?)
868 * - waiting_for_blocked_object
869 * - starts and stops based on proxied op activity
870 * - obc rwlocks
871 * - starts and stops based on read/write activity
872 *
873 * Notes:
874 *
875    * 1. During an interval change, we requeue *everything* in the above order.
876 *
877 * 2. When an obc rwlock is released, we check for a scrub block and requeue
878 * the op there if it applies. We ignore the unreadable/degraded/blocked
879 * queues because we assume they cannot apply at that time (this is
880 * probably mostly true).
881 *
882 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
883 * it is non-empty.
884 *
885 * These three behaviors are generally sufficient to maintain ordering, with
886 * the possible exception of cases where we make an object degraded or
887 * unreadable that was previously okay, e.g. when scrub or op processing
888 * encounter an unexpected error. FIXME.
889 */
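  // Illustrative sketch of the requeue rule from note (1) above: within a
  // single wait list, ops are pushed back to the front of the op queue in
  // reverse order, which restores their original arrival order.
  //
  //   blocked list : [op1, op2, op3]          (op1 arrived first)
  //   requeue      : push_front(op3); push_front(op2); push_front(op1);
  //   op queue     : [op1, op2, op3, ...]     (original order preserved)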
890
891 // pg waiters
892 unsigned flushes_in_progress;
893
894 // ops with newer maps than our (or blocked behind them)
895 // track these by client, since inter-request ordering doesn't otherwise
896 // matter.
897 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
898
899 // ops waiting on peered
900 list<OpRequestRef> waiting_for_peered;
901
902 // ops waiting on active (require peered as well)
903 list<OpRequestRef> waiting_for_active;
904 list<OpRequestRef> waiting_for_scrub;
905
906 list<OpRequestRef> waiting_for_cache_not_full;
907 list<OpRequestRef> waiting_for_all_missing;
908 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
909 waiting_for_degraded_object,
910 waiting_for_blocked_object;
911
912 set<hobject_t> objects_blocked_on_cache_full;
913 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
914 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
915
916 // Callbacks should assume pg (and nothing else) is locked
917 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
918
919 map<eversion_t,
920 list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
921
922 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
923 void requeue_op(OpRequestRef op);
924 void requeue_ops(list<OpRequestRef> &l);
925
926 // stats that persist lazily
927 object_stat_collection_t unstable_stats;
928
929 // publish stats
930 Mutex pg_stats_publish_lock;
931 bool pg_stats_publish_valid;
932 pg_stat_t pg_stats_publish;
933
934 // for ordering writes
935 ceph::shared_ptr<ObjectStore::Sequencer> osr;
936
937 void _update_calc_stats();
938 void _update_blocked_by();
939 void publish_stats_to_osd();
940 void clear_publish_stats();
941
942 public:
943 void clear_primary_state();
944
945 public:
946 bool is_actingbackfill(pg_shard_t osd) const {
947 return actingbackfill.count(osd);
948 }
949 bool is_acting(pg_shard_t osd) const {
950 return has_shard(pool.info.ec_pool(), acting, osd);
951 }
952 bool is_up(pg_shard_t osd) const {
953 return has_shard(pool.info.ec_pool(), up, osd);
954 }
955 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
956 if (ec) {
957 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
958 } else {
959 return std::find(v.begin(), v.end(), osd.osd) != v.end();
960 }
961 }
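  // Example (values are illustrative): for an EC pool with acting = [3,7,9],
  // has_shard(true, acting, pg_shard_t(7, shard_id_t(1))) is true because
  // acting[1] == 7; for a replicated pool the shard id is ignored and any
  // position holding osd 7 matches.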
962
963 bool needs_recovery() const;
964 bool needs_backfill() const;
965
966 /// get log recovery reservation priority
967 unsigned get_recovery_priority();
968 /// get backfill reservation priority
969 unsigned get_backfill_priority();
970
971 void mark_clean(); ///< mark an active pg clean
972
973 /// return [start,end) bounds for required past_intervals
974 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
975 const pg_info_t &info,
976 epoch_t oldest_map) {
977 epoch_t start = MAX(
978 info.history.last_epoch_clean ? info.history.last_epoch_clean :
979 info.history.epoch_created,
980 oldest_map);
981 epoch_t end = MAX(
982 info.history.same_interval_since,
983 info.history.epoch_created);
984 return make_pair(start, end);
985 }
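  // Worked example with hypothetical epochs: last_epoch_clean = 200,
  // epoch_created = 100, oldest_map = 150, same_interval_since = 250:
  //   start = MAX(200, 150) = 200
  //   end   = MAX(250, 100) = 250
  // so past_intervals must cover [200, 250).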
986 void check_past_interval_bounds() const;
987 PastIntervals::PriorSet build_prior();
988
989 void remove_down_peer_info(const OSDMapRef osdmap);
990
991 bool adjust_need_up_thru(const OSDMapRef osdmap);
992
993 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
994 virtual void dump_recovery_info(Formatter *f) const = 0;
995
996 bool calc_min_last_complete_ondisk() {
997 eversion_t min = last_complete_ondisk;
998 assert(!actingbackfill.empty());
999 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1000 i != actingbackfill.end();
1001 ++i) {
1002 if (*i == get_primary()) continue;
1003 if (peer_last_complete_ondisk.count(*i) == 0)
1004 return false; // we don't have complete info
1005 eversion_t a = peer_last_complete_ondisk[*i];
1006 if (a < min)
1007 min = a;
1008 }
1009 if (min == min_last_complete_ondisk)
1010 return false;
1011 min_last_complete_ondisk = min;
1012 return true;
1013 }
1014
1015 virtual void calc_trim_to() = 0;
1016
1017 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1018 pg_missing_t& omissing, pg_shard_t from);
1019 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1020 pg_missing_t& omissing, pg_shard_t from);
1021 bool proc_replica_info(
1022 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1023
1024 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1025 PG *pg;
1026 ObjectStore::Transaction *t;
1027 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1028
1029 // LogEntryHandler
1030 void remove(const hobject_t &hoid) override {
1031 pg->get_pgbackend()->remove(hoid, t);
1032 }
1033 void try_stash(const hobject_t &hoid, version_t v) override {
1034 pg->get_pgbackend()->try_stash(hoid, v, t);
1035 }
1036 void rollback(const pg_log_entry_t &entry) override {
1037 assert(entry.can_rollback());
1038 pg->get_pgbackend()->rollback(entry, t);
1039 }
1040 void rollforward(const pg_log_entry_t &entry) override {
1041 pg->get_pgbackend()->rollforward(entry, t);
1042 }
1043 void trim(const pg_log_entry_t &entry) override {
1044 pg->get_pgbackend()->trim(entry, t);
1045 }
1046 };
1047
1048 void update_object_snap_mapping(
1049 ObjectStore::Transaction *t, const hobject_t &soid,
1050 const set<snapid_t> &snaps);
1051 void clear_object_snap_mapping(
1052 ObjectStore::Transaction *t, const hobject_t &soid);
1053 void remove_snap_mapped_object(
1054 ObjectStore::Transaction& t, const hobject_t& soid);
1055 void merge_log(
1056 ObjectStore::Transaction& t, pg_info_t &oinfo,
1057 pg_log_t &olog, pg_shard_t from);
1058 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1059 bool search_for_missing(
1060 const pg_info_t &oinfo, const pg_missing_t &omissing,
1061 pg_shard_t fromosd,
1062 RecoveryCtx*);
1063
1064 void check_for_lost_objects();
1065 void forget_lost_objects();
1066
1067 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1068
1069 void trim_write_ahead();
1070
1071 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1072 const map<pg_shard_t, pg_info_t> &infos,
1073 bool restrict_to_up_acting,
1074 bool *history_les_bound) const;
1075 static void calc_ec_acting(
1076 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1077 unsigned size,
1078 const vector<int> &acting,
1079 pg_shard_t acting_primary,
1080 const vector<int> &up,
1081 pg_shard_t up_primary,
1082 const map<pg_shard_t, pg_info_t> &all_info,
1083 bool compat_mode,
1084 bool restrict_to_up_acting,
1085 vector<int> *want,
1086 set<pg_shard_t> *backfill,
1087 set<pg_shard_t> *acting_backfill,
1088 pg_shard_t *want_primary,
1089 ostream &ss);
1090 static void calc_replicated_acting(
1091 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1092 unsigned size,
1093 const vector<int> &acting,
1094 pg_shard_t acting_primary,
1095 const vector<int> &up,
1096 pg_shard_t up_primary,
1097 const map<pg_shard_t, pg_info_t> &all_info,
1098 bool compat_mode,
1099 bool restrict_to_up_acting,
1100 vector<int> *want,
1101 set<pg_shard_t> *backfill,
1102 set<pg_shard_t> *acting_backfill,
1103 pg_shard_t *want_primary,
1104 ostream &ss);
1105 bool choose_acting(pg_shard_t &auth_log_shard,
1106 bool restrict_to_up_acting,
1107 bool *history_les_bound);
1108 void build_might_have_unfound();
1109 void activate(
1110 ObjectStore::Transaction& t,
1111 epoch_t activation_epoch,
1112 list<Context*>& tfin,
1113 map<int, map<spg_t,pg_query_t> >& query_map,
1114 map<int,
1115 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1116 RecoveryCtx *ctx);
1117 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1118 void all_activated_and_committed();
1119
1120 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1121
1122 bool have_unfound() const {
1123 return missing_loc.have_unfound();
1124 }
1125 uint64_t get_num_unfound() const {
1126 return missing_loc.num_unfound();
1127 }
1128
1129 virtual void check_local() = 0;
1130
1131 /**
1132 * @param ops_begun returns how many recovery ops the function started
1133 * @returns true if any useful work was accomplished; false otherwise
1134 */
1135 virtual bool start_recovery_ops(
1136 uint64_t max,
1137 ThreadPool::TPHandle &handle,
1138 uint64_t *ops_begun) = 0;
1139
1140 void purge_strays();
1141
1142 void update_heartbeat_peers();
1143
1144 Context *finish_sync_event;
1145
1146 void finish_recovery(list<Context*>& tfin);
1147 void _finish_recovery(Context *c);
1148 void cancel_recovery();
1149 void clear_recovery_state();
1150 virtual void _clear_recovery_state() = 0;
1151 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1152 void start_recovery_op(const hobject_t& soid);
1153 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1154
1155 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
1156 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1157
1158 friend class C_OSD_RepModify_Commit;
1159
1160 // -- backoff --
1161 Mutex backoff_lock; // orders inside Backoff::lock
1162 map<hobject_t,set<BackoffRef>> backoffs;
1163
1164 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1165 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1166 void release_backoffs(const hobject_t& o) {
1167 release_backoffs(o, o);
1168 }
1169 void clear_backoffs();
1170
1171 void add_pg_backoff(SessionRef s) {
1172 hobject_t begin = info.pgid.pgid.get_hobj_start();
1173 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1174 add_backoff(s, begin, end);
1175 }
1176 void release_pg_backoffs() {
1177 hobject_t begin = info.pgid.pgid.get_hobj_start();
1178 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1179 release_backoffs(begin, end);
1180 }
1181
1182 void rm_backoff(BackoffRef b);
1183
1184 // -- scrub --
1185 struct Scrubber {
1186 Scrubber();
1187 ~Scrubber();
1188
1189 // metadata
1190 set<pg_shard_t> reserved_peers;
1191 bool reserved, reserve_failed;
1192 epoch_t epoch_start;
1193
1194 // common to both scrubs
1195 bool active;
1196 bool queue_snap_trim;
1197 int waiting_on;
1198 set<pg_shard_t> waiting_on_whom;
1199 int shallow_errors;
1200 int deep_errors;
1201 int fixed;
1202 ScrubMap primary_scrubmap;
1203 map<pg_shard_t, ScrubMap> received_maps;
1204 OpRequestRef active_rep_scrub;
1205 utime_t scrub_reg_stamp; // stamp we registered for
1206
1207 // For async sleep
1208 bool sleeping = false;
1209 bool needs_sleep = true;
1210 utime_t sleep_start;
1211
1212 // flags to indicate explicitly requested scrubs (by admin)
1213 bool must_scrub, must_deep_scrub, must_repair;
1214
1215 // Priority to use for scrub scheduling
1216 unsigned priority;
1217
1218 // this flag indicates whether we would like to do auto-repair of the PG or not
1219 bool auto_repair;
1220
1221 // Maps from objects with errors to missing/inconsistent peers
1222 map<hobject_t, set<pg_shard_t>> missing;
1223 map<hobject_t, set<pg_shard_t>> inconsistent;
1224
1225 // Map from object with errors to good peers
1226 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1227
1228 // Cleaned map pending snap metadata scrub
1229 ScrubMap cleaned_meta_map;
1230
1231 // digest updates which we are waiting on
1232 int num_digest_updates_pending;
1233
1234 // chunky scrub
1235 hobject_t start, end;
1236 eversion_t subset_last_update;
1237
1238 // chunky scrub state
1239 enum State {
1240 INACTIVE,
1241 NEW_CHUNK,
1242 WAIT_PUSHES,
1243 WAIT_LAST_UPDATE,
1244 BUILD_MAP,
1245 WAIT_REPLICAS,
1246 COMPARE_MAPS,
1247 WAIT_DIGEST_UPDATES,
1248 FINISH,
1249 } state;
1250
1251 std::unique_ptr<Scrub::Store> store;
1252 // deep scrub
1253 bool deep;
1254 uint32_t seed;
1255
1256 list<Context*> callbacks;
1257 void add_callback(Context *context) {
1258 callbacks.push_back(context);
1259 }
1260 void run_callbacks() {
1261 list<Context*> to_run;
1262 to_run.swap(callbacks);
1263 for (list<Context*>::iterator i = to_run.begin();
1264 i != to_run.end();
1265 ++i) {
1266 (*i)->complete(0);
1267 }
1268 }
1269
1270 static const char *state_string(const PG::Scrubber::State& state) {
1271 const char *ret = NULL;
1272 switch( state )
1273 {
1274 case INACTIVE: ret = "INACTIVE"; break;
1275 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1276 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1277 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1278 case BUILD_MAP: ret = "BUILD_MAP"; break;
1279 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1280 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1281 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1282 case FINISH: ret = "FINISH"; break;
1283 }
1284 return ret;
1285 }
1286
1287 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1288
1289 // classic (non chunk) scrubs block all writes
1290 // chunky scrubs only block writes to a range
1291 bool write_blocked_by_scrub(const hobject_t &soid) {
1292 return (soid >= start && soid < end);
1293 }
1294
1295 // clear all state
1296 void reset() {
1297 active = false;
1298 queue_snap_trim = false;
1299 waiting_on = 0;
1300 waiting_on_whom.clear();
1301 if (active_rep_scrub) {
1302 active_rep_scrub = OpRequestRef();
1303 }
1304 received_maps.clear();
1305
1306 must_scrub = false;
1307 must_deep_scrub = false;
1308 must_repair = false;
1309 auto_repair = false;
1310
1311 state = PG::Scrubber::INACTIVE;
1312 start = hobject_t();
1313 end = hobject_t();
1314 subset_last_update = eversion_t();
1315 shallow_errors = 0;
1316 deep_errors = 0;
1317 fixed = 0;
1318 deep = false;
1319 seed = 0;
1320 run_callbacks();
1321 inconsistent.clear();
1322 missing.clear();
1323 authoritative.clear();
1324 num_digest_updates_pending = 0;
1325 cleaned_meta_map = ScrubMap();
1326 sleeping = false;
1327 needs_sleep = true;
1328 sleep_start = utime_t();
1329 }
1330
1331 void create_results(const hobject_t& obj);
1332 void cleanup_store(ObjectStore::Transaction *t);
1333 } scrubber;
1334
1335 bool scrub_after_recovery;
1336
1337 int active_pushes;
1338
1339 void repair_object(
1340 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1341 pg_shard_t bad_peer);
1342
1343 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
1344 void chunky_scrub(ThreadPool::TPHandle &handle);
1345 void scrub_compare_maps();
1346 /**
1347 * return true if any inconsistency/missing is repaired, false otherwise
1348 */
1349 bool scrub_process_inconsistent();
1350 void scrub_finish();
1351 void scrub_clear_state();
1352 void _scan_snaps(ScrubMap &map);
1353 void _scan_rollback_obs(
1354 const vector<ghobject_t> &rollback_obs,
1355 ThreadPool::TPHandle &handle);
1356 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1357 hobject_t start, hobject_t end, bool deep,
1358 uint32_t seed);
1359 int build_scrub_map_chunk(
1360 ScrubMap &map,
1361 hobject_t start, hobject_t end, bool deep, uint32_t seed,
1362 ThreadPool::TPHandle &handle);
1363 /**
1364 * returns true if [begin, end) is good to scrub at this time
1365 * a false return value obliges the implementer to requeue scrub when the
1366 * condition preventing scrub clears
1367 */
1368 virtual bool _range_available_for_scrub(
1369 const hobject_t &begin, const hobject_t &end) = 0;
1370 virtual void scrub_snapshot_metadata(
1371 ScrubMap &map,
1372 const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
1373 virtual void _scrub_clear_state() { }
1374 virtual void _scrub_finish() { }
1375 virtual void split_colls(
1376 spg_t child,
1377 int split_bits,
1378 int seed,
1379 const pg_pool_t *pool,
1380 ObjectStore::Transaction *t) = 0;
1381 void clear_scrub_reserved();
1382 void scrub_reserve_replicas();
1383 void scrub_unreserve_replicas();
1384 bool scrub_all_replicas_reserved() const;
1385 bool sched_scrub();
1386 void reg_next_scrub();
1387 void unreg_next_scrub();
1388
1389 void replica_scrub(
1390 OpRequestRef op,
1391 ThreadPool::TPHandle &handle);
1392 void do_replica_scrub_map(OpRequestRef op);
1393 void sub_op_scrub_map(OpRequestRef op);
1394
1395 void handle_scrub_reserve_request(OpRequestRef op);
1396 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1397 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1398 void handle_scrub_reserve_release(OpRequestRef op);
1399
1400 void reject_reservation();
1401 void schedule_backfill_full_retry();
1402 void schedule_recovery_full_retry();
1403
1404 // -- recovery state --
1405
1406 template <class EVT>
1407 struct QueuePeeringEvt : Context {
1408 PGRef pg;
1409 epoch_t epoch;
1410 EVT evt;
1411 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1412 pg(pg), epoch(epoch), evt(evt) {}
1413 void finish(int r) override {
1414 pg->lock();
1415 pg->queue_peering_event(PG::CephPeeringEvtRef(
1416 new PG::CephPeeringEvt(
1417 epoch,
1418 epoch,
1419 evt)));
1420 pg->unlock();
1421 }
1422 };
1423
1424 class CephPeeringEvt {
1425 epoch_t epoch_sent;
1426 epoch_t epoch_requested;
1427 boost::intrusive_ptr< const boost::statechart::event_base > evt;
1428 string desc;
1429 public:
1430 MEMPOOL_CLASS_HELPERS();
1431 template <class T>
1432 CephPeeringEvt(epoch_t epoch_sent,
1433 epoch_t epoch_requested,
1434 const T &evt_) :
1435 epoch_sent(epoch_sent), epoch_requested(epoch_requested),
1436 evt(evt_.intrusive_from_this()) {
1437 stringstream out;
1438 out << "epoch_sent: " << epoch_sent
1439 << " epoch_requested: " << epoch_requested << " ";
1440 evt_.print(&out);
1441 desc = out.str();
1442 }
1443 epoch_t get_epoch_sent() { return epoch_sent; }
1444 epoch_t get_epoch_requested() { return epoch_requested; }
1445 const boost::statechart::event_base &get_event() { return *evt; }
1446 string get_desc() { return desc; }
1447 };
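  // Hedged sketch of how a peering event is typically wrapped and queued,
  // mirroring QueuePeeringEvt::finish() above (epoch values are placeholders):
  //
  //   pg->queue_peering_event(
  //     CephPeeringEvtRef(new CephPeeringEvt(send_epoch, request_epoch, ActMap())));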
1448 typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
1449 list<CephPeeringEvtRef> peering_queue; // op queue
1450 list<CephPeeringEvtRef> peering_waiters;
1451
1452 struct QueryState : boost::statechart::event< QueryState > {
1453 Formatter *f;
1454 explicit QueryState(Formatter *f) : f(f) {}
1455 void print(std::ostream *out) const {
1456 *out << "Query";
1457 }
1458 };
1459
1460 struct MInfoRec : boost::statechart::event< MInfoRec > {
1461 pg_shard_t from;
1462 pg_info_t info;
1463 epoch_t msg_epoch;
1464 MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
1465 from(from), info(info), msg_epoch(msg_epoch) {}
1466 void print(std::ostream *out) const {
1467 *out << "MInfoRec from " << from << " info: " << info;
1468 }
1469 };
1470
1471 struct MLogRec : boost::statechart::event< MLogRec > {
1472 pg_shard_t from;
1473 boost::intrusive_ptr<MOSDPGLog> msg;
1474 MLogRec(pg_shard_t from, MOSDPGLog *msg) :
1475 from(from), msg(msg) {}
1476 void print(std::ostream *out) const {
1477 *out << "MLogRec from " << from;
1478 }
1479 };
1480
1481 struct MNotifyRec : boost::statechart::event< MNotifyRec > {
1482 pg_shard_t from;
1483 pg_notify_t notify;
1484 uint64_t features;
1485 MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
1486 from(from), notify(notify), features(f) {}
1487 void print(std::ostream *out) const {
1488 *out << "MNotifyRec from " << from << " notify: " << notify
1489 << " features: 0x" << hex << features << dec;
1490 }
1491 };
1492
1493 struct MQuery : boost::statechart::event< MQuery > {
1494 pg_shard_t from;
1495 pg_query_t query;
1496 epoch_t query_epoch;
1497 MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
1498 from(from), query(query), query_epoch(query_epoch) {}
1499 void print(std::ostream *out) const {
1500 *out << "MQuery from " << from
1501 << " query_epoch " << query_epoch
1502 << " query: " << query;
1503 }
1504 };
1505
1506 struct AdvMap : boost::statechart::event< AdvMap > {
1507 OSDMapRef osdmap;
1508 OSDMapRef lastmap;
1509 vector<int> newup, newacting;
1510 int up_primary, acting_primary;
1511 AdvMap(
1512 OSDMapRef osdmap, OSDMapRef lastmap,
1513 vector<int>& newup, int up_primary,
1514 vector<int>& newacting, int acting_primary):
1515 osdmap(osdmap), lastmap(lastmap),
1516 newup(newup),
1517 newacting(newacting),
1518 up_primary(up_primary),
1519 acting_primary(acting_primary) {}
1520 void print(std::ostream *out) const {
1521 *out << "AdvMap";
1522 }
1523 };
1524
1525 struct ActMap : boost::statechart::event< ActMap > {
1526 ActMap() : boost::statechart::event< ActMap >() {}
1527 void print(std::ostream *out) const {
1528 *out << "ActMap";
1529 }
1530 };
1531 struct Activate : boost::statechart::event< Activate > {
1532 epoch_t activation_epoch;
1533 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
1534 activation_epoch(q) {}
1535 void print(std::ostream *out) const {
1536 *out << "Activate from " << activation_epoch;
1537 }
1538 };
1539 struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
1540 unsigned priority;
1541 explicit RequestBackfillPrio(unsigned prio) :
1542 boost::statechart::event< RequestBackfillPrio >(),
1543 priority(prio) {}
1544 void print(std::ostream *out) const {
1545 *out << "RequestBackfillPrio: priority " << priority;
1546 }
1547 };
1548 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1549 T() : boost::statechart::event< T >() {} \
1550 void print(std::ostream *out) const { \
1551 *out << #T; \
1552 } \
1553 };
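  // For reference, a TrivialEvent(Initialize) invocation below expands to
  // roughly:
  //
  //   struct Initialize : boost::statechart::event< Initialize > {
  //     Initialize() : boost::statechart::event< Initialize >() {}
  //     void print(std::ostream *out) const { *out << "Initialize"; }
  //   };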
1554 TrivialEvent(Initialize)
1555 TrivialEvent(Load)
1556 TrivialEvent(GotInfo)
1557 TrivialEvent(NeedUpThru)
1558 TrivialEvent(NullEvt)
1559 TrivialEvent(FlushedEvt)
1560 TrivialEvent(Backfilled)
1561 TrivialEvent(LocalBackfillReserved)
1562 TrivialEvent(RemoteBackfillReserved)
1563 TrivialEvent(RemoteReservationRejected)
1564 TrivialEvent(RequestBackfill)
1565 TrivialEvent(RequestRecovery)
1566 TrivialEvent(RecoveryDone)
1567 TrivialEvent(BackfillTooFull)
1568 TrivialEvent(RecoveryTooFull)
1569
1570 TrivialEvent(AllReplicasRecovered)
1571 TrivialEvent(DoRecovery)
1572 TrivialEvent(LocalRecoveryReserved)
1573 TrivialEvent(RemoteRecoveryReserved)
1574 TrivialEvent(AllRemotesReserved)
1575 TrivialEvent(AllBackfillsReserved)
1576 TrivialEvent(GoClean)
1577
1578 TrivialEvent(AllReplicasActivated)
1579
1580 TrivialEvent(IntervalFlush)
1581
1582 /* Encapsulates PG recovery process */
1583 class RecoveryState {
1584 void start_handle(RecoveryCtx *new_ctx);
1585 void end_handle();
1586 public:
1587 void begin_block_outgoing();
1588 void end_block_outgoing();
1589 void clear_blocked_outgoing();
1590 private:
1591
1592 /* States */
1593 struct Initial;
1594 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1595 RecoveryState *state;
1596 public:
1597 PG *pg;
1598
1599 utime_t event_time;
1600 uint64_t event_count;
1601
1602 void clear_event_counters() {
1603 event_time = utime_t();
1604 event_count = 0;
1605 }
1606
1607 void log_enter(const char *state_name);
1608 void log_exit(const char *state_name, utime_t duration);
1609
1610 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1611
1612 /* Accessor functions for state methods */
1613 ObjectStore::Transaction* get_cur_transaction() {
1614 assert(state->rctx);
1615 assert(state->rctx->transaction);
1616 return state->rctx->transaction;
1617 }
1618
1619 void send_query(pg_shard_t to, const pg_query_t &query) {
1620 assert(state->rctx);
1621 assert(state->rctx->query_map);
1622 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1623 query;
1624 }
1625
1626 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1627 assert(state->rctx);
1628 assert(state->rctx->query_map);
1629 return state->rctx->query_map;
1630 }
1631
1632 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1633 assert(state->rctx);
1634 assert(state->rctx->info_map);
1635 return state->rctx->info_map;
1636 }
1637
1638 list< Context* > *get_on_safe_context_list() {
1639 assert(state->rctx);
1640 assert(state->rctx->on_safe);
1641 return &(state->rctx->on_safe->contexts);
1642 }
1643
1644 list< Context * > *get_on_applied_context_list() {
1645 assert(state->rctx);
1646 assert(state->rctx->on_applied);
1647 return &(state->rctx->on_applied->contexts);
1648 }
1649
1650 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1651
1652 void send_notify(pg_shard_t to,
1653 const pg_notify_t &info, const PastIntervals &pi) {
1654 assert(state->rctx);
1655 assert(state->rctx->notify_list);
1656 (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
1657 }
1658 };
1659 friend class RecoveryMachine;
1660
1661 /* States */
1662
1663 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1664 explicit Crashed(my_context ctx);
1665 };
1666
1667 struct Reset;
1668
1669 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1670 explicit Initial(my_context ctx);
1671 void exit();
1672
1673 typedef boost::mpl::list <
1674 boost::statechart::transition< Initialize, Reset >,
1675 boost::statechart::custom_reaction< Load >,
1676 boost::statechart::custom_reaction< NullEvt >,
1677 boost::statechart::transition< boost::statechart::event_base, Crashed >
1678 > reactions;
1679
1680 boost::statechart::result react(const Load&);
1681 boost::statechart::result react(const MNotifyRec&);
1682 boost::statechart::result react(const MInfoRec&);
1683 boost::statechart::result react(const MLogRec&);
1684 boost::statechart::result react(const boost::statechart::event_base&) {
1685 return discard_event();
1686 }
1687 };
1688
1689 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1690 explicit Reset(my_context ctx);
1691 void exit();
1692
1693 typedef boost::mpl::list <
1694 boost::statechart::custom_reaction< QueryState >,
1695 boost::statechart::custom_reaction< AdvMap >,
1696 boost::statechart::custom_reaction< ActMap >,
1697 boost::statechart::custom_reaction< NullEvt >,
1698 boost::statechart::custom_reaction< FlushedEvt >,
1699 boost::statechart::custom_reaction< IntervalFlush >,
1700 boost::statechart::transition< boost::statechart::event_base, Crashed >
1701 > reactions;
1702 boost::statechart::result react(const QueryState& q);
1703 boost::statechart::result react(const AdvMap&);
1704 boost::statechart::result react(const ActMap&);
1705 boost::statechart::result react(const FlushedEvt&);
1706 boost::statechart::result react(const IntervalFlush&);
1707 boost::statechart::result react(const boost::statechart::event_base&) {
1708 return discard_event();
1709 }
1710 };
1711
1712 struct Start;
1713
1714 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1715 explicit Started(my_context ctx);
1716 void exit();
1717
1718 typedef boost::mpl::list <
1719 boost::statechart::custom_reaction< QueryState >,
1720 boost::statechart::custom_reaction< AdvMap >,
1721 boost::statechart::custom_reaction< NullEvt >,
1722 boost::statechart::custom_reaction< FlushedEvt >,
1723 boost::statechart::custom_reaction< IntervalFlush >,
1724 boost::statechart::transition< boost::statechart::event_base, Crashed >
1725 > reactions;
1726 boost::statechart::result react(const QueryState& q);
1727 boost::statechart::result react(const AdvMap&);
1728 boost::statechart::result react(const FlushedEvt&);
1729 boost::statechart::result react(const IntervalFlush&);
1730 boost::statechart::result react(const boost::statechart::event_base&) {
1731 return discard_event();
1732 }
1733 };
1734
1735 struct MakePrimary : boost::statechart::event< MakePrimary > {
1736 MakePrimary() : boost::statechart::event< MakePrimary >() {}
1737 };
1738 struct MakeStray : boost::statechart::event< MakeStray > {
1739 MakeStray() : boost::statechart::event< MakeStray >() {}
1740 };
1741 struct Primary;
1742 struct Stray;
1743
1744 struct Start : boost::statechart::state< Start, Started >, NamedState {
1745 explicit Start(my_context ctx);
1746 void exit();
1747
1748 typedef boost::mpl::list <
1749 boost::statechart::transition< MakePrimary, Primary >,
1750 boost::statechart::transition< MakeStray, Stray >
1751 > reactions;
1752 };
1753
1754 struct Peering;
1755 struct WaitActingChange;
1756 struct NeedActingChange : boost::statechart::event< NeedActingChange > {
1757 NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
1758 };
1759 struct Incomplete;
1760 struct IsIncomplete : boost::statechart::event< IsIncomplete > {
1761 IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
1762 };
1763 struct Down;
1764 struct IsDown : boost::statechart::event< IsDown > {
1765 IsDown() : boost::statechart::event< IsDown >() {}
1766 };
1767
1768 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1769 explicit Primary(my_context ctx);
1770 void exit();
1771
1772 typedef boost::mpl::list <
1773 boost::statechart::custom_reaction< ActMap >,
1774 boost::statechart::custom_reaction< MNotifyRec >,
1775 boost::statechart::transition< NeedActingChange, WaitActingChange >
1776 > reactions;
1777 boost::statechart::result react(const ActMap&);
1778 boost::statechart::result react(const MNotifyRec&);
1779 };
1780
1781 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1782 NamedState {
1783 typedef boost::mpl::list <
1784 boost::statechart::custom_reaction< QueryState >,
1785 boost::statechart::custom_reaction< AdvMap >,
1786 boost::statechart::custom_reaction< MLogRec >,
1787 boost::statechart::custom_reaction< MInfoRec >,
1788 boost::statechart::custom_reaction< MNotifyRec >
1789 > reactions;
1790 explicit WaitActingChange(my_context ctx);
1791 boost::statechart::result react(const QueryState& q);
1792 boost::statechart::result react(const AdvMap&);
1793 boost::statechart::result react(const MLogRec&);
1794 boost::statechart::result react(const MInfoRec&);
1795 boost::statechart::result react(const MNotifyRec&);
1796 void exit();
1797 };
1798
1799 struct GetInfo;
1800 struct Active;
1801
1802 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
1803 PastIntervals::PriorSet prior_set;
1804 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
1805
1806 explicit Peering(my_context ctx);
1807 void exit();
1808
1809 typedef boost::mpl::list <
1810 boost::statechart::custom_reaction< QueryState >,
1811 boost::statechart::transition< Activate, Active >,
1812 boost::statechart::custom_reaction< AdvMap >
1813 > reactions;
1814 boost::statechart::result react(const QueryState& q);
1815 boost::statechart::result react(const AdvMap &advmap);
1816 };
1817
1818 struct WaitLocalRecoveryReserved;
1819 struct Activating;
1820 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1821 explicit Active(my_context ctx);
1822 void exit();
1823
1824 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1825 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1826 bool all_replicas_activated;
1827
1828 typedef boost::mpl::list <
1829 boost::statechart::custom_reaction< QueryState >,
1830 boost::statechart::custom_reaction< ActMap >,
1831 boost::statechart::custom_reaction< AdvMap >,
1832 boost::statechart::custom_reaction< MInfoRec >,
1833 boost::statechart::custom_reaction< MNotifyRec >,
1834 boost::statechart::custom_reaction< MLogRec >,
1835 boost::statechart::custom_reaction< Backfilled >,
1836 boost::statechart::custom_reaction< AllReplicasActivated >
1837 > reactions;
1838 boost::statechart::result react(const QueryState& q);
1839 boost::statechart::result react(const ActMap&);
1840 boost::statechart::result react(const AdvMap&);
1841 boost::statechart::result react(const MInfoRec& infoevt);
1842 boost::statechart::result react(const MNotifyRec& notevt);
1843 boost::statechart::result react(const MLogRec& logevt);
1844 boost::statechart::result react(const Backfilled&) {
1845 return discard_event();
1846 }
1847 boost::statechart::result react(const AllReplicasActivated&);
1848 };
1849
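/*
 * Active's substates, as wired by the transitions below (summary only):
 * Activating is the initial substate; DoRecovery walks through
 * WaitLocalRecoveryReserved and WaitRemoteRecoveryReserved into Recovering,
 * RequestBackfill through WaitLocalBackfillReserved and
 * WaitRemoteBackfillReserved into Backfilling.  Recovered and finally Clean
 * are reached once all replicas are caught up, while NotRecovering and
 * NotBackfilling park the PG when reservations are rejected or not needed.
 */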
1850 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1851 typedef boost::mpl::list<
1852 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1853 > reactions;
1854 explicit Clean(my_context ctx);
1855 void exit();
1856 };
1857
1858 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1859 typedef boost::mpl::list<
1860 boost::statechart::transition< GoClean, Clean >,
1861 boost::statechart::custom_reaction< AllReplicasActivated >
1862 > reactions;
1863 explicit Recovered(my_context ctx);
1864 void exit();
1865 boost::statechart::result react(const AllReplicasActivated&) {
1866 post_event(GoClean());
1867 return forward_event();
1868 }
1869 };
1870
1871 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
1872 typedef boost::mpl::list<
1873 boost::statechart::transition< Backfilled, Recovered >,
1874 boost::statechart::custom_reaction< RemoteReservationRejected >
1875 > reactions;
1876 explicit Backfilling(my_context ctx);
1877 boost::statechart::result react(const RemoteReservationRejected& evt);
1878 void exit();
1879 };
1880
1881 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
1882 typedef boost::mpl::list<
1883 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1884 boost::statechart::custom_reaction< RemoteReservationRejected >,
1885 boost::statechart::transition< AllBackfillsReserved, Backfilling >
1886 > reactions;
1887 set<pg_shard_t>::const_iterator backfill_osd_it;
1888 explicit WaitRemoteBackfillReserved(my_context ctx);
1889 void exit();
1890 boost::statechart::result react(const RemoteBackfillReserved& evt);
1891 boost::statechart::result react(const RemoteReservationRejected& evt);
1892 };
1893
1894 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
1895 typedef boost::mpl::list<
1896 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
1897 > reactions;
1898 explicit WaitLocalBackfillReserved(my_context ctx);
1899 void exit();
1900 };
1901
1902 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
1903 typedef boost::mpl::list<
1904 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
1905 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1906 boost::statechart::custom_reaction< RemoteReservationRejected >
1907 > reactions;
1908 explicit NotBackfilling(my_context ctx);
1909 void exit();
1910 boost::statechart::result react(const RemoteBackfillReserved& evt);
1911 boost::statechart::result react(const RemoteReservationRejected& evt);
1912 };
1913
1914 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
1915 typedef boost::mpl::list<
1916 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1917 > reactions;
1918 explicit NotRecovering(my_context ctx);
1919 void exit();
1920 };
1921
1922 struct RepNotRecovering;
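/*
 * Replica-side counterpart of the reservation dance (sketch): a replica
 * idles in RepNotRecovering until the primary requests a recovery or
 * backfill reservation, waits on the local reserver in
 * RepWaitRecoveryReserved / RepWaitBackfillReserved, serves data in
 * RepRecovering, and drops back to RepNotRecovering on RecoveryDone.
 */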
1923 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
1924 explicit ReplicaActive(my_context ctx);
1925 void exit();
1926
1927 typedef boost::mpl::list <
1928 boost::statechart::custom_reaction< QueryState >,
1929 boost::statechart::custom_reaction< ActMap >,
1930 boost::statechart::custom_reaction< MQuery >,
1931 boost::statechart::custom_reaction< MInfoRec >,
1932 boost::statechart::custom_reaction< MLogRec >,
1933 boost::statechart::custom_reaction< Activate >
1934 > reactions;
1935 boost::statechart::result react(const QueryState& q);
1936 boost::statechart::result react(const MInfoRec& infoevt);
1937 boost::statechart::result react(const MLogRec& logevt);
1938 boost::statechart::result react(const ActMap&);
1939 boost::statechart::result react(const MQuery&);
1940 boost::statechart::result react(const Activate&);
1941 };
1942
1943 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
1944 typedef boost::mpl::list<
1945 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
1946 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
1947 boost::statechart::custom_reaction< BackfillTooFull >
1948 > reactions;
1949 explicit RepRecovering(my_context ctx);
1950 boost::statechart::result react(const BackfillTooFull &evt);
1951 void exit();
1952 };
1953
1954 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
1955 typedef boost::mpl::list<
1956 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1957 boost::statechart::custom_reaction< RemoteReservationRejected >
1958 > reactions;
1959 explicit RepWaitBackfillReserved(my_context ctx);
1960 void exit();
1961 boost::statechart::result react(const RemoteBackfillReserved &evt);
1962 boost::statechart::result react(const RemoteReservationRejected &evt);
1963 };
1964
1965 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
1966 typedef boost::mpl::list<
1967 boost::statechart::custom_reaction< RemoteRecoveryReserved >
1968 > reactions;
1969 explicit RepWaitRecoveryReserved(my_context ctx);
1970 void exit();
1971 boost::statechart::result react(const RemoteRecoveryReserved &evt);
1972 };
1973
1974 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
1975 typedef boost::mpl::list<
1976 boost::statechart::custom_reaction< RequestBackfillPrio >,
1977 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
1978 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
1979 > reactions;
1980 explicit RepNotRecovering(my_context ctx);
1981 boost::statechart::result react(const RequestBackfillPrio &evt);
1982 void exit();
1983 };
1984
1985 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
1986 typedef boost::mpl::list <
1987 boost::statechart::custom_reaction< AllReplicasRecovered >,
1988 boost::statechart::custom_reaction< RequestBackfill >
1989 > reactions;
1990 explicit Recovering(my_context ctx);
1991 void exit();
1992 void release_reservations();
1993 boost::statechart::result react(const AllReplicasRecovered &evt);
1994 boost::statechart::result react(const RequestBackfill &evt);
1995 };
1996
1997 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
1998 typedef boost::mpl::list <
1999 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2000 boost::statechart::transition< AllRemotesReserved, Recovering >
2001 > reactions;
2002 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2003 explicit WaitRemoteRecoveryReserved(my_context ctx);
2004 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2005 void exit();
2006 };
2007
2008 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2009 typedef boost::mpl::list <
2010 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2011 boost::statechart::custom_reaction< RecoveryTooFull >
2012 > reactions;
2013 explicit WaitLocalRecoveryReserved(my_context ctx);
2014 void exit();
2015 boost::statechart::result react(const RecoveryTooFull &evt);
2016 };
2017
2018 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2019 typedef boost::mpl::list <
2020 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2021 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2022 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2023 > reactions;
2024 explicit Activating(my_context ctx);
2025 void exit();
2026 };
2027
2028 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2029 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2030
2031 explicit Stray(my_context ctx);
2032 void exit();
2033
2034 typedef boost::mpl::list <
2035 boost::statechart::custom_reaction< MQuery >,
2036 boost::statechart::custom_reaction< MLogRec >,
2037 boost::statechart::custom_reaction< MInfoRec >,
2038 boost::statechart::custom_reaction< ActMap >,
2039 boost::statechart::custom_reaction< RecoveryDone >
2040 > reactions;
2041 boost::statechart::result react(const MQuery& query);
2042 boost::statechart::result react(const MLogRec& logevt);
2043 boost::statechart::result react(const MInfoRec& infoevt);
2044 boost::statechart::result react(const ActMap&);
2045 boost::statechart::result react(const RecoveryDone&) {
2046 return discard_event();
2047 }
2048 };
2049
2050 struct GetLog;
2051
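/*
 * Peering subtree in reading order (sketch): GetInfo polls the prior set
 * for pg_info_t; GotInfo moves us to GetLog, which pulls the authoritative
 * log from auth_log_shard; GetMissing then collects peer missing sets, and
 * NeedUpThru parks the PG in WaitUpThru until the new up_thru is reflected
 * in the osdmap.  Down and Incomplete are the stuck states entered when too
 * few, or no suitable, peers are available.
 */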
2052 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2053 set<pg_shard_t> peer_info_requested;
2054
2055 explicit GetInfo(my_context ctx);
2056 void exit();
2057 void get_infos();
2058
2059 typedef boost::mpl::list <
2060 boost::statechart::custom_reaction< QueryState >,
2061 boost::statechart::transition< GotInfo, GetLog >,
2062 boost::statechart::custom_reaction< MNotifyRec >,
2063 boost::statechart::transition< IsDown, Down >
2064 > reactions;
2065 boost::statechart::result react(const QueryState& q);
2066 boost::statechart::result react(const MNotifyRec& infoevt);
2067 };
2068
2069 struct GotLog : boost::statechart::event< GotLog > {
2070 GotLog() : boost::statechart::event< GotLog >() {}
2071 };
2072
2073 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2074 pg_shard_t auth_log_shard;
2075 boost::intrusive_ptr<MOSDPGLog> msg;
2076
2077 explicit GetLog(my_context ctx);
2078 void exit();
2079
2080 typedef boost::mpl::list <
2081 boost::statechart::custom_reaction< QueryState >,
2082 boost::statechart::custom_reaction< MLogRec >,
2083 boost::statechart::custom_reaction< GotLog >,
2084 boost::statechart::custom_reaction< AdvMap >,
2085 boost::statechart::transition< IsIncomplete, Incomplete >
2086 > reactions;
2087 boost::statechart::result react(const AdvMap&);
2088 boost::statechart::result react(const QueryState& q);
2089 boost::statechart::result react(const MLogRec& logevt);
2090 boost::statechart::result react(const GotLog&);
2091 };
2092
2093 struct WaitUpThru;
2094
2095 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2096 set<pg_shard_t> peer_missing_requested;
2097
2098 explicit GetMissing(my_context ctx);
2099 void exit();
2100
2101 typedef boost::mpl::list <
2102 boost::statechart::custom_reaction< QueryState >,
2103 boost::statechart::custom_reaction< MLogRec >,
2104 boost::statechart::transition< NeedUpThru, WaitUpThru >
2105 > reactions;
2106 boost::statechart::result react(const QueryState& q);
2107 boost::statechart::result react(const MLogRec& logevt);
2108 };
2109
2110 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2111 explicit WaitUpThru(my_context ctx);
2112 void exit();
2113
2114 typedef boost::mpl::list <
2115 boost::statechart::custom_reaction< QueryState >,
2116 boost::statechart::custom_reaction< ActMap >,
2117 boost::statechart::custom_reaction< MLogRec >
2118 > reactions;
2119 boost::statechart::result react(const QueryState& q);
2120 boost::statechart::result react(const ActMap& am);
2121 boost::statechart::result react(const MLogRec& logrec);
2122 };
2123
2124 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2125 explicit Down(my_context ctx);
2126 typedef boost::mpl::list <
2127 boost::statechart::custom_reaction< QueryState >
2128 > reactions;
2129 boost::statechart::result react(const QueryState& infoevt);
2130 void exit();
2131 };
2132
2133 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2134 typedef boost::mpl::list <
2135 boost::statechart::custom_reaction< AdvMap >,
2136 boost::statechart::custom_reaction< MNotifyRec >,
2137 boost::statechart::custom_reaction< QueryState >
2138 > reactions;
2139 explicit Incomplete(my_context ctx);
2140 boost::statechart::result react(const AdvMap &advmap);
2141 boost::statechart::result react(const MNotifyRec& infoevt);
2142 boost::statechart::result react(const QueryState& infoevt);
2143 void exit();
2144 };
2145
2146
2147 RecoveryMachine machine;
2148 PG *pg;
2149
2150 /// context passed in by state machine caller
2151 RecoveryCtx *orig_ctx;
2152
2153 /// populated if we are buffering messages pending a flush
2154 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2155
2156 /**
2157 * populated between start_handle() and end_handle(); points into the
2158 * message lists of messages_pending_flush while we are buffering
2159 * (blocking) messages, and into orig_ctx otherwise
2160 */
2161 boost::optional<RecoveryCtx> rctx;
2162
2163 public:
2164 explicit RecoveryState(PG *pg)
2165 : machine(this, pg), pg(pg), orig_ctx(0) {
2166 machine.initiate();
2167 }
2168
2169 void handle_event(const boost::statechart::event_base &evt,
2170 RecoveryCtx *rctx) {
2171 start_handle(rctx);
2172 machine.process_event(evt);
2173 end_handle();
2174 }
2175
2176 void handle_event(CephPeeringEvtRef evt,
2177 RecoveryCtx *rctx) {
2178 start_handle(rctx);
2179 machine.process_event(evt->get_event());
2180 end_handle();
2181 }
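
/*
 * Caller-side sketch (illustrative only; create_context() and
 * dispatch_context() stand in for whatever helpers the OSD actually uses):
 *
 *   PG::RecoveryCtx rctx = create_context();   // hypothetical helper
 *   pg->lock();
 *   pg->handle_peering_event(evt, &rctx);      // ends up in handle_event()
 *   pg->unlock();
 *   dispatch_context(rctx, pg, curmap);        // hypothetical helper: flush
 *                                              // queued messages/transactions
 */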
2182
2183 } recovery_state;
2184
2185
2186 public:
2187 PG(OSDService *o, OSDMapRef curmap,
2188 const PGPool &pool, spg_t p);
2189 ~PG() override;
2190
2191 private:
2192 // Prevent copying
2193 explicit PG(const PG& rhs);
2194 PG& operator=(const PG& rhs);
2195 const spg_t pg_id;
2196 uint64_t peer_features;
2197 uint64_t acting_features;
2198 uint64_t upacting_features;
2199
2200 epoch_t last_epoch;
2201
2202 Mutex scrub_sleep_lock;
2203 SafeTimer scrub_sleep_timer;
2204
2205 public:
2206 const spg_t& get_pgid() const { return pg_id; }
2207
2208 void reset_min_peer_features() {
2209 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2210 }
2211 uint64_t get_min_peer_features() const { return peer_features; }
2212 void apply_peer_features(uint64_t f) { peer_features &= f; }
2213
2214 uint64_t get_min_acting_features() const { return acting_features; }
2215 uint64_t get_min_upacting_features() const { return upacting_features; }
2216
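/**
 * Summary of the helper below (documentation only, no behaviour change):
 * rebuilds the up/acting bookkeeping from the raw OSD id vectors.  For
 * replicated pools every pg_shard_t gets shard_id_t::NO_SHARD; for EC
 * pools the shard id is the position in the vector, so e.g.
 * acting = [3,1,4] with new_acting_primary = 1 yields
 * primary = pg_shard_t(1, shard_id_t(1)).
 */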
2217 void init_primary_up_acting(
2218 const vector<int> &newup,
2219 const vector<int> &newacting,
2220 int new_up_primary,
2221 int new_acting_primary) {
2222 actingset.clear();
2223 acting = newacting;
2224 for (uint8_t i = 0; i < acting.size(); ++i) {
2225 if (acting[i] != CRUSH_ITEM_NONE)
2226 actingset.insert(
2227 pg_shard_t(
2228 acting[i],
2229 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2230 }
2231 upset.clear();
2232 up = newup;
2233 for (uint8_t i = 0; i < up.size(); ++i) {
2234 if (up[i] != CRUSH_ITEM_NONE)
2235 upset.insert(
2236 pg_shard_t(
2237 up[i],
2238 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2239 }
2240 if (!pool.info.ec_pool()) {
2241 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2242 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2243 return;
2244 }
2245 up_primary = pg_shard_t();
2246 primary = pg_shard_t();
2247 for (uint8_t i = 0; i < up.size(); ++i) {
2248 if (up[i] == new_up_primary) {
2249 up_primary = pg_shard_t(up[i], shard_id_t(i));
2250 break;
2251 }
2252 }
2253 for (uint8_t i = 0; i < acting.size(); ++i) {
2254 if (acting[i] == new_acting_primary) {
2255 primary = pg_shard_t(acting[i], shard_id_t(i));
2256 break;
2257 }
2258 }
2259 assert(up_primary.osd == new_up_primary);
2260 assert(primary.osd == new_acting_primary);
2261 }
2262 pg_shard_t get_primary() const { return primary; }
2263
2264 int get_role() const { return role; }
2265 void set_role(int r) { role = r; }
2266
2267 bool is_primary() const { return pg_whoami == primary; }
2268 bool is_replica() const { return role > 0; }
2269
2270 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2271
2273 bool state_test(int m) const { return (state & m) != 0; }
2274 void state_set(int m) { state |= m; }
2275 void state_clear(int m) { state &= ~m; }
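// e.g. state_set(PG_STATE_ACTIVE) turns the bit on, state_clear(PG_STATE_ACTIVE)
// turns it off, and state_test(PG_STATE_ACTIVE|PG_STATE_PEERED) is true if
// either bit is currently set.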
2276
2277 bool is_complete() const { return info.last_complete == info.last_update; }
2278 bool should_send_notify() const { return send_notify; }
2279
2280 int get_state() const { return state; }
2281 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2282 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2283 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2284 bool is_down() const { return state_test(PG_STATE_DOWN); }
2285 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2286 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2287 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2288 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2289
2290 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2291 bool is_peered() const {
2292 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2293 }
2294
2295 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2296
2297 void init(
2298 int role,
2299 const vector<int>& up,
2300 int up_primary,
2301 const vector<int>& acting,
2302 int acting_primary,
2303 const pg_history_t& history,
2304 const PastIntervals& pim,
2305 bool backfill,
2306 ObjectStore::Transaction *t);
2307
2308 // pg on-disk state
2309 void do_pending_flush();
2310
2311 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2312 static void _init(ObjectStore::Transaction& t,
2313 spg_t pgid, const pg_pool_t *pool);
2314
2315 private:
2316 void prepare_write_info(map<string,bufferlist> *km);
2317
2318 void update_store_with_options();
2319 void update_store_on_load();
2320
2321 public:
2322 static int _prepare_write_info(
2323 CephContext* cct,
2324 map<string,bufferlist> *km,
2325 epoch_t epoch,
2326 pg_info_t &info,
2327 pg_info_t &last_written_info,
2328 PastIntervals &past_intervals,
2329 bool dirty_big_info,
2330 bool dirty_epoch,
2331 bool try_fast_info,
2332 PerfCounters *logger = nullptr);
2333 void write_if_dirty(ObjectStore::Transaction& t);
2334
2335 PGLog::IndexedLog projected_log;
2336 bool check_in_progress_op(
2337 const osd_reqid_t &r,
2338 eversion_t *version,
2339 version_t *user_version,
2340 int *return_code) const;
2341 eversion_t projected_last_update;
2342 eversion_t get_next_version() const {
2343 eversion_t at_version(
2344 get_osdmap()->get_epoch(),
2345 projected_last_update.version+1);
2346 assert(at_version > info.last_update);
2347 assert(at_version > pg_log.get_head());
2348 assert(at_version > projected_last_update);
2349 return at_version;
2350 }
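// Example: with osdmap epoch 20 and projected_last_update == 18'41, the
// version handed out is 20'42: epoch from the current map, version strictly
// greater than anything projected or logged.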
2351
2352 void add_log_entry(const pg_log_entry_t& e, bool applied);
2353 void append_log(
2354 const vector<pg_log_entry_t>& logv,
2355 eversion_t trim_to,
2356 eversion_t roll_forward_to,
2357 ObjectStore::Transaction &t,
2358 bool transaction_applied = true);
2359 bool check_log_for_corruption(ObjectStore *store);
2360 void trim_log();
2361
2362 std::string get_corrupt_pg_log_name() const;
2363 static int read_info(
2364 ObjectStore *store, spg_t pgid, const coll_t &coll,
2365 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2366 __u8 &);
2367 void read_state(ObjectStore *store, bufferlist &bl);
2368 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2369 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2370 epoch_t *pepoch, bufferlist *bl);
2371 void update_snap_map(
2372 const vector<pg_log_entry_t> &log_entries,
2373 ObjectStore::Transaction& t);
2374
2375 void filter_snapc(vector<snapid_t> &snaps);
2376
2377 void log_weirdness();
2378
2379 virtual void kick_snap_trim() = 0;
2380 virtual void snap_trimmer_scrub_complete() = 0;
2381 bool requeue_scrub();
2382 void queue_recovery(bool front = false);
2383 bool queue_scrub();
2384 unsigned get_scrub_priority();
2385
2386 /// share pg info after a pg is active
2387 void share_pg_info();
2388
2389
2390 bool append_log_entries_update_missing(
2391 const mempool::osd::list<pg_log_entry_t> &entries,
2392 ObjectStore::Transaction &t);
2393
2394 /**
2395 * Merge the given entries into the logs and missing sets of all
2396 * actingbackfill peers (and missing_loc), updating missing as necessary
2397 */
2398 void merge_new_log_entries(
2399 const mempool::osd::list<pg_log_entry_t> &entries,
2400 ObjectStore::Transaction &t);
2401
2402 void reset_interval_flush();
2403 void start_peering_interval(
2404 const OSDMapRef lastmap,
2405 const vector<int>& newup, int up_primary,
2406 const vector<int>& newacting, int acting_primary,
2407 ObjectStore::Transaction *t);
2408 void on_new_interval();
2409 virtual void _on_new_interval() = 0;
2410 void start_flush(ObjectStore::Transaction *t,
2411 list<Context *> *on_applied,
2412 list<Context *> *on_safe);
2413 void set_last_peering_reset();
2414 bool pg_has_reset_since(epoch_t e) {
2415 assert(is_locked());
2416 return deleting || e < get_last_peering_reset();
2417 }
2418
2419 void update_history(const pg_history_t& history);
2420 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2421 pair<pg_shard_t, pg_info_t> &notify_info);
2422 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2423
2424 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2425
2426 bool should_restart_peering(
2427 int newupprimary,
2428 int newactingprimary,
2429 const vector<int>& newup,
2430 const vector<int>& newacting,
2431 OSDMapRef lastmap,
2432 OSDMapRef osdmap);
2433
2434 // OpRequest queueing
2435 bool can_discard_op(OpRequestRef& op);
2436 bool can_discard_scan(OpRequestRef op);
2437 bool can_discard_backfill(OpRequestRef op);
2438 bool can_discard_request(OpRequestRef& op);
2439
2440 template<typename T, int MSGTYPE>
2441 bool can_discard_replica_op(OpRequestRef& op);
2442
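/// true if the message was sent before the most recent peering reset,
/// i.e. it is stale and should be ignored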
2443 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2444 bool old_peering_evt(CephPeeringEvtRef evt) {
2445 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2446 }
2447 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2448 return e <= cur_epoch;
2449 }
2450 bool have_same_or_newer_map(epoch_t e) {
2451 return e <= get_osdmap()->get_epoch();
2452 }
2453
2454 bool op_has_sufficient_caps(OpRequestRef& op);
2455
2456
2457 // recovery bits
2458 void take_waiters();
2459 void queue_peering_event(CephPeeringEvtRef evt);
2460 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2461 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2462 pg_shard_t from, const pg_query_t& q);
2463 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2464 void queue_flushed(epoch_t started_at);
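/// map handling is two-phase (summary of the contract, not a new API):
/// handle_advance_map() is called for each successive osdmap and feeds an
/// AdvMap event into the state machine; handle_activate_map() fires an
/// ActMap event once the PG has caught up with the OSD's current map.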
2465 void handle_advance_map(
2466 OSDMapRef osdmap, OSDMapRef lastmap,
2467 vector<int>& newup, int up_primary,
2468 vector<int>& newacting, int acting_primary,
2469 RecoveryCtx *rctx);
2470 void handle_activate_map(RecoveryCtx *rctx);
2471 void handle_create(RecoveryCtx *rctx);
2472 void handle_loaded(RecoveryCtx *rctx);
2473 void handle_query_state(Formatter *f);
2474
2475 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2476
2477
2478 // abstract bits
2479 virtual void do_request(
2480 OpRequestRef& op,
2481 ThreadPool::TPHandle &handle
2482 ) = 0;
2483
2484 virtual void do_op(OpRequestRef& op) = 0;
2485 virtual void do_sub_op(OpRequestRef op) = 0;
2486 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2487 virtual void do_scan(
2488 OpRequestRef op,
2489 ThreadPool::TPHandle &handle
2490 ) = 0;
2491 virtual void do_backfill(OpRequestRef op) = 0;
2492 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2493
2494 virtual int do_command(
2495 cmdmap_t cmdmap,
2496 ostream& ss,
2497 bufferlist& idata,
2498 bufferlist& odata,
2499 ConnectionRef conn,
2500 ceph_tid_t tid) = 0;
2501
2502 virtual void on_role_change() = 0;
2503 virtual void on_pool_change() = 0;
2504 virtual void on_change(ObjectStore::Transaction *t) = 0;
2505 virtual void on_activate() = 0;
2506 virtual void on_flushed() = 0;
2507 virtual void on_shutdown() = 0;
2508 virtual void check_blacklisted_watchers() = 0;
2509 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2510
2511 virtual bool agent_work(int max) = 0;
2512 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2513 virtual void agent_stop() = 0;
2514 virtual void agent_delay() = 0;
2515 virtual void agent_clear() = 0;
2516 virtual void agent_choose_mode_restart() = 0;
2517 };
2518
2519 ostream& operator<<(ostream& out, const PG& pg);
2520
2521 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2522
2523 #endif