// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include "include/memory.h"
#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>
using namespace std;

// #include "include/unordered_map.h"
// #include "include/unordered_set.h"

//#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs

class OSD;
class OSDService;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;

namespace Scrub {
  class Store;
}

void intrusive_ptr_add_ref(PG *pg);
void intrusive_ptr_release(PG *pg);

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
        this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};
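// Illustrative sketch (not part of the interface; timestamps t0..t3 and the
// epoch are assumed values): nested states live on a stack, so exits must
// unwind in LIFO order and each exit records one (enter, exit, name) tuple:
//
//   PGStateInstance psi;
//   psi.setepoch(42);
//   psi.enter_state(t0, "Started");           // pushed on embedded_states
//   psi.enter_state(t1, "Started/Primary");   // nested state pushed on top
//   psi.exit_state(t2);   // records {t1, t2, "Started/Primary"}
//   psi.exit_state(t3);   // records {t0, t3, "Started"}
//
// state_history then holds the tuples in exit order.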

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;

};

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;     // enter/exit counts
    uint64_t events;
    utime_t event_time;       // time spent processing events
    utime_t total_time;       // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

public:
  PGRecoveryStats() : lock("PGRecoveryStats::lock") {}

  void reset() {
    Mutex::Locker l(lock);
    info.clear();
  }
  void dump(ostream& out) {
    Mutex::Locker l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    Mutex::Locker l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    Mutex::Locker l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    Mutex::Locker l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};
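// Illustrative usage sketch (the caller, state name, durations, and event
// counts here are assumed, not defined by this header):
//
//   PGRecoveryStats stats;
//   stats.log_enter("Started/Primary/Active");
//   // ... state runs, handling `events` events for `event_dur` total ...
//   stats.log_exit("Started/Primary/Active", total_dur, events, event_dur);
//   stats.dump(cout);   // tab-separated: enter exit events ... state name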

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;
  uint64_t auid;

  pg_pool_t info;
  SnapContext snapc;   // the default pool snapc, ready to go.

  interval_set<snapid_t> cached_removed_snaps;  // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;   // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(map->get_pool_name(id)),
      auid(map->get_pg_pool(id)->auid) {
    const pg_pool_t *pi = map->get_pg_pool(id);
    assert(pi);
    info = *pi;
    snapc = pi->get_snap_context();
    pi->build_removed_snaps(cached_removed_snaps);
  }

  void update(OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
protected:
  OSDService *osd;
  CephContext *cct;
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
public:
  std::string gen_prefix() const override;
  CephContext *get_cct() const override { return cct; }
  unsigned get_subsys() const override { return ceph_subsys_osd; }

  /*** PG ****/
  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  OSDMapRef osdmap_ref;
  OSDMapRef last_persisted_osdmap_ref;
  PGPool pool;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

public:
  OSDMapRef get_osdmap() const {
    assert(is_locked());
    assert(osdmap_ref);
    return osdmap_ref;
  }
protected:

  /** locking and reference counting.
   * I destroy myself when the reference count hits zero.
   * lock() should be called before doing anything.
   * get() should be called on pointer copy (to another thread, etc.).
   * put() should be called on destruction of some previously copied pointer.
   * unlock() when done with the current pointer (_most common_).
   */
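  /* Illustrative sketch of the protocol above (caller-side pseudocode with
   * an assumed tag; not a new API):
   *
   *   pg->get("example-tag");  // take a ref before sharing the pointer
   *   pg->lock();              // serialize access to PG state
   *   // ... read or mutate state; persist dirty_info/dirty_big_info before
   *   // unlock(), which asserts both are clean ...
   *   pg->unlock();
   *   pg->put("example-tag");  // drop the ref; PG frees itself at zero
   */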
  mutable Mutex _lock;
  std::atomic_uint ref{0};

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock;
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id;
#endif

public:
  bool deleting;  // true while in removing or OSD is shutting down

  ZTracer::Endpoint trace_endpoint;

  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    assert(!dirty_info);
    assert(!dirty_big_info);
    _lock.Unlock();
  }

  bool is_locked() const {
    return _lock.is_locked();
  }

#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);

  bool dirty_info, dirty_big_info;

public:
  bool is_ec_pg() const {
    return pool.info.ec_pool();
  }
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v;
  static const __u8 cur_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //    (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 7;
  bool must_upgrade() {
    return info_struct_v < cur_struct_v;
  }
  bool can_upgrade() {
    return info_struct_v >= compat_struct_v;
  }
  void upgrade(ObjectStore *store);

  const coll_t coll;
  ObjectStore::CollectionHandle ch;
  PGLog pg_log;
  static string get_info_key(spg_t pgid) {
    return stringify(pgid) + "_info";
  }
  static string get_biginfo_key(spg_t pgid) {
    return stringify(pgid) + "_biginfo";
  }
  static string get_epoch_key(spg_t pgid) {
    return stringify(pgid) + "_epoch";
  }
  ghobject_t pgmeta_oid;

  class MissingLoc {
    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;
    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) {}
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    string gen_prefix() const { return pg->gen_prefix(); }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_unfound(const hobject_t &hoid) const {
      return needs_recovery(hoid) && (
        !missing_loc.count(hoid) ||
        !(*is_recoverable)(missing_loc.find(hoid)->second));
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      missing_loc[hoid].insert(location);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      missing_loc[hoid].erase(location);
    }
    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
      needs_recovery_map[hoid] = pg_missing_item(need, have);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      assert(needs_recovery(hoid));
      needs_recovery_map[hoid].need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      );  ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources,  ///< [in] a set of resources which can be used for all objects
      ThreadPool::TPHandle* handle     ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      missing_loc.erase(hoid);
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      assert(info.last_backfill.is_max());
      assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        auto pinfoiter = pinfo.find(i.first);
        assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      return missing_loc.count(hoid) ?
        missing_loc.find(hoid)->second : empty_set;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
  } missing_loc;
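  /* Illustrative sketch (hoid/need/have/shard are assumed values): an object
   * is "unfound" when it needs recovery but its known location set does not
   * satisfy the backend's recoverable predicate:
   *
   *   missing_loc.add_missing(hoid, need, have);  // hoid now needs recovery
   *   bool u1 = missing_loc.is_unfound(hoid);     // true: no locations yet
   *   missing_loc.add_location(hoid, shard);
   *   bool u2 = missing_loc.is_unfound(hoid);     // now depends on
   *                                               // (*is_recoverable)({shard})
   */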

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  set<hobject_t> recovering_oids;
#endif

protected:
  int role;        // 0 = primary, 1 = replica, -1 = none.
  unsigned state;  // PG_STATE_*

  bool send_notify;  ///< true if we are non-primary and should notify the primary

public:
  eversion_t last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk;  // last_complete that has committed.
  eversion_t last_update_applied;


  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
    PGRef pg;
    epoch_t e;
    eversion_t v;
    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
      : pg(pg), e(e), v(v) {}
    void finish(int) override {
      pg->lock();
      if (!pg->pg_has_reset_since(e)) {
        pg->last_rollback_info_trimmed_to_applied = v;
      }
      pg->unlock();
    }
  };
  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
  // and the transaction has applied
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
public:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  set<pg_shard_t> actingbackfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by;  ///< osds we are blocked by (for pg stats)

  // [primary only] content recovery state

public:
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    set<PGRef> created_pgs;
    C_Contexts *on_applied;
    C_Contexts *on_safe;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                C_Contexts *on_applied,
                C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        on_applied(on_applied),
        on_safe(on_safe),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        on_applied(rctx.on_applied),
        on_safe(rctx.on_safe),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      assert(query_map);
      assert(info_map);
      assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }
  };


  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };



protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;  // non-acting osds that have PG data.
  eversion_t oldest_update;   // acting: lowest (valid) last_update in active set
  map<pg_shard_t, pg_info_t> peer_info;  // info from peers (stray or prior)
  set<pg_shard_t> peer_purged;           // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;    // logs i've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // i deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
                                       // which are unfound on the primary
  epoch_t last_peering_reset;


  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();
public:
  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version;  /// version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
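  /* Illustrative sketch of trim_to() (objects a < b < c and versions v1..v3
   * are assumed):
   *
   *   bi.objects = { {a,v1}, {b,v2}, {c,v3} };  // bi.begin == a after trim()
   *   bi.trim_to(b);   // pops a and b (both <= b), then trim()
   *   // now bi.begin == c and bi.objects == { {c,v3} }
   */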

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  friend class OSD;

public:
  set<pg_shard_t> backfill_targets;

  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued.  Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order.  Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   *  1. During an interval change, we requeue *everything* in the above order.
   *
   *  2. When an obc rwlock is released, we check for a scrub block and requeue
   *     the op there if it applies.  We ignore the unreadable/degraded/blocked
   *     queues because we assume they cannot apply at that time (this is
   *     probably mostly true).
   *
   *  3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *     it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounter an unexpected error.  FIXME.
   */
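  /* Illustrative sketch of the reverse-requeue rule above (op1 and op2 are
   * assumed ops, op1 arrived first):
   *
   *   list<OpRequestRef> blocked = { op1, op2 };  // arrival order
   *   requeue_ops(blocked);  // pushes op2 first, then op1, so the op_wq
   *                          // dequeues op1 before op2 (original order)
   */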

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than our (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef> waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef> waiting_for_active;
  list<OpRequestRef> waiting_for_scrub;

  list<OpRequestRef> waiting_for_cache_not_full;
  list<OpRequestRef> waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  // for ordering writes
  ceph::shared_ptr<ObjectStore::Sequencer> osr;

  void _update_calc_stats();
  void _update_blocked_by();
  void publish_stats_to_osd();
  void clear_publish_stats();

public:
  void clear_primary_state();

  bool is_actingbackfill(pg_shard_t osd) const {
    return actingbackfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }
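  /* Example (acting vector values assumed): in an EC pool the shard id
   * indexes the vector, while a replicated pool only matches the osd id:
   *
   *   vector<int> acting = {3, 1, 4};
   *   has_shard(true,  acting, pg_shard_t(1, shard_id_t(1)));  // true: acting[1] == 1
   *   has_shard(true,  acting, pg_shard_t(1, shard_id_t(0)));  // false: acting[0] == 3
   *   has_shard(false, acting, pg_shard_t(4));                 // true: 4 is present
   */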

  bool needs_recovery() const;
  bool needs_backfill() const;

  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();

  void mark_clean();  ///< mark an active pg clean

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = MAX(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
       info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = MAX(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
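  /* Worked example (history values assumed): with last_epoch_clean = 100,
   * epoch_pool_created = 10, oldest_map = 80, same_interval_since = 120:
   *   start = max(100, 80) = 100
   *   end   = max(120, 10) = 120
   * so the required past intervals cover epochs [100, 120).
   */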
  void check_past_interval_bounds() const;
  PastIntervals::PriorSet build_prior();

  void remove_down_peer_info(const OSDMapRef osdmap);

  bool adjust_need_up_thru(const OSDMapRef osdmap);

  bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
  virtual void dump_recovery_info(Formatter *f) const = 0;

  bool calc_min_last_complete_ondisk() {
    eversion_t min = last_complete_ondisk;
    assert(!actingbackfill.empty());
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
         i != actingbackfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      if (peer_last_complete_ondisk.count(*i) == 0)
        return false;  // we don't have complete info
      eversion_t a = peer_last_complete_ondisk[*i];
      if (a < min)
        min = a;
    }
    if (min == min_last_complete_ondisk)
      return false;
    min_last_complete_ondisk = min;
    return true;
  }

  virtual void calc_trim_to() = 0;

  void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
  void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
                       pg_missing_t& omissing, pg_shard_t from);
  bool proc_replica_info(
    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);

  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };

  void update_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid,
    const set<snapid_t> &snaps);
  void clear_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid);
  void remove_snap_mapped_object(
    ObjectStore::Transaction& t, const hobject_t& soid);
  void merge_log(
    ObjectStore::Transaction& t, pg_info_t &oinfo,
    pg_log_t &olog, pg_shard_t from);
  void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
  bool search_for_missing(
    const pg_info_t &oinfo, const pg_missing_t &omissing,
    pg_shard_t fromosd,
    RecoveryCtx*);

  void check_for_lost_objects();
  void forget_lost_objects();

  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);

  void trim_write_ahead();

  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const;
  static void calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  static void calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
  void build_might_have_unfound();
  void activate(
    ObjectStore::Transaction& t,
    epoch_t activation_epoch,
    list<Context*>& tfin,
    map<int, map<spg_t,pg_query_t> >& query_map,
    map<int,
      vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
    RecoveryCtx *ctx);
  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
  void all_activated_and_committed();

  void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);

  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }

  virtual void check_local() = 0;

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  void purge_strays();

  void update_heartbeat_peers();

  Context *finish_sync_event;

  void finish_recovery(list<Context*>& tfin);
  void _finish_recovery(Context *c);
  void cancel_recovery();
  void clear_recovery_state();
  virtual void _clear_recovery_state() = 0;
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;

  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();

  void add_pg_backoff(SessionRef s) {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    add_backoff(s, begin, end);
  }
  void release_pg_backoffs() {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    release_backoffs(begin, end);
  }

  void rm_backoff(BackoffRef b);

  // -- scrub --
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;
    bool reserved, reserve_failed;
    epoch_t epoch_start;

    // common to both scrubs
    bool active;
    int waiting_on;
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int fixed;
    ScrubMap primary_scrubmap;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;
    utime_t scrub_reg_stamp;  // stamp we registered for

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority;

    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
    } state;

    std::unique_ptr<Scrub::Store> store;
    // deep scrub
    bool deep;
    uint32_t seed;

    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch( state )
      {
      case INACTIVE: ret = "INACTIVE"; break;
      case NEW_CHUNK: ret = "NEW_CHUNK"; break;
      case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
      case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
      case BUILD_MAP: ret = "BUILD_MAP"; break;
      case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
      case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
      case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
      case FINISH: ret = "FINISH"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // classic (non chunk) scrubs block all writes
    // chunky scrubs only block writes to a range
    bool write_blocked_by_scrub(const hobject_t &soid) {
      return (soid >= start && soid < end);
    }

    // clear all state
    void reset() {
      active = false;
      waiting_on = 0;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      fixed = 0;
      deep = false;
      seed = 0;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;

  bool scrub_after_recovery;

  int active_pushes;

  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state();
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(
    const vector<ghobject_t> &rollback_obs,
    ThreadPool::TPHandle &handle);
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          uint32_t seed);
  int build_scrub_map_chunk(
    ScrubMap &map,
    hobject_t start, hobject_t end, bool deep, uint32_t seed,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;
  bool sched_scrub();
  void reg_next_scrub();
  void unreg_next_scrub();

  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);
  void sub_op_scrub_map(OpRequestRef op);

  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  void schedule_backfill_full_retry();
  void schedule_recovery_full_retry();

  // -- recovery state --

  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PG::CephPeeringEvtRef(
                                new PG::CephPeeringEvt(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };
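  /* Illustrative sketch (the epoch and event here are assumed values):
   * QueuePeeringEvt is typically registered as a transaction completion so
   * the peering event fires once the write reaches the store, e.g.:
   *
   *   t.register_on_applied(
   *     new QueuePeeringEvt<Activate>(
   *       this, get_osdmap()->get_epoch(), Activate(activation_epoch)));
   */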

  class CephPeeringEvt {
    epoch_t epoch_sent;
    epoch_t epoch_requested;
    boost::intrusive_ptr< const boost::statechart::event_base > evt;
    string desc;
  public:
    MEMPOOL_CLASS_HELPERS();
    template <class T>
    CephPeeringEvt(epoch_t epoch_sent,
                   epoch_t epoch_requested,
                   const T &evt_) :
      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
      evt(evt_.intrusive_from_this()) {
      stringstream out;
      out << "epoch_sent: " << epoch_sent
          << " epoch_requested: " << epoch_requested << " ";
      evt_.print(&out);
      desc = out.str();
    }
    epoch_t get_epoch_sent() { return epoch_sent; }
    epoch_t get_epoch_requested() { return epoch_requested; }
    const boost::statechart::event_base &get_event() { return *evt; }
    string get_desc() { return desc; }
  };
  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
  list<CephPeeringEvtRef> peering_queue;  // op queue
  list<CephPeeringEvtRef> peering_waiters;

  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };

  struct MInfoRec : boost::statechart::event< MInfoRec > {
    pg_shard_t from;
    pg_info_t info;
    epoch_t msg_epoch;
    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
      from(from), info(info), msg_epoch(msg_epoch) {}
    void print(std::ostream *out) const {
      *out << "MInfoRec from " << from << " info: " << info;
    }
  };

  struct MLogRec : boost::statechart::event< MLogRec > {
    pg_shard_t from;
    boost::intrusive_ptr<MOSDPGLog> msg;
    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
      from(from), msg(msg) {}
    void print(std::ostream *out) const {
      *out << "MLogRec from " << from;
    }
  };

  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
    pg_shard_t from;
    pg_notify_t notify;
    uint64_t features;
    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
      from(from), notify(notify), features(f) {}
    void print(std::ostream *out) const {
      *out << "MNotifyRec from " << from << " notify: " << notify
           << " features: 0x" << hex << features << dec;
    }
  };

  struct MQuery : boost::statechart::event< MQuery > {
    pg_shard_t from;
    pg_query_t query;
    epoch_t query_epoch;
    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
      from(from), query(query), query_epoch(query_epoch) {}
    void print(std::ostream *out) const {
      *out << "MQuery from " << from
           << " query_epoch " << query_epoch
           << " query: " << query;
    }
  };

  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;
    OSDMapRef lastmap;
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };

  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
    unsigned priority;
    explicit RequestBackfillPrio(unsigned prio) :
      boost::statechart::event< RequestBackfillPrio >(),
      priority(prio) {}
    void print(std::ostream *out) const {
      *out << "RequestBackfillPrio: priority " << priority;
    }
  };
#define TrivialEvent(T) struct T : boost::statechart::event< T > {      \
    T() : boost::statechart::event< T >() {}                            \
    void print(std::ostream *out) const {                               \
      *out << #T;                                                       \
    }                                                                   \
  };
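  // For reference, TrivialEvent(Backfilled) below expands to roughly:
  //
  //   struct Backfilled : boost::statechart::event< Backfilled > {
  //     Backfilled() : boost::statechart::event< Backfilled >() {}
  //     void print(std::ostream *out) const { *out << "Backfilled"; }
  //   };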
  TrivialEvent(Initialize)
  TrivialEvent(Load)
  TrivialEvent(GotInfo)
  TrivialEvent(NeedUpThru)
  TrivialEvent(NullEvt)
  TrivialEvent(FlushedEvt)
  TrivialEvent(Backfilled)
  TrivialEvent(LocalBackfillReserved)
  TrivialEvent(RemoteBackfillReserved)
  TrivialEvent(RemoteReservationRejected)
  TrivialEvent(CancelBackfill)
  TrivialEvent(RequestBackfill)
  TrivialEvent(RequestRecovery)
  TrivialEvent(RecoveryDone)
  TrivialEvent(BackfillTooFull)
  TrivialEvent(RecoveryTooFull)
  TrivialEvent(CancelRecovery)

  TrivialEvent(AllReplicasRecovered)
  TrivialEvent(DoRecovery)
  TrivialEvent(LocalRecoveryReserved)
  TrivialEvent(RemoteRecoveryReserved)
  TrivialEvent(AllRemotesReserved)
  TrivialEvent(AllBackfillsReserved)
  TrivialEvent(GoClean)

  TrivialEvent(AllReplicasActivated)

  TrivialEvent(IntervalFlush)

  /* Encapsulates PG recovery process */
  class RecoveryState {
    void start_handle(RecoveryCtx *new_ctx);
    void end_handle();
  public:
    void begin_block_outgoing();
    void end_block_outgoing();
    void clear_blocked_outgoing();
  private:

    /* States */
    struct Initial;
    class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
      RecoveryState *state;
    public:
      PG *pg;

      utime_t event_time;
      uint64_t event_count;

      void clear_event_counters() {
        event_time = utime_t();
        event_count = 0;
      }

      void log_enter(const char *state_name);
      void log_exit(const char *state_name, utime_t duration);

      RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}

      /* Accessor functions for state methods */
      ObjectStore::Transaction* get_cur_transaction() {
        assert(state->rctx);
        assert(state->rctx->transaction);
        return state->rctx->transaction;
      }

      void send_query(pg_shard_t to, const pg_query_t &query) {
        assert(state->rctx);
        assert(state->rctx->query_map);
        (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
          query;
      }

      map<int, map<spg_t, pg_query_t> > *get_query_map() {
        assert(state->rctx);
        assert(state->rctx->query_map);
        return state->rctx->query_map;
      }

      map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
        assert(state->rctx);
        assert(state->rctx->info_map);
        return state->rctx->info_map;
      }

      list< Context* > *get_on_safe_context_list() {
        assert(state->rctx);
        assert(state->rctx->on_safe);
        return &(state->rctx->on_safe->contexts);
      }

      list< Context * > *get_on_applied_context_list() {
        assert(state->rctx);
        assert(state->rctx->on_applied);
        return &(state->rctx->on_applied->contexts);
      }

      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }

      void send_notify(pg_shard_t to,
                       const pg_notify_t &info, const PastIntervals &pi) {
        assert(state->rctx);
        assert(state->rctx->notify_list);
        (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
      }
    };
    friend class RecoveryMachine;

    /* States */

    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };

    struct Reset;

    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< Initialize, Reset >,
        boost::statechart::custom_reaction< Load >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;

      boost::statechart::result react(const Load&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Start;

    struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
      explicit Started(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct MakePrimary : boost::statechart::event< MakePrimary > {
      MakePrimary() : boost::statechart::event< MakePrimary >() {}
    };
    struct MakeStray : boost::statechart::event< MakeStray > {
      MakeStray() : boost::statechart::event< MakeStray >() {}
    };
    struct Primary;
    struct Stray;

    struct Start : boost::statechart::state< Start, Started >, NamedState {
      explicit Start(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< MakePrimary, Primary >,
        boost::statechart::transition< MakeStray, Stray >
        > reactions;
    };

    struct Peering;
    struct WaitActingChange;
    struct NeedActingChange : boost::statechart::event< NeedActingChange > {
      NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
    };
    struct Incomplete;
    struct IsIncomplete : boost::statechart::event< IsIncomplete > {
      IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
    };
    struct Down;
    struct IsDown : boost::statechart::event< IsDown > {
      IsDown() : boost::statechart::event< IsDown >() {}
    };

    struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
      explicit Primary(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< NeedActingChange, WaitActingChange >
        > reactions;
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MNotifyRec&);
    };

    struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
                              NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >
        > reactions;
      explicit WaitActingChange(my_context ctx);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MNotifyRec&);
      void exit();
    };

    struct GetInfo;
    struct Active;

    struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
      PastIntervals::PriorSet prior_set;
1804 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
1805
1806 explicit Peering(my_context ctx);
1807 void exit();
1808
1809 typedef boost::mpl::list <
1810 boost::statechart::custom_reaction< QueryState >,
1811 boost::statechart::transition< Activate, Active >,
1812 boost::statechart::custom_reaction< AdvMap >
1813 > reactions;
1814 boost::statechart::result react(const QueryState& q);
1815 boost::statechart::result react(const AdvMap &advmap);
1816 };
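    //
    // Peering owns the PriorSet: the set of OSDs that may hold writes the
    // new acting set needs to hear about, rebuilt from PastIntervals on
    // entry.  Its AdvMap reaction checks whether a newly published osdmap
    // affects the prior set and, if so, restarts peering rather than
    // continuing with stale assumptions.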
1817
1818 struct WaitLocalRecoveryReserved;
1819 struct Activating;
1820 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1821 explicit Active(my_context ctx);
1822 void exit();
1823
1824 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1825 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1826 bool all_replicas_activated;
1827
1828 typedef boost::mpl::list <
1829 boost::statechart::custom_reaction< QueryState >,
1830 boost::statechart::custom_reaction< ActMap >,
1831 boost::statechart::custom_reaction< AdvMap >,
1832 boost::statechart::custom_reaction< MInfoRec >,
1833 boost::statechart::custom_reaction< MNotifyRec >,
1834 boost::statechart::custom_reaction< MLogRec >,
1835 boost::statechart::custom_reaction< Backfilled >,
1836 boost::statechart::custom_reaction< AllReplicasActivated >
1837 > reactions;
1838 boost::statechart::result react(const QueryState& q);
1839 boost::statechart::result react(const ActMap&);
1840 boost::statechart::result react(const AdvMap&);
1841 boost::statechart::result react(const MInfoRec& infoevt);
1842 boost::statechart::result react(const MNotifyRec& notevt);
1843 boost::statechart::result react(const MLogRec& logevt);
1844 boost::statechart::result react(const Backfilled&) {
1845 return discard_event();
1846 }
1847 boost::statechart::result react(const AllReplicasActivated&);
1848 };
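    //
    // The two const shard sets are computed once, on entry (see the
    // constructor in PG.cc), from the acting and backfill peers; the
    // reservation sub-states below walk them in order when asking remote
    // OSDs for recovery/backfill slots.  all_replicas_activated flips
    // when every shard has acknowledged activation
    // (AllReplicasActivated), which is what lets the PG report itself
    // active.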
1849
1850 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1851 typedef boost::mpl::list<
1852 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1853 > reactions;
1854 explicit Clean(my_context ctx);
1855 void exit();
1856 };
1857
1858 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1859 typedef boost::mpl::list<
1860 boost::statechart::transition< GoClean, Clean >,
1861 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
1862 boost::statechart::custom_reaction< AllReplicasActivated >
1863 > reactions;
1864 explicit Recovered(my_context ctx);
1865 void exit();
1866 boost::statechart::result react(const AllReplicasActivated&) {
1867 post_event(GoClean());
1868 return forward_event();
1869 }
1870 };
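    //
    // Note the two-step idiom in react(AllReplicasActivated) above:
    // post_event(GoClean()) queues a new event that the transition list
    // turns into Recovered -> Clean, while forward_event() lets the
    // enclosing Active state run its own AllReplicasActivated reaction
    // on the same event.  Both are needed: discarding instead of
    // forwarding would hide the event from Active.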
1871
1872 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
1873 typedef boost::mpl::list<
1874 boost::statechart::transition< Backfilled, Recovered >,
1875 boost::statechart::custom_reaction< CancelBackfill >,
1876 boost::statechart::custom_reaction< RemoteReservationRejected >
1877 > reactions;
1878 explicit Backfilling(my_context ctx);
1879 boost::statechart::result react(const RemoteReservationRejected& evt);
1880 boost::statechart::result react(const CancelBackfill& evt);
1881 void exit();
1882 };
1883
1884 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
1885 typedef boost::mpl::list<
1886 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1887 boost::statechart::custom_reaction< RemoteReservationRejected >,
1888 boost::statechart::transition< AllBackfillsReserved, Backfilling >
1889 > reactions;
1890 set<pg_shard_t>::const_iterator backfill_osd_it;
1891 explicit WaitRemoteBackfillReserved(my_context ctx);
1892 void exit();
1893 boost::statechart::result react(const RemoteBackfillReserved& evt);
1894 boost::statechart::result react(const RemoteReservationRejected& evt);
1895 };
1896
1897 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
1898 typedef boost::mpl::list<
1899 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
1900 > reactions;
1901 explicit WaitLocalBackfillReserved(my_context ctx);
1902 void exit();
1903 };
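    //
    // Backfill reservations are a two-phase ladder:
    // WaitLocalBackfillReserved takes a slot on this OSD first, then
    // WaitRemoteBackfillReserved walks remote_shards_to_reserve_backfill
    // via backfill_osd_it, requesting one remote slot at a time; only
    // when every shard has answered does AllBackfillsReserved start
    // Backfilling.  A RemoteReservationRejected at any rung releases what
    // was taken and falls back to NotBackfilling.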
1904
1905 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
1906 typedef boost::mpl::list<
1907 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
1908 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1909 boost::statechart::custom_reaction< RemoteReservationRejected >
1910 > reactions;
1911 explicit NotBackfilling(my_context ctx);
1912 void exit();
1913 boost::statechart::result react(const RemoteBackfillReserved& evt);
1914 boost::statechart::result react(const RemoteReservationRejected& evt);
1915 };
1916
1917 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
1918 typedef boost::mpl::list<
1919 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1920 > reactions;
1921 explicit NotRecovering(my_context ctx);
1922 void exit();
1923 };
1924
1925 struct RepNotRecovering;
1926 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
1927 explicit ReplicaActive(my_context ctx);
1928 void exit();
1929
1930 typedef boost::mpl::list <
1931 boost::statechart::custom_reaction< QueryState >,
1932 boost::statechart::custom_reaction< ActMap >,
1933 boost::statechart::custom_reaction< MQuery >,
1934 boost::statechart::custom_reaction< MInfoRec >,
1935 boost::statechart::custom_reaction< MLogRec >,
1936 boost::statechart::custom_reaction< Activate >
1937 > reactions;
1938 boost::statechart::result react(const QueryState& q);
1939 boost::statechart::result react(const MInfoRec& infoevt);
1940 boost::statechart::result react(const MLogRec& logevt);
1941 boost::statechart::result react(const ActMap&);
1942 boost::statechart::result react(const MQuery&);
1943 boost::statechart::result react(const Activate&);
1944 };
1945
1946 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
1947 typedef boost::mpl::list<
1948 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
1949 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
1950 boost::statechart::custom_reaction< BackfillTooFull >
1951 > reactions;
1952 explicit RepRecovering(my_context ctx);
1953 boost::statechart::result react(const BackfillTooFull &evt);
1954 void exit();
1955 };
1956
1957 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
1958 typedef boost::mpl::list<
1959 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1960 boost::statechart::custom_reaction< RemoteReservationRejected >
1961 > reactions;
1962 explicit RepWaitBackfillReserved(my_context ctx);
1963 void exit();
1964 boost::statechart::result react(const RemoteBackfillReserved &evt);
1965 boost::statechart::result react(const RemoteReservationRejected &evt);
1966 };
1967
1968 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
1969 typedef boost::mpl::list<
1970 boost::statechart::custom_reaction< RemoteRecoveryReserved >
1971 > reactions;
1972 explicit RepWaitRecoveryReserved(my_context ctx);
1973 void exit();
1974 boost::statechart::result react(const RemoteRecoveryReserved &evt);
1975 };
1976
1977 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
1978 typedef boost::mpl::list<
1979 boost::statechart::custom_reaction< RequestBackfillPrio >,
1980 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
1981 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
1982 > reactions;
1983 explicit RepNotRecovering(my_context ctx);
1984 boost::statechart::result react(const RequestBackfillPrio &evt);
1985 void exit();
1986 };
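    //
    // The Rep* states are the replica-side mirror of the ladder above:
    // RepNotRecovering waits for the primary's RequestBackfillPrio or
    // RequestRecovery, RepWait*Reserved holds the request while the local
    // reserver decides, and RepRecovering pins the reservation until
    // RecoveryDone (or a rejection such as BackfillTooFull) releases it.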
1987
1988 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
1989 typedef boost::mpl::list <
1990 boost::statechart::custom_reaction< AllReplicasRecovered >,
1991 boost::statechart::custom_reaction< CancelRecovery >,
1992 boost::statechart::custom_reaction< RequestBackfill >
1993 > reactions;
1994 explicit Recovering(my_context ctx);
1995 void exit();
1996 void release_reservations(bool cancel = false);
1997 boost::statechart::result react(const AllReplicasRecovered &evt);
1998 boost::statechart::result react(const CancelRecovery& evt);
1999 boost::statechart::result react(const RequestBackfill &evt);
2000 };
2001
2002 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2003 typedef boost::mpl::list <
2004 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2005 boost::statechart::transition< AllRemotesReserved, Recovering >
2006 > reactions;
2007 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2008 explicit WaitRemoteRecoveryReserved(my_context ctx);
2009 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2010 void exit();
2011 };
2012
2013 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2014 typedef boost::mpl::list <
2015 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2016 boost::statechart::custom_reaction< RecoveryTooFull >
2017 > reactions;
2018 explicit WaitLocalRecoveryReserved(my_context ctx);
2019 void exit();
2020 boost::statechart::result react(const RecoveryTooFull &evt);
2021 };
2022
2023 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2024 typedef boost::mpl::list <
2025 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2026 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2027 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2028 > reactions;
2029 explicit Activating(my_context ctx);
2030 void exit();
2031 };
2032
2033 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2034 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2035
2036 explicit Stray(my_context ctx);
2037 void exit();
2038
2039 typedef boost::mpl::list <
2040 boost::statechart::custom_reaction< MQuery >,
2041 boost::statechart::custom_reaction< MLogRec >,
2042 boost::statechart::custom_reaction< MInfoRec >,
2043 boost::statechart::custom_reaction< ActMap >,
2044 boost::statechart::custom_reaction< RecoveryDone >
2045 > reactions;
2046 boost::statechart::result react(const MQuery& query);
2047 boost::statechart::result react(const MLogRec& logevt);
2048 boost::statechart::result react(const MInfoRec& infoevt);
2049 boost::statechart::result react(const ActMap&);
2050 boost::statechart::result react(const RecoveryDone&) {
2051 return discard_event();
2052 }
2053 };
2054
2055 struct GetLog;
2056
2057 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2058 set<pg_shard_t> peer_info_requested;
2059
2060 explicit GetInfo(my_context ctx);
2061 void exit();
2062 void get_infos();
2063
2064 typedef boost::mpl::list <
2065 boost::statechart::custom_reaction< QueryState >,
2066 boost::statechart::transition< GotInfo, GetLog >,
2067 boost::statechart::custom_reaction< MNotifyRec >,
2068 boost::statechart::transition< IsDown, Down >
2069 > reactions;
2070 boost::statechart::result react(const QueryState& q);
2071 boost::statechart::result react(const MNotifyRec& infoevt);
2072 };
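    //
    // The peering pipeline reads left to right: GetInfo polls every OSD
    // in the prior set for its pg_info_t (peer_info_requested tracks who
    // has not answered); GotInfo moves to GetLog, which pulls the
    // authoritative log from auth_log_shard; GetMissing then fills in
    // each peer's missing set, parking in WaitUpThru via NeedUpThru if
    // the monitor must first record our up_thru epoch, and finally posts
    // Activate, which Peering turns into the transition to Active.
    // IsDown and IsIncomplete divert to the Down/Incomplete holding
    // states when the prior set cannot currently be satisfied.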
2073
2074 struct GotLog : boost::statechart::event< GotLog > {
2075 GotLog() : boost::statechart::event< GotLog >() {}
2076 };
2077
2078 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2079 pg_shard_t auth_log_shard;
2080 boost::intrusive_ptr<MOSDPGLog> msg;
2081
2082 explicit GetLog(my_context ctx);
2083 void exit();
2084
2085 typedef boost::mpl::list <
2086 boost::statechart::custom_reaction< QueryState >,
2087 boost::statechart::custom_reaction< MLogRec >,
2088 boost::statechart::custom_reaction< GotLog >,
2089 boost::statechart::custom_reaction< AdvMap >,
2090 boost::statechart::transition< IsIncomplete, Incomplete >
2091 > reactions;
2092 boost::statechart::result react(const AdvMap&);
2093 boost::statechart::result react(const QueryState& q);
2094 boost::statechart::result react(const MLogRec& logevt);
2095 boost::statechart::result react(const GotLog&);
2096 };
2097
2098 struct WaitUpThru;
2099
2100 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2101 set<pg_shard_t> peer_missing_requested;
2102
2103 explicit GetMissing(my_context ctx);
2104 void exit();
2105
2106 typedef boost::mpl::list <
2107 boost::statechart::custom_reaction< QueryState >,
2108 boost::statechart::custom_reaction< MLogRec >,
2109 boost::statechart::transition< NeedUpThru, WaitUpThru >
2110 > reactions;
2111 boost::statechart::result react(const QueryState& q);
2112 boost::statechart::result react(const MLogRec& logevt);
2113 };
2114
2115 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2116 explicit WaitUpThru(my_context ctx);
2117 void exit();
2118
2119 typedef boost::mpl::list <
2120 boost::statechart::custom_reaction< QueryState >,
2121 boost::statechart::custom_reaction< ActMap >,
2122 boost::statechart::custom_reaction< MLogRec >
2123 > reactions;
2124 boost::statechart::result react(const QueryState& q);
2125 boost::statechart::result react(const ActMap& am);
2126 boost::statechart::result react(const MLogRec& logrec);
2127 };
2128
2129 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2130 explicit Down(my_context ctx);
2131 typedef boost::mpl::list <
2132 boost::statechart::custom_reaction< QueryState >
2133 > reactions;
2134 boost::statechart::result react(const QueryState& q);
2135 void exit();
2136 };
2137
2138 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2139 typedef boost::mpl::list <
2140 boost::statechart::custom_reaction< AdvMap >,
2141 boost::statechart::custom_reaction< MNotifyRec >,
2142 boost::statechart::custom_reaction< QueryState >
2143 > reactions;
2144 explicit Incomplete(my_context ctx);
2145 boost::statechart::result react(const AdvMap &advmap);
2146 boost::statechart::result react(const MNotifyRec& infoevt);
2147 boost::statechart::result react(const QueryState& q);
2148 void exit();
2149 };
2150
2151
2152 RecoveryMachine machine;
2153 PG *pg;
2154
2155 /// context passed in by state machine caller
2156 RecoveryCtx *orig_ctx;
2157
2158 /// populated if we are buffering messages pending a flush
2159 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2160
2161 /**
2162 * populated between start_handle() and end_handle(); points into the
2163 * message lists of messages_pending_flush while messages are being
2164 * buffered for a flush, and into orig_ctx otherwise
2165 */
2166 boost::optional<RecoveryCtx> rctx;
2167
2168 public:
2169 explicit RecoveryState(PG *pg)
2170 : machine(this, pg), pg(pg), orig_ctx(0) {
2171 machine.initiate();
2172 }
2173
2174 void handle_event(const boost::statechart::event_base &evt,
2175 RecoveryCtx *rctx) {
2176 start_handle(rctx);
2177 machine.process_event(evt);
2178 end_handle();
2179 }
2180
2181 void handle_event(CephPeeringEvtRef evt,
2182 RecoveryCtx *rctx) {
2183 start_handle(rctx);
2184 machine.process_event(evt->get_event());
2185 end_handle();
2186 }
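    //
    // Every event delivery is bracketed by start_handle()/end_handle():
    // start_handle() installs the caller's RecoveryCtx as rctx (or, while
    // an interval flush is pending, redirects output into
    // messages_pending_flush so outgoing messages are buffered), and
    // end_handle() tears that down again.  Callers never drive the
    // machine directly; the flow is roughly (a sketch, see PG.cc for the
    // real body):
    //
    //   void PG::handle_peering_event(CephPeeringEvtRef evt,
    //                                 RecoveryCtx *rctx) {
    //     if (old_peering_evt(evt))          // stale epoch: drop it
    //       return;
    //     recovery_state.handle_event(evt, rctx);
    //   }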
2187
2188 } recovery_state;
2189
2190
2191 public:
2192 PG(OSDService *o, OSDMapRef curmap,
2193 const PGPool &pool, spg_t p);
2194 ~PG() override;
2195
2196 private:
2197 // Prevent copying
2198 explicit PG(const PG& rhs);
2199 PG& operator=(const PG& rhs);
2200 const spg_t pg_id;
2201 uint64_t peer_features;
2202 uint64_t acting_features;
2203 uint64_t upacting_features;
2204
2205 epoch_t last_epoch;
2206
2207 public:
2208 const spg_t& get_pgid() const { return pg_id; }
2209
2210 void reset_min_peer_features() {
2211 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2212 }
2213 uint64_t get_min_peer_features() const { return peer_features; }
2214 void apply_peer_features(uint64_t f) { peer_features &= f; }
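    //
    // peer_features is an intersection accumulator: it starts at the full
    // CEPH_FEATURES_SUPPORTED_DEFAULT mask and apply_peer_features() ANDs
    // in each peer's bits, so only features every peer advertises
    // survive (acting_features/upacting_features are maintained the same
    // way for narrower peer sets).  For example (illustrative values):
    //
    //   reset_min_peer_features();      // peer_features = ...1111
    //   apply_peer_features(0b1011);    // peer_features = ...1011
    //   apply_peer_features(0b1110);    // peer_features = ...1010
    //   get_min_peer_features();        // -> only the common bits remain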
2215
2216 uint64_t get_min_acting_features() const { return acting_features; }
2217 uint64_t get_min_upacting_features() const { return upacting_features; }
2218
2219 void init_primary_up_acting(
2220 const vector<int> &newup,
2221 const vector<int> &newacting,
2222 int new_up_primary,
2223 int new_acting_primary) {
2224 actingset.clear();
2225 acting = newacting;
2226 for (uint8_t i = 0; i < acting.size(); ++i) {
2227 if (acting[i] != CRUSH_ITEM_NONE)
2228 actingset.insert(
2229 pg_shard_t(
2230 acting[i],
2231 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2232 }
2233 upset.clear();
2234 up = newup;
2235 for (uint8_t i = 0; i < up.size(); ++i) {
2236 if (up[i] != CRUSH_ITEM_NONE)
2237 upset.insert(
2238 pg_shard_t(
2239 up[i],
2240 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2241 }
2242 if (!pool.info.ec_pool()) {
2243 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2244 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2245 return;
2246 }
2247 up_primary = pg_shard_t();
2248 primary = pg_shard_t();
2249 for (uint8_t i = 0; i < up.size(); ++i) {
2250 if (up[i] == new_up_primary) {
2251 up_primary = pg_shard_t(up[i], shard_id_t(i));
2252 break;
2253 }
2254 }
2255 for (uint8_t i = 0; i < acting.size(); ++i) {
2256 if (acting[i] == new_acting_primary) {
2257 primary = pg_shard_t(acting[i], shard_id_t(i));
2258 break;
2259 }
2260 }
2261 assert(up_primary.osd == new_up_primary);
2262 assert(primary.osd == new_acting_primary);
2263 }
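    //
    // For erasure-coded pools the position in the acting/up vector *is*
    // the shard id, so each entry becomes pg_shard_t(osd, shard_id_t(i));
    // replicated pools collapse to shard_id_t::NO_SHARD.  For example
    // (illustrative values), with an EC pool and
    // newacting = { 3, CRUSH_ITEM_NONE, 7 }:
    //
    //   actingset == { pg_shard_t(3, shard_id_t(0)),
    //                  pg_shard_t(7, shard_id_t(2)) }  // the hole is skipped
    //
    // and the primary is matched back to its slot to recover its shard id,
    // which the asserts at the end double-check.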
2264 pg_shard_t get_primary() const { return primary; }
2265
2266 int get_role() const { return role; }
2267 void set_role(int r) { role = r; }
2268
2269 bool is_primary() const { return pg_whoami == primary; }
2270 bool is_replica() const { return role > 0; }
2271
2272 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2273
2275 bool state_test(int m) const { return (state & m) != 0; }
2276 void state_set(int m) { state |= m; }
2277 void state_clear(int m) { state &= ~m; }
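    //
    // state is a PG_STATE_* bitmask, so these compose freely; note that
    // state_test() with a multi-bit mask is true if *any* bit is set.
    // For example (illustrative):
    //
    //   state_set(PG_STATE_ACTIVE | PG_STATE_DEGRADED);
    //   state_test(PG_STATE_DEGRADED);                   // -> true
    //   state_clear(PG_STATE_DEGRADED);
    //   state_test(PG_STATE_ACTIVE | PG_STATE_DEGRADED); // -> still true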
2278
2279 bool is_complete() const { return info.last_complete == info.last_update; }
2280 bool should_send_notify() const { return send_notify; }
2281
2282 int get_state() const { return state; }
2283 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2284 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2285 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2286 bool is_down() const { return state_test(PG_STATE_DOWN); }
2287 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2288 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2289 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2290 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2291
2292 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2293 bool is_peered() const {
2294 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2295 }
2296
2297 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2298
2299 void init(
2300 int role,
2301 const vector<int>& up,
2302 int up_primary,
2303 const vector<int>& acting,
2304 int acting_primary,
2305 const pg_history_t& history,
2306 const PastIntervals& pim,
2307 bool backfill,
2308 ObjectStore::Transaction *t);
2309
2310 // pg on-disk state
2311 void do_pending_flush();
2312
2313 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2314 static void _init(ObjectStore::Transaction& t,
2315 spg_t pgid, const pg_pool_t *pool);
2316
2317 private:
2318 void prepare_write_info(map<string,bufferlist> *km);
2319
2320 void update_store_with_options();
2321 void update_store_on_load();
2322
2323 public:
2324 static int _prepare_write_info(
2325 CephContext* cct,
2326 map<string,bufferlist> *km,
2327 epoch_t epoch,
2328 pg_info_t &info,
2329 pg_info_t &last_written_info,
2330 PastIntervals &past_intervals,
2331 bool dirty_big_info,
2332 bool dirty_epoch,
2333 bool try_fast_info,
2334 PerfCounters *logger = nullptr);
2335 void write_if_dirty(ObjectStore::Transaction& t);
2336
2337 PGLog::IndexedLog projected_log;
2338 bool check_in_progress_op(
2339 const osd_reqid_t &r,
2340 eversion_t *version,
2341 version_t *user_version,
2342 int *return_code) const;
2343 eversion_t projected_last_update;
2344 eversion_t get_next_version() const {
2345 eversion_t at_version(
2346 get_osdmap()->get_epoch(),
2347 projected_last_update.version+1);
2348 assert(at_version > info.last_update);
2349 assert(at_version > pg_log.get_head());
2350 assert(at_version > projected_last_update);
2351 return at_version;
2352 }
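    //
    // eversion_t orders by (epoch, version), and projected_last_update
    // tracks ops that have been sequenced but not yet applied, so the
    // next version pairs the current map epoch with the next projected
    // counter.  For example (illustrative values): with the osdmap at
    // epoch 40 and projected_last_update = (38, 100), get_next_version()
    // yields (40, 101), which the asserts check is strictly newer than
    // last_update, the log head, and the projection itself.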
2353
2354 void add_log_entry(const pg_log_entry_t& e, bool applied);
2355 void append_log(
2356 const vector<pg_log_entry_t>& logv,
2357 eversion_t trim_to,
2358 eversion_t roll_forward_to,
2359 ObjectStore::Transaction &t,
2360 bool transaction_applied = true);
2361 bool check_log_for_corruption(ObjectStore *store);
2362 void trim_log();
2363
2364 std::string get_corrupt_pg_log_name() const;
2365 static int read_info(
2366 ObjectStore *store, spg_t pgid, const coll_t &coll,
2367 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2368 __u8 &);
2369 void read_state(ObjectStore *store, bufferlist &bl);
2370 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2371 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2372 epoch_t *pepoch, bufferlist *bl);
2373 void update_snap_map(
2374 const vector<pg_log_entry_t> &log_entries,
2375 ObjectStore::Transaction& t);
2376
2377 void filter_snapc(vector<snapid_t> &snaps);
2378
2379 void log_weirdness();
2380
2381 virtual void kick_snap_trim() = 0;
2382 virtual void snap_trimmer_scrub_complete() = 0;
2383 bool requeue_scrub(bool high_priority = false);
2384 void queue_recovery(bool front = false);
2385 bool queue_scrub();
2386 unsigned get_scrub_priority();
2387
2388 /// share pg info after a pg is active
2389 void share_pg_info();
2390
2391
2392 bool append_log_entries_update_missing(
2393 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2394 ObjectStore::Transaction &t);
2395
2396 /**
2397 * Merge the given log entries into the logs and missing sets of all
2398 * actingbackfill shards, updating missing_loc as needed
2399 */
2400 void merge_new_log_entries(
2401 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2402 ObjectStore::Transaction &t);
2403
2404 void reset_interval_flush();
2405 void start_peering_interval(
2406 const OSDMapRef lastmap,
2407 const vector<int>& newup, int up_primary,
2408 const vector<int>& newacting, int acting_primary,
2409 ObjectStore::Transaction *t);
2410 void on_new_interval();
2411 virtual void _on_new_interval() = 0;
2412 void start_flush(ObjectStore::Transaction *t,
2413 list<Context *> *on_applied,
2414 list<Context *> *on_safe);
2415 void set_last_peering_reset();
2416 bool pg_has_reset_since(epoch_t e) {
2417 assert(is_locked());
2418 return deleting || e < get_last_peering_reset();
2419 }
2420
2421 void update_history(const pg_history_t& history);
2422 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2423 pair<pg_shard_t, pg_info_t> &notify_info);
2424 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2425
2426 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2427
2428 bool should_restart_peering(
2429 int newupprimary,
2430 int newactingprimary,
2431 const vector<int>& newup,
2432 const vector<int>& newacting,
2433 OSDMapRef lastmap,
2434 OSDMapRef osdmap);
2435
2436 // OpRequest queueing
2437 bool can_discard_op(OpRequestRef& op);
2438 bool can_discard_scan(OpRequestRef op);
2439 bool can_discard_backfill(OpRequestRef op);
2440 bool can_discard_request(OpRequestRef& op);
2441
2442 template<typename T, int MSGTYPE>
2443 bool can_discard_replica_op(OpRequestRef& op);
2444
2445 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2446 bool old_peering_evt(CephPeeringEvtRef evt) {
2447 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2448 }
2449 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2450 return e <= cur_epoch;
2451 }
2452 bool have_same_or_newer_map(epoch_t e) {
2453 return e <= get_osdmap()->get_epoch();
2454 }
2455
2456 bool op_has_sufficient_caps(OpRequestRef& op);
2457
2458
2459 // recovery bits
2460 void take_waiters();
2461 void queue_peering_event(CephPeeringEvtRef evt);
2462 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2463 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2464 pg_shard_t from, const pg_query_t& q);
2465 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2466 void queue_flushed(epoch_t started_at);
2467 void handle_advance_map(
2468 OSDMapRef osdmap, OSDMapRef lastmap,
2469 vector<int>& newup, int up_primary,
2470 vector<int>& newacting, int acting_primary,
2471 RecoveryCtx *rctx);
2472 void handle_activate_map(RecoveryCtx *rctx);
2473 void handle_create(RecoveryCtx *rctx);
2474 void handle_loaded(RecoveryCtx *rctx);
2475 void handle_query_state(Formatter *f);
2476
2477 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2478
2479
2480 // abstract bits
2481 virtual void do_request(
2482 OpRequestRef& op,
2483 ThreadPool::TPHandle &handle
2484 ) = 0;
2485
2486 virtual void do_op(OpRequestRef& op) = 0;
2487 virtual void do_sub_op(OpRequestRef op) = 0;
2488 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2489 virtual void do_scan(
2490 OpRequestRef op,
2491 ThreadPool::TPHandle &handle
2492 ) = 0;
2493 virtual void do_backfill(OpRequestRef op) = 0;
2494 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2495
2496 virtual int do_command(
2497 cmdmap_t cmdmap,
2498 ostream& ss,
2499 bufferlist& idata,
2500 bufferlist& odata,
2501 ConnectionRef conn,
2502 ceph_tid_t tid) = 0;
2503
2504 virtual void on_role_change() = 0;
2505 virtual void on_pool_change() = 0;
2506 virtual void on_change(ObjectStore::Transaction *t) = 0;
2507 virtual void on_activate() = 0;
2508 virtual void on_flushed() = 0;
2509 virtual void on_shutdown() = 0;
2510 virtual void check_blacklisted_watchers() = 0;
2511 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2512
2513 virtual bool agent_work(int max) = 0;
2514 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2515 virtual void agent_stop() = 0;
2516 virtual void agent_delay() = 0;
2517 virtual void agent_clear() = 0;
2518 virtual void agent_choose_mode_restart() = 0;
2519 };
2520
2521 ostream& operator<<(ostream& out, const PG& pg);
2522
2523 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2524
2525 #endif