]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_PG_H | |
16 | #define CEPH_PG_H | |
17 | ||
18 | #include <boost/statechart/custom_reaction.hpp> | |
19 | #include <boost/statechart/event.hpp> | |
20 | #include <boost/statechart/simple_state.hpp> | |
21 | #include <boost/statechart/state.hpp> | |
22 | #include <boost/statechart/state_machine.hpp> | |
23 | #include <boost/statechart/transition.hpp> | |
24 | #include <boost/statechart/event_base.hpp> | |
25 | #include <boost/scoped_ptr.hpp> | |
26 | #include <boost/circular_buffer.hpp> | |
91327a77 | 27 | #include <boost/container/flat_set.hpp> |
7c673cae FG |
28 | #include "include/mempool.h" |
29 | ||
30 | // re-include our assert to clobber boost's | |
11fdf7f2 | 31 | #include "include/ceph_assert.h" |
7c673cae FG |
32 | |
33 | #include "include/types.h" | |
34 | #include "include/stringify.h" | |
35 | #include "osd_types.h" | |
36 | #include "include/xlist.h" | |
37 | #include "SnapMapper.h" | |
38 | #include "Session.h" | |
39 | #include "common/Timer.h" | |
40 | ||
41 | #include "PGLog.h" | |
42 | #include "OSDMap.h" | |
43 | #include "messages/MOSDPGLog.h" | |
44 | #include "include/str_list.h" | |
45 | #include "PGBackend.h" | |
11fdf7f2 TL |
46 | #include "PGPeeringEvent.h" |
47 | ||
48 | #include "mgr/OSDPerfMetricTypes.h" | |
7c673cae FG |
49 | |
50 | #include <atomic> | |
51 | #include <list> | |
52 | #include <memory> | |
53 | #include <stack> | |
54 | #include <string> | |
55 | #include <tuple> | |
7c673cae FG |
56 | |
57 | //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs | |
11fdf7f2 | 58 | //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks |
7c673cae FG |
59 | |
60 | class OSD; | |
61 | class OSDService; | |
11fdf7f2 TL |
62 | class OSDShard; |
63 | class OSDShardPGSlot; | |
7c673cae FG |
64 | class MOSDOp; |
65 | class MOSDPGScan; | |
66 | class MOSDPGBackfill; | |
67 | class MOSDPGInfo; | |
68 | ||
69 | class PG; | |
70 | struct OpRequest; | |
71 | typedef OpRequest::Ref OpRequestRef; | |
72 | class MOSDPGLog; | |
73 | class CephContext; | |
11fdf7f2 | 74 | class DynamicPerfStats; |
7c673cae FG |
75 | |
76 | namespace Scrub { | |
77 | class Store; | |
78 | } | |
79 | ||
7c673cae FG |
80 | using state_history_entry = std::tuple<utime_t, utime_t, const char*>; |
81 | using embedded_state = std::pair<utime_t, const char*>; | |
82 | ||
83 | struct PGStateInstance { | |
84 | // Time spent in pg states | |
85 | ||
86 | void setepoch(const epoch_t current_epoch) { | |
87 | this_epoch = current_epoch; | |
88 | } | |
89 | ||
90 | void enter_state(const utime_t entime, const char* state) { | |
91 | embedded_states.push(std::make_pair(entime, state)); | |
92 | } | |
93 | ||
94 | void exit_state(const utime_t extime) { | |
95 | embedded_state this_state = embedded_states.top(); | |
96 | state_history.push_back(state_history_entry{ | |
97 | this_state.first, extime, this_state.second}); | |
98 | embedded_states.pop(); | |
99 | } | |
100 | ||
101 | epoch_t this_epoch; | |
102 | utime_t enter_time; | |
103 | std::vector<state_history_entry> state_history; | |
104 | std::stack<embedded_state> embedded_states; | |
105 | }; | |
106 | ||
107 | class PGStateHistory { | |
108 | // Member access protected with the PG lock | |
109 | public: | |
110 | PGStateHistory() : buffer(10) {} | |
111 | ||
112 | void enter(PG* pg, const utime_t entime, const char* state); | |
113 | ||
114 | void exit(const char* state); | |
115 | ||
116 | void reset() { | |
117 | pi = nullptr; | |
118 | } | |
119 | ||
120 | void set_pg_in_destructor() { pg_in_destructor = true; } | |
121 | ||
122 | void dump(Formatter* f) const; | |
123 | ||
a8e16298 TL |
124 | string get_current_state() { |
125 | if (pi == nullptr) return "unknown"; | |
126 | return std::get<1>(pi->embedded_states.top()); | |
127 | } | |
128 | ||
7c673cae FG |
129 | private: |
130 | bool pg_in_destructor = false; | |
131 | PG* thispg = nullptr; | |
132 | std::unique_ptr<PGStateInstance> tmppi; | |
133 | PGStateInstance* pi = nullptr; | |
134 | boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer; | |
135 | ||
136 | }; | |
137 | ||
138 | #ifdef PG_DEBUG_REFS | |
139 | #include "common/tracked_int_ptr.hpp" | |
140 | uint64_t get_with_id(PG *pg); | |
141 | void put_with_id(PG *pg, uint64_t id); | |
142 | typedef TrackedIntPtr<PG> PGRef; | |
143 | #else | |
144 | typedef boost::intrusive_ptr<PG> PGRef; | |
145 | #endif | |
146 | ||
147 | class PGRecoveryStats { | |
148 | struct per_state_info { | |
149 | uint64_t enter, exit; // enter/exit counts | |
150 | uint64_t events; | |
151 | utime_t event_time; // time spent processing events | |
152 | utime_t total_time; // total time in state | |
153 | utime_t min_time, max_time; | |
154 | ||
155 | // cppcheck-suppress unreachableCode | |
156 | per_state_info() : enter(0), exit(0), events(0) {} | |
157 | }; | |
158 | map<const char *,per_state_info> info; | |
159 | Mutex lock; | |
160 | ||
161 | public: | |
162 | PGRecoveryStats() : lock("PGRecoverStats::lock") {} | |
163 | ||
164 | void reset() { | |
11fdf7f2 | 165 | std::lock_guard l(lock); |
7c673cae FG |
166 | info.clear(); |
167 | } | |
168 | void dump(ostream& out) { | |
11fdf7f2 | 169 | std::lock_guard l(lock); |
7c673cae FG |
170 | for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) { |
171 | per_state_info& i = p->second; | |
172 | out << i.enter << "\t" << i.exit << "\t" | |
173 | << i.events << "\t" << i.event_time << "\t" | |
174 | << i.total_time << "\t" | |
175 | << i.min_time << "\t" << i.max_time << "\t" | |
176 | << p->first << "\n"; | |
177 | } | |
178 | } | |
179 | ||
180 | void dump_formatted(Formatter *f) { | |
11fdf7f2 | 181 | std::lock_guard l(lock); |
7c673cae FG |
182 | f->open_array_section("pg_recovery_stats"); |
183 | for (map<const char *,per_state_info>::iterator p = info.begin(); | |
184 | p != info.end(); ++p) { | |
185 | per_state_info& i = p->second; | |
186 | f->open_object_section("recovery_state"); | |
187 | f->dump_int("enter", i.enter); | |
188 | f->dump_int("exit", i.exit); | |
189 | f->dump_int("events", i.events); | |
190 | f->dump_stream("event_time") << i.event_time; | |
191 | f->dump_stream("total_time") << i.total_time; | |
192 | f->dump_stream("min_time") << i.min_time; | |
193 | f->dump_stream("max_time") << i.max_time; | |
194 | vector<string> states; | |
195 | get_str_vec(p->first, "/", states); | |
196 | f->open_array_section("nested_states"); | |
197 | for (vector<string>::iterator st = states.begin(); | |
198 | st != states.end(); ++st) { | |
199 | f->dump_string("state", *st); | |
200 | } | |
201 | f->close_section(); | |
202 | f->close_section(); | |
203 | } | |
204 | f->close_section(); | |
205 | } | |
206 | ||
207 | void log_enter(const char *s) { | |
11fdf7f2 | 208 | std::lock_guard l(lock); |
7c673cae FG |
209 | info[s].enter++; |
210 | } | |
211 | void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) { | |
11fdf7f2 | 212 | std::lock_guard l(lock); |
7c673cae FG |
213 | per_state_info &i = info[s]; |
214 | i.exit++; | |
215 | i.total_time += dur; | |
216 | if (dur > i.max_time) | |
217 | i.max_time = dur; | |
218 | if (dur < i.min_time || i.min_time == utime_t()) | |
219 | i.min_time = dur; | |
220 | i.events += events; | |
221 | i.event_time += event_dur; | |
222 | } | |
223 | }; | |
224 | ||
struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;   // epoch of the OSDMap this state was cached from
  int64_t id;             // pool id
  string name;            // pool name

  pg_pool_t info;         // cached copy of the pool's pg_pool_t
  SnapContext snapc;   // the default pool snapc, ready to go.

  // these two sets are for < mimic only
  interval_set<snapid_t> cached_removed_snaps;      // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;  // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
	 const string& name)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(name),
      info(info) {
    // NOTE: inside the body, 'info' names the ctor parameter (it shadows
    // the member, which was just initialized to an identical copy).
    snapc = info.get_snap_context();
    // pre-mimic maps carry removed snaps in the pool info itself
    if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
      info.build_removed_snaps(cached_removed_snaps);
    }
  }

  /// Refresh the cached state from a newer OSDMap (defined in PG.cc).
  void update(CephContext *cct, OSDMapRef map);
};
253 | ||
254 | /** PG - Replica Placement Group | |
255 | * | |
256 | */ | |
257 | ||
258 | class PG : public DoutPrefixProvider { | |
a8e16298 | 259 | public: |
11fdf7f2 TL |
260 | // -- members -- |
261 | const spg_t pg_id; | |
262 | const coll_t coll; | |
263 | ||
264 | ObjectStore::CollectionHandle ch; | |
265 | ||
266 | struct RecoveryCtx; | |
267 | ||
268 | // -- methods -- | |
269 | std::ostream& gen_prefix(std::ostream& out) const override; | |
270 | CephContext *get_cct() const override { | |
271 | return cct; | |
272 | } | |
273 | unsigned get_subsys() const override { | |
274 | return ceph_subsys_osd; | |
275 | } | |
276 | ||
277 | const OSDMapRef& get_osdmap() const { | |
278 | ceph_assert(is_locked()); | |
279 | ceph_assert(osdmap_ref); | |
280 | return osdmap_ref; | |
281 | } | |
282 | epoch_t get_osdmap_epoch() const { | |
283 | return osdmap_ref->get_epoch(); | |
284 | } | |
285 | ||
  /// Take the PG lock without letting the thread-pool heartbeat time out
  /// while this thread blocks waiting for it.
  void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
    handle.suspend_tp_timeout();
    lock();
    handle.reset_tp_timeout();
  }
291 | void lock(bool no_lockdep = false) const; | |
  /// Release the PG lock.  Dirty info/big_info must have been written out
  /// (or the flags cleared) before dropping the lock.
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    ceph_assert(!dirty_info);
    ceph_assert(!dirty_big_info);
    _lock.Unlock();
  }
298 | bool is_locked() const { | |
299 | return _lock.is_locked(); | |
300 | } | |
301 | ||
302 | const spg_t& get_pgid() const { | |
303 | return pg_id; | |
304 | } | |
305 | ||
306 | const PGPool& get_pool() const { | |
307 | return pool; | |
308 | } | |
309 | uint64_t get_last_user_version() const { | |
310 | return info.last_user_version; | |
311 | } | |
312 | const pg_history_t& get_history() const { | |
313 | return info.history; | |
314 | } | |
315 | bool get_need_up_thru() const { | |
316 | return need_up_thru; | |
317 | } | |
318 | epoch_t get_same_interval_since() const { | |
319 | return info.history.same_interval_since; | |
320 | } | |
321 | ||
322 | void set_last_scrub_stamp(utime_t t) { | |
323 | info.stats.last_scrub_stamp = t; | |
324 | info.history.last_scrub_stamp = t; | |
325 | } | |
326 | ||
327 | void set_last_deep_scrub_stamp(utime_t t) { | |
328 | info.stats.last_deep_scrub_stamp = t; | |
329 | info.history.last_deep_scrub_stamp = t; | |
330 | } | |
331 | ||
332 | bool is_deleting() const { | |
333 | return deleting; | |
334 | } | |
335 | bool is_deleted() const { | |
336 | return deleted; | |
337 | } | |
338 | bool is_replica() const { | |
339 | return role > 0; | |
340 | } | |
341 | bool is_primary() const { | |
342 | return pg_whoami == primary; | |
343 | } | |
  /// @return true if the PG has been deleted, or has gone through a
  /// peering reset after epoch @p e (i.e. state from epoch e is stale).
  bool pg_has_reset_since(epoch_t e) {
    ceph_assert(is_locked());
    return deleted || e < get_last_peering_reset();
  }
348 | ||
349 | bool is_ec_pg() const { | |
350 | return pool.info.is_erasure(); | |
351 | } | |
352 | int get_role() const { | |
353 | return role; | |
354 | } | |
355 | const vector<int> get_acting() const { | |
356 | return acting; | |
357 | } | |
358 | int get_acting_primary() const { | |
359 | return primary.osd; | |
360 | } | |
361 | pg_shard_t get_primary() const { | |
362 | return primary; | |
363 | } | |
364 | const vector<int> get_up() const { | |
365 | return up; | |
366 | } | |
367 | int get_up_primary() const { | |
368 | return up_primary.osd; | |
369 | } | |
370 | const PastIntervals& get_past_intervals() const { | |
371 | return past_intervals; | |
372 | } | |
373 | ||
374 | /// initialize created PG | |
375 | void init( | |
376 | int role, | |
377 | const vector<int>& up, | |
378 | int up_primary, | |
379 | const vector<int>& acting, | |
380 | int acting_primary, | |
381 | const pg_history_t& history, | |
382 | const PastIntervals& pim, | |
383 | bool backfill, | |
384 | ObjectStore::Transaction *t); | |
385 | ||
386 | /// read existing pg state off disk | |
387 | void read_state(ObjectStore *store); | |
388 | static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch); | |
389 | ||
390 | static int get_latest_struct_v() { | |
391 | return latest_struct_v; | |
392 | } | |
393 | static int get_compat_struct_v() { | |
394 | return compat_struct_v; | |
395 | } | |
396 | static int read_info( | |
397 | ObjectStore *store, spg_t pgid, const coll_t &coll, | |
398 | pg_info_t &info, PastIntervals &past_intervals, | |
399 | __u8 &); | |
400 | static bool _has_removal_flag(ObjectStore *store, spg_t pgid); | |
401 | ||
402 | void rm_backoff(BackoffRef b); | |
403 | ||
404 | void update_snap_mapper_bits(uint32_t bits) { | |
405 | snap_mapper.update_bits(bits); | |
406 | } | |
407 | void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v); | |
408 | virtual void split_colls( | |
409 | spg_t child, | |
410 | int split_bits, | |
411 | int seed, | |
412 | const pg_pool_t *pool, | |
413 | ObjectStore::Transaction *t) = 0; | |
414 | void split_into(pg_t child_pgid, PG *child, unsigned split_bits); | |
415 | void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx, | |
416 | unsigned split_bits, | |
417 | const pg_merge_meta_t& last_pg_merge_meta); | |
418 | void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t); | |
419 | ||
420 | void scrub(epoch_t queued, ThreadPool::TPHandle &handle); | |
494da23a TL |
421 | |
422 | bool is_scrub_registered(); | |
11fdf7f2 TL |
423 | void reg_next_scrub(); |
424 | void unreg_next_scrub(); | |
425 | ||
494da23a TL |
426 | void on_info_history_change(); |
427 | ||
428 | void scrub_requested(bool deep, bool repair, bool need_auto = false); | |
429 | ||
11fdf7f2 TL |
430 | bool is_forced_recovery_or_backfill() const { |
431 | return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL); | |
432 | } | |
a8e16298 TL |
433 | bool set_force_recovery(bool b); |
434 | bool set_force_backfill(bool b); | |
435 | ||
11fdf7f2 TL |
436 | void queue_peering_event(PGPeeringEventRef evt); |
437 | void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx); | |
438 | void queue_null(epoch_t msg_epoch, epoch_t query_epoch); | |
439 | void queue_flushed(epoch_t started_at); | |
440 | void handle_advance_map( | |
441 | OSDMapRef osdmap, OSDMapRef lastmap, | |
442 | vector<int>& newup, int up_primary, | |
443 | vector<int>& newacting, int acting_primary, | |
444 | RecoveryCtx *rctx); | |
445 | void handle_activate_map(RecoveryCtx *rctx); | |
446 | void handle_initialize(RecoveryCtx *rctx); | |
447 | void handle_query_state(Formatter *f); | |
448 | ||
449 | /** | |
450 | * @param ops_begun returns how many recovery ops the function started | |
451 | * @returns true if any useful work was accomplished; false otherwise | |
452 | */ | |
453 | virtual bool start_recovery_ops( | |
454 | uint64_t max, | |
455 | ThreadPool::TPHandle &handle, | |
456 | uint64_t *ops_begun) = 0; | |
457 | ||
458 | // more work after the above, but with a RecoveryCtx | |
459 | void find_unfound(epoch_t queued, RecoveryCtx *rctx); | |
460 | ||
461 | virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0; | |
462 | ||
463 | void dump_pgstate_history(Formatter *f); | |
464 | void dump_missing(Formatter *f); | |
465 | ||
466 | void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f); | |
467 | void with_heartbeat_peers(std::function<void(int)> f); | |
468 | ||
469 | void shutdown(); | |
470 | virtual void on_shutdown() = 0; | |
471 | ||
472 | bool get_must_scrub() const { | |
473 | return scrubber.must_scrub; | |
474 | } | |
475 | bool sched_scrub(); | |
476 | ||
477 | virtual void do_request( | |
478 | OpRequestRef& op, | |
479 | ThreadPool::TPHandle &handle | |
480 | ) = 0; | |
481 | virtual void clear_cache() = 0; | |
482 | virtual int get_cache_obj_count() = 0; | |
483 | ||
484 | virtual void snap_trimmer(epoch_t epoch_queued) = 0; | |
485 | virtual int do_command( | |
486 | cmdmap_t cmdmap, | |
487 | ostream& ss, | |
488 | bufferlist& idata, | |
489 | bufferlist& odata, | |
490 | ConnectionRef conn, | |
491 | ceph_tid_t tid) = 0; | |
492 | ||
493 | virtual bool agent_work(int max) = 0; | |
494 | virtual bool agent_work(int max, int agent_flush_quota) = 0; | |
495 | virtual void agent_stop() = 0; | |
496 | virtual void agent_delay() = 0; | |
497 | virtual void agent_clear() = 0; | |
498 | virtual void agent_choose_mode_restart() = 0; | |
499 | ||
500 | virtual void on_removal(ObjectStore::Transaction *t) = 0; | |
501 | ||
502 | void _delete_some(ObjectStore::Transaction *t); | |
503 | ||
504 | virtual void set_dynamic_perf_stats_queries( | |
505 | const std::list<OSDPerfMetricQuery> &queries) { | |
506 | } | |
507 | virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) { | |
508 | } | |
509 | ||
510 | // reference counting | |
511 | #ifdef PG_DEBUG_REFS | |
512 | uint64_t get_with_id(); | |
513 | void put_with_id(uint64_t); | |
514 | void dump_live_ids(); | |
515 | #endif | |
516 | void get(const char* tag); | |
517 | void put(const char* tag); | |
518 | int get_num_ref() { | |
519 | return ref; | |
520 | } | |
521 | ||
522 | // ctor | |
523 | PG(OSDService *o, OSDMapRef curmap, | |
524 | const PGPool &pool, spg_t p); | |
525 | ~PG() override; | |
526 | ||
527 | // prevent copying | |
528 | explicit PG(const PG& rhs) = delete; | |
529 | PG& operator=(const PG& rhs) = delete; | |
530 | ||
7c673cae | 531 | protected: |
11fdf7f2 TL |
532 | // ------------- |
533 | // protected | |
7c673cae | 534 | OSDService *osd; |
11fdf7f2 TL |
535 | public: |
536 | OSDShard *osd_shard = nullptr; | |
537 | OSDShardPGSlot *pg_slot = nullptr; | |
538 | protected: | |
7c673cae | 539 | CephContext *cct; |
11fdf7f2 TL |
540 | |
541 | // osdmap | |
542 | OSDMapRef osdmap_ref; | |
543 | ||
544 | PGPool pool; | |
545 | ||
546 | // locking and reference counting. | |
547 | // I destroy myself when the reference count hits zero. | |
548 | // lock() should be called before doing anything. | |
549 | // get() should be called on pointer copy (to another thread, etc.). | |
550 | // put() should be called on destruction of some previously copied pointer. | |
551 | // unlock() when done with the current pointer (_most common_). | |
552 | mutable Mutex _lock = {"PG::_lock"}; | |
553 | ||
554 | std::atomic<unsigned int> ref{0}; | |
555 | ||
556 | #ifdef PG_DEBUG_REFS | |
557 | Mutex _ref_id_lock = {"PG::_ref_id_lock"}; | |
558 | map<uint64_t, string> _live_ids; | |
559 | map<string, uint64_t> _tag_counts; | |
560 | uint64_t _ref_id = 0; | |
561 | ||
562 | friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); } | |
563 | friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); } | |
564 | #endif | |
565 | ||
566 | private: | |
567 | friend void intrusive_ptr_add_ref(PG *pg) { | |
568 | pg->get("intptr"); | |
569 | } | |
570 | friend void intrusive_ptr_release(PG *pg) { | |
571 | pg->put("intptr"); | |
572 | } | |
573 | ||
574 | ||
575 | // ===================== | |
576 | ||
577 | protected: | |
7c673cae FG |
578 | OSDriver osdriver; |
579 | SnapMapper snap_mapper; | |
224ce89b | 580 | bool eio_errors_to_process = false; |
7c673cae FG |
581 | |
582 | virtual PGBackend *get_pgbackend() = 0; | |
11fdf7f2 | 583 | virtual const PGBackend* get_pgbackend() const = 0; |
7c673cae | 584 | |
11fdf7f2 | 585 | protected: |
7c673cae | 586 | /*** PG ****/ |
7c673cae | 587 | /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done |
11fdf7f2 | 588 | IsPGRecoverablePredicate *get_is_recoverable_predicate() const { |
7c673cae FG |
589 | return get_pgbackend()->get_is_recoverable_predicate(); |
590 | } | |
591 | protected: | |
11fdf7f2 | 592 | epoch_t last_persisted_osdmap; |
7c673cae FG |
593 | |
594 | void requeue_map_waiters(); | |
595 | ||
596 | void update_osdmap_ref(OSDMapRef newmap) { | |
11fdf7f2 | 597 | ceph_assert(_lock.is_locked_by_me()); |
7c673cae FG |
598 | osdmap_ref = std::move(newmap); |
599 | } | |
600 | ||
7c673cae FG |
601 | protected: |
602 | ||
7c673cae | 603 | |
7c673cae | 604 | bool deleting; // true while in removing or OSD is shutting down |
11fdf7f2 | 605 | atomic<bool> deleted = {false}; |
7c673cae FG |
606 | |
607 | ZTracer::Endpoint trace_endpoint; | |
608 | ||
7c673cae | 609 | |
11fdf7f2 | 610 | protected: |
7c673cae FG |
611 | bool dirty_info, dirty_big_info; |
612 | ||
11fdf7f2 | 613 | protected: |
7c673cae FG |
614 | // pg state |
615 | pg_info_t info; ///< current pg info | |
616 | pg_info_t last_written_info; ///< last written info | |
11fdf7f2 TL |
617 | __u8 info_struct_v = 0; |
618 | static const __u8 latest_struct_v = 10; | |
7c673cae FG |
619 | // v10 is the new past_intervals encoding |
620 | // v9 was fastinfo_key addition | |
621 | // v8 was the move to a per-pg pgmeta object | |
622 | // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad | |
623 | // (first appeared in cuttlefish). | |
11fdf7f2 | 624 | static const __u8 compat_struct_v = 10; |
7c673cae FG |
625 | void upgrade(ObjectStore *store); |
626 | ||
11fdf7f2 | 627 | protected: |
7c673cae | 628 | PGLog pg_log; |
7c673cae FG |
629 | ghobject_t pgmeta_oid; |
630 | ||
91327a77 AA |
631 | // ------------------ |
632 | // MissingLoc | |
633 | ||
7c673cae | 634 | class MissingLoc { |
91327a77 AA |
635 | public: |
    // a loc_count indicates how many locations we know in each of
    // these distinct sets
    struct loc_count_t {
      int up = 0;        //< up
      int other = 0;     //< other

      // lexicographic (up, other) ordering so loc_count_t can be used
      // as a map key
      friend bool operator<(const loc_count_t& l,
			    const loc_count_t& r) {
	return (l.up < r.up ||
		(l.up == r.up &&
		 (l.other < r.other)));
      }
      friend ostream& operator<<(ostream& out, const loc_count_t& l) {
	ceph_assert(l.up >= 0);
	ceph_assert(l.other >= 0);
	return out << "(" << l.up << "+" << l.other << ")";
      }
    };
654 | ||
655 | ||
656 | private: | |
657 | ||
658 | loc_count_t _get_count(const set<pg_shard_t>& shards) { | |
659 | loc_count_t r; | |
660 | for (auto s : shards) { | |
661 | if (pg->upset.count(s)) { | |
662 | r.up++; | |
663 | } else { | |
664 | r.other++; | |
665 | } | |
666 | } | |
667 | return r; | |
668 | } | |
669 | ||
7c673cae FG |
670 | map<hobject_t, pg_missing_item> needs_recovery_map; |
671 | map<hobject_t, set<pg_shard_t> > missing_loc; | |
672 | set<pg_shard_t> missing_loc_sources; | |
91327a77 AA |
673 | |
674 | // for every entry in missing_loc, we count how many of each type of shard we have, | |
675 | // and maintain totals here. The sum of the values for this map will always equal | |
676 | // missing_loc.size(). | |
677 | map < shard_id_t, map<loc_count_t,int> > missing_by_count; | |
678 | ||
    /// Group the shards in @p s by shard id into @p pgsbs.  For EC pools
    /// every shard id of the pool gets an entry (possibly empty) so that
    /// completely missing shards are represented too; for replicated
    /// pools everything lands under NO_SHARD.
    void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) {
      if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
	int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
	// For completely missing shards initialize with empty set<pg_shard_t>
	for (int i = 0 ; i < num_shards ; ++i) {
	  shard_id_t shard(i);
	  pgsbs[shard];
	}
	for (auto pgs: s)
	  pgsbs[pgs.shard].insert(pgs);
      } else {
	pgsbs[shard_id_t::NO_SHARD] = s;
      }
    }
693 | ||
694 | void _inc_count(const set<pg_shard_t>& s) { | |
695 | map< shard_id_t, set<pg_shard_t> > pgsbs; | |
696 | pgs_by_shard_id(s, pgsbs); | |
697 | for (auto shard: pgsbs) | |
698 | ++missing_by_count[shard.first][_get_count(shard.second)]; | |
699 | } | |
700 | void _dec_count(const set<pg_shard_t>& s) { | |
701 | map< shard_id_t, set<pg_shard_t> > pgsbs; | |
702 | pgs_by_shard_id(s, pgsbs); | |
703 | for (auto shard: pgsbs) { | |
704 | auto p = missing_by_count[shard.first].find(_get_count(shard.second)); | |
11fdf7f2 | 705 | ceph_assert(p != missing_by_count[shard.first].end()); |
91327a77 AA |
706 | if (--p->second == 0) { |
707 | missing_by_count[shard.first].erase(p); | |
708 | } | |
709 | } | |
710 | } | |
711 | ||
7c673cae FG |
712 | PG *pg; |
713 | set<pg_shard_t> empty_set; | |
714 | public: | |
715 | boost::scoped_ptr<IsPGReadablePredicate> is_readable; | |
716 | boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable; | |
717 | explicit MissingLoc(PG *pg) | |
91327a77 | 718 | : pg(pg) { } |
7c673cae FG |
719 | void set_backend_predicates( |
720 | IsPGReadablePredicate *_is_readable, | |
721 | IsPGRecoverablePredicate *_is_recoverable) { | |
722 | is_readable.reset(_is_readable); | |
723 | is_recoverable.reset(_is_recoverable); | |
724 | } | |
11fdf7f2 TL |
725 | std::ostream& gen_prefix(std::ostream& out) const { |
726 | return pg->gen_prefix(out); | |
727 | } | |
7c673cae FG |
728 | bool needs_recovery( |
729 | const hobject_t &hoid, | |
730 | eversion_t *v = 0) const { | |
731 | map<hobject_t, pg_missing_item>::const_iterator i = | |
732 | needs_recovery_map.find(hoid); | |
733 | if (i == needs_recovery_map.end()) | |
734 | return false; | |
735 | if (v) | |
736 | *v = i->second.need; | |
737 | return true; | |
738 | } | |
c07f9fc5 FG |
    /// @return true if @p hoid's pending recovery entry is a delete.
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
	return false;
      return i->second.is_delete();
    }
7c673cae | 745 | bool is_unfound(const hobject_t &hoid) const { |
11fdf7f2 TL |
746 | auto it = needs_recovery_map.find(hoid); |
747 | if (it == needs_recovery_map.end()) { | |
748 | return false; | |
749 | } | |
750 | if (it->second.is_delete()) { | |
751 | return false; | |
752 | } | |
753 | auto mit = missing_loc.find(hoid); | |
754 | return mit == missing_loc.end() || !(*is_recoverable)(mit->second); | |
7c673cae FG |
755 | } |
756 | bool readable_with_acting( | |
757 | const hobject_t &hoid, | |
758 | const set<pg_shard_t> &acting) const; | |
759 | uint64_t num_unfound() const { | |
760 | uint64_t ret = 0; | |
761 | for (map<hobject_t, pg_missing_item>::const_iterator i = | |
762 | needs_recovery_map.begin(); | |
763 | i != needs_recovery_map.end(); | |
764 | ++i) { | |
11fdf7f2 TL |
765 | if (i->second.is_delete()) |
766 | continue; | |
767 | auto mi = missing_loc.find(i->first); | |
768 | if (mi == missing_loc.end() || !(*is_recoverable)(mi->second)) | |
7c673cae FG |
769 | ++ret; |
770 | } | |
771 | return ret; | |
772 | } | |
773 | ||
774 | bool have_unfound() const { | |
775 | for (map<hobject_t, pg_missing_item>::const_iterator i = | |
776 | needs_recovery_map.begin(); | |
777 | i != needs_recovery_map.end(); | |
778 | ++i) { | |
11fdf7f2 TL |
779 | if (i->second.is_delete()) |
780 | continue; | |
781 | auto mi = missing_loc.find(i->first); | |
782 | if (mi == missing_loc.end() || !(*is_recoverable)(mi->second)) | |
7c673cae FG |
783 | return true; |
784 | } | |
785 | return false; | |
786 | } | |
    /// Drop all recovery/location state, including the per-shard
    /// missing_by_count totals.
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
      missing_by_count.clear();
    }
793 | ||
794 | void add_location(const hobject_t &hoid, pg_shard_t location) { | |
91327a77 AA |
795 | auto p = missing_loc.find(hoid); |
796 | if (p == missing_loc.end()) { | |
797 | p = missing_loc.emplace(hoid, set<pg_shard_t>()).first; | |
798 | } else { | |
799 | _dec_count(p->second); | |
800 | } | |
801 | p->second.insert(location); | |
802 | _inc_count(p->second); | |
7c673cae FG |
803 | } |
804 | void remove_location(const hobject_t &hoid, pg_shard_t location) { | |
91327a77 AA |
805 | auto p = missing_loc.find(hoid); |
806 | if (p != missing_loc.end()) { | |
807 | _dec_count(p->second); | |
808 | p->second.erase(location); | |
11fdf7f2 TL |
809 | if (p->second.empty()) { |
810 | missing_loc.erase(p); | |
811 | } else { | |
812 | _inc_count(p->second); | |
813 | } | |
91327a77 | 814 | } |
7c673cae | 815 | } |
11fdf7f2 TL |
816 | |
    /// Forget every known location of @p hoid (its needs_recovery_map
    /// entry, if any, is untouched).
    void clear_location(const hobject_t &hoid) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
	_dec_count(p->second);
	missing_loc.erase(p);
      }
    }
824 | ||
7c673cae FG |
825 | void add_active_missing(const pg_missing_t &missing) { |
826 | for (map<hobject_t, pg_missing_item>::const_iterator i = | |
827 | missing.get_items().begin(); | |
828 | i != missing.get_items().end(); | |
829 | ++i) { | |
830 | map<hobject_t, pg_missing_item>::const_iterator j = | |
831 | needs_recovery_map.find(i->first); | |
832 | if (j == needs_recovery_map.end()) { | |
833 | needs_recovery_map.insert(*i); | |
834 | } else { | |
c07f9fc5 FG |
835 | lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for " |
836 | << i->first << " have " << j->second | |
837 | << " tried to add " << i->second << dendl; | |
11fdf7f2 | 838 | ceph_assert(i->second.need == j->second.need); |
7c673cae FG |
839 | } |
840 | } | |
841 | } | |
842 | ||
11fdf7f2 TL |
    /// Record that @p hoid needs recovery from version @p have to
    /// @p need (overwrites any existing entry).
    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
      needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
    }
    /// Change the needed version of an object that must already be in
    /// needs_recovery_map (asserted).
    void revise_need(const hobject_t &hoid, eversion_t need) {
      auto it = needs_recovery_map.find(hoid);
      ceph_assert(it != needs_recovery_map.end());
      it->second.need = need;
    }
851 | ||
852 | /// Adds info about a possible recovery source | |
853 | bool add_source_info( | |
854 | pg_shard_t source, ///< [in] source | |
855 | const pg_info_t &oinfo, ///< [in] info | |
856 | const pg_missing_t &omissing, ///< [in] (optional) missing | |
857 | ThreadPool::TPHandle* handle ///< [in] ThreadPool handle | |
858 | ); ///< @return whether a new object location was discovered | |
859 | ||
860 | /// Adds recovery sources in batch | |
861 | void add_batch_sources_info( | |
862 | const set<pg_shard_t> &sources, ///< [in] a set of resources which can be used for all objects | |
863 | ThreadPool::TPHandle* handle ///< [in] ThreadPool handle | |
864 | ); | |
865 | ||
866 | /// Uses osdmap to update structures for now down sources | |
867 | void check_recovery_sources(const OSDMapRef& osdmap); | |
868 | ||
869 | /// Call when hoid is no longer missing in acting set | |
870 | void recovered(const hobject_t &hoid) { | |
871 | needs_recovery_map.erase(hoid); | |
91327a77 AA |
872 | auto p = missing_loc.find(hoid); |
873 | if (p != missing_loc.end()) { | |
874 | _dec_count(p->second); | |
875 | missing_loc.erase(p); | |
876 | } | |
7c673cae FG |
877 | } |
878 | ||
    /// Call to update structures for hoid after a change.
    ///
    /// Rebuilds this object's needs_recovery_map and missing_loc entries
    /// from scratch: the authoritative missing item is taken from the
    /// local missing set if present, otherwise from the first to_recover
    /// peer whose missing set contains hoid.
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      // drop any stale entries first; they are re-added below if needed
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
	item = miter->second;
      } else {
	// not missing locally; look for it in the peers' missing sets
	for (auto &&i: to_recover) {
	  if (i == self)
	    continue;
	  auto pmiter = pmissing.find(i);
	  ceph_assert(pmiter != pmissing.end());
	  miter = pmiter->second.get_items().find(hoid);
	  if (miter != pmiter->second.get_items().end()) {
	    item = miter->second;
	    break;
	  }
	}
      }
      if (!item)
	return; // recovered!

      needs_recovery_map[hoid] = *item;
      // deletes need no source locations
      if (item->is_delete())
	return;
      auto mliter =
	missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      ceph_assert(info.last_backfill.is_max());
      ceph_assert(info.last_update >= item->need);
      // our own copy counts as a location unless we are missing it
      if (!missing.is_missing(hoid))
	mliter->second.insert(self);
      // a peer is a location if its log covers the needed version, its
      // backfill has passed hoid, and it is not itself missing hoid
      for (auto &&i: pmissing) {
	if (i.first == self)
	  continue;
	auto pinfoiter = pinfo.find(i.first);
	ceph_assert(pinfoiter != pinfo.end());
	if (item->need <= pinfoiter->second.last_update &&
	    hoid <= pinfoiter->second.last_backfill &&
	    !i.second.is_missing(hoid))
	  mliter->second.insert(i.first);
      }
      // account for the freshly built location set
      _inc_count(mliter->second);
    }
930 | ||
931 | const set<pg_shard_t> &get_locations(const hobject_t &hoid) const { | |
11fdf7f2 TL |
932 | auto it = missing_loc.find(hoid); |
933 | return it == missing_loc.end() ? empty_set : it->second; | |
7c673cae FG |
934 | } |
  /// All recorded missing-object locations (object -> source shards).
  const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
    return missing_loc;
  }
  /// All objects that still need recovery, with their version info.
  const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
    return needs_recovery_map;
  }
  /// Per-shard histogram keyed by location count (see _inc_count/_dec_count).
  const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
    return missing_by_count;
  }
7c673cae FG |
944 | } missing_loc; |
945 | ||
946 | PastIntervals past_intervals; | |
947 | ||
948 | interval_set<snapid_t> snap_trimq; | |
949 | ||
950 | /* You should not use these items without taking their respective queue locks | |
951 | * (if they have one) */ | |
952 | xlist<PG*>::item stat_queue_item; | |
953 | bool scrub_queued; | |
954 | bool recovery_queued; | |
955 | ||
956 | int recovery_ops_active; | |
957 | set<pg_shard_t> waiting_on_backfill; | |
958 | #ifdef DEBUG_RECOVERY_OIDS | |
11fdf7f2 | 959 | multiset<hobject_t> recovering_oids; |
7c673cae FG |
960 | #endif |
961 | ||
962 | protected: | |
963 | int role; // 0 = primary, 1 = replica, -1=none. | |
11fdf7f2 | 964 | uint64_t state; // PG_STATE_* |
7c673cae FG |
965 | |
966 | bool send_notify; ///< true if we are non-primary and should notify the primary | |
967 | ||
11fdf7f2 | 968 | protected: |
7c673cae FG |
969 | eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active() |
970 | eversion_t last_complete_ondisk; // last_complete that has committed. | |
971 | eversion_t last_update_applied; | |
972 | ||
11fdf7f2 | 973 | // entries <= last_rollback_info_trimmed_to_applied have been trimmed |
7c673cae FG |
974 | eversion_t last_rollback_info_trimmed_to_applied; |
975 | ||
976 | // primary state | |
11fdf7f2 | 977 | protected: |
7c673cae FG |
978 | pg_shard_t primary; |
979 | pg_shard_t pg_whoami; | |
980 | pg_shard_t up_primary; | |
981 | vector<int> up, acting, want_acting; | |
11fdf7f2 TL |
982 | // acting_recovery_backfill contains shards that are acting, |
983 | // async recovery targets, or backfill targets. | |
984 | set<pg_shard_t> acting_recovery_backfill, actingset, upset; | |
7c673cae FG |
985 | map<pg_shard_t,eversion_t> peer_last_complete_ondisk; |
986 | eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk | |
987 | eversion_t pg_trim_to; | |
988 | ||
989 | set<int> blocked_by; ///< osds we are blocked by (for pg stats) | |
990 | ||
11fdf7f2 | 991 | protected: |
7c673cae | 992 | // [primary only] content recovery state |
7c673cae FG |
  /// Buffers outgoing peering messages (queries, infos, notifies),
  /// keyed by target OSD id, until a RecoveryCtx flushes them
  /// (see RecoveryCtx::accept_buffered_messages).
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };
998 | ||
94b18763 FG |
999 | public: |
  /// true if this PG "does not exist" according to its info ("dne")
  bool dne() { return info.dne(); }
7c673cae FG |
1001 | struct RecoveryCtx { |
1002 | utime_t start_time; | |
1003 | map<int, map<spg_t, pg_query_t> > *query_map; | |
1004 | map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map; | |
1005 | map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list; | |
7c673cae FG |
1006 | ObjectStore::Transaction *transaction; |
1007 | ThreadPool::TPHandle* handle; | |
1008 | RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map, | |
1009 | map<int, | |
1010 | vector<pair<pg_notify_t, PastIntervals> > > *info_map, | |
1011 | map<int, | |
1012 | vector<pair<pg_notify_t, PastIntervals> > > *notify_list, | |
7c673cae FG |
1013 | ObjectStore::Transaction *transaction) |
1014 | : query_map(query_map), info_map(info_map), | |
1015 | notify_list(notify_list), | |
7c673cae FG |
1016 | transaction(transaction), |
1017 | handle(NULL) {} | |
1018 | ||
1019 | RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx) | |
1020 | : query_map(&(buf.query_map)), | |
1021 | info_map(&(buf.info_map)), | |
1022 | notify_list(&(buf.notify_list)), | |
7c673cae FG |
1023 | transaction(rctx.transaction), |
1024 | handle(rctx.handle) {} | |
1025 | ||
1026 | void accept_buffered_messages(BufferedRecoveryMessages &m) { | |
11fdf7f2 TL |
1027 | ceph_assert(query_map); |
1028 | ceph_assert(info_map); | |
1029 | ceph_assert(notify_list); | |
7c673cae FG |
1030 | for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin(); |
1031 | i != m.query_map.end(); | |
1032 | ++i) { | |
1033 | map<spg_t, pg_query_t> &omap = (*query_map)[i->first]; | |
1034 | for (map<spg_t, pg_query_t>::iterator j = i->second.begin(); | |
1035 | j != i->second.end(); | |
1036 | ++j) { | |
1037 | omap[j->first] = j->second; | |
1038 | } | |
1039 | } | |
1040 | for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i | |
1041 | = m.info_map.begin(); | |
1042 | i != m.info_map.end(); | |
1043 | ++i) { | |
1044 | vector<pair<pg_notify_t, PastIntervals> > &ovec = | |
1045 | (*info_map)[i->first]; | |
1046 | ovec.reserve(ovec.size() + i->second.size()); | |
1047 | ovec.insert(ovec.end(), i->second.begin(), i->second.end()); | |
1048 | } | |
1049 | for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i | |
1050 | = m.notify_list.begin(); | |
1051 | i != m.notify_list.end(); | |
1052 | ++i) { | |
1053 | vector<pair<pg_notify_t, PastIntervals> > &ovec = | |
1054 | (*notify_list)[i->first]; | |
1055 | ovec.reserve(ovec.size() + i->second.size()); | |
1056 | ovec.insert(ovec.end(), i->second.begin(), i->second.end()); | |
1057 | } | |
1058 | } | |
1adf2230 AA |
1059 | |
1060 | void send_notify(pg_shard_t to, | |
1061 | const pg_notify_t &info, const PastIntervals &pi) { | |
11fdf7f2 | 1062 | ceph_assert(notify_list); |
1adf2230 AA |
1063 | (*notify_list)[to.osd].push_back(make_pair(info, pi)); |
1064 | } | |
7c673cae | 1065 | }; |
11fdf7f2 | 1066 | protected: |
7c673cae FG |
1067 | |
1068 | PGStateHistory pgstate_history; | |
1069 | ||
  /// RAII state-tracking helper: records entry into a named state in
  /// pgstate_history on construction and exit on destruction, so scoped
  /// lifetime of a NamedState brackets the state exactly.
  struct NamedState {
    const char *state_name;
    utime_t enter_time;  // captured once at construction
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };
1081 | ||
1082 | ||
1083 | ||
1084 | protected: | |
1085 | ||
1086 | /* | |
1087 | * peer_info -- projected (updates _before_ replicas ack) | |
1088 | * peer_missing -- committed (updates _after_ replicas ack) | |
1089 | */ | |
1090 | ||
1091 | bool need_up_thru; | |
1092 | set<pg_shard_t> stray_set; // non-acting osds that have PG data. | |
7c673cae | 1093 | map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior) |
11fdf7f2 | 1094 | map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info |
7c673cae FG |
1095 | set<pg_shard_t> peer_purged; // peers purged |
1096 | map<pg_shard_t, pg_missing_t> peer_missing; | |
1097 | set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps) | |
1098 | set<pg_shard_t> peer_missing_requested; | |
1099 | ||
1100 | // i deleted these strays; ignore racing PGInfo from them | |
1101 | set<pg_shard_t> peer_activated; | |
1102 | ||
1103 | // primary-only, recovery-only state | |
1104 | set<pg_shard_t> might_have_unfound; // These osds might have objects on them | |
1105 | // which are unfound on the primary | |
1106 | epoch_t last_peering_reset; | |
1107 | ||
11fdf7f2 TL |
  /// Epoch of the most recent peering reset.
  epoch_t get_last_peering_reset() const {
    return last_peering_reset;
  }
7c673cae FG |
1111 | |
1112 | /* heartbeat peers */ | |
1113 | void set_probe_targets(const set<pg_shard_t> &probe_set); | |
1114 | void clear_probe_targets(); | |
11fdf7f2 | 1115 | |
7c673cae FG |
1116 | Mutex heartbeat_peer_lock; |
1117 | set<int> heartbeat_peers; | |
1118 | set<int> probe_targets; | |
1119 | ||
11fdf7f2 | 1120 | public: |
7c673cae FG |
1121 | /** |
1122 | * BackfillInterval | |
1123 | * | |
1124 | * Represents the objects in a range [begin, end) | |
1125 | * | |
1126 | * Possible states: | |
1127 | * 1) begin == end == hobject_t() indicates the the interval is unpopulated | |
1128 | * 2) Else, objects contains all objects in [begin, end) | |
1129 | */ | |
1130 | struct BackfillInterval { | |
1131 | // info about a backfill interval on a peer | |
1132 | eversion_t version; /// version at which the scan occurred | |
1133 | map<hobject_t,eversion_t> objects; | |
1134 | hobject_t begin; | |
1135 | hobject_t end; | |
1136 | ||
1137 | /// clear content | |
1138 | void clear() { | |
1139 | *this = BackfillInterval(); | |
1140 | } | |
1141 | ||
1142 | /// clear objects list only | |
1143 | void clear_objects() { | |
1144 | objects.clear(); | |
1145 | } | |
1146 | ||
1147 | /// reinstantiate with a new start+end position and sort order | |
1148 | void reset(hobject_t start) { | |
1149 | clear(); | |
1150 | begin = end = start; | |
1151 | } | |
1152 | ||
1153 | /// true if there are no objects in this interval | |
1154 | bool empty() const { | |
1155 | return objects.empty(); | |
1156 | } | |
1157 | ||
1158 | /// true if interval extends to the end of the range | |
1159 | bool extends_to_end() const { | |
1160 | return end.is_max(); | |
1161 | } | |
1162 | ||
1163 | /// removes items <= soid and adjusts begin to the first object | |
1164 | void trim_to(const hobject_t &soid) { | |
1165 | trim(); | |
1166 | while (!objects.empty() && | |
1167 | objects.begin()->first <= soid) { | |
1168 | pop_front(); | |
1169 | } | |
1170 | } | |
1171 | ||
1172 | /// Adjusts begin to the first object | |
1173 | void trim() { | |
1174 | if (!objects.empty()) | |
1175 | begin = objects.begin()->first; | |
1176 | else | |
1177 | begin = end; | |
1178 | } | |
1179 | ||
1180 | /// drop first entry, and adjust @begin accordingly | |
1181 | void pop_front() { | |
11fdf7f2 | 1182 | ceph_assert(!objects.empty()); |
7c673cae FG |
1183 | objects.erase(objects.begin()); |
1184 | trim(); | |
1185 | } | |
1186 | ||
1187 | /// dump | |
1188 | void dump(Formatter *f) const { | |
1189 | f->dump_stream("begin") << begin; | |
1190 | f->dump_stream("end") << end; | |
1191 | f->open_array_section("objects"); | |
1192 | for (map<hobject_t, eversion_t>::const_iterator i = | |
1193 | objects.begin(); | |
1194 | i != objects.end(); | |
1195 | ++i) { | |
1196 | f->open_object_section("object"); | |
1197 | f->dump_stream("object") << i->first; | |
1198 | f->dump_stream("version") << i->second; | |
1199 | f->close_section(); | |
1200 | } | |
1201 | f->close_section(); | |
1202 | } | |
1203 | }; | |
1204 | ||
1205 | protected: | |
1206 | BackfillInterval backfill_info; | |
1207 | map<pg_shard_t, BackfillInterval> peer_backfill_info; | |
1208 | bool backfill_reserved; | |
1209 | bool backfill_reserving; | |
1210 | ||
11fdf7f2 | 1211 | set<pg_shard_t> backfill_targets, async_recovery_targets; |
7c673cae | 1212 | |
11fdf7f2 TL |
1213 | // The primary's num_bytes and local num_bytes for this pg, only valid |
1214 | // during backfill for non-primary shards. | |
1215 | // Both of these are adjusted for EC to reflect the on-disk bytes | |
1216 | std::atomic<int64_t> primary_num_bytes = 0; | |
1217 | std::atomic<int64_t> local_num_bytes = 0; | |
7c673cae | 1218 | |
11fdf7f2 | 1219 | public: |
7c673cae FG |
1220 | bool is_backfill_targets(pg_shard_t osd) { |
1221 | return backfill_targets.count(osd); | |
1222 | } | |
1223 | ||
11fdf7f2 TL |
1224 | // Space reserved for backfill is primary_num_bytes - local_num_bytes |
1225 | // Don't care that difference itself isn't atomic | |
1226 | uint64_t get_reserved_num_bytes() { | |
1227 | int64_t primary = primary_num_bytes.load(); | |
1228 | int64_t local = local_num_bytes.load(); | |
1229 | if (primary > local) | |
1230 | return primary - local; | |
1231 | else | |
1232 | return 0; | |
1233 | } | |
1234 | ||
  /// true while the primary's reported num_bytes is set, i.e. this
  /// (non-primary) shard is currently a remote backfill target
  bool is_remote_backfilling() {
    return primary_num_bytes.load() > 0;
  }
1238 | ||
1239 | void set_reserved_num_bytes(int64_t primary, int64_t local); | |
1240 | void clear_reserved_num_bytes(); | |
1241 | ||
1242 | // If num_bytes are inconsistent and local_num- goes negative | |
1243 | // it's ok, because it would then be ignored. | |
1244 | ||
1245 | // The value of num_bytes could be negative, | |
1246 | // but we don't let local_num_bytes go negative. | |
  /// Atomically add num_bytes (may be negative) to local_num_bytes,
  /// clamping the result at zero, via a CAS retry loop.
  void add_local_num_bytes(int64_t num_bytes) {
    if (num_bytes) {
      int64_t prev_bytes = local_num_bytes.load();
      int64_t new_bytes;
      do {
	new_bytes = prev_bytes + num_bytes;
	if (new_bytes < 0)
	  new_bytes = 0;
	// on failure compare_exchange_weak reloads prev_bytes and we retry
      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
    }
  }
  /// Atomically subtract num_bytes (must be >= 0) from local_num_bytes,
  /// clamping the result at zero, via a CAS retry loop.
  void sub_local_num_bytes(int64_t num_bytes) {
    ceph_assert(num_bytes >= 0);
    if (num_bytes) {
      int64_t prev_bytes = local_num_bytes.load();
      int64_t new_bytes;
      do {
	new_bytes = prev_bytes - num_bytes;
	if (new_bytes < 0)
	  new_bytes = 0;
	// on failure compare_exchange_weak reloads prev_bytes and we retry
      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
    }
  }
1270 | // The value of num_bytes could be negative, | |
1271 | // but we don't let info.stats.stats.sum.num_bytes go negative. | |
1272 | void add_num_bytes(int64_t num_bytes) { | |
1273 | ceph_assert(_lock.is_locked_by_me()); | |
1274 | if (num_bytes) { | |
1275 | info.stats.stats.sum.num_bytes += num_bytes; | |
1276 | if (info.stats.stats.sum.num_bytes < 0) { | |
1277 | info.stats.stats.sum.num_bytes = 0; | |
1278 | } | |
1279 | } | |
1280 | } | |
1281 | void sub_num_bytes(int64_t num_bytes) { | |
1282 | ceph_assert(_lock.is_locked_by_me()); | |
1283 | ceph_assert(num_bytes >= 0); | |
1284 | if (num_bytes) { | |
1285 | info.stats.stats.sum.num_bytes -= num_bytes; | |
1286 | if (info.stats.stats.sum.num_bytes < 0) { | |
1287 | info.stats.stats.sum.num_bytes = 0; | |
1288 | } | |
1289 | } | |
1290 | } | |
1291 | ||
1292 | // Only used in testing so not worried about needing the PG lock here | |
1293 | int64_t get_stats_num_bytes() { | |
1294 | Mutex::Locker l(_lock); | |
1295 | int num_bytes = info.stats.stats.sum.num_bytes; | |
1296 | if (pool.info.is_erasure()) { | |
1297 | num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); | |
1298 | // Round up each object by a stripe | |
1299 | num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects; | |
1300 | } | |
1301 | int64_t lnb = local_num_bytes.load(); | |
1302 | if (lnb && lnb != num_bytes) { | |
1303 | lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch " | |
1304 | << lnb << " vs stats " | |
1305 | << info.stats.stats.sum.num_bytes << " / chunk " | |
1306 | << get_pgbackend()->get_ec_data_chunk_count() | |
1307 | << dendl; | |
1308 | } | |
1309 | return num_bytes; | |
1310 | } | |
1311 | ||
7c673cae FG |
1312 | protected: |
1313 | ||
1314 | /* | |
1315 | * blocked request wait hierarchy | |
1316 | * | |
1317 | * In order to preserve request ordering we need to be careful about the | |
1318 | * order in which blocked requests get requeued. Generally speaking, we | |
1319 | * push the requests back up to the op_wq in reverse order (most recent | |
1320 | * request first) so that they come back out again in the original order. | |
1321 | * However, because there are multiple wait queues, we need to requeue | |
1322 | * waitlists in order. Generally speaking, we requeue the wait lists | |
1323 | * that are checked first. | |
1324 | * | |
1325 | * Here are the various wait lists, in the order they are used during | |
1326 | * request processing, with notes: | |
1327 | * | |
1328 | * - waiting_for_map | |
1329 | * - may start or stop blocking at any time (depending on client epoch) | |
1330 | * - waiting_for_peered | |
1331 | * - !is_peered() or flushes_in_progress | |
1332 | * - only starts blocking on interval change; never restarts | |
1333 | * - waiting_for_active | |
1334 | * - !is_active() | |
1335 | * - only starts blocking on interval change; never restarts | |
b32b8144 FG |
1336 | * - waiting_for_flush |
1337 | * - is_active() and flushes_in_progress | |
1338 | * - waiting for final flush during activate | |
7c673cae FG |
1339 | * - waiting_for_scrub |
1340 | * - starts and stops blocking for varying intervals during scrub | |
1341 | * - waiting_for_unreadable_object | |
1342 | * - never restarts once object is readable (* except for EIO?) | |
1343 | * - waiting_for_degraded_object | |
1344 | * - never restarts once object is writeable (* except for EIO?) | |
1345 | * - waiting_for_blocked_object | |
1346 | * - starts and stops based on proxied op activity | |
1347 | * - obc rwlocks | |
1348 | * - starts and stops based on read/write activity | |
1349 | * | |
1350 | * Notes: | |
1351 | * | |
1352 | * 1. During and interval change, we requeue *everything* in the above order. | |
1353 | * | |
1354 | * 2. When an obc rwlock is released, we check for a scrub block and requeue | |
1355 | * the op there if it applies. We ignore the unreadable/degraded/blocked | |
1356 | * queues because we assume they cannot apply at that time (this is | |
1357 | * probably mostly true). | |
1358 | * | |
1359 | * 3. The requeue_ops helper will push ops onto the waiting_for_map list if | |
1360 | * it is non-empty. | |
1361 | * | |
1362 | * These three behaviors are generally sufficient to maintain ordering, with | |
1363 | * the possible exception of cases where we make an object degraded or | |
1364 | * unreadable that was previously okay, e.g. when scrub or op processing | |
1365 | * encounter an unexpected error. FIXME. | |
1366 | */ | |
1367 | ||
1368 | // pg waiters | |
1369 | unsigned flushes_in_progress; | |
1370 | ||
1371 | // ops with newer maps than our (or blocked behind them) | |
1372 | // track these by client, since inter-request ordering doesn't otherwise | |
1373 | // matter. | |
1374 | unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map; | |
1375 | ||
1376 | // ops waiting on peered | |
1377 | list<OpRequestRef> waiting_for_peered; | |
1378 | ||
1379 | // ops waiting on active (require peered as well) | |
1380 | list<OpRequestRef> waiting_for_active; | |
b32b8144 | 1381 | list<OpRequestRef> waiting_for_flush; |
7c673cae FG |
1382 | list<OpRequestRef> waiting_for_scrub; |
1383 | ||
1384 | list<OpRequestRef> waiting_for_cache_not_full; | |
224ce89b | 1385 | list<OpRequestRef> waiting_for_clean_to_primary_repair; |
7c673cae FG |
1386 | map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object, |
1387 | waiting_for_degraded_object, | |
1388 | waiting_for_blocked_object; | |
1389 | ||
1390 | set<hobject_t> objects_blocked_on_cache_full; | |
1391 | map<hobject_t,snapid_t> objects_blocked_on_degraded_snap; | |
1392 | map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion; | |
1393 | ||
1394 | // Callbacks should assume pg (and nothing else) is locked | |
1395 | map<hobject_t, list<Context*>> callbacks_for_degraded_object; | |
1396 | ||
1397 | map<eversion_t, | |
11fdf7f2 | 1398 | list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk; |
7c673cae FG |
1399 | |
1400 | void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m); | |
1401 | void requeue_op(OpRequestRef op); | |
1402 | void requeue_ops(list<OpRequestRef> &l); | |
1403 | ||
1404 | // stats that persist lazily | |
1405 | object_stat_collection_t unstable_stats; | |
1406 | ||
1407 | // publish stats | |
1408 | Mutex pg_stats_publish_lock; | |
1409 | bool pg_stats_publish_valid; | |
1410 | pg_stat_t pg_stats_publish; | |
1411 | ||
7c673cae FG |
1412 | void _update_calc_stats(); |
1413 | void _update_blocked_by(); | |
a8e16298 | 1414 | friend class TestOpsSocketHook; |
7c673cae FG |
1415 | void publish_stats_to_osd(); |
1416 | void clear_publish_stats(); | |
1417 | ||
7c673cae FG |
1418 | void clear_primary_state(); |
1419 | ||
11fdf7f2 TL |
  /// true if osd is in acting_recovery_backfill (acting shards, async
  /// recovery targets, or backfill targets)
  bool is_acting_recovery_backfill(pg_shard_t osd) const {
    return acting_recovery_backfill.count(osd);
  }
  /// true if osd is in the acting set (shard-aware for EC pools)
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.is_erasure(), acting, osd);
  }
  /// true if osd is in the up set (shard-aware for EC pools)
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.is_erasure(), up, osd);
  }
  /// EC: osd must occupy its own shard slot; replicated: any position.
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }
1436 | ||
1437 | bool needs_recovery() const; | |
1438 | bool needs_backfill() const; | |
1439 | ||
c07f9fc5 | 1440 | /// clip calculated priority to reasonable range |
81eedcae | 1441 | int clamp_recovery_priority(int prio, int pool_recovery_prio, int max); |
7c673cae FG |
1442 | /// get log recovery reservation priority |
1443 | unsigned get_recovery_priority(); | |
1444 | /// get backfill reservation priority | |
1445 | unsigned get_backfill_priority(); | |
11fdf7f2 TL |
1446 | /// get priority for pg deletion |
1447 | unsigned get_delete_priority(); | |
7c673cae | 1448 | |
11fdf7f2 | 1449 | void try_mark_clean(); ///< mark an active pg clean |
7c673cae FG |
1450 | |
  /// return [start,end) bounds for required past_intervals
  ///
  /// start = the later of last_epoch_clean (falling back to
  ///         epoch_pool_created if the PG was never clean) and
  ///         oldest_map (we cannot look further back than that).
  /// end   = the later of same_interval_since and epoch_pool_created.
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = std::max(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
      info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = std::max(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
1464 | void check_past_interval_bounds() const; | |
1465 | PastIntervals::PriorSet build_prior(); | |
1466 | ||
1467 | void remove_down_peer_info(const OSDMapRef osdmap); | |
1468 | ||
1469 | bool adjust_need_up_thru(const OSDMapRef osdmap); | |
1470 | ||
1471 | bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const; | |
1472 | virtual void dump_recovery_info(Formatter *f) const = 0; | |
1473 | ||
11fdf7f2 | 1474 | void calc_min_last_complete_ondisk() { |
7c673cae | 1475 | eversion_t min = last_complete_ondisk; |
11fdf7f2 TL |
1476 | ceph_assert(!acting_recovery_backfill.empty()); |
1477 | for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin(); | |
1478 | i != acting_recovery_backfill.end(); | |
7c673cae FG |
1479 | ++i) { |
1480 | if (*i == get_primary()) continue; | |
1481 | if (peer_last_complete_ondisk.count(*i) == 0) | |
11fdf7f2 | 1482 | return; // we don't have complete info |
7c673cae FG |
1483 | eversion_t a = peer_last_complete_ondisk[*i]; |
1484 | if (a < min) | |
1485 | min = a; | |
1486 | } | |
1487 | if (min == min_last_complete_ondisk) | |
11fdf7f2 | 1488 | return; |
7c673cae | 1489 | min_last_complete_ondisk = min; |
11fdf7f2 | 1490 | return; |
7c673cae FG |
1491 | } |
1492 | ||
1493 | virtual void calc_trim_to() = 0; | |
1494 | ||
f64942e4 AA |
1495 | virtual void calc_trim_to_aggressive() = 0; |
1496 | ||
7c673cae FG |
1497 | void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog, |
1498 | pg_missing_t& omissing, pg_shard_t from); | |
1499 | void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, | |
1500 | pg_missing_t& omissing, pg_shard_t from); | |
1501 | bool proc_replica_info( | |
1502 | pg_shard_t from, const pg_info_t &info, epoch_t send_epoch); | |
1503 | ||
  /// PGLog::LogEntryHandler implementation that forwards each log-entry
  /// action (remove / stash / rollback / rollforward / trim) to the PG
  /// backend, accumulating the resulting store changes in transaction t.
  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      ceph_assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };
1527 | ||
1528 | void update_object_snap_mapping( | |
1529 | ObjectStore::Transaction *t, const hobject_t &soid, | |
1530 | const set<snapid_t> &snaps); | |
1531 | void clear_object_snap_mapping( | |
1532 | ObjectStore::Transaction *t, const hobject_t &soid); | |
1533 | void remove_snap_mapped_object( | |
1534 | ObjectStore::Transaction& t, const hobject_t& soid); | |
1535 | void merge_log( | |
1536 | ObjectStore::Transaction& t, pg_info_t &oinfo, | |
1537 | pg_log_t &olog, pg_shard_t from); | |
1538 | void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead); | |
1539 | bool search_for_missing( | |
1540 | const pg_info_t &oinfo, const pg_missing_t &omissing, | |
1541 | pg_shard_t fromosd, | |
1542 | RecoveryCtx*); | |
1543 | ||
7c673cae FG |
1544 | void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map); |
1545 | ||
7c673cae FG |
1546 | map<pg_shard_t, pg_info_t>::const_iterator find_best_info( |
1547 | const map<pg_shard_t, pg_info_t> &infos, | |
1548 | bool restrict_to_up_acting, | |
1549 | bool *history_les_bound) const; | |
1550 | static void calc_ec_acting( | |
1551 | map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, | |
1552 | unsigned size, | |
1553 | const vector<int> &acting, | |
7c673cae | 1554 | const vector<int> &up, |
7c673cae | 1555 | const map<pg_shard_t, pg_info_t> &all_info, |
7c673cae FG |
1556 | bool restrict_to_up_acting, |
1557 | vector<int> *want, | |
1558 | set<pg_shard_t> *backfill, | |
1559 | set<pg_shard_t> *acting_backfill, | |
7c673cae FG |
1560 | ostream &ss); |
1561 | static void calc_replicated_acting( | |
1562 | map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, | |
11fdf7f2 | 1563 | uint64_t force_auth_primary_missing_objects, |
7c673cae FG |
1564 | unsigned size, |
1565 | const vector<int> &acting, | |
7c673cae FG |
1566 | const vector<int> &up, |
1567 | pg_shard_t up_primary, | |
1568 | const map<pg_shard_t, pg_info_t> &all_info, | |
7c673cae FG |
1569 | bool restrict_to_up_acting, |
1570 | vector<int> *want, | |
1571 | set<pg_shard_t> *backfill, | |
1572 | set<pg_shard_t> *acting_backfill, | |
11fdf7f2 | 1573 | const OSDMapRef osdmap, |
7c673cae | 1574 | ostream &ss); |
11fdf7f2 TL |
1575 | void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info, |
1576 | const pg_info_t &auth_info, | |
1577 | vector<int> *want, | |
81eedcae TL |
1578 | set<pg_shard_t> *async_recovery, |
1579 | const OSDMapRef osdmap) const; | |
11fdf7f2 TL |
1580 | void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info, |
1581 | const pg_info_t &auth_info, | |
1582 | vector<int> *want, | |
81eedcae TL |
1583 | set<pg_shard_t> *async_recovery, |
1584 | const OSDMapRef osdmap) const; | |
11fdf7f2 TL |
1585 | |
1586 | bool recoverable_and_ge_min_size(const vector<int> &want) const; | |
7c673cae FG |
1587 | bool choose_acting(pg_shard_t &auth_log_shard, |
1588 | bool restrict_to_up_acting, | |
1589 | bool *history_les_bound); | |
1590 | void build_might_have_unfound(); | |
1591 | void activate( | |
1592 | ObjectStore::Transaction& t, | |
1593 | epoch_t activation_epoch, | |
7c673cae FG |
1594 | map<int, map<spg_t,pg_query_t> >& query_map, |
1595 | map<int, | |
1596 | vector<pair<pg_notify_t, PastIntervals> > > *activator_map, | |
1597 | RecoveryCtx *ctx); | |
11fdf7f2 TL |
1598 | |
  /// Completion fired when the activation transaction commits; calls
  /// back into _activate_committed with the epochs captured at queue time.
  struct C_PG_ActivateCommitted : public Context {
    PGRef pg;  // ref keeps the PG alive until the callback runs
    epoch_t epoch;
    epoch_t activation_epoch;
    C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
      : pg(p), epoch(e), activation_epoch(ae) {}
    void finish(int r) override {
      pg->_activate_committed(epoch, activation_epoch);
    }
  };
7c673cae FG |
1609 | void _activate_committed(epoch_t epoch, epoch_t activation_epoch); |
1610 | void all_activated_and_committed(); | |
1611 | ||
1612 | void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info); | |
1613 | ||
  /// true if any missing object currently has no known source location
  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  /// number of missing objects with no known source location
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }
81eedcae TL |
  /// true if we have missing objects and every one of them is unfound
  /// (false when there is nothing missing at all)
  bool all_missing_unfound() const {
    const auto& missing = pg_log.get_missing();
    if (!missing.have_missing())
      return false;
    for (auto& m : missing.get_items()) {
      if (!missing_loc.is_unfound(m.first))
	return false;
    }
    return true;
  }
7c673cae FG |
1630 | |
1631 | virtual void check_local() = 0; | |
1632 | ||
7c673cae FG |
1633 | void purge_strays(); |
1634 | ||
1635 | void update_heartbeat_peers(); | |
1636 | ||
1637 | Context *finish_sync_event; | |
1638 | ||
11fdf7f2 | 1639 | Context *finish_recovery(); |
7c673cae | 1640 | void _finish_recovery(Context *c); |
11fdf7f2 TL |
  /// Completion queued by finish_recovery(); passes itself to
  /// _finish_recovery (presumably so the PG can match it against
  /// finish_sync_event -- confirm against PG.cc)
  struct C_PG_FinishRecovery : public Context {
    PGRef pg;  // ref keeps the PG alive until the callback runs
    explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
    void finish(int r) override {
      pg->_finish_recovery(this);
    }
  };
7c673cae FG |
  void cancel_recovery();
  void clear_recovery_state();
  // Backend-specific portion of clear_recovery_state().
  virtual void _clear_recovery_state() = 0;
  // Re-evaluate recovery sources against a newly published map.
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  // Bracket an in-flight recovery op on soid.
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  // Split hook: move this PG's share of state into 'child'.
  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;
  friend class C_DeleteMore;
7c673cae FG |
1659 | |
  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;  // active backoffs by object

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  /// release backoffs covering exactly the single object o
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();
1670 | ||
1671 | void add_pg_backoff(SessionRef s) { | |
1672 | hobject_t begin = info.pgid.pgid.get_hobj_start(); | |
1673 | hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); | |
1674 | add_backoff(s, begin, end); | |
1675 | } | |
1676 | void release_pg_backoffs() { | |
1677 | hobject_t begin = info.pgid.pgid.get_hobj_start(); | |
1678 | hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); | |
1679 | release_backoffs(begin, end); | |
1680 | } | |
1681 | ||
7c673cae | 1682 | // -- scrub -- |
11fdf7f2 | 1683 | public: |
7c673cae FG |
  /**
   * All mutable state for scrubbing this PG — both the primary-driven
   * ("chunky") scrub and the replica side of one.  Cleared between scrubs
   * by reset(); the reservation fields are managed separately by the
   * reserve/unreserve paths.
   */
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata (reservation phase)
    set<pg_shard_t> reserved_peers;  // replicas that granted a reservation
    bool reserved, reserve_failed;   // we hold a reservation / a peer rejected
    epoch_t epoch_start;             // epoch the current scrub started in

    // common to both scrubs
    bool active;                     // a scrub is in progress
    set<pg_shard_t> waiting_on_whom; // shards we still expect responses from
    int shallow_errors;
    int deep_errors;
    int fixed;                       // number of errors repaired
    ScrubMap primary_scrubmap;
    ScrubMapBuilder primary_scrubmap_pos;
    epoch_t replica_scrub_start = 0;
    ScrubMap replica_scrubmap;
    ScrubMapBuilder replica_scrubmap_pos;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;   // replica scrub request being serviced
    utime_t scrub_reg_stamp;  // stamp we registered for

    // special stamp value (0,1) associated with operator-forced scrubs
    static utime_t scrub_must_stamp() { return utime_t(0,1); }

    omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };

    // For async sleep between scrub chunks
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair, need_auto;

    // Priority to use for scrub scheduling
    unsigned priority = 0;

    bool time_for_deep;  // scheduled scrub should be a deep one
    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;
    // this flag indicates that we are scrubbing post repair to verify everything is fixed
    bool check_repair;
    // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
    // we should deep scrub in order to auto repair
    bool deep_scrub_on_error;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

    /**
     * Move the completed portion of cleaned_meta_map into for_meta_scrub.
     *
     * If this was the final chunk (end.is_max()) or nothing is buffered,
     * the whole map is swapped over.  Otherwise the trailing run of
     * entries sharing the last entry's head is held back — unless that
     * last entry already has its snapset object — presumably so a clone
     * group that may continue into the next chunk is handled as a unit
     * (TODO confirm against the snapshot-metadata scrub).
     */
    void clean_meta_map(ScrubMap &for_meta_scrub) {
      if (end.is_max() ||
          cleaned_meta_map.objects.empty()) {
        cleaned_meta_map.swap(for_meta_scrub);
      } else {
        auto iter = cleaned_meta_map.objects.end();
        --iter; // not empty, see if clause
        auto begin = cleaned_meta_map.objects.begin();
        if (iter->first.has_snapset()) {
          ++iter;
        } else {
          // walk back to the first entry of the final head group,
          // then exclude that group from the handoff
          while (iter != begin) {
            auto next = iter--;
            if (next->first.get_head() != iter->first.get_head()) {
              ++iter;
              break;
            }
          }
        }
        for_meta_scrub.objects.insert(begin, iter);
        cleaned_meta_map.objects.erase(begin, iter);
      }
    }

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;    // [start,end)
    hobject_t max_end;       // Largest end that may have been sent to replicas
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      BUILD_MAP_DONE,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
      BUILD_MAP_REPLICA,
    } state;

    std::unique_ptr<Scrub::Store> store;  // persistent scrub error store
    // deep scrub
    bool deep;
    int preempt_left;     // preemptions still allowed (see write_blocked_by_scrub)
    int preempt_divisor;

    // callbacks to run when the scrub state is reset
    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    // complete(0) every queued callback; swap first so registrations made
    // during completion are not run in this pass
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    /// human-readable name for a chunky-scrub state (NULL if unknown)
    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch( state )
      {
        case INACTIVE: ret = "INACTIVE"; break;
        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
        case BUILD_MAP: ret = "BUILD_MAP"; break;
        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
        case FINISH: ret = "FINISH"; break;
        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // clear all state (also fires queued callbacks via run_callbacks())
    void reset() {
      active = false;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      need_auto = false;
      time_for_deep = false;
      auto_repair = false;
      check_repair = false;
      deep_scrub_on_error = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      max_end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      fixed = 0;
      omap_stats = (const struct omap_stat_t){ 0 };
      deep = false;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      primary_scrubmap = ScrubMap();
      primary_scrubmap_pos.reset();
      replica_scrubmap = ScrubMap();
      replica_scrubmap_pos.reset();
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;
1876 | ||
protected:
  // request a scrub once recovery completes (name-based; confirm in PG.cc)
  bool scrub_after_recovery;

  // count of in-flight pushes (see users for exact semantics)
  int active_pushes;

  bool scrub_can_preempt = false;
  bool scrub_preempted = false;

  // we allow some number of preemptions of the scrub, which mean we do
  // not block. then we start to block. once we start blocking, we do
  // not stop until the scrub range is completed.
  bool write_blocked_by_scrub(const hobject_t &soid);

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1892 | ||
7c673cae FG |
  // Repair one object using ok_peers as the authoritative copies;
  // bad_peer presumably holds the bad copy (confirm against PG.cc).
  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  // Main chunk-at-a-time scrub driver (see Scrubber::State).
  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state(bool keep_repair = false);
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
  // Ask a replica for its scrub map over [start,end).
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
  int build_scrub_map_chunk(
    ScrubMap &map,
    ScrubMapBuilder &pos,
    hobject_t start, hobject_t end, bool deep,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  // Subclass hook; default implementation ignores the digest info.
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }

  // scrub reservation bookkeeping (primary side)
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;

  // replica-side scrub entry points
  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);

  // scrub reservation message handlers
  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  // re-queue backfill/recovery after 'retry' seconds
  void schedule_backfill_retry(float retry);
  void schedule_recovery_retry(float retry);
7c673cae FG |
1949 | |
  // -- recovery state --

  /// Generic completion that, when fired, queues peering event EVT
  /// against the PG while holding its lock.  The same captured epoch is
  /// passed for both epoch arguments of PGPeeringEvent.
  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;       // keeps the PG alive until the event is delivered
    epoch_t epoch;  // epoch recorded when the callback was created
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PGPeeringEventRef(
                                new PGPeeringEvent(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };
1969 | ||
7c673cae FG |
1970 | |
  /// Statechart event carrying a Formatter for states to write their
  /// status into (see the QueryState reacts in the states below).
  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;  // not owned
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };
1978 | ||
11fdf7f2 TL |
public:
  // Hook for this PG to adjust an osd_stat_t sample; semantics defined
  // out of line (see PG.cc).
  int pg_stat_adjust(osd_stat_t *new_stat);
protected:
7c673cae FG |
1982 | |
  /// Peering event: advance the PG from 'lastmap' to 'osdmap'; carries the
  /// recomputed up/acting sets and their primaries.
  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;   // map being advanced to
    OSDMapRef lastmap;  // map being advanced from
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };
2001 | ||
  /// Payload-free peering event; see the ActMap reacts in the states below.
  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  /// Peering event that drives the Peering -> Active transition (see the
  /// Peering state's reactions); records the epoch of activation.
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
public:
  /// Event presumably raised when backfill encounters unfound objects
  /// (name-based; handlers below discard it in Active).
  struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
    explicit UnfoundBackfill() {}
    void print(std::ostream *out) const {
      *out << "UnfoundBackfill";
    }
  };
  /// Event presumably raised when recovery encounters unfound objects
  /// (name-based; handlers below discard it in Active).
  struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
    explicit UnfoundRecovery() {}
    void print(std::ostream *out) const {
      *out << "UnfoundRecovery";
    }
  };
11fdf7f2 TL |
2029 | |
2030 | struct RequestScrub : boost::statechart::event<RequestScrub> { | |
2031 | bool deep; | |
2032 | bool repair; | |
2033 | explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {} | |
2034 | void print(std::ostream *out) const { | |
2035 | *out << "RequestScrub(" << (deep ? "deep" : "shallow") | |
2036 | << (repair ? " repair" : ""); | |
2037 | } | |
2038 | }; | |
2039 | ||
protected:
  // Payload-free peering/recovery state machine events, declared via the
  // TrivialEvent macro.  Access specifiers are interleaved so that some
  // events (e.g. RequestBackfill, the Delete* and *Force* events) are
  // visible to code outside the PG class.
  TrivialEvent(Initialize)
  TrivialEvent(GotInfo)
  TrivialEvent(NeedUpThru)
  TrivialEvent(Backfilled)
  TrivialEvent(LocalBackfillReserved)
  TrivialEvent(RejectRemoteReservation)
public:
  TrivialEvent(RequestBackfill)
protected:
  TrivialEvent(RemoteRecoveryPreempted)
  TrivialEvent(RemoteBackfillPreempted)
  TrivialEvent(BackfillTooFull)
  TrivialEvent(RecoveryTooFull)

  TrivialEvent(MakePrimary)
  TrivialEvent(MakeStray)
  TrivialEvent(NeedActingChange)
  TrivialEvent(IsIncomplete)
  TrivialEvent(IsDown)

  TrivialEvent(AllReplicasRecovered)
  TrivialEvent(DoRecovery)
  TrivialEvent(LocalRecoveryReserved)
public:
protected:
  TrivialEvent(AllRemotesReserved)
  TrivialEvent(AllBackfillsReserved)
  TrivialEvent(GoClean)

  TrivialEvent(AllReplicasActivated)

  TrivialEvent(IntervalFlush)

public:
  TrivialEvent(DeleteStart)
  TrivialEvent(DeleteSome)

  TrivialEvent(SetForceRecovery)
  TrivialEvent(UnsetForceRecovery)
  TrivialEvent(SetForceBackfill)
  TrivialEvent(UnsetForceBackfill)

protected:
  TrivialEvent(DeleteReserved)
  TrivialEvent(DeleteInterrupted)
2086 | ||
7c673cae FG |
2087 | /* Encapsulates PG recovery process */ |
2088 | class RecoveryState { | |
2089 | void start_handle(RecoveryCtx *new_ctx); | |
2090 | void end_handle(); | |
2091 | public: | |
2092 | void begin_block_outgoing(); | |
2093 | void end_block_outgoing(); | |
2094 | void clear_blocked_outgoing(); | |
2095 | private: | |
2096 | ||
2097 | /* States */ | |
2098 | struct Initial; | |
2099 | class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > { | |
2100 | RecoveryState *state; | |
2101 | public: | |
2102 | PG *pg; | |
2103 | ||
2104 | utime_t event_time; | |
2105 | uint64_t event_count; | |
2106 | ||
2107 | void clear_event_counters() { | |
2108 | event_time = utime_t(); | |
2109 | event_count = 0; | |
2110 | } | |
2111 | ||
2112 | void log_enter(const char *state_name); | |
2113 | void log_exit(const char *state_name, utime_t duration); | |
2114 | ||
2115 | RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {} | |
2116 | ||
2117 | /* Accessor functions for state methods */ | |
2118 | ObjectStore::Transaction* get_cur_transaction() { | |
11fdf7f2 TL |
2119 | ceph_assert(state->rctx); |
2120 | ceph_assert(state->rctx->transaction); | |
7c673cae FG |
2121 | return state->rctx->transaction; |
2122 | } | |
2123 | ||
2124 | void send_query(pg_shard_t to, const pg_query_t &query) { | |
11fdf7f2 TL |
2125 | ceph_assert(state->rctx); |
2126 | ceph_assert(state->rctx->query_map); | |
7c673cae FG |
2127 | (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] = |
2128 | query; | |
2129 | } | |
2130 | ||
2131 | map<int, map<spg_t, pg_query_t> > *get_query_map() { | |
11fdf7f2 TL |
2132 | ceph_assert(state->rctx); |
2133 | ceph_assert(state->rctx->query_map); | |
7c673cae FG |
2134 | return state->rctx->query_map; |
2135 | } | |
2136 | ||
2137 | map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() { | |
11fdf7f2 TL |
2138 | ceph_assert(state->rctx); |
2139 | ceph_assert(state->rctx->info_map); | |
7c673cae FG |
2140 | return state->rctx->info_map; |
2141 | } | |
2142 | ||
7c673cae FG |
2143 | RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); } |
2144 | ||
2145 | void send_notify(pg_shard_t to, | |
2146 | const pg_notify_t &info, const PastIntervals &pi) { | |
11fdf7f2 | 2147 | ceph_assert(state->rctx); |
1adf2230 | 2148 | state->rctx->send_notify(to, info, pi); |
7c673cae FG |
2149 | } |
2150 | }; | |
2151 | friend class RecoveryMachine; | |
2152 | ||
2153 | /* States */ | |
b32b8144 FG |
2154 | // Initial |
2155 | // Reset | |
2156 | // Start | |
2157 | // Started | |
2158 | // Primary | |
2159 | // WaitActingChange | |
2160 | // Peering | |
2161 | // GetInfo | |
2162 | // GetLog | |
2163 | // GetMissing | |
2164 | // WaitUpThru | |
2165 | // Incomplete | |
2166 | // Active | |
2167 | // Activating | |
2168 | // Clean | |
2169 | // Recovered | |
2170 | // Backfilling | |
2171 | // WaitRemoteBackfillReserved | |
2172 | // WaitLocalBackfillReserved | |
2173 | // NotBackfilling | |
2174 | // NotRecovering | |
2175 | // Recovering | |
2176 | // WaitRemoteRecoveryReserved | |
2177 | // WaitLocalRecoveryReserved | |
2178 | // ReplicaActive | |
2179 | // RepNotRecovering | |
2180 | // RepRecovering | |
2181 | // RepWaitBackfillReserved | |
2182 | // RepWaitRecoveryReserved | |
2183 | // Stray | |
11fdf7f2 TL |
2184 | // ToDelete |
2185 | // WaitDeleteReserved | |
2186 | // Deleting | |
2187 | // Crashed | |
7c673cae FG |
2188 | |
2189 | struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState { | |
2190 | explicit Crashed(my_context ctx); | |
2191 | }; | |
2192 | ||
2193 | struct Reset; | |
2194 | ||
2195 | struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState { | |
2196 | explicit Initial(my_context ctx); | |
2197 | void exit(); | |
2198 | ||
2199 | typedef boost::mpl::list < | |
2200 | boost::statechart::transition< Initialize, Reset >, | |
7c673cae FG |
2201 | boost::statechart::custom_reaction< NullEvt >, |
2202 | boost::statechart::transition< boost::statechart::event_base, Crashed > | |
2203 | > reactions; | |
2204 | ||
7c673cae FG |
2205 | boost::statechart::result react(const MNotifyRec&); |
2206 | boost::statechart::result react(const MInfoRec&); | |
2207 | boost::statechart::result react(const MLogRec&); | |
2208 | boost::statechart::result react(const boost::statechart::event_base&) { | |
2209 | return discard_event(); | |
2210 | } | |
2211 | }; | |
2212 | ||
2213 | struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState { | |
2214 | explicit Reset(my_context ctx); | |
2215 | void exit(); | |
2216 | ||
2217 | typedef boost::mpl::list < | |
2218 | boost::statechart::custom_reaction< QueryState >, | |
2219 | boost::statechart::custom_reaction< AdvMap >, | |
2220 | boost::statechart::custom_reaction< ActMap >, | |
2221 | boost::statechart::custom_reaction< NullEvt >, | |
7c673cae FG |
2222 | boost::statechart::custom_reaction< IntervalFlush >, |
2223 | boost::statechart::transition< boost::statechart::event_base, Crashed > | |
2224 | > reactions; | |
2225 | boost::statechart::result react(const QueryState& q); | |
2226 | boost::statechart::result react(const AdvMap&); | |
2227 | boost::statechart::result react(const ActMap&); | |
7c673cae FG |
2228 | boost::statechart::result react(const IntervalFlush&); |
2229 | boost::statechart::result react(const boost::statechart::event_base&) { | |
2230 | return discard_event(); | |
2231 | } | |
2232 | }; | |
2233 | ||
2234 | struct Start; | |
2235 | ||
2236 | struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState { | |
2237 | explicit Started(my_context ctx); | |
2238 | void exit(); | |
2239 | ||
2240 | typedef boost::mpl::list < | |
2241 | boost::statechart::custom_reaction< QueryState >, | |
2242 | boost::statechart::custom_reaction< AdvMap >, | |
7c673cae | 2243 | boost::statechart::custom_reaction< IntervalFlush >, |
11fdf7f2 TL |
2244 | // ignored |
2245 | boost::statechart::custom_reaction< NullEvt >, | |
2246 | boost::statechart::custom_reaction<SetForceRecovery>, | |
2247 | boost::statechart::custom_reaction<UnsetForceRecovery>, | |
2248 | boost::statechart::custom_reaction<SetForceBackfill>, | |
2249 | boost::statechart::custom_reaction<UnsetForceBackfill>, | |
2250 | boost::statechart::custom_reaction<RequestScrub>, | |
2251 | // crash | |
7c673cae FG |
2252 | boost::statechart::transition< boost::statechart::event_base, Crashed > |
2253 | > reactions; | |
2254 | boost::statechart::result react(const QueryState& q); | |
2255 | boost::statechart::result react(const AdvMap&); | |
7c673cae FG |
2256 | boost::statechart::result react(const IntervalFlush&); |
2257 | boost::statechart::result react(const boost::statechart::event_base&) { | |
2258 | return discard_event(); | |
2259 | } | |
2260 | }; | |
2261 | ||
7c673cae FG |
2262 | struct Primary; |
2263 | struct Stray; | |
2264 | ||
2265 | struct Start : boost::statechart::state< Start, Started >, NamedState { | |
2266 | explicit Start(my_context ctx); | |
2267 | void exit(); | |
2268 | ||
2269 | typedef boost::mpl::list < | |
2270 | boost::statechart::transition< MakePrimary, Primary >, | |
2271 | boost::statechart::transition< MakeStray, Stray > | |
2272 | > reactions; | |
2273 | }; | |
2274 | ||
2275 | struct Peering; | |
2276 | struct WaitActingChange; | |
7c673cae | 2277 | struct Incomplete; |
7c673cae | 2278 | struct Down; |
7c673cae FG |
2279 | |
2280 | struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState { | |
2281 | explicit Primary(my_context ctx); | |
2282 | void exit(); | |
2283 | ||
2284 | typedef boost::mpl::list < | |
2285 | boost::statechart::custom_reaction< ActMap >, | |
2286 | boost::statechart::custom_reaction< MNotifyRec >, | |
11fdf7f2 TL |
2287 | boost::statechart::transition< NeedActingChange, WaitActingChange >, |
2288 | boost::statechart::custom_reaction<SetForceRecovery>, | |
2289 | boost::statechart::custom_reaction<UnsetForceRecovery>, | |
2290 | boost::statechart::custom_reaction<SetForceBackfill>, | |
2291 | boost::statechart::custom_reaction<UnsetForceBackfill>, | |
2292 | boost::statechart::custom_reaction<RequestScrub> | |
7c673cae FG |
2293 | > reactions; |
2294 | boost::statechart::result react(const ActMap&); | |
2295 | boost::statechart::result react(const MNotifyRec&); | |
11fdf7f2 TL |
2296 | boost::statechart::result react(const SetForceRecovery&); |
2297 | boost::statechart::result react(const UnsetForceRecovery&); | |
2298 | boost::statechart::result react(const SetForceBackfill&); | |
2299 | boost::statechart::result react(const UnsetForceBackfill&); | |
2300 | boost::statechart::result react(const RequestScrub&); | |
7c673cae FG |
2301 | }; |
2302 | ||
2303 | struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>, | |
2304 | NamedState { | |
2305 | typedef boost::mpl::list < | |
2306 | boost::statechart::custom_reaction< QueryState >, | |
2307 | boost::statechart::custom_reaction< AdvMap >, | |
2308 | boost::statechart::custom_reaction< MLogRec >, | |
2309 | boost::statechart::custom_reaction< MInfoRec >, | |
2310 | boost::statechart::custom_reaction< MNotifyRec > | |
2311 | > reactions; | |
2312 | explicit WaitActingChange(my_context ctx); | |
2313 | boost::statechart::result react(const QueryState& q); | |
2314 | boost::statechart::result react(const AdvMap&); | |
2315 | boost::statechart::result react(const MLogRec&); | |
2316 | boost::statechart::result react(const MInfoRec&); | |
2317 | boost::statechart::result react(const MNotifyRec&); | |
2318 | void exit(); | |
2319 | }; | |
2320 | ||
2321 | struct GetInfo; | |
2322 | struct Active; | |
2323 | ||
2324 | struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState { | |
2325 | PastIntervals::PriorSet prior_set; | |
2326 | bool history_les_bound; //< need osd_find_best_info_ignore_history_les | |
2327 | ||
2328 | explicit Peering(my_context ctx); | |
2329 | void exit(); | |
2330 | ||
2331 | typedef boost::mpl::list < | |
2332 | boost::statechart::custom_reaction< QueryState >, | |
2333 | boost::statechart::transition< Activate, Active >, | |
2334 | boost::statechart::custom_reaction< AdvMap > | |
2335 | > reactions; | |
2336 | boost::statechart::result react(const QueryState& q); | |
2337 | boost::statechart::result react(const AdvMap &advmap); | |
2338 | }; | |
2339 | ||
2340 | struct WaitLocalRecoveryReserved; | |
2341 | struct Activating; | |
2342 | struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState { | |
2343 | explicit Active(my_context ctx); | |
2344 | void exit(); | |
2345 | ||
2346 | const set<pg_shard_t> remote_shards_to_reserve_recovery; | |
2347 | const set<pg_shard_t> remote_shards_to_reserve_backfill; | |
2348 | bool all_replicas_activated; | |
2349 | ||
2350 | typedef boost::mpl::list < | |
2351 | boost::statechart::custom_reaction< QueryState >, | |
2352 | boost::statechart::custom_reaction< ActMap >, | |
2353 | boost::statechart::custom_reaction< AdvMap >, | |
2354 | boost::statechart::custom_reaction< MInfoRec >, | |
2355 | boost::statechart::custom_reaction< MNotifyRec >, | |
2356 | boost::statechart::custom_reaction< MLogRec >, | |
11fdf7f2 | 2357 | boost::statechart::custom_reaction< MTrim >, |
7c673cae | 2358 | boost::statechart::custom_reaction< Backfilled >, |
3efd9988 FG |
2359 | boost::statechart::custom_reaction< AllReplicasActivated >, |
2360 | boost::statechart::custom_reaction< DeferRecovery >, | |
b32b8144 | 2361 | boost::statechart::custom_reaction< DeferBackfill >, |
11fdf7f2 | 2362 | boost::statechart::custom_reaction< UnfoundRecovery >, |
b32b8144 | 2363 | boost::statechart::custom_reaction< UnfoundBackfill >, |
11fdf7f2 TL |
2364 | boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>, |
2365 | boost::statechart::custom_reaction< RemoteReservationRevoked>, | |
b32b8144 | 2366 | boost::statechart::custom_reaction< DoRecovery> |
7c673cae FG |
2367 | > reactions; |
2368 | boost::statechart::result react(const QueryState& q); | |
2369 | boost::statechart::result react(const ActMap&); | |
2370 | boost::statechart::result react(const AdvMap&); | |
2371 | boost::statechart::result react(const MInfoRec& infoevt); | |
2372 | boost::statechart::result react(const MNotifyRec& notevt); | |
2373 | boost::statechart::result react(const MLogRec& logevt); | |
11fdf7f2 | 2374 | boost::statechart::result react(const MTrim& trimevt); |
7c673cae FG |
2375 | boost::statechart::result react(const Backfilled&) { |
2376 | return discard_event(); | |
2377 | } | |
2378 | boost::statechart::result react(const AllReplicasActivated&); | |
3efd9988 FG |
2379 | boost::statechart::result react(const DeferRecovery& evt) { |
2380 | return discard_event(); | |
2381 | } | |
2382 | boost::statechart::result react(const DeferBackfill& evt) { | |
2383 | return discard_event(); | |
2384 | } | |
b32b8144 | 2385 | boost::statechart::result react(const UnfoundRecovery& evt) { |
11fdf7f2 | 2386 | return discard_event(); |
b32b8144 FG |
2387 | } |
2388 | boost::statechart::result react(const UnfoundBackfill& evt) { | |
11fdf7f2 TL |
2389 | return discard_event(); |
2390 | } | |
2391 | boost::statechart::result react(const RemoteReservationRevokedTooFull&) { | |
2392 | return discard_event(); | |
2393 | } | |
2394 | boost::statechart::result react(const RemoteReservationRevoked&) { | |
2395 | return discard_event(); | |
b32b8144 FG |
2396 | } |
2397 | boost::statechart::result react(const DoRecovery&) { | |
2398 | return discard_event(); | |
2399 | } | |
7c673cae FG |
2400 | }; |
2401 | ||
2402 | struct Clean : boost::statechart::state< Clean, Active >, NamedState { | |
2403 | typedef boost::mpl::list< | |
11fdf7f2 TL |
2404 | boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, |
2405 | boost::statechart::custom_reaction<SetForceRecovery>, | |
2406 | boost::statechart::custom_reaction<SetForceBackfill> | |
7c673cae FG |
2407 | > reactions; |
2408 | explicit Clean(my_context ctx); | |
2409 | void exit(); | |
11fdf7f2 TL |
2410 | boost::statechart::result react(const boost::statechart::event_base&) { |
2411 | return discard_event(); | |
2412 | } | |
7c673cae FG |
2413 | }; |
2414 | ||
2415 | struct Recovered : boost::statechart::state< Recovered, Active >, NamedState { | |
2416 | typedef boost::mpl::list< | |
2417 | boost::statechart::transition< GoClean, Clean >, | |
224ce89b | 2418 | boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, |
7c673cae FG |
2419 | boost::statechart::custom_reaction< AllReplicasActivated > |
2420 | > reactions; | |
2421 | explicit Recovered(my_context ctx); | |
2422 | void exit(); | |
2423 | boost::statechart::result react(const AllReplicasActivated&) { | |
2424 | post_event(GoClean()); | |
2425 | return forward_event(); | |
2426 | } | |
2427 | }; | |
2428 | ||
/// Active substate: backfill is in progress on the primary.
/// Handles completion (Backfilled), deferral, unfound objects, and
/// remote reservation revocations (including the too-full variant).
struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
  typedef boost::mpl::list<
    boost::statechart::custom_reaction< Backfilled >,
    boost::statechart::custom_reaction< DeferBackfill >,
    boost::statechart::custom_reaction< UnfoundBackfill >,
    boost::statechart::custom_reaction< RemoteReservationRejected >,
    boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
    boost::statechart::custom_reaction< RemoteReservationRevoked>
  > reactions;
  explicit Backfilling(my_context ctx);
  boost::statechart::result react(const RemoteReservationRejected& evt) {
    // for compat with old peers: translate the legacy rejection into the
    // modern revoked-too-full event and reprocess it.
    post_event(RemoteReservationRevokedTooFull());
    return discard_event();
  }
  void backfill_release_reservations();
  boost::statechart::result react(const Backfilled& evt);
  boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
  boost::statechart::result react(const RemoteReservationRevoked& evt);
  boost::statechart::result react(const DeferBackfill& evt);
  boost::statechart::result react(const UnfoundBackfill& evt);
  void cancel_backfill();
  void exit();
};
2453 | ||
/// Active substate: waiting for backfill reservations from remote OSDs.
/// backfill_osd_it tracks which shard we are currently requesting from;
/// once all are reserved, AllBackfillsReserved transitions to Backfilling.
struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
  typedef boost::mpl::list<
    boost::statechart::custom_reaction< RemoteBackfillReserved >,
    boost::statechart::custom_reaction< RemoteReservationRejected >,
    boost::statechart::custom_reaction< RemoteReservationRevoked >,
    boost::statechart::transition< AllBackfillsReserved, Backfilling >
  > reactions;
  set<pg_shard_t>::const_iterator backfill_osd_it;
  explicit WaitRemoteBackfillReserved(my_context ctx);
  void retry();
  void exit();
  boost::statechart::result react(const RemoteBackfillReserved& evt);
  boost::statechart::result react(const RemoteReservationRejected& evt);
  boost::statechart::result react(const RemoteReservationRevoked& evt);
};
2469 | ||
/// Active substate: waiting for the local backfill reservation; once
/// granted, proceed to reserving the remote shards.
struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
  typedef boost::mpl::list<
    boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
  > reactions;
  explicit WaitLocalBackfillReserved(my_context ctx);
  void exit();
};
2477 | ||
/// Active substate: no backfill running; RequestBackfill starts the
/// local-then-remote reservation sequence.  Stray remote reservation
/// replies are handled via the custom reactions below.
struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
  typedef boost::mpl::list<
    boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
    boost::statechart::custom_reaction< RemoteBackfillReserved >,
    boost::statechart::custom_reaction< RemoteReservationRejected >
  > reactions;
  explicit NotBackfilling(my_context ctx);
  void exit();
  boost::statechart::result react(const RemoteBackfillReserved& evt);
  boost::statechart::result react(const RemoteReservationRejected& evt);
};
2489 | ||
/// Active substate: recovery is not running.  DoRecovery restarts the
/// reservation cycle; Defer/Unfound recovery events are explicit no-ops
/// here since there is nothing in progress to defer or abort.
struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
  typedef boost::mpl::list<
    boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
    boost::statechart::custom_reaction< DeferRecovery >,
    boost::statechart::custom_reaction< UnfoundRecovery >
  > reactions;
  explicit NotRecovering(my_context ctx);
  boost::statechart::result react(const DeferRecovery& evt) {
    /* no-op */
    return discard_event();
  }
  boost::statechart::result react(const UnfoundRecovery& evt) {
    /* no-op */
    return discard_event();
  }
  void exit();
};
2507 | ||
11fdf7f2 | 2508 | struct ToDelete; |
7c673cae FG |
2509 | struct RepNotRecovering; |
2510 | struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState { | |
2511 | explicit ReplicaActive(my_context ctx); | |
2512 | void exit(); | |
2513 | ||
2514 | typedef boost::mpl::list < | |
2515 | boost::statechart::custom_reaction< QueryState >, | |
2516 | boost::statechart::custom_reaction< ActMap >, | |
2517 | boost::statechart::custom_reaction< MQuery >, | |
2518 | boost::statechart::custom_reaction< MInfoRec >, | |
2519 | boost::statechart::custom_reaction< MLogRec >, | |
11fdf7f2 | 2520 | boost::statechart::custom_reaction< MTrim >, |
3efd9988 FG |
2521 | boost::statechart::custom_reaction< Activate >, |
2522 | boost::statechart::custom_reaction< DeferRecovery >, | |
b32b8144 FG |
2523 | boost::statechart::custom_reaction< DeferBackfill >, |
2524 | boost::statechart::custom_reaction< UnfoundRecovery >, | |
11fdf7f2 TL |
2525 | boost::statechart::custom_reaction< UnfoundBackfill >, |
2526 | boost::statechart::custom_reaction< RemoteBackfillPreempted >, | |
2527 | boost::statechart::custom_reaction< RemoteRecoveryPreempted >, | |
2528 | boost::statechart::custom_reaction< RecoveryDone >, | |
2529 | boost::statechart::transition<DeleteStart, ToDelete> | |
7c673cae FG |
2530 | > reactions; |
2531 | boost::statechart::result react(const QueryState& q); | |
2532 | boost::statechart::result react(const MInfoRec& infoevt); | |
2533 | boost::statechart::result react(const MLogRec& logevt); | |
11fdf7f2 | 2534 | boost::statechart::result react(const MTrim& trimevt); |
7c673cae FG |
2535 | boost::statechart::result react(const ActMap&); |
2536 | boost::statechart::result react(const MQuery&); | |
2537 | boost::statechart::result react(const Activate&); | |
11fdf7f2 TL |
2538 | boost::statechart::result react(const RecoveryDone&) { |
2539 | return discard_event(); | |
2540 | } | |
3efd9988 FG |
2541 | boost::statechart::result react(const DeferRecovery& evt) { |
2542 | return discard_event(); | |
2543 | } | |
2544 | boost::statechart::result react(const DeferBackfill& evt) { | |
2545 | return discard_event(); | |
2546 | } | |
b32b8144 FG |
2547 | boost::statechart::result react(const UnfoundRecovery& evt) { |
2548 | return discard_event(); | |
2549 | } | |
2550 | boost::statechart::result react(const UnfoundBackfill& evt) { | |
2551 | return discard_event(); | |
2552 | } | |
11fdf7f2 TL |
2553 | boost::statechart::result react(const RemoteBackfillPreempted& evt) { |
2554 | return discard_event(); | |
2555 | } | |
2556 | boost::statechart::result react(const RemoteRecoveryPreempted& evt) { | |
2557 | return discard_event(); | |
2558 | } | |
7c673cae FG |
2559 | }; |
2560 | ||
/// ReplicaActive substate: this replica is serving recovery/backfill for
/// the primary.  Completion, legacy rejection, or cancellation drops back
/// to RepNotRecovering; preemption and too-full events get custom handling.
struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
  typedef boost::mpl::list<
    boost::statechart::transition< RecoveryDone, RepNotRecovering >,
    // for compat with old peers
    boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
    boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
    boost::statechart::custom_reaction< BackfillTooFull >,
    boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
    boost::statechart::custom_reaction< RemoteBackfillPreempted >
  > reactions;
  explicit RepRecovering(my_context ctx);
  boost::statechart::result react(const RemoteRecoveryPreempted &evt);
  boost::statechart::result react(const BackfillTooFull &evt);
  boost::statechart::result react(const RemoteBackfillPreempted &evt);
  void exit();
};
2577 | ||
/// ReplicaActive substate: a remote backfill reservation request is
/// pending a local grant/reject decision.
struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
  typedef boost::mpl::list<
    boost::statechart::custom_reaction< RemoteBackfillReserved >,
    boost::statechart::custom_reaction< RejectRemoteReservation >,
    boost::statechart::custom_reaction< RemoteReservationRejected >,
    boost::statechart::custom_reaction< RemoteReservationCanceled >
  > reactions;
  explicit RepWaitBackfillReserved(my_context ctx);
  void exit();
  boost::statechart::result react(const RemoteBackfillReserved &evt);
  boost::statechart::result react(const RejectRemoteReservation &evt);
  boost::statechart::result react(const RemoteReservationRejected &evt);
  boost::statechart::result react(const RemoteReservationCanceled &evt);
};
2592 | ||
/// ReplicaActive substate: a remote recovery reservation request is
/// pending.  A legacy RemoteReservationRejected from an old peer is
/// rewritten as RemoteReservationCanceled.
struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
  typedef boost::mpl::list<
    boost::statechart::custom_reaction< RemoteRecoveryReserved >,
    // for compat with old peers
    boost::statechart::custom_reaction< RemoteReservationRejected >,
    boost::statechart::custom_reaction< RemoteReservationCanceled >
  > reactions;
  explicit RepWaitRecoveryReserved(my_context ctx);
  void exit();
  boost::statechart::result react(const RemoteRecoveryReserved &evt);
  boost::statechart::result react(const RemoteReservationRejected &evt) {
    // for compat with old peers
    post_event(RemoteReservationCanceled());
    return discard_event();
  }
  boost::statechart::result react(const RemoteReservationCanceled &evt);
};
2610 | ||
/// ReplicaActive substate (the initial one): no replica-side recovery or
/// backfill is in flight.  Reservation-priority requests are handled via
/// custom reactions; late reservation grants that raced with a release
/// from the primary are quietly dropped.
struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
  typedef boost::mpl::list<
    boost::statechart::custom_reaction< RequestRecoveryPrio >,
    boost::statechart::custom_reaction< RequestBackfillPrio >,
    boost::statechart::custom_reaction< RejectRemoteReservation >,
    boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
    boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
    boost::statechart::custom_reaction< RemoteRecoveryReserved >,
    boost::statechart::custom_reaction< RemoteBackfillReserved >,
    boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
  > reactions;
  explicit RepNotRecovering(my_context ctx);
  boost::statechart::result react(const RequestRecoveryPrio &evt);
  boost::statechart::result react(const RequestBackfillPrio &evt);
  boost::statechart::result react(const RemoteBackfillReserved &evt) {
    // my reservation completion raced with a RELEASE from primary
    return discard_event();
  }
  boost::statechart::result react(const RemoteRecoveryReserved &evt) {
    // my reservation completion raced with a RELEASE from primary
    return discard_event();
  }
  boost::statechart::result react(const RejectRemoteReservation &evt);
  void exit();
};
2636 | ||
/// Active substate: log-based recovery is running on the primary.
/// release_reservations(cancel) frees held reservations; the cancel flag
/// distinguishes abort from normal completion (see .cc for semantics).
struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
  typedef boost::mpl::list <
    boost::statechart::custom_reaction< AllReplicasRecovered >,
    boost::statechart::custom_reaction< DeferRecovery >,
    boost::statechart::custom_reaction< UnfoundRecovery >,
    boost::statechart::custom_reaction< RequestBackfill >
  > reactions;
  explicit Recovering(my_context ctx);
  void exit();
  void release_reservations(bool cancel = false);
  boost::statechart::result react(const AllReplicasRecovered &evt);
  boost::statechart::result react(const DeferRecovery& evt);
  boost::statechart::result react(const UnfoundRecovery& evt);
  boost::statechart::result react(const RequestBackfill &evt);
};
2652 | ||
/// Active substate: collecting recovery reservations from remote shards;
/// remote_recovery_reservation_it tracks the shard currently being asked.
/// AllRemotesReserved moves on to Recovering.
struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
  typedef boost::mpl::list <
    boost::statechart::custom_reaction< RemoteRecoveryReserved >,
    boost::statechart::transition< AllRemotesReserved, Recovering >
  > reactions;
  set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
  explicit WaitRemoteRecoveryReserved(my_context ctx);
  boost::statechart::result react(const RemoteRecoveryReserved &evt);
  void exit();
};
2663 | ||
/// Active substate: waiting for the local recovery reservation.  A grant
/// proceeds to remote reservations; RecoveryTooFull is handled specially.
struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
  typedef boost::mpl::list <
    boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
    boost::statechart::custom_reaction< RecoveryTooFull >
  > reactions;
  explicit WaitLocalRecoveryReserved(my_context ctx);
  void exit();
  boost::statechart::result react(const RecoveryTooFull &evt);
};
2673 | ||
/// Active substate entered while activation completes: pure transitions
/// to Recovered, recovery reservation, or backfill reservation.
struct Activating : boost::statechart::state< Activating, Active >, NamedState {
  typedef boost::mpl::list <
    boost::statechart::transition< AllReplicasRecovered, Recovered >,
    boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
    boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
  > reactions;
  explicit Activating(my_context ctx);
  void exit();
};
2683 | ||
11fdf7f2 TL |
/// Started substate for a PG replica that is not (yet) part of the acting
/// set: answers queries/info/log from the primary and can transition to
/// ToDelete when the PG is being removed from this OSD.
struct Stray : boost::statechart::state< Stray, Started >,
	       NamedState {
  explicit Stray(my_context ctx);
  void exit();

  typedef boost::mpl::list <
    boost::statechart::custom_reaction< MQuery >,
    boost::statechart::custom_reaction< MLogRec >,
    boost::statechart::custom_reaction< MInfoRec >,
    boost::statechart::custom_reaction< ActMap >,
    boost::statechart::custom_reaction< RecoveryDone >,
    boost::statechart::transition<DeleteStart, ToDelete>
  > reactions;
  boost::statechart::result react(const MQuery& query);
  boost::statechart::result react(const MLogRec& logevt);
  boost::statechart::result react(const MInfoRec& infoevt);
  boost::statechart::result react(const ActMap&);
  // RecoveryDone carries no work for a stray; drop it.
  boost::statechart::result react(const RecoveryDone&) {
    return discard_event();
  }
};
2705 | ||
11fdf7f2 TL |
struct WaitDeleteReserved;
/// Started substate: this PG is queued for deletion from the OSD.  Inner
/// initial state is WaitDeleteReserved; `priority` caches the deletion
/// priority used for (re)reservation.
struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
  unsigned priority = 0;
  typedef boost::mpl::list <
    boost::statechart::custom_reaction< ActMap >,
    boost::statechart::custom_reaction< DeleteSome >
  > reactions;
  explicit ToDelete(my_context ctx);
  boost::statechart::result react(const ActMap &evt);
  boost::statechart::result react(const DeleteSome &evt) {
    // happens if we drop out of Deleting due to reprioritization etc.
    return discard_event();
  }
  void exit();
};
2721 | ||
struct Deleting;
/// ToDelete substate: waiting for a deletion reservation; DeleteReserved
/// transitions to Deleting.
struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
						     ToDelete>, NamedState {
  typedef boost::mpl::list <
    boost::statechart::transition<DeleteReserved, Deleting>
  > reactions;
  explicit WaitDeleteReserved(my_context ctx);
  void exit();
};
2731 | ||
/// ToDelete substate: incrementally deleting PG data via DeleteSome
/// events; DeleteInterrupted returns to WaitDeleteReserved to re-reserve.
struct Deleting : boost::statechart::state<Deleting,
					   ToDelete>, NamedState {
  typedef boost::mpl::list <
    boost::statechart::custom_reaction< DeleteSome >,
    boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
  > reactions;
  explicit Deleting(my_context ctx);
  boost::statechart::result react(const DeleteSome &evt);
  void exit();
};
2742 | ||
7c673cae FG |
struct GetLog;

/// Peering substate: query pg_info_t from peers (peer_info_requested
/// tracks outstanding requests).  GotInfo advances to GetLog; IsDown
/// parks the PG in Down.
struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
  set<pg_shard_t> peer_info_requested;

  explicit GetInfo(my_context ctx);
  void exit();
  void get_infos();

  typedef boost::mpl::list <
    boost::statechart::custom_reaction< QueryState >,
    boost::statechart::transition< GotInfo, GetLog >,
    boost::statechart::custom_reaction< MNotifyRec >,
    boost::statechart::transition< IsDown, Down >
  > reactions;
  boost::statechart::result react(const QueryState& q);
  boost::statechart::result react(const MNotifyRec& infoevt);
};
2761 | ||
/// Internal event posted within GetLog once the authoritative log arrives.
struct GotLog : boost::statechart::event< GotLog > {
  GotLog() : boost::statechart::event< GotLog >() {}
};
2765 | ||
/// Peering substate: fetch the authoritative log from auth_log_shard.
/// `msg` buffers the received MOSDPGLog until GotLog is processed.
/// IsIncomplete transitions to Incomplete.
struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
  pg_shard_t auth_log_shard;
  boost::intrusive_ptr<MOSDPGLog> msg;

  explicit GetLog(my_context ctx);
  void exit();

  typedef boost::mpl::list <
    boost::statechart::custom_reaction< QueryState >,
    boost::statechart::custom_reaction< MLogRec >,
    boost::statechart::custom_reaction< GotLog >,
    boost::statechart::custom_reaction< AdvMap >,
    boost::statechart::transition< IsIncomplete, Incomplete >
  > reactions;
  boost::statechart::result react(const AdvMap&);
  boost::statechart::result react(const QueryState& q);
  boost::statechart::result react(const MLogRec& logevt);
  boost::statechart::result react(const GotLog&);
};
2785 | ||
struct WaitUpThru;

/// Peering substate: request missing-object sets from peers
/// (peer_missing_requested tracks outstanding requests).  NeedUpThru
/// moves to WaitUpThru.
struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
  set<pg_shard_t> peer_missing_requested;

  explicit GetMissing(my_context ctx);
  void exit();

  typedef boost::mpl::list <
    boost::statechart::custom_reaction< QueryState >,
    boost::statechart::custom_reaction< MLogRec >,
    boost::statechart::transition< NeedUpThru, WaitUpThru >
  > reactions;
  boost::statechart::result react(const QueryState& q);
  boost::statechart::result react(const MLogRec& logevt);
};
2802 | ||
/// Peering substate: waiting for the monitor-confirmed up_thru update
/// before activation can proceed.
struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
  explicit WaitUpThru(my_context ctx);
  void exit();

  typedef boost::mpl::list <
    boost::statechart::custom_reaction< QueryState >,
    boost::statechart::custom_reaction< ActMap >,
    boost::statechart::custom_reaction< MLogRec >
  > reactions;
  boost::statechart::result react(const QueryState& q);
  boost::statechart::result react(const ActMap& am);
  boost::statechart::result react(const MLogRec& logrec);
};
2816 | ||
/// Peering substate: not enough OSDs are up to continue peering; reacts
/// to state queries and peer notifications only.
struct Down : boost::statechart::state< Down, Peering>, NamedState {
  explicit Down(my_context ctx);
  typedef boost::mpl::list <
    boost::statechart::custom_reaction< QueryState >,
    boost::statechart::custom_reaction< MNotifyRec >
  > reactions;
  boost::statechart::result react(const QueryState& q);
  boost::statechart::result react(const MNotifyRec& infoevt);
  void exit();
};
2827 | ||
/// Peering substate: peers' info/logs are insufficient to go active; map
/// advances and new peer notifications may allow another peering attempt.
struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
  typedef boost::mpl::list <
    boost::statechart::custom_reaction< AdvMap >,
    boost::statechart::custom_reaction< MNotifyRec >,
    boost::statechart::custom_reaction< QueryState >
  > reactions;
  explicit Incomplete(my_context ctx);
  boost::statechart::result react(const AdvMap &advmap);
  boost::statechart::result react(const MNotifyRec& infoevt);
  boost::statechart::result react(const QueryState& q);
  void exit();
};
2840 | ||
7c673cae FG |
2841 | RecoveryMachine machine; |
2842 | PG *pg; | |
2843 | ||
2844 | /// context passed in by state machine caller | |
2845 | RecoveryCtx *orig_ctx; | |
2846 | ||
2847 | /// populated if we are buffering messages pending a flush | |
2848 | boost::optional<BufferedRecoveryMessages> messages_pending_flush; | |
2849 | ||
2850 | /** | |
2851 | * populated between start_handle() and end_handle(), points into | |
2852 | * the message lists for messages_pending_flush while blocking messages | |
2853 | * or into orig_ctx otherwise | |
2854 | */ | |
2855 | boost::optional<RecoveryCtx> rctx; | |
2856 | ||
2857 | public: | |
/// Construct the state machine for `pg` and start it (initiate() enters
/// the machine's initial state).
explicit RecoveryState(PG *pg)
  : machine(this, pg), pg(pg), orig_ctx(0) {
  machine.initiate();
}
2862 | ||
/// Deliver a raw statechart event, bracketed by start_handle()/end_handle()
/// so the per-dispatch RecoveryCtx is installed while the event runs.
void handle_event(const boost::statechart::event_base &evt,
		  RecoveryCtx *rctx) {
  start_handle(rctx);
  machine.process_event(evt);
  end_handle();
}
2869 | ||
11fdf7f2 | 2870 | void handle_event(PGPeeringEventRef evt, |
7c673cae FG |
2871 | RecoveryCtx *rctx) { |
2872 | start_handle(rctx); | |
2873 | machine.process_event(evt->get_event()); | |
2874 | end_handle(); | |
2875 | } | |
2876 | ||
2877 | } recovery_state; | |
2878 | ||
2879 | ||
7c673cae | 2880 | |
7c673cae FG |
2881 | uint64_t peer_features; |
2882 | uint64_t acting_features; | |
2883 | uint64_t upacting_features; | |
2884 | ||
2885 | epoch_t last_epoch; | |
2886 | ||
11fdf7f2 TL |
2887 | /// most recently consumed osdmap's require_osd_version |
2888 | unsigned last_require_osd_release = 0; | |
2889 | bool delete_needs_sleep = false; | |
a8e16298 | 2890 | |
11fdf7f2 | 2891 | protected: |
7c673cae FG |
// Peer feature tracking: peer_features is the intersection of the
// feature bits of all peers seen since the last reset.
void reset_min_peer_features() {
  peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
}
uint64_t get_min_peer_features() const { return peer_features; }
void apply_peer_features(uint64_t f) { peer_features &= f; }

uint64_t get_min_acting_features() const { return acting_features; }
uint64_t get_min_upacting_features() const { return upacting_features; }
c07f9fc5 FG |
// True when the RECOVERY_DELETES osdmap flag is NOT set, i.e. deletes
// must be performed during peering rather than recovery.
bool perform_deletes_during_peering() const {
  return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
}
7c673cae | 2903 | |
f64942e4 AA |
// True when the PGLOG_HARDLIMIT osdmap flag is set.
bool hard_limit_pglog() const {
  return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
}
2907 | ||
7c673cae FG |
2908 | void init_primary_up_acting( |
2909 | const vector<int> &newup, | |
2910 | const vector<int> &newacting, | |
2911 | int new_up_primary, | |
2912 | int new_acting_primary) { | |
2913 | actingset.clear(); | |
2914 | acting = newacting; | |
2915 | for (uint8_t i = 0; i < acting.size(); ++i) { | |
2916 | if (acting[i] != CRUSH_ITEM_NONE) | |
2917 | actingset.insert( | |
2918 | pg_shard_t( | |
2919 | acting[i], | |
11fdf7f2 | 2920 | pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); |
7c673cae FG |
2921 | } |
2922 | upset.clear(); | |
2923 | up = newup; | |
2924 | for (uint8_t i = 0; i < up.size(); ++i) { | |
2925 | if (up[i] != CRUSH_ITEM_NONE) | |
2926 | upset.insert( | |
2927 | pg_shard_t( | |
2928 | up[i], | |
11fdf7f2 | 2929 | pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD)); |
7c673cae | 2930 | } |
11fdf7f2 | 2931 | if (!pool.info.is_erasure()) { |
7c673cae FG |
2932 | up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD); |
2933 | primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD); | |
2934 | return; | |
2935 | } | |
2936 | up_primary = pg_shard_t(); | |
2937 | primary = pg_shard_t(); | |
2938 | for (uint8_t i = 0; i < up.size(); ++i) { | |
2939 | if (up[i] == new_up_primary) { | |
2940 | up_primary = pg_shard_t(up[i], shard_id_t(i)); | |
2941 | break; | |
2942 | } | |
2943 | } | |
2944 | for (uint8_t i = 0; i < acting.size(); ++i) { | |
2945 | if (acting[i] == new_acting_primary) { | |
2946 | primary = pg_shard_t(acting[i], shard_id_t(i)); | |
2947 | break; | |
2948 | } | |
2949 | } | |
11fdf7f2 TL |
2950 | ceph_assert(up_primary.osd == new_up_primary); |
2951 | ceph_assert(primary.osd == new_acting_primary); | |
7c673cae | 2952 | } |
7c673cae | 2953 | |
11fdf7f2 TL |
// Record this OSD's role in the PG (see `role` member).
void set_role(int r) {
  role = r;
}
7c673cae | 2957 | |
11fdf7f2 TL |
// Bit-mask helpers over the PG_STATE_* flags in `state`.
bool state_test(uint64_t m) const { return (state & m) != 0; }
void state_set(uint64_t m) { state |= m; }
void state_clear(uint64_t m) { state &= ~m; }
7c673cae FG |
2961 | |
// Complete == nothing missing locally (last_complete caught up to last_update).
bool is_complete() const { return info.last_complete == info.last_update; }
bool should_send_notify() const { return send_notify; }
2964 | ||
11fdf7f2 TL |
// PG_STATE_* flag accessors (thin wrappers over state_test()).
uint64_t get_state() const { return state; }
bool is_active() const { return state_test(PG_STATE_ACTIVE); }
bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
bool is_peering() const { return state_test(PG_STATE_PEERING); }
bool is_down() const { return state_test(PG_STATE_DOWN); }
bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
bool is_clean() const { return state_test(PG_STATE_CLEAN); }
bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
// Peered means active OR merely peered (not enough replicas to be active).
bool is_peered() const {
  return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
}
bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
bool is_repair() const { return state_test(PG_STATE_REPAIR); }

// Empty == the log has never advanced past eversion 0'0.
bool is_empty() const { return info.last_update == eversion_t(0,0); }
7c673cae FG |
2986 | |
2987 | // pg on-disk state | |
2988 | void do_pending_flush(); | |
2989 | ||
11fdf7f2 | 2990 | public: |
7c673cae FG |
2991 | static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits); |
2992 | static void _init(ObjectStore::Transaction& t, | |
2993 | spg_t pgid, const pg_pool_t *pool); | |
2994 | ||
11fdf7f2 | 2995 | protected: |
7c673cae FG |
2996 | void prepare_write_info(map<string,bufferlist> *km); |
2997 | ||
2998 | void update_store_with_options(); | |
7c673cae FG |
2999 | |
3000 | public: | |
3001 | static int _prepare_write_info( | |
3002 | CephContext* cct, | |
3003 | map<string,bufferlist> *km, | |
3004 | epoch_t epoch, | |
3005 | pg_info_t &info, | |
3006 | pg_info_t &last_written_info, | |
3007 | PastIntervals &past_intervals, | |
3008 | bool dirty_big_info, | |
3009 | bool dirty_epoch, | |
3010 | bool try_fast_info, | |
3011 | PerfCounters *logger = nullptr); | |
11fdf7f2 TL |
3012 | |
// Convenience overload: write dirty state into the context's transaction.
void write_if_dirty(RecoveryCtx *rctx) {
  write_if_dirty(*rctx->transaction);
}
3016 | protected: | |
7c673cae FG |
3017 | void write_if_dirty(ObjectStore::Transaction& t); |
3018 | ||
3019 | PGLog::IndexedLog projected_log; | |
3020 | bool check_in_progress_op( | |
3021 | const osd_reqid_t &r, | |
3022 | eversion_t *version, | |
3023 | version_t *user_version, | |
3024 | int *return_code) const; | |
3025 | eversion_t projected_last_update; | |
// Next eversion for a new log entry: current osdmap epoch plus the
// projected version incremented by one.  Asserts it is strictly ahead of
// info.last_update, the pg log head, and the projected last update.
eversion_t get_next_version() const {
  eversion_t at_version(
    get_osdmap_epoch(),
    projected_last_update.version+1);
  ceph_assert(at_version > info.last_update);
  ceph_assert(at_version > pg_log.get_head());
  ceph_assert(at_version > projected_last_update);
  return at_version;
}
3035 | ||
3036 | void add_log_entry(const pg_log_entry_t& e, bool applied); | |
3037 | void append_log( | |
3038 | const vector<pg_log_entry_t>& logv, | |
3039 | eversion_t trim_to, | |
3040 | eversion_t roll_forward_to, | |
3041 | ObjectStore::Transaction &t, | |
11fdf7f2 TL |
3042 | bool transaction_applied = true, |
3043 | bool async = false); | |
7c673cae | 3044 | bool check_log_for_corruption(ObjectStore *store); |
7c673cae FG |
3045 | |
3046 | std::string get_corrupt_pg_log_name() const; | |
11fdf7f2 | 3047 | |
7c673cae FG |
3048 | void update_snap_map( |
3049 | const vector<pg_log_entry_t> &log_entries, | |
3050 | ObjectStore::Transaction& t); | |
3051 | ||
3052 | void filter_snapc(vector<snapid_t> &snaps); | |
3053 | ||
3054 | void log_weirdness(); | |
3055 | ||
3056 | virtual void kick_snap_trim() = 0; | |
3057 | virtual void snap_trimmer_scrub_complete() = 0; | |
31f18b77 | 3058 | bool requeue_scrub(bool high_priority = false); |
c07f9fc5 | 3059 | void queue_recovery(); |
7c673cae FG |
3060 | bool queue_scrub(); |
3061 | unsigned get_scrub_priority(); | |
3062 | ||
3063 | /// share pg info after a pg is active | |
3064 | void share_pg_info(); | |
3065 | ||
3066 | ||
3067 | bool append_log_entries_update_missing( | |
31f18b77 | 3068 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
94b18763 FG |
3069 | ObjectStore::Transaction &t, |
3070 | boost::optional<eversion_t> trim_to, | |
3071 | boost::optional<eversion_t> roll_forward_to); | |
7c673cae FG |
3072 | |
3073 | /** | |
3074 | * Merge entries updating missing as necessary on all | |
11fdf7f2 | 3075 | * acting_recovery_backfill logs and missings (also missing_loc) |
7c673cae FG |
3076 | */ |
3077 | void merge_new_log_entries( | |
31f18b77 | 3078 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
94b18763 FG |
3079 | ObjectStore::Transaction &t, |
3080 | boost::optional<eversion_t> trim_to, | |
3081 | boost::optional<eversion_t> roll_forward_to); | |
7c673cae FG |
3082 | |
  // --- peering interval management ---

  void reset_interval_flush();
  // Tear down state belonging to the old interval and set up the new one
  // when the up/acting sets change; persistent updates are staged in *t.
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  /// per-derived-class hook (presumably run from on_new_interval -- confirm)
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t);
  /// record that a peering reset happened (at the current epoch)
  void set_last_peering_reset();

  /// merge externally provided history into our own
  void update_history(const pg_history_t& history);
  // Answer a peer's query for our info: fill notify_info with the
  // (shard, info) pair to send back.
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
		    pair<pg_shard_t, pg_info_t> &notify_info);
  /// answer a peer's query for (a portion of) our pg log
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
  /// handle an incoming MQuery within the given recovery context
  void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
  /// react to full-flag changes between the two osdmaps
  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);

  // Whether the transition from lastmap to osdmap (with the new up/acting
  // sets and primaries) requires this PG to restart peering.
  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);
3108 | ||
  // OpRequest queueing
  // Discard predicates: true when the given message can be dropped rather
  // than processed (presumably because it refers to an old epoch/interval
  // or is otherwise stale -- see PG.cc for the exact conditions).
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  /// discard check shared by replica-targeted ops of message type MSGTYPE
  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  // Whether a message stamped with these epochs predates our most recent
  // peering reset (used via old_peering_evt below).
  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
11fdf7f2 | 3119 | bool old_peering_evt(PGPeeringEventRef evt) { |
7c673cae FG |
3120 | return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested()); |
3121 | } | |
3122 | static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) { | |
3123 | return e <= cur_epoch; | |
3124 | } | |
3125 | bool have_same_or_newer_map(epoch_t e) { | |
11fdf7f2 | 3126 | return e <= get_osdmap_epoch(); |
7c673cae FG |
3127 | } |
3128 | ||
  /// verify the op's client capabilities permit the operation (see PG.cc)
  bool op_has_sufficient_caps(OpRequestRef& op);


  // recovery bits
  void take_waiters();


  // abstract bits
  friend class FlushState;

  // Notification hooks implemented by the concrete PG type (role, pool,
  // and interval changes; activation and flush completion).
  virtual void on_role_change() = 0;
  virtual void on_pool_change() = 0;
  virtual void on_change(ObjectStore::Transaction *t) = 0;
  virtual void on_activate() = 0;
  virtual void on_flushed() = 0;
  virtual void check_blacklisted_watchers() = 0;

  friend ostream& operator<<(ostream& out, const PG& pg);
};
3148 | ||
7c673cae FG |
3149 | |
/// stream a human-readable rendering of a backfill interval
ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
3151 | ||
3152 | #endif |