]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_PG_H | |
16 | #define CEPH_PG_H | |
17 | ||
18 | #include <boost/statechart/custom_reaction.hpp> | |
19 | #include <boost/statechart/event.hpp> | |
20 | #include <boost/statechart/simple_state.hpp> | |
21 | #include <boost/statechart/state.hpp> | |
22 | #include <boost/statechart/state_machine.hpp> | |
23 | #include <boost/statechart/transition.hpp> | |
24 | #include <boost/statechart/event_base.hpp> | |
25 | #include <boost/scoped_ptr.hpp> | |
26 | #include <boost/circular_buffer.hpp> | |
27 | #include "include/memory.h" | |
28 | #include "include/mempool.h" | |
29 | ||
30 | // re-include our assert to clobber boost's | |
31 | #include "include/assert.h" | |
32 | ||
33 | #include "include/types.h" | |
34 | #include "include/stringify.h" | |
35 | #include "osd_types.h" | |
36 | #include "include/xlist.h" | |
37 | #include "SnapMapper.h" | |
38 | #include "Session.h" | |
39 | #include "common/Timer.h" | |
40 | ||
41 | #include "PGLog.h" | |
42 | #include "OSDMap.h" | |
43 | #include "messages/MOSDPGLog.h" | |
44 | #include "include/str_list.h" | |
45 | #include "PGBackend.h" | |
46 | ||
47 | #include <atomic> | |
48 | #include <list> | |
49 | #include <memory> | |
50 | #include <stack> | |
51 | #include <string> | |
52 | #include <tuple> | |
53 | using namespace std; | |
54 | ||
55 | // #include "include/unordered_map.h" | |
56 | // #include "include/unordered_set.h" | |
57 | ||
58 | //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs | |
59 | ||
60 | class OSD; | |
61 | class OSDService; | |
62 | class MOSDOp; | |
63 | class MOSDPGScan; | |
64 | class MOSDPGBackfill; | |
65 | class MOSDPGInfo; | |
66 | ||
67 | class PG; | |
68 | struct OpRequest; | |
69 | typedef OpRequest::Ref OpRequestRef; | |
70 | class MOSDPGLog; | |
71 | class CephContext; | |
72 | ||
73 | namespace Scrub { | |
74 | class Store; | |
75 | } | |
76 | ||
77 | void intrusive_ptr_add_ref(PG *pg); | |
78 | void intrusive_ptr_release(PG *pg); | |
79 | ||
80 | using state_history_entry = std::tuple<utime_t, utime_t, const char*>; | |
81 | using embedded_state = std::pair<utime_t, const char*>; | |
82 | ||
83 | struct PGStateInstance { | |
84 | // Time spent in pg states | |
85 | ||
86 | void setepoch(const epoch_t current_epoch) { | |
87 | this_epoch = current_epoch; | |
88 | } | |
89 | ||
90 | void enter_state(const utime_t entime, const char* state) { | |
91 | embedded_states.push(std::make_pair(entime, state)); | |
92 | } | |
93 | ||
94 | void exit_state(const utime_t extime) { | |
95 | embedded_state this_state = embedded_states.top(); | |
96 | state_history.push_back(state_history_entry{ | |
97 | this_state.first, extime, this_state.second}); | |
98 | embedded_states.pop(); | |
99 | } | |
100 | ||
101 | epoch_t this_epoch; | |
102 | utime_t enter_time; | |
103 | std::vector<state_history_entry> state_history; | |
104 | std::stack<embedded_state> embedded_states; | |
105 | }; | |
106 | ||
/// Keeps a bounded history of per-PG state machine transitions.
/// enter()/exit() bracket a state; completed instances are retained in a
/// fixed-size ring buffer for dump().
class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}  // retain at most 10 completed instances

  /// Begin recording a state entered at `entime`.
  void enter(PG* pg, const utime_t entime, const char* state);

  /// Record leaving `state`.
  void exit(const char* state);

  /// Drop the reference to the in-progress instance.
  void reset() {
    pi = nullptr;
  }

  /// Flag that the owning PG is being destroyed (exit() may behave
  /// differently during teardown -- see implementation).
  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

private:
  bool pg_in_destructor = false;  // set while ~PG is running
  PG* thispg = nullptr;           // owning PG (set by enter())
  std::unique_ptr<PGStateInstance> tmppi;  // instance being built
  PGStateInstance* pi = nullptr;           // raw view of the current instance
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;  // completed

};
132 | ||
133 | #ifdef PG_DEBUG_REFS | |
134 | #include "common/tracked_int_ptr.hpp" | |
135 | uint64_t get_with_id(PG *pg); | |
136 | void put_with_id(PG *pg, uint64_t id); | |
137 | typedef TrackedIntPtr<PG> PGRef; | |
138 | #else | |
139 | typedef boost::intrusive_ptr<PG> PGRef; | |
140 | #endif | |
141 | ||
142 | class PGRecoveryStats { | |
143 | struct per_state_info { | |
144 | uint64_t enter, exit; // enter/exit counts | |
145 | uint64_t events; | |
146 | utime_t event_time; // time spent processing events | |
147 | utime_t total_time; // total time in state | |
148 | utime_t min_time, max_time; | |
149 | ||
150 | // cppcheck-suppress unreachableCode | |
151 | per_state_info() : enter(0), exit(0), events(0) {} | |
152 | }; | |
153 | map<const char *,per_state_info> info; | |
154 | Mutex lock; | |
155 | ||
156 | public: | |
157 | PGRecoveryStats() : lock("PGRecoverStats::lock") {} | |
158 | ||
159 | void reset() { | |
160 | Mutex::Locker l(lock); | |
161 | info.clear(); | |
162 | } | |
163 | void dump(ostream& out) { | |
164 | Mutex::Locker l(lock); | |
165 | for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) { | |
166 | per_state_info& i = p->second; | |
167 | out << i.enter << "\t" << i.exit << "\t" | |
168 | << i.events << "\t" << i.event_time << "\t" | |
169 | << i.total_time << "\t" | |
170 | << i.min_time << "\t" << i.max_time << "\t" | |
171 | << p->first << "\n"; | |
172 | } | |
173 | } | |
174 | ||
175 | void dump_formatted(Formatter *f) { | |
176 | Mutex::Locker l(lock); | |
177 | f->open_array_section("pg_recovery_stats"); | |
178 | for (map<const char *,per_state_info>::iterator p = info.begin(); | |
179 | p != info.end(); ++p) { | |
180 | per_state_info& i = p->second; | |
181 | f->open_object_section("recovery_state"); | |
182 | f->dump_int("enter", i.enter); | |
183 | f->dump_int("exit", i.exit); | |
184 | f->dump_int("events", i.events); | |
185 | f->dump_stream("event_time") << i.event_time; | |
186 | f->dump_stream("total_time") << i.total_time; | |
187 | f->dump_stream("min_time") << i.min_time; | |
188 | f->dump_stream("max_time") << i.max_time; | |
189 | vector<string> states; | |
190 | get_str_vec(p->first, "/", states); | |
191 | f->open_array_section("nested_states"); | |
192 | for (vector<string>::iterator st = states.begin(); | |
193 | st != states.end(); ++st) { | |
194 | f->dump_string("state", *st); | |
195 | } | |
196 | f->close_section(); | |
197 | f->close_section(); | |
198 | } | |
199 | f->close_section(); | |
200 | } | |
201 | ||
202 | void log_enter(const char *s) { | |
203 | Mutex::Locker l(lock); | |
204 | info[s].enter++; | |
205 | } | |
206 | void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) { | |
207 | Mutex::Locker l(lock); | |
208 | per_state_info &i = info[s]; | |
209 | i.exit++; | |
210 | i.total_time += dur; | |
211 | if (dur > i.max_time) | |
212 | i.max_time = dur; | |
213 | if (dur < i.min_time || i.min_time == utime_t()) | |
214 | i.min_time = dur; | |
215 | i.events += events; | |
216 | i.event_time += event_dur; | |
217 | } | |
218 | }; | |
219 | ||
220 | struct PGPool { | |
221 | CephContext* cct; | |
222 | epoch_t cached_epoch; | |
223 | int64_t id; | |
224 | string name; | |
225 | uint64_t auid; | |
226 | ||
227 | pg_pool_t info; | |
228 | SnapContext snapc; // the default pool snapc, ready to go. | |
229 | ||
230 | interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set | |
231 | interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch | |
232 | ||
233 | PGPool(CephContext* cct, OSDMapRef map, int64_t i) | |
234 | : cct(cct), | |
235 | cached_epoch(map->get_epoch()), | |
236 | id(i), | |
237 | name(map->get_pool_name(id)), | |
238 | auid(map->get_pg_pool(id)->auid) { | |
239 | const pg_pool_t *pi = map->get_pg_pool(id); | |
240 | assert(pi); | |
241 | info = *pi; | |
242 | snapc = pi->get_snap_context(); | |
243 | pi->build_removed_snaps(cached_removed_snaps); | |
244 | } | |
245 | ||
246 | void update(OSDMapRef map); | |
247 | }; | |
248 | ||
249 | /** PG - Replica Placement Group | |
250 | * | |
251 | */ | |
252 | ||
253 | class PG : public DoutPrefixProvider { | |
254 | protected: | |
255 | OSDService *osd; | |
256 | CephContext *cct; | |
257 | OSDriver osdriver; | |
258 | SnapMapper snap_mapper; | |
224ce89b | 259 | bool eio_errors_to_process = false; |
7c673cae FG |
260 | |
261 | virtual PGBackend *get_pgbackend() = 0; | |
262 | public: | |
263 | std::string gen_prefix() const override; | |
264 | CephContext *get_cct() const override { return cct; } | |
265 | unsigned get_subsys() const override { return ceph_subsys_osd; } | |
266 | ||
267 | /*** PG ****/ | |
268 | void update_snap_mapper_bits(uint32_t bits) { | |
269 | snap_mapper.update_bits(bits); | |
270 | } | |
271 | /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done | |
272 | IsPGRecoverablePredicate *get_is_recoverable_predicate() { | |
273 | return get_pgbackend()->get_is_recoverable_predicate(); | |
274 | } | |
275 | protected: | |
276 | OSDMapRef osdmap_ref; | |
277 | OSDMapRef last_persisted_osdmap_ref; | |
278 | PGPool pool; | |
279 | ||
280 | void requeue_map_waiters(); | |
281 | ||
  /// Swap in a newer OSDMap reference; caller must hold the PG lock.
  void update_osdmap_ref(OSDMapRef newmap) {
    assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }
286 | ||
287 | public: | |
  /// Current OSDMap for this PG; requires the PG lock so the reference
  /// cannot be swapped out from under the caller.
  OSDMapRef get_osdmap() const {
    assert(is_locked());
    assert(osdmap_ref);  // always set once the PG is initialized
    return osdmap_ref;
  }
293 | protected: | |
294 | ||
295 | /** locking and reference counting. | |
296 | * I destroy myself when the reference count hits zero. | |
297 | * lock() should be called before doing anything. | |
298 | * get() should be called on pointer copy (to another thread, etc.). | |
299 | * put() should be called on destruction of some previously copied pointer. | |
300 | * unlock() when done with the current pointer (_most common_). | |
301 | */ | |
302 | mutable Mutex _lock; | |
303 | std::atomic_uint ref{0}; | |
304 | ||
305 | #ifdef PG_DEBUG_REFS | |
306 | Mutex _ref_id_lock; | |
307 | map<uint64_t, string> _live_ids; | |
308 | map<string, uint64_t> _tag_counts; | |
309 | uint64_t _ref_id; | |
310 | #endif | |
311 | ||
312 | public: | |
313 | bool deleting; // true while in removing or OSD is shutting down | |
314 | ||
315 | ZTracer::Endpoint trace_endpoint; | |
316 | ||
317 | void lock_suspend_timeout(ThreadPool::TPHandle &handle); | |
318 | void lock(bool no_lockdep = false) const; | |
  /// Release the PG lock.  Asserts that any dirty info/big_info was
  /// persisted (or the flags cleared) before the lock is dropped.
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    assert(!dirty_info);
    assert(!dirty_big_info);
    _lock.Unlock();
  }
325 | ||
326 | bool is_locked() const { | |
327 | return _lock.is_locked(); | |
328 | } | |
329 | ||
330 | #ifdef PG_DEBUG_REFS | |
331 | uint64_t get_with_id(); | |
332 | void put_with_id(uint64_t); | |
333 | void dump_live_ids(); | |
334 | #endif | |
335 | void get(const char* tag); | |
336 | void put(const char* tag); | |
337 | ||
338 | bool dirty_info, dirty_big_info; | |
339 | ||
340 | public: | |
341 | bool is_ec_pg() const { | |
342 | return pool.info.ec_pool(); | |
343 | } | |
344 | // pg state | |
345 | pg_info_t info; ///< current pg info | |
346 | pg_info_t last_written_info; ///< last written info | |
347 | __u8 info_struct_v; | |
348 | static const __u8 cur_struct_v = 10; | |
349 | // v10 is the new past_intervals encoding | |
350 | // v9 was fastinfo_key addition | |
351 | // v8 was the move to a per-pg pgmeta object | |
352 | // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad | |
353 | // (first appeared in cuttlefish). | |
354 | static const __u8 compat_struct_v = 7; | |
355 | bool must_upgrade() { | |
356 | return info_struct_v < cur_struct_v; | |
357 | } | |
358 | bool can_upgrade() { | |
359 | return info_struct_v >= compat_struct_v; | |
360 | } | |
361 | void upgrade(ObjectStore *store); | |
362 | ||
363 | const coll_t coll; | |
364 | ObjectStore::CollectionHandle ch; | |
365 | PGLog pg_log; | |
366 | static string get_info_key(spg_t pgid) { | |
367 | return stringify(pgid) + "_info"; | |
368 | } | |
369 | static string get_biginfo_key(spg_t pgid) { | |
370 | return stringify(pgid) + "_biginfo"; | |
371 | } | |
372 | static string get_epoch_key(spg_t pgid) { | |
373 | return stringify(pgid) + "_epoch"; | |
374 | } | |
375 | ghobject_t pgmeta_oid; | |
376 | ||
  /**
   * MissingLoc tracks, for each object the acting set needs to recover,
   * which shards are known to hold a usable copy.  An object with no
   * recoverable location set (per is_recoverable) is "unfound".
   */
  class MissingLoc {
    map<hobject_t, pg_missing_item> needs_recovery_map;  // objects to recover
    map<hobject_t, set<pg_shard_t> > missing_loc;        // known locations
    set<pg_shard_t> missing_loc_sources;                 // shards consulted
    PG *pg;
    set<pg_shard_t> empty_set;  // returned by get_locations() on miss
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) {}
    /// Install backend-specific predicates; takes ownership of both.
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    string gen_prefix() const { return pg->gen_prefix(); }
    /// True if hoid needs recovery; optionally returns the needed version.
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
	needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
	return false;
      if (v)
	*v = i->second.need;
      return true;
    }
    /// True if the pending recovery item for hoid is a delete.
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
	return false;
      return i->second.is_delete();
    }
    /// Unfound: needed (and not a delete) but no recoverable location set.
    bool is_unfound(const hobject_t &hoid) const {
      return needs_recovery(hoid) && !is_deleted(hoid) && (
	!missing_loc.count(hoid) ||
	!(*is_recoverable)(missing_loc.find(hoid)->second));
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    /// Number of unfound objects (linear scan of needs_recovery_map).
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
	     needs_recovery_map.begin();
	   i != needs_recovery_map.end();
	   ++i) {
	if (is_unfound(i->first))
	  ++ret;
      }
      return ret;
    }

    /// True if any object is unfound (stops at the first hit).
    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
	     needs_recovery_map.begin();
	   i != needs_recovery_map.end();
	   ++i) {
	if (is_unfound(i->first))
	  return true;
      }
      return false;
    }
    /// Drop all tracked state (does not reset the predicates).
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      missing_loc[hoid].insert(location);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      missing_loc[hoid].erase(location);
    }
    /// Merge a missing set into needs_recovery_map; a conflicting entry
    /// for the same object with a different need version is a bug.
    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
	     missing.get_items().begin();
	   i != missing.get_items().end();
	   ++i) {
	map<hobject_t, pg_missing_item>::const_iterator j =
	  needs_recovery_map.find(i->first);
	if (j == needs_recovery_map.end()) {
	  needs_recovery_map.insert(*i);
	} else {
	  lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
				    << i->first << " have " << j->second
				    << " tried to add " << i->second << dendl;
	  assert(i->second.need == j->second.need);
	}
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
      needs_recovery_map[hoid] = pg_missing_item(need, have);
    }
    /// Adjust the needed version for an already-tracked object.
    void revise_need(const hobject_t &hoid, eversion_t need) {
      assert(needs_recovery(hoid));
      needs_recovery_map[hoid].need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,           ///< [in] source
      const pg_info_t &oinfo,      ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      ); ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources,  ///< [in] a set of resources which can be used for all objects
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      missing_loc.erase(hoid);
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      // find the authoritative missing item: prefer our own missing set,
      // otherwise the first to_recover peer (other than self) missing it
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
	item = miter->second;
      } else {
	for (auto &&i: to_recover) {
	  if (i == self)
	    continue;
	  auto pmiter = pmissing.find(i);
	  assert(pmiter != pmissing.end());
	  miter = pmiter->second.get_items().find(hoid);
	  if (miter != pmiter->second.get_items().end()) {
	    item = miter->second;
	    break;
	  }
	}
      }
      if (!item)
	return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
	return;  // deletes need no source locations
      auto mliter =
	missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      assert(info.last_backfill.is_max());
      assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
	mliter->second.insert(self);
      // a peer is a valid location if it is caught up past `need`,
      // backfilled past hoid, and does not itself list hoid missing
      for (auto &&i: pmissing) {
	auto pinfoiter = pinfo.find(i.first);
	assert(pinfoiter != pinfo.end());
	if (item->need <= pinfoiter->second.last_update &&
	    hoid <= pinfoiter->second.last_backfill &&
	    !i.second.is_missing(hoid))
	  mliter->second.insert(i.first);
      }
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      return missing_loc.count(hoid) ?
	missing_loc.find(hoid)->second : empty_set;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
  } missing_loc;
563 | ||
564 | PastIntervals past_intervals; | |
565 | ||
566 | interval_set<snapid_t> snap_trimq; | |
567 | ||
568 | /* You should not use these items without taking their respective queue locks | |
569 | * (if they have one) */ | |
570 | xlist<PG*>::item stat_queue_item; | |
571 | bool scrub_queued; | |
572 | bool recovery_queued; | |
573 | ||
574 | int recovery_ops_active; | |
575 | set<pg_shard_t> waiting_on_backfill; | |
576 | #ifdef DEBUG_RECOVERY_OIDS | |
577 | set<hobject_t> recovering_oids; | |
578 | #endif | |
579 | ||
580 | protected: | |
581 | int role; // 0 = primary, 1 = replica, -1=none. | |
582 | unsigned state; // PG_STATE_* | |
583 | ||
584 | bool send_notify; ///< true if we are non-primary and should notify the primary | |
585 | ||
586 | public: | |
587 | eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active() | |
588 | eversion_t last_complete_ondisk; // last_complete that has committed. | |
589 | eversion_t last_update_applied; | |
590 | ||
591 | ||
592 | struct C_UpdateLastRollbackInfoTrimmedToApplied : Context { | |
593 | PGRef pg; | |
594 | epoch_t e; | |
595 | eversion_t v; | |
596 | C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v) | |
597 | : pg(pg), e(e), v(v) {} | |
598 | void finish(int) override { | |
599 | pg->lock(); | |
600 | if (!pg->pg_has_reset_since(e)) { | |
601 | pg->last_rollback_info_trimmed_to_applied = v; | |
602 | } | |
603 | pg->unlock(); | |
604 | } | |
605 | }; | |
606 | // entries <= last_rollback_info_trimmed_to_applied have been trimmed, | |
607 | // and the transaction has applied | |
608 | eversion_t last_rollback_info_trimmed_to_applied; | |
609 | ||
610 | // primary state | |
611 | public: | |
612 | pg_shard_t primary; | |
613 | pg_shard_t pg_whoami; | |
614 | pg_shard_t up_primary; | |
615 | vector<int> up, acting, want_acting; | |
616 | set<pg_shard_t> actingbackfill, actingset, upset; | |
617 | map<pg_shard_t,eversion_t> peer_last_complete_ondisk; | |
618 | eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk | |
619 | eversion_t pg_trim_to; | |
620 | ||
621 | set<int> blocked_by; ///< osds we are blocked by (for pg stats) | |
622 | ||
623 | // [primary only] content recovery state | |
624 | ||
625 | public: | |
  /// Outgoing peering messages accumulated per target osd; merged into a
  /// RecoveryCtx via RecoveryCtx::accept_buffered_messages().
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;                       // queries per osd
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;     // infos per osd
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;  // notifies per osd
  };
631 | ||
632 | struct RecoveryCtx { | |
633 | utime_t start_time; | |
634 | map<int, map<spg_t, pg_query_t> > *query_map; | |
635 | map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map; | |
636 | map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list; | |
637 | set<PGRef> created_pgs; | |
638 | C_Contexts *on_applied; | |
639 | C_Contexts *on_safe; | |
640 | ObjectStore::Transaction *transaction; | |
641 | ThreadPool::TPHandle* handle; | |
642 | RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map, | |
643 | map<int, | |
644 | vector<pair<pg_notify_t, PastIntervals> > > *info_map, | |
645 | map<int, | |
646 | vector<pair<pg_notify_t, PastIntervals> > > *notify_list, | |
647 | C_Contexts *on_applied, | |
648 | C_Contexts *on_safe, | |
649 | ObjectStore::Transaction *transaction) | |
650 | : query_map(query_map), info_map(info_map), | |
651 | notify_list(notify_list), | |
652 | on_applied(on_applied), | |
653 | on_safe(on_safe), | |
654 | transaction(transaction), | |
655 | handle(NULL) {} | |
656 | ||
657 | RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx) | |
658 | : query_map(&(buf.query_map)), | |
659 | info_map(&(buf.info_map)), | |
660 | notify_list(&(buf.notify_list)), | |
661 | on_applied(rctx.on_applied), | |
662 | on_safe(rctx.on_safe), | |
663 | transaction(rctx.transaction), | |
664 | handle(rctx.handle) {} | |
665 | ||
666 | void accept_buffered_messages(BufferedRecoveryMessages &m) { | |
667 | assert(query_map); | |
668 | assert(info_map); | |
669 | assert(notify_list); | |
670 | for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin(); | |
671 | i != m.query_map.end(); | |
672 | ++i) { | |
673 | map<spg_t, pg_query_t> &omap = (*query_map)[i->first]; | |
674 | for (map<spg_t, pg_query_t>::iterator j = i->second.begin(); | |
675 | j != i->second.end(); | |
676 | ++j) { | |
677 | omap[j->first] = j->second; | |
678 | } | |
679 | } | |
680 | for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i | |
681 | = m.info_map.begin(); | |
682 | i != m.info_map.end(); | |
683 | ++i) { | |
684 | vector<pair<pg_notify_t, PastIntervals> > &ovec = | |
685 | (*info_map)[i->first]; | |
686 | ovec.reserve(ovec.size() + i->second.size()); | |
687 | ovec.insert(ovec.end(), i->second.begin(), i->second.end()); | |
688 | } | |
689 | for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i | |
690 | = m.notify_list.begin(); | |
691 | i != m.notify_list.end(); | |
692 | ++i) { | |
693 | vector<pair<pg_notify_t, PastIntervals> > &ovec = | |
694 | (*notify_list)[i->first]; | |
695 | ovec.reserve(ovec.size() + i->second.size()); | |
696 | ovec.insert(ovec.end(), i->second.begin(), i->second.end()); | |
697 | } | |
698 | } | |
699 | }; | |
700 | ||
701 | ||
702 | PGStateHistory pgstate_history; | |
703 | ||
  /// Scoped state marker: construction records entry into `state_name`
  /// in the PG's pgstate_history, destruction records the exit.  Used as
  /// a base for the boost::statechart states below (virtual dtor so the
  /// derived state's destruction triggers exit()).
  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };
715 | ||
716 | ||
717 | ||
718 | protected: | |
719 | ||
720 | /* | |
721 | * peer_info -- projected (updates _before_ replicas ack) | |
722 | * peer_missing -- committed (updates _after_ replicas ack) | |
723 | */ | |
724 | ||
725 | bool need_up_thru; | |
726 | set<pg_shard_t> stray_set; // non-acting osds that have PG data. | |
727 | eversion_t oldest_update; // acting: lowest (valid) last_update in active set | |
728 | map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior) | |
729 | set<pg_shard_t> peer_purged; // peers purged | |
730 | map<pg_shard_t, pg_missing_t> peer_missing; | |
731 | set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps) | |
732 | set<pg_shard_t> peer_missing_requested; | |
733 | ||
734 | // i deleted these strays; ignore racing PGInfo from them | |
735 | set<pg_shard_t> peer_activated; | |
736 | ||
737 | // primary-only, recovery-only state | |
738 | set<pg_shard_t> might_have_unfound; // These osds might have objects on them | |
739 | // which are unfound on the primary | |
740 | epoch_t last_peering_reset; | |
741 | ||
742 | ||
743 | /* heartbeat peers */ | |
744 | void set_probe_targets(const set<pg_shard_t> &probe_set); | |
745 | void clear_probe_targets(); | |
746 | public: | |
747 | Mutex heartbeat_peer_lock; | |
748 | set<int> heartbeat_peers; | |
749 | set<int> probe_targets; | |
750 | ||
751 | /** | |
752 | * BackfillInterval | |
753 | * | |
754 | * Represents the objects in a range [begin, end) | |
755 | * | |
756 | * Possible states: | |
757 | * 1) begin == end == hobject_t() indicates the the interval is unpopulated | |
758 | * 2) Else, objects contains all objects in [begin, end) | |
759 | */ | |
760 | struct BackfillInterval { | |
761 | // info about a backfill interval on a peer | |
762 | eversion_t version; /// version at which the scan occurred | |
763 | map<hobject_t,eversion_t> objects; | |
764 | hobject_t begin; | |
765 | hobject_t end; | |
766 | ||
767 | /// clear content | |
768 | void clear() { | |
769 | *this = BackfillInterval(); | |
770 | } | |
771 | ||
772 | /// clear objects list only | |
773 | void clear_objects() { | |
774 | objects.clear(); | |
775 | } | |
776 | ||
777 | /// reinstantiate with a new start+end position and sort order | |
778 | void reset(hobject_t start) { | |
779 | clear(); | |
780 | begin = end = start; | |
781 | } | |
782 | ||
783 | /// true if there are no objects in this interval | |
784 | bool empty() const { | |
785 | return objects.empty(); | |
786 | } | |
787 | ||
788 | /// true if interval extends to the end of the range | |
789 | bool extends_to_end() const { | |
790 | return end.is_max(); | |
791 | } | |
792 | ||
793 | /// removes items <= soid and adjusts begin to the first object | |
794 | void trim_to(const hobject_t &soid) { | |
795 | trim(); | |
796 | while (!objects.empty() && | |
797 | objects.begin()->first <= soid) { | |
798 | pop_front(); | |
799 | } | |
800 | } | |
801 | ||
802 | /// Adjusts begin to the first object | |
803 | void trim() { | |
804 | if (!objects.empty()) | |
805 | begin = objects.begin()->first; | |
806 | else | |
807 | begin = end; | |
808 | } | |
809 | ||
810 | /// drop first entry, and adjust @begin accordingly | |
811 | void pop_front() { | |
812 | assert(!objects.empty()); | |
813 | objects.erase(objects.begin()); | |
814 | trim(); | |
815 | } | |
816 | ||
817 | /// dump | |
818 | void dump(Formatter *f) const { | |
819 | f->dump_stream("begin") << begin; | |
820 | f->dump_stream("end") << end; | |
821 | f->open_array_section("objects"); | |
822 | for (map<hobject_t, eversion_t>::const_iterator i = | |
823 | objects.begin(); | |
824 | i != objects.end(); | |
825 | ++i) { | |
826 | f->open_object_section("object"); | |
827 | f->dump_stream("object") << i->first; | |
828 | f->dump_stream("version") << i->second; | |
829 | f->close_section(); | |
830 | } | |
831 | f->close_section(); | |
832 | } | |
833 | }; | |
834 | ||
835 | protected: | |
836 | BackfillInterval backfill_info; | |
837 | map<pg_shard_t, BackfillInterval> peer_backfill_info; | |
838 | bool backfill_reserved; | |
839 | bool backfill_reserving; | |
840 | ||
841 | friend class OSD; | |
842 | ||
843 | public: | |
844 | set<pg_shard_t> backfill_targets; | |
845 | ||
846 | bool is_backfill_targets(pg_shard_t osd) { | |
847 | return backfill_targets.count(osd); | |
848 | } | |
849 | ||
850 | protected: | |
851 | ||
852 | /* | |
853 | * blocked request wait hierarchy | |
854 | * | |
855 | * In order to preserve request ordering we need to be careful about the | |
856 | * order in which blocked requests get requeued. Generally speaking, we | |
857 | * push the requests back up to the op_wq in reverse order (most recent | |
858 | * request first) so that they come back out again in the original order. | |
859 | * However, because there are multiple wait queues, we need to requeue | |
860 | * waitlists in order. Generally speaking, we requeue the wait lists | |
861 | * that are checked first. | |
862 | * | |
863 | * Here are the various wait lists, in the order they are used during | |
864 | * request processing, with notes: | |
865 | * | |
866 | * - waiting_for_map | |
867 | * - may start or stop blocking at any time (depending on client epoch) | |
868 | * - waiting_for_peered | |
869 | * - !is_peered() or flushes_in_progress | |
870 | * - only starts blocking on interval change; never restarts | |
871 | * - waiting_for_active | |
872 | * - !is_active() | |
873 | * - only starts blocking on interval change; never restarts | |
874 | * - waiting_for_scrub | |
875 | * - starts and stops blocking for varying intervals during scrub | |
876 | * - waiting_for_unreadable_object | |
877 | * - never restarts once object is readable (* except for EIO?) | |
878 | * - waiting_for_degraded_object | |
879 | * - never restarts once object is writeable (* except for EIO?) | |
880 | * - waiting_for_blocked_object | |
881 | * - starts and stops based on proxied op activity | |
882 | * - obc rwlocks | |
883 | * - starts and stops based on read/write activity | |
884 | * | |
885 | * Notes: | |
886 | * | |
898 | * 1. During an interval change, we requeue *everything* in the above order. | |
888 | * | |
889 | * 2. When an obc rwlock is released, we check for a scrub block and requeue | |
890 | * the op there if it applies. We ignore the unreadable/degraded/blocked | |
891 | * queues because we assume they cannot apply at that time (this is | |
892 | * probably mostly true). | |
893 | * | |
894 | * 3. The requeue_ops helper will push ops onto the waiting_for_map list if | |
895 | * it is non-empty. | |
896 | * | |
897 | * These three behaviors are generally sufficient to maintain ordering, with | |
898 | * the possible exception of cases where we make an object degraded or | |
899 | * unreadable that was previously okay, e.g. when scrub or op processing | |
900 | * encounter an unexpected error. FIXME. | |
901 | */ | |
902 | ||
903 | // pg waiters | |
904 | unsigned flushes_in_progress; | |
905 | ||
906 | // ops with newer maps than ours (or blocked behind them) | |
907 | // track these by client, since inter-request ordering doesn't otherwise | |
908 | // matter. | |
909 | unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map; | |
910 | ||
911 | // ops waiting on peered | |
912 | list<OpRequestRef> waiting_for_peered; | |
913 | ||
914 | // ops waiting on active (require peered as well) | |
915 | list<OpRequestRef> waiting_for_active; | |
916 | list<OpRequestRef> waiting_for_scrub; | |
917 | ||
918 | list<OpRequestRef> waiting_for_cache_not_full; | |
224ce89b | 919 | list<OpRequestRef> waiting_for_clean_to_primary_repair; |
7c673cae FG |
920 | map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object, |
921 | waiting_for_degraded_object, | |
922 | waiting_for_blocked_object; | |
923 | ||
924 | set<hobject_t> objects_blocked_on_cache_full; | |
925 | map<hobject_t,snapid_t> objects_blocked_on_degraded_snap; | |
926 | map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion; | |
927 | ||
928 | // Callbacks should assume pg (and nothing else) is locked | |
929 | map<hobject_t, list<Context*>> callbacks_for_degraded_object; | |
930 | ||
931 | map<eversion_t, | |
932 | list<pair<OpRequestRef, version_t> > > waiting_for_ondisk; | |
933 | ||
934 | void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m); | |
935 | void requeue_op(OpRequestRef op); | |
936 | void requeue_ops(list<OpRequestRef> &l); | |
937 | ||
938 | // stats that persist lazily | |
939 | object_stat_collection_t unstable_stats; | |
940 | ||
941 | // publish stats | |
942 | Mutex pg_stats_publish_lock; | |
943 | bool pg_stats_publish_valid; | |
944 | pg_stat_t pg_stats_publish; | |
945 | ||
946 | // for ordering writes | |
947 | ceph::shared_ptr<ObjectStore::Sequencer> osr; | |
948 | ||
949 | void _update_calc_stats(); | |
950 | void _update_blocked_by(); | |
951 | void publish_stats_to_osd(); | |
952 | void clear_publish_stats(); | |
953 | ||
954 | public: | |
955 | void clear_primary_state(); | |
956 | ||
7c673cae FG |
957 | bool is_actingbackfill(pg_shard_t osd) const { |
958 | return actingbackfill.count(osd); | |
959 | } | |
  /// true if @p osd is part of the acting set (EC pools match the exact shard slot)
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), acting, osd);
  }
  /// true if @p osd is part of the up set (EC pools match the exact shard slot)
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), up, osd);
  }
966 | static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) { | |
967 | if (ec) { | |
968 | return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd; | |
969 | } else { | |
970 | return std::find(v.begin(), v.end(), osd.osd) != v.end(); | |
971 | } | |
972 | } | |
973 | ||
974 | bool needs_recovery() const; | |
975 | bool needs_backfill() const; | |
976 | ||
c07f9fc5 FG |
977 | /// clip calculated priority to reasonable range |
978 | inline int clamp_recovery_priority(int priority); | |
7c673cae FG |
979 | /// get log recovery reservation priority |
980 | unsigned get_recovery_priority(); | |
981 | /// get backfill reservation priority | |
982 | unsigned get_backfill_priority(); | |
983 | ||
984 | void mark_clean(); ///< mark an active pg clean | |
c07f9fc5 | 985 | void change_recovery_force_mode(int new_mode, bool clear); |
7c673cae FG |
986 | |
987 | /// return [start,end) bounds for required past_intervals | |
988 | static pair<epoch_t, epoch_t> get_required_past_interval_bounds( | |
989 | const pg_info_t &info, | |
990 | epoch_t oldest_map) { | |
991 | epoch_t start = MAX( | |
992 | info.history.last_epoch_clean ? info.history.last_epoch_clean : | |
31f18b77 | 993 | info.history.epoch_pool_created, |
7c673cae FG |
994 | oldest_map); |
995 | epoch_t end = MAX( | |
996 | info.history.same_interval_since, | |
31f18b77 | 997 | info.history.epoch_pool_created); |
7c673cae FG |
998 | return make_pair(start, end); |
999 | } | |
1000 | void check_past_interval_bounds() const; | |
1001 | PastIntervals::PriorSet build_prior(); | |
1002 | ||
1003 | void remove_down_peer_info(const OSDMapRef osdmap); | |
1004 | ||
1005 | bool adjust_need_up_thru(const OSDMapRef osdmap); | |
1006 | ||
1007 | bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const; | |
1008 | virtual void dump_recovery_info(Formatter *f) const = 0; | |
1009 | ||
1010 | bool calc_min_last_complete_ondisk() { | |
1011 | eversion_t min = last_complete_ondisk; | |
1012 | assert(!actingbackfill.empty()); | |
1013 | for (set<pg_shard_t>::iterator i = actingbackfill.begin(); | |
1014 | i != actingbackfill.end(); | |
1015 | ++i) { | |
1016 | if (*i == get_primary()) continue; | |
1017 | if (peer_last_complete_ondisk.count(*i) == 0) | |
1018 | return false; // we don't have complete info | |
1019 | eversion_t a = peer_last_complete_ondisk[*i]; | |
1020 | if (a < min) | |
1021 | min = a; | |
1022 | } | |
1023 | if (min == min_last_complete_ondisk) | |
1024 | return false; | |
1025 | min_last_complete_ondisk = min; | |
1026 | return true; | |
1027 | } | |
1028 | ||
1029 | virtual void calc_trim_to() = 0; | |
1030 | ||
1031 | void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog, | |
1032 | pg_missing_t& omissing, pg_shard_t from); | |
1033 | void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, | |
1034 | pg_missing_t& omissing, pg_shard_t from); | |
1035 | bool proc_replica_info( | |
1036 | pg_shard_t from, const pg_info_t &info, epoch_t send_epoch); | |
1037 | ||
  /// Applies PG log-entry side effects (remove/stash/rollback/
  /// rollforward/trim) to this PG's backend, staging all of the work
  /// into a single ObjectStore transaction supplied by the caller.
  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;                       // owning PG; not owned by this handler
    ObjectStore::Transaction *t;  // transaction receiving all staged ops; not owned
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      // caller must only hand us entries that are actually rollbackable
      assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };
1061 | ||
1062 | void update_object_snap_mapping( | |
1063 | ObjectStore::Transaction *t, const hobject_t &soid, | |
1064 | const set<snapid_t> &snaps); | |
1065 | void clear_object_snap_mapping( | |
1066 | ObjectStore::Transaction *t, const hobject_t &soid); | |
1067 | void remove_snap_mapped_object( | |
1068 | ObjectStore::Transaction& t, const hobject_t& soid); | |
1069 | void merge_log( | |
1070 | ObjectStore::Transaction& t, pg_info_t &oinfo, | |
1071 | pg_log_t &olog, pg_shard_t from); | |
1072 | void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead); | |
1073 | bool search_for_missing( | |
1074 | const pg_info_t &oinfo, const pg_missing_t &omissing, | |
1075 | pg_shard_t fromosd, | |
1076 | RecoveryCtx*); | |
1077 | ||
1078 | void check_for_lost_objects(); | |
1079 | void forget_lost_objects(); | |
1080 | ||
1081 | void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map); | |
1082 | ||
1083 | void trim_write_ahead(); | |
1084 | ||
1085 | map<pg_shard_t, pg_info_t>::const_iterator find_best_info( | |
1086 | const map<pg_shard_t, pg_info_t> &infos, | |
1087 | bool restrict_to_up_acting, | |
1088 | bool *history_les_bound) const; | |
1089 | static void calc_ec_acting( | |
1090 | map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, | |
1091 | unsigned size, | |
1092 | const vector<int> &acting, | |
1093 | pg_shard_t acting_primary, | |
1094 | const vector<int> &up, | |
1095 | pg_shard_t up_primary, | |
1096 | const map<pg_shard_t, pg_info_t> &all_info, | |
7c673cae FG |
1097 | bool restrict_to_up_acting, |
1098 | vector<int> *want, | |
1099 | set<pg_shard_t> *backfill, | |
1100 | set<pg_shard_t> *acting_backfill, | |
1101 | pg_shard_t *want_primary, | |
1102 | ostream &ss); | |
1103 | static void calc_replicated_acting( | |
1104 | map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard, | |
1105 | unsigned size, | |
1106 | const vector<int> &acting, | |
1107 | pg_shard_t acting_primary, | |
1108 | const vector<int> &up, | |
1109 | pg_shard_t up_primary, | |
1110 | const map<pg_shard_t, pg_info_t> &all_info, | |
7c673cae FG |
1111 | bool restrict_to_up_acting, |
1112 | vector<int> *want, | |
1113 | set<pg_shard_t> *backfill, | |
1114 | set<pg_shard_t> *acting_backfill, | |
1115 | pg_shard_t *want_primary, | |
1116 | ostream &ss); | |
1117 | bool choose_acting(pg_shard_t &auth_log_shard, | |
1118 | bool restrict_to_up_acting, | |
1119 | bool *history_les_bound); | |
1120 | void build_might_have_unfound(); | |
1121 | void activate( | |
1122 | ObjectStore::Transaction& t, | |
1123 | epoch_t activation_epoch, | |
1124 | list<Context*>& tfin, | |
1125 | map<int, map<spg_t,pg_query_t> >& query_map, | |
1126 | map<int, | |
1127 | vector<pair<pg_notify_t, PastIntervals> > > *activator_map, | |
1128 | RecoveryCtx *ctx); | |
1129 | void _activate_committed(epoch_t epoch, epoch_t activation_epoch); | |
1130 | void all_activated_and_committed(); | |
1131 | ||
1132 | void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info); | |
1133 | ||
1134 | bool have_unfound() const { | |
1135 | return missing_loc.have_unfound(); | |
1136 | } | |
1137 | uint64_t get_num_unfound() const { | |
1138 | return missing_loc.num_unfound(); | |
1139 | } | |
1140 | ||
1141 | virtual void check_local() = 0; | |
1142 | ||
1143 | /** | |
1144 | * @param ops_begun returns how many recovery ops the function started | |
1145 | * @returns true if any useful work was accomplished; false otherwise | |
1146 | */ | |
1147 | virtual bool start_recovery_ops( | |
1148 | uint64_t max, | |
1149 | ThreadPool::TPHandle &handle, | |
1150 | uint64_t *ops_begun) = 0; | |
1151 | ||
1152 | void purge_strays(); | |
1153 | ||
1154 | void update_heartbeat_peers(); | |
1155 | ||
1156 | Context *finish_sync_event; | |
1157 | ||
1158 | void finish_recovery(list<Context*>& tfin); | |
1159 | void _finish_recovery(Context *c); | |
1160 | void cancel_recovery(); | |
1161 | void clear_recovery_state(); | |
1162 | virtual void _clear_recovery_state() = 0; | |
1163 | virtual void check_recovery_sources(const OSDMapRef& newmap) = 0; | |
1164 | void start_recovery_op(const hobject_t& soid); | |
1165 | void finish_recovery_op(const hobject_t& soid, bool dequeue=false); | |
1166 | ||
1167 | void split_into(pg_t child_pgid, PG *child, unsigned split_bits); | |
1168 | virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0; | |
1169 | ||
1170 | friend class C_OSD_RepModify_Commit; | |
1171 | ||
1172 | // -- backoff -- | |
1173 | Mutex backoff_lock; // orders inside Backoff::lock | |
1174 | map<hobject_t,set<BackoffRef>> backoffs; | |
1175 | ||
1176 | void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end); | |
1177 | void release_backoffs(const hobject_t& begin, const hobject_t& end); | |
  /// convenience overload: release backoffs covering just the single object o
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
1181 | void clear_backoffs(); | |
1182 | ||
1183 | void add_pg_backoff(SessionRef s) { | |
1184 | hobject_t begin = info.pgid.pgid.get_hobj_start(); | |
1185 | hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); | |
1186 | add_backoff(s, begin, end); | |
1187 | } | |
1188 | void release_pg_backoffs() { | |
1189 | hobject_t begin = info.pgid.pgid.get_hobj_start(); | |
1190 | hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); | |
1191 | release_backoffs(begin, end); | |
1192 | } | |
1193 | ||
1194 | void rm_backoff(BackoffRef b); | |
1195 | ||
1196 | // -- scrub -- | |
  /**
   * Bookkeeping for an in-progress scrub.  Holds reservation state,
   * scrub maps received from replicas, accumulated error tallies, and
   * the position of the chunky-scrub state machine.  Cleared between
   * scrub rounds via reset().
   */
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;  // peers that granted our scrub reservation
    bool reserved, reserve_failed;   // reservation held / reservation was rejected
    epoch_t epoch_start;             // epoch the current scrub round began in

    // common to both scrubs
    bool active;
    int waiting_on;                  // count of outstanding replies we still expect
    set<pg_shard_t> waiting_on_whom; // the specific shards we are waiting on
    int shallow_errors;
    int deep_errors;
    int fixed;                       // number of errors repaired so far
    ScrubMap primary_scrubmap;
    map<pg_shard_t, ScrubMap> received_maps;  // scrub maps collected from replicas
    OpRequestRef active_rep_scrub;   // in-flight replica scrub request, if any
    utime_t scrub_reg_stamp;  // stamp we registered for

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority;

    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub: [start, end) bounds the chunk currently being scrubbed
    hobject_t start, end;
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
    } state;

    std::unique_ptr<Scrub::Store> store;  // store for scrub results
    // deep scrub
    bool deep;
    uint32_t seed;   // seed passed to scrub-map construction (see build_scrub_map_chunk)

    // contexts to run when the current scrub round winds down (see reset())
    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    // Fire every queued callback with result 0.  We swap the list out
    // first so callbacks registered while running are kept for later.
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    /// human-readable name for a chunky-scrub state (NULL if unrecognized)
    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch( state )
      {
        case INACTIVE: ret = "INACTIVE"; break;
        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
        case BUILD_MAP: ret = "BUILD_MAP"; break;
        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
        case FINISH: ret = "FINISH"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // classic (non chunk) scrubs block all writes
    // chunky scrubs only block writes to a range
    bool write_blocked_by_scrub(const hobject_t &soid) {
      return (soid >= start && soid < end);
    }

    // clear all state
    void reset() {
      active = false;
      waiting_on = 0;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      fixed = 0;
      deep = false;
      seed = 0;
      // fire any pending completion callbacks before dropping error state
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;
1344 | ||
1345 | bool scrub_after_recovery; | |
1346 | ||
1347 | int active_pushes; | |
1348 | ||
1349 | void repair_object( | |
1350 | const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers, | |
1351 | pg_shard_t bad_peer); | |
1352 | ||
1353 | void scrub(epoch_t queued, ThreadPool::TPHandle &handle); | |
1354 | void chunky_scrub(ThreadPool::TPHandle &handle); | |
1355 | void scrub_compare_maps(); | |
1356 | /** | |
1357 | * return true if any inconsistency/missing is repaired, false otherwise | |
1358 | */ | |
1359 | bool scrub_process_inconsistent(); | |
31f18b77 | 1360 | bool ops_blocked_by_scrub() const; |
7c673cae FG |
1361 | void scrub_finish(); |
1362 | void scrub_clear_state(); | |
1363 | void _scan_snaps(ScrubMap &map); | |
224ce89b | 1364 | void _repair_oinfo_oid(ScrubMap &map); |
7c673cae FG |
1365 | void _scan_rollback_obs( |
1366 | const vector<ghobject_t> &rollback_obs, | |
1367 | ThreadPool::TPHandle &handle); | |
1368 | void _request_scrub_map(pg_shard_t replica, eversion_t version, | |
1369 | hobject_t start, hobject_t end, bool deep, | |
1370 | uint32_t seed); | |
1371 | int build_scrub_map_chunk( | |
1372 | ScrubMap &map, | |
1373 | hobject_t start, hobject_t end, bool deep, uint32_t seed, | |
1374 | ThreadPool::TPHandle &handle); | |
1375 | /** | |
1376 | * returns true if [begin, end) is good to scrub at this time | |
1377 | * a false return value obliges the implementer to requeue scrub when the | |
1378 | * condition preventing scrub clears | |
1379 | */ | |
1380 | virtual bool _range_available_for_scrub( | |
1381 | const hobject_t &begin, const hobject_t &end) = 0; | |
1382 | virtual void scrub_snapshot_metadata( | |
1383 | ScrubMap &map, | |
1384 | const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { } | |
1385 | virtual void _scrub_clear_state() { } | |
1386 | virtual void _scrub_finish() { } | |
1387 | virtual void split_colls( | |
1388 | spg_t child, | |
1389 | int split_bits, | |
1390 | int seed, | |
1391 | const pg_pool_t *pool, | |
1392 | ObjectStore::Transaction *t) = 0; | |
1393 | void clear_scrub_reserved(); | |
1394 | void scrub_reserve_replicas(); | |
1395 | void scrub_unreserve_replicas(); | |
1396 | bool scrub_all_replicas_reserved() const; | |
1397 | bool sched_scrub(); | |
1398 | void reg_next_scrub(); | |
1399 | void unreg_next_scrub(); | |
1400 | ||
1401 | void replica_scrub( | |
1402 | OpRequestRef op, | |
1403 | ThreadPool::TPHandle &handle); | |
1404 | void do_replica_scrub_map(OpRequestRef op); | |
1405 | void sub_op_scrub_map(OpRequestRef op); | |
1406 | ||
1407 | void handle_scrub_reserve_request(OpRequestRef op); | |
1408 | void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from); | |
1409 | void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from); | |
1410 | void handle_scrub_reserve_release(OpRequestRef op); | |
1411 | ||
1412 | void reject_reservation(); | |
1413 | void schedule_backfill_full_retry(); | |
1414 | void schedule_recovery_full_retry(); | |
1415 | ||
1416 | // -- recovery state -- | |
1417 | ||
  /**
   * Context that, when fired, queues peering event EVT on the PG.
   * Takes the PG lock around the enqueue; the single epoch captured at
   * construction is used as both epoch_sent and epoch_requested.
   * Holding a PGRef keeps the PG alive until the context fires.
   */
  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {  // r is ignored
      pg->lock();
      pg->queue_peering_event(PG::CephPeeringEvtRef(
                                new PG::CephPeeringEvt(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };
1435 | ||
  /**
   * Wrapper pairing a boost::statechart peering event with the epochs
   * it was sent/requested at.  A human-readable description for logging
   * is rendered eagerly in the constructor.
   */
  class CephPeeringEvt {
    epoch_t epoch_sent;
    epoch_t epoch_requested;
    boost::intrusive_ptr< const boost::statechart::event_base > evt;
    string desc;  // pre-rendered log description, built at construction
  public:
    MEMPOOL_CLASS_HELPERS();
    template <class T>
    CephPeeringEvt(epoch_t epoch_sent,
		   epoch_t epoch_requested,
		   const T &evt_) :
      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
      evt(evt_.intrusive_from_this()) {
      stringstream out;
      out << "epoch_sent: " << epoch_sent
	  << " epoch_requested: " << epoch_requested << " ";
      evt_.print(&out);
      desc = out.str();
    }
    epoch_t get_epoch_sent() { return epoch_sent; }
    epoch_t get_epoch_requested() { return epoch_requested; }
    const boost::statechart::event_base &get_event() { return *evt; }
    string get_desc() { return desc; }
  };
1460 | typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef; | |
1461 | list<CephPeeringEvtRef> peering_queue; // op queue | |
1462 | list<CephPeeringEvtRef> peering_waiters; | |
1463 | ||
  /// event requesting that current peering state be dumped to Formatter f
  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;  // not owned; caller retains ownership
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };
1471 | ||
  /// event: pg_info_t received from shard `from` (msg_epoch = epoch of the message)
  struct MInfoRec : boost::statechart::event< MInfoRec > {
    pg_shard_t from;
    pg_info_t info;
    epoch_t msg_epoch;
    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
      from(from), info(info), msg_epoch(msg_epoch) {}
    void print(std::ostream *out) const {
      *out << "MInfoRec from " << from << " info: " << info;
    }
  };
1482 | ||
  /// event: MOSDPGLog message received from shard `from`
  struct MLogRec : boost::statechart::event< MLogRec > {
    pg_shard_t from;
    boost::intrusive_ptr<MOSDPGLog> msg;  // ref-counted; shared with the messenger
    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
      from(from), msg(msg) {}
    void print(std::ostream *out) const {
      *out << "MLogRec from " << from;
    }
  };
1492 | ||
  /// event: pg_notify_t (plus peer feature bits) received from shard `from`
  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
    pg_shard_t from;
    pg_notify_t notify;
    uint64_t features;  // feature bits advertised by the sender
    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
      from(from), notify(notify), features(f) {}
    void print(std::ostream *out) const {
      *out << "MNotifyRec from " << from << " notify: " << notify
	   << " features: 0x" << hex << features << dec;
    }
  };
1504 | ||
  /// event: pg_query_t received from shard `from` at query_epoch
  struct MQuery : boost::statechart::event< MQuery > {
    pg_shard_t from;
    pg_query_t query;
    epoch_t query_epoch;
    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
      from(from), query(query), query_epoch(query_epoch) {}
    void print(std::ostream *out) const {
      *out << "MQuery from " << from
	   << " query_epoch " << query_epoch
	   << " query: " << query;
    }
  };
1517 | ||
  /// event: a new osdmap has arrived; carries the previous map plus the
  /// new up/acting sets and their primaries
  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;   // the new map
    OSDMapRef lastmap;  // the map we are advancing from
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };
1536 | ||
  /// payload-less event prompting the PG to act on the current map
  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  /// event directing the PG to activate as of activation_epoch
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
				   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
  /// event requesting a backfill reservation at the given priority
  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
    unsigned priority;  // reservation priority (see get_backfill_priority)
    explicit RequestBackfillPrio(unsigned prio) :
      boost::statechart::event< RequestBackfillPrio >(),
      priority(prio) {}
    void print(std::ostream *out) const {
      *out << "RequestBackfillPrio: priority " << priority;
    }
  };
  /// Declare a payload-less statechart event type T that prints as its own name.
#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
    T() : boost::statechart::event< T >() {} \
    void print(std::ostream *out) const { \
      *out << #T; \
    } \
  };
1566 | TrivialEvent(Initialize) | |
1567 | TrivialEvent(Load) | |
1568 | TrivialEvent(GotInfo) | |
1569 | TrivialEvent(NeedUpThru) | |
1570 | TrivialEvent(NullEvt) | |
1571 | TrivialEvent(FlushedEvt) | |
1572 | TrivialEvent(Backfilled) | |
1573 | TrivialEvent(LocalBackfillReserved) | |
1574 | TrivialEvent(RemoteBackfillReserved) | |
1575 | TrivialEvent(RemoteReservationRejected) | |
224ce89b | 1576 | TrivialEvent(CancelBackfill) |
7c673cae FG |
1577 | TrivialEvent(RequestBackfill) |
1578 | TrivialEvent(RequestRecovery) | |
1579 | TrivialEvent(RecoveryDone) | |
1580 | TrivialEvent(BackfillTooFull) | |
1581 | TrivialEvent(RecoveryTooFull) | |
224ce89b | 1582 | TrivialEvent(CancelRecovery) |
7c673cae FG |
1583 | |
1584 | TrivialEvent(AllReplicasRecovered) | |
1585 | TrivialEvent(DoRecovery) | |
1586 | TrivialEvent(LocalRecoveryReserved) | |
1587 | TrivialEvent(RemoteRecoveryReserved) | |
1588 | TrivialEvent(AllRemotesReserved) | |
1589 | TrivialEvent(AllBackfillsReserved) | |
1590 | TrivialEvent(GoClean) | |
1591 | ||
1592 | TrivialEvent(AllReplicasActivated) | |
1593 | ||
1594 | TrivialEvent(IntervalFlush) | |
1595 | ||
1596 | /* Encapsulates PG recovery process */ | |
1597 | class RecoveryState { | |
1598 | void start_handle(RecoveryCtx *new_ctx); | |
1599 | void end_handle(); | |
1600 | public: | |
1601 | void begin_block_outgoing(); | |
1602 | void end_block_outgoing(); | |
1603 | void clear_blocked_outgoing(); | |
1604 | private: | |
1605 | ||
1606 | /* States */ | |
1607 | struct Initial; | |
    /**
     * The boost::statechart machine driving PG peering/recovery,
     * rooted at Initial.  Also provides the accessors state methods use
     * to reach the RecoveryCtx for the event currently being handled;
     * each accessor asserts that a ctx (and the relevant field) is
     * actually present.
     */
    class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
      RecoveryState *state;  // owning RecoveryState (holds the current rctx)
    public:
      PG *pg;

      // event accounting; reset via clear_event_counters()
      utime_t event_time;
      uint64_t event_count;

      void clear_event_counters() {
	event_time = utime_t();
	event_count = 0;
      }

      void log_enter(const char *state_name);
      void log_exit(const char *state_name, utime_t duration);

      RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}

      /* Accessor functions for state methods */
      ObjectStore::Transaction* get_cur_transaction() {
	assert(state->rctx);
	assert(state->rctx->transaction);
	return state->rctx->transaction;
      }

      /// record a query to be sent to shard `to` for our pg
      void send_query(pg_shard_t to, const pg_query_t &query) {
	assert(state->rctx);
	assert(state->rctx->query_map);
	(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
	  query;
      }

      map<int, map<spg_t, pg_query_t> > *get_query_map() {
	assert(state->rctx);
	assert(state->rctx->query_map);
	return state->rctx->query_map;
      }

      map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
	assert(state->rctx);
	assert(state->rctx->info_map);
	return state->rctx->info_map;
      }

      list< Context* > *get_on_safe_context_list() {
	assert(state->rctx);
	assert(state->rctx->on_safe);
	return &(state->rctx->on_safe->contexts);
      }

      list< Context * > *get_on_applied_context_list() {
	assert(state->rctx);
	assert(state->rctx->on_applied);
	return &(state->rctx->on_applied->contexts);
      }

      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }

      /// record a notify (info + past intervals) to be sent to shard `to`
      void send_notify(pg_shard_t to,
		       const pg_notify_t &info, const PastIntervals &pi) {
	assert(state->rctx);
	assert(state->rctx->notify_list);
	(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
      }
    };
1673 | friend class RecoveryMachine; | |
1674 | ||
1675 | /* States */ | |
1676 | ||
    /// terminal state: entered (via the event_base catch-all
    /// transitions below) when a state receives an event it cannot handle
    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };
1680 | ||
1681 | struct Reset; | |
1682 | ||
    /// Start state of the machine.  Initialize transitions to Reset;
    /// Load/NullEvt are handled via custom reactions; any other
    /// unlisted event transitions to Crashed.
    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);
      void exit();

      typedef boost::mpl::list <
	boost::statechart::transition< Initialize, Reset >,
	boost::statechart::custom_reaction< Load >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;

      boost::statechart::result react(const Load&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      // fallback for custom reactions without a dedicated overload (NullEvt)
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
1702 | ||
    /// State entered while the PG's peering state is being reset
    /// (e.g. after Initialize or an interval change).  Unlisted events
    /// transition to Crashed.
    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);
      void exit();

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::custom_reaction< FlushedEvt >,
	boost::statechart::custom_reaction< IntervalFlush >,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      // fallback for custom reactions without a dedicated overload (NullEvt)
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
1725 | ||
  struct Start;

  /// Outer state covering the PG's normal operating lifetime; its
  /// initial inner state is Start, which immediately dispatches to
  /// Primary or Stray.
  struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
    explicit Started(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< AdvMap >,
      boost::statechart::custom_reaction< NullEvt >,
      boost::statechart::custom_reaction< FlushedEvt >,
      boost::statechart::custom_reaction< IntervalFlush >,
      boost::statechart::transition< boost::statechart::event_base, Crashed >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const AdvMap&);
    boost::statechart::result react(const FlushedEvt&);
    boost::statechart::result react(const IntervalFlush&);
    // Unlisted events are discarded rather than crashing the machine.
    boost::statechart::result react(const boost::statechart::event_base&) {
      return discard_event();
    }
  };
1748 | ||
  /// Event: this OSD is primary for the PG in the current interval.
  struct MakePrimary : boost::statechart::event< MakePrimary > {
    MakePrimary() : boost::statechart::event< MakePrimary >() {}
  };
  /// Event: this OSD holds the PG but is not in the acting set.
  struct MakeStray : boost::statechart::event< MakeStray > {
    MakeStray() : boost::statechart::event< MakeStray >() {}
  };
  struct Primary;
  struct Stray;
1757 | ||
  /// Transient dispatch state inside Started: immediately routes to
  /// Primary or Stray depending on which event the constructor posts.
  struct Start : boost::statechart::state< Start, Started >, NamedState {
    explicit Start(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::transition< MakePrimary, Primary >,
      boost::statechart::transition< MakeStray, Stray >
      > reactions;
  };
1767 | ||
  struct Peering;
  struct WaitActingChange;
  /// Event: the calculated acting set differs from the current one;
  /// primary must wait for the monitors to change it.
  struct NeedActingChange : boost::statechart::event< NeedActingChange > {
    NeedActingChange() : boost::statechart::event< NeedActingChange >() {}
  };
  struct Incomplete;
  /// Event: peering found no complete authoritative log/info.
  struct IsIncomplete : boost::statechart::event< IsIncomplete > {
    IsIncomplete() : boost::statechart::event< IsIncomplete >() {}
  };
  struct Down;
  /// Event: too few up OSDs to complete peering.
  struct IsDown : boost::statechart::event< IsDown > {
    IsDown() : boost::statechart::event< IsDown >() {}
  };
1781 | ||
  /// Superstate for the primary OSD of the PG; starts in Peering.
  /// Moves to WaitActingChange when a different acting set is needed.
  struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
    explicit Primary(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< ActMap >,
      boost::statechart::custom_reaction< MNotifyRec >,
      boost::statechart::transition< NeedActingChange, WaitActingChange >
      > reactions;
    boost::statechart::result react(const ActMap&);
    boost::statechart::result react(const MNotifyRec&);
  };
1794 | ||
  /// Primary is waiting for the monitors to install the requested
  /// acting set; peering messages received meanwhile are handled by
  /// custom reactions (see PG.cc).
  struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
			    NamedState {
    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< AdvMap >,
      boost::statechart::custom_reaction< MLogRec >,
      boost::statechart::custom_reaction< MInfoRec >,
      boost::statechart::custom_reaction< MNotifyRec >
      > reactions;
    explicit WaitActingChange(my_context ctx);
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const AdvMap&);
    boost::statechart::result react(const MLogRec&);
    boost::statechart::result react(const MInfoRec&);
    boost::statechart::result react(const MNotifyRec&);
    void exit();
  };
1812 | ||
  struct GetInfo;
  struct Active;

  /// Peering superstate (initial inner state GetInfo): gathers info
  /// and logs from prior-interval OSDs.  Transitions to Active on
  /// Activate.
  struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
    /// OSDs from past intervals that must be queried/considered
    PastIntervals::PriorSet prior_set;
    bool history_les_bound; //< need osd_find_best_info_ignore_history_les

    explicit Peering(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::transition< Activate, Active >,
      boost::statechart::custom_reaction< AdvMap >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const AdvMap &advmap);
  };
1831 | ||
  struct WaitLocalRecoveryReserved;
  struct Activating;
  /// Superstate for an active primary (initial inner state
  /// Activating); recovery/backfill substates live underneath it.
  struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
    explicit Active(my_context ctx);
    void exit();

    // Shard sets computed on entry (const for the lifetime of the
    // state); which peers need recovery/backfill reservations.
    const set<pg_shard_t> remote_shards_to_reserve_recovery;
    const set<pg_shard_t> remote_shards_to_reserve_backfill;
    bool all_replicas_activated;

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< ActMap >,
      boost::statechart::custom_reaction< AdvMap >,
      boost::statechart::custom_reaction< MInfoRec >,
      boost::statechart::custom_reaction< MNotifyRec >,
      boost::statechart::custom_reaction< MLogRec >,
      boost::statechart::custom_reaction< Backfilled >,
      boost::statechart::custom_reaction< AllReplicasActivated >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const ActMap&);
    boost::statechart::result react(const AdvMap&);
    boost::statechart::result react(const MInfoRec& infoevt);
    boost::statechart::result react(const MNotifyRec& notevt);
    boost::statechart::result react(const MLogRec& logevt);
    // Backfilled is consumed by inner states; at this level it is a no-op.
    boost::statechart::result react(const Backfilled&) {
      return discard_event();
    }
    boost::statechart::result react(const AllReplicasActivated&);
  };
1863 | ||
  /// All objects replicated/recovered; leaves only if new recovery is
  /// requested (DoRecovery).
  struct Clean : boost::statechart::state< Clean, Active >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
      > reactions;
    explicit Clean(my_context ctx);
    void exit();
  };
1871 | ||
  /// Recovery finished; waiting to be declared clean.  Once all
  /// replicas are activated, posts GoClean to move to Clean.
  struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< GoClean, Clean >,
      boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
      boost::statechart::custom_reaction< AllReplicasActivated >
      > reactions;
    explicit Recovered(my_context ctx);
    void exit();
    boost::statechart::result react(const AllReplicasActivated&) {
      // Queue GoClean for ourselves, but let outer states (Active)
      // also see AllReplicasActivated.
      post_event(GoClean());
      return forward_event();
    }
  };
1885 | ||
  /// Backfill in progress; Backfilled completes it (to Recovered),
  /// CancelBackfill/RemoteReservationRejected are handled in PG.cc.
  struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< Backfilled, Recovered >,
      boost::statechart::custom_reaction< CancelBackfill >,
      boost::statechart::custom_reaction< RemoteReservationRejected >
      > reactions;
    explicit Backfilling(my_context ctx);
    boost::statechart::result react(const RemoteReservationRejected& evt);
    boost::statechart::result react(const CancelBackfill& evt);
    void exit();
  };
1897 | ||
  /// Requesting backfill reservations from remote shards one at a
  /// time; once all are granted (AllBackfillsReserved) start
  /// Backfilling.
  struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::custom_reaction< RemoteBackfillReserved >,
      boost::statechart::custom_reaction< RemoteReservationRejected >,
      boost::statechart::transition< AllBackfillsReserved, Backfilling >
      > reactions;
    /// iterator into Active::remote_shards_to_reserve_backfill —
    /// TODO confirm; tracks which shard we are currently asking
    set<pg_shard_t>::const_iterator backfill_osd_it;
    explicit WaitRemoteBackfillReserved(my_context ctx);
    void exit();
    boost::statechart::result react(const RemoteBackfillReserved& evt);
    boost::statechart::result react(const RemoteReservationRejected& evt);
  };
1910 | ||
  /// Waiting for the local reserver to grant a backfill slot; then
  /// move on to reserving the remote shards.
  struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
      > reactions;
    explicit WaitLocalBackfillReserved(my_context ctx);
    void exit();
  };
1918 | ||
  /// No backfill in progress; RequestBackfill starts the local
  /// reservation sequence.  Stray reservation replies are handled in
  /// PG.cc.
  struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
      boost::statechart::custom_reaction< RemoteBackfillReserved >,
      boost::statechart::custom_reaction< RemoteReservationRejected >
      > reactions;
    explicit NotBackfilling(my_context ctx);
    void exit();
    boost::statechart::result react(const RemoteBackfillReserved& evt);
    boost::statechart::result react(const RemoteReservationRejected& evt);
  };
1930 | ||
  /// No recovery in progress; DoRecovery starts the reservation
  /// sequence.
  struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
      boost::statechart::custom_reaction< CancelRecovery >
      > reactions;
    explicit NotRecovering(my_context ctx);
    // Nothing to cancel when we are not recovering.
    boost::statechart::result react(const CancelRecovery& evt) {
      /* no-op */
      return discard_event();
    }
    void exit();
  };
1943 | ||
  struct RepNotRecovering;
  /// Superstate for an active non-primary (replica) shard; initial
  /// inner state RepNotRecovering.  Reactions are implemented in
  /// PG.cc.
  struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
    explicit ReplicaActive(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< ActMap >,
      boost::statechart::custom_reaction< MQuery >,
      boost::statechart::custom_reaction< MInfoRec >,
      boost::statechart::custom_reaction< MLogRec >,
      boost::statechart::custom_reaction< Activate >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const MInfoRec& infoevt);
    boost::statechart::result react(const MLogRec& logevt);
    boost::statechart::result react(const ActMap&);
    boost::statechart::result react(const MQuery&);
    boost::statechart::result react(const Activate&);
  };
1964 | ||
  /// Replica is serving a recovery/backfill reservation; returns to
  /// RepNotRecovering when done or when the reservation is rejected.
  struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::transition< RecoveryDone, RepNotRecovering >,
      boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
      boost::statechart::custom_reaction< BackfillTooFull >
      > reactions;
    explicit RepRecovering(my_context ctx);
    boost::statechart::result react(const BackfillTooFull &evt);
    void exit();
  };
1975 | ||
  /// Replica is deciding whether to grant a backfill reservation
  /// requested by the primary (see reactions in PG.cc).
  struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::custom_reaction< RemoteBackfillReserved >,
      boost::statechart::custom_reaction< RemoteReservationRejected >
      > reactions;
    explicit RepWaitBackfillReserved(my_context ctx);
    void exit();
    boost::statechart::result react(const RemoteBackfillReserved &evt);
    boost::statechart::result react(const RemoteReservationRejected &evt);
  };
1986 | ||
  /// Replica is waiting on its local reserver for a recovery slot
  /// requested by the primary.
  struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
    typedef boost::mpl::list<
      boost::statechart::custom_reaction< RemoteRecoveryReserved >
      > reactions;
    explicit RepWaitRecoveryReserved(my_context ctx);
    void exit();
    boost::statechart::result react(const RemoteRecoveryReserved &evt);
  };
1995 | ||
  /// Idle replica state: waits for the primary to request a recovery
  /// or backfill reservation.
  struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
    typedef boost::mpl::list<
      boost::statechart::custom_reaction< RequestBackfillPrio >,
      boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
      boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
      > reactions;
    explicit RepNotRecovering(my_context ctx);
    boost::statechart::result react(const RequestBackfillPrio &evt);
    void exit();
  };
2006 | ||
  /// Primary is actively recovering objects to replicas; may be
  /// cancelled (CancelRecovery) or escalate to backfill
  /// (RequestBackfill).
  struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
    typedef boost::mpl::list <
      boost::statechart::custom_reaction< AllReplicasRecovered >,
      boost::statechart::custom_reaction< CancelRecovery >,
      boost::statechart::custom_reaction< RequestBackfill >
      > reactions;
    explicit Recovering(my_context ctx);
    void exit();
    /// drop remote recovery reservations; `cancel` distinguishes a
    /// cancelled recovery from a completed one (see PG.cc)
    void release_reservations(bool cancel = false);
    boost::statechart::result react(const AllReplicasRecovered &evt);
    boost::statechart::result react(const CancelRecovery& evt);
    boost::statechart::result react(const RequestBackfill &evt);
  };
2020 | ||
  /// Requesting recovery reservations from remote shards; once all
  /// are granted (AllRemotesReserved) start Recovering.
  struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
    typedef boost::mpl::list <
      boost::statechart::custom_reaction< RemoteRecoveryReserved >,
      boost::statechart::transition< AllRemotesReserved, Recovering >
      > reactions;
    /// iterator over Active::remote_shards_to_reserve_recovery —
    /// TODO confirm; tracks the shard currently being asked
    set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
    explicit WaitRemoteRecoveryReserved(my_context ctx);
    boost::statechart::result react(const RemoteRecoveryReserved &evt);
    void exit();
  };
2031 | ||
  /// Waiting for the local reserver to grant a recovery slot;
  /// RecoveryTooFull is handled in PG.cc.
  struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
    typedef boost::mpl::list <
      boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
      boost::statechart::custom_reaction< RecoveryTooFull >
      > reactions;
    explicit WaitLocalRecoveryReserved(my_context ctx);
    void exit();
    boost::statechart::result react(const RecoveryTooFull &evt);
  };
2041 | ||
  /// Initial inner state of Active: waiting to learn whether recovery
  /// or backfill is needed, or whether all replicas are already
  /// recovered.
  struct Activating : boost::statechart::state< Activating, Active >, NamedState {
    typedef boost::mpl::list <
      boost::statechart::transition< AllReplicasRecovered, Recovered >,
      boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
      boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
      > reactions;
    explicit Activating(my_context ctx);
    void exit();
  };
2051 | ||
  /// This OSD holds the PG but is not in the acting set; it answers
  /// queries from the primary and may be told to become a replica or
  /// be purged (reactions in PG.cc).
  struct Stray : boost::statechart::state< Stray, Started >, NamedState {
    /// queries received before we were ready to answer, keyed by
    /// source osd; the epoch records when the query was sent
    map<int, pair<pg_query_t, epoch_t> > pending_queries;

    explicit Stray(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< MQuery >,
      boost::statechart::custom_reaction< MLogRec >,
      boost::statechart::custom_reaction< MInfoRec >,
      boost::statechart::custom_reaction< ActMap >,
      boost::statechart::custom_reaction< RecoveryDone >
      > reactions;
    boost::statechart::result react(const MQuery& query);
    boost::statechart::result react(const MLogRec& logevt);
    boost::statechart::result react(const MInfoRec& infoevt);
    boost::statechart::result react(const ActMap&);
    // Nothing to do for RecoveryDone as a stray.
    boost::statechart::result react(const RecoveryDone&) {
      return discard_event();
    }
  };
2073 | ||
  struct GetLog;

  /// First peering phase: query pg_info_t from every OSD in the prior
  /// set; once all infos are in (GotInfo) proceed to GetLog.  If the
  /// PG cannot peer, IsDown moves to Down.
  struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
    /// shards we have sent an info query to and not yet heard from
    set<pg_shard_t> peer_info_requested;

    explicit GetInfo(my_context ctx);
    void exit();
    void get_infos();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::transition< GotInfo, GetLog >,
      boost::statechart::custom_reaction< MNotifyRec >,
      boost::statechart::transition< IsDown, Down >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const MNotifyRec& infoevt);
  };
2092 | ||
  /// Event: the authoritative log has been received (or we already
  /// had it) — see GetLog::react in PG.cc.
  struct GotLog : boost::statechart::event< GotLog > {
    GotLog() : boost::statechart::event< GotLog >() {}
  };
2096 | ||
  /// Second peering phase: obtain the authoritative log from the
  /// best-info shard.  IsIncomplete moves to Incomplete when no
  /// usable log exists.
  struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
    /// shard chosen as having the authoritative log
    pg_shard_t auth_log_shard;
    /// the log message received from auth_log_shard, held until processed
    boost::intrusive_ptr<MOSDPGLog> msg;

    explicit GetLog(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< MLogRec >,
      boost::statechart::custom_reaction< GotLog >,
      boost::statechart::custom_reaction< AdvMap >,
      boost::statechart::transition< IsIncomplete, Incomplete >
      > reactions;
    boost::statechart::result react(const AdvMap&);
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const MLogRec& logevt);
    boost::statechart::result react(const GotLog&);
  };
2116 | ||
  struct WaitUpThru;

  /// Third peering phase: collect missing-object sets from peers.
  /// NeedUpThru moves to WaitUpThru when an up_thru update from the
  /// monitors is required before activating.
  struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
    /// shards we have asked for their missing set and not yet heard from
    set<pg_shard_t> peer_missing_requested;

    explicit GetMissing(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< MLogRec >,
      boost::statechart::transition< NeedUpThru, WaitUpThru >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const MLogRec& logevt);
  };
2133 | ||
  /// Waiting for the monitors to record our up_thru before the PG can
  /// finish peering (reactions in PG.cc).
  struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
    explicit WaitUpThru(my_context ctx);
    void exit();

    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >,
      boost::statechart::custom_reaction< ActMap >,
      boost::statechart::custom_reaction< MLogRec >
      > reactions;
    boost::statechart::result react(const QueryState& q);
    boost::statechart::result react(const ActMap& am);
    boost::statechart::result react(const MLogRec& logrec);
  };
2147 | ||
  /// Peering blocked: not enough up OSDs from a relevant prior
  /// interval; only answers state queries until the map changes.
  struct Down : boost::statechart::state< Down, Peering>, NamedState {
    explicit Down(my_context ctx);
    typedef boost::mpl::list <
      boost::statechart::custom_reaction< QueryState >
      > reactions;
    boost::statechart::result react(const QueryState& infoevt);
    void exit();
  };
2156 | ||
  /// Peering blocked: no peer has a complete-enough log/info to
  /// recover from; waits for new notifies or map changes (reactions
  /// in PG.cc).
  struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
    typedef boost::mpl::list <
      boost::statechart::custom_reaction< AdvMap >,
      boost::statechart::custom_reaction< MNotifyRec >,
      boost::statechart::custom_reaction< QueryState >
      > reactions;
    explicit Incomplete(my_context ctx);
    boost::statechart::result react(const AdvMap &advmap);
    boost::statechart::result react(const MNotifyRec& infoevt);
    boost::statechart::result react(const QueryState& infoevt);
    void exit();
  };
2169 | ||
2170 | ||
  /// the boost::statechart machine driving the states declared above
  RecoveryMachine machine;
  /// back-pointer to the owning PG (not owned)
  PG *pg;

  /// context passed in by state machine caller
  RecoveryCtx *orig_ctx;

  /// populated if we are buffering messages pending a flush
  boost::optional<BufferedRecoveryMessages> messages_pending_flush;

  /**
   * populated between start_handle() and end_handle(), points into
   * the message lists for messages_pending_flush while blocking messages
   * or into orig_ctx otherwise
   */
  boost::optional<RecoveryCtx> rctx;

public:
  // initiate() enters the machine's initial state (Initial) right away.
  explicit RecoveryState(PG *pg)
    : machine(this, pg), pg(pg), orig_ctx(0) {
    machine.initiate();
  }
2192 | ||
  /// Deliver a raw statechart event, bracketing it with
  /// start_handle()/end_handle() so rctx is set up for the reactions.
  void handle_event(const boost::statechart::event_base &evt,
		    RecoveryCtx *rctx) {
    start_handle(rctx);
    machine.process_event(evt);
    end_handle();
  }
2199 | ||
  /// Deliver a queued peering event (unwraps the underlying
  /// statechart event), with the same start/end bracketing.
  void handle_event(CephPeeringEvtRef evt,
		    RecoveryCtx *rctx) {
    start_handle(rctx);
    machine.process_event(evt->get_event());
    end_handle();
  }
2206 | ||
2207 | } recovery_state; | |
2208 | ||
2209 | ||
2210 | public: | |
public:
  PG(OSDService *o, OSDMapRef curmap,
     const PGPool &pool, spg_t p);
  ~PG() override;

private:
  // Prevent copying
  explicit PG(const PG& rhs);
  PG& operator=(const PG& rhs);
  const spg_t pg_id;
  /// intersection of feature bits across peers, narrowed via
  /// apply_peer_features(); reset by reset_min_peer_features()
  uint64_t peer_features;
  /// min feature bits over the acting / up+acting sets — presumably
  /// maintained alongside peer_features; setters not visible here
  uint64_t acting_features;
  uint64_t upacting_features;

  /// NOTE(review): purpose not evident from this chunk — the writer
  /// of this field is elsewhere in the file; verify before relying on it
  epoch_t last_epoch;
7c673cae FG |
public:
  const spg_t& get_pgid() const { return pg_id; }

  /// start a fresh feature intersection from the full supported set
  void reset_min_peer_features() {
    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  }
  uint64_t get_min_peer_features() const { return peer_features; }
  /// narrow the intersection with one peer's advertised features
  void apply_peer_features(uint64_t f) { peer_features &= f; }

  uint64_t get_min_acting_features() const { return acting_features; }
  uint64_t get_min_upacting_features() const { return upacting_features; }

  /// true unless the osdmap RECOVERY_DELETES flag is set (deletes are
  /// then deferred to recovery instead of done during peering)
  bool perform_deletes_during_peering() const {
    return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
  }
2240 | |
  /**
   * Rebuild up/acting vectors, the corresponding pg_shard_t sets, and
   * the up/acting primaries for a new interval.
   *
   * For EC pools the vector position is the shard id, so shard-aware
   * pg_shard_t values are built; for replicated pools NO_SHARD is used
   * and the primaries can be constructed directly from the osd ids.
   */
  void init_primary_up_acting(
    const vector<int> &newup,
    const vector<int> &newacting,
    int new_up_primary,
    int new_acting_primary) {
    actingset.clear();
    acting = newacting;
    for (uint8_t i = 0; i < acting.size(); ++i) {
      // CRUSH_ITEM_NONE marks a hole in the acting set — skip it
      if (acting[i] != CRUSH_ITEM_NONE)
	actingset.insert(
	  pg_shard_t(
	    acting[i],
	    pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    upset.clear();
    up = newup;
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] != CRUSH_ITEM_NONE)
	upset.insert(
	  pg_shard_t(
	    up[i],
	    pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    if (!pool.info.ec_pool()) {
      // replicated pool: shard ids are irrelevant, done
      up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
      primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
      return;
    }
    // EC pool: locate the primaries' positions to recover their shard ids
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
	up_primary = pg_shard_t(up[i], shard_id_t(i));
	break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
	primary = pg_shard_t(acting[i], shard_id_t(i));
	break;
      }
    }
    // the supplied primaries must actually appear in up/acting
    assert(up_primary.osd == new_up_primary);
    assert(primary.osd == new_acting_primary);
  }
  pg_shard_t get_primary() const { return primary; }

  int get_role() const { return role; }
  void set_role(int r) { role = r; }

  bool is_primary() const { return pg_whoami == primary; }
  bool is_replica() const { return role > 0; }

  epoch_t get_last_peering_reset() const { return last_peering_reset; }

  // PG_STATE_* bitmask helpers over the `state` field
  //int get_state() const { return state; }
  bool state_test(int m) const { return (state & m) != 0; }
  void state_set(int m) { state |= m; }
  void state_clear(int m) { state &= ~m; }

  /// true when last_complete has caught up to last_update
  bool is_complete() const { return info.last_complete == info.last_update; }
  bool should_send_notify() const { return send_notify; }

  int get_state() const { return state; }
  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
  bool is_peering() const { return state_test(PG_STATE_PEERING); }
  bool is_down() const { return state_test(PG_STATE_DOWN); }
  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }

  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
  /// active, or peered (able to serve some ops) without being active
  bool is_peered() const {
    return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
  }

  /// no writes have ever been logged to this PG
  bool is_empty() const { return info.last_update == eversion_t(0,0); }
2320 | ||
  /// Initialize a freshly created PG (role, sets, history) and record
  /// it in transaction t; implemented in PG.cc.
  void init(
    int role,
    const vector<int>& up,
    int up_primary,
    const vector<int>& acting,
    int acting_primary,
    const pg_history_t& history,
    const PastIntervals& pim,
    bool backfill,
    ObjectStore::Transaction *t);

  // pg on-disk state
  void do_pending_flush();

  /// create the PG's collection in t (bits = collection hash bits)
  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
  static void _init(ObjectStore::Transaction& t,
		    spg_t pgid, const pg_pool_t *pool);

private:
  /// serialize dirty info into key/value pairs for the store
  void prepare_write_info(map<string,bufferlist> *km);

  void update_store_with_options();
  void update_store_on_load();

public:
  /// Static helper that encodes info/epoch/past_intervals into km;
  /// the dirty_*/try_fast_info flags select what is rewritten.
  static int _prepare_write_info(
    CephContext* cct,
    map<string,bufferlist> *km,
    epoch_t epoch,
    pg_info_t &info,
    pg_info_t &last_written_info,
    PastIntervals &past_intervals,
    bool dirty_big_info,
    bool dirty_epoch,
    bool try_fast_info,
    PerfCounters *logger = nullptr);
  void write_if_dirty(ObjectStore::Transaction& t);
2358 | ||
  /// log entries applied in memory but not yet committed on disk
  PGLog::IndexedLog projected_log;
  bool check_in_progress_op(
    const osd_reqid_t &r,
    eversion_t *version,
    version_t *user_version,
    int *return_code) const;
  /// version of the most recent projected (in-flight) update
  eversion_t projected_last_update;
  /// Allocate the eversion for the next update: current map epoch,
  /// one past the last projected version.  Must be strictly newer
  /// than everything already recorded.
  eversion_t get_next_version() const {
    eversion_t at_version(
      get_osdmap()->get_epoch(),
      projected_last_update.version+1);
    assert(at_version > info.last_update);
    assert(at_version > pg_log.get_head());
    assert(at_version > projected_last_update);
    return at_version;
  }
2375 | ||
  void add_log_entry(const pg_log_entry_t& e, bool applied);
  /// Append logv to the pg log in transaction t; trim_to/roll_forward_to
  /// bound what may be trimmed/rolled forward.
  void append_log(
    const vector<pg_log_entry_t>& logv,
    eversion_t trim_to,
    eversion_t roll_forward_to,
    ObjectStore::Transaction &t,
    bool transaction_applied = true);
  bool check_log_for_corruption(ObjectStore *store);
  void trim_log();

  std::string get_corrupt_pg_log_name() const;
  /// read on-disk pg_info/past_intervals; the trailing __u8 receives
  /// the struct version — TODO confirm against the PG.cc definition
  static int read_info(
    ObjectStore *store, spg_t pgid, const coll_t &coll,
    bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
    __u8 &);
  void read_state(ObjectStore *store, bufferlist &bl);
  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
  static int peek_map_epoch(ObjectStore *store, spg_t pgid,
			    epoch_t *pepoch, bufferlist *bl);
  /// apply the snap-related effects of log_entries to the SnapMapper in t
  void update_snap_map(
    const vector<pg_log_entry_t> &log_entries,
    ObjectStore::Transaction& t);

  void filter_snapc(vector<snapid_t> &snaps);

  void log_weirdness();

  // implemented by the concrete PG backend subclass
  virtual void kick_snap_trim() = 0;
  virtual void snap_trimmer_scrub_complete() = 0;
  bool requeue_scrub(bool high_priority = false);
  void queue_recovery();
  bool queue_scrub();
  unsigned get_scrub_priority();

  /// share pg info after a pg is active
  void share_pg_info();


  bool append_log_entries_update_missing(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t);

  /**
   * Merge entries updating missing as necessary on all
   * actingbackfill logs and missings (also missing_loc)
   */
  void merge_new_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t);
2425 | ||
  void reset_interval_flush();
  /// Tear down the old interval and set up the new one (new up/acting
  /// sets, primaries) recording changes in t.
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t,
		   list<Context *> *on_applied,
		   list<Context *> *on_safe);
  void set_last_peering_reset();
  /// true if this PG is being deleted or peering was reset after epoch e
  /// (i.e. work queued at e is stale); caller must hold the PG lock
  bool pg_has_reset_since(epoch_t e) {
    assert(is_locked());
    return deleting || e < get_last_peering_reset();
  }

  void update_history(const pg_history_t& history);
  /// build the notify payload answering an info query from a peer
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
		    pair<pg_shard_t, pg_info_t> &notify_info);
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);

  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);

  /// true if the mapping change between lastmap and osdmap requires a
  /// new peering interval
  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);

  // OpRequest queueing
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
  /// a peering event is stale if the interval it was sent in has been reset
  bool old_peering_evt(CephPeeringEvtRef evt) {
    return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
  }
  static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
    return e <= cur_epoch;
  }
  bool have_same_or_newer_map(epoch_t e) {
    return e <= get_osdmap()->get_epoch();
  }

  bool op_has_sufficient_caps(OpRequestRef& op);


  // recovery bits
  void take_waiters();
  void queue_peering_event(CephPeeringEvtRef evt);
  void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2485 | void queue_query(epoch_t msg_epoch, epoch_t query_epoch, | |
2486 | pg_shard_t from, const pg_query_t& q); | |
2487 | void queue_null(epoch_t msg_epoch, epoch_t query_epoch); | |
2488 | void queue_flushed(epoch_t started_at); | |
2489 | void handle_advance_map( | |
2490 | OSDMapRef osdmap, OSDMapRef lastmap, | |
2491 | vector<int>& newup, int up_primary, | |
2492 | vector<int>& newacting, int acting_primary, | |
2493 | RecoveryCtx *rctx); | |
2494 | void handle_activate_map(RecoveryCtx *rctx); | |
2495 | void handle_create(RecoveryCtx *rctx); | |
2496 | void handle_loaded(RecoveryCtx *rctx); | |
2497 | void handle_query_state(Formatter *f); | |
2498 | ||
2499 | virtual void on_removal(ObjectStore::Transaction *t) = 0; | |
2500 | ||
2501 | ||
2502 | // abstract bits | |
2503 | virtual void do_request( | |
2504 | OpRequestRef& op, | |
2505 | ThreadPool::TPHandle &handle | |
2506 | ) = 0; | |
2507 | ||
2508 | virtual void do_op(OpRequestRef& op) = 0; | |
2509 | virtual void do_sub_op(OpRequestRef op) = 0; | |
2510 | virtual void do_sub_op_reply(OpRequestRef op) = 0; | |
2511 | virtual void do_scan( | |
2512 | OpRequestRef op, | |
2513 | ThreadPool::TPHandle &handle | |
2514 | ) = 0; | |
2515 | virtual void do_backfill(OpRequestRef op) = 0; | |
2516 | virtual void snap_trimmer(epoch_t epoch_queued) = 0; | |
2517 | ||
2518 | virtual int do_command( | |
2519 | cmdmap_t cmdmap, | |
2520 | ostream& ss, | |
2521 | bufferlist& idata, | |
2522 | bufferlist& odata, | |
2523 | ConnectionRef conn, | |
2524 | ceph_tid_t tid) = 0; | |
2525 | ||
2526 | virtual void on_role_change() = 0; | |
2527 | virtual void on_pool_change() = 0; | |
2528 | virtual void on_change(ObjectStore::Transaction *t) = 0; | |
2529 | virtual void on_activate() = 0; | |
2530 | virtual void on_flushed() = 0; | |
2531 | virtual void on_shutdown() = 0; | |
2532 | virtual void check_blacklisted_watchers() = 0; | |
2533 | virtual void get_watchers(std::list<obj_watch_item_t>&) = 0; | |
2534 | ||
2535 | virtual bool agent_work(int max) = 0; | |
2536 | virtual bool agent_work(int max, int agent_flush_quota) = 0; | |
2537 | virtual void agent_stop() = 0; | |
2538 | virtual void agent_delay() = 0; | |
2539 | virtual void agent_clear() = 0; | |
2540 | virtual void agent_choose_mode_restart() = 0; | |
2541 | }; | |
2542 | ||
2543 | ostream& operator<<(ostream& out, const PG& pg); | |
2544 | ||
2545 | ostream& operator<<(ostream& out, const PG::BackfillInterval& bi); | |
2546 | ||
2547 | #endif |