]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #ifndef CEPH_LIBRBD_JOURNAL_H | |
5 | #define CEPH_LIBRBD_JOURNAL_H | |
6 | ||
7 | #include "include/int_types.h" | |
8 | #include "include/Context.h" | |
9 | #include "include/interval_set.h" | |
11fdf7f2 | 10 | #include "include/rados/librados_fwd.hpp" |
9f95a23c | 11 | #include "common/AsyncOpTracker.h" |
7c673cae | 12 | #include "common/Cond.h" |
a4b75251 | 13 | #include "common/Timer.h" |
9f95a23c | 14 | #include "common/RefCountedObj.h" |
7c673cae FG |
15 | #include "journal/Future.h" |
16 | #include "journal/JournalMetadataListener.h" | |
17 | #include "journal/ReplayEntry.h" | |
18 | #include "journal/ReplayHandler.h" | |
19 | #include "librbd/Utils.h" | |
f67539c2 | 20 | #include "librbd/asio/ContextWQ.h" |
f38dd50b | 21 | #include "librbd/io/Types.h" |
7c673cae FG |
22 | #include "librbd/journal/Types.h" |
23 | #include "librbd/journal/TypeTraits.h" | |
24 | ||
25 | #include <algorithm> | |
26 | #include <list> | |
27 | #include <string> | |
28 | #include <atomic> | |
29 | #include <unordered_map> | |
30 | ||
f67539c2 | 31 | class ContextWQ; |
f67539c2 | 32 | namespace journal { class Journaler; } |
7c673cae FG |
33 | |
34 | namespace librbd { | |
35 | ||
36 | class ImageCtx; | |
37 | ||
7c673cae FG |
38 | namespace journal { template <typename> class Replay; } |
39 | ||
40 | template <typename ImageCtxT = ImageCtx> | |
9f95a23c | 41 | class Journal : public RefCountedObject { |
7c673cae FG |
42 | public: |
43 | /** | |
44 | * @verbatim | |
45 | * | |
46 | * <start> | |
47 | * | | |
48 | * v | |
49 | * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> FLUSHING ---> READY | |
50 | * | * . ^ * . * | | |
51 | * | * . | * . * | | |
52 | * | * . | (error) * . . . . . . . * | | |
53 | * | * . | * . * | | |
54 | * | * . | v . * | | |
55 | * | * . | FLUSHING_RESTART . * | | |
56 | * | * . | | . * | | |
57 | * | * . | | . * | | |
58 | * | * . | v . * v | |
59 | * | * . | RESTARTING < * * * * * STOPPING | |
60 | * | * . | | . | | |
61 | * | * . | | . | | |
62 | * | * * * * * * . \-------------/ . | | |
63 | * | * (error) . . | | |
64 | * | * . . . . . . . . . . . . . . . . | | |
65 | * | * . . | | |
66 | * | v v v | | |
67 | * | CLOSED <----- CLOSING <---------------------------------------/ | |
68 | * | | | |
69 | * | v | |
70 | * \---> <finish> | |
71 | * | |
72 | * @endverbatim | |
73 | */ | |
// States of the journal state machine. The transitions between them are
// charted in the diagram directly above this enum.
enum State {
  // entered from <start> in the diagram
  STATE_UNINITIALIZED,
  // follows UNINITIALIZED in the diagram
  STATE_INITIALIZING,
  // follows INITIALIZING in the diagram
  STATE_REPLAYING,
  // shown as FLUSHING_RESTART, on the path marked "(error)"
  STATE_FLUSHING_RESTART,
  // presumably the node labelled RESTARTING in the diagram
  STATE_RESTARTING_REPLAY,
  // presumably the node labelled FLUSHING (between REPLAYING and READY)
  STATE_FLUSHING_REPLAY,
  // reached after the replay flush completes
  STATE_READY,
  // follows READY in the diagram
  STATE_STOPPING,
  // precedes CLOSED in the diagram
  STATE_CLOSING,
  // last state before <finish>
  STATE_CLOSED
};
86 | ||
87 | static const std::string IMAGE_CLIENT_ID; | |
88 | static const std::string LOCAL_MIRROR_UUID; | |
89 | static const std::string ORPHAN_MIRROR_UUID; | |
90 | ||
7c673cae FG |
91 | Journal(ImageCtxT &image_ctx); |
92 | ~Journal(); | |
93 | ||
f67539c2 TL |
94 | static void get_work_queue(CephContext *cct, ContextWQ **work_queue); |
95 | ||
7c673cae FG |
96 | static bool is_journal_supported(ImageCtxT &image_ctx); |
97 | static int create(librados::IoCtx &io_ctx, const std::string &image_id, | |
98 | uint8_t order, uint8_t splay_width, | |
99 | const std::string &object_pool); | |
100 | static int remove(librados::IoCtx &io_ctx, const std::string &image_id); | |
101 | static int reset(librados::IoCtx &io_ctx, const std::string &image_id); | |
102 | ||
103 | static void is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner, | |
104 | Context *on_finish); | |
105 | static void is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id, | |
f67539c2 | 106 | bool *is_tag_owner, asio::ContextWQ *op_work_queue, |
7c673cae FG |
107 | Context *on_finish); |
108 | static void get_tag_owner(librados::IoCtx& io_ctx, std::string& image_id, | |
109 | std::string *mirror_uuid, | |
f67539c2 | 110 | asio::ContextWQ *op_work_queue, Context *on_finish); |
7c673cae FG |
111 | static int request_resync(ImageCtxT *image_ctx); |
112 | static void promote(ImageCtxT *image_ctx, Context *on_finish); | |
113 | static void demote(ImageCtxT *image_ctx, Context *on_finish); | |
114 | ||
115 | bool is_journal_ready() const; | |
116 | bool is_journal_replaying() const; | |
117 | bool is_journal_appending() const; | |
118 | ||
119 | void wait_for_journal_ready(Context *on_ready); | |
120 | ||
121 | void open(Context *on_finish); | |
122 | void close(Context *on_finish); | |
123 | ||
124 | bool is_tag_owner() const; | |
125 | uint64_t get_tag_tid() const; | |
126 | journal::TagData get_tag_data() const; | |
127 | ||
128 | void allocate_local_tag(Context *on_finish); | |
129 | void allocate_tag(const std::string &mirror_uuid, | |
130 | const journal::TagPredecessor &predecessor, | |
131 | Context *on_finish); | |
132 | ||
133 | void flush_commit_position(Context *on_finish); | |
134 | ||
494da23a TL |
135 | void user_flushed(); |
136 | ||
f38dd50b | 137 | uint64_t append_write_event(const io::Extents &image_extents, |
7c673cae | 138 | const bufferlist &bl, |
7c673cae | 139 | bool flush_entry); |
f38dd50b TL |
140 | uint64_t append_write_same_event(const io::Extents &image_extents, |
141 | const bufferlist &bl, | |
142 | bool flush_entry); | |
39ae355f TL |
143 | uint64_t append_compare_and_write_event(uint64_t offset, |
144 | size_t length, | |
145 | const bufferlist &cmp_bl, | |
146 | const bufferlist &write_bl, | |
147 | bool flush_entry); | |
f38dd50b TL |
148 | uint64_t append_discard_event(const io::Extents &image_extents, |
149 | uint32_t discard_granularity_bytes, | |
150 | bool flush_entry); | |
7c673cae | 151 | uint64_t append_io_event(journal::EventEntry &&event_entry, |
7c673cae | 152 | uint64_t offset, size_t length, |
b32b8144 | 153 | bool flush_entry, int filter_ret_val); |
7c673cae FG |
154 | void commit_io_event(uint64_t tid, int r); |
155 | void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length, | |
156 | int r); | |
157 | ||
158 | void append_op_event(uint64_t op_tid, journal::EventEntry &&event_entry, | |
159 | Context *on_safe); | |
160 | void commit_op_event(uint64_t tid, int r, Context *on_safe); | |
161 | void replay_op_ready(uint64_t op_tid, Context *on_resume); | |
162 | ||
163 | void flush_event(uint64_t tid, Context *on_safe); | |
164 | void wait_event(uint64_t tid, Context *on_safe); | |
165 | ||
166 | uint64_t allocate_op_tid() { | |
167 | uint64_t op_tid = ++m_op_tid; | |
11fdf7f2 | 168 | ceph_assert(op_tid != 0); |
7c673cae FG |
169 | return op_tid; |
170 | } | |
171 | ||
172 | void start_external_replay(journal::Replay<ImageCtxT> **journal_replay, | |
173 | Context *on_start); | |
174 | void stop_external_replay(); | |
175 | ||
176 | void add_listener(journal::Listener *listener); | |
177 | void remove_listener(journal::Listener *listener); | |
178 | ||
179 | int is_resync_requested(bool *do_resync); | |
180 | ||
181 | inline ContextWQ *get_work_queue() { | |
182 | return m_work_queue; | |
183 | } | |
184 | ||
185 | private: | |
186 | ImageCtxT &m_image_ctx; | |
187 | ||
188 | // mock unit testing support | |
189 | typedef journal::TypeTraits<ImageCtxT> TypeTraits; | |
190 | typedef typename TypeTraits::Journaler Journaler; | |
191 | typedef typename TypeTraits::Future Future; | |
192 | typedef typename TypeTraits::ReplayEntry ReplayEntry; | |
193 | ||
194 | typedef std::list<bufferlist> Bufferlists; | |
195 | typedef std::list<Context *> Contexts; | |
196 | typedef std::list<Future> Futures; | |
197 | typedef interval_set<uint64_t> ExtentInterval; | |
198 | ||
199 | struct Event { | |
200 | Futures futures; | |
7c673cae FG |
201 | Contexts on_safe_contexts; |
202 | ExtentInterval pending_extents; | |
b32b8144 | 203 | int filter_ret_val = 0; |
7c673cae FG |
204 | bool committed_io = false; |
205 | bool safe = false; | |
206 | int ret_val = 0; | |
207 | ||
208 | Event() { | |
209 | } | |
f38dd50b | 210 | Event(const Futures &_futures, const io::Extents &image_extents, |
11fdf7f2 TL |
211 | int filter_ret_val) |
212 | : futures(_futures), filter_ret_val(filter_ret_val) { | |
f38dd50b TL |
213 | for (auto &extent : image_extents) { |
214 | if (extent.second > 0) { | |
215 | pending_extents.insert(extent.first, extent.second); | |
216 | } | |
7c673cae FG |
217 | } |
218 | } | |
219 | }; | |
220 | ||
221 | typedef std::unordered_map<uint64_t, Event> Events; | |
222 | typedef std::unordered_map<uint64_t, Future> TidToFutures; | |
223 | ||
224 | struct C_IOEventSafe : public Context { | |
225 | Journal *journal; | |
226 | uint64_t tid; | |
227 | ||
228 | C_IOEventSafe(Journal *_journal, uint64_t _tid) | |
229 | : journal(_journal), tid(_tid) { | |
230 | } | |
231 | ||
232 | void finish(int r) override { | |
233 | journal->handle_io_event_safe(r, tid); | |
234 | } | |
235 | }; | |
236 | ||
237 | struct C_OpEventSafe : public Context { | |
238 | Journal *journal; | |
239 | uint64_t tid; | |
240 | Future op_start_future; | |
241 | Future op_finish_future; | |
242 | Context *on_safe; | |
243 | ||
244 | C_OpEventSafe(Journal *journal, uint64_t tid, const Future &op_start_future, | |
245 | const Future &op_finish_future, Context *on_safe) | |
246 | : journal(journal), tid(tid), op_start_future(op_start_future), | |
247 | op_finish_future(op_finish_future), on_safe(on_safe) { | |
248 | } | |
249 | ||
250 | void finish(int r) override { | |
251 | journal->handle_op_event_safe(r, tid, op_start_future, op_finish_future, | |
252 | on_safe); | |
253 | } | |
254 | }; | |
255 | ||
256 | struct C_ReplayProcessSafe : public Context { | |
257 | Journal *journal; | |
258 | ReplayEntry replay_entry; | |
259 | ||
260 | C_ReplayProcessSafe(Journal *journal, ReplayEntry &&replay_entry) : | |
261 | journal(journal), replay_entry(std::move(replay_entry)) { | |
262 | } | |
263 | void finish(int r) override { | |
264 | journal->handle_replay_process_safe(replay_entry, r); | |
265 | } | |
266 | }; | |
267 | ||
268 | struct ReplayHandler : public ::journal::ReplayHandler { | |
269 | Journal *journal; | |
270 | ReplayHandler(Journal *_journal) : journal(_journal) { | |
271 | } | |
272 | ||
7c673cae FG |
273 | void handle_entries_available() override { |
274 | journal->handle_replay_ready(); | |
275 | } | |
276 | void handle_complete(int r) override { | |
277 | journal->handle_replay_complete(r); | |
278 | } | |
279 | }; | |
280 | ||
281 | ContextWQ *m_work_queue = nullptr; | |
282 | SafeTimer *m_timer = nullptr; | |
9f95a23c | 283 | ceph::mutex *m_timer_lock = nullptr; |
7c673cae FG |
284 | |
285 | Journaler *m_journaler; | |
9f95a23c | 286 | mutable ceph::mutex m_lock = ceph::make_mutex("Journal<I>::m_lock"); |
7c673cae FG |
287 | State m_state; |
288 | uint64_t m_max_append_size = 0; | |
289 | uint64_t m_tag_class = 0; | |
290 | uint64_t m_tag_tid = 0; | |
291 | journal::ImageClientMeta m_client_meta; | |
292 | journal::TagData m_tag_data; | |
293 | ||
294 | int m_error_result; | |
295 | Contexts m_wait_for_state_contexts; | |
296 | ||
297 | ReplayHandler m_replay_handler; | |
298 | bool m_close_pending; | |
299 | ||
9f95a23c | 300 | ceph::mutex m_event_lock = ceph::make_mutex("Journal<I>::m_event_lock"); |
7c673cae FG |
301 | uint64_t m_event_tid; |
302 | Events m_events; | |
303 | ||
494da23a TL |
304 | std::atomic<bool> m_user_flushed = false; |
305 | ||
7c673cae FG |
306 | std::atomic<uint64_t> m_op_tid = { 0 }; |
307 | TidToFutures m_op_futures; | |
308 | ||
309 | bool m_processing_entry = false; | |
310 | bool m_blocking_writes; | |
311 | ||
312 | journal::Replay<ImageCtxT> *m_journal_replay; | |
313 | ||
9f95a23c | 314 | AsyncOpTracker m_async_journal_op_tracker; |
7c673cae FG |
315 | |
316 | struct MetadataListener : public ::journal::JournalMetadataListener { | |
317 | Journal<ImageCtxT> *journal; | |
318 | ||
319 | MetadataListener(Journal<ImageCtxT> *journal) : journal(journal) { } | |
320 | ||
f67539c2 | 321 | void handle_update(::journal::JournalMetadata *) override; |
7c673cae FG |
322 | } m_metadata_listener; |
323 | ||
324 | typedef std::set<journal::Listener *> Listeners; | |
325 | Listeners m_listeners; | |
9f95a23c | 326 | ceph::condition_variable m_listener_cond; |
7c673cae FG |
327 | bool m_listener_notify = false; |
328 | ||
329 | uint64_t m_refresh_sequence = 0; | |
330 | ||
9f95a23c TL |
331 | bool is_journal_replaying(const ceph::mutex &) const; |
332 | bool is_tag_owner(const ceph::mutex &) const; | |
7c673cae | 333 | |
f38dd50b TL |
334 | void add_write_event_entries(uint64_t offset, size_t length, |
335 | const bufferlist &bl, | |
336 | uint64_t buffer_offset, | |
337 | Bufferlists *bufferlists); | |
7c673cae FG |
338 | uint64_t append_io_events(journal::EventType event_type, |
339 | const Bufferlists &bufferlists, | |
f38dd50b | 340 | const io::Extents &extents, bool flush_entry, |
b32b8144 | 341 | int filter_ret_val); |
9f95a23c | 342 | Future wait_event(ceph::mutex &lock, uint64_t tid, Context *on_safe); |
7c673cae FG |
343 | |
344 | void create_journaler(); | |
345 | void destroy_journaler(int r); | |
346 | void recreate_journaler(int r); | |
347 | ||
348 | void complete_event(typename Events::iterator it, int r); | |
349 | ||
350 | void start_append(); | |
351 | ||
352 | void handle_open(int r); | |
353 | ||
354 | void handle_replay_ready(); | |
355 | void handle_replay_complete(int r); | |
356 | void handle_replay_process_ready(int r); | |
357 | void handle_replay_process_safe(ReplayEntry replay_entry, int r); | |
358 | ||
359 | void handle_start_external_replay(int r, | |
360 | journal::Replay<ImageCtxT> **journal_replay, | |
361 | Context *on_finish); | |
362 | ||
363 | void handle_flushing_restart(int r); | |
364 | void handle_flushing_replay(); | |
365 | ||
366 | void handle_recording_stopped(int r); | |
367 | ||
368 | void handle_journal_destroyed(int r); | |
369 | ||
370 | void handle_io_event_safe(int r, uint64_t tid); | |
371 | void handle_op_event_safe(int r, uint64_t tid, const Future &op_start_future, | |
372 | const Future &op_finish_future, Context *on_safe); | |
373 | ||
374 | void stop_recording(); | |
375 | ||
376 | void transition_state(State state, int r); | |
377 | ||
378 | bool is_steady_state() const; | |
379 | void wait_for_steady_state(Context *on_state); | |
380 | ||
381 | int check_resync_requested(bool *do_resync); | |
382 | ||
383 | void handle_metadata_updated(); | |
384 | void handle_refresh_metadata(uint64_t refresh_sequence, uint64_t tag_tid, | |
385 | journal::TagData tag_data, int r); | |
386 | ||
387 | }; | |
388 | ||
389 | } // namespace librbd | |
390 | ||
391 | extern template class librbd::Journal<librbd::ImageCtx>; | |
392 | ||
393 | #endif // CEPH_LIBRBD_JOURNAL_H |