]> git.proxmox.com Git - ceph.git/blame - ceph/src/librbd/Journal.h
bump version to 18.2.4-pve3
[ceph.git] / ceph / src / librbd / Journal.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#ifndef CEPH_LIBRBD_JOURNAL_H
5#define CEPH_LIBRBD_JOURNAL_H
6
7#include "include/int_types.h"
8#include "include/Context.h"
9#include "include/interval_set.h"
11fdf7f2 10#include "include/rados/librados_fwd.hpp"
9f95a23c 11#include "common/AsyncOpTracker.h"
7c673cae 12#include "common/Cond.h"
a4b75251 13#include "common/Timer.h"
9f95a23c 14#include "common/RefCountedObj.h"
7c673cae
FG
15#include "journal/Future.h"
16#include "journal/JournalMetadataListener.h"
17#include "journal/ReplayEntry.h"
18#include "journal/ReplayHandler.h"
19#include "librbd/Utils.h"
f67539c2 20#include "librbd/asio/ContextWQ.h"
f38dd50b 21#include "librbd/io/Types.h"
7c673cae
FG
22#include "librbd/journal/Types.h"
23#include "librbd/journal/TypeTraits.h"
24
25#include <algorithm>
26#include <list>
27#include <string>
28#include <atomic>
29#include <unordered_map>
30
// Forward declarations only: a global-namespace ContextWQ and
// ::journal::Journaler. Their definitions are not part of this header.
f67539c2 31class ContextWQ;
f67539c2 32namespace journal { class Journaler; }
7c673cae
FG
33
34namespace librbd {
35
// Forward declaration of the default ImageCtxT argument used by Journal<>.
36class ImageCtx;
37
7c673cae
FG
// Forward declaration of librbd::journal::Replay<>, referenced below through
// pointers only (m_journal_replay, start_external_replay).
38namespace journal { template <typename> class Replay; }
39
40template <typename ImageCtxT = ImageCtx>
9f95a23c 41class Journal : public RefCountedObject {
7c673cae
FG
42public:
43 /**
44 * @verbatim
45 *
46 * <start>
47 *    |
48 *    v
49 * UNINITIALIZED ---> INITIALIZING ---> REPLAYING ------> FLUSHING ---> READY
50 *    |                 *  .  ^             *  .              *           |
51 *    |                 *  .  |             *  .              *           |
52 *    |                 *  .  |    (error)  *  . . . . . . .  *           |
53 *    |                 *  .  |             *              .  *           |
54 *    |                 *  .  |             v              .  *           |
55 *    |                 *  .  |         FLUSHING_RESTART   .  *           |
56 *    |                 *  .  |             |              .  *           |
57 *    |                 *  .  |             |              .  *           |
58 *    |                 *  .  |             v              .  *           v
59 *    |                 *  .  |         RESTARTING  < * * * * *       STOPPING
60 *    |                 *  .  |             |              .              |
61 *    |                 *  .  |             |              .              |
62 *    |       * * * * * *  .  \-------------/              .              |
63 *    |       * (error)    .                               .              |
64 *    |       *            .   . . . . . . . . . . . . . . .              |
65 *    |       *            .   .                                          |
66 *    |       v            v   v                                          |
67 *    |     CLOSED <----- CLOSING <---------------------------------------/
68 *    |       |
69 *    |       v
70 *    \---> <finish>
71 *
72 * @endverbatim
73 */
// Lifecycle states of the journal; the member m_state holds the current one.
// Transitions are sketched in the diagram in the preceding comment, which
// appears to label STATE_FLUSHING_REPLAY and STATE_RESTARTING_REPLAY simply as
// FLUSHING and RESTARTING (presumably -- confirm against the implementation).
74 enum State {
75 STATE_UNINITIALIZED,
76 STATE_INITIALIZING,
77 STATE_REPLAYING,
78 STATE_FLUSHING_RESTART,
79 STATE_RESTARTING_REPLAY,
80 STATE_FLUSHING_REPLAY,
81 STATE_READY,
82 STATE_STOPPING,
83 STATE_CLOSING,
84 STATE_CLOSED
85 };
86
// Well-known identifier strings. They are only declared here; their values
// are defined outside this header.
87 static const std::string IMAGE_CLIENT_ID;
88 static const std::string LOCAL_MIRROR_UUID;
89 static const std::string ORPHAN_MIRROR_UUID;
90
7c673cae
FG
// image_ctx is taken by reference and the class stores an ImageCtxT&
// (m_image_ctx), so the image context presumably has to outlive the journal
// -- TODO confirm in the constructor definition (not in this header).
91 Journal(ImageCtxT &image_ctx);
92 ~Journal();
93
f67539c2
TL
// Static overload: yields a ContextWQ for the given CephContext through the
// out-parameter *work_queue. How the queue is obtained or shared is decided
// by the definition, which is not visible here.
94 static void get_work_queue(CephContext *cct, ContextWQ **work_queue);
95
7c673cae
FG
// Static helpers that operate on an image context or on an IoCtx + image id;
// none of them needs a Journal instance.
96 static bool is_journal_supported(ImageCtxT &image_ctx);
// create/remove/reset report their outcome as an int (presumably 0 on success
// and a negative errno on failure -- TODO confirm).
97 static int create(librados::IoCtx &io_ctx, const std::string &image_id,
98 uint8_t order, uint8_t splay_width,
99 const std::string &object_pool);
100 static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
101 static int reset(librados::IoCtx &io_ctx, const std::string &image_id);
102
// Asynchronous tag-ownership queries: the answer is written through the
// is_tag_owner / mirror_uuid out-pointer and completion is signalled via
// on_finish (presumably with an int result code -- confirm).
// Overload 1: looks the journal up through an image context.
103 static void is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner,
104 Context *on_finish);
// Overload 2: addresses the journal by IoCtx + image id and takes the work
// queue to use explicitly.
105 static void is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
f67539c2 106 bool *is_tag_owner, asio::ContextWQ *op_work_queue,
7c673cae
FG
107 Context *on_finish);
// Same addressing as overload 2, but reports the owner through *mirror_uuid.
108 static void get_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
109 std::string *mirror_uuid,
f67539c2 110 asio::ContextWQ *op_work_queue, Context *on_finish);
7c673cae
FG
// Synchronous (returns int) resync request for the given image.
111 static int request_resync(ImageCtxT *image_ctx);
// Asynchronous promote / demote of the given image; on_finish is the
// completion callback.
112 static void promote(ImageCtxT *image_ctx, Context *on_finish);
113 static void demote(ImageCtxT *image_ctx, Context *on_finish);
114
// Const predicates on the journal's current condition (presumably derived
// from m_state -- the definitions are not in this header).
115 bool is_journal_ready() const;
116 bool is_journal_replaying() const;
117 bool is_journal_appending() const;
118
// Registers on_ready to be completed once the journal is ready (exact
// trigger conditions are defined in the implementation).
119 void wait_for_journal_ready(Context *on_ready);
120
// Asynchronous open / close; on_finish is the completion callback.
121 void open(Context *on_finish);
122 void close(Context *on_finish);
123
// Instance-level tag accessors (the class keeps m_tag_tid and m_tag_data;
// presumably these return them -- confirm).
124 bool is_tag_owner() const;
125 uint64_t get_tag_tid() const;
126 journal::TagData get_tag_data() const;
127
// Asynchronous tag allocation; on_finish is the completion callback.
128 void allocate_local_tag(Context *on_finish);
129 void allocate_tag(const std::string &mirror_uuid,
130 const journal::TagPredecessor &predecessor,
131 Context *on_finish);
132
// Asynchronously flushes the commit position; on_finish is the completion
// callback.
133 void flush_commit_position(Context *on_finish);
134
494da23a
TL
// Presumably records that the user issued a flush (see the atomic flag
// m_user_flushed) -- TODO confirm against the definition.
135 void user_flushed();
136
// Family of IO-event append calls. Each returns a uint64_t (presumably the
// event tid later passed to commit_io_event / flush_event / wait_event --
// confirm). flush_entry is a caller-supplied flag; its exact effect is
// defined by the implementation.
f38dd50b 137 uint64_t append_write_event(const io::Extents &image_extents,
7c673cae 138 const bufferlist &bl,
7c673cae 139 bool flush_entry);
f38dd50b
TL
140 uint64_t append_write_same_event(const io::Extents &image_extents,
141 const bufferlist &bl,
142 bool flush_entry);
39ae355f
TL
// Unlike its siblings this one takes a single (offset, length) range plus a
// compare buffer and a write buffer.
143 uint64_t append_compare_and_write_event(uint64_t offset,
144 size_t length,
145 const bufferlist &cmp_bl,
146 const bufferlist &write_bl,
147 bool flush_entry);
f38dd50b
TL
148 uint64_t append_discard_event(const io::Extents &image_extents,
149 uint32_t discard_granularity_bytes,
150 bool flush_entry);
// Generic form: takes ownership of an already-built EventEntry (rvalue) for
// one (offset, length) range. filter_ret_val is stored on the Event
// (see Event::filter_ret_val); its meaning is defined by the implementation.
7c673cae 151 uint64_t append_io_event(journal::EventEntry &&event_entry,
7c673cae 152 uint64_t offset, size_t length,
b32b8144 153 bool flush_entry, int filter_ret_val);
7c673cae
FG
// Commit notifications for a previously appended IO event identified by tid;
// r carries the result code. The _extent variant reports a single
// (offset, length) sub-range (presumably removed from the event's
// pending_extents -- confirm).
154 void commit_io_event(uint64_t tid, int r);
155 void commit_io_event_extent(uint64_t tid, uint64_t offset, uint64_t length,
156 int r);
157
// Operation (non-IO) events, keyed by op_tid (see allocate_op_tid). on_safe /
// on_resume are completion callbacks.
158 void append_op_event(uint64_t op_tid, journal::EventEntry &&event_entry,
159 Context *on_safe);
160 void commit_op_event(uint64_t tid, int r, Context *on_safe);
161 void replay_op_ready(uint64_t op_tid, Context *on_resume);
162
// Flush / wait on the event identified by tid; on_safe is completed
// afterwards (exact semantics are in the implementation).
163 void flush_event(uint64_t tid, Context *on_safe);
164 void wait_event(uint64_t tid, Context *on_safe);
165
// Returns a fresh operation tid: atomically pre-increments m_op_tid (a
// std::atomic<uint64_t> initialised to 0), so the first value handed out is 1.
166 uint64_t allocate_op_tid() {
167 uint64_t op_tid = ++m_op_tid;
// A result of 0 can only mean the 64-bit counter wrapped around.
11fdf7f2 168 ceph_assert(op_tid != 0);
7c673cae
FG
169 return op_tid;
170 }
171
// External replay control. start_external_replay hands a Replay object back
// through *journal_replay and signals on_start when done (ownership of the
// returned pointer is not visible from here -- confirm).
172 void start_external_replay(journal::Replay<ImageCtxT> **journal_replay,
173 Context *on_start);
174 void stop_external_replay();
175
// Listener registration; listeners are kept as raw pointers in m_listeners,
// so the journal does not appear to own them.
176 void add_listener(journal::Listener *listener);
177 void remove_listener(journal::Listener *listener);
178
// Writes the answer to *do_resync and returns an int status code.
179 int is_resync_requested(bool *do_resync);
180
// Instance accessor: returns the raw m_work_queue pointer, which is nullptr
// until something assigns it (its default member initializer is nullptr).
181 inline ContextWQ *get_work_queue() {
182 return m_work_queue;
183 }
184
185private:
// Image context this journal is bound to (reference member, set by the
// constructor).
186 ImageCtxT &m_image_ctx;
187
// Journaler / Future / ReplayEntry are resolved through TypeTraits<ImageCtxT>
// so that a different ImageCtxT can substitute its own types.
188 // mock unit testing support
189 typedef journal::TypeTraits<ImageCtxT> TypeTraits;
190 typedef typename TypeTraits::Journaler Journaler;
191 typedef typename TypeTraits::Future Future;
192 typedef typename TypeTraits::ReplayEntry ReplayEntry;
193
// Container aliases used throughout the class.
194 typedef std::list<bufferlist> Bufferlists;
195 typedef std::list<Context *> Contexts;
196 typedef std::list<Future> Futures;
197 typedef interval_set<uint64_t> ExtentInterval;
198
// Book-keeping record for one in-flight IO event (stored in m_events, keyed
// by a uint64_t tid).
199 struct Event {
// Journal futures associated with this event.
200 Futures futures;
7c673cae
FG
// Callbacks registered against this event (presumably completed once it is
// safe -- confirm in the implementation).
201 Contexts on_safe_contexts;
// Extents of the event that are still outstanding; filled by the constructor.
202 ExtentInterval pending_extents;
b32b8144 203 int filter_ret_val = 0;
7c673cae
FG
204 bool committed_io = false;
205 bool safe = false;
206 int ret_val = 0;
207
// Default state: no futures, no pending extents, all flags cleared.
208 Event() {
209 }
// Copies the futures, remembers filter_ret_val and records every extent whose
// second component is non-zero as pending (first/second presumably being
// offset/length -- confirm against io::Extents).
f38dd50b 210 Event(const Futures &_futures, const io::Extents &image_extents,
11fdf7f2
TL
211 int filter_ret_val)
212 : futures(_futures), filter_ret_val(filter_ret_val) {
f38dd50b
TL
213 for (auto &extent : image_extents) {
// Zero-length extents are skipped rather than inserted.
214 if (extent.second > 0) {
215 pending_extents.insert(extent.first, extent.second);
216 }
7c673cae
FG
217 }
218 }
219 };
220
// tid -> Event and tid -> Future lookup tables (types of m_events and
// m_op_futures respectively).
221 typedef std::unordered_map<uint64_t, Event> Events;
222 typedef std::unordered_map<uint64_t, Future> TidToFutures;
223
// Completion context for an IO event: when finished with result r it calls
// journal->handle_io_event_safe(r, tid). Holds the journal as a raw pointer;
// its lifetime is not managed here.
224 struct C_IOEventSafe : public Context {
225 Journal *journal;
226 uint64_t tid;
227
228 C_IOEventSafe(Journal *_journal, uint64_t _tid)
229 : journal(_journal), tid(_tid) {
230 }
231
232 void finish(int r) override {
233 journal->handle_io_event_safe(r, tid);
234 }
235 };
236
// Completion context for an op event: captures the tid, copies of the start
// and finish futures and the caller's on_safe callback, and on completion
// forwards all of them with the result r to journal->handle_op_event_safe().
237 struct C_OpEventSafe : public Context {
238 Journal *journal;
239 uint64_t tid;
240 Future op_start_future;
241 Future op_finish_future;
242 Context *on_safe;
243
244 C_OpEventSafe(Journal *journal, uint64_t tid, const Future &op_start_future,
245 const Future &op_finish_future, Context *on_safe)
246 : journal(journal), tid(tid), op_start_future(op_start_future),
247 op_finish_future(op_finish_future), on_safe(on_safe) {
248 }
249
250 void finish(int r) override {
251 journal->handle_op_event_safe(r, tid, op_start_future, op_finish_future,
252 on_safe);
253 }
254 };
255
// Completion context for a replayed entry: the entry is moved into the
// context on construction and handed, together with the result r, to
// journal->handle_replay_process_safe() on completion. That handler takes the
// entry by value, so the stored member is copied at the call.
256 struct C_ReplayProcessSafe : public Context {
257 Journal *journal;
258 ReplayEntry replay_entry;
259
260 C_ReplayProcessSafe(Journal *journal, ReplayEntry &&replay_entry) :
261 journal(journal), replay_entry(std::move(replay_entry)) {
262 }
263 void finish(int r) override {
264 journal->handle_replay_process_safe(replay_entry, r);
265 }
266 };
267
// Adapter implementing ::journal::ReplayHandler: both callbacks simply forward
// to the owning Journal (held as a raw pointer).
268 struct ReplayHandler : public ::journal::ReplayHandler {
269 Journal *journal;
270 ReplayHandler(Journal *_journal) : journal(_journal) {
271 }
272
7c673cae
FG
// Entries are available -> Journal::handle_replay_ready().
273 void handle_entries_available() override {
274 journal->handle_replay_ready();
275 }
// Replay finished with result r -> Journal::handle_replay_complete(r).
276 void handle_complete(int r) override {
277 journal->handle_replay_complete(r);
278 }
279 };
280
// Work queue / timer plumbing; all three start as nullptr and are assigned
// elsewhere (ownership is not visible from this header).
281 ContextWQ *m_work_queue = nullptr;
282 SafeTimer *m_timer = nullptr;
9f95a23c 283 ceph::mutex *m_timer_lock = nullptr;
7c673cae
FG
284
// NOTE(review): m_journaler, m_state, m_error_result, m_close_pending,
// m_event_tid, m_blocking_writes and m_journal_replay have no default member
// initializer here; presumably the constructor sets them -- confirm.
285 Journaler *m_journaler;
// General state mutex (mutable so const members can take it). Which fields it
// guards is established by the implementation, not by this header.
9f95a23c 286 mutable ceph::mutex m_lock = ceph::make_mutex("Journal<I>::m_lock");
7c673cae
FG
// Current state-machine state (see enum State).
287 State m_state;
288 uint64_t m_max_append_size = 0;
289 uint64_t m_tag_class = 0;
290 uint64_t m_tag_tid = 0;
291 journal::ImageClientMeta m_client_meta;
292 journal::TagData m_tag_data;
293
294 int m_error_result;
// Callbacks parked until a state change (see wait_for_steady_state /
// wait_for_journal_ready; exact usage is in the implementation).
295 Contexts m_wait_for_state_contexts;
296
297 ReplayHandler m_replay_handler;
298 bool m_close_pending;
299
// Separate mutex for event tracking (presumably guards m_event_tid and
// m_events, which follow it -- confirm).
9f95a23c 300 ceph::mutex m_event_lock = ceph::make_mutex("Journal<I>::m_event_lock");
7c673cae
FG
301 uint64_t m_event_tid;
302 Events m_events;
303
494da23a
TL
304 std::atomic<bool> m_user_flushed = false;
305
7c673cae
FG
// Source of op tids handed out by allocate_op_tid(); starts at 0, so the
// first tid returned is 1.
306 std::atomic<uint64_t> m_op_tid = { 0 };
307 TidToFutures m_op_futures;
308
309 bool m_processing_entry = false;
310 bool m_blocking_writes;
311
312 journal::Replay<ImageCtxT> *m_journal_replay;
313
9f95a23c 314 AsyncOpTracker m_async_journal_op_tracker;
7c673cae
FG
315
// Adapter implementing ::journal::JournalMetadataListener for this journal.
// handle_update() is only declared here; its definition lives outside this
// header. The struct is instantiated in place as the member
// m_metadata_listener.
316 struct MetadataListener : public ::journal::JournalMetadataListener {
317 Journal<ImageCtxT> *journal;
318
319 MetadataListener(Journal<ImageCtxT> *journal) : journal(journal) { }
320
f67539c2 321 void handle_update(::journal::JournalMetadata *) override;
7c673cae
FG
322 } m_metadata_listener;
323
// Registered journal::Listener objects (raw, non-owning pointers).
// NOTE(review): std::set is used here but <set> is not among the includes
// visible in this header; it presumably arrives transitively -- consider
// including it directly.
324 typedef std::set<journal::Listener *> Listeners;
325 Listeners m_listeners;
// Condition variable / flag pair for listener notification (presumably used
// to serialise notification against add/remove -- confirm).
9f95a23c 326 ceph::condition_variable m_listener_cond;
7c673cae
FG
327 bool m_listener_notify = false;
328
329 uint64_t m_refresh_sequence = 0;
330
9f95a23c
TL
// Private overloads of the public predicates that take an (unnamed) mutex
// reference. Presumably the caller passes the lock it already holds so the
// check can run without re-locking -- TODO confirm in the implementation.
331 bool is_journal_replaying(const ceph::mutex &) const;
332 bool is_tag_owner(const ceph::mutex &) const;
7c673cae 333
f38dd50b
TL
// Appends write-event payload(s) for the range (offset, length) of bl,
// starting at buffer_offset, to *bufferlists (presumably splitting by
// m_max_append_size -- confirm).
334 void add_write_event_entries(uint64_t offset, size_t length,
335 const bufferlist &bl,
336 uint64_t buffer_offset,
337 Bufferlists *bufferlists);
7c673cae
FG
// Common back end of the public append_*_event calls: takes pre-encoded
// buffers plus the affected extents and returns a uint64_t (presumably the
// new event tid).
338 uint64_t append_io_events(journal::EventType event_type,
339 const Bufferlists &bufferlists,
f38dd50b 340 const io::Extents &extents, bool flush_entry,
b32b8144 341 int filter_ret_val);
// Lock-taking variant of the public wait_event(); returns a Future.
9f95a23c 342 Future wait_event(ceph::mutex &lock, uint64_t tid, Context *on_safe);
7c673cae
FG
343
// Journaler lifecycle helpers (m_journaler); r is a result code carried into
// the destroy / recreate paths.
344 void create_journaler();
345 void destroy_journaler(int r);
346 void recreate_journaler(int r);
347
// Completes the event referenced by the m_events iterator with result r.
348 void complete_event(typename Events::iterator it, int r);
349
350 void start_append();
351
// Asynchronous completion handlers. Those taking `int r` receive the result
// code of the step they complete.
352 void handle_open(int r);
353
// handle_replay_ready / handle_replay_complete are invoked by ReplayHandler;
// handle_replay_process_safe is invoked by C_ReplayProcessSafe.
354 void handle_replay_ready();
355 void handle_replay_complete(int r);
356 void handle_replay_process_ready(int r);
357 void handle_replay_process_safe(ReplayEntry replay_entry, int r);
358
359 void handle_start_external_replay(int r,
360 journal::Replay<ImageCtxT> **journal_replay,
361 Context *on_finish);
362
363 void handle_flushing_restart(int r);
364 void handle_flushing_replay();
365
366 void handle_recording_stopped(int r);
367
368 void handle_journal_destroyed(int r);
369
// Invoked by C_IOEventSafe and C_OpEventSafe respectively.
370 void handle_io_event_safe(int r, uint64_t tid);
371 void handle_op_event_safe(int r, uint64_t tid, const Future &op_start_future,
372 const Future &op_finish_future, Context *on_safe);
373
374 void stop_recording();
375
// Moves the state machine to `state`; r is the result code associated with
// the transition (presumably recorded in m_error_result -- confirm).
376 void transition_state(State state, int r);
377
// Steady-state query and a way to park on_state until it is reached (which
// states count as steady is defined in the implementation).
378 bool is_steady_state() const;
379 void wait_for_steady_state(Context *on_state);
380
// Private counterpart of is_resync_requested(): same out-parameter and int
// status shape.
381 int check_resync_requested(bool *do_resync);
382
// Metadata refresh path (handle_metadata_updated is presumably reached from
// MetadataListener::handle_update -- confirm).
383 void handle_metadata_updated();
384 void handle_refresh_metadata(uint64_t refresh_sequence, uint64_t tag_tid,
385 journal::TagData tag_data, int r);
386
387};
388
389} // namespace librbd
390
// Explicit instantiation declaration: translation units including this header
// do not implicitly instantiate Journal<ImageCtx>; the matching explicit
// instantiation has to be provided in some other translation unit.
391extern template class librbd::Journal<librbd::ImageCtx>;
392
393#endif // CEPH_LIBRBD_JOURNAL_H