]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDLog.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / mds / MDLog.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #ifndef CEPH_MDLOG_H
17 #define CEPH_MDLOG_H
18
// Index constants for the MDS log, numbered consecutively from 5000.
// Presumably these are PerfCounters keys registered by
// MDLog::create_logger() -- confirm in the implementation file.
//
// Values are assigned implicitly, so inserting, removing or reordering an
// entry renumbers every entry after it; l_mdl_first and l_mdl_last bracket
// the range and must stay first and last.
enum {
  l_mdl_first = 5000,  // lower bracket of the range

  // l_mdl_ev*: event-related entries (exact meaning not visible here).
  l_mdl_evadd,
  l_mdl_evex,
  l_mdl_evtrm,
  l_mdl_ev,
  l_mdl_evexg,
  l_mdl_evexd,

  // l_mdl_seg*: segment-related entries, mirroring the event group above.
  l_mdl_segadd,
  l_mdl_segex,
  l_mdl_segtrm,
  l_mdl_seg,
  l_mdl_segexg,
  l_mdl_segexd,

  // Presumably journal positions (expire / write / read) -- TODO confirm.
  l_mdl_expos,
  l_mdl_wrpos,
  l_mdl_rdpos,

  l_mdl_jlat,
  l_mdl_replayed,

  l_mdl_last           // upper bracket of the range
};
40
41 #include "include/types.h"
42 #include "include/Context.h"
43
44 #include "common/Thread.h"
45 #include "common/Cond.h"
46
47 #include "LogSegment.h"
48
49 #include <list>
50
51 class Journaler;
52 class JournalPointer;
53 class LogEvent;
54 class MDSRank;
55 class LogSegment;
56 class ESubtreeMap;
57
58 class PerfCounters;
59
60 #include <map>
61 using std::map;
62
63 #include "common/Finisher.h"
64
65
66 class MDLog {
67 public:
68 MDSRank *mds;
69 protected:
70 int num_events; // in events
71
72 int unflushed;
73
74 bool capped;
75
76 // Log position which is persistent *and* for which
77 // submit_entry wait_for_safe callbacks have already
78 // been called.
79 uint64_t safe_pos;
80
81 inodeno_t ino;
82 Journaler *journaler;
83
84 PerfCounters *logger;
85
86
87 // -- replay --
88 class ReplayThread : public Thread {
89 MDLog *log;
90 public:
91 explicit ReplayThread(MDLog *l) : log(l) {}
92 void* entry() override {
93 log->_replay_thread();
94 return 0;
95 }
96 } replay_thread;
97 bool already_replayed;
98
99 friend class ReplayThread;
100 friend class C_MDL_Replay;
101
102 list<MDSInternalContextBase*> waitfor_replay;
103
104 void _replay(); // old way
105 void _replay_thread(); // new way
106
107 // Journal recovery/rewrite logic
108 class RecoveryThread : public Thread {
109 MDLog *log;
110 MDSInternalContextBase *completion;
111 public:
112 void set_completion(MDSInternalContextBase *c) {completion = c;}
113 explicit RecoveryThread(MDLog *l) : log(l), completion(NULL) {}
114 void* entry() override {
115 log->_recovery_thread(completion);
116 return 0;
117 }
118 } recovery_thread;
119 void _recovery_thread(MDSInternalContextBase *completion);
120 void _reformat_journal(JournalPointer const &jp, Journaler *old_journal, MDSInternalContextBase *completion);
121
122 // -- segments --
123 map<uint64_t,LogSegment*> segments;
124 set<LogSegment*> expiring_segments;
125 set<LogSegment*> expired_segments;
126 uint64_t event_seq;
127 int expiring_events;
128 int expired_events;
129
130 struct PendingEvent {
131 LogEvent *le;
132 MDSContext *fin;
133 bool flush;
134 PendingEvent(LogEvent *e, MDSContext *c, bool f=false) : le(e), fin(c), flush(f) {}
135 };
136
137 int64_t mdsmap_up_features;
138 map<uint64_t,list<PendingEvent> > pending_events; // log segment -> event list
139 Mutex submit_mutex;
140 Cond submit_cond;
141
142 void set_safe_pos(uint64_t pos)
143 {
144 Mutex::Locker l(submit_mutex);
145 assert(pos >= safe_pos);
146 safe_pos = pos;
147 }
148 friend class MDSLogContextBase;
149
150 void _submit_thread();
151 class SubmitThread : public Thread {
152 MDLog *log;
153 public:
154 explicit SubmitThread(MDLog *l) : log(l) {}
155 void* entry() override {
156 log->_submit_thread();
157 return 0;
158 }
159 } submit_thread;
160 friend class SubmitThread;
161
162 public:
163 const std::set<LogSegment*> &get_expiring_segments() const
164 {
165 return expiring_segments;
166 }
167 protected:
168
169 // -- subtreemaps --
170 friend class ESubtreeMap;
171 friend class MDCache;
172
173 uint64_t get_last_segment_seq() const {
174 assert(!segments.empty());
175 return segments.rbegin()->first;
176 }
177 LogSegment *get_oldest_segment() {
178 return segments.begin()->second;
179 }
180 void remove_oldest_segment() {
181 map<uint64_t, LogSegment*>::iterator p = segments.begin();
182 delete p->second;
183 segments.erase(p);
184 }
185
186 public:
187 void create_logger();
188
189 // replay state
190 map<inodeno_t, set<inodeno_t> > pending_exports;
191
192 void set_write_iohint(unsigned iohint_flags);
193
194 public:
195 explicit MDLog(MDSRank *m) : mds(m),
196 num_events(0),
197 unflushed(0),
198 capped(false),
199 safe_pos(0),
200 journaler(0),
201 logger(0),
202 replay_thread(this),
203 already_replayed(false),
204 recovery_thread(this),
205 event_seq(0), expiring_events(0), expired_events(0),
206 mdsmap_up_features(0),
207 submit_mutex("MDLog::submit_mutex"),
208 submit_thread(this),
209 cur_event(NULL) { }
210 ~MDLog();
211
212
213 private:
214 // -- segments --
215 void _start_new_segment();
216 void _prepare_new_segment();
217 void _journal_segment_subtree_map(MDSInternalContextBase *onsync);
218 public:
219 void start_new_segment() {
220 Mutex::Locker l(submit_mutex);
221 _start_new_segment();
222 }
223 void prepare_new_segment() {
224 Mutex::Locker l(submit_mutex);
225 _prepare_new_segment();
226 }
227 void journal_segment_subtree_map(MDSInternalContextBase *onsync=NULL) {
228 submit_mutex.Lock();
229 _journal_segment_subtree_map(onsync);
230 submit_mutex.Unlock();
231 if (onsync)
232 flush();
233 }
234
235 LogSegment *peek_current_segment() {
236 return segments.empty() ? NULL : segments.rbegin()->second;
237 }
238
239 LogSegment *get_current_segment() {
240 assert(!segments.empty());
241 return segments.rbegin()->second;
242 }
243
244 LogSegment *get_segment(log_segment_seq_t seq) {
245 if (segments.count(seq))
246 return segments[seq];
247 return NULL;
248 }
249
250 bool have_any_segments() const {
251 return !segments.empty();
252 }
253
254 void flush_logger();
255
256 size_t get_num_events() const { return num_events; }
257 size_t get_num_segments() const { return segments.size(); }
258
259 uint64_t get_read_pos() const;
260 uint64_t get_write_pos() const;
261 uint64_t get_safe_pos() const;
262 Journaler *get_journaler() { return journaler; }
263 bool empty() const { return segments.empty(); }
264
265 bool is_capped() const { return capped; }
266 void cap();
267
268 void kick_submitter();
269 void shutdown();
270
271 // -- events --
272 private:
273 LogEvent *cur_event;
274 public:
275 void _start_entry(LogEvent *e);
276 void start_entry(LogEvent *e) {
277 Mutex::Locker l(submit_mutex);
278 _start_entry(e);
279 }
280 void cancel_entry(LogEvent *e);
281 void _submit_entry(LogEvent *e, MDSLogContextBase *c);
282 void submit_entry(LogEvent *e, MDSLogContextBase *c = 0) {
283 Mutex::Locker l(submit_mutex);
284 _submit_entry(e, c);
285 submit_cond.Signal();
286 }
287 void start_submit_entry(LogEvent *e, MDSLogContextBase *c = 0) {
288 Mutex::Locker l(submit_mutex);
289 _start_entry(e);
290 _submit_entry(e, c);
291 submit_cond.Signal();
292 }
293 bool entry_is_open() const { return cur_event != NULL; }
294
295 void wait_for_safe( MDSInternalContextBase *c );
296 void flush();
297 bool is_flushed() const {
298 return unflushed == 0;
299 }
300
301 private:
302 void try_expire(LogSegment *ls, int op_prio);
303 void _maybe_expired(LogSegment *ls, int op_prio);
304 void _expired(LogSegment *ls);
305 void _trim_expired_segments();
306
307 friend class C_MaybeExpiredSegment;
308 friend class C_MDL_Flushed;
309
310 public:
311 void trim_expired_segments();
312 void trim(int max=-1);
313 int trim_all();
314 bool expiry_done() const
315 {
316 return expiring_segments.empty() && expired_segments.empty();
317 };
318
319 private:
320 void write_head(MDSInternalContextBase *onfinish);
321
322 public:
323 void create(MDSInternalContextBase *onfinish); // fresh, empty log!
324 void open(MDSInternalContextBase *onopen); // append() or replay() to follow!
325 void reopen(MDSInternalContextBase *onopen);
326 void append();
327 void replay(MDSInternalContextBase *onfinish);
328
329 void standby_trim_segments();
330
331 void dump_replay_status(Formatter *f) const;
332 };
333
334 #endif