]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef CEPH_MDLOG_H | |
17 | #define CEPH_MDLOG_H | |
18 | ||
// Index values for the MDLog perf counters.  The range is bracketed by
// l_mdl_first / l_mdl_last; every enumerator in between takes the next
// consecutive value, so the order below must not change.
// NOTE(review): the per-group meanings are inferred from the names only;
// confirm against the counter registration (create_logger()).
enum {
  l_mdl_first = 5000,   // lower sentinel

  // event counters (presumably)
  l_mdl_evadd,
  l_mdl_evex,
  l_mdl_evtrm,
  l_mdl_ev,
  l_mdl_evexg,
  l_mdl_evexd,

  // segment counters (presumably)
  l_mdl_segadd,
  l_mdl_segex,
  l_mdl_segtrm,
  l_mdl_seg,
  l_mdl_segexg,
  l_mdl_segexd,

  // journal positions / latency / replay progress (presumably)
  l_mdl_expos,
  l_mdl_wrpos,
  l_mdl_rdpos,
  l_mdl_jlat,
  l_mdl_replayed,

  l_mdl_last,           // upper sentinel
};
40 | ||
41 | #include "include/types.h" | |
42 | #include "include/Context.h" | |
43 | ||
44 | #include "common/Thread.h" | |
45 | #include "common/Cond.h" | |
46 | ||
47 | #include "LogSegment.h" | |
48 | ||
49 | #include <list> | |
50 | ||
51 | class Journaler; | |
52 | class JournalPointer; | |
53 | class LogEvent; | |
54 | class MDSRank; | |
55 | class LogSegment; | |
56 | class ESubtreeMap; | |
57 | ||
58 | class PerfCounters; | |
59 | ||
60 | #include <map> | |
61 | using std::map; | |
62 | ||
63 | #include "common/Finisher.h" | |
64 | ||
65 | ||
66 | class MDLog { | |
67 | public: | |
68 | MDSRank *mds; | |
69 | protected: | |
70 | int num_events; // in events | |
71 | ||
72 | int unflushed; | |
73 | ||
74 | bool capped; | |
75 | ||
76 | // Log position which is persistent *and* for which | |
77 | // submit_entry wait_for_safe callbacks have already | |
78 | // been called. | |
79 | uint64_t safe_pos; | |
80 | ||
81 | inodeno_t ino; | |
82 | Journaler *journaler; | |
83 | ||
84 | PerfCounters *logger; | |
85 | ||
86 | ||
87 | // -- replay -- | |
88 | class ReplayThread : public Thread { | |
89 | MDLog *log; | |
90 | public: | |
91 | explicit ReplayThread(MDLog *l) : log(l) {} | |
92 | void* entry() override { | |
93 | log->_replay_thread(); | |
94 | return 0; | |
95 | } | |
96 | } replay_thread; | |
97 | bool already_replayed; | |
98 | ||
99 | friend class ReplayThread; | |
100 | friend class C_MDL_Replay; | |
101 | ||
102 | list<MDSInternalContextBase*> waitfor_replay; | |
103 | ||
104 | void _replay(); // old way | |
105 | void _replay_thread(); // new way | |
106 | ||
107 | // Journal recovery/rewrite logic | |
108 | class RecoveryThread : public Thread { | |
109 | MDLog *log; | |
110 | MDSInternalContextBase *completion; | |
111 | public: | |
112 | void set_completion(MDSInternalContextBase *c) {completion = c;} | |
113 | explicit RecoveryThread(MDLog *l) : log(l), completion(NULL) {} | |
114 | void* entry() override { | |
115 | log->_recovery_thread(completion); | |
116 | return 0; | |
117 | } | |
118 | } recovery_thread; | |
119 | void _recovery_thread(MDSInternalContextBase *completion); | |
120 | void _reformat_journal(JournalPointer const &jp, Journaler *old_journal, MDSInternalContextBase *completion); | |
121 | ||
122 | // -- segments -- | |
123 | map<uint64_t,LogSegment*> segments; | |
124 | set<LogSegment*> expiring_segments; | |
125 | set<LogSegment*> expired_segments; | |
126 | uint64_t event_seq; | |
127 | int expiring_events; | |
128 | int expired_events; | |
129 | ||
130 | struct PendingEvent { | |
131 | LogEvent *le; | |
132 | MDSContext *fin; | |
133 | bool flush; | |
134 | PendingEvent(LogEvent *e, MDSContext *c, bool f=false) : le(e), fin(c), flush(f) {} | |
135 | }; | |
136 | ||
137 | int64_t mdsmap_up_features; | |
138 | map<uint64_t,list<PendingEvent> > pending_events; // log segment -> event list | |
139 | Mutex submit_mutex; | |
140 | Cond submit_cond; | |
141 | ||
142 | void set_safe_pos(uint64_t pos) | |
143 | { | |
144 | Mutex::Locker l(submit_mutex); | |
145 | assert(pos >= safe_pos); | |
146 | safe_pos = pos; | |
147 | } | |
148 | friend class MDSLogContextBase; | |
149 | ||
150 | void _submit_thread(); | |
151 | class SubmitThread : public Thread { | |
152 | MDLog *log; | |
153 | public: | |
154 | explicit SubmitThread(MDLog *l) : log(l) {} | |
155 | void* entry() override { | |
156 | log->_submit_thread(); | |
157 | return 0; | |
158 | } | |
159 | } submit_thread; | |
160 | friend class SubmitThread; | |
161 | ||
162 | public: | |
163 | const std::set<LogSegment*> &get_expiring_segments() const | |
164 | { | |
165 | return expiring_segments; | |
166 | } | |
167 | protected: | |
168 | ||
169 | // -- subtreemaps -- | |
170 | friend class ESubtreeMap; | |
171 | friend class MDCache; | |
172 | ||
173 | uint64_t get_last_segment_seq() const { | |
174 | assert(!segments.empty()); | |
175 | return segments.rbegin()->first; | |
176 | } | |
177 | LogSegment *get_oldest_segment() { | |
178 | return segments.begin()->second; | |
179 | } | |
180 | void remove_oldest_segment() { | |
181 | map<uint64_t, LogSegment*>::iterator p = segments.begin(); | |
182 | delete p->second; | |
183 | segments.erase(p); | |
184 | } | |
185 | ||
186 | public: | |
187 | void create_logger(); | |
188 | ||
189 | // replay state | |
190 | map<inodeno_t, set<inodeno_t> > pending_exports; | |
191 | ||
192 | void set_write_iohint(unsigned iohint_flags); | |
193 | ||
194 | public: | |
195 | explicit MDLog(MDSRank *m) : mds(m), | |
196 | num_events(0), | |
197 | unflushed(0), | |
198 | capped(false), | |
199 | safe_pos(0), | |
200 | journaler(0), | |
201 | logger(0), | |
202 | replay_thread(this), | |
203 | already_replayed(false), | |
204 | recovery_thread(this), | |
205 | event_seq(0), expiring_events(0), expired_events(0), | |
206 | mdsmap_up_features(0), | |
207 | submit_mutex("MDLog::submit_mutex"), | |
208 | submit_thread(this), | |
209 | cur_event(NULL) { } | |
210 | ~MDLog(); | |
211 | ||
212 | ||
213 | private: | |
214 | // -- segments -- | |
215 | void _start_new_segment(); | |
216 | void _prepare_new_segment(); | |
217 | void _journal_segment_subtree_map(MDSInternalContextBase *onsync); | |
218 | public: | |
219 | void start_new_segment() { | |
220 | Mutex::Locker l(submit_mutex); | |
221 | _start_new_segment(); | |
222 | } | |
223 | void prepare_new_segment() { | |
224 | Mutex::Locker l(submit_mutex); | |
225 | _prepare_new_segment(); | |
226 | } | |
227 | void journal_segment_subtree_map(MDSInternalContextBase *onsync=NULL) { | |
228 | submit_mutex.Lock(); | |
229 | _journal_segment_subtree_map(onsync); | |
230 | submit_mutex.Unlock(); | |
231 | if (onsync) | |
232 | flush(); | |
233 | } | |
234 | ||
235 | LogSegment *peek_current_segment() { | |
236 | return segments.empty() ? NULL : segments.rbegin()->second; | |
237 | } | |
238 | ||
239 | LogSegment *get_current_segment() { | |
240 | assert(!segments.empty()); | |
241 | return segments.rbegin()->second; | |
242 | } | |
243 | ||
244 | LogSegment *get_segment(log_segment_seq_t seq) { | |
245 | if (segments.count(seq)) | |
246 | return segments[seq]; | |
247 | return NULL; | |
248 | } | |
249 | ||
250 | bool have_any_segments() const { | |
251 | return !segments.empty(); | |
252 | } | |
253 | ||
254 | void flush_logger(); | |
255 | ||
256 | size_t get_num_events() const { return num_events; } | |
257 | size_t get_num_segments() const { return segments.size(); } | |
258 | ||
259 | uint64_t get_read_pos() const; | |
260 | uint64_t get_write_pos() const; | |
261 | uint64_t get_safe_pos() const; | |
262 | Journaler *get_journaler() { return journaler; } | |
263 | bool empty() const { return segments.empty(); } | |
264 | ||
265 | bool is_capped() const { return capped; } | |
266 | void cap(); | |
267 | ||
268 | void kick_submitter(); | |
269 | void shutdown(); | |
270 | ||
271 | // -- events -- | |
272 | private: | |
273 | LogEvent *cur_event; | |
274 | public: | |
275 | void _start_entry(LogEvent *e); | |
276 | void start_entry(LogEvent *e) { | |
277 | Mutex::Locker l(submit_mutex); | |
278 | _start_entry(e); | |
279 | } | |
280 | void cancel_entry(LogEvent *e); | |
281 | void _submit_entry(LogEvent *e, MDSLogContextBase *c); | |
282 | void submit_entry(LogEvent *e, MDSLogContextBase *c = 0) { | |
283 | Mutex::Locker l(submit_mutex); | |
284 | _submit_entry(e, c); | |
285 | submit_cond.Signal(); | |
286 | } | |
287 | void start_submit_entry(LogEvent *e, MDSLogContextBase *c = 0) { | |
288 | Mutex::Locker l(submit_mutex); | |
289 | _start_entry(e); | |
290 | _submit_entry(e, c); | |
291 | submit_cond.Signal(); | |
292 | } | |
293 | bool entry_is_open() const { return cur_event != NULL; } | |
294 | ||
295 | void wait_for_safe( MDSInternalContextBase *c ); | |
296 | void flush(); | |
297 | bool is_flushed() const { | |
298 | return unflushed == 0; | |
299 | } | |
300 | ||
301 | private: | |
302 | void try_expire(LogSegment *ls, int op_prio); | |
303 | void _maybe_expired(LogSegment *ls, int op_prio); | |
304 | void _expired(LogSegment *ls); | |
305 | void _trim_expired_segments(); | |
306 | ||
307 | friend class C_MaybeExpiredSegment; | |
308 | friend class C_MDL_Flushed; | |
309 | ||
310 | public: | |
311 | void trim_expired_segments(); | |
312 | void trim(int max=-1); | |
313 | int trim_all(); | |
314 | bool expiry_done() const | |
315 | { | |
316 | return expiring_segments.empty() && expired_segments.empty(); | |
317 | }; | |
318 | ||
319 | private: | |
320 | void write_head(MDSInternalContextBase *onfinish); | |
321 | ||
322 | public: | |
323 | void create(MDSInternalContextBase *onfinish); // fresh, empty log! | |
324 | void open(MDSInternalContextBase *onopen); // append() or replay() to follow! | |
325 | void reopen(MDSInternalContextBase *onopen); | |
326 | void append(); | |
327 | void replay(MDSInternalContextBase *onfinish); | |
328 | ||
329 | void standby_trim_segments(); | |
330 | ||
331 | void dump_replay_status(Formatter *f) const; | |
332 | }; | |
333 | ||
334 | #endif |