]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | ||
17 | #ifndef CEPH_MDCACHE_H | |
18 | #define CEPH_MDCACHE_H | |
19 | ||
20 | #include "include/types.h" | |
21 | #include "include/filepath.h" | |
22 | #include "include/elist.h" | |
23 | ||
24 | #include "osdc/Filer.h" | |
25 | #include "CInode.h" | |
26 | #include "CDentry.h" | |
27 | #include "CDir.h" | |
28 | #include "include/Context.h" | |
29 | #include "events/EMetaBlob.h" | |
30 | #include "RecoveryQueue.h" | |
31 | #include "StrayManager.h" | |
32 | #include "MDSContext.h" | |
33 | #include "MDSMap.h" | |
34 | #include "Mutation.h" | |
35 | ||
36 | #include "messages/MClientRequest.h" | |
37 | #include "messages/MMDSSlaveRequest.h" | |
38 | ||
// Forward declarations for types only used by pointer/reference here.
// (Duplicate declarations of Message and Session removed.)
class PerfCounters;

class MDSRank;
class Session;
class Migrator;

class Message;

class MMDSResolve;
class MMDSResolveAck;
class MMDSCacheRejoin;
class MDiscover;
class MDiscoverReply;
class MCacheExpire;
class MDirUpdate;
class MDentryLink;
class MDentryUnlink;
class MLock;
struct MMDSFindIno;
struct MMDSFindInoReply;
struct MMDSOpenIno;
struct MMDSOpenInoReply;

class MClientRequest;
class MMDSSlaveRequest;
struct MClientSnap;

class MMDSFragmentNotify;

class ESubtreeMap;
// Performance-counter indices for MDCache (the l_mdc_* series).
// Order is significant: values are assigned implicitly from l_mdc_first.
enum {
  l_mdc_first = 3000,

  // Stray-dentry gauges.
  l_mdc_num_strays,            // inodes currently in stray dentries
  l_mdc_num_strays_delayed,    // strays delayed for purge due to refs
  l_mdc_num_strays_enqueuing,  // strays currently being enqueued for purge

  // Stray-dentry counters (monotonic).
  l_mdc_strays_created,        // dentries ever added to a stray dir
  l_mdc_strays_enqueued,       // dentries passed on to the PurgeQueue
  l_mdc_strays_reintegrated,   // strays that have been reintegrated
  l_mdc_strays_migrated,       // strays that have been migrated

  // File-size-recovery gauges.
  l_mdc_num_recovering_processing,   // inode sizes currently being recovered
  l_mdc_num_recovering_enqueued,     // inodes waiting to have size recovered
  l_mdc_num_recovering_prioritized,  // inodes waiting with elevated priority

  // File-size-recovery counters (monotonic).
  l_mdc_recovery_started,      // inodes that ever started size recovery
  l_mdc_recovery_completed,    // inodes that ever completed size recovery

  l_mdc_last,
};
103 | ||
104 | ||
// Flag bits accepted by predirty_journal_parents().
static const int PREDIRTY_PRIMARY = 1;  // primary dn: adjust nested accounting
static const int PREDIRTY_DIR     = 2;  // update the parent dir's mtime/size
static const int PREDIRTY_SHALLOW = 4;  // stop at immediate parent (for easier rollback)
109 | ||
110 | class MDCache { | |
111 | public: | |
112 | // my master | |
113 | MDSRank *mds; | |
114 | ||
115 | // -- my cache -- | |
116 | LRU lru; // dentry lru for expiring items from cache | |
31f18b77 | 117 | LRU bottom_lru; // dentries that should be trimmed ASAP |
7c673cae FG |
118 | protected: |
119 | ceph::unordered_map<vinodeno_t,CInode*> inode_map; // map of inodes by ino | |
120 | CInode *root; // root inode | |
121 | CInode *myin; // .ceph/mds%d dir | |
122 | ||
123 | bool readonly; | |
124 | void set_readonly() { readonly = true; } | |
125 | ||
126 | CInode *strays[NUM_STRAY]; // my stray dir | |
127 | int stray_index; | |
128 | ||
129 | CInode *get_stray() { | |
130 | return strays[stray_index]; | |
131 | } | |
132 | ||
133 | set<CInode*> base_inodes; | |
134 | ||
135 | std::unique_ptr<PerfCounters> logger; | |
136 | ||
137 | Filer filer; | |
138 | ||
139 | bool exceeded_size_limit; | |
140 | ||
141 | public: | |
142 | void advance_stray() { | |
143 | stray_index = (stray_index+1)%NUM_STRAY; | |
144 | } | |
145 | ||
146 | void activate_stray_manager(); | |
147 | ||
148 | /** | |
149 | * Call this when you know that a CDentry is ready to be passed | |
150 | * on to StrayManager (i.e. this is a stray you've just created) | |
151 | */ | |
152 | void notify_stray(CDentry *dn) { | |
153 | assert(dn->get_dir()->get_inode()->is_stray()); | |
154 | stray_manager.eval_stray(dn); | |
155 | } | |
156 | ||
157 | void maybe_eval_stray(CInode *in, bool delay=false); | |
31f18b77 FG |
158 | void clear_dirty_bits_for_stray(CInode* diri); |
159 | ||
7c673cae FG |
160 | bool is_readonly() { return readonly; } |
161 | void force_readonly(); | |
162 | ||
163 | DecayRate decayrate; | |
164 | ||
165 | int num_inodes_with_caps; | |
166 | ||
167 | unsigned max_dir_commit_size; | |
168 | ||
169 | static file_layout_t gen_default_file_layout(const MDSMap &mdsmap); | |
170 | static file_layout_t gen_default_log_layout(const MDSMap &mdsmap); | |
171 | ||
172 | file_layout_t default_file_layout; | |
173 | file_layout_t default_log_layout; | |
174 | ||
175 | void register_perfcounters(); | |
176 | ||
177 | // -- client leases -- | |
178 | public: | |
179 | static const int client_lease_pools = 3; | |
180 | float client_lease_durations[client_lease_pools]; | |
181 | protected: | |
182 | xlist<ClientLease*> client_leases[client_lease_pools]; | |
183 | public: | |
184 | void touch_client_lease(ClientLease *r, int pool, utime_t ttl) { | |
185 | client_leases[pool].push_back(&r->item_lease); | |
186 | r->ttl = ttl; | |
187 | } | |
188 | ||
189 | void notify_stray_removed() | |
190 | { | |
191 | stray_manager.notify_stray_removed(); | |
192 | } | |
193 | ||
194 | void notify_stray_created() | |
195 | { | |
196 | stray_manager.notify_stray_created(); | |
197 | } | |
198 | ||
31f18b77 FG |
199 | void eval_remote(CDentry *dn) |
200 | { | |
201 | stray_manager.eval_remote(dn); | |
202 | } | |
203 | ||
7c673cae FG |
204 | // -- client caps -- |
205 | uint64_t last_cap_id; | |
206 | ||
207 | ||
208 | ||
209 | // -- discover -- | |
210 | struct discover_info_t { | |
211 | ceph_tid_t tid; | |
212 | mds_rank_t mds; | |
213 | inodeno_t ino; | |
214 | frag_t frag; | |
215 | snapid_t snap; | |
216 | filepath want_path; | |
31f18b77 | 217 | CInode *basei; |
7c673cae FG |
218 | bool want_base_dir; |
219 | bool want_xlocked; | |
220 | ||
221 | discover_info_t() : | |
31f18b77 | 222 | tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL), |
7c673cae FG |
223 | want_base_dir(false), want_xlocked(false) {} |
224 | ~discover_info_t() { | |
31f18b77 FG |
225 | if (basei) |
226 | basei->put(MDSCacheObject::PIN_DISCOVERBASE); | |
7c673cae | 227 | } |
31f18b77 FG |
228 | void pin_base(CInode *b) { |
229 | basei = b; | |
230 | basei->get(MDSCacheObject::PIN_DISCOVERBASE); | |
7c673cae FG |
231 | } |
232 | }; | |
233 | ||
234 | map<ceph_tid_t, discover_info_t> discovers; | |
235 | ceph_tid_t discover_last_tid; | |
236 | ||
237 | void _send_discover(discover_info_t& dis); | |
238 | discover_info_t& _create_discover(mds_rank_t mds) { | |
239 | ceph_tid_t t = ++discover_last_tid; | |
240 | discover_info_t& d = discovers[t]; | |
241 | d.tid = t; | |
242 | d.mds = mds; | |
243 | return d; | |
244 | } | |
245 | ||
246 | // waiters | |
247 | map<int, map<inodeno_t, list<MDSInternalContextBase*> > > waiting_for_base_ino; | |
248 | ||
249 | void discover_base_ino(inodeno_t want_ino, MDSInternalContextBase *onfinish, mds_rank_t from=MDS_RANK_NONE); | |
250 | void discover_dir_frag(CInode *base, frag_t approx_fg, MDSInternalContextBase *onfinish, | |
251 | mds_rank_t from=MDS_RANK_NONE); | |
252 | void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish, | |
253 | bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE); | |
254 | void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish, | |
255 | bool want_xlocked=false); | |
256 | void kick_discovers(mds_rank_t who); // after a failure. | |
257 | ||
258 | ||
259 | // -- subtrees -- | |
260 | protected: | |
261 | /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */ | |
262 | map<CDir*,set<CDir*> > subtrees; | |
263 | map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir | |
264 | ||
265 | // adjust subtree auth specification | |
266 | // dir->dir_auth | |
267 | // imports/exports/nested_exports | |
268 | // join/split subtrees as appropriate | |
269 | public: | |
270 | bool is_subtrees() { return !subtrees.empty(); } | |
271 | void list_subtrees(list<CDir*>& ls); | |
224ce89b WB |
272 | void adjust_subtree_auth(CDir *root, mds_authority_t auth); |
273 | void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) { | |
274 | adjust_subtree_auth(root, mds_authority_t(a,b)); | |
7c673cae FG |
275 | } |
276 | void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth); | |
277 | void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_rank_t a) { | |
278 | adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); | |
279 | } | |
280 | void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_authority_t auth); | |
281 | void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_rank_t a) { | |
282 | adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); | |
283 | } | |
284 | void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result); | |
285 | void try_subtree_merge(CDir *root); | |
224ce89b | 286 | void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval); |
7c673cae FG |
287 | void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut); |
288 | void eval_subtree_root(CInode *diri); | |
289 | CDir *get_subtree_root(CDir *dir); | |
290 | CDir *get_projected_subtree_root(CDir *dir); | |
291 | bool is_leaf_subtree(CDir *dir) { | |
292 | assert(subtrees.count(dir)); | |
293 | return subtrees[dir].empty(); | |
294 | } | |
295 | void remove_subtree(CDir *dir); | |
296 | bool is_subtree(CDir *root) { | |
297 | return subtrees.count(root); | |
298 | } | |
299 | void get_subtree_bounds(CDir *root, set<CDir*>& bounds); | |
300 | void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds); | |
301 | void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds); | |
302 | void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds); | |
303 | ||
304 | void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir); | |
224ce89b | 305 | void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop); |
7c673cae FG |
306 | |
307 | void get_auth_subtrees(set<CDir*>& s); | |
308 | void get_fullauth_subtrees(set<CDir*>& s); | |
309 | ||
310 | int num_subtrees(); | |
311 | int num_subtrees_fullauth(); | |
312 | int num_subtrees_fullnonauth(); | |
313 | ||
314 | ||
315 | protected: | |
316 | // delayed cache expire | |
317 | map<CDir*, map<mds_rank_t, MCacheExpire*> > delayed_expire; // subtree root -> expire msg | |
318 | ||
319 | ||
320 | // -- requests -- | |
321 | ceph::unordered_map<metareqid_t, MDRequestRef> active_requests; | |
322 | ||
323 | public: | |
324 | int get_num_client_requests(); | |
325 | ||
326 | MDRequestRef request_start(MClientRequest *req); | |
327 | MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, Message *m); | |
328 | MDRequestRef request_start_internal(int op); | |
329 | bool have_request(metareqid_t rid) { | |
330 | return active_requests.count(rid); | |
331 | } | |
332 | MDRequestRef request_get(metareqid_t rid); | |
333 | void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace); | |
334 | void request_finish(MDRequestRef& mdr); | |
335 | void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0); | |
336 | void dispatch_request(MDRequestRef& mdr); | |
337 | void request_drop_foreign_locks(MDRequestRef& mdr); | |
338 | void request_drop_non_rdlocks(MDRequestRef& r); | |
339 | void request_drop_locks(MDRequestRef& r); | |
340 | void request_cleanup(MDRequestRef& r); | |
341 | ||
342 | void request_kill(MDRequestRef& r); // called when session closes | |
343 | ||
344 | // journal/snap helpers | |
345 | CInode *pick_inode_snap(CInode *in, snapid_t follows); | |
346 | CInode *cow_inode(CInode *in, snapid_t last); | |
347 | void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn, | |
348 | snapid_t follows=CEPH_NOSNAP, | |
349 | CInode **pcow_inode=0, CDentry::linkage_t *dnl=0); | |
350 | void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP, | |
351 | CInode **pcow_inode=0); | |
352 | void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); | |
353 | ||
354 | void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first, | |
355 | int linkunlink, SnapRealm *prealm); | |
356 | void _project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last, | |
357 | CDir *parent, int linkunlink, bool update_inode); | |
358 | void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat, | |
359 | snapid_t ofirst, snapid_t last, | |
360 | CInode *pin, bool cow_head); | |
361 | void broadcast_quota_to_client(CInode *in); | |
362 | void predirty_journal_parents(MutationRef mut, EMetaBlob *blob, | |
363 | CInode *in, CDir *parent, | |
364 | int flags, int linkunlink=0, | |
365 | snapid_t follows=CEPH_NOSNAP); | |
366 | ||
367 | // slaves | |
368 | void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) { | |
369 | uncommitted_masters[reqid].ls = ls; | |
370 | uncommitted_masters[reqid].slaves = slaves; | |
371 | uncommitted_masters[reqid].safe = safe; | |
372 | } | |
373 | void wait_for_uncommitted_master(metareqid_t reqid, MDSInternalContextBase *c) { | |
374 | uncommitted_masters[reqid].waiters.push_back(c); | |
375 | } | |
376 | bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) { | |
377 | auto p = uncommitted_masters.find(reqid); | |
378 | return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0; | |
379 | } | |
380 | void log_master_commit(metareqid_t reqid); | |
381 | void logged_master_update(metareqid_t reqid); | |
382 | void _logged_master_commit(metareqid_t reqid); | |
383 | void committed_master_slave(metareqid_t r, mds_rank_t from); | |
384 | void finish_committed_masters(); | |
385 | ||
386 | void _logged_slave_commit(mds_rank_t from, metareqid_t reqid); | |
387 | ||
388 | // -- recovery -- | |
389 | protected: | |
390 | set<mds_rank_t> recovery_set; | |
391 | ||
392 | public: | |
393 | void set_recovery_set(set<mds_rank_t>& s); | |
394 | void handle_mds_failure(mds_rank_t who); | |
395 | void handle_mds_recovery(mds_rank_t who); | |
396 | ||
397 | protected: | |
398 | // [resolve] | |
399 | // from EImportStart w/o EImportFinish during journal replay | |
400 | map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports; | |
401 | // from MMDSResolves | |
402 | map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports; | |
403 | ||
404 | map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates; // slave: for replay. | |
405 | map<CInode*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit. | |
406 | map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit. | |
407 | ||
408 | // track master requests whose slaves haven't acknowledged commit | |
409 | struct umaster { | |
410 | set<mds_rank_t> slaves; | |
411 | LogSegment *ls; | |
412 | list<MDSInternalContextBase*> waiters; | |
413 | bool safe; | |
414 | bool committing; | |
415 | bool recovering; | |
416 | umaster() : ls(NULL), safe(false), committing(false), recovering(false) {} | |
417 | }; | |
418 | map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set | |
419 | ||
420 | set<metareqid_t> pending_masters; | |
421 | map<int, set<metareqid_t> > ambiguous_slave_updates; | |
422 | ||
423 | friend class ESlaveUpdate; | |
424 | friend class ECommitted; | |
425 | ||
426 | bool resolves_pending; | |
427 | set<mds_rank_t> resolve_gather; // nodes i need resolves from | |
428 | set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from | |
429 | map<metareqid_t, mds_rank_t> need_resolve_rollback; // rollbacks i'm writing to the journal | |
430 | map<mds_rank_t, MMDSResolve*> delayed_resolve; | |
431 | ||
432 | void handle_resolve(MMDSResolve *m); | |
433 | void handle_resolve_ack(MMDSResolveAck *m); | |
434 | void process_delayed_resolve(); | |
435 | void discard_delayed_resolve(mds_rank_t who); | |
436 | void maybe_resolve_finish(); | |
437 | void disambiguate_my_imports(); | |
438 | void disambiguate_other_imports(); | |
439 | void trim_unlinked_inodes(); | |
440 | void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*); | |
441 | void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master); | |
442 | MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master); | |
443 | public: | |
444 | void recalc_auth_bits(bool replay); | |
445 | void remove_inode_recursive(CInode *in); | |
446 | ||
447 | bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { | |
448 | auto p = ambiguous_slave_updates.find(master); | |
449 | return p != ambiguous_slave_updates.end() && p->second.count(reqid); | |
450 | } | |
451 | void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { | |
452 | ambiguous_slave_updates[master].insert(reqid); | |
453 | } | |
454 | void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { | |
455 | auto p = ambiguous_slave_updates.find(master); | |
456 | auto q = p->second.find(reqid); | |
457 | assert(q != p->second.end()); | |
458 | p->second.erase(q); | |
459 | if (p->second.empty()) | |
460 | ambiguous_slave_updates.erase(p); | |
461 | } | |
462 | ||
463 | void add_rollback(metareqid_t reqid, mds_rank_t master) { | |
464 | need_resolve_rollback[reqid] = master; | |
465 | } | |
466 | void finish_rollback(metareqid_t reqid); | |
467 | ||
468 | // ambiguous imports | |
469 | void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds); | |
470 | void add_ambiguous_import(CDir *base, const set<CDir*>& bounds); | |
471 | bool have_ambiguous_import(dirfrag_t base) { | |
472 | return my_ambiguous_imports.count(base); | |
473 | } | |
474 | void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) { | |
475 | assert(my_ambiguous_imports.count(base)); | |
476 | bounds = my_ambiguous_imports[base]; | |
477 | } | |
478 | void cancel_ambiguous_import(CDir *); | |
479 | void finish_ambiguous_import(dirfrag_t dirino); | |
480 | void resolve_start(MDSInternalContext *resolve_done_); | |
481 | void send_resolves(); | |
482 | void send_slave_resolves(); | |
483 | void send_subtree_resolves(); | |
484 | void maybe_send_pending_resolves() { | |
485 | if (resolves_pending) | |
486 | send_subtree_resolves(); | |
487 | } | |
488 | ||
489 | void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, | |
490 | map<dirfrag_t,vector<dirfrag_t> >& subtrees); | |
491 | ESubtreeMap *create_subtree_map(); | |
492 | ||
493 | ||
494 | void clean_open_file_lists(); | |
495 | ||
496 | protected: | |
497 | // [rejoin] | |
498 | bool rejoins_pending; | |
499 | set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin | |
500 | set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to | |
31f18b77 | 501 | set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin to |
7c673cae FG |
502 | set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack |
503 | map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps; | |
504 | map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports; | |
505 | map<client_t,entity_inst_t> rejoin_client_map; | |
506 | ||
507 | map<inodeno_t,map<client_t,cap_reconnect_t> > cap_exports; // ino -> client -> capex | |
508 | map<inodeno_t,mds_rank_t> cap_export_targets; // ino -> auth mds | |
509 | ||
510 | map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex | |
511 | set<inodeno_t> cap_imports_missing; | |
512 | map<inodeno_t, list<MDSInternalContextBase*> > cap_reconnect_waiters; | |
513 | int cap_imports_num_opening; | |
514 | ||
515 | set<CInode*> rejoin_undef_inodes; | |
516 | set<CInode*> rejoin_potential_updated_scatterlocks; | |
517 | set<CDir*> rejoin_undef_dirfrags; | |
518 | map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes; | |
519 | ||
520 | vector<CInode*> rejoin_recover_q, rejoin_check_q; | |
521 | list<SimpleLock*> rejoin_eval_locks; | |
522 | list<MDSInternalContextBase*> rejoin_waiters; | |
523 | ||
524 | void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); | |
525 | void handle_cache_rejoin(MMDSCacheRejoin *m); | |
526 | void handle_cache_rejoin_weak(MMDSCacheRejoin *m); | |
527 | CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); | |
528 | CDir* rejoin_invent_dirfrag(dirfrag_t df); | |
529 | void handle_cache_rejoin_strong(MMDSCacheRejoin *m); | |
530 | void rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack, | |
531 | set<vinodeno_t>& acked_inodes, | |
532 | set<SimpleLock *>& gather_locks); | |
533 | void handle_cache_rejoin_ack(MMDSCacheRejoin *m); | |
534 | void rejoin_send_acks(); | |
535 | void rejoin_trim_undef_inodes(); | |
536 | void maybe_send_pending_rejoins() { | |
537 | if (rejoins_pending) | |
538 | rejoin_send_rejoins(); | |
539 | } | |
540 | std::unique_ptr<MDSInternalContext> rejoin_done; | |
541 | std::unique_ptr<MDSInternalContext> resolve_done; | |
542 | public: | |
543 | void rejoin_start(MDSInternalContext *rejoin_done_); | |
544 | void rejoin_gather_finish(); | |
545 | void rejoin_send_rejoins(); | |
546 | void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, | |
547 | int target=-1) { | |
548 | cap_exports[ino][client] = icr; | |
549 | cap_export_targets[ino] = target; | |
550 | } | |
551 | void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, | |
552 | mds_rank_t frommds=MDS_RANK_NONE) { | |
553 | cap_imports[ino][client][frommds] = icr; | |
554 | } | |
555 | const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { | |
556 | if (cap_imports.count(ino) && | |
557 | cap_imports[ino].count(client) && | |
558 | cap_imports[ino][client].count(MDS_RANK_NONE)) { | |
559 | return &cap_imports[ino][client][MDS_RANK_NONE]; | |
560 | } | |
561 | return NULL; | |
562 | } | |
563 | void remove_replay_cap_reconnect(inodeno_t ino, client_t client) { | |
564 | assert(cap_imports[ino].size() == 1); | |
565 | assert(cap_imports[ino][client].size() == 1); | |
566 | cap_imports.erase(ino); | |
567 | } | |
568 | void wait_replay_cap_reconnect(inodeno_t ino, MDSInternalContextBase *c) { | |
569 | cap_reconnect_waiters[ino].push_back(c); | |
570 | } | |
571 | ||
572 | // [reconnect/rejoin caps] | |
573 | struct reconnected_cap_info_t { | |
574 | inodeno_t realm_ino; | |
575 | snapid_t snap_follows; | |
576 | int dirty_caps; | |
577 | reconnected_cap_info_t() : | |
578 | realm_ino(0), snap_follows(0), dirty_caps(0) {} | |
579 | }; | |
580 | map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino | |
581 | map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq | |
582 | ||
583 | void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) { | |
584 | reconnected_cap_info_t &info = reconnected_caps[ino][client]; | |
585 | info.realm_ino = inodeno_t(icr.capinfo.snaprealm); | |
586 | info.snap_follows = icr.snap_follows; | |
587 | } | |
588 | void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty) { | |
589 | reconnected_cap_info_t &info = reconnected_caps[ino][client]; | |
590 | info.dirty_caps |= dirty; | |
591 | } | |
592 | void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { | |
593 | reconnected_snaprealms[ino][client] = seq; | |
594 | } | |
595 | ||
596 | friend class C_MDC_RejoinOpenInoFinish; | |
597 | friend class C_MDC_RejoinSessionsOpened; | |
598 | void rejoin_open_ino_finish(inodeno_t ino, int ret); | |
599 | void rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map, | |
600 | map<client_t,uint64_t>& sseqmap); | |
601 | bool process_imported_caps(); | |
602 | void choose_lock_states_and_reconnect_caps(); | |
603 | void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, | |
604 | map<client_t,MClientSnap*>& splits); | |
605 | void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend=false); | |
606 | void send_snaps(map<client_t,MClientSnap*>& splits); | |
607 | Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); | |
608 | void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq); | |
609 | void try_reconnect_cap(CInode *in, Session *session); | |
610 | void export_remaining_imported_caps(); | |
611 | ||
612 | // cap imports. delayed snap parent opens. | |
613 | // realm inode -> client -> cap inodes needing to split to this realm | |
614 | map<CInode*,set<CInode*> > missing_snap_parents; | |
615 | map<client_t,set<CInode*> > delayed_imported_caps; | |
616 | ||
617 | void do_cap_import(Session *session, CInode *in, Capability *cap, | |
618 | uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq, | |
619 | int peer, int p_flags); | |
620 | void do_delayed_cap_imports(); | |
621 | void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client, | |
622 | snapid_t snap_follows); | |
623 | void check_realm_past_parents(SnapRealm *realm, bool reconnect); | |
624 | void open_snap_parents(); | |
625 | ||
626 | bool open_undef_inodes_dirfrags(); | |
627 | void opened_undef_inode(CInode *in); | |
628 | void opened_undef_dirfrag(CDir *dir) { | |
629 | rejoin_undef_dirfrags.erase(dir); | |
630 | } | |
631 | ||
632 | void reissue_all_caps(); | |
633 | ||
634 | ||
635 | friend class Locker; | |
636 | friend class Migrator; | |
637 | friend class MDBalancer; | |
638 | ||
639 | // StrayManager needs to be able to remove_inode() from us | |
640 | // when it is done purging | |
641 | friend class StrayManager; | |
642 | ||
643 | // File size recovery | |
644 | private: | |
645 | RecoveryQueue recovery_queue; | |
646 | void identify_files_to_recover(); | |
647 | public: | |
648 | void start_files_to_recover(); | |
649 | void do_file_recover(); | |
650 | void queue_file_recover(CInode *in); | |
651 | void _queued_file_recover_cow(CInode *in, MutationRef& mut); | |
652 | ||
653 | // subsystems | |
654 | std::unique_ptr<Migrator> migrator; | |
655 | ||
656 | public: | |
657 | explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_); | |
658 | ~MDCache(); | |
659 | ||
660 | // debug | |
661 | void log_stat(); | |
662 | ||
663 | // root inode | |
664 | CInode *get_root() { return root; } | |
665 | CInode *get_myin() { return myin; } | |
666 | ||
667 | // cache | |
668 | void set_cache_size(size_t max) { lru.lru_set_max(max); } | |
669 | size_t get_cache_size() { return lru.lru_get_size(); } | |
670 | ||
671 | // trimming | |
672 | bool trim(int max=-1, int count=-1); // trim cache | |
673 | bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap); | |
674 | void trim_dirfrag(CDir *dir, CDir *con, | |
675 | map<mds_rank_t, MCacheExpire*>& expiremap); | |
676 | bool trim_inode(CDentry *dn, CInode *in, CDir *con, | |
677 | map<mds_rank_t,class MCacheExpire*>& expiremap); | |
678 | void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap); | |
679 | void trim_non_auth(); // trim out trimmable non-auth items | |
680 | bool trim_non_auth_subtree(CDir *directory); | |
681 | void standby_trim_segment(LogSegment *ls); | |
682 | void try_trim_non_auth_subtree(CDir *dir); | |
683 | bool can_trim_non_auth_dirfrag(CDir *dir) { | |
684 | return my_ambiguous_imports.count((dir)->dirfrag()) == 0 && | |
685 | uncommitted_slave_rename_olddir.count(dir->inode) == 0; | |
686 | } | |
687 | ||
688 | /** | |
689 | * For all unreferenced inodes, dirs, dentries below an inode, compose | |
690 | * expiry messages. This is used when giving up all replicas of entities | |
691 | * for an MDS peer in the 'stopping' state, such that the peer can | |
692 | * empty its cache and finish shutting down. | |
693 | * | |
694 | * We have to make sure we're only expiring un-referenced items to | |
695 | * avoid interfering with ongoing stray-movement (we can't distinguish | |
696 | * between the "moving my strays" and "waiting for my cache to empty" | |
697 | * phases within 'stopping') | |
698 | * | |
699 | * @return false if we completed cleanly, true if caller should stop | |
700 | * expiring because we hit something with refs. | |
701 | */ | |
702 | bool expire_recursive( | |
703 | CInode *in, | |
704 | std::map<mds_rank_t, MCacheExpire*>& expiremap); | |
705 | ||
706 | void trim_client_leases(); | |
707 | void check_memory_usage(); | |
708 | ||
709 | utime_t last_recall_state; | |
710 | ||
711 | // shutdown | |
712 | private: | |
713 | set<inodeno_t> shutdown_exported_strays; | |
714 | public: | |
715 | void shutdown_start(); | |
716 | void shutdown_check(); | |
717 | bool shutdown_pass(); | |
718 | bool shutdown_export_strays(); | |
719 | bool shutdown(); // clear cache (ie at shutodwn) | |
720 | ||
721 | bool did_shutdown_log_cap; | |
722 | ||
723 | // inode_map | |
724 | bool have_inode(vinodeno_t vino) { | |
725 | return inode_map.count(vino) ? true:false; | |
726 | } | |
727 | bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { | |
728 | return have_inode(vinodeno_t(ino, snap)); | |
729 | } | |
730 | CInode* get_inode(vinodeno_t vino) { | |
731 | if (have_inode(vino)) | |
732 | return inode_map[vino]; | |
733 | return NULL; | |
734 | } | |
735 | CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) { | |
736 | return get_inode(vinodeno_t(ino, s)); | |
737 | } | |
738 | ||
739 | CDir* get_dirfrag(dirfrag_t df) { | |
740 | CInode *in = get_inode(df.ino); | |
741 | if (!in) | |
742 | return NULL; | |
743 | return in->get_dirfrag(df.frag); | |
744 | } | |
745 | CDir* get_dirfrag(inodeno_t ino, const string& dn) { | |
746 | CInode *in = get_inode(ino); | |
747 | if (!in) | |
748 | return NULL; | |
749 | frag_t fg = in->pick_dirfrag(dn); | |
750 | return in->get_dirfrag(fg); | |
751 | } | |
752 | CDir* get_force_dirfrag(dirfrag_t df, bool replay) { | |
753 | CInode *diri = get_inode(df.ino); | |
754 | if (!diri) | |
755 | return NULL; | |
756 | CDir *dir = force_dir_fragment(diri, df.frag, replay); | |
757 | if (!dir) | |
758 | dir = diri->get_dirfrag(df.frag); | |
759 | return dir; | |
760 | } | |
761 | ||
762 | MDSCacheObject *get_object(MDSCacheObjectInfo &info); | |
763 | ||
764 | ||
765 | ||
766 | public: | |
767 | void add_inode(CInode *in); | |
768 | ||
769 | void remove_inode(CInode *in); | |
770 | protected: | |
771 | void touch_inode(CInode *in) { | |
772 | if (in->get_parent_dn()) | |
773 | touch_dentry(in->get_projected_parent_dn()); | |
774 | } | |
775 | public: | |
776 | void touch_dentry(CDentry *dn) { | |
31f18b77 FG |
777 | if (dn->state_test(CDentry::STATE_BOTTOMLRU)) { |
778 | bottom_lru.lru_midtouch(dn); | |
779 | } else { | |
780 | if (dn->is_auth()) | |
781 | lru.lru_touch(dn); | |
782 | else | |
783 | lru.lru_midtouch(dn); | |
784 | } | |
7c673cae FG |
785 | } |
786 | void touch_dentry_bottom(CDentry *dn) { | |
31f18b77 FG |
787 | if (dn->state_test(CDentry::STATE_BOTTOMLRU)) |
788 | return; | |
7c673cae | 789 | lru.lru_bottouch(dn); |
7c673cae FG |
790 | } |
791 | protected: | |
792 | ||
793 | void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin, | |
794 | set<SimpleLock *>& gather_locks); | |
795 | void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks); | |
796 | ||
797 | void rename_file(CDentry *srcdn, CDentry *destdn); | |
798 | ||
799 | public: | |
800 | // truncate | |
801 | void truncate_inode(CInode *in, LogSegment *ls); | |
802 | void _truncate_inode(CInode *in, LogSegment *ls); | |
803 | void truncate_inode_finish(CInode *in, LogSegment *ls); | |
804 | void truncate_inode_logged(CInode *in, MutationRef& mut); | |
805 | ||
806 | void add_recovered_truncate(CInode *in, LogSegment *ls); | |
807 | void remove_recovered_truncate(CInode *in, LogSegment *ls); | |
808 | void start_recovered_truncates(); | |
809 | ||
810 | ||
811 | public: | |
812 | CDir *get_auth_container(CDir *in); | |
813 | CDir *get_export_container(CDir *dir); | |
814 | void find_nested_exports(CDir *dir, set<CDir*>& s); | |
815 | void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s); | |
816 | ||
817 | ||
818 | private: | |
819 | bool opening_root, open; | |
820 | list<MDSInternalContextBase*> waiting_for_open; | |
821 | ||
822 | public: | |
823 | void init_layouts(); | |
824 | void create_unlinked_system_inode(CInode *in, inodeno_t ino, | |
825 | int mode) const; | |
826 | CInode *create_system_inode(inodeno_t ino, int mode); | |
827 | CInode *create_root_inode(); | |
828 | ||
829 | void create_empty_hierarchy(MDSGather *gather); | |
830 | void create_mydir_hierarchy(MDSGather *gather); | |
831 | ||
/// True once the root/mydir hierarchy has been opened.
bool is_open() { return open; }
/// Queue a context to run when the cache becomes open.
void wait_for_open(MDSInternalContextBase *c) {
  waiting_for_open.push_back(c);
}
836 | ||
837 | void open_root_inode(MDSInternalContextBase *c); | |
838 | void open_root(); | |
839 | void open_mydir_inode(MDSInternalContextBase *c); | |
840 | void populate_mydir(); | |
841 | ||
842 | void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin); | |
843 | void _create_system_file_finish(MutationRef& mut, CDentry *dn, | |
844 | version_t dpv, MDSInternalContextBase *fin); | |
845 | ||
846 | void open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *c); | |
847 | CDir *get_stray_dir(CInode *in); | |
848 | CDentry *get_or_create_stray_dentry(CInode *in); | |
849 | ||
850 | MDSInternalContextBase *_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin); | |
851 | ||
852 | /** | |
853 | * Find the given dentry (and whether it exists or not), its ancestors, | |
854 | * and get them all into memory and usable on this MDS. This function | |
855 | * makes a best-effort attempt to load everything; if it needs to | |
856 | * go away and do something then it will put the request on a waitlist. | |
857 | * It prefers the mdr, then the req, then the fin. (At least one of these | |
858 | * must be non-null.) | |
859 | * | |
860 | * At least one of the params mdr, req, and fin must be non-null. | |
861 | * | |
862 | * @param mdr The MDRequest associated with the path. Can be null. | |
863 | * @param req The Message associated with the path. Can be null. | |
864 | * @param fin The Context associated with the path. Can be null. | |
865 | * @param path The path to traverse to. | |
866 | * @param pdnvec Data return parameter -- on success, contains a | |
867 | * vector of dentries. On failure, is either empty or contains the | |
868 | * full trace of traversable dentries. | |
869 | * @param pin Data return parameter -- if successful, points to the inode | |
870 | * associated with filepath. If unsuccessful, is null. | |
871 | * @param onfail Specifies different lookup failure behaviors. If set to | |
872 | * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null | |
873 | * dentries (instead of returning -ENOENT). If set to | |
874 | * MDS_TRAVERSE_FORWARD, it will forward the request to the auth | |
875 | * MDS if that becomes appropriate (ie, if it doesn't know the contents | |
876 | * of a directory). If set to MDS_TRAVERSE_DISCOVER, it | |
877 | * will attempt to look up the path from a different MDS (and bring them | |
878 | * into its cache as replicas). | |
879 | * | |
880 | * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise. | |
881 | * If it returns 1, the requester associated with this call has been placed | |
882 | * on the appropriate waitlist, and it should unwind itself and back out. | |
883 | * If it returns 2 the request has been forwarded, and again the requester | |
884 | * should unwind itself and back out. | |
885 | */ | |
886 | int path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, const filepath& path, | |
887 | vector<CDentry*> *pdnvec, CInode **pin, int onfail); | |
888 | ||
889 | CInode *cache_traverse(const filepath& path); | |
890 | ||
891 | void open_remote_dirfrag(CInode *diri, frag_t fg, MDSInternalContextBase *fin); | |
892 | CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false); | |
893 | ||
894 | bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing); | |
895 | bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, | |
896 | set<CDir*>& fetch_queue, set<inodeno_t>& missing, | |
897 | C_GatherBuilder &gather_bld); | |
898 | ||
899 | void open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, | |
900 | bool want_xlocked=false); | |
901 | void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin, | |
902 | bool want_xlocked, int r); | |
903 | ||
904 | void make_trace(vector<CDentry*>& trace, CInode *in); | |
905 | ||
906 | protected: | |
// Per-inode state for an in-flight open-by-ino operation.
struct open_ino_info_t {
  vector<inode_backpointer_t> ancestors; // backtrace: parents up toward root
  set<mds_rank_t> checked;           // peer ranks already queried
  mds_rank_t checking;               // rank currently being queried, or MDS_RANK_NONE
  mds_rank_t auth_hint;              // hinted likely-auth rank, or MDS_RANK_NONE
  bool check_peers;                  // still need to ask peer MDSs?
  bool fetch_backtrace;              // still need to read the on-disk backtrace?
  bool discover;
  bool want_replica;                 // caller wants a replica in our cache
  bool want_xlocked;
  version_t tid;
  int64_t pool;                      // pool holding the backtrace object; -1 = unknown
  int last_err;                      // most recent error seen, reported on failure
  list<MDSInternalContextBase*> waiters; // contexts to finish when done
  open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
    check_peers(true), fetch_backtrace(true), discover(false),
    want_replica(false), want_xlocked(false), tid(0), pool(-1),
    last_err(0) {}
};
926 | ceph_tid_t open_ino_last_tid; | |
927 | map<inodeno_t,open_ino_info_t> opening_inodes; | |
928 | ||
929 | void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); | |
930 | void _open_ino_parent_opened(inodeno_t ino, int ret); | |
931 | void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); | |
932 | void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent); | |
933 | int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, | |
934 | vector<inode_backpointer_t>& ancestors, | |
935 | bool discover, bool want_xlocked, mds_rank_t *hint); | |
936 | void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); | |
937 | void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); | |
938 | void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); | |
939 | void handle_open_ino(MMDSOpenIno *m, int err=0); | |
940 | void handle_open_ino_reply(MMDSOpenInoReply *m); | |
941 | friend class C_IO_MDC_OpenInoBacktraceFetched; | |
942 | friend struct C_MDC_OpenInoTraverseDir; | |
943 | friend struct C_MDC_OpenInoParentOpened; | |
944 | ||
945 | public: | |
946 | void kick_open_ino_peers(mds_rank_t who); | |
947 | void open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase *fin, | |
948 | bool want_replica=true, bool want_xlocked=false); | |
949 | ||
950 | // -- find_ino_peer -- | |
// Per-query state for locating which peer MDS has a given inode.
struct find_ino_peer_info_t {
  inodeno_t ino;                 // inode being searched for
  ceph_tid_t tid;                // transaction id of this query
  MDSInternalContextBase *fin;   // completion to run when resolved
  mds_rank_t hint;               // rank to try first, or MDS_RANK_NONE
  mds_rank_t checking;           // rank currently being queried
  set<mds_rank_t> checked;       // ranks already queried

  find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
};
961 | ||
962 | map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer; | |
963 | ceph_tid_t find_ino_peer_last_tid; | |
964 | ||
965 | void find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint=MDS_RANK_NONE); | |
966 | void _do_find_ino_peer(find_ino_peer_info_t& fip); | |
967 | void handle_find_ino(MMDSFindIno *m); | |
968 | void handle_find_ino_reply(MMDSFindInoReply *m); | |
969 | void kick_find_ino_peers(mds_rank_t who); | |
970 | ||
971 | // -- snaprealms -- | |
972 | public: | |
973 | void snaprealm_create(MDRequestRef& mdr, CInode *in); | |
974 | void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in); | |
975 | ||
976 | // -- stray -- | |
977 | public: | |
7c673cae FG |
978 | void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); |
979 | uint64_t get_num_strays() const { return stray_manager.get_num_strays(); } | |
980 | ||
981 | protected: | |
982 | void scan_stray_dir(dirfrag_t next=dirfrag_t()); | |
983 | StrayManager stray_manager; | |
984 | friend struct C_MDC_RetryScanStray; | |
985 | friend class C_IO_MDC_FetchedBacktrace; | |
986 | ||
987 | // == messages == | |
988 | public: | |
989 | void dispatch(Message *m); | |
990 | ||
991 | protected: | |
992 | // -- replicas -- | |
993 | void handle_discover(MDiscover *dis); | |
994 | void handle_discover_reply(MDiscoverReply *m); | |
995 | friend class C_MDC_Join; | |
996 | ||
997 | public: | |
/// Encode a replica of a CDir for rank `to`: dirfrag id first, then
/// the dir's replica payload.
void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) {
  dirfrag_t df = dir->dirfrag();
  ::encode(df, bl);
  dir->encode_replica(to, bl);
}
/// Encode a replica of a CDentry: (name, last snapid) key, then the
/// dentry's replica payload.
void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) {
  ::encode(dn->name, bl);
  ::encode(dn->last, bl);
  dn->encode_replica(to, bl);
}
/// Encode a replica of a CInode, honoring the peer's feature bits.
void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
		     uint64_t features) {
  ::encode(in->inode.ino, bl);  // bleh, minor asymmetry here
  ::encode(in->last, bl);
  in->encode_replica(to, bl, features);
}
1014 | ||
1015 | CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished); | |
7c673cae FG |
1016 | CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished); |
1017 | CInode *add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished); | |
1018 | ||
1019 | void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl); | |
1020 | CDentry *add_replica_stray(bufferlist &bl, mds_rank_t from); | |
1021 | ||
1022 | // -- namespace -- | |
1023 | public: | |
1024 | void send_dentry_link(CDentry *dn, MDRequestRef& mdr); | |
1025 | void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); | |
1026 | protected: | |
1027 | void handle_dentry_link(MDentryLink *m); | |
1028 | void handle_dentry_unlink(MDentryUnlink *m); | |
1029 | ||
1030 | ||
1031 | // -- fragmenting -- | |
1032 | private: | |
// A fragment operation that has been journaled but not yet committed;
// kept so it can be replayed or rolled back after a failure.
struct ufragment {
  int bits;                      // split (+) / merge (-) bit delta
  bool committed;                // has the commit been journaled?
  LogSegment *ls;                // journal segment pinning this entry
  list<MDSInternalContextBase*> waiters; // contexts waiting for completion
  list<frag_t> old_frags;        // pre-operation fragments, for rollback
  bufferlist rollback;           // encoded rollback payload
  ufragment() : bits(0), committed(false), ls(NULL) {}
};
1042 | map<dirfrag_t, ufragment> uncommitted_fragments; | |
1043 | ||
// In-memory state for a fragment (split/merge) operation in progress.
struct fragment_info_t {
  int bits;                      // split (+) / merge (-) bit delta
  list<CDir*> dirs;              // source dirfrags being fragmented
  list<CDir*> resultfrags;       // populated once fragmenting has begun
  MDRequestRef mdr;              // internal request driving the operation
  // for deadlock detection
  bool all_frozen;
  utime_t last_cum_auth_pins_change;
  int last_cum_auth_pins;
  int num_remote_waiters;	// number of remote authpin waiters
  fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
  bool is_fragmenting() { return !resultfrags.empty(); }
};
1057 | map<dirfrag_t,fragment_info_t> fragments; | |
1058 | ||
1059 | void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, | |
1060 | list<CDir*>& frags, list<MDSInternalContextBase*>& waiters, bool replay); | |
1061 | void adjust_dir_fragments(CInode *diri, | |
1062 | list<CDir*>& srcfrags, | |
1063 | frag_t basefrag, int bits, | |
1064 | list<CDir*>& resultfrags, | |
1065 | list<MDSInternalContextBase*>& waiters, | |
1066 | bool replay); | |
1067 | CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true); | |
1068 | void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds); | |
1069 | ||
1070 | bool can_fragment(CInode *diri, list<CDir*>& dirs); | |
1071 | void fragment_freeze_dirs(list<CDir*>& dirs); | |
1072 | void fragment_mark_and_complete(MDRequestRef& mdr); | |
1073 | void fragment_frozen(MDRequestRef& mdr, int r); | |
1074 | void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs); | |
1075 | void dispatch_fragment_dir(MDRequestRef& mdr); | |
1076 | void _fragment_logged(MDRequestRef& mdr); | |
1077 | void _fragment_stored(MDRequestRef& mdr); | |
1078 | void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags); | |
1079 | void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags); | |
1080 | ||
1081 | friend class EFragment; | |
1082 | friend class C_MDC_FragmentFrozen; | |
1083 | friend class C_MDC_FragmentMarking; | |
1084 | friend class C_MDC_FragmentPrep; | |
1085 | friend class C_MDC_FragmentStore; | |
1086 | friend class C_MDC_FragmentCommit; | |
1087 | friend class C_IO_MDC_FragmentFinish; | |
1088 | ||
1089 | void handle_fragment_notify(MMDSFragmentNotify *m); | |
1090 | ||
1091 | void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag, | |
1092 | LogSegment *ls, bufferlist *rollback=NULL); | |
1093 | void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); | |
1094 | void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags); | |
1095 | public: | |
1096 | void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) { | |
1097 | assert(uncommitted_fragments.count(dirfrag)); | |
1098 | uncommitted_fragments[dirfrag].waiters.push_back(c); | |
1099 | } | |
1100 | void split_dir(CDir *dir, int byn); | |
1101 | void merge_dir(CInode *diri, frag_t fg); | |
1102 | void rollback_uncommitted_fragments(); | |
1103 | ||
1104 | void find_stale_fragment_freeze(); | |
1105 | void fragment_freeze_inc_num_waiters(CDir *dir); | |
1106 | bool fragment_are_all_frozen(CDir *dir); | |
1107 | int get_num_fragmenting_dirs() { return fragments.size(); } | |
1108 | ||
1109 | // -- updates -- | |
1110 | //int send_inode_updates(CInode *in); | |
1111 | //void handle_inode_update(MInodeUpdate *m); | |
1112 | ||
1113 | int send_dir_updates(CDir *in, bool bcast=false); | |
1114 | void handle_dir_update(MDirUpdate *m); | |
1115 | ||
1116 | // -- cache expiration -- | |
1117 | void handle_cache_expire(MCacheExpire *m); | |
1118 | void process_delayed_expire(CDir *dir); | |
1119 | void discard_delayed_expire(CDir *dir); | |
1120 | ||
1121 | protected: | |
31f18b77 | 1122 | int dump_cache(const char *fn, Formatter *f, |
7c673cae FG |
1123 | const std::string& dump_root = "", |
1124 | int depth = -1); | |
1125 | public: | |
31f18b77 FG |
1126 | int dump_cache() { return dump_cache(NULL, NULL); } |
1127 | int dump_cache(const std::string &filename); | |
1128 | int dump_cache(Formatter *f); | |
1129 | int dump_cache(const std::string& dump_root, int depth, Formatter *f); | |
7c673cae FG |
1130 | |
1131 | void dump_resolve_status(Formatter *f) const; | |
1132 | void dump_rejoin_status(Formatter *f) const; | |
1133 | ||
1134 | // == crap fns == | |
1135 | public: | |
1136 | void show_cache(); | |
1137 | void show_subtrees(int dbl=10); | |
1138 | ||
/// Debug helper: return a uniformly random cached inode.  Walks the
/// map iterator a random number of steps — O(n), fine for a hack.
CInode *hack_pick_random_inode() {
  assert(!inode_map.empty());
  int n = rand() % inode_map.size();
  ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
  while (n--) ++p;
  return p->second;
}
1146 | ||
1147 | protected: | |
1148 | void flush_dentry_work(MDRequestRef& mdr); | |
1149 | /** | |
1150 | * Resolve path to a dentry and pass it onto the ScrubStack. | |
1151 | * | |
1152 | * TODO: return enough information to the original mdr formatter | |
1153 | * and completion that they can subsequently check the progress of | |
1154 | * this scrub (we won't block them on a whole scrub as it can take a very | |
1155 | * long time) | |
1156 | */ | |
1157 | void enqueue_scrub_work(MDRequestRef& mdr); | |
1158 | void repair_inode_stats_work(MDRequestRef& mdr); | |
1159 | void repair_dirfrag_stats_work(MDRequestRef& mdr); | |
1160 | friend class C_MDC_RepairDirfragStats; | |
1161 | public: | |
1162 | void flush_dentry(const string& path, Context *fin); | |
1163 | /** | |
1164 | * Create and start an OP_ENQUEUE_SCRUB | |
1165 | */ | |
1166 | void enqueue_scrub(const string& path, const std::string &tag, | |
1167 | bool force, bool recursive, bool repair, | |
1168 | Formatter *f, Context *fin); | |
1169 | void repair_inode_stats(CInode *diri); | |
1170 | void repair_dirfrag_stats(CDir *dir); | |
1171 | ||
1172 | public: | |
1173 | /* Because exports may fail, this set lets us keep track of inodes that need exporting. */ | |
1174 | std::set<CInode *> export_pin_queue; | |
1175 | }; | |
1176 | ||
/// Context that re-dispatches an MDRequest once whatever it was
/// waiting on (lock, loaded object, etc.) becomes available.
/// finish() is defined out of line (in MDCache.cc).
class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;   // cache to re-dispatch through
  MDRequestRef mdr; // request to retry
public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
  void finish(int r) override;
};
1184 | ||
1185 | #endif |