// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */



#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H

#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"

#include "osdc/Filer.h"
#include "CInode.h"
#include "CDentry.h"
#include "CDir.h"
#include "include/Context.h"
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"

#include "messages/MClientRequest.h"
#include "messages/MMDSSlaveRequest.h"

class PerfCounters;

class MDSRank;
class Session;
class Migrator;

class Message;

class MMDSResolve;
class MMDSResolveAck;
class MMDSCacheRejoin;
class MDiscover;
class MDiscoverReply;
class MCacheExpire;
class MDirUpdate;
class MDentryLink;
class MDentryUnlink;
class MLock;
struct MMDSFindIno;
struct MMDSFindInoReply;
struct MMDSOpenIno;
struct MMDSOpenInoReply;

class MClientRequest;
class MMDSSlaveRequest;
struct MClientSnap;

class MMDSFragmentNotify;

class ESubtreeMap;

enum {
  l_mdc_first = 3000,
  // How many inodes are currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to the stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated
  l_mdc_strays_reintegrated,
  // How many strays have been migrated
  l_mdc_strays_migrated,

  // How many inode sizes are currently being recovered
  l_mdc_num_recovering_processing,
  // How many inodes are currently waiting to have their size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes are waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes have ever started size recovery
  l_mdc_recovery_started,
  // How many inodes have ever completed size recovery
  l_mdc_recovery_completed,

  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};
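
// Illustrative sketch (not part of the upstream header): register_perfcounters()
// is expected to turn the enum above into a PerfCounters instance spanning
// [l_mdc_first, l_mdc_last). A hedged example using the standard
// PerfCountersBuilder API; the counter names here are for illustration only:
//
//   PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);
//   pcb.add_u64(l_mdc_num_strays, "num_strays");                  // gauge
//   pcb.add_u64_counter(l_mdc_strays_created, "strays_created");  // monotonic
//   PerfCounters *pc = pcb.create_perf_counters();
//   g_ceph_context->get_perfcounters_collection()->add(pc);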


// flags for predirty_journal_parents()
static const int PREDIRTY_PRIMARY = 1;  // primary dn, adjust nested accounting
static const int PREDIRTY_DIR = 2;      // update parent dir mtime/size
static const int PREDIRTY_SHALLOW = 4;  // only go to immediate parent (for easier rollback)
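
// Usage sketch (illustrative): callers OR these flags together. A typical
// primary-dentry update that also bumps the parent directory's mtime/size
// might look like the following (hedged; 'mut', 'le', 'in' and 'dn' are
// assumed to come from the surrounding request context):
//
//   predirty_journal_parents(mut, &le->metablob, in, dn->get_dir(),
//                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1 /*linkunlink*/);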

class MDCache {
public:
  // my master
  MDSRank *mds;

  // -- my cache --
  LRU lru;         // dentry lru for expiring items from cache
  LRU bottom_lru;  // dentries that should be trimmed ASAP
protected:
  ceph::unordered_map<vinodeno_t,CInode*> inode_map;  // map of inodes by ino
  CInode *root;  // root inode
  CInode *myin;  // .ceph/mds%d dir

  bool readonly;
  void set_readonly() { readonly = true; }

  CInode *strays[NUM_STRAY];  // my stray dir
  int stray_index;

  CInode *get_stray() {
    return strays[stray_index];
  }

  set<CInode*> base_inodes;

  std::unique_ptr<PerfCounters> logger;

  Filer filer;

  bool exceeded_size_limit;

public:
  static uint64_t cache_limit_inodes(void) {
    return g_conf->get_val<int64_t>("mds_cache_size");
  }
  static uint64_t cache_limit_memory(void) {
    return g_conf->get_val<uint64_t>("mds_cache_memory_limit");
  }
  static double cache_reservation(void) {
    return g_conf->get_val<double>("mds_cache_reservation");
  }
  static double cache_mid(void) {
    return g_conf->get_val<double>("mds_cache_mid");
  }
  static double cache_health_threshold(void) {
    return g_conf->get_val<double>("mds_health_cache_threshold");
  }
  double cache_toofull_ratio(void) const {
    uint64_t inode_limit = cache_limit_inodes();
    double inode_reserve = inode_limit*(1.0-cache_reservation());
    double memory_reserve = cache_limit_memory()*(1.0-cache_reservation());
    return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
  }
  bool cache_toofull(void) const {
    return cache_toofull_ratio() > 0.0;
  }
  uint64_t cache_size(void) const {
    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
  }
  bool cache_overfull(void) const {
    uint64_t inode_limit = cache_limit_inodes();
    return (inode_limit > 0 && CInode::count() > inode_limit*cache_health_threshold()) || (cache_size() > cache_limit_memory()*cache_health_threshold());
  }
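
  // Worked example (illustrative): with mds_cache_memory_limit = 1 GiB and
  // mds_cache_reservation = 0.05, memory_reserve is ~0.95 GiB, so
  // cache_toofull_ratio() turns positive as soon as allocated bytes exceed
  // that reserve. cache_overfull() trips later, once usage passes
  // mds_health_cache_threshold (1.5 by default in this release) times the
  // configured limit.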

  void advance_stray() {
    stray_index = (stray_index+1)%NUM_STRAY;
  }

  void activate_stray_manager();

  /**
   * Call this when you know that a CDentry is ready to be passed
   * on to StrayManager (i.e. this is a stray you've just created).
   */
  void notify_stray(CDentry *dn) {
    assert(dn->get_dir()->get_inode()->is_stray());
    stray_manager.eval_stray(dn);
  }

  void maybe_eval_stray(CInode *in, bool delay=false);
  void clear_dirty_bits_for_stray(CInode* diri);

  bool is_readonly() { return readonly; }
  void force_readonly();

  DecayRate decayrate;

  int num_inodes_with_caps;

  unsigned max_dir_commit_size;

  static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
  static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);

  file_layout_t default_file_layout;
  file_layout_t default_log_layout;

  void register_perfcounters();

  // -- client leases --
public:
  static const int client_lease_pools = 3;
  float client_lease_durations[client_lease_pools];
protected:
  xlist<ClientLease*> client_leases[client_lease_pools];
public:
  void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
    client_leases[pool].push_back(&r->item_lease);
    r->ttl = ttl;
  }

  void notify_stray_removed()
  {
    stray_manager.notify_stray_removed();
  }

  void notify_stray_created()
  {
    stray_manager.notify_stray_created();
  }

  void eval_remote(CDentry *dn)
  {
    stray_manager.eval_remote(dn);
  }

  // -- client caps --
  uint64_t last_cap_id;



  // -- discover --
  struct discover_info_t {
    ceph_tid_t tid;
    mds_rank_t mds;
    inodeno_t ino;
    frag_t frag;
    snapid_t snap;
    filepath want_path;
    CInode *basei;
    bool want_base_dir;
    bool want_xlocked;

    discover_info_t() :
      tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL),
      want_base_dir(false), want_xlocked(false) {}
    ~discover_info_t() {
      if (basei)
        basei->put(MDSCacheObject::PIN_DISCOVERBASE);
    }
    void pin_base(CInode *b) {
      basei = b;
      basei->get(MDSCacheObject::PIN_DISCOVERBASE);
    }
  };

  map<ceph_tid_t, discover_info_t> discovers;
  ceph_tid_t discover_last_tid;

  void _send_discover(discover_info_t& dis);
  discover_info_t& _create_discover(mds_rank_t mds) {
    ceph_tid_t t = ++discover_last_tid;
    discover_info_t& d = discovers[t];
    d.tid = t;
    d.mds = mds;
    return d;
  }
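
  // Usage sketch (illustrative): how the discover helpers above are meant to
  // compose; 'target', 'base' and 'fg' are hypothetical local variables:
  //
  //   discover_info_t& d = _create_discover(target);  // allocates tid, sets mds
  //   d.ino = base->ino();
  //   d.frag = fg;
  //   d.pin_base(base);  // pin held until the discovers entry is erased
  //   _send_discover(d);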

  // waiters
  map<int, map<inodeno_t, list<MDSInternalContextBase*> > > waiting_for_base_ino;

  void discover_base_ino(inodeno_t want_ino, MDSInternalContextBase *onfinish, mds_rank_t from=MDS_RANK_NONE);
  void discover_dir_frag(CInode *base, frag_t approx_fg, MDSInternalContextBase *onfinish,
                         mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish,
                     bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish,
                     bool want_xlocked=false);
  void kick_discovers(mds_rank_t who);  // after a failure.


  // -- subtrees --
protected:
  /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
  map<CDir*,set<CDir*> > subtrees;
  map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir

  // adjust subtree auth specification
  //  dir->dir_auth
  //  imports/exports/nested_exports
  //  join/split subtrees as appropriate
public:
  bool is_subtrees() { return !subtrees.empty(); }
  void list_subtrees(list<CDir*>& ls);
  void adjust_subtree_auth(CDir *root, mds_authority_t auth);
  void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
    adjust_subtree_auth(root, mds_authority_t(a,b));
  }
  void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
  void try_subtree_merge(CDir *root);
  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval);
  void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
  void eval_subtree_root(CInode *diri);
  CDir *get_subtree_root(CDir *dir);
  CDir *get_projected_subtree_root(CDir *dir);
  bool is_leaf_subtree(CDir *dir) {
    assert(subtrees.count(dir));
    return subtrees[dir].empty();
  }
  void remove_subtree(CDir *dir);
  bool is_subtree(CDir *root) {
    return subtrees.count(root);
  }
  void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);

  void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
  void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);

  void get_auth_subtrees(set<CDir*>& s);
  void get_fullauth_subtrees(set<CDir*>& s);

  int num_subtrees();
  int num_subtrees_fullauth();
  int num_subtrees_fullnonauth();


protected:
  // delayed cache expire
  map<CDir*, map<mds_rank_t, MCacheExpire*> > delayed_expire;  // subtree root -> expire msg


  // -- requests --
  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;

public:
  int get_num_client_requests();

  MDRequestRef request_start(MClientRequest *req);
  MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, Message *m);
  MDRequestRef request_start_internal(int op);
  bool have_request(metareqid_t rid) {
    return active_requests.count(rid);
  }
  MDRequestRef request_get(metareqid_t rid);
  void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
  void request_finish(MDRequestRef& mdr);
  void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
  void dispatch_request(MDRequestRef& mdr);
  void request_drop_foreign_locks(MDRequestRef& mdr);
  void request_drop_non_rdlocks(MDRequestRef& r);
  void request_drop_locks(MDRequestRef& r);
  void request_cleanup(MDRequestRef& r);

  void request_kill(MDRequestRef& r);  // called when session closes

  // journal/snap helpers
  CInode *pick_inode_snap(CInode *in, snapid_t follows);
  CInode *cow_inode(CInode *in, snapid_t last);
  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
                          snapid_t follows=CEPH_NOSNAP,
                          CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
  void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
                         CInode **pcow_inode=0);
  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);

  void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                   int linkunlink, SnapRealm *prealm);
  void _project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
                                    CDir *parent, int linkunlink, bool update_inode);
  void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                   snapid_t ofirst, snapid_t last,
                                   CInode *pin, bool cow_head);
  void broadcast_quota_to_client(CInode *in);
  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                CInode *in, CDir *parent,
                                int flags, int linkunlink=0,
                                snapid_t follows=CEPH_NOSNAP);

  // slaves
  void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
    uncommitted_masters[reqid].ls = ls;
    uncommitted_masters[reqid].slaves = slaves;
    uncommitted_masters[reqid].safe = safe;
  }
  void wait_for_uncommitted_master(metareqid_t reqid, MDSInternalContextBase *c) {
    uncommitted_masters[reqid].waiters.push_back(c);
  }
  bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
    auto p = uncommitted_masters.find(reqid);
    return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0;
  }
  void log_master_commit(metareqid_t reqid);
  void logged_master_update(metareqid_t reqid);
  void _logged_master_commit(metareqid_t reqid);
  void committed_master_slave(metareqid_t r, mds_rank_t from);
  void finish_committed_masters();

  void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);

  // -- recovery --
protected:
  set<mds_rank_t> recovery_set;

public:
  void set_recovery_set(set<mds_rank_t>& s);
  void handle_mds_failure(mds_rank_t who);
  void handle_mds_recovery(mds_rank_t who);

protected:
  // [resolve]
  // from EImportStart w/o EImportFinish during journal replay
  map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
  // from MMDSResolves
  map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;

  map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates;  // slave: for replay.
  map<CInode*, int> uncommitted_slave_rename_olddir;  // slave: preserve the non-auth dir until seeing commit.
  map<CInode*, int> uncommitted_slave_unlink;  // slave: preserve the unlinked inode until seeing commit.

  // track master requests whose slaves haven't acknowledged commit
  struct umaster {
    set<mds_rank_t> slaves;
    LogSegment *ls;
    list<MDSInternalContextBase*> waiters;
    bool safe;
    bool committing;
    bool recovering;
    umaster() : ls(NULL), safe(false), committing(false), recovering(false) {}
  };
  map<metareqid_t, umaster> uncommitted_masters;  // master: req -> slave set

  set<metareqid_t> pending_masters;
  map<int, set<metareqid_t> > ambiguous_slave_updates;

  friend class ESlaveUpdate;
  friend class ECommitted;

  bool resolves_pending;
  set<mds_rank_t> resolve_gather;      // nodes i need resolves from
  set<mds_rank_t> resolve_ack_gather;  // nodes i need a resolve_ack from
  map<metareqid_t, mds_rank_t> need_resolve_rollback;  // rollbacks i'm writing to the journal
  map<mds_rank_t, MMDSResolve*> delayed_resolve;

  void handle_resolve(MMDSResolve *m);
  void handle_resolve_ack(MMDSResolveAck *m);
  void process_delayed_resolve();
  void discard_delayed_resolve(mds_rank_t who);
  void maybe_resolve_finish();
  void disambiguate_my_imports();
  void disambiguate_other_imports();
  void trim_unlinked_inodes();
  void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*);
  void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
  MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
public:
  void recalc_auth_bits(bool replay);
  void remove_inode_recursive(CInode *in);

  bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
    auto p = ambiguous_slave_updates.find(master);
    return p != ambiguous_slave_updates.end() && p->second.count(reqid);
  }
  void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
    ambiguous_slave_updates[master].insert(reqid);
  }
  void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
    auto p = ambiguous_slave_updates.find(master);
    auto q = p->second.find(reqid);
    assert(q != p->second.end());
    p->second.erase(q);
    if (p->second.empty())
      ambiguous_slave_updates.erase(p);
  }

  void add_rollback(metareqid_t reqid, mds_rank_t master) {
    need_resolve_rollback[reqid] = master;
  }
  void finish_rollback(metareqid_t reqid);

  // ambiguous imports
  void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
  void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
  bool have_ambiguous_import(dirfrag_t base) {
    return my_ambiguous_imports.count(base);
  }
  void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
    assert(my_ambiguous_imports.count(base));
    bounds = my_ambiguous_imports[base];
  }
  void cancel_ambiguous_import(CDir *);
  void finish_ambiguous_import(dirfrag_t dirino);
  void resolve_start(MDSInternalContext *resolve_done_);
  void send_resolves();
  void send_slave_resolves();
  void send_subtree_resolves();
  void maybe_send_pending_resolves() {
    if (resolves_pending)
      send_subtree_resolves();
  }

  void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
                               map<dirfrag_t,vector<dirfrag_t> >& subtrees);
  ESubtreeMap *create_subtree_map();


  void clean_open_file_lists();

protected:
  // [rejoin]
  bool rejoins_pending;
  set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
  set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
  set<mds_rank_t> rejoin_ack_sent;    // nodes i sent a rejoin ack to
  set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
  map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
  map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
  map<client_t,entity_inst_t> rejoin_client_map;

  map<inodeno_t,map<client_t,cap_reconnect_t> > cap_exports;  // ino -> client -> capex
  map<inodeno_t,mds_rank_t> cap_export_targets;  // ino -> auth mds

  map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
  set<inodeno_t> cap_imports_missing;
  map<inodeno_t, list<MDSInternalContextBase*> > cap_reconnect_waiters;
  int cap_imports_num_opening;

  set<CInode*> rejoin_undef_inodes;
  set<CInode*> rejoin_potential_updated_scatterlocks;
  set<CDir*> rejoin_undef_dirfrags;
  map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;

  vector<CInode*> rejoin_recover_q, rejoin_check_q;
  list<SimpleLock*> rejoin_eval_locks;
  list<MDSInternalContextBase*> rejoin_waiters;

  void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin);
  void handle_cache_rejoin(MMDSCacheRejoin *m);
  void handle_cache_rejoin_weak(MMDSCacheRejoin *m);
  CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
  CDir* rejoin_invent_dirfrag(dirfrag_t df);
  void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
  void rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
                                      set<vinodeno_t>& acked_inodes,
                                      set<SimpleLock *>& gather_locks);
  void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
  void rejoin_send_acks();
  void rejoin_trim_undef_inodes();
  void maybe_send_pending_rejoins() {
    if (rejoins_pending)
      rejoin_send_rejoins();
  }
  std::unique_ptr<MDSInternalContext> rejoin_done;
  std::unique_ptr<MDSInternalContext> resolve_done;
public:
  void rejoin_start(MDSInternalContext *rejoin_done_);
  void rejoin_gather_finish();
  void rejoin_send_rejoins();
  void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                          int target=-1) {
    cap_exports[ino][client] = icr;
    cap_export_targets[ino] = target;
  }
  void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                             mds_rank_t frommds=MDS_RANK_NONE) {
    cap_imports[ino][client][frommds] = icr;
  }
  const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
    if (cap_imports.count(ino) &&
        cap_imports[ino].count(client) &&
        cap_imports[ino][client].count(MDS_RANK_NONE)) {
      return &cap_imports[ino][client][MDS_RANK_NONE];
    }
    return NULL;
  }
  void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
    assert(cap_imports[ino].size() == 1);
    assert(cap_imports[ino][client].size() == 1);
    cap_imports.erase(ino);
  }
  void wait_replay_cap_reconnect(inodeno_t ino, MDSInternalContextBase *c) {
    cap_reconnect_waiters[ino].push_back(c);
  }

  // [reconnect/rejoin caps]
  struct reconnected_cap_info_t {
    inodeno_t realm_ino;
    snapid_t snap_follows;
    int dirty_caps;
    reconnected_cap_info_t() :
      realm_ino(0), snap_follows(0), dirty_caps(0) {}
  };
  map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps;  // inode -> client -> snap_follows,realmino
  map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq

  void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
    info.snap_follows = icr.snap_follows;
  }
  void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.dirty_caps |= dirty;
  }
  void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
    reconnected_snaprealms[ino][client] = seq;
  }

  friend class C_MDC_RejoinOpenInoFinish;
  friend class C_MDC_RejoinSessionsOpened;
  void rejoin_open_ino_finish(inodeno_t ino, int ret);
  void rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
                                   map<client_t,uint64_t>& sseqmap);
  bool process_imported_caps();
  void choose_lock_states_and_reconnect_caps();
  void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                           map<client_t,MClientSnap*>& splits);
  void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend=false);
  void send_snaps(map<client_t,MClientSnap*>& splits);
  Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
  void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq);
  void try_reconnect_cap(CInode *in, Session *session);
  void export_remaining_imported_caps();

  // cap imports.  delayed snap parent opens.
  //  realm inode -> client -> cap inodes needing to split to this realm
  map<CInode*,set<CInode*> > missing_snap_parents;
  map<client_t,set<CInode*> > delayed_imported_caps;

  void do_cap_import(Session *session, CInode *in, Capability *cap,
                     uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                     int peer, int p_flags);
  void do_delayed_cap_imports();
  void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
                              snapid_t snap_follows);
  void check_realm_past_parents(SnapRealm *realm, bool reconnect);
  void open_snap_parents();

  bool open_undef_inodes_dirfrags();
  void opened_undef_inode(CInode *in);
  void opened_undef_dirfrag(CDir *dir) {
    rejoin_undef_dirfrags.erase(dir);
  }

  void reissue_all_caps();


  friend class Locker;
  friend class Migrator;
  friend class MDBalancer;

  // StrayManager needs to be able to remove_inode() from us
  // when it is done purging
  friend class StrayManager;

  // File size recovery
private:
  RecoveryQueue recovery_queue;
  void identify_files_to_recover();
public:
  void start_files_to_recover();
  void do_file_recover();
  void queue_file_recover(CInode *in);
  void _queued_file_recover_cow(CInode *in, MutationRef& mut);

  // subsystems
  std::unique_ptr<Migrator> migrator;

public:
  explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
  ~MDCache();

  // debug
  void log_stat();

  // root inode
  CInode *get_root() { return root; }
  CInode *get_myin() { return myin; }

  size_t get_cache_size() { return lru.lru_get_size(); }

  // trimming
  bool trim(uint64_t count=0);
private:
  void trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*>& expiremap);
  bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
  void trim_dirfrag(CDir *dir, CDir *con,
                    map<mds_rank_t, MCacheExpire*>& expiremap);
  bool trim_inode(CDentry *dn, CInode *in, CDir *con,
                  map<mds_rank_t,class MCacheExpire*>& expiremap);
  void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
  void trim_non_auth();  // trim out trimmable non-auth items
public:
  bool trim_non_auth_subtree(CDir *directory);
  void standby_trim_segment(LogSegment *ls);
  void try_trim_non_auth_subtree(CDir *dir);
  bool can_trim_non_auth_dirfrag(CDir *dir) {
    return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
           uncommitted_slave_rename_olddir.count(dir->inode) == 0;
  }

  /**
   * For all unreferenced inodes, dirs, dentries below an inode, compose
   * expiry messages.  This is used when giving up all replicas of entities
   * for an MDS peer in the 'stopping' state, such that the peer can
   * empty its cache and finish shutting down.
   *
   * We have to make sure we're only expiring un-referenced items to
   * avoid interfering with ongoing stray-movement (we can't distinguish
   * between the "moving my strays" and "waiting for my cache to empty"
   * phases within 'stopping').
   *
   * @return false if we completed cleanly, true if caller should stop
   * expiring because we hit something with refs.
   */
  bool expire_recursive(CInode *in, std::map<mds_rank_t, MCacheExpire*>& expiremap);

  void trim_client_leases();
  void check_memory_usage();

  utime_t last_recall_state;

  // shutdown
private:
  set<inodeno_t> shutdown_exported_strays;
public:
  void shutdown_start();
  void shutdown_check();
  bool shutdown_pass();
  bool shutdown_export_strays();
  bool shutdown();  // clear cache (i.e. at shutdown)

  bool did_shutdown_log_cap;

  // inode_map
  bool have_inode(vinodeno_t vino) {
    return inode_map.count(vino) ? true : false;
  }
  bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
    return have_inode(vinodeno_t(ino, snap));
  }
  CInode* get_inode(vinodeno_t vino) {
    if (have_inode(vino))
      return inode_map[vino];
    return NULL;
  }
  CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
    return get_inode(vinodeno_t(ino, s));
  }

  CDir* get_dirfrag(dirfrag_t df) {
    CInode *in = get_inode(df.ino);
    if (!in)
      return NULL;
    return in->get_dirfrag(df.frag);
  }
  CDir* get_dirfrag(inodeno_t ino, const string& dn) {
    CInode *in = get_inode(ino);
    if (!in)
      return NULL;
    frag_t fg = in->pick_dirfrag(dn);
    return in->get_dirfrag(fg);
  }
  CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
    CInode *diri = get_inode(df.ino);
    if (!diri)
      return NULL;
    CDir *dir = force_dir_fragment(diri, df.frag, replay);
    if (!dir)
      dir = diri->get_dirfrag(df.frag);
    return dir;
  }
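
  // Usage sketch (illustrative): a null return from the lookups above means
  // "not in cache", not an error, so callers typically fall back to a fetch;
  // 'fin' is a hypothetical waiter context:
  //
  //   if (CDir *dir = get_dirfrag(df)) {
  //     // use the cached dirfrag
  //   } else if (CInode *diri = get_inode(df.ino)) {
  //     open_remote_dirfrag(diri, df.frag, fin);  // declared below
  //   }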

  MDSCacheObject *get_object(MDSCacheObjectInfo &info);



public:
  void add_inode(CInode *in);

  void remove_inode(CInode *in);
protected:
  void touch_inode(CInode *in) {
    if (in->get_parent_dn())
      touch_dentry(in->get_projected_parent_dn());
  }
public:
  void touch_dentry(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
      bottom_lru.lru_midtouch(dn);
    } else {
      if (dn->is_auth())
        lru.lru_touch(dn);
      else
        lru.lru_midtouch(dn);
    }
  }
  void touch_dentry_bottom(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      return;
    lru.lru_bottouch(dn);
  }
protected:

  void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
                            set<SimpleLock *>& gather_locks);
  void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);

  void rename_file(CDentry *srcdn, CDentry *destdn);

public:
  // truncate
  void truncate_inode(CInode *in, LogSegment *ls);
  void _truncate_inode(CInode *in, LogSegment *ls);
  void truncate_inode_finish(CInode *in, LogSegment *ls);
  void truncate_inode_logged(CInode *in, MutationRef& mut);

  void add_recovered_truncate(CInode *in, LogSegment *ls);
  void remove_recovered_truncate(CInode *in, LogSegment *ls);
  void start_recovered_truncates();


public:
  CDir *get_auth_container(CDir *in);
  CDir *get_export_container(CDir *dir);
  void find_nested_exports(CDir *dir, set<CDir*>& s);
  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);


private:
  bool opening_root, open;
  list<MDSInternalContextBase*> waiting_for_open;

public:
  void init_layouts();
  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                    int mode) const;
  CInode *create_system_inode(inodeno_t ino, int mode);
  CInode *create_root_inode();

  void create_empty_hierarchy(MDSGather *gather);
  void create_mydir_hierarchy(MDSGather *gather);

  bool is_open() { return open; }
  void wait_for_open(MDSInternalContextBase *c) {
    waiting_for_open.push_back(c);
  }

  void open_root_inode(MDSInternalContextBase *c);
  void open_root();
  void open_mydir_inode(MDSInternalContextBase *c);
  void populate_mydir();

  void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin);
  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
                                  version_t dpv, MDSInternalContextBase *fin);

  void open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *c);
  CDir *get_stray_dir(CInode *in);
  CDentry *get_or_create_stray_dentry(CInode *in);

  MDSInternalContextBase *_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin);

  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS. This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * It prefers the mdr, then the req, then the fin.
   *
   * At least one of the params mdr, req, and fin must be non-null.
   *
   * @param mdr The MDRequest associated with the path. Can be null.
   * @param req The Message associated with the path. Can be null.
   * @param fin The Context associated with the path. Can be null.
   * @param path The path to traverse to.
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries. On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath. If unsuccessful, is null.
   * @param onfail Specifies different lookup failure behaviors. If set to
   * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
   * dentries (instead of returning -ENOENT). If set to
   * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
   * MDS if that becomes appropriate (i.e., if it doesn't know the contents
   * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
   * will attempt to look up the path from a different MDS (and bring the
   * results into its cache as replicas).
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2 the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, const filepath& path,
                    vector<CDentry*> *pdnvec, CInode **pin, int onfail);
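
  // Caller pattern (illustrative sketch of the return contract documented
  // above; 'respond_to_request' stands in for whatever error path the caller
  // uses and is hypothetical here):
  //
  //   int r = path_traverse(mdr, NULL, NULL, refpath, &trace, &in,
  //                         MDS_TRAVERSE_FORWARD);
  //   if (r > 0)
  //     return;                      // 1 = waiting, 2 = forwarded; back out
  //   if (r < 0) {
  //     respond_to_request(mdr, r);  // -errno from the traversal
  //     return;
  //   }
  //   // r == 0: 'trace' holds the dentry chain and 'in' the target inode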

  CInode *cache_traverse(const filepath& path);

  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSInternalContextBase *fin);
  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);

  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
                                   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
                                   C_GatherBuilder &gather_bld);

  void open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin,
                          bool want_xlocked=false);
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
                                  bool want_xlocked, int r);

  void make_trace(vector<CDentry*>& trace, CInode *in);

protected:
  struct open_ino_info_t {
    vector<inode_backpointer_t> ancestors;
    set<mds_rank_t> checked;
    mds_rank_t checking;
    mds_rank_t auth_hint;
    bool check_peers;
    bool fetch_backtrace;
    bool discover;
    bool want_replica;
    bool want_xlocked;
    version_t tid;
    int64_t pool;
    int last_err;
    list<MDSInternalContextBase*> waiters;
    open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
                        check_peers(true), fetch_backtrace(true), discover(false),
                        want_replica(false), want_xlocked(false), tid(0), pool(-1),
                        last_err(0) {}
  };
  ceph_tid_t open_ino_last_tid;
  map<inodeno_t,open_ino_info_t> opening_inodes;

  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent);
  int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
                            vector<inode_backpointer_t>& ancestors,
                            bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  void handle_open_ino(MMDSOpenIno *m, int err=0);
  void handle_open_ino_reply(MMDSOpenInoReply *m);
  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;

public:
  void kick_open_ino_peers(mds_rank_t who);
  void open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase *fin,
                bool want_replica=true, bool want_xlocked=false);

  // -- find_ino_peer --
  struct find_ino_peer_info_t {
    inodeno_t ino;
    ceph_tid_t tid;
    MDSInternalContextBase *fin;
    mds_rank_t hint;
    mds_rank_t checking;
    set<mds_rank_t> checked;

    find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
  };

  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  ceph_tid_t find_ino_peer_last_tid;

  void find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint=MDS_RANK_NONE);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  void handle_find_ino(MMDSFindIno *m);
  void handle_find_ino_reply(MMDSFindInoReply *m);
  void kick_find_ino_peers(mds_rank_t who);

  // -- snaprealms --
public:
  void snaprealm_create(MDRequestRef& mdr, CInode *in);
  void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in);

  // -- stray --
public:
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }

protected:
  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  StrayManager stray_manager;
  friend struct C_MDC_RetryScanStray;
  friend class C_IO_MDC_FetchedBacktrace;

  // == messages ==
public:
  void dispatch(Message *m);

protected:
  // -- replicas --
  void handle_discover(MDiscover *dis);
  void handle_discover_reply(MDiscoverReply *m);
  friend class C_MDC_Join;

public:
  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) {
    dirfrag_t df = dir->dirfrag();
    ::encode(df, bl);
    dir->encode_replica(to, bl);
  }
  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) {
    ::encode(dn->name, bl);
    ::encode(dn->last, bl);
    dn->encode_replica(to, bl);
  }
  void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                       uint64_t features) {
    ::encode(in->inode.ino, bl);  // bleh, minor asymmetry here
    ::encode(in->last, bl);
    in->encode_replica(to, bl, features);
  }
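
  // Illustrative note: each replicate_*() above must stay byte-for-byte
  // symmetric with its add_replica_*() counterpart below. A hedged sketch of
  // the decode side, assuming 'bl' was produced by replicate_dentry():
  //
  //   bufferlist::iterator p = bl.begin();
  //   CDentry *dn = add_replica_dentry(p, dir, finished);  // decodes name, last, replica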

  CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished);
  CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished);
  CInode *add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished);

  void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  CDentry *add_replica_stray(bufferlist &bl, mds_rank_t from);

  // -- namespace --
public:
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
protected:
  void handle_dentry_link(MDentryLink *m);
  void handle_dentry_unlink(MDentryUnlink *m);


  // -- fragmenting --
private:
  struct ufragment {
    int bits;
    bool committed;
    LogSegment *ls;
    list<MDSInternalContextBase*> waiters;
    list<frag_t> old_frags;
    bufferlist rollback;
    ufragment() : bits(0), committed(false), ls(NULL) {}
  };
  map<dirfrag_t, ufragment> uncommitted_fragments;

  struct fragment_info_t {
    int bits;
    list<CDir*> dirs;
    list<CDir*> resultfrags;
    MDRequestRef mdr;
    // for deadlock detection
    bool all_frozen;
    utime_t last_cum_auth_pins_change;
    int last_cum_auth_pins;
    int num_remote_waiters;  // number of remote authpin waiters
    fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
    bool is_fragmenting() { return !resultfrags.empty(); }
  };
  map<dirfrag_t,fragment_info_t> fragments;

  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                            list<CDir*>& frags, list<MDSInternalContextBase*>& waiters, bool replay);
  void adjust_dir_fragments(CInode *diri,
                            list<CDir*>& srcfrags,
                            frag_t basefrag, int bits,
                            list<CDir*>& resultfrags,
                            list<MDSInternalContextBase*>& waiters,
                            bool replay);
  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds);

  bool can_fragment(CInode *diri, list<CDir*>& dirs);
  void fragment_freeze_dirs(list<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
  void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);

  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_IO_MDC_FragmentFinish;

  void handle_fragment_notify(MMDSFragmentNotify *m);

  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag,
                                LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags);
public:
  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) {
    assert(uncommitted_fragments.count(dirfrag));
    uncommitted_fragments[dirfrag].waiters.push_back(c);
  }
  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);
  void rollback_uncommitted_fragments();

  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
  int get_num_fragmenting_dirs() { return fragments.size(); }
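
  // Usage sketch (illustrative): split_dir()/merge_dir() drive the fragment
  // machinery above. A split by 3 bits fans one dirfrag out into 2^3 = 8
  // result fragments; merge_dir() is the inverse for the children of 'fg':
  //
  //   split_dir(dir, 3);    // dir -> 8 resulting fragments
  //   merge_dir(diri, fg);  // collapse the fragments under 'fg' back together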

  // -- updates --
  //int send_inode_updates(CInode *in);
  //void handle_inode_update(MInodeUpdate *m);

  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(MDirUpdate *m);

  // -- cache expiration --
  void handle_cache_expire(MCacheExpire *m);
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);

protected:
  int dump_cache(const char *fn, Formatter *f,
                 const std::string& dump_root = "",
                 int depth = -1);
public:
  int dump_cache() { return dump_cache(NULL, NULL); }
  int dump_cache(const std::string &filename);
  int dump_cache(Formatter *f);
  int dump_cache(const std::string& dump_root, int depth, Formatter *f);

  int cache_status(Formatter *f);

  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;

  // == crap fns ==
public:
  void show_cache();
  void show_subtrees(int dbl=10);

  CInode *hack_pick_random_inode() {
    assert(!inode_map.empty());
    int n = rand() % inode_map.size();
    ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
    while (n--) ++p;
    return p->second;
  }

protected:
  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time).
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  friend class C_MDC_RepairDirfragStats;
public:
  void flush_dentry(const string& path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(const string& path, const std::string &tag,
                     bool force, bool recursive, bool repair,
                     Formatter *f, Context *fin);
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);

public:
  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  std::set<CInode *> export_pin_queue;
};

class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;
  MDRequestRef mdr;
public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
  void finish(int r) override;
};
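
// Usage sketch (illustrative): the canonical way a blocked operation requeues
// itself once the thing it was waiting on becomes available, e.g. a dirfrag
// fetch completing:
//
//   dir->add_waiter(CDir::WAIT_COMPLETE, new C_MDS_RetryRequest(mdcache, mdr));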

#endif