ceph/src/mds/MDCache.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15
  16
  17 #ifndef CEPH_MDCACHE_H
  18 #define CEPH_MDCACHE_H
  19
  20 #include "include/types.h"
  21 #include "include/filepath.h"
  22 #include "include/elist.h"
  23
  24 #include "osdc/Filer.h"
  25 #include "CInode.h"
  26 #include "CDentry.h"
  27 #include "CDir.h"
  28 #include "include/Context.h"
  29 #include "events/EMetaBlob.h"
  30 #include "RecoveryQueue.h"
  31 #include "StrayManager.h"
  32 #include "MDSContext.h"
  33 #include "MDSMap.h"
  34 #include "Mutation.h"
  35
  36 #include "messages/MClientRequest.h"
  37 #include "messages/MMDSSlaveRequest.h"
  38
  39 class PerfCounters;
  40
  41 class MDSRank;
  42 class Session;
  43 class Migrator;
  44
  45 class Message;
  46 class Session;
  47
  48 class MMDSResolve;
  49 class MMDSResolveAck;
  50 class MMDSCacheRejoin;
  51 class MDiscover;
  52 class MDiscoverReply;
  53 class MCacheExpire;
  54 class MDirUpdate;
  55 class MDentryLink;
  56 class MDentryUnlink;
  57 class MLock;
  58 struct MMDSFindIno;
  59 struct MMDSFindInoReply;
  60 struct MMDSOpenIno;
  61 struct MMDSOpenInoReply;
  62
  63 class Message;
  64 class MClientRequest;
  65 class MMDSSlaveRequest;
  66 struct MClientSnap;
  67
  68 class MMDSFragmentNotify;
  69
  70 class ESubtreeMap;
  71
// Identifiers for the MDCache statistics below.  Values are assigned
// sequentially starting from l_mdc_first (3000) and end at l_mdc_last.
// NOTE(review): presumably these are the PerfCounters indices set up by
// MDCache::register_perfcounters() -- confirm in MDCache.cc.
enum {
  l_mdc_first = 3000,
  // How many inodes currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated?
  l_mdc_strays_reintegrated,
  // How many strays have been migrated?
  l_mdc_strays_migrated,

  // How many inodes are currently having their size recovered
  l_mdc_num_recovering_processing,
  // How many inodes currently waiting to have size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes ever started size recovery
  l_mdc_recovery_started,
  // How many inodes ever completed size recovery
  l_mdc_recovery_completed,

  l_mdc_last,
};
 103
 104
 105 // flags for predirty_journal_parents()
 106 static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
 107 static const int PREDIRTY_DIR = 2;     // update parent dir mtime/size
 108 static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
 109
 110 class MDCache {
 111  public:
 112   // my master
 113   MDSRank *mds;
 114
 115   // -- my cache --
 116   LRU lru;   // dentry lru for expiring items from cache
 117  protected:
 118   ceph::unordered_map<vinodeno_t,CInode*> inode_map;  // map of inodes by ino
 119   CInode *root;                            // root inode
 120   CInode *myin;                            // .ceph/mds%d dir
 121
 122   bool readonly;
 123   void set_readonly() { readonly = true; }
 124
 125   CInode *strays[NUM_STRAY];         // my stray dir
 126   int stray_index;
 127
 128   CInode *get_stray() {
 129     return strays[stray_index];
 130   }
 131
 132   set<CInode*> base_inodes;
 133
 134   std::unique_ptr<PerfCounters> logger;
 135
 136   Filer filer;
 137
 138   bool exceeded_size_limit;
 139
 140 public:
 141   void advance_stray() {
 142     stray_index = (stray_index+1)%NUM_STRAY;
 143   }
 144
 145   void activate_stray_manager();
 146
 147   /**
 148    * Call this when you know that a CDentry is ready to be passed
 149    * on to StrayManager (i.e. this is a stray you've just created)
 150    */
 151   void notify_stray(CDentry *dn) {
 152     assert(dn->get_dir()->get_inode()->is_stray());
 153     stray_manager.eval_stray(dn);
 154   }
 155
 156   void maybe_eval_stray(CInode *in, bool delay=false);
 157   bool is_readonly() { return readonly; }
 158   void force_readonly();
 159
 160   DecayRate decayrate;
 161
 162   int num_inodes_with_caps;
 163
 164   unsigned max_dir_commit_size;
 165
 166   static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
 167   static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);
 168
 169   file_layout_t default_file_layout;
 170   file_layout_t default_log_layout;
 171
 172   void register_perfcounters();
 173
 174   // -- client leases --
 175 public:
 176   static const int client_lease_pools = 3;
 177   float client_lease_durations[client_lease_pools];
 178 protected:
 179   xlist<ClientLease*> client_leases[client_lease_pools];
 180 public:
 181   void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
 182     client_leases[pool].push_back(&r->item_lease);
 183     r->ttl = ttl;
 184   }
 185
 186   void notify_stray_removed()
 187   {
 188     stray_manager.notify_stray_removed();
 189   }
 190
 191   void notify_stray_created()
 192   {
 193     stray_manager.notify_stray_created();
 194   }
 195
 196   // -- client caps --
 197   uint64_t              last_cap_id;
 198
 199
 200
 201   // -- discover --
 202   struct discover_info_t {
 203     ceph_tid_t tid;
 204     mds_rank_t mds;
 205     inodeno_t ino;
 206     frag_t frag;
 207     snapid_t snap;
 208     filepath want_path;
 209     MDSCacheObject *base;
 210     bool want_base_dir;
 211     bool want_xlocked;
 212
 213     discover_info_t() :
 214       tid(0), mds(-1), snap(CEPH_NOSNAP), base(NULL),
 215       want_base_dir(false), want_xlocked(false) {}
 216     ~discover_info_t() {
 217       if (base)
 218         base->put(MDSCacheObject::PIN_DISCOVERBASE);
 219     }
 220     void pin_base(MDSCacheObject *b) {
 221       base = b;
 222       base->get(MDSCacheObject::PIN_DISCOVERBASE);
 223     }
 224   };
 225
 226   map<ceph_tid_t, discover_info_t> discovers;
 227   ceph_tid_t discover_last_tid;
 228
 229   void _send_discover(discover_info_t& dis);
 230   discover_info_t& _create_discover(mds_rank_t mds) {
 231     ceph_tid_t t = ++discover_last_tid;
 232     discover_info_t& d = discovers[t];
 233     d.tid = t;
 234     d.mds = mds;
 235     return d;
 236   }
 237
 238   // waiters
 239   map<int, map<inodeno_t, list<MDSInternalContextBase*> > > waiting_for_base_ino;
 240
 241   void discover_base_ino(inodeno_t want_ino, MDSInternalContextBase *onfinish, mds_rank_t from=MDS_RANK_NONE);
 242   void discover_dir_frag(CInode *base, frag_t approx_fg, MDSInternalContextBase *onfinish,
 243                          mds_rank_t from=MDS_RANK_NONE);
 244   void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish,
 245                      bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE);
 246   void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish,
 247                      bool want_xlocked=false);
 248   void kick_discovers(mds_rank_t who);  // after a failure.
 249
 250
 251   // -- subtrees --
 252 protected:
 253   /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
 254   map<CDir*,set<CDir*> > subtrees;
 255   map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir
 256
 257   // adjust subtree auth specification
 258   //  dir->dir_auth
 259   //  imports/exports/nested_exports
 260   //  join/split subtrees as appropriate
 261 public:
 262   bool is_subtrees() { return !subtrees.empty(); }
 263   void list_subtrees(list<CDir*>& ls);
 264   void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool do_eval=true);
 265   void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN, bool do_eval=true) {
 266     adjust_subtree_auth(root, mds_authority_t(a,b), do_eval);
 267   }
 268   void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth);
 269   void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_rank_t a) {
 270     adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
 271   }
 272   void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_authority_t auth);
 273   void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_rank_t a) {
 274     adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
 275   }
 276   void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
 277   void try_subtree_merge(CDir *root);
 278   void try_subtree_merge_at(CDir *root, bool do_eval=true);
 279   void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
 280   void eval_subtree_root(CInode *diri);
 281   CDir *get_subtree_root(CDir *dir);
 282   CDir *get_projected_subtree_root(CDir *dir);
 283   bool is_leaf_subtree(CDir *dir) {
 284     assert(subtrees.count(dir));
 285     return subtrees[dir].empty();
 286   }
 287   void remove_subtree(CDir *dir);
 288   bool is_subtree(CDir *root) {
 289     return subtrees.count(root);
 290   }
 291   void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
 292   void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
 293   void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
 294   void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);
 295
 296   void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
 297   void adjust_subtree_after_rename(CInode *diri, CDir *olddir,
 298                                    bool pop, bool imported = false);
 299
 300   void get_auth_subtrees(set<CDir*>& s);
 301   void get_fullauth_subtrees(set<CDir*>& s);
 302
 303   int num_subtrees();
 304   int num_subtrees_fullauth();
 305   int num_subtrees_fullnonauth();
 306
 307
 308 protected:
 309   // delayed cache expire
 310   map<CDir*, map<mds_rank_t, MCacheExpire*> > delayed_expire; // subtree root -> expire msg
 311
 312
 313   // -- requests --
 314   ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
 315
 316 public:
 317   int get_num_client_requests();
 318
 319   MDRequestRef request_start(MClientRequest *req);
 320   MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, Message *m);
 321   MDRequestRef request_start_internal(int op);
 322   bool have_request(metareqid_t rid) {
 323     return active_requests.count(rid);
 324   }
 325   MDRequestRef request_get(metareqid_t rid);
 326   void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
 327   void request_finish(MDRequestRef& mdr);
 328   void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
 329   void dispatch_request(MDRequestRef& mdr);
 330   void request_drop_foreign_locks(MDRequestRef& mdr);
 331   void request_drop_non_rdlocks(MDRequestRef& r);
 332   void request_drop_locks(MDRequestRef& r);
 333   void request_cleanup(MDRequestRef& r);
 334
 335   void request_kill(MDRequestRef& r);  // called when session closes
 336
 337   // journal/snap helpers
 338   CInode *pick_inode_snap(CInode *in, snapid_t follows);
 339   CInode *cow_inode(CInode *in, snapid_t last);
 340   void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
 341                           snapid_t follows=CEPH_NOSNAP,
 342                           CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
 343   void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
 344                           CInode **pcow_inode=0);
 345   void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
 346
 347   void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
 348                                    int linkunlink, SnapRealm *prealm);
 349   void _project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
 350                                     CDir *parent, int linkunlink, bool update_inode);
 351   void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
 352                                    snapid_t ofirst, snapid_t last,
 353                                    CInode *pin, bool cow_head);
 354   void broadcast_quota_to_client(CInode *in);
 355   void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
 356                                 CInode *in, CDir *parent,
 357                                 int flags, int linkunlink=0,
 358                                 snapid_t follows=CEPH_NOSNAP);
 359
 360   // slaves
 361   void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
 362     uncommitted_masters[reqid].ls = ls;
 363     uncommitted_masters[reqid].slaves = slaves;
 364     uncommitted_masters[reqid].safe = safe;
 365   }
 366   void wait_for_uncommitted_master(metareqid_t reqid, MDSInternalContextBase *c) {
 367     uncommitted_masters[reqid].waiters.push_back(c);
 368   }
 369   bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
 370     auto p = uncommitted_masters.find(reqid);
 371     return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0;
 372   }
 373   void log_master_commit(metareqid_t reqid);
 374   void logged_master_update(metareqid_t reqid);
 375   void _logged_master_commit(metareqid_t reqid);
 376   void committed_master_slave(metareqid_t r, mds_rank_t from);
 377   void finish_committed_masters();
 378
 379   void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);
 380
 381   // -- recovery --
 382 protected:
 383   set<mds_rank_t> recovery_set;
 384
 385 public:
 386   void set_recovery_set(set<mds_rank_t>& s);
 387   void handle_mds_failure(mds_rank_t who);
 388   void handle_mds_recovery(mds_rank_t who);
 389
 390 protected:
 391   // [resolve]
 392   // from EImportStart w/o EImportFinish during journal replay
 393   map<dirfrag_t, vector<dirfrag_t> >            my_ambiguous_imports;
 394   // from MMDSResolves
 395   map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
 396
 397   map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates;  // slave: for replay.
 398   map<CInode*, int> uncommitted_slave_rename_olddir;  // slave: preserve the non-auth dir until seeing commit.
 399   map<CInode*, int> uncommitted_slave_unlink;  // slave: preserve the unlinked inode until seeing commit.
 400
 401   // track master requests whose slaves haven't acknowledged commit
 402   struct umaster {
 403     set<mds_rank_t> slaves;
 404     LogSegment *ls;
 405     list<MDSInternalContextBase*> waiters;
 406     bool safe;
 407     bool committing;
 408     bool recovering;
 409     umaster() : ls(NULL), safe(false), committing(false), recovering(false) {}
 410   };
 411   map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
 412
 413   set<metareqid_t>              pending_masters;
 414   map<int, set<metareqid_t> >   ambiguous_slave_updates;
 415
 416   friend class ESlaveUpdate;
 417   friend class ECommitted;
 418
 419   bool resolves_pending;
 420   set<mds_rank_t> resolve_gather;       // nodes i need resolves from
 421   set<mds_rank_t> resolve_ack_gather;   // nodes i need a resolve_ack from
 422   map<metareqid_t, mds_rank_t> need_resolve_rollback;  // rollbacks i'm writing to the journal
 423   map<mds_rank_t, MMDSResolve*> delayed_resolve;
 424
 425   void handle_resolve(MMDSResolve *m);
 426   void handle_resolve_ack(MMDSResolveAck *m);
 427   void process_delayed_resolve();
 428   void discard_delayed_resolve(mds_rank_t who);
 429   void maybe_resolve_finish();
 430   void disambiguate_my_imports();
 431   void disambiguate_other_imports();
 432   void trim_unlinked_inodes();
 433   void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*);
 434   void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
 435   MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
 436 public:
 437   void recalc_auth_bits(bool replay);
 438   void remove_inode_recursive(CInode *in);
 439
 440   bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
 441     auto p = ambiguous_slave_updates.find(master);
 442     return p != ambiguous_slave_updates.end() && p->second.count(reqid);
 443   }
 444   void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
 445     ambiguous_slave_updates[master].insert(reqid);
 446   }
 447   void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
 448     auto p = ambiguous_slave_updates.find(master);
 449     auto q = p->second.find(reqid);
 450     assert(q != p->second.end());
 451     p->second.erase(q);
 452     if (p->second.empty())
 453       ambiguous_slave_updates.erase(p);
 454   }
 455
 456   void add_rollback(metareqid_t reqid, mds_rank_t master) {
 457     need_resolve_rollback[reqid] = master;
 458   }
 459   void finish_rollback(metareqid_t reqid);
 460
 461   // ambiguous imports
 462   void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
 463   void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
 464   bool have_ambiguous_import(dirfrag_t base) {
 465     return my_ambiguous_imports.count(base);
 466   }
 467   void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
 468     assert(my_ambiguous_imports.count(base));
 469     bounds = my_ambiguous_imports[base];
 470   }
 471   void cancel_ambiguous_import(CDir *);
 472   void finish_ambiguous_import(dirfrag_t dirino);
 473   void resolve_start(MDSInternalContext *resolve_done_);
 474   void send_resolves();
 475   void send_slave_resolves();
 476   void send_subtree_resolves();
 477   void maybe_send_pending_resolves() {
 478     if (resolves_pending)
 479       send_subtree_resolves();
 480   }
 481
 482   void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
 483                                map<dirfrag_t,vector<dirfrag_t> >& subtrees);
 484   ESubtreeMap *create_subtree_map();
 485
 486
 487   void clean_open_file_lists();
 488
 489 protected:
 490   // [rejoin]
 491   bool rejoins_pending;
 492   set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
 493   set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
 494   set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
 495   map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
 496   map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
 497   map<client_t,entity_inst_t> rejoin_client_map;
 498
 499   map<inodeno_t,map<client_t,cap_reconnect_t> > cap_exports; // ino -> client -> capex
 500   map<inodeno_t,mds_rank_t> cap_export_targets; // ino -> auth mds
 501
 502   map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
 503   set<inodeno_t> cap_imports_missing;
 504   map<inodeno_t, list<MDSInternalContextBase*> > cap_reconnect_waiters;
 505   int cap_imports_num_opening;
 506
 507   set<CInode*> rejoin_undef_inodes;
 508   set<CInode*> rejoin_potential_updated_scatterlocks;
 509   set<CDir*>   rejoin_undef_dirfrags;
 510   map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;
 511
 512   vector<CInode*> rejoin_recover_q, rejoin_check_q;
 513   list<SimpleLock*> rejoin_eval_locks;
 514   list<MDSInternalContextBase*> rejoin_waiters;
 515
 516   void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin);
 517   void handle_cache_rejoin(MMDSCacheRejoin *m);
 518   void handle_cache_rejoin_weak(MMDSCacheRejoin *m);
 519   CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
 520   CDir* rejoin_invent_dirfrag(dirfrag_t df);
 521   void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
 522   void rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
 523                                       set<vinodeno_t>& acked_inodes,
 524                                       set<SimpleLock *>& gather_locks);
 525   void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
 526   void rejoin_send_acks();
 527   void rejoin_trim_undef_inodes();
 528   void maybe_send_pending_rejoins() {
 529     if (rejoins_pending)
 530       rejoin_send_rejoins();
 531   }
 532   std::unique_ptr<MDSInternalContext> rejoin_done;
 533   std::unique_ptr<MDSInternalContext> resolve_done;
 534 public:
 535   void rejoin_start(MDSInternalContext *rejoin_done_);
 536   void rejoin_gather_finish();
 537   void rejoin_send_rejoins();
 538   void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
 539                           int target=-1) {
 540     cap_exports[ino][client] = icr;
 541     cap_export_targets[ino] = target;
 542   }
 543   void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
 544                              mds_rank_t frommds=MDS_RANK_NONE) {
 545     cap_imports[ino][client][frommds] = icr;
 546   }
 547   const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
 548     if (cap_imports.count(ino) &&
 549         cap_imports[ino].count(client) &&
 550         cap_imports[ino][client].count(MDS_RANK_NONE)) {
 551       return &cap_imports[ino][client][MDS_RANK_NONE];
 552     }
 553     return NULL;
 554   }
 555   void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
 556     assert(cap_imports[ino].size() == 1);
 557     assert(cap_imports[ino][client].size() == 1);
 558     cap_imports.erase(ino);
 559   }
 560   void wait_replay_cap_reconnect(inodeno_t ino, MDSInternalContextBase *c) {
 561     cap_reconnect_waiters[ino].push_back(c);
 562   }
 563
 564   // [reconnect/rejoin caps]
 565   struct reconnected_cap_info_t {
 566     inodeno_t realm_ino;
 567     snapid_t snap_follows;
 568     int dirty_caps;
 569     reconnected_cap_info_t() :
 570       realm_ino(0), snap_follows(0), dirty_caps(0) {}
 571   };
 572   map<inodeno_t,map<client_t, reconnected_cap_info_t> >  reconnected_caps;   // inode -> client -> snap_follows,realmino
 573   map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq
 574
 575   void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
 576     reconnected_cap_info_t &info = reconnected_caps[ino][client];
 577     info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
 578     info.snap_follows = icr.snap_follows;
 579   }
 580   void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty) {
 581     reconnected_cap_info_t &info = reconnected_caps[ino][client];
 582     info.dirty_caps |= dirty;
 583   }
 584   void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
 585     reconnected_snaprealms[ino][client] = seq;
 586   }
 587
 588   friend class C_MDC_RejoinOpenInoFinish;
 589   friend class C_MDC_RejoinSessionsOpened;
 590   void rejoin_open_ino_finish(inodeno_t ino, int ret);
 591   void rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
 592                                    map<client_t,uint64_t>& sseqmap);
 593   bool process_imported_caps();
 594   void choose_lock_states_and_reconnect_caps();
 595   void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
 596                            map<client_t,MClientSnap*>& splits);
 597   void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend=false);
 598   void send_snaps(map<client_t,MClientSnap*>& splits);
 599   Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
 600   void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq);
 601   void try_reconnect_cap(CInode *in, Session *session);
 602   void export_remaining_imported_caps();
 603
 604   // cap imports.  delayed snap parent opens.
 605   //  realm inode -> client -> cap inodes needing to split to this realm
 606   map<CInode*,set<CInode*> > missing_snap_parents;
 607   map<client_t,set<CInode*> > delayed_imported_caps;
 608
 609   void do_cap_import(Session *session, CInode *in, Capability *cap,
 610                      uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
 611                      int peer, int p_flags);
 612   void do_delayed_cap_imports();
 613   void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
 614                               snapid_t snap_follows);
 615   void check_realm_past_parents(SnapRealm *realm, bool reconnect);
 616   void open_snap_parents();
 617
 618   bool open_undef_inodes_dirfrags();
 619   void opened_undef_inode(CInode *in);
 620   void opened_undef_dirfrag(CDir *dir) {
 621     rejoin_undef_dirfrags.erase(dir);
 622   }
 623
 624   void reissue_all_caps();
 625
 626
 627   friend class Locker;
 628   friend class Migrator;
 629   friend class MDBalancer;
 630
 631   // StrayManager needs to be able to remove_inode() from us
 632   // when it is done purging
 633   friend class StrayManager;
 634
 635   // File size recovery
 636 private:
 637   RecoveryQueue recovery_queue;
 638   void identify_files_to_recover();
 639 public:
 640   void start_files_to_recover();
 641   void do_file_recover();
 642   void queue_file_recover(CInode *in);
 643   void _queued_file_recover_cow(CInode *in, MutationRef& mut);
 644
 645   // subsystems
 646   std::unique_ptr<Migrator> migrator;
 647
 648  public:
 649   explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
 650   ~MDCache();
 651
 652   // debug
 653   void log_stat();
 654
 655   // root inode
 656   CInode *get_root() { return root; }
 657   CInode *get_myin() { return myin; }
 658
 659   // cache
 660   void set_cache_size(size_t max) { lru.lru_set_max(max); }
 661   size_t get_cache_size() { return lru.lru_get_size(); }
 662
 663   // trimming
 664   bool trim(int max=-1, int count=-1);   // trim cache
 665   bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
 666   void trim_dirfrag(CDir *dir, CDir *con,
 667                     map<mds_rank_t, MCacheExpire*>& expiremap);
 668   bool trim_inode(CDentry *dn, CInode *in, CDir *con,
 669                   map<mds_rank_t,class MCacheExpire*>& expiremap);
 670   void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
 671   void trim_non_auth();      // trim out trimmable non-auth items
 672   bool trim_non_auth_subtree(CDir *directory);
 673   void standby_trim_segment(LogSegment *ls);
 674   void try_trim_non_auth_subtree(CDir *dir);
 675   bool can_trim_non_auth_dirfrag(CDir *dir) {
 676     return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
 677            uncommitted_slave_rename_olddir.count(dir->inode) == 0;
 678   }
 679
 680   /**
 681    * For all unreferenced inodes, dirs, dentries below an inode, compose
 682    * expiry messages.  This is used when giving up all replicas of entities
 683    * for an MDS peer in the 'stopping' state, such that the peer can
 684    * empty its cache and finish shutting down.
 685    *
 686    * We have to make sure we're only expiring un-referenced items to
 687    * avoid interfering with ongoing stray-movement (we can't distinguish
 688    * between the "moving my strays" and "waiting for my cache to empty"
 689    * phases within 'stopping')
 690    *
 691    * @return false if we completed cleanly, true if caller should stop
 692    *         expiring because we hit something with refs.
 693    */
 694   bool expire_recursive(
 695     CInode *in,
 696     std::map<mds_rank_t, MCacheExpire*>& expiremap);
 697
 698   void trim_client_leases();
 699   void check_memory_usage();
 700
 701   utime_t last_recall_state;
 702
 703   // shutdown
 704 private:
 705   set<inodeno_t> shutdown_exported_strays;
 706 public:
 707   void shutdown_start();
 708   void shutdown_check();
 709   bool shutdown_pass();
 710   bool shutdown_export_strays();
  bool shutdown();                    // clear cache (i.e. at shutdown)
 712
 713   bool did_shutdown_log_cap;
 714
 715   // inode_map
 716   bool have_inode(vinodeno_t vino) {
 717     return inode_map.count(vino) ? true:false;
 718   }
 719   bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
 720     return have_inode(vinodeno_t(ino, snap));
 721   }
 722   CInode* get_inode(vinodeno_t vino) {
 723     if (have_inode(vino))
 724       return inode_map[vino];
 725     return NULL;
 726   }
 727   CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
 728     return get_inode(vinodeno_t(ino, s));
 729   }
 730
 731   CDir* get_dirfrag(dirfrag_t df) {
 732     CInode *in = get_inode(df.ino);
 733     if (!in)
 734       return NULL;
 735     return in->get_dirfrag(df.frag);
 736   }
 737   CDir* get_dirfrag(inodeno_t ino, const string& dn) {
 738     CInode *in = get_inode(ino);
 739     if (!in)
 740       return NULL;
 741     frag_t fg = in->pick_dirfrag(dn);
 742     return in->get_dirfrag(fg);
 743   }
 744   CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
 745     CInode *diri = get_inode(df.ino);
 746     if (!diri)
 747       return NULL;
 748     CDir *dir = force_dir_fragment(diri, df.frag, replay);
 749     if (!dir)
 750       dir = diri->get_dirfrag(df.frag);
 751     return dir;
 752   }
 753
 754   MDSCacheObject *get_object(MDSCacheObjectInfo &info);
 755
 756
 757
 758  public:
 759   void add_inode(CInode *in);
 760
 761   void remove_inode(CInode *in);
 762  protected:
 763   void touch_inode(CInode *in) {
 764     if (in->get_parent_dn())
 765       touch_dentry(in->get_projected_parent_dn());
 766   }
 767 public:
 768   void touch_dentry(CDentry *dn) {
 769     // touch ancestors
 770     if (dn->get_dir()->get_inode()->get_projected_parent_dn())
 771       touch_dentry(dn->get_dir()->get_inode()->get_projected_parent_dn());
 772
 773     // touch me
 774     if (dn->is_auth())
 775       lru.lru_touch(dn);
 776     else
 777       lru.lru_midtouch(dn);
 778   }
 779   void touch_dentry_bottom(CDentry *dn) {
 780     lru.lru_bottouch(dn);
 781     if (dn->get_projected_linkage()->is_primary() &&
 782         dn->get_dir()->inode->is_stray()) {
 783       CInode *in = dn->get_projected_linkage()->get_inode();
 784       if (in->has_dirfrags()) {
 785         list<CDir*> ls;
 786         in->get_dirfrags(ls);
 787         for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
 788           (*p)->touch_dentries_bottom();
 789       }
 790     }
 791   }
 792 protected:
 793
 794   void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
 795                             set<SimpleLock *>& gather_locks);
 796   void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);
 797
 798   void rename_file(CDentry *srcdn, CDentry *destdn);
 799
 800  public:
 801   // truncate
 802   void truncate_inode(CInode *in, LogSegment *ls);
 803   void _truncate_inode(CInode *in, LogSegment *ls);
 804   void truncate_inode_finish(CInode *in, LogSegment *ls);
 805   void truncate_inode_logged(CInode *in, MutationRef& mut);
 806
 807   void add_recovered_truncate(CInode *in, LogSegment *ls);
 808   void remove_recovered_truncate(CInode *in, LogSegment *ls);
 809   void start_recovered_truncates();
 810
 811
 812  public:
  // Container queries: each takes a dirfrag and returns a dirfrag.
  // NOTE(review): exact "container" semantics are defined out of line.
  CDir *get_auth_container(CDir *in);
  CDir *get_export_container(CDir *dir);
  // `s` is a caller-supplied set passed by non-const reference; presumably
  // the results are accumulated into it.
  void find_nested_exports(CDir *dir, set<CDir*>& s);
  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
 817
 818
 819 private:
  // `open` is what is_open() reports; `waiting_for_open` holds the contexts
  // queued through wait_for_open().
  // NOTE(review): `opening_root` presumably marks a root open in progress.
  bool opening_root, open;
  list<MDSInternalContextBase*> waiting_for_open;
 822
 823 public:
  // Setup helpers for layouts and system inodes (declaration-only).
  void init_layouts();
  // Const member: operates on the caller-provided `in`.
  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                    int mode) const;
  CInode *create_system_inode(inodeno_t ino, int mode);
  CInode *create_root_inode();

  // Both take an MDSGather.
  // NOTE(review): presumably asynchronous work is registered on it -- confirm.
  void create_empty_hierarchy(MDSGather *gather);
  void create_mydir_hierarchy(MDSGather *gather);
 832
 833   bool is_open() { return open; }
 834   void wait_for_open(MDSInternalContextBase *c) {
 835     waiting_for_open.push_back(c);
 836   }
 837
  // Root / mydir opening (declaration-only).  The variants taking a context
  // receive a completion `c`.
  void open_root_inode(MDSInternalContextBase *c);
  void open_root();
  void open_mydir_inode(MDSInternalContextBase *c);
  void populate_mydir();

  // System file creation and its completion step; `fin` is the caller's context.
  void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin);
  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
                                  version_t dpv, MDSInternalContextBase *fin);

  // Stray-directory helpers.
  void open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *c);
  CDir *get_stray_dir(CInode *in);
  CDentry *get_or_create_stray_dentry(CInode *in);

  // NOTE(review): presumably picks a waiter context from mdr/req/fin -- see
  // the definition for the selection rule.
  MDSInternalContextBase *_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin);
 852
  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS. This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * It prefers the mdr, then the req, then the fin.
   *
   * At least one of the params mdr, req, and fin must be non-null.
   *
   * @param mdr The MDRequest associated with the path. Can be null.
   * @param req The Message associated with the path. Can be null.
   * @param fin The Context associated with the path. Can be null.
   * @param path The path to traverse to.
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries. On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath. If unsuccessful, is null.
   * @param onfail Specifies different lookup failure behaviors. If set to
   * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
   * dentries (instead of returning -ENOENT). If set to
   * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
   * MDS if that becomes appropriate (i.e., if it doesn't know the contents
   * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
   * will attempt to look up the path from a different MDS (and bring them
   * into its cache as replicas).
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2, the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, const filepath& path,
                    vector<CDentry*> *pdnvec, CInode **pin, int onfail);
 889
  // Resolve `path` to an inode pointer.
  // NOTE(review): by its name this only consults what is already cached -- confirm.
  CInode *cache_traverse(const filepath& path);

  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSInternalContextBase *fin);
  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);

  // `missing` and `fetch_queue` are non-const references.
  // NOTE(review): the meaning of the bool results is not visible here.
  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
                                   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
                                   C_GatherBuilder &gather_bld);

  // Remote dentry open and its completion step; `r` is the result code handed
  // to the finish step.
  void open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin,
                          bool want_xlocked=false);
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
                                  bool want_xlocked, int r);

  // `trace` is the output vector for the dentries of `in`.
  // NOTE(review): ordering of the trace is defined out of line.
  void make_trace(vector<CDentry*>& trace, CInode *in);
 906
 907 protected:
 908   struct open_ino_info_t {
 909     vector<inode_backpointer_t> ancestors;
 910     set<mds_rank_t> checked;
 911     mds_rank_t checking;
 912     mds_rank_t auth_hint;
 913     bool check_peers;
 914     bool fetch_backtrace;
 915     bool discover;
 916     bool want_replica;
 917     bool want_xlocked;
 918     version_t tid;
 919     int64_t pool;
 920     int last_err;
 921     list<MDSInternalContextBase*> waiters;
 922     open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
 923       check_peers(true), fetch_backtrace(true), discover(false),
 924       want_replica(false), want_xlocked(false), tid(0), pool(-1),
 925       last_err(0) {}
 926   };
  // State for open_ino: a tid counter and one open_ino_info_t per inode number.
  ceph_tid_t open_ino_last_tid;
  map<inodeno_t,open_ino_info_t> opening_inodes;

  // Internal steps of open_ino (declaration-only).  The `err` / `ret`
  // arguments carry a result code into the step.
  // NOTE(review): the order in which these run is defined out of line.
  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent);
  // `hint` is an out-pointer to an MDS rank.
  // NOTE(review): the int return convention is not visible here.
  int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
                            vector<inode_backpointer_t>& ancestors,
                            bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  // Message handlers for the open_ino request and its reply.
  void handle_open_ino(MMDSOpenIno *m, int err=0);
  void handle_open_ino_reply(MMDSOpenInoReply *m);
  // Helper classes that need access to the non-public steps above.
  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;
 945
 946 public:
  // NOTE(review): presumably re-drives open_ino operations involving rank
  // `who` -- confirm in the definition.
  void kick_open_ino_peers(mds_rank_t who);
  // Public entry point for opening inode `ino` from `pool`; `fin` is the
  // completion context.  `want_replica` defaults to true and `want_xlocked`
  // to false.
  void open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase *fin,
                bool want_replica=true, bool want_xlocked=false);
 950
 951   // -- find_ino_peer --
 952   struct find_ino_peer_info_t {
 953     inodeno_t ino;
 954     ceph_tid_t tid;
 955     MDSInternalContextBase *fin;
 956     mds_rank_t hint;
 957     mds_rank_t checking;
 958     set<mds_rank_t> checked;
 959
 960     find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
 961   };
 962
  // Lookup records keyed by tid, plus a tid counter.
  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  ceph_tid_t find_ino_peer_last_tid;

  // Start a peer lookup for `ino`; `c` is the completion context and `hint`
  // defaults to MDS_RANK_NONE.
  void find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint=MDS_RANK_NONE);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  // Message handlers for the find_ino request and its reply.
  void handle_find_ino(MMDSFindIno *m);
  void handle_find_ino_reply(MMDSFindInoReply *m);
  // NOTE(review): presumably re-drives lookups involving rank `who` -- confirm.
  void kick_find_ino_peers(mds_rank_t who);
 971
 972   // -- snaprealms --
 973 public:
  // Snaprealm creation for `in` driven by request `mdr`, and its completion
  // step (which additionally receives a MutationRef).  Declaration-only.
  void snaprealm_create(MDRequestRef& mdr, CInode *in);
  void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in);
 976
 977   // -- stray --
 978 public:
  // NOTE(review): presumably evaluates a remote dentry for stray handling -- confirm.
  void eval_remote(CDentry *dn);
  // Fetch the backtrace of `ino` from `pool` into `bl`; `fin` is the completion.
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  // Forwards to StrayManager::get_num_strays().
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }
 982
 983 protected:
  // `next` defaults to a default-constructed dirfrag_t.
  // NOTE(review): presumably the resume point of the scan -- confirm.
  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  // Stray bookkeeping is delegated to this member (see get_num_strays()).
  StrayManager stray_manager;
  // Helper classes that need access to non-public members.
  friend struct C_MDC_RetryScanStray;
  friend class C_IO_MDC_FetchedBacktrace;
 988
 989   // == messages ==
 990  public:
  // Entry point for an incoming message `m`.
  // NOTE(review): ownership of `m` after the call is not visible here.
  void dispatch(Message *m);
 992
 993  protected:
 994   // -- replicas --
  // Message handlers for the discover request and its reply (declaration-only).
  void handle_discover(MDiscover *dis);
  void handle_discover_reply(MDiscoverReply *m);
  friend class C_MDC_Join;
 998
 999 public:
  // Append a replica of `dir` for rank `to` to `bl`: first the dirfrag id,
  // then whatever CDir::encode_replica() writes.  A reader has to consume the
  // fields in this same order, so do not rearrange the statements.
  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) {
    dirfrag_t df = dir->dirfrag();
    ::encode(df, bl);
    dir->encode_replica(to, bl);
  }
  // Append a replica of `dn` for rank `to` to `bl`: the dentry name, then its
  // `last` field, then whatever CDentry::encode_replica() writes.  A reader
  // has to consume the fields in this same order.
  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) {
    ::encode(dn->name, bl);
    ::encode(dn->last, bl);
    dn->encode_replica(to, bl);
  }
  // Append a replica of `in` for rank `to` to `bl`: the inode number, then
  // its `last` field, then whatever CInode::encode_replica() writes.
  // `features` is passed through to encode_replica() unchanged.
  void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                       uint64_t features) {
    ::encode(in->inode.ino, bl);  // bleh, minor asymmetry here
    ::encode(in->last, bl);
    in->encode_replica(to, bl, features);
  }
1016
  // Receiving side of replication (declaration-only).  The add_replica_*
  // functions read from the bufferlist iterator `p` and take a `finished`
  // list of contexts by non-const reference.
  // NOTE(review): presumably waiters released by the new replica are appended
  // to `finished` -- confirm in the definitions.
  CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished);
  CDir* forge_replica_dir(CInode *diri, frag_t fg, mds_rank_t from);
  CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished);
  CInode *add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished);

  // Stray dentry replication: encode for rank `who`, decode as sent by `from`.
  void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  CDentry *add_replica_stray(bufferlist &bl, mds_rank_t from);
1024
1025   // -- namespace --
1026 public:
  // Senders for dentry link / unlink messages (declaration-only).
  // NOTE(review): the set of recipients is decided in the definitions.
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
1029 protected:
  // Handlers for the dentry link / unlink messages (declaration-only).
  void handle_dentry_link(MDentryLink *m);
  void handle_dentry_unlink(MDentryUnlink *m);
1032
1033
1034   // -- fragmenting --
1035 private:
1036   struct ufragment {
1037     int bits;
1038     bool committed;
1039     LogSegment *ls;
1040     list<MDSInternalContextBase*> waiters;
1041     list<frag_t> old_frags;
1042     bufferlist rollback;
1043     ufragment() : bits(0), committed(false), ls(NULL) {}
1044   };
1045   map<dirfrag_t, ufragment> uncommitted_fragments;
1046
1047   struct fragment_info_t {
1048     int bits;
1049     list<CDir*> dirs;
1050     list<CDir*> resultfrags;
1051     MDRequestRef mdr;
1052     // for deadlock detection
1053     bool all_frozen;
1054     utime_t last_cum_auth_pins_change;
1055     int last_cum_auth_pins;
1056     int num_remote_waiters;     // number of remote authpin waiters
1057     fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
1058     bool is_fragmenting() { return !resultfrags.empty(); }
1059   };
1060   map<dirfrag_t,fragment_info_t> fragments;
1061
  // Two overloads: the first takes only the base frag, the second also takes
  // the explicit source dirfrags `srcfrags`.  Both take their output lists
  // (`frags` / `resultfrags`, `waiters`) by non-const reference.
  // NOTE(review): the effect of `replay` is defined out of line.
  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                            list<CDir*>& frags, list<MDSInternalContextBase*>& waiters, bool replay);
  void adjust_dir_fragments(CInode *diri,
                            list<CDir*>& srcfrags,
                            frag_t basefrag, int bits,
                            list<CDir*>& resultfrags,
                            list<MDSInternalContextBase*>& waiters,
                            bool replay);
  // `replay` defaults to true here.
  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  // NOTE(review): presumably fills `bounds` from the dirfrags in `dfs` -- confirm.
  void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds);
1072
  // Steps of a fragment operation (declaration-only).
  // NOTE(review): the sequencing between these steps is defined out of line;
  // the friend classes below presumably are the callbacks that advance it.
  bool can_fragment(CInode *diri, list<CDir*>& dirs);
  void fragment_freeze_dirs(list<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
  void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);

  // Classes granted access to the private fragment machinery.
  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_IO_MDC_FragmentFinish;
1091
  // Handler for the fragment notify message.
  void handle_fragment_notify(MMDSFragmentNotify *m);

  // Maintenance of uncommitted fragment records (declaration-only).
  // `rollback` is optional and defaults to NULL.
  // NOTE(review): the meaning of `op` in finish_uncommitted_fragment() is not
  // visible here.
  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag,
                                LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags);
1098 public:
1099   void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) {
1100     assert(uncommitted_fragments.count(dirfrag));
1101     uncommitted_fragments[dirfrag].waiters.push_back(c);
1102   }
  // Public fragment operations (declaration-only).
  // NOTE(review): `byn` is presumably the number of bits to split by -- confirm.
  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);
  void rollback_uncommitted_fragments();

  // Fragment-freeze helpers.
  // NOTE(review): these likely relate to the "for deadlock detection" fields
  // of fragment_info_t -- confirm in the definitions.
  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
1110   int get_num_fragmenting_dirs() { return fragments.size(); }
1111
1112   // -- updates --
1113   //int send_inode_updates(CInode *in);
1114   //void handle_inode_update(MInodeUpdate *m);
1115
  // Sender and handler for directory update messages.  `bcast` defaults to false.
  // NOTE(review): the int return convention is not visible here.
  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(MDirUpdate *m);

  // -- cache expiration --
  void handle_cache_expire(MCacheExpire *m);
  // Per-dirfrag handling of delayed expires: process them or discard them.
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);
1123
1124 protected:
  // Worker behind the public dump_cache() overloads.  `dump_root` defaults to
  // an empty string and `depth` to -1.
  // NOTE(review): presumably `fn` names an output file and `f` a formatter,
  // with NULL meaning "not used" -- confirm in the definition.
  void dump_cache(const char *fn, Formatter *f,
                  const std::string& dump_root = "",
                  int depth = -1);
1128 public:
1129   void dump_cache() {dump_cache(NULL, NULL);}
  // Public dump_cache() overloads: to a named file, to a formatter, or to a
  // formatter with an explicit `dump_root` and `depth`.
  void dump_cache(const std::string &filename);
  void dump_cache(Formatter *f);
  void dump_cache(const std::string& dump_root, int depth, Formatter *f);

  // Const status dumps into the formatter `f`.
  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;
1136
1137   // == crap fns ==
1138  public:
  // Debug output helpers (declaration-only).
  // NOTE(review): `dbl` is presumably a debug level; it defaults to 10.
  void show_cache();
  void show_subtrees(int dbl=10);
1141
1142   CInode *hack_pick_random_inode() {
1143     assert(!inode_map.empty());
1144     int n = rand() % inode_map.size();
1145     ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin();
1146     while (n--) ++p;
1147     return p->second;
1148   }
1149
1150 protected:
  // Request-driven workers (declaration-only); each takes the MDRequestRef
  // it works on.
  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  friend class C_MDC_RepairDirfragStats;
1164 public:
  // Flush the dentry at `path`; `fin` is the completion context.
  void flush_dentry(const string& path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(const string& path, const std::string &tag,
                     bool force, bool recursive, bool repair,
                     Formatter *f, Context *fin);
  // Stat repair entry points for an inode and for a single dirfrag.
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);
1174
1175 public:
  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  // Public member holding raw CInode pointers.
  // NOTE(review): pointer lifetime is not visible here.
  std::set<CInode *> export_pin_queue;
1178 };
1179
// Internal MDS context that holds an MDCache pointer and an MDRequestRef.
// NOTE(review): going by the name, finish() presumably re-dispatches `mdr`
// through `cache`; both the constructor and finish() are defined out of line,
// so confirm there.
class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;
  MDRequestRef mdr;
 public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
  // Overrides the base-class completion hook; `r` is the result code.
  void finish(int r) override;
};
1187
1188 #endif