1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
17 #ifndef CEPH_MDCACHE_H
18 #define CEPH_MDCACHE_H
20 #include <boost/utility/string_view.hpp>
22 #include "include/types.h"
23 #include "include/filepath.h"
24 #include "include/elist.h"
26 #include "osdc/Filer.h"
30 #include "include/Context.h"
31 #include "events/EMetaBlob.h"
32 #include "RecoveryQueue.h"
33 #include "StrayManager.h"
34 #include "MDSContext.h"
38 #include "messages/MClientRequest.h"
39 #include "messages/MMDSSlaveRequest.h"
52 class MMDSCacheRejoin
;
61 struct MMDSFindInoReply
;
63 struct MMDSOpenInoReply
;
67 class MMDSSlaveRequest
;
70 class MMDSFragmentNotify
;
76 // How many inodes currently in stray dentries
78 // How many stray dentries are currently delayed for purge due to refs
79 l_mdc_num_strays_delayed
,
80 // How many stray dentries are currently being enqueued for purge
81 l_mdc_num_strays_enqueuing
,
83 // How many dentries have ever been added to stray dir
85 // How many dentries have been passed on to PurgeQueue
86 l_mdc_strays_enqueued
,
87 // How many strays have been reintegrated?
88 l_mdc_strays_reintegrated
,
89 // How many strays have been migrated?
90 l_mdc_strays_migrated
,
92 // How many inode sizes currently being recovered
93 l_mdc_num_recovering_processing
,
94 // How many inodes currently waiting to have size recovered
95 l_mdc_num_recovering_enqueued
,
96 // How many inodes waiting with elevated priority for recovery
97 l_mdc_num_recovering_prioritized
,
98 // How many inodes ever started size recovery
99 l_mdc_recovery_started
,
100 // How many inodes ever completed size recovery
101 l_mdc_recovery_completed
,
103 l_mdss_ireq_enqueue_scrub
,
104 l_mdss_ireq_exportdir
,
106 l_mdss_ireq_fragmentdir
,
107 l_mdss_ireq_fragstats
,
108 l_mdss_ireq_inodestats
,
114 // flags for predirty_journal_parents()
115 static const int PREDIRTY_PRIMARY
= 1; // primary dn, adjust nested accounting
116 static const int PREDIRTY_DIR
= 2; // update parent dir mtime/size
117 static const int PREDIRTY_SHALLOW
= 4; // only go to immediate parent (for easier rollback)
121 using clock
= ceph::coarse_mono_clock
;
122 using time
= ceph::coarse_mono_time
;
128 LRU lru
; // dentry lru for expiring items from cache
129 LRU bottom_lru
; // dentries that should be trimmed ASAP
131 ceph::unordered_map
<inodeno_t
,CInode
*> inode_map
; // map of head inodes by ino
132 map
<vinodeno_t
, CInode
*> snap_inode_map
; // map of snap inodes by ino
133 CInode
*root
; // root inode
134 CInode
*myin
; // .ceph/mds%d dir
137 void set_readonly() { readonly
= true; }
139 CInode
*strays
[NUM_STRAY
]; // my stray dir
142 CInode
*get_stray() {
143 return strays
[stray_index
];
146 set
<CInode
*> base_inodes
;
148 std::unique_ptr
<PerfCounters
> logger
;
152 bool exceeded_size_limit
;
155 uint64_t cache_inode_limit
;
156 uint64_t cache_memory_limit
;
157 double cache_reservation
;
158 double cache_health_threshold
;
161 uint64_t cache_limit_inodes(void) {
162 return cache_inode_limit
;
164 uint64_t cache_limit_memory(void) {
165 return cache_memory_limit
;
167 double cache_toofull_ratio(void) const {
168 double inode_reserve
= cache_inode_limit
*(1.0-cache_reservation
);
169 double memory_reserve
= cache_memory_limit
*(1.0-cache_reservation
);
170 return fmax(0.0, fmax((cache_size()-memory_reserve
)/memory_reserve
, cache_inode_limit
== 0 ? 0.0 : (CInode::count()-inode_reserve
)/inode_reserve
));
172 bool cache_toofull(void) const {
173 return cache_toofull_ratio() > 0.0;
175 uint64_t cache_size(void) const {
176 return mempool::get_pool(mempool::mds_co::id
).allocated_bytes();
178 bool cache_overfull(void) const {
179 return (cache_inode_limit
> 0 && CInode::count() > cache_inode_limit
*cache_health_threshold
) || (cache_size() > cache_memory_limit
*cache_health_threshold
);
182 void advance_stray() {
183 stray_index
= (stray_index
+1)%NUM_STRAY
;
186 void activate_stray_manager();
189 * Call this when you know that a CDentry is ready to be passed
190 * on to StrayManager (i.e. this is a stray you've just created)
192 void notify_stray(CDentry
*dn
) {
193 assert(dn
->get_dir()->get_inode()->is_stray());
194 stray_manager
.eval_stray(dn
);
197 void maybe_eval_stray(CInode
*in
, bool delay
=false);
198 void clear_dirty_bits_for_stray(CInode
* diri
);
200 bool is_readonly() { return readonly
; }
201 void force_readonly();
205 int num_shadow_inodes
;
207 int num_inodes_with_caps
;
209 unsigned max_dir_commit_size
;
211 static file_layout_t
gen_default_file_layout(const MDSMap
&mdsmap
);
212 static file_layout_t
gen_default_log_layout(const MDSMap
&mdsmap
);
214 file_layout_t default_file_layout
;
215 file_layout_t default_log_layout
;
217 void register_perfcounters();
219 // -- client leases --
221 static const int client_lease_pools
= 3;
222 float client_lease_durations
[client_lease_pools
];
224 xlist
<ClientLease
*> client_leases
[client_lease_pools
];
226 void touch_client_lease(ClientLease
*r
, int pool
, utime_t ttl
) {
227 client_leases
[pool
].push_back(&r
->item_lease
);
231 void notify_stray_removed()
233 stray_manager
.notify_stray_removed();
236 void notify_stray_created()
238 stray_manager
.notify_stray_created();
241 void eval_remote(CDentry
*dn
)
243 stray_manager
.eval_remote(dn
);
247 uint64_t last_cap_id
;
252 struct discover_info_t
{
264 tid(0), mds(-1), snap(CEPH_NOSNAP
), basei(NULL
),
265 want_base_dir(false), want_xlocked(false) {}
268 basei
->put(MDSCacheObject::PIN_DISCOVERBASE
);
270 void pin_base(CInode
*b
) {
272 basei
->get(MDSCacheObject::PIN_DISCOVERBASE
);
276 map
<ceph_tid_t
, discover_info_t
> discovers
;
277 ceph_tid_t discover_last_tid
;
279 void _send_discover(discover_info_t
& dis
);
280 discover_info_t
& _create_discover(mds_rank_t mds
) {
281 ceph_tid_t t
= ++discover_last_tid
;
282 discover_info_t
& d
= discovers
[t
];
289 map
<int, map
<inodeno_t
, list
<MDSInternalContextBase
*> > > waiting_for_base_ino
;
291 void discover_base_ino(inodeno_t want_ino
, MDSInternalContextBase
*onfinish
, mds_rank_t from
=MDS_RANK_NONE
);
292 void discover_dir_frag(CInode
*base
, frag_t approx_fg
, MDSInternalContextBase
*onfinish
,
293 mds_rank_t from
=MDS_RANK_NONE
);
294 void discover_path(CInode
*base
, snapid_t snap
, filepath want_path
, MDSInternalContextBase
*onfinish
,
295 bool want_xlocked
=false, mds_rank_t from
=MDS_RANK_NONE
);
296 void discover_path(CDir
*base
, snapid_t snap
, filepath want_path
, MDSInternalContextBase
*onfinish
,
297 bool want_xlocked
=false);
298 void kick_discovers(mds_rank_t who
); // after a failure.
303 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
304 map
<CDir
*,set
<CDir
*> > subtrees
;
305 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > > projected_subtree_renames
; // renamed ino -> target dir
307 // adjust subtree auth specification
309 // imports/exports/nested_exports
310 // join/split subtrees as appropriate
312 bool is_subtrees() { return !subtrees
.empty(); }
313 void list_subtrees(list
<CDir
*>& ls
);
314 void adjust_subtree_auth(CDir
*root
, mds_authority_t auth
, bool adjust_pop
=true);
315 void adjust_subtree_auth(CDir
*root
, mds_rank_t a
, mds_rank_t b
=CDIR_AUTH_UNKNOWN
) {
316 adjust_subtree_auth(root
, mds_authority_t(a
,b
));
318 void adjust_bounded_subtree_auth(CDir
*dir
, set
<CDir
*>& bounds
, mds_authority_t auth
);
319 void adjust_bounded_subtree_auth(CDir
*dir
, set
<CDir
*>& bounds
, mds_rank_t a
) {
320 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
322 void adjust_bounded_subtree_auth(CDir
*dir
, vector
<dirfrag_t
>& bounds
, mds_authority_t auth
);
323 void adjust_bounded_subtree_auth(CDir
*dir
, vector
<dirfrag_t
>& bounds
, mds_rank_t a
) {
324 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
326 void map_dirfrag_set(list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
);
327 void try_subtree_merge(CDir
*root
);
328 void try_subtree_merge_at(CDir
*root
, set
<CInode
*> *to_eval
, bool adjust_pop
=true);
329 void subtree_merge_writebehind_finish(CInode
*in
, MutationRef
& mut
);
330 void eval_subtree_root(CInode
*diri
);
331 CDir
*get_subtree_root(CDir
*dir
);
332 CDir
*get_projected_subtree_root(CDir
*dir
);
333 bool is_leaf_subtree(CDir
*dir
) {
334 assert(subtrees
.count(dir
));
335 return subtrees
[dir
].empty();
337 void remove_subtree(CDir
*dir
);
338 bool is_subtree(CDir
*root
) {
339 return subtrees
.count(root
);
341 void get_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
342 void get_wouldbe_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
343 void verify_subtree_bounds(CDir
*root
, const set
<CDir
*>& bounds
);
344 void verify_subtree_bounds(CDir
*root
, const list
<dirfrag_t
>& bounds
);
346 void project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
);
347 void adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
);
349 void get_auth_subtrees(set
<CDir
*>& s
);
350 void get_fullauth_subtrees(set
<CDir
*>& s
);
353 int num_subtrees_fullauth();
354 int num_subtrees_fullnonauth();
358 // delayed cache expire
359 map
<CDir
*, map
<mds_rank_t
, MCacheExpire
*> > delayed_expire
; // subtree root -> expire msg
363 ceph::unordered_map
<metareqid_t
, MDRequestRef
> active_requests
;
366 int get_num_client_requests();
368 MDRequestRef
request_start(MClientRequest
*req
);
369 MDRequestRef
request_start_slave(metareqid_t rid
, __u32 attempt
, Message
*m
);
370 MDRequestRef
request_start_internal(int op
);
371 bool have_request(metareqid_t rid
) {
372 return active_requests
.count(rid
);
374 MDRequestRef
request_get(metareqid_t rid
);
375 void request_pin_ref(MDRequestRef
& r
, CInode
*ref
, vector
<CDentry
*>& trace
);
376 void request_finish(MDRequestRef
& mdr
);
377 void request_forward(MDRequestRef
& mdr
, mds_rank_t mds
, int port
=0);
378 void dispatch_request(MDRequestRef
& mdr
);
379 void request_drop_foreign_locks(MDRequestRef
& mdr
);
380 void request_drop_non_rdlocks(MDRequestRef
& r
);
381 void request_drop_locks(MDRequestRef
& r
);
382 void request_cleanup(MDRequestRef
& r
);
384 void request_kill(MDRequestRef
& r
); // called when session closes
386 // journal/snap helpers
387 CInode
*pick_inode_snap(CInode
*in
, snapid_t follows
);
388 CInode
*cow_inode(CInode
*in
, snapid_t last
);
389 void journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
, CDentry
*dn
,
390 snapid_t follows
=CEPH_NOSNAP
,
391 CInode
**pcow_inode
=0, CDentry::linkage_t
*dnl
=0);
392 void journal_cow_inode(MutationRef
& mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
,
393 CInode
**pcow_inode
=0);
394 void journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
);
396 void project_rstat_inode_to_frag(CInode
*cur
, CDir
*parent
, snapid_t first
,
397 int linkunlink
, SnapRealm
*prealm
);
398 void _project_rstat_inode_to_frag(CInode::mempool_inode
& inode
, snapid_t ofirst
, snapid_t last
,
399 CDir
*parent
, int linkunlink
, bool update_inode
);
400 void project_rstat_frag_to_inode(nest_info_t
& rstat
, nest_info_t
& accounted_rstat
,
401 snapid_t ofirst
, snapid_t last
,
402 CInode
*pin
, bool cow_head
);
403 void broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
= -1);
404 void predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
405 CInode
*in
, CDir
*parent
,
406 int flags
, int linkunlink
=0,
407 snapid_t follows
=CEPH_NOSNAP
);
410 void add_uncommitted_master(metareqid_t reqid
, LogSegment
*ls
, set
<mds_rank_t
> &slaves
, bool safe
=false) {
411 uncommitted_masters
[reqid
].ls
= ls
;
412 uncommitted_masters
[reqid
].slaves
= slaves
;
413 uncommitted_masters
[reqid
].safe
= safe
;
415 void wait_for_uncommitted_master(metareqid_t reqid
, MDSInternalContextBase
*c
) {
416 uncommitted_masters
[reqid
].waiters
.push_back(c
);
418 bool have_uncommitted_master(metareqid_t reqid
, mds_rank_t from
) {
419 auto p
= uncommitted_masters
.find(reqid
);
420 return p
!= uncommitted_masters
.end() && p
->second
.slaves
.count(from
) > 0;
422 void log_master_commit(metareqid_t reqid
);
423 void logged_master_update(metareqid_t reqid
);
424 void _logged_master_commit(metareqid_t reqid
);
425 void committed_master_slave(metareqid_t r
, mds_rank_t from
);
426 void finish_committed_masters();
428 void _logged_slave_commit(mds_rank_t from
, metareqid_t reqid
);
432 set
<mds_rank_t
> recovery_set
;
435 void set_recovery_set(set
<mds_rank_t
>& s
);
436 void handle_mds_failure(mds_rank_t who
);
437 void handle_mds_recovery(mds_rank_t who
);
441 // from EImportStart w/o EImportFinish during journal replay
442 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambiguous_imports
;
444 map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > > other_ambiguous_imports
;
446 map
<mds_rank_t
, map
<metareqid_t
, MDSlaveUpdate
*> > uncommitted_slave_updates
; // slave: for replay.
447 map
<CInode
*, int> uncommitted_slave_rename_olddir
; // slave: preserve the non-auth dir until seeing commit.
448 map
<CInode
*, int> uncommitted_slave_unlink
; // slave: preserve the unlinked inode until seeing commit.
450 // track master requests whose slaves haven't acknowledged commit
452 set
<mds_rank_t
> slaves
;
454 list
<MDSInternalContextBase
*> waiters
;
458 umaster() : ls(NULL
), safe(false), committing(false), recovering(false) {}
460 map
<metareqid_t
, umaster
> uncommitted_masters
; // master: req -> slave set
462 set
<metareqid_t
> pending_masters
;
463 map
<int, set
<metareqid_t
> > ambiguous_slave_updates
;
465 friend class ESlaveUpdate
;
466 friend class ECommitted
;
468 bool resolves_pending
;
469 set
<mds_rank_t
> resolve_gather
; // nodes i need resolves from
470 set
<mds_rank_t
> resolve_ack_gather
; // nodes i need a resolve_ack from
471 map
<metareqid_t
, mds_rank_t
> need_resolve_rollback
; // rollbacks i'm writing to the journal
472 map
<mds_rank_t
, MMDSResolve
*> delayed_resolve
;
474 void handle_resolve(MMDSResolve
*m
);
475 void handle_resolve_ack(MMDSResolveAck
*m
);
476 void process_delayed_resolve();
477 void discard_delayed_resolve(mds_rank_t who
);
478 void maybe_resolve_finish();
479 void disambiguate_my_imports();
480 void disambiguate_other_imports();
481 void trim_unlinked_inodes();
482 void add_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
, MDSlaveUpdate
*);
483 void finish_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
);
484 MDSlaveUpdate
* get_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
);
486 void recalc_auth_bits(bool replay
);
487 void remove_inode_recursive(CInode
*in
);
489 bool is_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
490 auto p
= ambiguous_slave_updates
.find(master
);
491 return p
!= ambiguous_slave_updates
.end() && p
->second
.count(reqid
);
493 void add_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
494 ambiguous_slave_updates
[master
].insert(reqid
);
496 void remove_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
497 auto p
= ambiguous_slave_updates
.find(master
);
498 auto q
= p
->second
.find(reqid
);
499 assert(q
!= p
->second
.end());
501 if (p
->second
.empty())
502 ambiguous_slave_updates
.erase(p
);
505 void add_rollback(metareqid_t reqid
, mds_rank_t master
) {
506 need_resolve_rollback
[reqid
] = master
;
508 void finish_rollback(metareqid_t reqid
);
511 void add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
);
512 void add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
);
513 bool have_ambiguous_import(dirfrag_t base
) {
514 return my_ambiguous_imports
.count(base
);
516 void get_ambiguous_import_bounds(dirfrag_t base
, vector
<dirfrag_t
>& bounds
) {
517 assert(my_ambiguous_imports
.count(base
));
518 bounds
= my_ambiguous_imports
[base
];
520 void cancel_ambiguous_import(CDir
*);
521 void finish_ambiguous_import(dirfrag_t dirino
);
522 void resolve_start(MDSInternalContext
*resolve_done_
);
523 void send_resolves();
524 void send_slave_resolves();
525 void send_subtree_resolves();
526 void maybe_send_pending_resolves() {
527 if (resolves_pending
)
528 send_subtree_resolves();
531 void _move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
532 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
);
533 ESubtreeMap
*create_subtree_map();
536 void clean_open_file_lists();
540 bool rejoins_pending
;
541 set
<mds_rank_t
> rejoin_gather
; // nodes from whom i need a rejoin
542 set
<mds_rank_t
> rejoin_sent
; // nodes i sent a rejoin to
543 set
<mds_rank_t
> rejoin_ack_sent
; // nodes i sent a rejoin to
544 set
<mds_rank_t
> rejoin_ack_gather
; // nodes from whom i need a rejoin ack
545 map
<mds_rank_t
,map
<inodeno_t
,map
<client_t
,Capability::Import
> > > rejoin_imported_caps
;
546 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > > rejoin_slave_exports
;
547 map
<client_t
,entity_inst_t
> rejoin_client_map
;
548 map
<client_t
,pair
<Session
*,uint64_t> > rejoin_session_map
;
550 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,cap_reconnect_t
> > > cap_exports
; // ino -> target, client -> capex
552 map
<inodeno_t
,map
<client_t
,map
<mds_rank_t
,cap_reconnect_t
> > > cap_imports
; // ino -> client -> frommds -> capex
553 set
<inodeno_t
> cap_imports_missing
;
554 map
<inodeno_t
, list
<MDSInternalContextBase
*> > cap_reconnect_waiters
;
555 int cap_imports_num_opening
;
557 set
<CInode
*> rejoin_undef_inodes
;
558 set
<CInode
*> rejoin_potential_updated_scatterlocks
;
559 set
<CDir
*> rejoin_undef_dirfrags
;
560 map
<mds_rank_t
, set
<CInode
*> > rejoin_unlinked_inodes
;
562 vector
<CInode
*> rejoin_recover_q
, rejoin_check_q
;
563 list
<SimpleLock
*> rejoin_eval_locks
;
564 list
<MDSInternalContextBase
*> rejoin_waiters
;
566 void rejoin_walk(CDir
*dir
, MMDSCacheRejoin
*rejoin
);
567 void handle_cache_rejoin(MMDSCacheRejoin
*m
);
568 void handle_cache_rejoin_weak(MMDSCacheRejoin
*m
);
569 CInode
* rejoin_invent_inode(inodeno_t ino
, snapid_t last
);
570 CDir
* rejoin_invent_dirfrag(dirfrag_t df
);
571 void handle_cache_rejoin_strong(MMDSCacheRejoin
*m
);
572 void rejoin_scour_survivor_replicas(mds_rank_t from
, MMDSCacheRejoin
*ack
,
573 set
<vinodeno_t
>& acked_inodes
,
574 set
<SimpleLock
*>& gather_locks
);
575 void handle_cache_rejoin_ack(MMDSCacheRejoin
*m
);
576 void rejoin_send_acks();
577 void rejoin_trim_undef_inodes();
578 void maybe_send_pending_rejoins() {
580 rejoin_send_rejoins();
582 std::unique_ptr
<MDSInternalContext
> rejoin_done
;
583 std::unique_ptr
<MDSInternalContext
> resolve_done
;
585 void rejoin_start(MDSInternalContext
*rejoin_done_
);
586 void rejoin_gather_finish();
587 void rejoin_send_rejoins();
588 void rejoin_export_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
590 auto& ex
= cap_exports
[ino
];
592 ex
.second
[client
] = icr
;
594 void rejoin_recovered_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
595 mds_rank_t frommds
=MDS_RANK_NONE
) {
596 cap_imports
[ino
][client
][frommds
] = icr
;
598 void rejoin_recovered_client(client_t client
, const entity_inst_t
& inst
) {
599 rejoin_client_map
.emplace(client
, inst
);
601 const cap_reconnect_t
*get_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
602 if (cap_imports
.count(ino
) &&
603 cap_imports
[ino
].count(client
) &&
604 cap_imports
[ino
][client
].count(MDS_RANK_NONE
)) {
605 return &cap_imports
[ino
][client
][MDS_RANK_NONE
];
609 void remove_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
610 assert(cap_imports
[ino
].size() == 1);
611 assert(cap_imports
[ino
][client
].size() == 1);
612 cap_imports
.erase(ino
);
614 void wait_replay_cap_reconnect(inodeno_t ino
, MDSInternalContextBase
*c
) {
615 cap_reconnect_waiters
[ino
].push_back(c
);
618 // [reconnect/rejoin caps]
619 struct reconnected_cap_info_t
{
621 snapid_t snap_follows
;
623 reconnected_cap_info_t() :
624 realm_ino(0), snap_follows(0), dirty_caps(0) {}
626 map
<inodeno_t
,map
<client_t
, reconnected_cap_info_t
> > reconnected_caps
; // inode -> client -> snap_follows,realmino
627 map
<inodeno_t
,map
<client_t
, snapid_t
> > reconnected_snaprealms
; // realmino -> client -> realmseq
629 void add_reconnected_cap(client_t client
, inodeno_t ino
, const cap_reconnect_t
& icr
) {
630 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
631 info
.realm_ino
= inodeno_t(icr
.capinfo
.snaprealm
);
632 info
.snap_follows
= icr
.snap_follows
;
634 void set_reconnected_dirty_caps(client_t client
, inodeno_t ino
, int dirty
) {
635 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
636 info
.dirty_caps
|= dirty
;
638 void add_reconnected_snaprealm(client_t client
, inodeno_t ino
, snapid_t seq
) {
639 reconnected_snaprealms
[ino
][client
] = seq
;
642 friend class C_MDC_RejoinOpenInoFinish
;
643 friend class C_MDC_RejoinSessionsOpened
;
644 void rejoin_open_ino_finish(inodeno_t ino
, int ret
);
645 void rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
);
646 bool process_imported_caps();
647 void choose_lock_states_and_reconnect_caps();
648 void prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
649 map
<client_t
,MClientSnap
*>& splits
);
650 void do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool nosend
=false);
651 void send_snaps(map
<client_t
,MClientSnap
*>& splits
);
652 Capability
* rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
);
653 void finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
);
654 void try_reconnect_cap(CInode
*in
, Session
*session
);
655 void export_remaining_imported_caps();
657 // cap imports. delayed snap parent opens.
658 // realm inode -> client -> cap inodes needing to split to this realm
659 map
<CInode
*,set
<CInode
*> > missing_snap_parents
;
660 map
<client_t
,set
<CInode
*> > delayed_imported_caps
;
662 void do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
663 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
664 int peer
, int p_flags
);
665 void do_delayed_cap_imports();
666 void rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
, client_t client
,
667 snapid_t snap_follows
);
668 void check_realm_past_parents(SnapRealm
*realm
, bool reconnect
);
669 void open_snap_parents();
671 bool open_undef_inodes_dirfrags();
672 void opened_undef_inode(CInode
*in
);
673 void opened_undef_dirfrag(CDir
*dir
) {
674 rejoin_undef_dirfrags
.erase(dir
);
677 void reissue_all_caps();
681 friend class Migrator
;
682 friend class MDBalancer
;
684 // StrayManager needs to be able to remove_inode() from us
685 // when it is done purging
686 friend class StrayManager
;
688 // File size recovery
690 RecoveryQueue recovery_queue
;
691 void identify_files_to_recover();
693 void start_files_to_recover();
694 void do_file_recover();
695 void queue_file_recover(CInode
*in
);
696 void _queued_file_recover_cow(CInode
*in
, MutationRef
& mut
);
699 std::unique_ptr
<Migrator
> migrator
;
702 explicit MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
);
704 void handle_conf_change(const struct md_config_t
*conf
,
705 const std::set
<std::string
> &changed
,
706 const MDSMap
&mds_map
);
712 CInode
*get_root() { return root
; }
713 CInode
*get_myin() { return myin
; }
715 size_t get_cache_size() { return lru
.lru_get_size(); }
718 bool trim(uint64_t count
=0);
720 void trim_lru(uint64_t count
, map
<mds_rank_t
, MCacheExpire
*>& expiremap
);
721 bool trim_dentry(CDentry
*dn
, map
<mds_rank_t
, MCacheExpire
*>& expiremap
);
722 void trim_dirfrag(CDir
*dir
, CDir
*con
,
723 map
<mds_rank_t
, MCacheExpire
*>& expiremap
);
724 bool trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
,
725 map
<mds_rank_t
,class MCacheExpire
*>& expiremap
);
726 void send_expire_messages(map
<mds_rank_t
, MCacheExpire
*>& expiremap
);
727 void trim_non_auth(); // trim out trimmable non-auth items
729 bool trim_non_auth_subtree(CDir
*directory
);
730 void standby_trim_segment(LogSegment
*ls
);
731 void try_trim_non_auth_subtree(CDir
*dir
);
732 bool can_trim_non_auth_dirfrag(CDir
*dir
) {
733 return my_ambiguous_imports
.count((dir
)->dirfrag()) == 0 &&
734 uncommitted_slave_rename_olddir
.count(dir
->inode
) == 0;
738 * For all unreferenced inodes, dirs, dentries below an inode, compose
739 * expiry messages. This is used when giving up all replicas of entities
740 * for an MDS peer in the 'stopping' state, such that the peer can
741 * empty its cache and finish shutting down.
743 * We have to make sure we're only expiring un-referenced items to
744 * avoid interfering with ongoing stray-movement (we can't distinguish
745 * between the "moving my strays" and "waiting for my cache to empty"
746 * phases within 'stopping')
748 * @return false if we completed cleanly, true if caller should stop
749 * expiring because we hit something with refs.
751 bool expire_recursive(
753 std::map
<mds_rank_t
, MCacheExpire
*>& expiremap
);
755 void trim_client_leases();
756 void check_memory_usage();
758 time last_recall_state
;
762 set
<inodeno_t
> shutdown_exported_strays
;
764 void shutdown_start();
765 void shutdown_check();
766 bool shutdown_pass();
767 bool shutdown_export_strays();
bool shutdown(); // clear cache (ie at shutdown)
770 bool did_shutdown_log_cap
;
773 bool have_inode(vinodeno_t vino
) {
774 if (vino
.snapid
== CEPH_NOSNAP
)
775 return inode_map
.count(vino
.ino
) ? true : false;
777 return snap_inode_map
.count(vino
) ? true : false;
779 bool have_inode(inodeno_t ino
, snapid_t snap
=CEPH_NOSNAP
) {
780 return have_inode(vinodeno_t(ino
, snap
));
782 CInode
* get_inode(vinodeno_t vino
) {
783 if (vino
.snapid
== CEPH_NOSNAP
) {
784 auto p
= inode_map
.find(vino
.ino
);
785 if (p
!= inode_map
.end())
788 auto p
= snap_inode_map
.find(vino
);
789 if (p
!= snap_inode_map
.end())
794 CInode
* get_inode(inodeno_t ino
, snapid_t s
=CEPH_NOSNAP
) {
795 return get_inode(vinodeno_t(ino
, s
));
798 CDir
* get_dirfrag(dirfrag_t df
) {
799 CInode
*in
= get_inode(df
.ino
);
802 return in
->get_dirfrag(df
.frag
);
804 CDir
* get_dirfrag(inodeno_t ino
, boost::string_view dn
) {
805 CInode
*in
= get_inode(ino
);
808 frag_t fg
= in
->pick_dirfrag(dn
);
809 return in
->get_dirfrag(fg
);
811 CDir
* get_force_dirfrag(dirfrag_t df
, bool replay
) {
812 CInode
*diri
= get_inode(df
.ino
);
815 CDir
*dir
= force_dir_fragment(diri
, df
.frag
, replay
);
817 dir
= diri
->get_dirfrag(df
.frag
);
821 MDSCacheObject
*get_object(MDSCacheObjectInfo
&info
);
826 void add_inode(CInode
*in
);
828 void remove_inode(CInode
*in
);
830 void touch_inode(CInode
*in
) {
831 if (in
->get_parent_dn())
832 touch_dentry(in
->get_projected_parent_dn());
835 void touch_dentry(CDentry
*dn
) {
836 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
)) {
837 bottom_lru
.lru_midtouch(dn
);
842 lru
.lru_midtouch(dn
);
845 void touch_dentry_bottom(CDentry
*dn
) {
846 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
848 lru
.lru_bottouch(dn
);
852 void inode_remove_replica(CInode
*in
, mds_rank_t rep
, bool rejoin
,
853 set
<SimpleLock
*>& gather_locks
);
854 void dentry_remove_replica(CDentry
*dn
, mds_rank_t rep
, set
<SimpleLock
*>& gather_locks
);
856 void rename_file(CDentry
*srcdn
, CDentry
*destdn
);
860 void truncate_inode(CInode
*in
, LogSegment
*ls
);
861 void _truncate_inode(CInode
*in
, LogSegment
*ls
);
862 void truncate_inode_finish(CInode
*in
, LogSegment
*ls
);
863 void truncate_inode_logged(CInode
*in
, MutationRef
& mut
);
865 void add_recovered_truncate(CInode
*in
, LogSegment
*ls
);
866 void remove_recovered_truncate(CInode
*in
, LogSegment
*ls
);
867 void start_recovered_truncates();
871 CDir
*get_auth_container(CDir
*in
);
872 CDir
*get_export_container(CDir
*dir
);
873 void find_nested_exports(CDir
*dir
, set
<CDir
*>& s
);
874 void find_nested_exports_under(CDir
*import
, CDir
*dir
, set
<CDir
*>& s
);
878 bool opening_root
, open
;
879 list
<MDSInternalContextBase
*> waiting_for_open
;
883 void create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
885 CInode
*create_system_inode(inodeno_t ino
, int mode
);
886 CInode
*create_root_inode();
888 void create_empty_hierarchy(MDSGather
*gather
);
889 void create_mydir_hierarchy(MDSGather
*gather
);
891 bool is_open() { return open
; }
892 void wait_for_open(MDSInternalContextBase
*c
) {
893 waiting_for_open
.push_back(c
);
896 void open_root_inode(MDSInternalContextBase
*c
);
898 void open_mydir_inode(MDSInternalContextBase
*c
);
899 void open_mydir_frag(MDSInternalContextBase
*c
);
900 void populate_mydir();
902 void _create_system_file(CDir
*dir
, const char *name
, CInode
*in
, MDSInternalContextBase
*fin
);
903 void _create_system_file_finish(MutationRef
& mut
, CDentry
*dn
,
904 version_t dpv
, MDSInternalContextBase
*fin
);
906 void open_foreign_mdsdir(inodeno_t ino
, MDSInternalContextBase
*c
);
907 CDir
*get_stray_dir(CInode
*in
);
908 CDentry
*get_or_create_stray_dentry(CInode
*in
);
910 MDSInternalContextBase
*_get_waiter(MDRequestRef
& mdr
, Message
*req
, MDSInternalContextBase
*fin
);
913 * Find the given dentry (and whether it exists or not), its ancestors,
914 * and get them all into memory and usable on this MDS. This function
915 * makes a best-effort attempt to load everything; if it needs to
916 * go away and do something then it will put the request on a waitlist.
917 * It prefers the mdr, then the req, then the fin. (At least one of these
920 * At least one of the params mdr, req, and fin must be non-null.
922 * @param mdr The MDRequest associated with the path. Can be null.
923 * @param req The Message associated with the path. Can be null.
924 * @param fin The Context associated with the path. Can be null.
925 * @param path The path to traverse to.
926 * @param pdnvec Data return parameter -- on success, contains a
927 * vector of dentries. On failure, is either empty or contains the
928 * full trace of traversable dentries.
929 * @param pin Data return parameter -- if successful, points to the inode
930 * associated with filepath. If unsuccessful, is null.
931 * @param onfail Specifies different lookup failure behaviors. If set to
932 * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
933 * dentries (instead of returning -ENOENT). If set to
934 * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
935 * MDS if that becomes appropriate (ie, if it doesn't know the contents
936 * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
937 * will attempt to look up the path from a different MDS (and bring them
938 * into its cache as replicas).
940 * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
941 * If it returns 1, the requester associated with this call has been placed
942 * on the appropriate waitlist, and it should unwind itself and back out.
943 * If it returns 2 the request has been forwarded, and again the requester
944 * should unwind itself and back out.
946 int path_traverse(MDRequestRef
& mdr
, Message
*req
, MDSInternalContextBase
*fin
, const filepath
& path
,
947 vector
<CDentry
*> *pdnvec
, CInode
**pin
, int onfail
);
949 CInode
*cache_traverse(const filepath
& path
);
951 void open_remote_dirfrag(CInode
*diri
, frag_t fg
, MDSInternalContextBase
*fin
);
952 CInode
*get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
=false);
  // Fetch a batch of ino->path mappings; inos that could not be resolved
  // are reported via @missing.
  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
  // Helper for parallel_fetch(): walk one path, queueing dirs to fetch
  // into @fetch_queue and collecting unresolvable inos in @missing.
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
				   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
				   C_GatherBuilder &gather_bld);

  // Open the inode referenced by remote dentry @dn, then complete @fin.
  void open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin,
			  bool want_xlocked=false);
  // Completion callback for open_remote_dentry(); @r is the open result.
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
				  bool want_xlocked, int r);

  // Build the dentry trace from the root down to @in into @trace.
  void make_trace(vector<CDentry*>& trace, CInode *in);
  // Per-ino state for an in-flight open_ino() lookup.
  // NOTE(review): several member declarations (checking, check_peers,
  // discover, want_replica, want_xlocked, tid, pool, ...) and the tail of
  // the constructor initializer list are elided in this view of the file;
  // the initializer list below references members declared in the elided
  // portion.
  struct open_ino_info_t {
    vector<inode_backpointer_t> ancestors;  // backtrace ancestry gathered so far
    set<mds_rank_t> checked;                // ranks already queried
    mds_rank_t auth_hint;
    bool fetch_backtrace;
    list<MDSInternalContextBase*> waiters;  // contexts to complete when the lookup finishes
    open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
			check_peers(true), fetch_backtrace(true), discover(false),
			want_replica(false), want_xlocked(false), tid(0), pool(-1),
  // Last tid handed out for an open_ino operation.
  ceph_tid_t open_ino_last_tid;
  // In-flight open_ino lookups, keyed by ino.
  map<inodeno_t,open_ino_info_t> opening_inodes;
  // Internal steps of the open_ino state machine (declarations only).
  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent);
  int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
			    vector<inode_backpointer_t>& ancestors,
			    bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  void handle_open_ino(MMDSOpenIno *m, int err=0);
  void handle_open_ino_reply(MMDSOpenInoReply *m);
  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;

  // NOTE(review): one or two lines (likely an access specifier) are elided
  // here in this view of the file.
  // Re-drive open_ino peer queries after rank @who becomes available.
  void kick_open_ino_peers(mds_rank_t who);
  // Public entry point: locate and open inode @ino (optionally hinting
  // its data @pool), completing @fin when done.
  void open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase *fin,
		bool want_replica=true, bool want_xlocked=false);
  // -- find_ino_peer --
  // Per-query state for a find_ino_peers() broadcast.
  // NOTE(review): the 'tid', 'ino' and 'hint' member declarations are
  // elided in this view of the file; they are initialized by the
  // constructor below.
  struct find_ino_peer_info_t {
    MDSInternalContextBase *fin;   // waiter to complete when the ino is located
    mds_rank_t checking;           // rank currently being queried
    set<mds_rank_t> checked;       // ranks already queried
    find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
  };

  // In-flight peer queries, keyed by tid.
  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  // Last tid handed out for a find_ino_peers query.
  ceph_tid_t find_ino_peer_last_tid;

  // Ask other ranks where inode @ino lives, completing @c when answered.
  void find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint=MDS_RANK_NONE);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  void handle_find_ino(MMDSFindIno *m);
  void handle_find_ino_reply(MMDSFindInoReply *m);
  // Re-drive pending queries after rank @who becomes available.
  void kick_find_ino_peers(mds_rank_t who);
  // -- snaprealms --
  void snaprealm_create(MDRequestRef& mdr, CInode *in);
  void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in);

  // -- stray dentries / backtraces --
  // Read the backtrace object for @ino from pool @pool into @bl,
  // completing @fin when done.
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  // Number of inodes currently held in stray dentries.
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }
  // (Re)scan the stray directory, starting from dirfrag @next.
  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  StrayManager stray_manager;
  friend struct C_MDC_RetryScanStray;
  friend class C_IO_MDC_FetchedBacktrace;
  // -- messaging entry point --
  void dispatch(Message *m);

  // -- discover (cross-MDS metadata lookup) handlers --
  void handle_discover(MDiscover *dis);
  void handle_discover_reply(MDiscoverReply *m);
  friend class C_MDC_Join;
  // -- replication: encode cache objects for another rank --
  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
  // NOTE(review): the trailing parameter(s) of replicate_inode are elided
  // in this view of the file.
  void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,

  // Decode replicas received from @from, queueing waiters in @finished.
  CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
			list<MDSInternalContextBase*>& finished);
  CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir,
			      list<MDSInternalContextBase*>& finished);
  CInode *add_replica_inode(bufferlist::iterator& p, CDentry *dn,
			    list<MDSInternalContextBase*>& finished);

  void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  CDentry *add_replica_stray(bufferlist &bl, mds_rank_t from);

  // -- namespace change notifications to replicas --
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
  void handle_dentry_link(MDentryLink *m);
  void handle_dentry_unlink(MDentryUnlink *m);
  // -- fragmenting --
  // Rollback state for a fragment operation that is journaled but not yet
  // committed.
  // NOTE(review): the 'struct ufragment {' opening and its leading members
  // (bits, committed, ls -- referenced by the constructor below) are elided
  // in this view of the file.
    list<MDSInternalContextBase*> waiters;  // contexts waiting on this op
    list<frag_t> old_frags;                 // pre-fragment frag set, kept for rollback
    bufferlist rollback;                    // encoded rollback payload
    ufragment() : bits(0), committed(false), ls(NULL) {}
  };
  // Uncommitted fragment ops, keyed by base dirfrag.
  map<dirfrag_t, ufragment> uncommitted_fragments;

  // Live state of an in-progress fragment operation.
  // NOTE(review): the leading members (bits, all_frozen, ... -- referenced
  // by the constructor below) are elided in this view of the file.
  struct fragment_info_t {
    list<CDir*> resultfrags;
    // for deadlock detection
    utime_t last_cum_auth_pins_change;
    int last_cum_auth_pins;
    int num_remote_waiters;	// number of remote authpin waiters
    fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
    bool is_fragmenting() { return !resultfrags.empty(); }
  };
  // In-flight fragment ops, keyed by base dirfrag.
  map<dirfrag_t,fragment_info_t> fragments;
  // Split/merge @diri's dirfrags at @basefrag by @bits (positive = split,
  // negative = merge -- TODO confirm against definition).
  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
			    list<CDir*>& frags, list<MDSInternalContextBase*>& waiters,
			    bool replay);
  // NOTE(review): the final parameter(s) of this overload are elided in
  // this view of the file.
  void adjust_dir_fragments(CInode *diri,
			    list<CDir*>& srcfrags,
			    frag_t basefrag, int bits,
			    list<CDir*>& resultfrags,
			    list<MDSInternalContextBase*>& waiters,

  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds);

  // Fragment operation pipeline (freeze -> mark -> log -> store -> commit).
  bool can_fragment(CInode *diri, list<CDir*>& dirs);
  void fragment_freeze_dirs(list<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
  void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);

  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_IO_MDC_FragmentFinish;

  void handle_fragment_notify(MMDSFragmentNotify *m);

  // Track an uncommitted fragment op so it can be rolled back on failure.
  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag,
				LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags);

  // Queue @c to run once the uncommitted fragment op on @dirfrag resolves.
  // Precondition (asserted): such an op exists.
  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) {
    assert(uncommitted_fragments.count(dirfrag));
    uncommitted_fragments[dirfrag].waiters.push_back(c);
  }

  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);
  void rollback_uncommitted_fragments();

  // Deadlock/staleness detection for fragment freezes.
  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
  int get_num_fragmenting_dirs() { return fragments.size(); }
  // -- updates --
  //int send_inode_updates(CInode *in);
  //void handle_inode_update(MInodeUpdate *m);

  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(MDirUpdate *m);

  // -- cache expiration --
  void handle_cache_expire(MCacheExpire *m);
  // Delayed-expire queue management for @dir.
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);
  // Dump cache contents to file @fn and/or Formatter @f, optionally
  // restricted to the subtree under @dump_root.
  // NOTE(review): the tail of this overload's parameter list is elided in
  // this view of the file.
  int dump_cache(boost::string_view fn, Formatter *f,
		 boost::string_view dump_root = "",

  // Convenience overloads forwarding to the full dump_cache() above.
  int dump_cache() { return dump_cache(NULL, NULL); }
  int dump_cache(boost::string_view filename);
  int dump_cache(Formatter *f);
  int dump_cache(boost::string_view dump_root, int depth, Formatter *f);

  int cache_status(Formatter *f);

  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;

  // hack: debug-print the subtree map at debug level @dbl
  void show_subtrees(int dbl=10);
  // hack: pick an arbitrary cached inode (uniformly over inode_map).
  // Precondition (asserted): the cache is non-empty.
  CInode *hack_pick_random_inode() {
    assert(!inode_map.empty());
    int n = rand() % inode_map.size();
    auto p = inode_map.begin();
    // NOTE(review): the remainder of this function body is elided in this
    // view of the file.

  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  friend class C_MDC_RepairDirfragStats;

  // Flush the journal entries pinning the dentry at @path, then call @fin.
  void flush_dentry(boost::string_view path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(boost::string_view path, boost::string_view tag,
		     bool force, bool recursive, bool repair,
		     Formatter *f, Context *fin);
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);

  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  std::set<CInode *> export_pin_queue;
1227 class C_MDS_RetryRequest
: public MDSInternalContext
{
1231 C_MDS_RetryRequest(MDCache
*c
, MDRequestRef
& r
);
1232 void finish(int r
) override
;