ceph/src/mds/MDCache.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15
  16
  17 #ifndef CEPH_MDCACHE_H
  18 #define CEPH_MDCACHE_H
  19
  20 #include <string_view>
  21
  22 #include "common/DecayCounter.h"
  23 #include "include/types.h"
  24 #include "include/filepath.h"
  25 #include "include/elist.h"
  26
  27 #include "messages/MCacheExpire.h"
  28 #include "messages/MClientQuota.h"
  29 #include "messages/MClientRequest.h"
  30 #include "messages/MClientSnap.h"
  31 #include "messages/MDentryLink.h"
  32 #include "messages/MDentryUnlink.h"
  33 #include "messages/MDirUpdate.h"
  34 #include "messages/MDiscover.h"
  35 #include "messages/MDiscoverReply.h"
  36 #include "messages/MGatherCaps.h"
  37 #include "messages/MGenericMessage.h"
  38 #include "messages/MInodeFileCaps.h"
  39 #include "messages/MLock.h"
  40 #include "messages/MMDSCacheRejoin.h"
  41 #include "messages/MMDSFindIno.h"
  42 #include "messages/MMDSFindInoReply.h"
  43 #include "messages/MMDSFragmentNotify.h"
  44 #include "messages/MMDSFragmentNotifyAck.h"
  45 #include "messages/MMDSOpenIno.h"
  46 #include "messages/MMDSOpenInoReply.h"
  47 #include "messages/MMDSResolve.h"
  48 #include "messages/MMDSResolveAck.h"
  49 #include "messages/MMDSSlaveRequest.h"
  50 #include "messages/MMDSSnapUpdate.h"
  51
  52
  53 #include "osdc/Filer.h"
  54 #include "CInode.h"
  55 #include "CDentry.h"
  56 #include "CDir.h"
  57 #include "include/Context.h"
  58 #include "events/EMetaBlob.h"
  59 #include "RecoveryQueue.h"
  60 #include "StrayManager.h"
  61 #include "OpenFileTable.h"
  62 #include "MDSContext.h"
  63 #include "MDSMap.h"
  64 #include "Mutation.h"
  65
  66
  67 class PerfCounters;
  68
  69 class MDSRank;
  70 class Session;
  71 class Migrator;
  72
  73 class Session;
  74
  75 class ESubtreeMap;
  76
  77 enum {
  78   l_mdc_first = 3000,
  79   // How many inodes currently in stray dentries
  80   l_mdc_num_strays,
  81   // How many stray dentries are currently delayed for purge due to refs
  82   l_mdc_num_strays_delayed,
  83   // How many stray dentries are currently being enqueued for purge
  84   l_mdc_num_strays_enqueuing,
  85
  86   // How many dentries have ever been added to stray dir
  87   l_mdc_strays_created,
  88   // How many dentries have been passed on to PurgeQueue
  89   l_mdc_strays_enqueued,
  90   // How many strays have been reintegrated?
  91   l_mdc_strays_reintegrated,
  92   // How many strays have been migrated?
  93   l_mdc_strays_migrated,
  94
  95   // How many inode sizes currently being recovered
  96   l_mdc_num_recovering_processing,
  97   // How many inodes currently waiting to have size recovered
  98   l_mdc_num_recovering_enqueued,
  99   // How many inodes waiting with elevated priority for recovery
 100   l_mdc_num_recovering_prioritized,
 101   // How many inodes ever started size recovery
 102   l_mdc_recovery_started,
 103   // How many inodes ever completed size recovery
 104   l_mdc_recovery_completed,
 105
 106   l_mdss_ireq_enqueue_scrub,
 107   l_mdss_ireq_exportdir,
 108   l_mdss_ireq_flush,
 109   l_mdss_ireq_fragmentdir,
 110   l_mdss_ireq_fragstats,
 111   l_mdss_ireq_inodestats,
 112
 113   l_mdc_last,
 114 };
 115
 116
 117 // flags for predirty_journal_parents()
 118 static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
 119 static const int PREDIRTY_DIR = 2;     // update parent dir mtime/size
 120 static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
 121
 122 class MDCache {
 123  public:
 124   using clock = ceph::coarse_mono_clock;
 125   using time = ceph::coarse_mono_time;
 126
 127   typedef std::map<mds_rank_t, MCacheExpire::ref> expiremap;
 128
 129   // my master
 130   MDSRank *mds;
 131
 132   // -- my cache --
 133   LRU lru;   // dentry lru for expiring items from cache
 134   LRU bottom_lru; // dentries that should be trimmed ASAP
 135  protected:
 136   ceph::unordered_map<inodeno_t,CInode*> inode_map;  // map of head inodes by ino
 137   map<vinodeno_t, CInode*> snap_inode_map;  // map of snap inodes by ino
 138   CInode *root;                            // root inode
 139   CInode *myin;                            // .ceph/mds%d dir
 140
 141   bool readonly;
 142   void set_readonly() { readonly = true; }
 143
 144   CInode *strays[NUM_STRAY];         // my stray dir
 145   int stray_index;
 146
 147   CInode *get_stray() {
 148     return strays[stray_index];
 149   }
 150
 151   set<CInode*> base_inodes;
 152
 153   std::unique_ptr<PerfCounters> logger;
 154
 155   Filer filer;
 156
 157   bool exceeded_size_limit;
 158
 159 private:
 160   uint64_t cache_inode_limit;
 161   uint64_t cache_memory_limit;
 162   double cache_reservation;
 163   double cache_health_threshold;
 164
 165 public:
 166   uint64_t cache_limit_inodes(void) {
 167     return cache_inode_limit;
 168   }
 169   uint64_t cache_limit_memory(void) {
 170     return cache_memory_limit;
 171   }
 172   double cache_toofull_ratio(void) const {
 173     double inode_reserve = cache_inode_limit*(1.0-cache_reservation);
 174     double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
 175     return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, cache_inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
 176   }
 177   bool cache_toofull(void) const {
 178     return cache_toofull_ratio() > 0.0;
 179   }
 180   uint64_t cache_size(void) const {
 181     return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
 182   }
 183   bool cache_overfull(void) const {
 184     return (cache_inode_limit > 0 && CInode::count() > cache_inode_limit*cache_health_threshold) || (cache_size() > cache_memory_limit*cache_health_threshold);
 185   }
 186
 187   void advance_stray() {
 188     stray_index = (stray_index+1)%NUM_STRAY;
 189   }
 190
 191   /**
 192    * Call this when you know that a CDentry is ready to be passed
 193    * on to StrayManager (i.e. this is a stray you've just created)
 194    */
 195   void notify_stray(CDentry *dn) {
 196     ceph_assert(dn->get_dir()->get_inode()->is_stray());
 197     if (dn->state_test(CDentry::STATE_PURGING))
 198       return;
 199
 200     stray_manager.eval_stray(dn);
 201   }
 202
 203   void maybe_eval_stray(CInode *in, bool delay=false);
 204   void clear_dirty_bits_for_stray(CInode* diri);
 205
 206   bool is_readonly() { return readonly; }
 207   void force_readonly();
 208
 209   DecayRate decayrate;
 210
 211   int num_shadow_inodes;
 212
 213   int num_inodes_with_caps;
 214
 215   unsigned max_dir_commit_size;
 216
 217   static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
 218   static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);
 219
 220   file_layout_t default_file_layout;
 221   file_layout_t default_log_layout;
 222
 223   void register_perfcounters();
 224
 225   // -- client leases --
 226 public:
 227   static const int client_lease_pools = 3;
 228   float client_lease_durations[client_lease_pools];
 229 protected:
 230   xlist<ClientLease*> client_leases[client_lease_pools];
 231 public:
 232   void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
 233     client_leases[pool].push_back(&r->item_lease);
 234     r->ttl = ttl;
 235   }
 236
 237   void notify_stray_removed()
 238   {
 239     stray_manager.notify_stray_removed();
 240   }
 241
 242   void notify_stray_created()
 243   {
 244     stray_manager.notify_stray_created();
 245   }
 246
 247   void eval_remote(CDentry *dn)
 248   {
 249     stray_manager.eval_remote(dn);
 250   }
 251
 252   // -- client caps --
 253   uint64_t              last_cap_id;
 254
 255
 256
 257   // -- discover --
 258   struct discover_info_t {
 259     ceph_tid_t tid;
 260     mds_rank_t mds;
 261     inodeno_t ino;
 262     frag_t frag;
 263     snapid_t snap;
 264     filepath want_path;
 265     CInode *basei;
 266     bool want_base_dir;
 267     bool want_xlocked;
 268
 269     discover_info_t() :
 270       tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL),
 271       want_base_dir(false), want_xlocked(false) {}
 272     ~discover_info_t() {
 273       if (basei)
 274         basei->put(MDSCacheObject::PIN_DISCOVERBASE);
 275     }
 276     void pin_base(CInode *b) {
 277       basei = b;
 278       basei->get(MDSCacheObject::PIN_DISCOVERBASE);
 279     }
 280   };
 281
 282   map<ceph_tid_t, discover_info_t> discovers;
 283   ceph_tid_t discover_last_tid;
 284
 285   void _send_discover(discover_info_t& dis);
 286   discover_info_t& _create_discover(mds_rank_t mds) {
 287     ceph_tid_t t = ++discover_last_tid;
 288     discover_info_t& d = discovers[t];
 289     d.tid = t;
 290     d.mds = mds;
 291     return d;
 292   }
 293
 294   // waiters
 295   map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;
 296
 297   void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
 298   void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
 299                          mds_rank_t from=MDS_RANK_NONE);
 300   void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
 301                      bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE);
 302   void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
 303                      bool want_xlocked=false);
 304   void kick_discovers(mds_rank_t who);  // after a failure.
 305
 306
 307   // -- subtrees --
 308 protected:
 309   /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
 310   map<CDir*,set<CDir*> > subtrees;
 311   map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir
 312
 313   // adjust subtree auth specification
 314   //  dir->dir_auth
 315   //  imports/exports/nested_exports
 316   //  join/split subtrees as appropriate
 317 public:
 318   bool is_subtrees() { return !subtrees.empty(); }
 319   template<typename T>
 320   void get_subtrees(T& c) {
 321     if constexpr (std::is_same_v<T, std::vector<CDir*>>)
 322       c.reserve(c.size() + subtrees.size());
 323     for (const auto& p : subtrees) {
 324       c.push_back(p.first);
 325     }
 326   }
 327   void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
 328   void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
 329     adjust_subtree_auth(root, mds_authority_t(a,b));
 330   }
 331   void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
 332   void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {
 333     adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
 334   }
 335   void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
 336   void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {
 337     adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
 338   }
 339   void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);
 340   void try_subtree_merge(CDir *root);
 341   void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
 342   void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
 343   void eval_subtree_root(CInode *diri);
 344   CDir *get_subtree_root(CDir *dir);
 345   CDir *get_projected_subtree_root(CDir *dir);
 346   bool is_leaf_subtree(CDir *dir) {
 347     ceph_assert(subtrees.count(dir));
 348     return subtrees[dir].empty();
 349   }
 350   void remove_subtree(CDir *dir);
 351   bool is_subtree(CDir *root) {
 352     return subtrees.count(root);
 353   }
 354   void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
 355   void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
 356   void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
 357   void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);
 358
 359   void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
 360   void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);
 361
 362   auto get_auth_subtrees() {
 363     std::vector<CDir*> c;
 364     for (auto& p : subtrees) {
 365       auto& root = p.first;
 366       if (root->is_auth()) {
 367         c.push_back(root);
 368       }
 369     }
 370     return c;
 371   }
 372
 373   auto get_fullauth_subtrees() {
 374     std::vector<CDir*> c;
 375     for (auto& p : subtrees) {
 376       auto& root = p.first;
 377       if (root->is_full_dir_auth()) {
 378         c.push_back(root);
 379       }
 380     }
 381     return c;
 382   }
 383   auto num_subtrees_fullauth() const {
 384     std::size_t n = 0;
 385     for (auto& p : subtrees) {
 386       auto& root = p.first;
 387       if (root->is_full_dir_auth()) {
 388         ++n;
 389       }
 390     }
 391     return n;
 392   }
 393
 394   auto num_subtrees_fullnonauth() const {
 395     std::size_t n = 0;
 396     for (auto& p : subtrees) {
 397       auto& root = p.first;
 398       if (root->is_full_dir_nonauth()) {
 399         ++n;
 400       }
 401     }
 402     return n;
 403   }
 404
 405   auto num_subtrees() const {
 406     return subtrees.size();
 407   }
 408
 409
 410 protected:
 411   // -- requests --
 412   ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
 413
 414 public:
 415   int get_num_client_requests();
 416
 417   MDRequestRef request_start(const MClientRequest::const_ref& req);
 418   MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, const Message::const_ref &m);
 419   MDRequestRef request_start_internal(int op);
 420   bool have_request(metareqid_t rid) {
 421     return active_requests.count(rid);
 422   }
 423   MDRequestRef request_get(metareqid_t rid);
 424   void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
 425   void request_finish(MDRequestRef& mdr);
 426   void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
 427   void dispatch_request(MDRequestRef& mdr);
 428   void request_drop_foreign_locks(MDRequestRef& mdr);
 429   void request_drop_non_rdlocks(MDRequestRef& r);
 430   void request_drop_locks(MDRequestRef& r);
 431   void request_cleanup(MDRequestRef& r);
 432
 433   void request_kill(MDRequestRef& r);  // called when session closes
 434
 435   // journal/snap helpers
 436   CInode *pick_inode_snap(CInode *in, snapid_t follows);
 437   CInode *cow_inode(CInode *in, snapid_t last);
 438   void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
 439                           snapid_t follows=CEPH_NOSNAP,
 440                           CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
 441   void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
 442                           CInode **pcow_inode=0);
 443   void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
 444
 445   void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
 446                                    int linkunlink, SnapRealm *prealm);
 447   void _project_rstat_inode_to_frag(CInode::mempool_inode & inode, snapid_t ofirst, snapid_t last,
 448                                     CDir *parent, int linkunlink, bool update_inode);
 449   void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
 450                                    snapid_t ofirst, snapid_t last,
 451                                    CInode *pin, bool cow_head);
 452   void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
 453   void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
 454                                 CInode *in, CDir *parent,
 455                                 int flags, int linkunlink=0,
 456                                 snapid_t follows=CEPH_NOSNAP);
 457
 458   // slaves
 459   void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
 460     uncommitted_masters[reqid].ls = ls;
 461     uncommitted_masters[reqid].slaves = slaves;
 462     uncommitted_masters[reqid].safe = safe;
 463   }
 464   void wait_for_uncommitted_master(metareqid_t reqid, MDSContext *c) {
 465     uncommitted_masters[reqid].waiters.push_back(c);
 466   }
 467   bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
 468     auto p = uncommitted_masters.find(reqid);
 469     return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0;
 470   }
 471   void log_master_commit(metareqid_t reqid);
 472   void logged_master_update(metareqid_t reqid);
 473   void _logged_master_commit(metareqid_t reqid);
 474   void committed_master_slave(metareqid_t r, mds_rank_t from);
 475   void finish_committed_masters();
 476
 477   void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);
 478
 479   // -- recovery --
 480 protected:
 481   set<mds_rank_t> recovery_set;
 482
 483 public:
 484   void set_recovery_set(set<mds_rank_t>& s);
 485   void handle_mds_failure(mds_rank_t who);
 486   void handle_mds_recovery(mds_rank_t who);
 487
 488 protected:
 489   // [resolve]
 490   // from EImportStart w/o EImportFinish during journal replay
 491   map<dirfrag_t, vector<dirfrag_t> >            my_ambiguous_imports;
 492   // from MMDSResolves
 493   map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
 494
 495   map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates;  // slave: for replay.
 496   map<CInode*, int> uncommitted_slave_rename_olddir;  // slave: preserve the non-auth dir until seeing commit.
 497   map<CInode*, int> uncommitted_slave_unlink;  // slave: preserve the unlinked inode until seeing commit.
 498
 499   // track master requests whose slaves haven't acknowledged commit
 500   struct umaster {
 501     set<mds_rank_t> slaves;
 502     LogSegment *ls;
 503     MDSContext::vec waiters;
 504     bool safe;
 505     bool committing;
 506     bool recovering;
 507     umaster() : ls(NULL), safe(false), committing(false), recovering(false) {}
 508   };
 509   map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
 510
 511   set<metareqid_t>              pending_masters;
 512   map<int, set<metareqid_t> >   ambiguous_slave_updates;
 513
 514   friend class ESlaveUpdate;
 515   friend class ECommitted;
 516
 517   bool resolves_pending;
 518   set<mds_rank_t> resolve_gather;       // nodes i need resolves from
 519   set<mds_rank_t> resolve_ack_gather;   // nodes i need a resolve_ack from
 520   set<version_t> resolve_snapclient_commits;
 521   map<metareqid_t, mds_rank_t> resolve_need_rollback;  // rollbacks i'm writing to the journal
 522   map<mds_rank_t, MMDSResolve::const_ref> delayed_resolve;
 523
 524   void handle_resolve(const MMDSResolve::const_ref &m);
 525   void handle_resolve_ack(const MMDSResolveAck::const_ref &m);
 526   void process_delayed_resolve();
 527   void discard_delayed_resolve(mds_rank_t who);
 528   void maybe_resolve_finish();
 529   void disambiguate_my_imports();
 530   void disambiguate_other_imports();
 531   void trim_unlinked_inodes();
 532   void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*);
 533   void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
 534   MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
 535
 536   void send_slave_resolves();
 537   void send_subtree_resolves();
 538   void maybe_finish_slave_resolve();
 539
 540 public:
 541   void recalc_auth_bits(bool replay);
 542   void remove_inode_recursive(CInode *in);
 543
 544   bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
 545     auto p = ambiguous_slave_updates.find(master);
 546     return p != ambiguous_slave_updates.end() && p->second.count(reqid);
 547   }
 548   void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
 549     ambiguous_slave_updates[master].insert(reqid);
 550   }
 551   void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
 552     auto p = ambiguous_slave_updates.find(master);
 553     auto q = p->second.find(reqid);
 554     ceph_assert(q != p->second.end());
 555     p->second.erase(q);
 556     if (p->second.empty())
 557       ambiguous_slave_updates.erase(p);
 558   }
 559
 560   void add_rollback(metareqid_t reqid, mds_rank_t master) {
 561     resolve_need_rollback[reqid] = master;
 562   }
 563   void finish_rollback(metareqid_t reqid);
 564
 565   // ambiguous imports
 566   void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
 567   void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
 568   bool have_ambiguous_import(dirfrag_t base) {
 569     return my_ambiguous_imports.count(base);
 570   }
 571   void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
 572     ceph_assert(my_ambiguous_imports.count(base));
 573     bounds = my_ambiguous_imports[base];
 574   }
 575   void cancel_ambiguous_import(CDir *);
 576   void finish_ambiguous_import(dirfrag_t dirino);
 577   void resolve_start(MDSContext *resolve_done_);
 578   void send_resolves();
 579   void maybe_send_pending_resolves() {
 580     if (resolves_pending)
 581       send_subtree_resolves();
 582   }
 583
 584   void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
 585                                map<dirfrag_t,vector<dirfrag_t> >& subtrees);
 586   ESubtreeMap *create_subtree_map();
 587
 588
 589   void clean_open_file_lists();
 590   void dump_openfiles(Formatter *f);
 591   bool dump_inode(Formatter *f, uint64_t number);
 592 protected:
 593   // [rejoin]
 594   bool rejoins_pending;
 595   set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
 596   set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
  597   set<mds_rank_t> rejoin_ack_sent;    // nodes i sent a rejoin ack to
 598   set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
 599   map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
 600   map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
 601
 602   map<client_t,entity_inst_t> rejoin_client_map;
 603   map<client_t,client_metadata_t> rejoin_client_metadata_map;
 604   map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
 605
 606   map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
 607
 608   map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
 609   set<inodeno_t> cap_imports_missing;
 610   map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
 611   int cap_imports_num_opening;
 612
 613   set<CInode*> rejoin_undef_inodes;
 614   set<CInode*> rejoin_potential_updated_scatterlocks;
 615   set<CDir*>   rejoin_undef_dirfrags;
 616   map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;
 617
 618   vector<CInode*> rejoin_recover_q, rejoin_check_q;
 619   list<SimpleLock*> rejoin_eval_locks;
 620   MDSContext::vec rejoin_waiters;
 621
 622   void rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin);
 623   void handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m);
 624   void handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &m);
 625   CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
 626   CDir* rejoin_invent_dirfrag(dirfrag_t df);
 627   void handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &m);
 628   void rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
 629                                       set<vinodeno_t>& acked_inodes,
 630                                       set<SimpleLock *>& gather_locks);
 631   void handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &m);
 632   void rejoin_send_acks();
 633   void rejoin_trim_undef_inodes();
 634   void maybe_send_pending_rejoins() {
 635     if (rejoins_pending)
 636       rejoin_send_rejoins();
 637   }
 638   std::unique_ptr<MDSContext> rejoin_done;
 639   std::unique_ptr<MDSContext> resolve_done;
 640 public:
 641   void rejoin_start(MDSContext *rejoin_done_);
 642   void rejoin_gather_finish();
 643   void rejoin_send_rejoins();
 644   void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
 645                           int target=-1, bool drop_path=false) {
 646     auto& ex = cap_exports[ino];
 647     ex.first = target;
 648     auto &_icr = ex.second[client] = icr;
 649     if (drop_path)
 650       _icr.path.clear();
 651   }
 652   void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
 653                              mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
 654     auto &_icr = cap_imports[ino][client][frommds] = icr;
 655     if (drop_path)
 656       _icr.path.clear();
 657   }
 658   void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
 659     rejoin_client_map.emplace(client, inst);
 660   }
 661   bool rejoin_has_cap_reconnect(inodeno_t ino) const {
 662     return cap_imports.count(ino);
 663   }
 664   void add_replay_ino_alloc(inodeno_t ino) {
 665     cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
 666   }
 667   const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
 668     if (cap_imports.count(ino) &&
 669         cap_imports[ino].count(client) &&
 670         cap_imports[ino][client].count(MDS_RANK_NONE)) {
 671       return &cap_imports[ino][client][MDS_RANK_NONE];
 672     }
 673     return NULL;
 674   }
 675   void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
 676     ceph_assert(cap_imports[ino].size() == 1);
 677     ceph_assert(cap_imports[ino][client].size() == 1);
 678     cap_imports.erase(ino);
 679   }
 680   void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
 681     cap_reconnect_waiters[ino].push_back(c);
 682   }
 683
 684   // [reconnect/rejoin caps]
 685   struct reconnected_cap_info_t {
 686     inodeno_t realm_ino;
 687     snapid_t snap_follows;
 688     int dirty_caps;
 689     bool snapflush;
 690     reconnected_cap_info_t() :
 691       realm_ino(0), snap_follows(0), dirty_caps(0), snapflush(false) {}
 692   };
 693   map<inodeno_t,map<client_t, reconnected_cap_info_t> >  reconnected_caps;   // inode -> client -> snap_follows,realmino
 694   map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq
 695
 696   void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
 697     reconnected_cap_info_t &info = reconnected_caps[ino][client];
 698     info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
 699     info.snap_follows = icr.snap_follows;
 700   }
 701   void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
 702     reconnected_cap_info_t &info = reconnected_caps[ino][client];
 703     info.dirty_caps |= dirty;
 704     if (snapflush)
 705       info.snapflush = snapflush;
 706   }
 707   void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
 708     reconnected_snaprealms[ino][client] = seq;
 709   }
 710
 711   friend class C_MDC_RejoinOpenInoFinish;
 712   friend class C_MDC_RejoinSessionsOpened;
 713   void rejoin_open_ino_finish(inodeno_t ino, int ret);
 714   void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
 715   void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
 716   bool process_imported_caps();
 717   void choose_lock_states_and_reconnect_caps();
 718   void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
 719                            map<client_t,MClientSnap::ref>& splits);
 720   void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,MClientSnap::ref>& splits);
 721   void send_snaps(map<client_t,MClientSnap::ref>& splits);
 722   Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
 723   void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
 724                                   map<client_t,MClientSnap::ref>& updates);
 725   Capability* try_reconnect_cap(CInode *in, Session *session);
 726   void export_remaining_imported_caps();
 727
 728   //  realm inodes
 729   set<CInode*> rejoin_pending_snaprealms;
 730   // cap imports.  delayed snap parent opens.
 731   map<client_t,set<CInode*> > delayed_imported_caps;
 732
 733   void do_cap_import(Session *session, CInode *in, Capability *cap,
 734                      uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
 735                      int peer, int p_flags);
 736   void do_delayed_cap_imports();
 737   void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
 738                               snapid_t snap_follows);
 739   void open_snaprealms();
 740
 741   bool open_undef_inodes_dirfrags();
 742   void opened_undef_inode(CInode *in);
 743   void opened_undef_dirfrag(CDir *dir) {
 744     rejoin_undef_dirfrags.erase(dir);
 745   }
 746
 747   void reissue_all_caps();
 748
 749
 750   friend class Locker;
 751   friend class Migrator;
 752   friend class MDBalancer;
 753
 754   // StrayManager needs to be able to remove_inode() from us
 755   // when it is done purging
 756   friend class StrayManager;
 757
 758   // File size recovery
 759 private:
 760   RecoveryQueue recovery_queue;
 761   void identify_files_to_recover();
 762 public:
 763   void start_files_to_recover();
 764   void do_file_recover();
 765   void queue_file_recover(CInode *in);
 766   void _queued_file_recover_cow(CInode *in, MutationRef& mut);
 767
 768   // subsystems
 769   std::unique_ptr<Migrator> migrator;
 770
 771  public:
 772   explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
 773   ~MDCache();
 774   void handle_conf_change(const ConfigProxy& conf,
 775                           const std::set <std::string> &changed,
 776                           const MDSMap &mds_map);
 777
 778   // debug
 779   void log_stat();
 780
 781   // root inode
 782   CInode *get_root() { return root; }
 783   CInode *get_myin() { return myin; }
 784
 785   size_t get_cache_size() { return lru.lru_get_size(); }
 786
 787   // trimming
 788   std::pair<bool, uint64_t> trim(uint64_t count=0);
 789 private:
 790   std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
 791   bool trim_dentry(CDentry *dn, expiremap& expiremap);
 792   void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
 793   bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
 794   void send_expire_messages(expiremap& expiremap);
 795   void trim_non_auth();      // trim out trimmable non-auth items
 796 public:
 797   bool trim_non_auth_subtree(CDir *directory);
 798   void standby_trim_segment(LogSegment *ls);
 799   void try_trim_non_auth_subtree(CDir *dir);
 800   bool can_trim_non_auth_dirfrag(CDir *dir) {
 801     return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
 802            uncommitted_slave_rename_olddir.count(dir->inode) == 0;
 803   }
 804
 805   /**
 806    * For all unreferenced inodes, dirs, dentries below an inode, compose
 807    * expiry messages.  This is used when giving up all replicas of entities
 808    * for an MDS peer in the 'stopping' state, such that the peer can
 809    * empty its cache and finish shutting down.
 810    *
 811    * We have to make sure we're only expiring un-referenced items to
 812    * avoid interfering with ongoing stray-movement (we can't distinguish
 813    * between the "moving my strays" and "waiting for my cache to empty"
 814    * phases within 'stopping')
 815    *
 816    * @return false if we completed cleanly, true if caller should stop
 817    *         expiring because we hit something with refs.
 818    */
 819   bool expire_recursive(CInode *in, expiremap& expiremap);
 820
 821   void trim_client_leases();
 822   void check_memory_usage();
 823
 824   // shutdown
 825 private:
 826   set<inodeno_t> shutdown_exporting_strays;
 827   pair<dirfrag_t, string> shutdown_export_next;
 828 public:
 829   void shutdown_start();
 830   void shutdown_check();
 831   bool shutdown_pass();
  bool shutdown();                    // clear cache (ie at shutdown)
 833   bool shutdown_export_strays();
 834   void shutdown_export_stray_finish(inodeno_t ino) {
 835     if (shutdown_exporting_strays.erase(ino))
 836       shutdown_export_strays();
 837   }
 838
 839   bool did_shutdown_log_cap;
 840
 841   // inode_map
 842   bool have_inode(vinodeno_t vino) {
 843     if (vino.snapid == CEPH_NOSNAP)
 844       return inode_map.count(vino.ino) ? true : false;
 845     else
 846       return snap_inode_map.count(vino) ? true : false;
 847   }
 848   bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
 849     return have_inode(vinodeno_t(ino, snap));
 850   }
 851   CInode* get_inode(vinodeno_t vino) {
 852     if (vino.snapid == CEPH_NOSNAP) {
 853       auto p = inode_map.find(vino.ino);
 854       if (p != inode_map.end())
 855         return p->second;
 856     } else {
 857       auto p = snap_inode_map.find(vino);
 858       if (p != snap_inode_map.end())
 859         return p->second;
 860     }
 861     return NULL;
 862   }
 863   CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
 864     return get_inode(vinodeno_t(ino, s));
 865   }
 866   CInode* lookup_snap_inode(vinodeno_t vino) {
 867     auto p = snap_inode_map.lower_bound(vino);
 868     if (p != snap_inode_map.end() &&
 869         p->second->ino() == vino.ino && p->second->first <= vino.snapid)
 870       return p->second;
 871     return NULL;
 872   }
 873
 874   CDir* get_dirfrag(dirfrag_t df) {
 875     CInode *in = get_inode(df.ino);
 876     if (!in)
 877       return NULL;
 878     return in->get_dirfrag(df.frag);
 879   }
 880   CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
 881     CInode *in = get_inode(ino);
 882     if (!in)
 883       return NULL;
 884     frag_t fg = in->pick_dirfrag(dn);
 885     return in->get_dirfrag(fg);
 886   }
 887   CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
 888     CInode *diri = get_inode(df.ino);
 889     if (!diri)
 890       return NULL;
 891     CDir *dir = force_dir_fragment(diri, df.frag, replay);
 892     if (!dir)
 893       dir = diri->get_dirfrag(df.frag);
 894     return dir;
 895   }
 896
 897   MDSCacheObject *get_object(const MDSCacheObjectInfo &info);
 898
 899
 900
 901  public:
 902   void add_inode(CInode *in);
 903
 904   void remove_inode(CInode *in);
 905  protected:
 906   void touch_inode(CInode *in) {
 907     if (in->get_parent_dn())
 908       touch_dentry(in->get_projected_parent_dn());
 909   }
 910 public:
 911   void touch_dentry(CDentry *dn) {
 912     if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
 913       bottom_lru.lru_midtouch(dn);
 914     } else {
 915       if (dn->is_auth())
 916         lru.lru_touch(dn);
 917       else
 918         lru.lru_midtouch(dn);
 919     }
 920   }
 921   void touch_dentry_bottom(CDentry *dn) {
 922     if (dn->state_test(CDentry::STATE_BOTTOMLRU))
 923       return;
 924     lru.lru_bottouch(dn);
 925   }
 926 protected:
 927
 928   void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
 929                             set<SimpleLock *>& gather_locks);
 930   void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);
 931
 932   void rename_file(CDentry *srcdn, CDentry *destdn);
 933
 934  public:
 935   // truncate
 936   void truncate_inode(CInode *in, LogSegment *ls);
 937   void _truncate_inode(CInode *in, LogSegment *ls);
 938   void truncate_inode_finish(CInode *in, LogSegment *ls);
 939   void truncate_inode_logged(CInode *in, MutationRef& mut);
 940
 941   void add_recovered_truncate(CInode *in, LogSegment *ls);
 942   void remove_recovered_truncate(CInode *in, LogSegment *ls);
 943   void start_recovered_truncates();
 944
 945
 946  public:
 947   CDir *get_auth_container(CDir *in);
 948   CDir *get_export_container(CDir *dir);
 949   void find_nested_exports(CDir *dir, set<CDir*>& s);
 950   void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
 951
 952
 953 private:
 954   bool opening_root, open;
 955   MDSContext::vec waiting_for_open;
 956
 957 public:
 958   void init_layouts();
 959   void create_unlinked_system_inode(CInode *in, inodeno_t ino,
 960                                     int mode) const;
 961   CInode *create_system_inode(inodeno_t ino, int mode);
 962   CInode *create_root_inode();
 963
 964   void create_empty_hierarchy(MDSGather *gather);
 965   void create_mydir_hierarchy(MDSGather *gather);
 966
 967   bool is_open() { return open; }
 968   void wait_for_open(MDSContext *c) {
 969     waiting_for_open.push_back(c);
 970   }
 971
 972   void open_root_inode(MDSContext *c);
 973   void open_root();
 974   void open_mydir_inode(MDSContext *c);
 975   void open_mydir_frag(MDSContext *c);
 976   void populate_mydir();
 977
 978   void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
 979   void _create_system_file_finish(MutationRef& mut, CDentry *dn,
 980                                   version_t dpv, MDSContext *fin);
 981
 982   void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
 983   CDir *get_stray_dir(CInode *in);
 984   CDentry *get_or_create_stray_dentry(CInode *in);
 985
 986   /**
 987    * Find the given dentry (and whether it exists or not), its ancestors,
 988    * and get them all into memory and usable on this MDS. This function
 989    * makes a best-effort attempt to load everything; if it needs to
 990    * go away and do something then it will put the request on a waitlist.
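   * It prefers the mdr, then the req, then the fin. (At least one of these
   * must be non-null.)
   *
   * NOTE(review): the declaration below takes mdr and a context factory (cf);
   * it has no req or fin parameters, so the sentence above looks stale --
   * confirm which arguments may actually be null.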
 995    *
 996    * @param mdr The MDRequest associated with the path. Can be null.
 997    * @param cf A MDSContextFactory for waiter building.
 998    * @param path The path to traverse to.
 999    * @param pdnvec Data return parameter -- on success, contains a
1000    * vector of dentries. On failure, is either empty or contains the
1001    * full trace of traversable dentries.
1002    * @param pin Data return parameter -- if successful, points to the inode
1003    * associated with filepath. If unsuccessful, is null.
1004    * @param onfail Specifies different lookup failure behaviors. If set to
1005    * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
1006    * dentries (instead of returning -ENOENT). If set to
1007    * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
1008    * MDS if that becomes appropriate (ie, if it doesn't know the contents
1009    * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
1010    * will attempt to look up the path from a different MDS (and bring them
1011    * into its cache as replicas).
1012    *
1013    * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
1014    * If it returns 1, the requester associated with this call has been placed
1015    * on the appropriate waitlist, and it should unwind itself and back out.
1016    * If it returns 2 the request has been forwarded, and again the requester
1017    * should unwind itself and back out.
1018    */
1019   int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, const filepath& path,
1020                     vector<CDentry*> *pdnvec, CInode **pin, int onfail);
1021
1022   CInode *cache_traverse(const filepath& path);
1023
1024   void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
1025   CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);
1026
1027   bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
1028   bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
1029                                    set<CDir*>& fetch_queue, set<inodeno_t>& missing,
1030                                    C_GatherBuilder &gather_bld);
1031
1032   void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
1033                           bool want_xlocked=false);
1034   void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
1035                                   bool want_xlocked, int r);
1036
1037   void make_trace(vector<CDentry*>& trace, CInode *in);
1038
1039 protected:
1040   struct open_ino_info_t {
1041     vector<inode_backpointer_t> ancestors;
1042     set<mds_rank_t> checked;
1043     mds_rank_t checking;
1044     mds_rank_t auth_hint;
1045     bool check_peers;
1046     bool fetch_backtrace;
1047     bool discover;
1048     bool want_replica;
1049     bool want_xlocked;
1050     version_t tid;
1051     int64_t pool;
1052     int last_err;
1053     MDSContext::vec waiters;
1054     open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
1055       check_peers(true), fetch_backtrace(true), discover(false),
1056       want_replica(false), want_xlocked(false), tid(0), pool(-1),
1057       last_err(0) {}
1058   };
1059   ceph_tid_t open_ino_last_tid;
1060   map<inodeno_t,open_ino_info_t> opening_inodes;
1061
1062   void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
1063   void _open_ino_parent_opened(inodeno_t ino, int ret);
1064   void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
1065   void _open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent);
1066   int open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
1067                             const vector<inode_backpointer_t>& ancestors,
1068                             bool discover, bool want_xlocked, mds_rank_t *hint);
1069   void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
1070   void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
1071   void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
1072   void handle_open_ino(const MMDSOpenIno::const_ref &m, int err=0);
1073   void handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m);
1074   friend class C_IO_MDC_OpenInoBacktraceFetched;
1075   friend struct C_MDC_OpenInoTraverseDir;
1076   friend struct C_MDC_OpenInoParentOpened;
1077
1078 public:
1079   void kick_open_ino_peers(mds_rank_t who);
1080   void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
1081                 bool want_replica=true, bool want_xlocked=false);
1082
1083   // -- find_ino_peer --
1084   struct find_ino_peer_info_t {
1085     inodeno_t ino;
1086     ceph_tid_t tid;
1087     MDSContext *fin;
1088     mds_rank_t hint;
1089     mds_rank_t checking;
1090     set<mds_rank_t> checked;
1091
1092     find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
1093   };
1094
1095   map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
1096   ceph_tid_t find_ino_peer_last_tid;
1097
1098   void find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint=MDS_RANK_NONE);
1099   void _do_find_ino_peer(find_ino_peer_info_t& fip);
1100   void handle_find_ino(const MMDSFindIno::const_ref &m);
1101   void handle_find_ino_reply(const MMDSFindInoReply::const_ref &m);
1102   void kick_find_ino_peers(mds_rank_t who);
1103
1104   // -- snaprealms --
1105 private:
1106   SnapRealm *global_snaprealm;
1107 public:
1108   SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
1109   void create_global_snaprealm();
1110   void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
1111   void send_snap_update(CInode *in, version_t stid, int snap_op);
1112   void handle_snap_update(const MMDSSnapUpdate::const_ref &m);
1113   void notify_global_snaprealm_update(int snap_op);
1114
1115   // -- stray --
1116 public:
1117   void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
1118   uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }
1119
1120 protected:
1121   void scan_stray_dir(dirfrag_t next=dirfrag_t());
1122   StrayManager stray_manager;
1123   friend struct C_MDC_RetryScanStray;
1124
1125   // == messages ==
1126  public:
1127   void dispatch(const Message::const_ref &m);
1128
1129  protected:
1130   // -- replicas --
1131   void handle_discover(const MDiscover::const_ref &dis);
1132   void handle_discover_reply(const MDiscoverReply::const_ref &m);
1133   friend class C_MDC_Join;
1134
1135 public:
1136   void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
1137   void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
1138   void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
1139                        uint64_t features);
1140
1141   CDir* add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
1142   CDentry *add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
1143   CInode *add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);
1144
1145   void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
1146   CDentry *add_replica_stray(const bufferlist &bl, mds_rank_t from);
1147
1148   // -- namespace --
1149 public:
1150   void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
1151   void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
1152 protected:
1153   void handle_dentry_link(const MDentryLink::const_ref &m);
1154   void handle_dentry_unlink(const MDentryUnlink::const_ref &m);
1155
1156
1157   // -- fragmenting --
1158 private:
1159   struct ufragment {
1160     int bits;
1161     bool committed;
1162     LogSegment *ls;
1163     MDSContext::vec waiters;
1164     frag_vec_t old_frags;
1165     bufferlist rollback;
1166     ufragment() : bits(0), committed(false), ls(NULL) {}
1167   };
1168   map<dirfrag_t, ufragment> uncommitted_fragments;
1169
1170   struct fragment_info_t {
1171     int bits;
1172     list<CDir*> dirs;
1173     list<CDir*> resultfrags;
1174     MDRequestRef mdr;
1175     set<mds_rank_t> notify_ack_waiting;
1176     bool finishing = false;
1177
1178     // for deadlock detection
1179     bool all_frozen = false;
1180     utime_t last_cum_auth_pins_change;
1181     int last_cum_auth_pins = 0;
1182     int num_remote_waiters = 0; // number of remote authpin waiters
1183     fragment_info_t() {}
1184     bool is_fragmenting() { return !resultfrags.empty(); }
1185     uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; }
1186   };
1187   map<dirfrag_t,fragment_info_t> fragments;
1188   typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;
1189
1190   void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
1191                             list<CDir*>& frags, MDSContext::vec& waiters, bool replay);
1192   void adjust_dir_fragments(CInode *diri,
1193                             list<CDir*>& srcfrags,
1194                             frag_t basefrag, int bits,
1195                             list<CDir*>& resultfrags,
1196                             MDSContext::vec& waiters,
1197                             bool replay);
1198   CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
1199   void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);
1200
1201   bool can_fragment(CInode *diri, list<CDir*>& dirs);
1202   void fragment_freeze_dirs(list<CDir*>& dirs);
1203   void fragment_mark_and_complete(MDRequestRef& mdr);
1204   void fragment_frozen(MDRequestRef& mdr, int r);
1205   void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
1206   void fragment_drop_locks(fragment_info_t &info);
1207   void fragment_maybe_finish(const fragment_info_iterator& it);
1208   void dispatch_fragment_dir(MDRequestRef& mdr);
1209   void _fragment_logged(MDRequestRef& mdr);
1210   void _fragment_stored(MDRequestRef& mdr);
1211   void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
1212   void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);
1213
1214   friend class EFragment;
1215   friend class C_MDC_FragmentFrozen;
1216   friend class C_MDC_FragmentMarking;
1217   friend class C_MDC_FragmentPrep;
1218   friend class C_MDC_FragmentStore;
1219   friend class C_MDC_FragmentCommit;
1220   friend class C_IO_MDC_FragmentPurgeOld;
1221
1222   void handle_fragment_notify(const MMDSFragmentNotify::const_ref &m);
1223   void handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &m);
1224
1225   void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
1226                                 LogSegment *ls, bufferlist *rollback=NULL);
1227   void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
1228   void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
1229
1230
1231   DecayCounter trim_counter;
1232
1233 public:
1234   void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
1235     ceph_assert(uncommitted_fragments.count(dirfrag));
1236     uncommitted_fragments[dirfrag].waiters.push_back(c);
1237   }
1238   void split_dir(CDir *dir, int byn);
1239   void merge_dir(CInode *diri, frag_t fg);
1240   void rollback_uncommitted_fragments();
1241
1242   void find_stale_fragment_freeze();
1243   void fragment_freeze_inc_num_waiters(CDir *dir);
1244   bool fragment_are_all_frozen(CDir *dir);
1245   int get_num_fragmenting_dirs() { return fragments.size(); }
1246
1247   // -- updates --
1248   //int send_inode_updates(CInode *in);
1249   //void handle_inode_update(MInodeUpdate *m);
1250
1251   int send_dir_updates(CDir *in, bool bcast=false);
1252   void handle_dir_update(const MDirUpdate::const_ref &m);
1253
1254   // -- cache expiration --
1255   void handle_cache_expire(const MCacheExpire::const_ref &m);
1256   // delayed cache expire
1257   map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg
1258   void process_delayed_expire(CDir *dir);
1259   void discard_delayed_expire(CDir *dir);
1260
1261 protected:
1262   int dump_cache(std::string_view fn, Formatter *f);
1263 public:
1264   int dump_cache() { return dump_cache(NULL, NULL); }
1265   int dump_cache(std::string_view filename);
1266   int dump_cache(Formatter *f);
1267   void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);
1268
1269   void cache_status(Formatter *f);
1270
1271   void dump_resolve_status(Formatter *f) const;
1272   void dump_rejoin_status(Formatter *f) const;
1273
1274   // == crap fns ==
1275  public:
1276   void show_cache();
1277   void show_subtrees(int dbl=10);
1278
1279   CInode *hack_pick_random_inode() {
1280     ceph_assert(!inode_map.empty());
1281     int n = rand() % inode_map.size();
1282     auto p = inode_map.begin();
1283     while (n--) ++p;
1284     return p->second;
1285   }
1286
1287 protected:
1288   void flush_dentry_work(MDRequestRef& mdr);
1289   /**
1290    * Resolve path to a dentry and pass it onto the ScrubStack.
1291    *
1292    * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
1294    * this scrub (we won't block them on a whole scrub as it can take a very
1295    * long time)
1296    */
1297   void enqueue_scrub_work(MDRequestRef& mdr);
1298   void recursive_scrub_finish(const ScrubHeaderRef& header);
1299   void repair_inode_stats_work(MDRequestRef& mdr);
1300   void repair_dirfrag_stats_work(MDRequestRef& mdr);
1301   void upgrade_inode_snaprealm_work(MDRequestRef& mdr);
1302   friend class C_MDC_RespondInternalRequest;
1303 public:
1304   void flush_dentry(std::string_view path, Context *fin);
1305   /**
1306    * Create and start an OP_ENQUEUE_SCRUB
1307    */
1308   void enqueue_scrub(std::string_view path, std::string_view tag,
1309                      bool force, bool recursive, bool repair,
1310                      Formatter *f, Context *fin);
1311   void repair_inode_stats(CInode *diri);
1312   void repair_dirfrag_stats(CDir *dir);
1313   void upgrade_inode_snaprealm(CInode *in);
1314
1315 public:
1316   /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
1317   std::set<CInode *> export_pin_queue;
1318
1319   OpenFileTable open_file_table;
1320 };
1321
1322 class C_MDS_RetryRequest : public MDSInternalContext {
1323   MDCache *cache;
1324   MDRequestRef mdr;
1325  public:
1326   C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
1327   void finish(int r) override;
1328 };
1329
1330 #endif