// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H

#include <atomic>
#include <string_view>
#include <thread>

#include "common/DecayCounter.h"
#include "include/common_fwd.h"
#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"

#include "messages/MCacheExpire.h"
#include "messages/MClientQuota.h"
#include "messages/MClientRequest.h"
#include "messages/MClientSnap.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"
#include "messages/MDirUpdate.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
#include "messages/MGatherCaps.h"
#include "messages/MGenericMessage.h"
#include "messages/MInodeFileCaps.h"
#include "messages/MLock.h"
#include "messages/MMDSCacheRejoin.h"
#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
#include "messages/MMDSFragmentNotify.h"
#include "messages/MMDSFragmentNotifyAck.h"
#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"
#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSPeerRequest.h"
#include "messages/MMDSSnapUpdate.h"

#include "osdc/Filer.h"
#include "CInode.h"
#include "CDentry.h"
#include "CDir.h"
#include "include/Context.h"
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "OpenFileTable.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"

class MDSRank;
class Session;
class Migrator;

class ESubtreeMap;

enum {
  l_mdc_first = 3000,
  // How many inodes currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated?
  l_mdc_strays_reintegrated,
  // How many strays have been migrated?
  l_mdc_strays_migrated,

  // How many inode sizes currently being recovered
  l_mdc_num_recovering_processing,
  // How many inodes currently waiting to have size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes ever started size recovery
  l_mdc_recovery_started,
  // How many inodes ever completed size recovery
  l_mdc_recovery_completed,

  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};

// flags for path_traverse()
static const int MDS_TRAVERSE_DISCOVER = (1 << 0);
static const int MDS_TRAVERSE_PATH_LOCKED = (1 << 1);
static const int MDS_TRAVERSE_WANT_DENTRY = (1 << 2);
static const int MDS_TRAVERSE_WANT_AUTH = (1 << 3);
static const int MDS_TRAVERSE_RDLOCK_SNAP = (1 << 4);
static const int MDS_TRAVERSE_RDLOCK_SNAP2 = (1 << 5);
static const int MDS_TRAVERSE_WANT_DIRLAYOUT = (1 << 6);
static const int MDS_TRAVERSE_RDLOCK_PATH = (1 << 7);
static const int MDS_TRAVERSE_XLOCK_DENTRY = (1 << 8);
static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK = (1 << 9);
static const int MDS_TRAVERSE_CHECK_LOCKCACHE = (1 << 10);
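
// These flags are combined bitwise. As an illustrative (not prescriptive)
// example, a lookup that must end on the auth MDS and wants the tail
// dentry xlocked might pass:
//
//   int flags = MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH |
//               MDS_TRAVERSE_XLOCK_DENTRY;
//
// See the path_traverse() documentation below for what each flag does.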


// flags for predirty_journal_parents()
static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
static const int PREDIRTY_DIR = 2;     // update parent dir mtime/size
static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
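
// Illustrative sketch (names such as mut, metablob, newi and dn are assumed
// to be in scope): journalling a newly created primary dentry typically
// predirties both the nested accounting and the parent dir mtime/size:
//
//   predirty_journal_parents(mut, metablob, newi, dn->get_dir(),
//                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);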

class MDCache {
 public:
  typedef std::map<mds_rank_t, ref_t<MCacheExpire>> expiremap;

  using clock = ceph::coarse_mono_clock;
  using time = ceph::coarse_mono_time;

  // -- discover --
  struct discover_info_t {
    discover_info_t() {}
    ~discover_info_t() {
      if (basei)
        basei->put(MDSCacheObject::PIN_DISCOVERBASE);
    }
    void pin_base(CInode *b) {
      basei = b;
      basei->get(MDSCacheObject::PIN_DISCOVERBASE);
    }

    ceph_tid_t tid = 0;
    mds_rank_t mds = -1;
    inodeno_t ino;
    frag_t frag;
    snapid_t snap = CEPH_NOSNAP;
    filepath want_path;
    CInode *basei = nullptr;
    bool want_base_dir = false;
    bool path_locked = false;
  };

  // [reconnect/rejoin caps]
  struct reconnected_cap_info_t {
    reconnected_cap_info_t() {}
    inodeno_t realm_ino = 0;
    snapid_t snap_follows = 0;
    int dirty_caps = 0;
    bool snapflush = false;
  };

  // -- find_ino_peer --
  struct find_ino_peer_info_t {
    find_ino_peer_info_t() {}
    inodeno_t ino;
    ceph_tid_t tid = 0;
    MDSContext *fin = nullptr;
    bool path_locked = false;
    mds_rank_t hint = MDS_RANK_NONE;
    mds_rank_t checking = MDS_RANK_NONE;
    set<mds_rank_t> checked;
  };

  friend class C_MDC_RejoinOpenInoFinish;
  friend class C_MDC_RejoinSessionsOpened;

  friend class Locker;
  friend class Migrator;
  friend class MDBalancer;

  // StrayManager needs to be able to remove_inode() from us
  // when it is done purging
  friend class StrayManager;

  explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
  ~MDCache();

  uint64_t cache_limit_memory(void) {
    return cache_memory_limit;
  }
  double cache_toofull_ratio(void) const {
    double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
    return fmax(0.0, (cache_size()-memory_reserve)/memory_reserve);
  }
  bool cache_toofull(void) const {
    return cache_toofull_ratio() > 0.0;
  }
  uint64_t cache_size(void) const {
    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
  }
  bool cache_overfull(void) const {
    return cache_size() > cache_memory_limit*cache_health_threshold;
  }
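
  // For example: with cache_memory_limit = 1 GiB and cache_reservation = 0.05,
  // memory_reserve is 1 GiB * (1.0 - 0.05) = 0.95 GiB. A cache_size() of
  // 1.14 GiB then gives cache_toofull_ratio() = (1.14 - 0.95) / 0.95 = 0.2,
  // so cache_toofull() returns true; cache_overfull() instead trips once
  // cache_size() exceeds cache_memory_limit * cache_health_threshold.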
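
  // Advance stray_index to the next of the NUM_STRAY stray directories
  // (round-robin over strays[]; see get_stray() below).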
  void advance_stray();

  unsigned get_ephemeral_dist_frag_bits() const {
    return export_ephemeral_dist_frag_bits;
  }
  bool get_export_ephemeral_distributed_config(void) const {
    return export_ephemeral_distributed_config;
  }

  bool get_export_ephemeral_random_config(void) const {
    return export_ephemeral_random_config;
  }

  /**
   * Call this when you know that a CDentry is ready to be passed
   * on to StrayManager (i.e. this is a stray you've just created)
   */
  void notify_stray(CDentry *dn) {
    ceph_assert(dn->get_dir()->get_inode()->is_stray());
    if (dn->state_test(CDentry::STATE_PURGING))
      return;

    stray_manager.eval_stray(dn);
  }
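
  // Maps an inode (and optionally a dirfrag) to an MDS rank; presumably
  // used by the ephemeral distributed export-pin logic to spread inodes
  // across the active ranks.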
  mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0);

  void maybe_eval_stray(CInode *in, bool delay=false);
  void clear_dirty_bits_for_stray(CInode* diri);

  bool is_readonly() { return readonly; }
  void force_readonly();

  static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
  static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);

  void register_perfcounters();

  void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
    client_leases[pool].push_back(&r->item_lease);
    r->ttl = ttl;
  }

  void notify_stray_removed()
  {
    stray_manager.notify_stray_removed();
  }

  void notify_stray_created()
  {
    stray_manager.notify_stray_created();
  }

  void eval_remote(CDentry *dn)
  {
    stray_manager.eval_remote(dn);
  }

  void _send_discover(discover_info_t& dis);
  discover_info_t& _create_discover(mds_rank_t mds) {
    ceph_tid_t t = ++discover_last_tid;
    discover_info_t& d = discovers[t];
    d.tid = t;
    d.mds = mds;
    return d;
  }

  void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
  void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
                         mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
                     bool path_locked=false, mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
                     bool path_locked=false);
  void kick_discovers(mds_rank_t who);  // after a failure.

  // adjust subtree auth specification
  //  dir->dir_auth
  //  imports/exports/nested_exports
  //  join/split subtrees as appropriate
  bool is_subtrees() { return !subtrees.empty(); }
  template<typename T>
  void get_subtrees(T& c) {
    if constexpr (std::is_same_v<T, std::vector<CDir*>>)
      c.reserve(c.size() + subtrees.size());
    for (const auto& p : subtrees) {
      c.push_back(p.first);
    }
  }
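
  // Illustrative usage: collect all subtree roots into a vector; the
  // reserve() above avoids reallocations for the common
  // std::vector<CDir*> case.
  //
  //   std::vector<CDir*> roots;
  //   get_subtrees(roots);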
  void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
  void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
    adjust_subtree_auth(root, mds_authority_t(a,b));
  }
  void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
  void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);
  void try_subtree_merge(CDir *root);
  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
  void eval_subtree_root(CInode *diri);
  CDir *get_subtree_root(CDir *dir);
  CDir *get_projected_subtree_root(CDir *dir);
  bool is_leaf_subtree(CDir *dir) {
    ceph_assert(subtrees.count(dir));
    return subtrees[dir].empty();
  }
  void remove_subtree(CDir *dir);
  bool is_subtree(CDir *root) {
    return subtrees.count(root);
  }
  void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);

  void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
  void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);

  auto get_auth_subtrees() {
    std::vector<CDir*> c;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_auth()) {
        c.push_back(root);
      }
    }
    return c;
  }

  auto get_fullauth_subtrees() {
    std::vector<CDir*> c;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_auth()) {
        c.push_back(root);
      }
    }
    return c;
  }
  auto num_subtrees_fullauth() const {
    std::size_t n = 0;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_auth()) {
        ++n;
      }
    }
    return n;
  }

  auto num_subtrees_fullnonauth() const {
    std::size_t n = 0;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_nonauth()) {
        ++n;
      }
    }
    return n;
  }

  auto num_subtrees() const {
    return subtrees.size();
  }

  int get_num_client_requests();

  MDRequestRef request_start(const cref_t<MClientRequest>& req);
  MDRequestRef request_start_peer(metareqid_t rid, __u32 attempt, const cref_t<Message> &m);
  MDRequestRef request_start_internal(int op);
  bool have_request(metareqid_t rid) {
    return active_requests.count(rid);
  }
  MDRequestRef request_get(metareqid_t rid);
  void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
  void request_finish(MDRequestRef& mdr);
  void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
  void dispatch_request(MDRequestRef& mdr);
  void request_drop_foreign_locks(MDRequestRef& mdr);
  void request_drop_non_rdlocks(MDRequestRef& r);
  void request_drop_locks(MDRequestRef& r);
  void request_cleanup(MDRequestRef& r);

  void request_kill(MDRequestRef& r);  // called when session closes

  // journal/snap helpers
  CInode *pick_inode_snap(CInode *in, snapid_t follows);
  CInode *cow_inode(CInode *in, snapid_t last);
  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
                          snapid_t follows=CEPH_NOSNAP,
                          CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);

  void project_rstat_inode_to_frag(const MutationRef& mut,
                                   CInode *cur, CDir *parent, snapid_t first,
                                   int linkunlink, SnapRealm *prealm);
  void _project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
                                    CDir *parent, int linkunlink, bool update_inode);
  void project_rstat_frag_to_inode(const nest_info_t& rstat, const nest_info_t& accounted_rstat,
                                   snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head);
  void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                CInode *in, CDir *parent,
                                int flags, int linkunlink=0,
                                snapid_t follows=CEPH_NOSNAP);

  // peers
  void add_uncommitted_leader(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &peers, bool safe=false) {
    uncommitted_leaders[reqid].ls = ls;
    uncommitted_leaders[reqid].peers = peers;
    uncommitted_leaders[reqid].safe = safe;
  }
  void wait_for_uncommitted_leader(metareqid_t reqid, MDSContext *c) {
    uncommitted_leaders[reqid].waiters.push_back(c);
  }
  bool have_uncommitted_leader(metareqid_t reqid, mds_rank_t from) {
    auto p = uncommitted_leaders.find(reqid);
    return p != uncommitted_leaders.end() && p->second.peers.count(from) > 0;
  }
  void log_leader_commit(metareqid_t reqid);
  void logged_leader_update(metareqid_t reqid);
  void _logged_leader_commit(metareqid_t reqid);
  void committed_leader_peer(metareqid_t r, mds_rank_t from);
  void finish_committed_leaders();

  void add_uncommitted_peer(metareqid_t reqid, LogSegment*, mds_rank_t, MDPeerUpdate *su=nullptr);
  void wait_for_uncommitted_peer(metareqid_t reqid, MDSContext *c) {
    uncommitted_peers.at(reqid).waiters.push_back(c);
  }
  void finish_uncommitted_peer(metareqid_t reqid, bool assert_exist=true);
  MDPeerUpdate* get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader);
  void _logged_peer_commit(mds_rank_t from, metareqid_t reqid);

  void set_recovery_set(set<mds_rank_t>& s);
  void handle_mds_failure(mds_rank_t who);
  void handle_mds_recovery(mds_rank_t who);

  void recalc_auth_bits(bool replay);
  void remove_inode_recursive(CInode *in);

  bool is_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    auto p = ambiguous_peer_updates.find(leader);
    return p != ambiguous_peer_updates.end() && p->second.count(reqid);
  }
  void add_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    ambiguous_peer_updates[leader].insert(reqid);
  }
  void remove_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    auto p = ambiguous_peer_updates.find(leader);
    auto q = p->second.find(reqid);
    ceph_assert(q != p->second.end());
    p->second.erase(q);
    if (p->second.empty())
      ambiguous_peer_updates.erase(p);
  }

  void add_rollback(metareqid_t reqid, mds_rank_t leader) {
    resolve_need_rollback[reqid] = leader;
  }
  void finish_rollback(metareqid_t reqid, MDRequestRef& mdr);

  // ambiguous imports
  void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
  void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
  bool have_ambiguous_import(dirfrag_t base) {
    return my_ambiguous_imports.count(base);
  }
  void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
    ceph_assert(my_ambiguous_imports.count(base));
    bounds = my_ambiguous_imports[base];
  }
  void cancel_ambiguous_import(CDir *);
  void finish_ambiguous_import(dirfrag_t dirino);
  void resolve_start(MDSContext *resolve_done_);
  void send_resolves();
  void maybe_send_pending_resolves() {
    if (resolves_pending)
      send_subtree_resolves();
  }

  void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
                               map<dirfrag_t,vector<dirfrag_t> >& subtrees);
  ESubtreeMap *create_subtree_map();

  void clean_open_file_lists();
  void dump_openfiles(Formatter *f);
  bool dump_inode(Formatter *f, uint64_t number);

  void rejoin_start(MDSContext *rejoin_done_);
  void rejoin_gather_finish();
  void rejoin_send_rejoins();
  void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                          int target=-1, bool drop_path=false) {
    auto& ex = cap_exports[ino];
    ex.first = target;
    auto &_icr = ex.second[client] = icr;
    if (drop_path)
      _icr.path.clear();
  }
  void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                             mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
    auto &_icr = cap_imports[ino][client][frommds] = icr;
    if (drop_path)
      _icr.path.clear();
  }
  void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
    rejoin_client_map.emplace(client, inst);
  }
  bool rejoin_has_cap_reconnect(inodeno_t ino) const {
    return cap_imports.count(ino);
  }
  void add_replay_ino_alloc(inodeno_t ino) {
    cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
  }
  const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
    if (cap_imports.count(ino) &&
        cap_imports[ino].count(client) &&
        cap_imports[ino][client].count(MDS_RANK_NONE)) {
      return &cap_imports[ino][client][MDS_RANK_NONE];
    }
    return NULL;
  }
  void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
    ceph_assert(cap_imports[ino].size() == 1);
    ceph_assert(cap_imports[ino][client].size() == 1);
    cap_imports.erase(ino);
  }
  void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
    cap_reconnect_waiters[ino].push_back(c);
  }

  void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
    info.snap_follows = icr.snap_follows;
  }
  void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.dirty_caps |= dirty;
    if (snapflush)
      info.snapflush = snapflush;
  }
  void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
    reconnected_snaprealms[ino][client] = seq;
  }

  void rejoin_open_ino_finish(inodeno_t ino, int ret);
  void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
  void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
  bool process_imported_caps();
  void choose_lock_states_and_reconnect_caps();
  void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                           map<client_t,ref_t<MClientSnap>>& splits);
  void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,ref_t<MClientSnap>>& splits);
  void send_snaps(map<client_t,ref_t<MClientSnap>>& splits);
  Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
  void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
                                  map<client_t,ref_t<MClientSnap>>& updates);
  Capability* try_reconnect_cap(CInode *in, Session *session);
  void export_remaining_imported_caps();

  void do_cap_import(Session *session, CInode *in, Capability *cap,
                     uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                     int peer, int p_flags);
  void do_delayed_cap_imports();
  void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
                              snapid_t snap_follows);
  void open_snaprealms();

  bool open_undef_inodes_dirfrags();
  void opened_undef_inode(CInode *in);
  void opened_undef_dirfrag(CDir *dir) {
    rejoin_undef_dirfrags.erase(dir);
  }

  void reissue_all_caps();

  void start_files_to_recover();
  void do_file_recover();
  void queue_file_recover(CInode *in);
  void _queued_file_recover_cow(CInode *in, MutationRef& mut);

  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);

  // debug
  void log_stat();

  // root inode
  CInode *get_root() { return root; }
  CInode *get_myin() { return myin; }

  size_t get_cache_size() { return lru.lru_get_size(); }

  // trimming
  std::pair<bool, uint64_t> trim(uint64_t count=0);

  bool trim_non_auth_subtree(CDir *directory);
  void standby_trim_segment(LogSegment *ls);
  void try_trim_non_auth_subtree(CDir *dir);
  bool can_trim_non_auth_dirfrag(CDir *dir) {
    return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
           uncommitted_peer_rename_olddir.count(dir->inode) == 0;
  }

  /**
   * For all unreferenced inodes, dirs, dentries below an inode, compose
   * expiry messages. This is used when giving up all replicas of entities
   * for an MDS peer in the 'stopping' state, such that the peer can
   * empty its cache and finish shutting down.
   *
   * We have to make sure we're only expiring un-referenced items to
   * avoid interfering with ongoing stray-movement (we can't distinguish
   * between the "moving my strays" and "waiting for my cache to empty"
   * phases within 'stopping').
   *
   * @return false if we completed cleanly, true if caller should stop
   * expiring because we hit something with refs.
   */
  bool expire_recursive(CInode *in, expiremap& expiremap);

  void trim_client_leases();
  void check_memory_usage();

  void shutdown_start();
  void shutdown_check();
  bool shutdown_pass();
  bool shutdown();  // clear cache (i.e., at shutdown)
  bool shutdown_export_strays();
  void shutdown_export_stray_finish(inodeno_t ino) {
    if (shutdown_exporting_strays.erase(ino))
      shutdown_export_strays();
  }

  // inode_map
  bool have_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP)
      return inode_map.count(vino.ino) ? true : false;
    else
      return snap_inode_map.count(vino) ? true : false;
  }
  bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
    return have_inode(vinodeno_t(ino, snap));
  }
  CInode* get_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP) {
      auto p = inode_map.find(vino.ino);
      if (p != inode_map.end())
        return p->second;
    } else {
      auto p = snap_inode_map.find(vino);
      if (p != snap_inode_map.end())
        return p->second;
    }
    return NULL;
  }
  CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
    return get_inode(vinodeno_t(ino, s));
  }
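  // lookup_snap_inode(): snap_inode_map is keyed by (ino, last), so
  // lower_bound() lands on the snapped inode with the smallest 'last' >=
  // the requested snapid; the check below confirms that its [first, last]
  // interval actually covers vino.snapid.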
  CInode* lookup_snap_inode(vinodeno_t vino) {
    auto p = snap_inode_map.lower_bound(vino);
    if (p != snap_inode_map.end() &&
        p->second->ino() == vino.ino && p->second->first <= vino.snapid)
      return p->second;
    return NULL;
  }

  CDir* get_dirfrag(dirfrag_t df) {
    CInode *in = get_inode(df.ino);
    if (!in)
      return NULL;
    return in->get_dirfrag(df.frag);
  }
  CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
    CInode *in = get_inode(ino);
    if (!in)
      return NULL;
    frag_t fg = in->pick_dirfrag(dn);
    return in->get_dirfrag(fg);
  }
  CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
    CInode *diri = get_inode(df.ino);
    if (!diri)
      return NULL;
    CDir *dir = force_dir_fragment(diri, df.frag, replay);
    if (!dir)
      dir = diri->get_dirfrag(df.frag);
    return dir;
  }

  MDSCacheObject *get_object(const MDSCacheObjectInfo &info);

  void add_inode(CInode *in);

  void remove_inode(CInode *in);
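
  // Dentry LRU policy: dentries flagged BOTTOMLRU live in bottom_lru and
  // are trimmed first; auth dentries are touched to the hot end of the
  // main LRU, while replicas are only mid-touched so they age out sooner.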
  void touch_dentry(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
      bottom_lru.lru_midtouch(dn);
    } else {
      if (dn->is_auth())
        lru.lru_touch(dn);
      else
        lru.lru_midtouch(dn);
    }
  }
  void touch_dentry_bottom(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      return;
    lru.lru_bottouch(dn);
  }

  // truncate
  void truncate_inode(CInode *in, LogSegment *ls);
  void _truncate_inode(CInode *in, LogSegment *ls);
  void truncate_inode_finish(CInode *in, LogSegment *ls);
  void truncate_inode_logged(CInode *in, MutationRef& mut);

  void add_recovered_truncate(CInode *in, LogSegment *ls);
  void remove_recovered_truncate(CInode *in, LogSegment *ls);
  void start_recovered_truncates();

  // purge unsafe inodes
  void start_purge_inodes();
  void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls);

  CDir *get_auth_container(CDir *in);
  CDir *get_export_container(CDir *dir);
  void find_nested_exports(CDir *dir, set<CDir*>& s);
  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);

  void init_layouts();
  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                    int mode) const;
  CInode *create_system_inode(inodeno_t ino, int mode);
  CInode *create_root_inode();

  void create_empty_hierarchy(MDSGather *gather);
  void create_mydir_hierarchy(MDSGather *gather);

  bool is_open() { return open; }
  void wait_for_open(MDSContext *c) {
    waiting_for_open.push_back(c);
  }

  void open_root_inode(MDSContext *c);
  void open_root();
  void open_mydir_inode(MDSContext *c);
  void open_mydir_frag(MDSContext *c);
  void populate_mydir();

  void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
                                  version_t dpv, MDSContext *fin);

  void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
  CDir *get_stray_dir(CInode *in);

  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS. This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * Waiters are constructed through the context factory cf.
   *
   * @param mdr The MDRequest associated with the path. Can be null.
   * @param cf An MDSContextFactory for waiter building.
   * @param path The path to traverse to.
   *
   * @param flags Specifies different lookup behaviors.
   * By default, path_traverse() forwards the request to the auth MDS if that
   * is appropriate (i.e., if it doesn't know the contents of a directory).
   * MDS_TRAVERSE_DISCOVER: Instead of forwarding the request, path_traverse()
   * attempts to look up the path from a different MDS (and bring the items
   * into its cache as replicas).
   * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when an xlocked
   * dentry is encountered.
   * MDS_TRAVERSE_WANT_DENTRY: Caller wants the tail dentry. Add a null dentry
   * if the tail dentry does not exist; return 0 even if the tail dentry is
   * null.
   * MDS_TRAVERSE_WANT_AUTH: Always forward the request to the auth MDS of the
   * target inode, or the auth MDS of the tail dentry (if
   * MDS_TRAVERSE_WANT_DENTRY is set).
   *
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries. On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath. If unsuccessful, is null.
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2 the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
                    const filepath& path, int flags,
                    vector<CDentry*> *pdnvec, CInode **pin=nullptr);
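
  // Illustrative call, mirroring the return-code contract documented above
  // (mdr, cf and path are assumed to be in scope; handle_error() is a
  // hypothetical stand-in):
  //
  //   std::vector<CDentry*> trace;
  //   CInode *in = nullptr;
  //   int r = path_traverse(mdr, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
  //   if (r > 0)
  //     return;           // 1 = waiting, 2 = forwarded; unwind and back out
  //   if (r < 0)
  //     handle_error(-r);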

  CInode *cache_traverse(const filepath& path);

  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);

  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
                                   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
                                   C_GatherBuilder &gather_bld);

  void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
                          bool want_xlocked=false);
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
                                  bool want_xlocked, int r);

  void make_trace(vector<CDentry*>& trace, CInode *in);

  void kick_open_ino_peers(mds_rank_t who);
  void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
                bool want_replica=true, bool want_xlocked=false,
                vector<inode_backpointer_t> *ancestors_hint=nullptr,
                mds_rank_t auth_hint=MDS_RANK_NONE);

  void find_ino_peers(inodeno_t ino, MDSContext *c,
                      mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  void handle_find_ino(const cref_t<MMDSFindIno> &m);
  void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m);
  void kick_find_ino_peers(mds_rank_t who);

  SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
  void create_global_snaprealm();
  void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
  void send_snap_update(CInode *in, version_t stid, int snap_op);
  void handle_snap_update(const cref_t<MMDSSnapUpdate> &m);
  void notify_global_snaprealm_update(int snap_op);

  // -- stray --
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }

  // == messages ==
  void dispatch(const cref_t<Message> &m);

  void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
  void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
  void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                            uint64_t features);

  void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
  void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
  void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);

  void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  void decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from);

  // -- namespace --
  void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl);
  void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p);
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);

  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
    uncommitted_fragments.at(dirfrag).waiters.push_back(c);
  }
  bool is_any_uncommitted_fragment() const {
    return !uncommitted_fragments.empty();
  }
  void wait_for_uncommitted_fragments(MDSContext* finisher);
  void rollback_uncommitted_fragments();

  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);

  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
  int get_num_fragmenting_dirs() { return fragments.size(); }

  // -- updates --
  //int send_inode_updates(CInode *in);
  //void handle_inode_update(MInodeUpdate *m);

  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(const cref_t<MDirUpdate> &m);

  // -- cache expiration --
  void handle_cache_expire(const cref_t<MCacheExpire> &m);
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);

  // -- mdsmap --
  void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap);

  int dump_cache() { return dump_cache({}, nullptr); }
  int dump_cache(std::string_view filename);
  int dump_cache(Formatter *f);
  void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);

  void cache_status(Formatter *f);

  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;

  // == crap fns ==
  void show_cache();
  void show_subtrees(int dbl=10, bool force_print=false);

  CInode *hack_pick_random_inode() {
    ceph_assert(!inode_map.empty());
    int n = rand() % inode_map.size();
    auto p = inode_map.begin();
    while (n--) ++p;
    return p->second;
  }

  void flush_dentry(std::string_view path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(std::string_view path, std::string_view tag,
                     bool force, bool recursive, bool repair,
                     Formatter *f, Context *fin);
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);
  void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin);

  // my leader
  MDSRank *mds;

  // -- my cache --
  LRU lru;         // dentry lru for expiring items from cache
  LRU bottom_lru;  // dentries that should be trimmed ASAP

  DecayRate decayrate;

  int num_shadow_inodes = 0;

  int num_inodes_with_caps = 0;

  unsigned max_dir_commit_size;

  file_layout_t default_file_layout;
  file_layout_t default_log_layout;

  // -- client leases --
  static constexpr std::size_t client_lease_pools = 3;
  std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};

  // -- client caps --
  uint64_t last_cap_id = 0;

  map<ceph_tid_t, discover_info_t> discovers;
  ceph_tid_t discover_last_tid = 0;

  // waiters
  map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;

  map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps;  // inode -> client -> snap_follows,realmino
  map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq

  // realm inodes
  set<CInode*> rejoin_pending_snaprealms;
  // cap imports. delayed snap parent opens.
  map<client_t,set<CInode*> > delayed_imported_caps;

  // subsystems
  std::unique_ptr<Migrator> migrator;

  bool did_shutdown_log_cap = false;

  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  ceph_tid_t find_ino_peer_last_tid = 0;

  // delayed cache expire
  map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg

  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  std::set<CInode *> export_pin_queue;
  std::set<CInode *> export_pin_delayed_queue;
  std::set<CInode *> export_ephemeral_pins;

  OpenFileTable open_file_table;

  double export_ephemeral_random_max = 0.0;

 protected:
  // track leader requests whose peers haven't acknowledged commit
  struct uleader {
    uleader() {}
    set<mds_rank_t> peers;
    LogSegment *ls = nullptr;
    MDSContext::vec waiters;
    bool safe = false;
    bool committing = false;
    bool recovering = false;
  };

  struct upeer {
    upeer() {}
    mds_rank_t leader;
    LogSegment *ls = nullptr;
    MDPeerUpdate *su = nullptr;
    MDSContext::vec waiters;
  };

  struct open_ino_info_t {
    open_ino_info_t() {}
    vector<inode_backpointer_t> ancestors;
    set<mds_rank_t> checked;
    mds_rank_t checking = MDS_RANK_NONE;
    mds_rank_t auth_hint = MDS_RANK_NONE;
    bool check_peers = true;
    bool fetch_backtrace = true;
    bool discover = false;
    bool want_replica = false;
    bool want_xlocked = false;
    version_t tid = 0;
    int64_t pool = -1;
    int last_err = 0;
    MDSContext::vec waiters;
  };

  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;
  friend struct C_MDC_RetryScanStray;

  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend class C_MDC_Join;
  friend class C_MDC_RespondInternalRequest;

  friend class EPeerUpdate;
  friend class ECommitted;

  void set_readonly() { readonly = true; }

  void handle_resolve(const cref_t<MMDSResolve> &m);
  void handle_resolve_ack(const cref_t<MMDSResolveAck> &m);
  void process_delayed_resolve();
  void discard_delayed_resolve(mds_rank_t who);
  void maybe_resolve_finish();
  void disambiguate_my_imports();
  void disambiguate_other_imports();
  void trim_unlinked_inodes();

  void send_peer_resolves();
  void send_subtree_resolves();
  void maybe_finish_peer_resolve();

  void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin);
  void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m);
  void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m);
  CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
  CDir* rejoin_invent_dirfrag(dirfrag_t df);
  void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m);
  void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
                                      set<vinodeno_t>& acked_inodes,
                                      set<SimpleLock *>& gather_locks);
  void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m);
  void rejoin_send_acks();
  void rejoin_trim_undef_inodes();
  void maybe_send_pending_rejoins() {
    if (rejoins_pending)
      rejoin_send_rejoins();
  }

  void touch_inode(CInode *in) {
    if (in->get_parent_dn())
      touch_dentry(in->get_projected_parent_dn());
  }

  void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
                            set<SimpleLock *>& gather_locks);
  void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);

  void rename_file(CDentry *srcdn, CDentry *destdn);

  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent);
  int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
                            const vector<inode_backpointer_t>& ancestors,
                            bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0);
  void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m);

  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  // -- replicas --
  void handle_discover(const cref_t<MDiscover> &dis);
  void handle_discover_reply(const cref_t<MDiscoverReply> &m);
  void handle_dentry_link(const cref_t<MDentryLink> &m);
  void handle_dentry_unlink(const cref_t<MDentryUnlink> &m);

  int dump_cache(std::string_view fn, Formatter *f);

  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  void rdlock_dirfrags_stats_work(MDRequestRef& mdr);

  ceph::unordered_map<inodeno_t,CInode*> inode_map;  // map of head inodes by ino
  map<vinodeno_t, CInode*> snap_inode_map;  // map of snap inodes by ino
  CInode *root = nullptr;  // root inode
  CInode *myin = nullptr;  // .ceph/mds%d dir

  bool readonly = false;

  int stray_index = 0;
  int stray_fragmenting_index = -1;

  set<CInode*> base_inodes;

  std::unique_ptr<PerfCounters> logger;

  Filer filer;
  std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};

  /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
  map<CDir*,set<CDir*> > subtrees;
  map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir

  // -- requests --
  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;

  // -- recovery --
  set<mds_rank_t> recovery_set;

  // [resolve]
  // from EImportStart w/o EImportFinish during journal replay
  map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
  // from MMDSResolves
  map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;

  map<CInode*, int> uncommitted_peer_rename_olddir;  // peer: preserve the non-auth dir until seeing commit.
  map<CInode*, int> uncommitted_peer_unlink;  // peer: preserve the unlinked inode until seeing commit.

  map<metareqid_t, uleader> uncommitted_leaders;  // leader: req -> peer set
  map<metareqid_t, upeer> uncommitted_peers;  // peer: preserve the peer req until seeing commit.

  set<metareqid_t> pending_leaders;
  map<int, set<metareqid_t> > ambiguous_peer_updates;

  bool resolves_pending = false;
  set<mds_rank_t> resolve_gather;      // nodes i need resolves from
  set<mds_rank_t> resolve_ack_gather;  // nodes i need a resolve_ack from
  set<version_t> resolve_snapclient_commits;
  map<metareqid_t, mds_rank_t> resolve_need_rollback;  // rollbacks i'm writing to the journal
  map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve;

  // [rejoin]
  bool rejoins_pending = false;
  set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
  set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
  set<mds_rank_t> rejoin_ack_sent;    // nodes i sent a rejoin to
  set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
  map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
  map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_peer_exports;

  map<client_t,entity_inst_t> rejoin_client_map;
  map<client_t,client_metadata_t> rejoin_client_metadata_map;
  map<client_t,pair<Session*,uint64_t> > rejoin_session_map;

  map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports;  // ino -> target, client -> capex

  map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
  set<inodeno_t> cap_imports_missing;
  map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
  int cap_imports_num_opening = 0;

  set<CInode*> rejoin_undef_inodes;
  set<CInode*> rejoin_potential_updated_scatterlocks;
  set<CDir*> rejoin_undef_dirfrags;
  map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;

  vector<CInode*> rejoin_recover_q, rejoin_check_q;
  list<SimpleLock*> rejoin_eval_locks;
  MDSContext::vec rejoin_waiters;

  std::unique_ptr<MDSContext> rejoin_done;
  std::unique_ptr<MDSContext> resolve_done;

  ceph_tid_t open_ino_last_tid = 0;
  map<inodeno_t,open_ino_info_t> opening_inodes;

  StrayManager stray_manager;

 private:
  // -- fragmenting --
  struct ufragment {
    ufragment() {}
    int bits = 0;
    bool committed = false;
    LogSegment *ls = nullptr;
    MDSContext::vec waiters;
    frag_vec_t old_frags;
    bufferlist rollback;
  };

  struct fragment_info_t {
    fragment_info_t() {}
    bool is_fragmenting() { return !resultfrags.empty(); }
    uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; }
    int bits;
    std::vector<CDir*> dirs;
    std::vector<CDir*> resultfrags;
    MDRequestRef mdr;
    set<mds_rank_t> notify_ack_waiting;
    bool finishing = false;

    // for deadlock detection
    bool all_frozen = false;
    utime_t last_cum_auth_pins_change;
    int last_cum_auth_pins = 0;
    int num_remote_waiters = 0;  // number of remote authpin waiters
  };

  typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;

  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_MDC_FragmentRollback;
  friend class C_IO_MDC_FragmentPurgeOld;

  // -- subtrees --
  static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
  static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;
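
  // The current stray directory; advance_stray() rotates stray_index
  // round-robin through strays[].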
  CInode *get_stray() {
    return strays[stray_index];
  }

  void identify_files_to_recover();

  std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
  bool trim_dentry(CDentry *dn, expiremap& expiremap);
  void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
  bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
  void send_expire_messages(expiremap& expiremap);
  void trim_non_auth();  // trim out trimmable non-auth items

  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                            std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay);
  void adjust_dir_fragments(CInode *diri,
                            const std::vector<CDir*>& srcfrags,
                            frag_t basefrag, int bits,
                            std::vector<CDir*>* resultfrags,
                            MDSContext::vec& waiters,
                            bool replay);
  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);

  bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs);
  void fragment_freeze_dirs(const std::vector<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs);
  void fragment_drop_locks(fragment_info_t &info);
  void fragment_maybe_finish(const fragment_info_iterator& it);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
  void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);

  void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m);
  void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m);

  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
                                LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);

  void upkeep_main(void);

  uint64_t cache_memory_limit;
  double cache_reservation;
  double cache_health_threshold;
  std::array<CInode *, NUM_STRAY> strays{};  // my stray dir

  bool export_ephemeral_distributed_config;
  bool export_ephemeral_random_config;
  unsigned export_ephemeral_dist_frag_bits;

  // File size recovery
  RecoveryQueue recovery_queue;

  // shutdown
  set<inodeno_t> shutdown_exporting_strays;
  pair<dirfrag_t, string> shutdown_export_next;

  bool opening_root = false, open = false;
  MDSContext::vec waiting_for_open;

  // -- snaprealms --
  SnapRealm *global_snaprealm = nullptr;

  map<dirfrag_t, ufragment> uncommitted_fragments;

  map<dirfrag_t,fragment_info_t> fragments;

  DecayCounter trim_counter;

  std::thread upkeeper;
  ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
  ceph::condition_variable upkeep_cvar;
  time upkeep_last_trim = time::min();
  time upkeep_last_release = time::min();
  std::atomic<bool> upkeep_trim_shutdown{false};
};

class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;
  MDRequestRef mdr;
 public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) :
    MDSInternalContext(c->mds), cache(c), mdr(r) {}
  void finish(int r) override;
};
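
// Typical (illustrative) use, assuming an MDCache* mdcache, an MDRequestRef
// mdr and a CDir* dir are in scope: re-dispatch the whole request once the
// directory finishes fetching:
//
//   dir->add_waiter(CDir::WAIT_COMPLETE, new C_MDS_RetryRequest(mdcache, mdr));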

class CF_MDS_RetryRequestFactory : public MDSContextFactory {
 public:
  CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
    mdcache(cache), mdr(mdr), drop_locks(dl) {}
  MDSContext *build() override;
 private:
  MDCache *mdcache;
  MDRequestRef mdr;
  bool drop_locks;
};

#endif