]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDCache.h
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / mds / MDCache.h
CommitLineData
11fdf7f2 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
7c673cae
FG
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17#ifndef CEPH_MDCACHE_H
18#define CEPH_MDCACHE_H
19
eafe8130 20#include <atomic>
11fdf7f2 21#include <string_view>
eafe8130 22#include <thread>
94b18763 23
a8e16298 24#include "common/DecayCounter.h"
7c673cae
FG
25#include "include/types.h"
26#include "include/filepath.h"
27#include "include/elist.h"
28
11fdf7f2
TL
29#include "messages/MCacheExpire.h"
30#include "messages/MClientQuota.h"
31#include "messages/MClientRequest.h"
32#include "messages/MClientSnap.h"
33#include "messages/MDentryLink.h"
34#include "messages/MDentryUnlink.h"
35#include "messages/MDirUpdate.h"
36#include "messages/MDiscover.h"
37#include "messages/MDiscoverReply.h"
38#include "messages/MGatherCaps.h"
39#include "messages/MGenericMessage.h"
40#include "messages/MInodeFileCaps.h"
41#include "messages/MLock.h"
42#include "messages/MMDSCacheRejoin.h"
43#include "messages/MMDSFindIno.h"
44#include "messages/MMDSFindInoReply.h"
45#include "messages/MMDSFragmentNotify.h"
46#include "messages/MMDSFragmentNotifyAck.h"
47#include "messages/MMDSOpenIno.h"
48#include "messages/MMDSOpenInoReply.h"
49#include "messages/MMDSResolve.h"
50#include "messages/MMDSResolveAck.h"
51#include "messages/MMDSSlaveRequest.h"
52#include "messages/MMDSSnapUpdate.h"
53
54
7c673cae
FG
55#include "osdc/Filer.h"
56#include "CInode.h"
57#include "CDentry.h"
58#include "CDir.h"
59#include "include/Context.h"
60#include "events/EMetaBlob.h"
61#include "RecoveryQueue.h"
62#include "StrayManager.h"
11fdf7f2 63#include "OpenFileTable.h"
7c673cae
FG
64#include "MDSContext.h"
65#include "MDSMap.h"
66#include "Mutation.h"
67
7c673cae
FG
68
69class PerfCounters;
70
71class MDSRank;
72class Session;
73class Migrator;
74
7c673cae
FG
75class Session;
76
7c673cae
FG
77class ESubtreeMap;
78
79enum {
80 l_mdc_first = 3000,
81 // How many inodes currently in stray dentries
82 l_mdc_num_strays,
83 // How many stray dentries are currently delayed for purge due to refs
84 l_mdc_num_strays_delayed,
85 // How many stray dentries are currently being enqueued for purge
86 l_mdc_num_strays_enqueuing,
87
88 // How many dentries have ever been added to stray dir
89 l_mdc_strays_created,
90 // How many dentries have been passed on to PurgeQueue
91 l_mdc_strays_enqueued,
92 // How many strays have been reintegrated?
93 l_mdc_strays_reintegrated,
94 // How many strays have been migrated?
95 l_mdc_strays_migrated,
96
97 // How many inode sizes currently being recovered
98 l_mdc_num_recovering_processing,
99 // How many inodes currently waiting to have size recovered
100 l_mdc_num_recovering_enqueued,
101 // How many inodes waiting with elevated priority for recovery
102 l_mdc_num_recovering_prioritized,
103 // How many inodes ever started size recovery
104 l_mdc_recovery_started,
105 // How many inodes ever completed size recovery
106 l_mdc_recovery_completed,
107
d2e6a577
FG
108 l_mdss_ireq_enqueue_scrub,
109 l_mdss_ireq_exportdir,
110 l_mdss_ireq_flush,
111 l_mdss_ireq_fragmentdir,
112 l_mdss_ireq_fragstats,
113 l_mdss_ireq_inodestats,
114
7c673cae
FG
115 l_mdc_last,
116};
117
118
119// flags for predirty_journal_parents()
120static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
121static const int PREDIRTY_DIR = 2; // update parent dir mtime/size
122static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
123
124class MDCache {
125 public:
91327a77
AA
126 using clock = ceph::coarse_mono_clock;
127 using time = ceph::coarse_mono_time;
128
11fdf7f2
TL
129 typedef std::map<mds_rank_t, MCacheExpire::ref> expiremap;
130
7c673cae
FG
131 // my master
132 MDSRank *mds;
133
134 // -- my cache --
135 LRU lru; // dentry lru for expiring items from cache
31f18b77 136 LRU bottom_lru; // dentries that should be trimmed ASAP
7c673cae 137 protected:
b32b8144
FG
138 ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino
139 map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by ino
eafe8130
TL
140 CInode *root = nullptr; // root inode
141 CInode *myin = nullptr; // .ceph/mds%d dir
7c673cae 142
eafe8130 143 bool readonly = false;
7c673cae
FG
144 void set_readonly() { readonly = true; }
145
eafe8130
TL
146 std::array<CInode *, NUM_STRAY> strays{}; // my stray dir
147 int stray_index = 0;
7c673cae
FG
148
149 CInode *get_stray() {
150 return strays[stray_index];
151 }
152
153 set<CInode*> base_inodes;
154
155 std::unique_ptr<PerfCounters> logger;
156
157 Filer filer;
158
eafe8130 159 bool exceeded_size_limit = false;
7c673cae 160
91327a77
AA
161private:
162 uint64_t cache_inode_limit;
163 uint64_t cache_memory_limit;
164 double cache_reservation;
165 double cache_health_threshold;
166
7c673cae 167public:
91327a77
AA
168 uint64_t cache_limit_inodes(void) {
169 return cache_inode_limit;
181888fb 170 }
91327a77
AA
171 uint64_t cache_limit_memory(void) {
172 return cache_memory_limit;
181888fb
FG
173 }
174 double cache_toofull_ratio(void) const {
91327a77
AA
175 double inode_reserve = cache_inode_limit*(1.0-cache_reservation);
176 double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
177 return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, cache_inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
181888fb
FG
178 }
179 bool cache_toofull(void) const {
180 return cache_toofull_ratio() > 0.0;
181 }
182 uint64_t cache_size(void) const {
183 return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
184 }
185 bool cache_overfull(void) const {
91327a77 186 return (cache_inode_limit > 0 && CInode::count() > cache_inode_limit*cache_health_threshold) || (cache_size() > cache_memory_limit*cache_health_threshold);
181888fb
FG
187 }
188
7c673cae
FG
189 void advance_stray() {
190 stray_index = (stray_index+1)%NUM_STRAY;
191 }
192
7c673cae
FG
193 /**
194 * Call this when you know that a CDentry is ready to be passed
195 * on to StrayManager (i.e. this is a stray you've just created)
196 */
197 void notify_stray(CDentry *dn) {
11fdf7f2 198 ceph_assert(dn->get_dir()->get_inode()->is_stray());
a8e16298
TL
199 if (dn->state_test(CDentry::STATE_PURGING))
200 return;
201
7c673cae
FG
202 stray_manager.eval_stray(dn);
203 }
204
205 void maybe_eval_stray(CInode *in, bool delay=false);
31f18b77
FG
206 void clear_dirty_bits_for_stray(CInode* diri);
207
7c673cae
FG
208 bool is_readonly() { return readonly; }
209 void force_readonly();
210
211 DecayRate decayrate;
212
eafe8130 213 int num_shadow_inodes = 0;
b32b8144 214
eafe8130 215 int num_inodes_with_caps = 0;
7c673cae
FG
216
217 unsigned max_dir_commit_size;
218
219 static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
220 static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);
221
222 file_layout_t default_file_layout;
223 file_layout_t default_log_layout;
224
225 void register_perfcounters();
226
227 // -- client leases --
228public:
eafe8130
TL
229 static constexpr std::size_t client_lease_pools = 3;
230 std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};
231
7c673cae 232protected:
eafe8130 233 std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};
7c673cae
FG
234public:
235 void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
236 client_leases[pool].push_back(&r->item_lease);
237 r->ttl = ttl;
238 }
239
240 void notify_stray_removed()
241 {
242 stray_manager.notify_stray_removed();
243 }
244
245 void notify_stray_created()
246 {
247 stray_manager.notify_stray_created();
248 }
249
31f18b77
FG
250 void eval_remote(CDentry *dn)
251 {
252 stray_manager.eval_remote(dn);
253 }
254
7c673cae 255 // -- client caps --
eafe8130 256 uint64_t last_cap_id = 0;
7c673cae
FG
257
258 // -- discover --
259 struct discover_info_t {
260 ceph_tid_t tid;
261 mds_rank_t mds;
262 inodeno_t ino;
263 frag_t frag;
264 snapid_t snap;
265 filepath want_path;
31f18b77 266 CInode *basei;
7c673cae
FG
267 bool want_base_dir;
268 bool want_xlocked;
269
270 discover_info_t() :
31f18b77 271 tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL),
7c673cae
FG
272 want_base_dir(false), want_xlocked(false) {}
273 ~discover_info_t() {
31f18b77
FG
274 if (basei)
275 basei->put(MDSCacheObject::PIN_DISCOVERBASE);
7c673cae 276 }
31f18b77
FG
277 void pin_base(CInode *b) {
278 basei = b;
279 basei->get(MDSCacheObject::PIN_DISCOVERBASE);
7c673cae
FG
280 }
281 };
282
283 map<ceph_tid_t, discover_info_t> discovers;
eafe8130 284 ceph_tid_t discover_last_tid = 0;
7c673cae
FG
285
286 void _send_discover(discover_info_t& dis);
287 discover_info_t& _create_discover(mds_rank_t mds) {
288 ceph_tid_t t = ++discover_last_tid;
289 discover_info_t& d = discovers[t];
290 d.tid = t;
291 d.mds = mds;
292 return d;
293 }
294
295 // waiters
11fdf7f2 296 map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;
7c673cae 297
11fdf7f2
TL
298 void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
299 void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
7c673cae 300 mds_rank_t from=MDS_RANK_NONE);
11fdf7f2 301 void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
7c673cae 302 bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE);
11fdf7f2 303 void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
7c673cae
FG
304 bool want_xlocked=false);
305 void kick_discovers(mds_rank_t who); // after a failure.
306
307
308 // -- subtrees --
81eedcae
TL
309private:
310 static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
311 static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;
7c673cae
FG
312protected:
313 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
314 map<CDir*,set<CDir*> > subtrees;
315 map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir
316
317 // adjust subtree auth specification
318 // dir->dir_auth
319 // imports/exports/nested_exports
320 // join/split subtrees as appropriate
321public:
322 bool is_subtrees() { return !subtrees.empty(); }
11fdf7f2
TL
323 template<typename T>
324 void get_subtrees(T& c) {
325 if constexpr (std::is_same_v<T, std::vector<CDir*>>)
326 c.reserve(c.size() + subtrees.size());
327 for (const auto& p : subtrees) {
328 c.push_back(p.first);
329 }
330 }
28e407b8 331 void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
224ce89b
WB
332 void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
333 adjust_subtree_auth(root, mds_authority_t(a,b));
7c673cae 334 }
11fdf7f2
TL
335 void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
336 void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {
7c673cae
FG
337 adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
338 }
11fdf7f2
TL
339 void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
340 void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {
7c673cae
FG
341 adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
342 }
11fdf7f2 343 void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);
7c673cae 344 void try_subtree_merge(CDir *root);
28e407b8 345 void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
7c673cae
FG
346 void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
347 void eval_subtree_root(CInode *diri);
348 CDir *get_subtree_root(CDir *dir);
349 CDir *get_projected_subtree_root(CDir *dir);
350 bool is_leaf_subtree(CDir *dir) {
11fdf7f2 351 ceph_assert(subtrees.count(dir));
7c673cae
FG
352 return subtrees[dir].empty();
353 }
354 void remove_subtree(CDir *dir);
355 bool is_subtree(CDir *root) {
356 return subtrees.count(root);
357 }
358 void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
359 void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
360 void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
361 void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);
362
363 void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
224ce89b 364 void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);
7c673cae 365
11fdf7f2
TL
366 auto get_auth_subtrees() {
367 std::vector<CDir*> c;
368 for (auto& p : subtrees) {
369 auto& root = p.first;
370 if (root->is_auth()) {
371 c.push_back(root);
372 }
373 }
374 return c;
375 }
7c673cae 376
11fdf7f2
TL
377 auto get_fullauth_subtrees() {
378 std::vector<CDir*> c;
379 for (auto& p : subtrees) {
380 auto& root = p.first;
381 if (root->is_full_dir_auth()) {
382 c.push_back(root);
383 }
384 }
385 return c;
386 }
387 auto num_subtrees_fullauth() const {
388 std::size_t n = 0;
389 for (auto& p : subtrees) {
390 auto& root = p.first;
391 if (root->is_full_dir_auth()) {
392 ++n;
393 }
394 }
395 return n;
396 }
7c673cae 397
11fdf7f2
TL
398 auto num_subtrees_fullnonauth() const {
399 std::size_t n = 0;
400 for (auto& p : subtrees) {
401 auto& root = p.first;
402 if (root->is_full_dir_nonauth()) {
403 ++n;
404 }
405 }
406 return n;
407 }
7c673cae 408
11fdf7f2
TL
409 auto num_subtrees() const {
410 return subtrees.size();
411 }
7c673cae 412
11fdf7f2
TL
413
414protected:
7c673cae
FG
415 // -- requests --
416 ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
417
418public:
419 int get_num_client_requests();
420
11fdf7f2
TL
421 MDRequestRef request_start(const MClientRequest::const_ref& req);
422 MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, const Message::const_ref &m);
7c673cae
FG
423 MDRequestRef request_start_internal(int op);
424 bool have_request(metareqid_t rid) {
425 return active_requests.count(rid);
426 }
427 MDRequestRef request_get(metareqid_t rid);
428 void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
429 void request_finish(MDRequestRef& mdr);
430 void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
431 void dispatch_request(MDRequestRef& mdr);
432 void request_drop_foreign_locks(MDRequestRef& mdr);
433 void request_drop_non_rdlocks(MDRequestRef& r);
434 void request_drop_locks(MDRequestRef& r);
435 void request_cleanup(MDRequestRef& r);
436
437 void request_kill(MDRequestRef& r); // called when session closes
438
439 // journal/snap helpers
440 CInode *pick_inode_snap(CInode *in, snapid_t follows);
441 CInode *cow_inode(CInode *in, snapid_t last);
442 void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
443 snapid_t follows=CEPH_NOSNAP,
444 CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
445 void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
446 CInode **pcow_inode=0);
447 void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
448
449 void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
450 int linkunlink, SnapRealm *prealm);
94b18763 451 void _project_rstat_inode_to_frag(CInode::mempool_inode & inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
452 CDir *parent, int linkunlink, bool update_inode);
453 void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
454 snapid_t ofirst, snapid_t last,
455 CInode *pin, bool cow_head);
a8e16298 456 void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
7c673cae
FG
457 void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
458 CInode *in, CDir *parent,
459 int flags, int linkunlink=0,
460 snapid_t follows=CEPH_NOSNAP);
461
462 // slaves
463 void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
464 uncommitted_masters[reqid].ls = ls;
465 uncommitted_masters[reqid].slaves = slaves;
466 uncommitted_masters[reqid].safe = safe;
467 }
11fdf7f2 468 void wait_for_uncommitted_master(metareqid_t reqid, MDSContext *c) {
7c673cae
FG
469 uncommitted_masters[reqid].waiters.push_back(c);
470 }
471 bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
472 auto p = uncommitted_masters.find(reqid);
473 return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0;
474 }
475 void log_master_commit(metareqid_t reqid);
476 void logged_master_update(metareqid_t reqid);
477 void _logged_master_commit(metareqid_t reqid);
478 void committed_master_slave(metareqid_t r, mds_rank_t from);
479 void finish_committed_masters();
480
481 void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);
482
483 // -- recovery --
484protected:
485 set<mds_rank_t> recovery_set;
486
487public:
488 void set_recovery_set(set<mds_rank_t>& s);
489 void handle_mds_failure(mds_rank_t who);
490 void handle_mds_recovery(mds_rank_t who);
491
492protected:
493 // [resolve]
494 // from EImportStart w/o EImportFinish during journal replay
495 map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
496 // from MMDSResolves
497 map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
498
499 map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates; // slave: for replay.
500 map<CInode*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit.
501 map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit.
502
503 // track master requests whose slaves haven't acknowledged commit
504 struct umaster {
505 set<mds_rank_t> slaves;
506 LogSegment *ls;
11fdf7f2 507 MDSContext::vec waiters;
7c673cae
FG
508 bool safe;
509 bool committing;
510 bool recovering;
511 umaster() : ls(NULL), safe(false), committing(false), recovering(false) {}
512 };
513 map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set
514
515 set<metareqid_t> pending_masters;
516 map<int, set<metareqid_t> > ambiguous_slave_updates;
517
518 friend class ESlaveUpdate;
519 friend class ECommitted;
520
eafe8130 521 bool resolves_pending = false;
7c673cae
FG
522 set<mds_rank_t> resolve_gather; // nodes i need resolves from
523 set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from
11fdf7f2
TL
524 set<version_t> resolve_snapclient_commits;
525 map<metareqid_t, mds_rank_t> resolve_need_rollback; // rollbacks i'm writing to the journal
526 map<mds_rank_t, MMDSResolve::const_ref> delayed_resolve;
7c673cae 527
11fdf7f2
TL
528 void handle_resolve(const MMDSResolve::const_ref &m);
529 void handle_resolve_ack(const MMDSResolveAck::const_ref &m);
7c673cae
FG
530 void process_delayed_resolve();
531 void discard_delayed_resolve(mds_rank_t who);
532 void maybe_resolve_finish();
533 void disambiguate_my_imports();
534 void disambiguate_other_imports();
535 void trim_unlinked_inodes();
536 void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*);
537 void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
538 MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
11fdf7f2
TL
539
540 void send_slave_resolves();
541 void send_subtree_resolves();
542 void maybe_finish_slave_resolve();
543
7c673cae
FG
544public:
545 void recalc_auth_bits(bool replay);
546 void remove_inode_recursive(CInode *in);
547
548 bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
549 auto p = ambiguous_slave_updates.find(master);
550 return p != ambiguous_slave_updates.end() && p->second.count(reqid);
551 }
552 void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
553 ambiguous_slave_updates[master].insert(reqid);
554 }
555 void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
556 auto p = ambiguous_slave_updates.find(master);
557 auto q = p->second.find(reqid);
11fdf7f2 558 ceph_assert(q != p->second.end());
7c673cae
FG
559 p->second.erase(q);
560 if (p->second.empty())
561 ambiguous_slave_updates.erase(p);
562 }
563
564 void add_rollback(metareqid_t reqid, mds_rank_t master) {
11fdf7f2 565 resolve_need_rollback[reqid] = master;
7c673cae
FG
566 }
567 void finish_rollback(metareqid_t reqid);
568
569 // ambiguous imports
570 void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
571 void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
572 bool have_ambiguous_import(dirfrag_t base) {
573 return my_ambiguous_imports.count(base);
574 }
575 void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
11fdf7f2 576 ceph_assert(my_ambiguous_imports.count(base));
7c673cae
FG
577 bounds = my_ambiguous_imports[base];
578 }
579 void cancel_ambiguous_import(CDir *);
580 void finish_ambiguous_import(dirfrag_t dirino);
11fdf7f2 581 void resolve_start(MDSContext *resolve_done_);
7c673cae 582 void send_resolves();
7c673cae
FG
583 void maybe_send_pending_resolves() {
584 if (resolves_pending)
585 send_subtree_resolves();
586 }
587
588 void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
589 map<dirfrag_t,vector<dirfrag_t> >& subtrees);
590 ESubtreeMap *create_subtree_map();
591
592
593 void clean_open_file_lists();
11fdf7f2
TL
594 void dump_openfiles(Formatter *f);
595 bool dump_inode(Formatter *f, uint64_t number);
7c673cae
FG
596protected:
597 // [rejoin]
eafe8130 598 bool rejoins_pending = false;
7c673cae
FG
599 set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin
600 set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to
31f18b77 601 set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin to
7c673cae
FG
602 set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack
603 map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
604 map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
11fdf7f2 605
7c673cae 606 map<client_t,entity_inst_t> rejoin_client_map;
11fdf7f2 607 map<client_t,client_metadata_t> rejoin_client_metadata_map;
28e407b8 608 map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
7c673cae 609
28e407b8 610 map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
7c673cae
FG
611
612 map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex
613 set<inodeno_t> cap_imports_missing;
11fdf7f2 614 map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
eafe8130 615 int cap_imports_num_opening = 0;
7c673cae
FG
616
617 set<CInode*> rejoin_undef_inodes;
618 set<CInode*> rejoin_potential_updated_scatterlocks;
619 set<CDir*> rejoin_undef_dirfrags;
620 map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;
621
622 vector<CInode*> rejoin_recover_q, rejoin_check_q;
623 list<SimpleLock*> rejoin_eval_locks;
11fdf7f2 624 MDSContext::vec rejoin_waiters;
7c673cae 625
11fdf7f2
TL
626 void rejoin_walk(CDir *dir, const MMDSCacheRejoin::ref &rejoin);
627 void handle_cache_rejoin(const MMDSCacheRejoin::const_ref &m);
628 void handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref &m);
7c673cae
FG
629 CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
630 CDir* rejoin_invent_dirfrag(dirfrag_t df);
11fdf7f2
TL
631 void handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &m);
632 void rejoin_scour_survivor_replicas(mds_rank_t from, const MMDSCacheRejoin::const_ref &ack,
7c673cae
FG
633 set<vinodeno_t>& acked_inodes,
634 set<SimpleLock *>& gather_locks);
11fdf7f2 635 void handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref &m);
7c673cae
FG
636 void rejoin_send_acks();
637 void rejoin_trim_undef_inodes();
638 void maybe_send_pending_rejoins() {
639 if (rejoins_pending)
640 rejoin_send_rejoins();
641 }
11fdf7f2
TL
642 std::unique_ptr<MDSContext> rejoin_done;
643 std::unique_ptr<MDSContext> resolve_done;
7c673cae 644public:
11fdf7f2 645 void rejoin_start(MDSContext *rejoin_done_);
7c673cae
FG
646 void rejoin_gather_finish();
647 void rejoin_send_rejoins();
648 void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
11fdf7f2 649 int target=-1, bool drop_path=false) {
28e407b8
AA
650 auto& ex = cap_exports[ino];
651 ex.first = target;
11fdf7f2
TL
652 auto &_icr = ex.second[client] = icr;
653 if (drop_path)
654 _icr.path.clear();
7c673cae
FG
655 }
656 void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
11fdf7f2
TL
657 mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
658 auto &_icr = cap_imports[ino][client][frommds] = icr;
659 if (drop_path)
660 _icr.path.clear();
7c673cae 661 }
28e407b8
AA
662 void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
663 rejoin_client_map.emplace(client, inst);
664 }
11fdf7f2
TL
665 bool rejoin_has_cap_reconnect(inodeno_t ino) const {
666 return cap_imports.count(ino);
667 }
668 void add_replay_ino_alloc(inodeno_t ino) {
669 cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
670 }
7c673cae
FG
671 const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
672 if (cap_imports.count(ino) &&
673 cap_imports[ino].count(client) &&
674 cap_imports[ino][client].count(MDS_RANK_NONE)) {
675 return &cap_imports[ino][client][MDS_RANK_NONE];
676 }
677 return NULL;
678 }
679 void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
11fdf7f2
TL
680 ceph_assert(cap_imports[ino].size() == 1);
681 ceph_assert(cap_imports[ino][client].size() == 1);
7c673cae
FG
682 cap_imports.erase(ino);
683 }
11fdf7f2 684 void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
7c673cae
FG
685 cap_reconnect_waiters[ino].push_back(c);
686 }
687
688 // [reconnect/rejoin caps]
689 struct reconnected_cap_info_t {
690 inodeno_t realm_ino;
691 snapid_t snap_follows;
692 int dirty_caps;
11fdf7f2 693 bool snapflush;
7c673cae 694 reconnected_cap_info_t() :
11fdf7f2 695 realm_ino(0), snap_follows(0), dirty_caps(0), snapflush(false) {}
7c673cae
FG
696 };
697 map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino
698 map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq
699
700 void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
701 reconnected_cap_info_t &info = reconnected_caps[ino][client];
702 info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
703 info.snap_follows = icr.snap_follows;
704 }
11fdf7f2 705 void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
7c673cae
FG
706 reconnected_cap_info_t &info = reconnected_caps[ino][client];
707 info.dirty_caps |= dirty;
11fdf7f2
TL
708 if (snapflush)
709 info.snapflush = snapflush;
7c673cae
FG
710 }
711 void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
712 reconnected_snaprealms[ino][client] = seq;
713 }
714
715 friend class C_MDC_RejoinOpenInoFinish;
716 friend class C_MDC_RejoinSessionsOpened;
717 void rejoin_open_ino_finish(inodeno_t ino, int ret);
11fdf7f2 718 void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
28e407b8 719 void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
7c673cae
FG
720 bool process_imported_caps();
721 void choose_lock_states_and_reconnect_caps();
722 void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
11fdf7f2
TL
723 map<client_t,MClientSnap::ref>& splits);
724 void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,MClientSnap::ref>& splits);
725 void send_snaps(map<client_t,MClientSnap::ref>& splits);
7c673cae 726 Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
11fdf7f2
TL
727 void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
728 map<client_t,MClientSnap::ref>& updates);
a8e16298 729 Capability* try_reconnect_cap(CInode *in, Session *session);
7c673cae
FG
730 void export_remaining_imported_caps();
731
11fdf7f2
TL
732 // realm inodes
733 set<CInode*> rejoin_pending_snaprealms;
7c673cae 734 // cap imports. delayed snap parent opens.
7c673cae
FG
735 map<client_t,set<CInode*> > delayed_imported_caps;
736
737 void do_cap_import(Session *session, CInode *in, Capability *cap,
738 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
739 int peer, int p_flags);
740 void do_delayed_cap_imports();
741 void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
742 snapid_t snap_follows);
11fdf7f2 743 void open_snaprealms();
7c673cae
FG
744
745 bool open_undef_inodes_dirfrags();
746 void opened_undef_inode(CInode *in);
747 void opened_undef_dirfrag(CDir *dir) {
748 rejoin_undef_dirfrags.erase(dir);
749 }
750
751 void reissue_all_caps();
752
753
754 friend class Locker;
755 friend class Migrator;
756 friend class MDBalancer;
757
758 // StrayManager needs to be able to remove_inode() from us
759 // when it is done purging
760 friend class StrayManager;
761
762 // File size recovery
763private:
764 RecoveryQueue recovery_queue;
765 void identify_files_to_recover();
766public:
767 void start_files_to_recover();
768 void do_file_recover();
769 void queue_file_recover(CInode *in);
770 void _queued_file_recover_cow(CInode *in, MutationRef& mut);
771
772 // subsystems
773 std::unique_ptr<Migrator> migrator;
774
775 public:
776 explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
777 ~MDCache();
92f5a8d4 778 void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
7c673cae
FG
779
780 // debug
781 void log_stat();
782
783 // root inode
784 CInode *get_root() { return root; }
785 CInode *get_myin() { return myin; }
786
7c673cae
FG
787 size_t get_cache_size() { return lru.lru_get_size(); }
788
789 // trimming
a8e16298 790 std::pair<bool, uint64_t> trim(uint64_t count=0);
181888fb 791private:
11fdf7f2
TL
792 std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
793 bool trim_dentry(CDentry *dn, expiremap& expiremap);
794 void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
795 bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
796 void send_expire_messages(expiremap& expiremap);
7c673cae 797 void trim_non_auth(); // trim out trimmable non-auth items
181888fb 798public:
7c673cae
FG
799 bool trim_non_auth_subtree(CDir *directory);
800 void standby_trim_segment(LogSegment *ls);
801 void try_trim_non_auth_subtree(CDir *dir);
802 bool can_trim_non_auth_dirfrag(CDir *dir) {
803 return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
804 uncommitted_slave_rename_olddir.count(dir->inode) == 0;
805 }
806
807 /**
808 * For all unreferenced inodes, dirs, dentries below an inode, compose
809 * expiry messages. This is used when giving up all replicas of entities
810 * for an MDS peer in the 'stopping' state, such that the peer can
811 * empty its cache and finish shutting down.
812 *
813 * We have to make sure we're only expiring un-referenced items to
814 * avoid interfering with ongoing stray-movement (we can't distinguish
815 * between the "moving my strays" and "waiting for my cache to empty"
816 * phases within 'stopping')
817 *
818 * @return false if we completed cleanly, true if caller should stop
819 * expiring because we hit something with refs.
820 */
11fdf7f2 821 bool expire_recursive(CInode *in, expiremap& expiremap);
7c673cae
FG
822
823 void trim_client_leases();
824 void check_memory_usage();
825
7c673cae
FG
826 // shutdown
827private:
f64942e4
AA
828 set<inodeno_t> shutdown_exporting_strays;
829 pair<dirfrag_t, string> shutdown_export_next;
7c673cae
FG
830public:
831 void shutdown_start();
832 void shutdown_check();
833 bool shutdown_pass();
7c673cae 834 bool shutdown(); // clear cache (ie at shutodwn)
f64942e4
AA
835 bool shutdown_export_strays();
836 void shutdown_export_stray_finish(inodeno_t ino) {
837 if (shutdown_exporting_strays.erase(ino))
838 shutdown_export_strays();
839 }
7c673cae 840
eafe8130 841 bool did_shutdown_log_cap = false;
7c673cae
FG
842
843 // inode_map
844 bool have_inode(vinodeno_t vino) {
b32b8144
FG
845 if (vino.snapid == CEPH_NOSNAP)
846 return inode_map.count(vino.ino) ? true : false;
847 else
848 return snap_inode_map.count(vino) ? true : false;
7c673cae
FG
849 }
850 bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
851 return have_inode(vinodeno_t(ino, snap));
852 }
853 CInode* get_inode(vinodeno_t vino) {
b32b8144
FG
854 if (vino.snapid == CEPH_NOSNAP) {
855 auto p = inode_map.find(vino.ino);
856 if (p != inode_map.end())
857 return p->second;
858 } else {
859 auto p = snap_inode_map.find(vino);
860 if (p != snap_inode_map.end())
861 return p->second;
862 }
7c673cae
FG
863 return NULL;
864 }
865 CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
866 return get_inode(vinodeno_t(ino, s));
867 }
11fdf7f2
TL
868 CInode* lookup_snap_inode(vinodeno_t vino) {
869 auto p = snap_inode_map.lower_bound(vino);
870 if (p != snap_inode_map.end() &&
871 p->second->ino() == vino.ino && p->second->first <= vino.snapid)
872 return p->second;
873 return NULL;
874 }
7c673cae
FG
875
876 CDir* get_dirfrag(dirfrag_t df) {
877 CInode *in = get_inode(df.ino);
878 if (!in)
879 return NULL;
880 return in->get_dirfrag(df.frag);
881 }
11fdf7f2 882 CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
7c673cae
FG
883 CInode *in = get_inode(ino);
884 if (!in)
885 return NULL;
886 frag_t fg = in->pick_dirfrag(dn);
887 return in->get_dirfrag(fg);
888 }
889 CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
890 CInode *diri = get_inode(df.ino);
891 if (!diri)
892 return NULL;
893 CDir *dir = force_dir_fragment(diri, df.frag, replay);
894 if (!dir)
895 dir = diri->get_dirfrag(df.frag);
896 return dir;
897 }
898
11fdf7f2 899 MDSCacheObject *get_object(const MDSCacheObjectInfo &info);
7c673cae
FG
900
901
902
903 public:
904 void add_inode(CInode *in);
905
906 void remove_inode(CInode *in);
907 protected:
908 void touch_inode(CInode *in) {
909 if (in->get_parent_dn())
910 touch_dentry(in->get_projected_parent_dn());
911 }
912public:
913 void touch_dentry(CDentry *dn) {
31f18b77
FG
914 if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
915 bottom_lru.lru_midtouch(dn);
916 } else {
917 if (dn->is_auth())
918 lru.lru_touch(dn);
919 else
920 lru.lru_midtouch(dn);
921 }
7c673cae
FG
922 }
923 void touch_dentry_bottom(CDentry *dn) {
31f18b77
FG
924 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
925 return;
7c673cae 926 lru.lru_bottouch(dn);
7c673cae
FG
927 }
928protected:
929
930 void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
931 set<SimpleLock *>& gather_locks);
932 void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);
933
934 void rename_file(CDentry *srcdn, CDentry *destdn);
935
936 public:
937 // truncate
938 void truncate_inode(CInode *in, LogSegment *ls);
939 void _truncate_inode(CInode *in, LogSegment *ls);
940 void truncate_inode_finish(CInode *in, LogSegment *ls);
941 void truncate_inode_logged(CInode *in, MutationRef& mut);
942
943 void add_recovered_truncate(CInode *in, LogSegment *ls);
944 void remove_recovered_truncate(CInode *in, LogSegment *ls);
945 void start_recovered_truncates();
946
947
948 public:
949 CDir *get_auth_container(CDir *in);
950 CDir *get_export_container(CDir *dir);
951 void find_nested_exports(CDir *dir, set<CDir*>& s);
952 void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
953
954
955private:
eafe8130 956 bool opening_root = false, open = false;
11fdf7f2 957 MDSContext::vec waiting_for_open;
7c673cae
FG
958
959public:
960 void init_layouts();
961 void create_unlinked_system_inode(CInode *in, inodeno_t ino,
962 int mode) const;
963 CInode *create_system_inode(inodeno_t ino, int mode);
964 CInode *create_root_inode();
965
966 void create_empty_hierarchy(MDSGather *gather);
967 void create_mydir_hierarchy(MDSGather *gather);
968
969 bool is_open() { return open; }
11fdf7f2 970 void wait_for_open(MDSContext *c) {
7c673cae
FG
971 waiting_for_open.push_back(c);
972 }
973
11fdf7f2 974 void open_root_inode(MDSContext *c);
7c673cae 975 void open_root();
11fdf7f2
TL
976 void open_mydir_inode(MDSContext *c);
977 void open_mydir_frag(MDSContext *c);
7c673cae
FG
978 void populate_mydir();
979
11fdf7f2 980 void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
7c673cae 981 void _create_system_file_finish(MutationRef& mut, CDentry *dn,
11fdf7f2 982 version_t dpv, MDSContext *fin);
7c673cae 983
11fdf7f2 984 void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
7c673cae
FG
985 CDir *get_stray_dir(CInode *in);
986 CDentry *get_or_create_stray_dentry(CInode *in);
987
7c673cae
FG
988 /**
989 * Find the given dentry (and whether it exists or not), its ancestors,
990 * and get them all into memory and usable on this MDS. This function
991 * makes a best-effort attempt to load everything; if it needs to
992 * go away and do something then it will put the request on a waitlist.
993 * It prefers the mdr, then the req, then the fin. (At least one of these
994 * must be non-null.)
995 *
996 * At least one of the params mdr, req, and fin must be non-null.
997 *
998 * @param mdr The MDRequest associated with the path. Can be null.
11fdf7f2 999 * @param cf A MDSContextFactory for waiter building.
7c673cae
FG
1000 * @param path The path to traverse to.
1001 * @param pdnvec Data return parameter -- on success, contains a
1002 * vector of dentries. On failure, is either empty or contains the
1003 * full trace of traversable dentries.
1004 * @param pin Data return parameter -- if successful, points to the inode
1005 * associated with filepath. If unsuccessful, is null.
1006 * @param onfail Specifies different lookup failure behaviors. If set to
1007 * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
1008 * dentries (instead of returning -ENOENT). If set to
1009 * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
1010 * MDS if that becomes appropriate (ie, if it doesn't know the contents
1011 * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
1012 * will attempt to look up the path from a different MDS (and bring them
1013 * into its cache as replicas).
1014 *
1015 * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
1016 * If it returns 1, the requester associated with this call has been placed
1017 * on the appropriate waitlist, and it should unwind itself and back out.
1018 * If it returns 2 the request has been forwarded, and again the requester
1019 * should unwind itself and back out.
1020 */
11fdf7f2 1021 int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, const filepath& path,
7c673cae
FG
1022 vector<CDentry*> *pdnvec, CInode **pin, int onfail);
1023
1024 CInode *cache_traverse(const filepath& path);
1025
11fdf7f2 1026 void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
7c673cae
FG
1027 CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);
1028
1029 bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
1030 bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
1031 set<CDir*>& fetch_queue, set<inodeno_t>& missing,
1032 C_GatherBuilder &gather_bld);
1033
11fdf7f2 1034 void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
7c673cae 1035 bool want_xlocked=false);
11fdf7f2 1036 void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
7c673cae
FG
1037 bool want_xlocked, int r);
1038
1039 void make_trace(vector<CDentry*>& trace, CInode *in);
1040
1041protected:
1042 struct open_ino_info_t {
1043 vector<inode_backpointer_t> ancestors;
1044 set<mds_rank_t> checked;
1045 mds_rank_t checking;
1046 mds_rank_t auth_hint;
1047 bool check_peers;
1048 bool fetch_backtrace;
1049 bool discover;
1050 bool want_replica;
1051 bool want_xlocked;
1052 version_t tid;
1053 int64_t pool;
1054 int last_err;
11fdf7f2 1055 MDSContext::vec waiters;
7c673cae
FG
1056 open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
1057 check_peers(true), fetch_backtrace(true), discover(false),
1058 want_replica(false), want_xlocked(false), tid(0), pool(-1),
1059 last_err(0) {}
1060 };
eafe8130 1061 ceph_tid_t open_ino_last_tid = 0;
7c673cae
FG
1062 map<inodeno_t,open_ino_info_t> opening_inodes;
1063
1064 void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
1065 void _open_ino_parent_opened(inodeno_t ino, int ret);
1066 void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
11fdf7f2
TL
1067 void _open_ino_fetch_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m, CDir *dir, bool parent);
1068 int open_ino_traverse_dir(inodeno_t ino, const MMDSOpenIno::const_ref &m,
1069 const vector<inode_backpointer_t>& ancestors,
7c673cae
FG
1070 bool discover, bool want_xlocked, mds_rank_t *hint);
1071 void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
1072 void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
1073 void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
11fdf7f2
TL
1074 void handle_open_ino(const MMDSOpenIno::const_ref &m, int err=0);
1075 void handle_open_ino_reply(const MMDSOpenInoReply::const_ref &m);
7c673cae
FG
1076 friend class C_IO_MDC_OpenInoBacktraceFetched;
1077 friend struct C_MDC_OpenInoTraverseDir;
1078 friend struct C_MDC_OpenInoParentOpened;
1079
1080public:
1081 void kick_open_ino_peers(mds_rank_t who);
11fdf7f2 1082 void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
7c673cae
FG
1083 bool want_replica=true, bool want_xlocked=false);
1084
1085 // -- find_ino_peer --
1086 struct find_ino_peer_info_t {
1087 inodeno_t ino;
1088 ceph_tid_t tid;
11fdf7f2 1089 MDSContext *fin;
7c673cae
FG
1090 mds_rank_t hint;
1091 mds_rank_t checking;
1092 set<mds_rank_t> checked;
1093
1094 find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
1095 };
1096
1097 map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
eafe8130 1098 ceph_tid_t find_ino_peer_last_tid = 0;
7c673cae 1099
11fdf7f2 1100 void find_ino_peers(inodeno_t ino, MDSContext *c, mds_rank_t hint=MDS_RANK_NONE);
7c673cae 1101 void _do_find_ino_peer(find_ino_peer_info_t& fip);
11fdf7f2
TL
1102 void handle_find_ino(const MMDSFindIno::const_ref &m);
1103 void handle_find_ino_reply(const MMDSFindInoReply::const_ref &m);
7c673cae
FG
1104 void kick_find_ino_peers(mds_rank_t who);
1105
1106 // -- snaprealms --
11fdf7f2 1107private:
eafe8130 1108 SnapRealm *global_snaprealm = nullptr;
7c673cae 1109public:
11fdf7f2
TL
1110 SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
1111 void create_global_snaprealm();
1112 void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
1113 void send_snap_update(CInode *in, version_t stid, int snap_op);
1114 void handle_snap_update(const MMDSSnapUpdate::const_ref &m);
1115 void notify_global_snaprealm_update(int snap_op);
7c673cae
FG
1116
1117 // -- stray --
1118public:
7c673cae
FG
1119 void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
1120 uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }
1121
1122protected:
1123 void scan_stray_dir(dirfrag_t next=dirfrag_t());
1124 StrayManager stray_manager;
1125 friend struct C_MDC_RetryScanStray;
7c673cae
FG
1126
1127 // == messages ==
1128 public:
11fdf7f2 1129 void dispatch(const Message::const_ref &m);
7c673cae
FG
1130
1131 protected:
1132 // -- replicas --
11fdf7f2
TL
1133 void handle_discover(const MDiscover::const_ref &dis);
1134 void handle_discover_reply(const MDiscoverReply::const_ref &m);
7c673cae
FG
1135 friend class C_MDC_Join;
1136
1137public:
b32b8144
FG
1138 void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
1139 void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
7c673cae 1140 void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
b32b8144 1141 uint64_t features);
7c673cae 1142
11fdf7f2
TL
1143 CDir* add_replica_dir(bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
1144 CDentry *add_replica_dentry(bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
1145 CInode *add_replica_inode(bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);
7c673cae
FG
1146
1147 void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
11fdf7f2 1148 CDentry *add_replica_stray(const bufferlist &bl, mds_rank_t from);
7c673cae
FG
1149
1150 // -- namespace --
1151public:
1152 void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
1153 void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
1154protected:
11fdf7f2
TL
1155 void handle_dentry_link(const MDentryLink::const_ref &m);
1156 void handle_dentry_unlink(const MDentryUnlink::const_ref &m);
7c673cae
FG
1157
1158
1159 // -- fragmenting --
1160private:
1161 struct ufragment {
1162 int bits;
1163 bool committed;
1164 LogSegment *ls;
11fdf7f2
TL
1165 MDSContext::vec waiters;
1166 frag_vec_t old_frags;
7c673cae
FG
1167 bufferlist rollback;
1168 ufragment() : bits(0), committed(false), ls(NULL) {}
1169 };
1170 map<dirfrag_t, ufragment> uncommitted_fragments;
1171
1172 struct fragment_info_t {
1173 int bits;
1174 list<CDir*> dirs;
1175 list<CDir*> resultfrags;
1176 MDRequestRef mdr;
a8e16298
TL
1177 set<mds_rank_t> notify_ack_waiting;
1178 bool finishing = false;
1179
7c673cae 1180 // for deadlock detection
a8e16298 1181 bool all_frozen = false;
7c673cae 1182 utime_t last_cum_auth_pins_change;
a8e16298
TL
1183 int last_cum_auth_pins = 0;
1184 int num_remote_waiters = 0; // number of remote authpin waiters
1185 fragment_info_t() {}
7c673cae 1186 bool is_fragmenting() { return !resultfrags.empty(); }
a8e16298 1187 uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; }
7c673cae
FG
1188 };
1189 map<dirfrag_t,fragment_info_t> fragments;
a8e16298 1190 typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;
7c673cae
FG
1191
1192 void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11fdf7f2 1193 list<CDir*>& frags, MDSContext::vec& waiters, bool replay);
7c673cae
FG
1194 void adjust_dir_fragments(CInode *diri,
1195 list<CDir*>& srcfrags,
1196 frag_t basefrag, int bits,
1197 list<CDir*>& resultfrags,
11fdf7f2 1198 MDSContext::vec& waiters,
7c673cae
FG
1199 bool replay);
1200 CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
11fdf7f2 1201 void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);
7c673cae
FG
1202
1203 bool can_fragment(CInode *diri, list<CDir*>& dirs);
1204 void fragment_freeze_dirs(list<CDir*>& dirs);
1205 void fragment_mark_and_complete(MDRequestRef& mdr);
1206 void fragment_frozen(MDRequestRef& mdr, int r);
1207 void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
a8e16298
TL
1208 void fragment_drop_locks(fragment_info_t &info);
1209 void fragment_maybe_finish(const fragment_info_iterator& it);
7c673cae
FG
1210 void dispatch_fragment_dir(MDRequestRef& mdr);
1211 void _fragment_logged(MDRequestRef& mdr);
1212 void _fragment_stored(MDRequestRef& mdr);
a8e16298
TL
1213 void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
1214 void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);
7c673cae
FG
1215
1216 friend class EFragment;
1217 friend class C_MDC_FragmentFrozen;
1218 friend class C_MDC_FragmentMarking;
1219 friend class C_MDC_FragmentPrep;
1220 friend class C_MDC_FragmentStore;
1221 friend class C_MDC_FragmentCommit;
a8e16298 1222 friend class C_IO_MDC_FragmentPurgeOld;
7c673cae 1223
11fdf7f2
TL
1224 void handle_fragment_notify(const MMDSFragmentNotify::const_ref &m);
1225 void handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref &m);
7c673cae 1226
11fdf7f2 1227 void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
7c673cae
FG
1228 LogSegment *ls, bufferlist *rollback=NULL);
1229 void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
11fdf7f2 1230 void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
a8e16298
TL
1231
1232
1233 DecayCounter trim_counter;
1234
7c673cae 1235public:
11fdf7f2
TL
1236 void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
1237 ceph_assert(uncommitted_fragments.count(dirfrag));
7c673cae
FG
1238 uncommitted_fragments[dirfrag].waiters.push_back(c);
1239 }
1240 void split_dir(CDir *dir, int byn);
1241 void merge_dir(CInode *diri, frag_t fg);
1242 void rollback_uncommitted_fragments();
1243
1244 void find_stale_fragment_freeze();
1245 void fragment_freeze_inc_num_waiters(CDir *dir);
1246 bool fragment_are_all_frozen(CDir *dir);
1247 int get_num_fragmenting_dirs() { return fragments.size(); }
1248
1249 // -- updates --
1250 //int send_inode_updates(CInode *in);
1251 //void handle_inode_update(MInodeUpdate *m);
1252
1253 int send_dir_updates(CDir *in, bool bcast=false);
11fdf7f2 1254 void handle_dir_update(const MDirUpdate::const_ref &m);
7c673cae
FG
1255
1256 // -- cache expiration --
11fdf7f2
TL
1257 void handle_cache_expire(const MCacheExpire::const_ref &m);
1258 // delayed cache expire
1259 map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg
7c673cae
FG
1260 void process_delayed_expire(CDir *dir);
1261 void discard_delayed_expire(CDir *dir);
1262
eafe8130
TL
1263 // -- mdsmap --
1264 void handle_mdsmap(const MDSMap &mdsmap);
1265
7c673cae 1266protected:
11fdf7f2 1267 int dump_cache(std::string_view fn, Formatter *f);
7c673cae 1268public:
31f18b77 1269 int dump_cache() { return dump_cache(NULL, NULL); }
11fdf7f2 1270 int dump_cache(std::string_view filename);
31f18b77 1271 int dump_cache(Formatter *f);
11fdf7f2 1272 void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);
7c673cae 1273
f64942e4 1274 void cache_status(Formatter *f);
181888fb 1275
7c673cae
FG
1276 void dump_resolve_status(Formatter *f) const;
1277 void dump_rejoin_status(Formatter *f) const;
1278
1279 // == crap fns ==
1280 public:
1281 void show_cache();
81eedcae 1282 void show_subtrees(int dbl=10, bool force_print=false);
7c673cae
FG
1283
1284 CInode *hack_pick_random_inode() {
11fdf7f2 1285 ceph_assert(!inode_map.empty());
7c673cae 1286 int n = rand() % inode_map.size();
b32b8144 1287 auto p = inode_map.begin();
7c673cae
FG
1288 while (n--) ++p;
1289 return p->second;
1290 }
1291
1292protected:
1293 void flush_dentry_work(MDRequestRef& mdr);
1294 /**
1295 * Resolve path to a dentry and pass it onto the ScrubStack.
1296 *
1297 * TODO: return enough information to the original mdr formatter
1298 * and completion that they can subsequeuntly check the progress of
1299 * this scrub (we won't block them on a whole scrub as it can take a very
1300 * long time)
1301 */
1302 void enqueue_scrub_work(MDRequestRef& mdr);
11fdf7f2 1303 void recursive_scrub_finish(const ScrubHeaderRef& header);
7c673cae
FG
1304 void repair_inode_stats_work(MDRequestRef& mdr);
1305 void repair_dirfrag_stats_work(MDRequestRef& mdr);
11fdf7f2
TL
1306 void upgrade_inode_snaprealm_work(MDRequestRef& mdr);
1307 friend class C_MDC_RespondInternalRequest;
7c673cae 1308public:
11fdf7f2 1309 void flush_dentry(std::string_view path, Context *fin);
7c673cae
FG
1310 /**
1311 * Create and start an OP_ENQUEUE_SCRUB
1312 */
11fdf7f2 1313 void enqueue_scrub(std::string_view path, std::string_view tag,
7c673cae
FG
1314 bool force, bool recursive, bool repair,
1315 Formatter *f, Context *fin);
1316 void repair_inode_stats(CInode *diri);
1317 void repair_dirfrag_stats(CDir *dir);
11fdf7f2 1318 void upgrade_inode_snaprealm(CInode *in);
7c673cae
FG
1319
1320public:
1321 /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
1322 std::set<CInode *> export_pin_queue;
eafe8130 1323 std::set<CInode *> export_pin_delayed_queue;
11fdf7f2
TL
1324
1325 OpenFileTable open_file_table;
eafe8130
TL
1326
1327private:
1328 std::thread upkeeper;
1329 ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
1330 ceph::condition_variable upkeep_cvar;
1331 time upkeep_last_trim = time::min();
92f5a8d4 1332 time upkeep_last_release = time::min();
eafe8130 1333 std::atomic<bool> upkeep_trim_shutdown{false};
7c673cae
FG
1334};
1335
// Context that retries an MDS request when finished: finish() (defined
// out of line, along with the constructor) re-dispatches 'mdr' through
// 'cache' — presumably after whatever the request was waiting on has
// become ready (NOTE(review): confirm against the .cc definition).
class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;   // cache to re-dispatch through
  MDRequestRef mdr; // request to retry
 public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
  void finish(int r) override;
};
1343
1344#endif