1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef CEPH_MDCACHE_H
15 #define CEPH_MDCACHE_H
18 #include <string_view>
21 #include "common/DecayCounter.h"
22 #include "include/common_fwd.h"
23 #include "include/types.h"
24 #include "include/filepath.h"
25 #include "include/elist.h"
27 #include "messages/MCacheExpire.h"
28 #include "messages/MClientQuota.h"
29 #include "messages/MClientRequest.h"
30 #include "messages/MClientSnap.h"
31 #include "messages/MDentryLink.h"
32 #include "messages/MDentryUnlink.h"
33 #include "messages/MDirUpdate.h"
34 #include "messages/MDiscover.h"
35 #include "messages/MDiscoverReply.h"
36 #include "messages/MGatherCaps.h"
37 #include "messages/MGenericMessage.h"
38 #include "messages/MInodeFileCaps.h"
39 #include "messages/MLock.h"
40 #include "messages/MMDSCacheRejoin.h"
41 #include "messages/MMDSFindIno.h"
42 #include "messages/MMDSFindInoReply.h"
43 #include "messages/MMDSFragmentNotify.h"
44 #include "messages/MMDSFragmentNotifyAck.h"
45 #include "messages/MMDSOpenIno.h"
46 #include "messages/MMDSOpenInoReply.h"
47 #include "messages/MMDSResolve.h"
48 #include "messages/MMDSResolveAck.h"
49 #include "messages/MMDSPeerRequest.h"
50 #include "messages/MMDSSnapUpdate.h"
52 #include "osdc/Filer.h"
56 #include "include/Context.h"
57 #include "events/EMetaBlob.h"
58 #include "RecoveryQueue.h"
59 #include "StrayManager.h"
60 #include "OpenFileTable.h"
61 #include "MDSContext.h"
76 // dir updates for replication
78 l_mdc_dir_update_receipt
,
79 l_mdc_dir_try_discover
,
80 l_mdc_dir_send_discover
,
81 l_mdc_dir_handle_discover
,
83 // How many inodes currently in stray dentries
85 // How many stray dentries are currently delayed for purge due to refs
86 l_mdc_num_strays_delayed
,
87 // How many stray dentries are currently being enqueued for purge
88 l_mdc_num_strays_enqueuing
,
90 // How many dentries have ever been added to stray dir
92 // How many dentries have been passed on to PurgeQueue
93 l_mdc_strays_enqueued
,
94 // How many strays have been reintegrated?
95 l_mdc_strays_reintegrated
,
96 // How many strays have been migrated?
97 l_mdc_strays_migrated
,
99 // How many inode sizes currently being recovered
100 l_mdc_num_recovering_processing
,
101 // How many inodes currently waiting to have size recovered
102 l_mdc_num_recovering_enqueued
,
103 // How many inodes waiting with elevated priority for recovery
104 l_mdc_num_recovering_prioritized
,
105 // How many inodes ever started size recovery
106 l_mdc_recovery_started
,
107 // How many inodes ever completed size recovery
108 l_mdc_recovery_completed
,
110 l_mdss_ireq_enqueue_scrub
,
111 l_mdss_ireq_exportdir
,
113 l_mdss_ireq_fragmentdir
,
114 l_mdss_ireq_fragstats
,
115 l_mdss_ireq_inodestats
,
120 // flags for path_traverse();
121 static const int MDS_TRAVERSE_DISCOVER
= (1 << 0);
122 static const int MDS_TRAVERSE_PATH_LOCKED
= (1 << 1);
123 static const int MDS_TRAVERSE_WANT_DENTRY
= (1 << 2);
124 static const int MDS_TRAVERSE_WANT_AUTH
= (1 << 3);
125 static const int MDS_TRAVERSE_RDLOCK_SNAP
= (1 << 4);
126 static const int MDS_TRAVERSE_RDLOCK_SNAP2
= (1 << 5);
127 static const int MDS_TRAVERSE_WANT_DIRLAYOUT
= (1 << 6);
128 static const int MDS_TRAVERSE_RDLOCK_PATH
= (1 << 7);
129 static const int MDS_TRAVERSE_XLOCK_DENTRY
= (1 << 8);
130 static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK
= (1 << 9);
131 static const int MDS_TRAVERSE_CHECK_LOCKCACHE
= (1 << 10);
132 static const int MDS_TRAVERSE_WANT_INODE
= (1 << 11);
135 // flags for predirty_journal_parents()
136 static const int PREDIRTY_PRIMARY
= 1; // primary dn, adjust nested accounting
137 static const int PREDIRTY_DIR
= 2; // update parent dir mtime/size
138 static const int PREDIRTY_SHALLOW
= 4; // only go to immediate parent (for easier rollback)
142 typedef std::map
<mds_rank_t
, ref_t
<MCacheExpire
>> expiremap
;
144 using clock
= ceph::coarse_mono_clock
;
145 using time
= ceph::coarse_mono_time
;
148 struct discover_info_t
{
152 basei
->put(MDSCacheObject::PIN_DISCOVERBASE
);
154 void pin_base(CInode
*b
) {
156 basei
->get(MDSCacheObject::PIN_DISCOVERBASE
);
163 snapid_t snap
= CEPH_NOSNAP
;
165 CInode
*basei
= nullptr;
166 bool want_base_dir
= false;
167 bool path_locked
= false;
170 // [reconnect/rejoin caps]
171 struct reconnected_cap_info_t
{
172 reconnected_cap_info_t() {}
173 inodeno_t realm_ino
= 0;
174 snapid_t snap_follows
= 0;
179 // -- find_ino_peer --
180 struct find_ino_peer_info_t
{
181 find_ino_peer_info_t() {}
184 MDSContext
*fin
= nullptr;
185 bool path_locked
= false;
186 mds_rank_t hint
= MDS_RANK_NONE
;
187 mds_rank_t checking
= MDS_RANK_NONE
;
188 std::set
<mds_rank_t
> checked
;
191 friend class C_MDC_RejoinOpenInoFinish
;
192 friend class C_MDC_RejoinSessionsOpened
;
195 friend class Migrator
;
196 friend class MDBalancer
;
198 // StrayManager needs to be able to remove_inode() from us
199 // when it is done purging
200 friend class StrayManager
;
202 explicit MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
);
205 void insert_taken_inos(inodeno_t ino
) {
206 replay_taken_inos
.insert(ino
);
208 void clear_taken_inos(inodeno_t ino
) {
209 replay_taken_inos
.erase(ino
);
211 bool test_and_clear_taken_inos(inodeno_t ino
) {
212 return replay_taken_inos
.erase(ino
) != 0;
214 bool is_taken_inos_empty(void) {
215 return replay_taken_inos
.empty();
218 uint64_t cache_limit_memory(void) {
219 return cache_memory_limit
;
221 double cache_toofull_ratio(void) const {
222 double memory_reserve
= cache_memory_limit
*(1.0-cache_reservation
);
223 return fmax(0.0, (cache_size()-memory_reserve
)/memory_reserve
);
225 bool cache_toofull(void) const {
226 return cache_toofull_ratio() > 0.0;
228 uint64_t cache_size(void) const {
229 return mempool::get_pool(mempool::mds_co::id
).allocated_bytes();
231 bool cache_overfull(void) const {
232 return cache_size() > cache_memory_limit
*cache_health_threshold
;
235 void advance_stray();
237 unsigned get_ephemeral_dist_frag_bits() const {
238 return export_ephemeral_dist_frag_bits
;
240 bool get_export_ephemeral_distributed_config(void) const {
241 return export_ephemeral_distributed_config
;
244 bool get_export_ephemeral_random_config(void) const {
245 return export_ephemeral_random_config
;
248 bool get_symlink_recovery(void) const {
249 return symlink_recovery
;
253 * Call this when you know that a CDentry is ready to be passed
254 * on to StrayManager (i.e. this is a stray you've just created)
256 void notify_stray(CDentry
*dn
) {
257 ceph_assert(dn
->get_dir()->get_inode()->is_stray());
258 if (dn
->state_test(CDentry::STATE_PURGING
))
261 stray_manager
.eval_stray(dn
);
264 mds_rank_t
hash_into_rank_bucket(inodeno_t ino
, frag_t fg
=0);
266 void maybe_eval_stray(CInode
*in
, bool delay
=false);
267 void clear_dirty_bits_for_stray(CInode
* diri
);
269 bool is_readonly() { return readonly
; }
270 void force_readonly();
272 static file_layout_t
gen_default_file_layout(const MDSMap
&mdsmap
);
273 static file_layout_t
gen_default_log_layout(const MDSMap
&mdsmap
);
275 void register_perfcounters();
277 void touch_client_lease(ClientLease
*r
, int pool
, utime_t ttl
) {
278 client_leases
[pool
].push_back(&r
->item_lease
);
282 void notify_stray_removed()
284 stray_manager
.notify_stray_removed();
287 void notify_stray_created()
289 stray_manager
.notify_stray_created();
292 void eval_remote(CDentry
*dn
)
294 stray_manager
.eval_remote(dn
);
297 void _send_discover(discover_info_t
& dis
);
298 discover_info_t
& _create_discover(mds_rank_t mds
) {
299 ceph_tid_t t
= ++discover_last_tid
;
300 discover_info_t
& d
= discovers
[t
];
306 void discover_base_ino(inodeno_t want_ino
, MDSContext
*onfinish
, mds_rank_t from
=MDS_RANK_NONE
);
307 void discover_dir_frag(CInode
*base
, frag_t approx_fg
, MDSContext
*onfinish
,
308 mds_rank_t from
=MDS_RANK_NONE
);
309 void discover_path(CInode
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
310 bool path_locked
=false, mds_rank_t from
=MDS_RANK_NONE
);
311 void discover_path(CDir
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
312 bool path_locked
=false);
313 void kick_discovers(mds_rank_t who
); // after a failure.
315 // adjust subtree auth specification
317 // imports/exports/nested_exports
318 // join/split subtrees as appropriate
319 bool is_subtrees() { return !subtrees
.empty(); }
321 void get_subtrees(T
& c
) {
322 if constexpr (std::is_same_v
<T
, std::vector
<CDir
*>>)
323 c
.reserve(c
.size() + subtrees
.size());
324 for (const auto& p
: subtrees
) {
325 c
.push_back(p
.first
);
328 void adjust_subtree_auth(CDir
*root
, mds_authority_t auth
, bool adjust_pop
=true);
329 void adjust_subtree_auth(CDir
*root
, mds_rank_t a
, mds_rank_t b
=CDIR_AUTH_UNKNOWN
) {
330 adjust_subtree_auth(root
, mds_authority_t(a
,b
));
332 void adjust_bounded_subtree_auth(CDir
*dir
, const std::set
<CDir
*>& bounds
, mds_authority_t auth
);
333 void adjust_bounded_subtree_auth(CDir
*dir
, const std::set
<CDir
*>& bounds
, mds_rank_t a
) {
334 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
336 void adjust_bounded_subtree_auth(CDir
*dir
, const std::vector
<dirfrag_t
>& bounds
, const mds_authority_t
&auth
);
337 void adjust_bounded_subtree_auth(CDir
*dir
, const std::vector
<dirfrag_t
>& bounds
, mds_rank_t a
) {
338 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
340 void map_dirfrag_set(const std::list
<dirfrag_t
>& dfs
, std::set
<CDir
*>& result
);
341 void try_subtree_merge(CDir
*root
);
342 void try_subtree_merge_at(CDir
*root
, std::set
<CInode
*> *to_eval
, bool adjust_pop
=true);
343 void eval_subtree_root(CInode
*diri
);
344 CDir
*get_subtree_root(CDir
*dir
);
345 CDir
*get_projected_subtree_root(CDir
*dir
);
346 bool is_leaf_subtree(CDir
*dir
) {
347 ceph_assert(subtrees
.count(dir
));
348 return subtrees
[dir
].empty();
350 void remove_subtree(CDir
*dir
);
351 bool is_subtree(CDir
*root
) {
352 return subtrees
.count(root
);
354 void get_subtree_bounds(CDir
*root
, std::set
<CDir
*>& bounds
);
355 void get_wouldbe_subtree_bounds(CDir
*root
, std::set
<CDir
*>& bounds
);
356 void verify_subtree_bounds(CDir
*root
, const std::set
<CDir
*>& bounds
);
357 void verify_subtree_bounds(CDir
*root
, const std::list
<dirfrag_t
>& bounds
);
359 void project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
);
360 void adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
);
362 auto get_auth_subtrees() {
363 std::vector
<CDir
*> c
;
364 for (auto& p
: subtrees
) {
365 auto& root
= p
.first
;
366 if (root
->is_auth()) {
373 auto get_fullauth_subtrees() {
374 std::vector
<CDir
*> c
;
375 for (auto& p
: subtrees
) {
376 auto& root
= p
.first
;
377 if (root
->is_full_dir_auth()) {
383 auto num_subtrees_fullauth() const {
385 for (auto& p
: subtrees
) {
386 auto& root
= p
.first
;
387 if (root
->is_full_dir_auth()) {
394 auto num_subtrees_fullnonauth() const {
396 for (auto& p
: subtrees
) {
397 auto& root
= p
.first
;
398 if (root
->is_full_dir_nonauth()) {
405 auto num_subtrees() const {
406 return subtrees
.size();
409 int get_num_client_requests();
411 MDRequestRef
request_start(const cref_t
<MClientRequest
>& req
);
412 MDRequestRef
request_start_peer(metareqid_t rid
, __u32 attempt
, const cref_t
<Message
> &m
);
413 MDRequestRef
request_start_internal(int op
);
414 bool have_request(metareqid_t rid
) {
415 return active_requests
.count(rid
);
417 MDRequestRef
request_get(metareqid_t rid
);
418 void request_pin_ref(MDRequestRef
& r
, CInode
*ref
, std::vector
<CDentry
*>& trace
);
419 void request_finish(MDRequestRef
& mdr
);
420 void request_forward(MDRequestRef
& mdr
, mds_rank_t mds
, int port
=0);
421 void dispatch_request(MDRequestRef
& mdr
);
422 void request_drop_foreign_locks(MDRequestRef
& mdr
);
423 void request_drop_non_rdlocks(MDRequestRef
& r
);
424 void request_drop_locks(MDRequestRef
& r
);
425 void request_cleanup(MDRequestRef
& r
);
427 void request_kill(MDRequestRef
& r
); // called when session closes
429 // journal/snap helpers
430 CInode
*pick_inode_snap(CInode
*in
, snapid_t follows
);
431 CInode
*cow_inode(CInode
*in
, snapid_t last
);
432 void journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
, CDentry
*dn
,
433 snapid_t follows
=CEPH_NOSNAP
,
434 CInode
**pcow_inode
=0, CDentry::linkage_t
*dnl
=0);
435 void journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
);
437 void project_rstat_inode_to_frag(const MutationRef
& mut
,
438 CInode
*cur
, CDir
*parent
, snapid_t first
,
439 int linkunlink
, SnapRealm
*prealm
);
440 void _project_rstat_inode_to_frag(const CInode::mempool_inode
* inode
, snapid_t ofirst
, snapid_t last
,
441 CDir
*parent
, int linkunlink
, bool update_inode
);
442 void project_rstat_frag_to_inode(const nest_info_t
& rstat
, const nest_info_t
& accounted_rstat
,
443 snapid_t ofirst
, snapid_t last
, CInode
*pin
, bool cow_head
);
444 void broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
= -1, bool quota_change
= false);
445 void predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
446 CInode
*in
, CDir
*parent
,
447 int flags
, int linkunlink
=0,
448 snapid_t follows
=CEPH_NOSNAP
);
451 void add_uncommitted_leader(metareqid_t reqid
, LogSegment
*ls
, std::set
<mds_rank_t
> &peers
, bool safe
=false) {
452 uncommitted_leaders
[reqid
].ls
= ls
;
453 uncommitted_leaders
[reqid
].peers
= peers
;
454 uncommitted_leaders
[reqid
].safe
= safe
;
456 void wait_for_uncommitted_leader(metareqid_t reqid
, MDSContext
*c
) {
457 uncommitted_leaders
[reqid
].waiters
.push_back(c
);
459 bool have_uncommitted_leader(metareqid_t reqid
, mds_rank_t from
) {
460 auto p
= uncommitted_leaders
.find(reqid
);
461 return p
!= uncommitted_leaders
.end() && p
->second
.peers
.count(from
) > 0;
463 void log_leader_commit(metareqid_t reqid
);
464 void logged_leader_update(metareqid_t reqid
);
465 void _logged_leader_commit(metareqid_t reqid
);
466 void committed_leader_peer(metareqid_t r
, mds_rank_t from
);
467 void finish_committed_leaders();
469 void add_uncommitted_peer(metareqid_t reqid
, LogSegment
*, mds_rank_t
, MDPeerUpdate
*su
=nullptr);
470 void wait_for_uncommitted_peer(metareqid_t reqid
, MDSContext
*c
) {
471 uncommitted_peers
.at(reqid
).waiters
.push_back(c
);
473 void finish_uncommitted_peer(metareqid_t reqid
, bool assert_exist
=true);
474 MDPeerUpdate
* get_uncommitted_peer(metareqid_t reqid
, mds_rank_t leader
);
475 void _logged_peer_commit(mds_rank_t from
, metareqid_t reqid
);
477 void set_recovery_set(std::set
<mds_rank_t
>& s
);
478 void handle_mds_failure(mds_rank_t who
);
479 void handle_mds_recovery(mds_rank_t who
);
481 void recalc_auth_bits(bool replay
);
482 void remove_inode_recursive(CInode
*in
);
484 bool is_ambiguous_peer_update(metareqid_t reqid
, mds_rank_t leader
) {
485 auto p
= ambiguous_peer_updates
.find(leader
);
486 return p
!= ambiguous_peer_updates
.end() && p
->second
.count(reqid
);
488 void add_ambiguous_peer_update(metareqid_t reqid
, mds_rank_t leader
) {
489 ambiguous_peer_updates
[leader
].insert(reqid
);
491 void remove_ambiguous_peer_update(metareqid_t reqid
, mds_rank_t leader
) {
492 auto p
= ambiguous_peer_updates
.find(leader
);
493 auto q
= p
->second
.find(reqid
);
494 ceph_assert(q
!= p
->second
.end());
496 if (p
->second
.empty())
497 ambiguous_peer_updates
.erase(p
);
500 void add_rollback(metareqid_t reqid
, mds_rank_t leader
) {
501 resolve_need_rollback
[reqid
] = leader
;
503 void finish_rollback(metareqid_t reqid
, MDRequestRef
& mdr
);
506 void add_ambiguous_import(dirfrag_t base
, const std::vector
<dirfrag_t
>& bounds
);
507 void add_ambiguous_import(CDir
*base
, const std::set
<CDir
*>& bounds
);
508 bool have_ambiguous_import(dirfrag_t base
) {
509 return my_ambiguous_imports
.count(base
);
511 void get_ambiguous_import_bounds(dirfrag_t base
, std::vector
<dirfrag_t
>& bounds
) {
512 ceph_assert(my_ambiguous_imports
.count(base
));
513 bounds
= my_ambiguous_imports
[base
];
515 void cancel_ambiguous_import(CDir
*);
516 void finish_ambiguous_import(dirfrag_t dirino
);
517 void resolve_start(MDSContext
*resolve_done_
);
518 void send_resolves();
519 void maybe_send_pending_resolves() {
520 if (resolves_pending
)
521 send_subtree_resolves();
524 void _move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
525 std::map
<dirfrag_t
,std::vector
<dirfrag_t
> >& subtrees
);
526 ESubtreeMap
*create_subtree_map();
528 void clean_open_file_lists();
529 void dump_openfiles(Formatter
*f
);
530 bool dump_inode(Formatter
*f
, uint64_t number
);
532 void rejoin_start(MDSContext
*rejoin_done_
);
533 void rejoin_gather_finish();
534 void rejoin_send_rejoins();
535 void rejoin_export_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
536 int target
=-1, bool drop_path
=false) {
537 auto& ex
= cap_exports
[ino
];
539 auto &_icr
= ex
.second
[client
] = icr
;
543 void rejoin_recovered_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
544 mds_rank_t frommds
=MDS_RANK_NONE
, bool drop_path
=false) {
545 auto &_icr
= cap_imports
[ino
][client
][frommds
] = icr
;
549 void rejoin_recovered_client(client_t client
, const entity_inst_t
& inst
) {
550 rejoin_client_map
.emplace(client
, inst
);
552 bool rejoin_has_cap_reconnect(inodeno_t ino
) const {
553 return cap_imports
.count(ino
);
555 void add_replay_ino_alloc(inodeno_t ino
) {
556 cap_imports_missing
.insert(ino
); // avoid opening ino during cache rejoin
558 const cap_reconnect_t
*get_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
559 if (cap_imports
.count(ino
) &&
560 cap_imports
[ino
].count(client
) &&
561 cap_imports
[ino
][client
].count(MDS_RANK_NONE
)) {
562 return &cap_imports
[ino
][client
][MDS_RANK_NONE
];
566 void remove_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
567 ceph_assert(cap_imports
[ino
].size() == 1);
568 ceph_assert(cap_imports
[ino
][client
].size() == 1);
569 cap_imports
.erase(ino
);
571 void wait_replay_cap_reconnect(inodeno_t ino
, MDSContext
*c
) {
572 cap_reconnect_waiters
[ino
].push_back(c
);
575 void add_reconnected_cap(client_t client
, inodeno_t ino
, const cap_reconnect_t
& icr
) {
576 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
577 info
.realm_ino
= inodeno_t(icr
.capinfo
.snaprealm
);
578 info
.snap_follows
= icr
.snap_follows
;
580 void set_reconnected_dirty_caps(client_t client
, inodeno_t ino
, int dirty
, bool snapflush
) {
581 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
582 info
.dirty_caps
|= dirty
;
584 info
.snapflush
= snapflush
;
586 void add_reconnected_snaprealm(client_t client
, inodeno_t ino
, snapid_t seq
) {
587 reconnected_snaprealms
[ino
][client
] = seq
;
590 void rejoin_open_ino_finish(inodeno_t ino
, int ret
);
591 void rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
);
592 void rejoin_open_sessions_finish(std::map
<client_t
,std::pair
<Session
*,uint64_t> >& session_map
);
593 bool process_imported_caps();
594 void choose_lock_states_and_reconnect_caps();
595 void prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
596 std::map
<client_t
,ref_t
<MClientSnap
>>& splits
);
597 void prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
, std::map
<client_t
,ref_t
<MClientSnap
>>& splits
);
598 void send_snaps(std::map
<client_t
,ref_t
<MClientSnap
>>& splits
);
599 Capability
* rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
);
600 void finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
601 std::map
<client_t
,ref_t
<MClientSnap
>>& updates
);
602 Capability
* try_reconnect_cap(CInode
*in
, Session
*session
);
603 void export_remaining_imported_caps();
605 void do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
606 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
607 int peer
, int p_flags
);
608 void do_delayed_cap_imports();
609 void rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
, client_t client
,
610 snapid_t snap_follows
);
611 void open_snaprealms();
613 bool open_undef_inodes_dirfrags();
614 void opened_undef_inode(CInode
*in
);
615 void opened_undef_dirfrag(CDir
*dir
) {
616 rejoin_undef_dirfrags
.erase(dir
);
619 void reissue_all_caps();
621 void start_files_to_recover();
622 void do_file_recover();
623 void queue_file_recover(CInode
*in
);
624 void _queued_file_recover_cow(CInode
*in
, MutationRef
& mut
);
626 void handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
);
632 CInode
*get_root() { return root
; }
633 CInode
*get_myin() { return myin
; }
635 size_t get_cache_size() { return lru
.lru_get_size(); }
638 std::pair
<bool, uint64_t> trim(uint64_t count
=0);
640 bool trim_non_auth_subtree(CDir
*directory
);
641 void standby_trim_segment(LogSegment
*ls
);
642 void try_trim_non_auth_subtree(CDir
*dir
);
643 bool can_trim_non_auth_dirfrag(CDir
*dir
) {
644 return my_ambiguous_imports
.count((dir
)->dirfrag()) == 0 &&
645 uncommitted_peer_rename_olddir
.count(dir
->inode
) == 0;
649 * For all unreferenced inodes, dirs, dentries below an inode, compose
650 * expiry messages. This is used when giving up all replicas of entities
651 * for an MDS peer in the 'stopping' state, such that the peer can
652 * empty its cache and finish shutting down.
654 * We have to make sure we're only expiring un-referenced items to
655 * avoid interfering with ongoing stray-movement (we can't distinguish
656 * between the "moving my strays" and "waiting for my cache to empty"
657 * phases within 'stopping')
659 * @return false if we completed cleanly, true if caller should stop
660 * expiring because we hit something with refs.
662 bool expire_recursive(CInode
*in
, expiremap
& expiremap
);
664 void trim_client_leases();
665 void check_memory_usage();
667 void shutdown_start();
668 void shutdown_check();
669 bool shutdown_pass();
670 bool shutdown(); // clear cache (i.e. at shutdown)
671 bool shutdown_export_strays();
672 void shutdown_export_stray_finish(inodeno_t ino
) {
673 if (shutdown_exporting_strays
.erase(ino
))
674 shutdown_export_strays();
678 bool have_inode(vinodeno_t vino
) {
679 if (vino
.snapid
== CEPH_NOSNAP
)
680 return inode_map
.count(vino
.ino
) ? true : false;
682 return snap_inode_map
.count(vino
) ? true : false;
684 bool have_inode(inodeno_t ino
, snapid_t snap
=CEPH_NOSNAP
) {
685 return have_inode(vinodeno_t(ino
, snap
));
687 CInode
* get_inode(vinodeno_t vino
) {
688 if (vino
.snapid
== CEPH_NOSNAP
) {
689 auto p
= inode_map
.find(vino
.ino
);
690 if (p
!= inode_map
.end())
693 auto p
= snap_inode_map
.find(vino
);
694 if (p
!= snap_inode_map
.end())
699 CInode
* get_inode(inodeno_t ino
, snapid_t s
=CEPH_NOSNAP
) {
700 return get_inode(vinodeno_t(ino
, s
));
702 CInode
* lookup_snap_inode(vinodeno_t vino
) {
703 auto p
= snap_inode_map
.lower_bound(vino
);
704 if (p
!= snap_inode_map
.end() &&
705 p
->second
->ino() == vino
.ino
&& p
->second
->first
<= vino
.snapid
)
710 CDir
* get_dirfrag(dirfrag_t df
) {
711 CInode
*in
= get_inode(df
.ino
);
714 return in
->get_dirfrag(df
.frag
);
716 CDir
* get_dirfrag(inodeno_t ino
, std::string_view dn
) {
717 CInode
*in
= get_inode(ino
);
720 frag_t fg
= in
->pick_dirfrag(dn
);
721 return in
->get_dirfrag(fg
);
723 CDir
* get_force_dirfrag(dirfrag_t df
, bool replay
) {
724 CInode
*diri
= get_inode(df
.ino
);
727 CDir
*dir
= force_dir_fragment(diri
, df
.frag
, replay
);
729 dir
= diri
->get_dirfrag(df
.frag
);
733 MDSCacheObject
*get_object(const MDSCacheObjectInfo
&info
);
735 void add_inode(CInode
*in
);
737 void remove_inode(CInode
*in
);
739 void touch_dentry(CDentry
*dn
) {
740 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
)) {
741 bottom_lru
.lru_midtouch(dn
);
746 lru
.lru_midtouch(dn
);
749 void touch_dentry_bottom(CDentry
*dn
) {
750 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
752 lru
.lru_bottouch(dn
);
756 void truncate_inode(CInode
*in
, LogSegment
*ls
);
757 void _truncate_inode(CInode
*in
, LogSegment
*ls
);
758 void truncate_inode_finish(CInode
*in
, LogSegment
*ls
);
759 void truncate_inode_write_finish(CInode
*in
, LogSegment
*ls
,
760 uint32_t block_size
);
761 void truncate_inode_logged(CInode
*in
, MutationRef
& mut
);
763 void add_recovered_truncate(CInode
*in
, LogSegment
*ls
);
764 void remove_recovered_truncate(CInode
*in
, LogSegment
*ls
);
765 void start_recovered_truncates();
767 // purge unsafe inodes
768 void start_purge_inodes();
769 void purge_inodes(const interval_set
<inodeno_t
>& i
, LogSegment
*ls
);
771 CDir
*get_auth_container(CDir
*in
);
772 CDir
*get_export_container(CDir
*dir
);
773 void find_nested_exports(CDir
*dir
, std::set
<CDir
*>& s
);
774 void find_nested_exports_under(CDir
*import
, CDir
*dir
, std::set
<CDir
*>& s
);
777 void create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
779 CInode
*create_system_inode(inodeno_t ino
, int mode
);
780 CInode
*create_root_inode();
782 void create_empty_hierarchy(MDSGather
*gather
);
783 void create_mydir_hierarchy(MDSGather
*gather
);
785 bool is_open() { return open
; }
786 void wait_for_open(MDSContext
*c
) {
787 waiting_for_open
.push_back(c
);
790 void open_root_inode(MDSContext
*c
);
792 void open_mydir_inode(MDSContext
*c
);
793 void open_mydir_frag(MDSContext
*c
);
794 void populate_mydir();
796 void _create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
);
797 void _create_system_file_finish(MutationRef
& mut
, CDentry
*dn
,
798 version_t dpv
, MDSContext
*fin
);
800 void open_foreign_mdsdir(inodeno_t ino
, MDSContext
*c
);
801 CDir
*get_stray_dir(CInode
*in
);
804 * Find the given dentry (and whether it exists or not), its ancestors,
805 * and get them all into memory and usable on this MDS. This function
806 * makes a best-effort attempt to load everything; if it needs to
807 * go away and do something then it will put the request on a waitlist.
808 * It prefers the mdr, then the req, then the fin. (At least one of these
811 * At least one of the params mdr, req, and fin must be non-null.
813 * @param mdr The MDRequest associated with the path. Can be null.
814 * @param cf A MDSContextFactory for waiter building.
815 * @param path The path to traverse to.
817 * @param flags Specifies different lookup behaviors.
818 * By default, path_traverse() forwards the request to the auth MDS if that
819 * is appropriate (ie, if it doesn't know the contents of a directory).
820 * MDS_TRAVERSE_DISCOVER: Instead of forwarding request, path_traverse()
821 * attempts to look up the path from a different MDS (and bring them into
822 * its cache as replicas).
823 * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when xlocked
824 * dentry is encountered.
825 * MDS_TRAVERSE_WANT_DENTRY: Caller wants tail dentry. Add a null dentry if
826 * tail dentry does not exist. Return 0 even if the tail dentry is null.
827 * MDS_TRAVERSE_WANT_INODE: Caller only wants target inode if it exists, or
828 * wants tail dentry if target inode does not exist and MDS_TRAVERSE_WANT_DENTRY
830 * MDS_TRAVERSE_WANT_AUTH: Always forward request to auth MDS of target inode
831 * or auth MDS of tail dentry (MDS_TRAVERSE_WANT_DENTRY is set).
832 * MDS_TRAVERSE_XLOCK_DENTRY: Caller wants to xlock tail dentry if MDS_TRAVERSE_WANT_INODE
833 * is not set or (MDS_TRAVERSE_WANT_INODE is set but target inode does not exist)
835 * @param pdnvec Data return parameter -- on success, contains a
836 * vector of dentries. On failure, is either empty or contains the
837 * full trace of traversable dentries.
838 * @param pin Data return parameter -- if successful, points to the inode
839 * associated with filepath. If unsuccessful, is null.
841 * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
842 * If it returns 1, the requester associated with this call has been placed
843 * on the appropriate waitlist, and it should unwind itself and back out.
844 * If it returns 2 the request has been forwarded, and again the requester
845 * should unwind itself and back out.
847 int path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
848 const filepath
& path
, int flags
,
849 std::vector
<CDentry
*> *pdnvec
, CInode
**pin
=nullptr);
851 int maybe_request_forward_to_auth(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
854 CInode
*cache_traverse(const filepath
& path
);
856 void open_remote_dirfrag(CInode
*diri
, frag_t fg
, MDSContext
*fin
);
857 CInode
*get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
=false);
859 bool parallel_fetch(std::map
<inodeno_t
,filepath
>& pathmap
, std::set
<inodeno_t
>& missing
);
860 bool parallel_fetch_traverse_dir(inodeno_t ino
, filepath
& path
,
861 std::set
<CDir
*>& fetch_queue
, std::set
<inodeno_t
>& missing
,
862 C_GatherBuilder
&gather_bld
);
864 void open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
,
865 bool want_xlocked
=false);
866 void _open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
867 bool want_xlocked
, int r
);
869 void make_trace(std::vector
<CDentry
*>& trace
, CInode
*in
);
871 void open_ino(inodeno_t ino
, int64_t pool
, MDSContext
*fin
,
872 bool want_replica
=true, bool want_xlocked
=false,
873 std::vector
<inode_backpointer_t
> *ancestors_hint
=nullptr,
874 mds_rank_t auth_hint
=MDS_RANK_NONE
);
875 void open_ino_batch_start();
876 void open_ino_batch_submit();
877 void kick_open_ino_peers(mds_rank_t who
);
879 void find_ino_peers(inodeno_t ino
, MDSContext
*c
,
880 mds_rank_t hint
=MDS_RANK_NONE
, bool path_locked
=false);
881 void _do_find_ino_peer(find_ino_peer_info_t
& fip
);
882 void handle_find_ino(const cref_t
<MMDSFindIno
> &m
);
883 void handle_find_ino_reply(const cref_t
<MMDSFindInoReply
> &m
);
884 void kick_find_ino_peers(mds_rank_t who
);
886 SnapRealm
*get_global_snaprealm() const { return global_snaprealm
; }
887 void create_global_snaprealm();
888 void do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
=true);
889 void send_snap_update(CInode
*in
, version_t stid
, int snap_op
);
890 void handle_snap_update(const cref_t
<MMDSSnapUpdate
> &m
);
891 void notify_global_snaprealm_update(int snap_op
);
894 void fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
);
895 uint64_t get_num_strays() const { return stray_manager
.get_num_strays(); }
898 void dispatch(const cref_t
<Message
> &m
);
900 void encode_replica_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
);
901 void encode_replica_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
);
902 void encode_replica_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
905 void decode_replica_dir(CDir
*&dir
, bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
, MDSContext::vec
& finished
);
906 void decode_replica_dentry(CDentry
*&dn
, bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
);
907 void decode_replica_inode(CInode
*&in
, bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
);
909 void encode_replica_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
);
910 void decode_replica_stray(CDentry
*&straydn
, CInode
**in
, const bufferlist
&bl
, mds_rank_t from
);
913 void encode_remote_dentry_link(CDentry::linkage_t
*dnl
, bufferlist
& bl
);
914 void decode_remote_dentry_link(CDir
*dir
, CDentry
*dn
, bufferlist::const_iterator
& p
);
915 void send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
);
916 void send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
, bool unlinking
=false);
918 void wait_for_uncommitted_fragment(dirfrag_t dirfrag
, MDSContext
*c
) {
919 uncommitted_fragments
.at(dirfrag
).waiters
.push_back(c
);
921 bool is_any_uncommitted_fragment() const {
922 return !uncommitted_fragments
.empty();
924 void wait_for_uncommitted_fragments(MDSContext
* finisher
);
925 void rollback_uncommitted_fragments();
927 void split_dir(CDir
*dir
, int byn
);
928 void merge_dir(CInode
*diri
, frag_t fg
);
930 void find_stale_fragment_freeze();
931 void fragment_freeze_inc_num_waiters(CDir
*dir
);
932 bool fragment_are_all_frozen(CDir
*dir
);
933 int get_num_fragmenting_dirs() { return fragments
.size(); }
936 //int send_inode_updates(CInode *in);
937 //void handle_inode_update(MInodeUpdate *m);
939 int send_dir_updates(CDir
*in
, bool bcast
=false);
940 void handle_dir_update(const cref_t
<MDirUpdate
> &m
);
942 // -- cache expiration --
943 void handle_cache_expire(const cref_t
<MCacheExpire
> &m
);
944 void process_delayed_expire(CDir
*dir
);
945 void discard_delayed_expire(CDir
*dir
);
948 void handle_mdsmap(const MDSMap
&mdsmap
, const MDSMap
&oldmap
);
950 int dump_cache() { return dump_cache({}, nullptr, 0); }
951 int dump_cache(std::string_view filename
, double timeout
);
952 int dump_cache(Formatter
*f
, double timeout
);
953 void dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
);
955 void cache_status(Formatter
*f
);
957 void dump_resolve_status(Formatter
*f
) const;
958 void dump_rejoin_status(Formatter
*f
) const;
962 void show_subtrees(int dbl
=10, bool force_print
=false);
964 CInode
*hack_pick_random_inode() {
965 ceph_assert(!inode_map
.empty());
966 int n
= rand() % inode_map
.size();
967 auto p
= inode_map
.begin();
972 void flush_dentry(std::string_view path
, Context
*fin
);
974 * Create and start an OP_ENQUEUE_SCRUB
976 void enqueue_scrub(std::string_view path
, std::string_view tag
,
977 bool force
, bool recursive
, bool repair
,
978 Formatter
*f
, Context
*fin
);
979 void repair_inode_stats(CInode
*diri
);
980 void repair_dirfrag_stats(CDir
*dir
);
981 void rdlock_dirfrags_stats(CInode
*diri
, MDSInternalContext
*fin
);
987 LRU lru
; // dentry lru for expiring items from cache
988 LRU bottom_lru
; // dentries that should be trimmed ASAP
992 int num_shadow_inodes
= 0;
994 int num_inodes_with_caps
= 0;
996 unsigned max_dir_commit_size
;
998 file_layout_t default_file_layout
;
999 file_layout_t default_log_layout
;
1001 // -- client leases --
1002 static constexpr std::size_t client_lease_pools
= 3;
1003 std::array
<float, client_lease_pools
> client_lease_durations
{5.0, 30.0, 300.0};
1005 // -- client caps --
1006 uint64_t last_cap_id
= 0;
1008 std::map
<ceph_tid_t
, discover_info_t
> discovers
;
1009 ceph_tid_t discover_last_tid
= 0;
1012 std::map
<int, std::map
<inodeno_t
, MDSContext::vec
> > waiting_for_base_ino
;
1014 std::map
<inodeno_t
,std::map
<client_t
, reconnected_cap_info_t
> > reconnected_caps
; // inode -> client -> snap_follows,realmino
1015 std::map
<inodeno_t
,std::map
<client_t
, snapid_t
> > reconnected_snaprealms
; // realmino -> client -> realmseq
1018 std::set
<CInode
*> rejoin_pending_snaprealms
;
1019 // cap imports. delayed snap parent opens.
1020 std::map
<client_t
,std::set
<CInode
*> > delayed_imported_caps
;
1023 std::unique_ptr
<Migrator
> migrator
;
1025 bool did_shutdown_log_cap
= false;
1027 std::map
<ceph_tid_t
, find_ino_peer_info_t
> find_ino_peer
;
1028 ceph_tid_t find_ino_peer_last_tid
= 0;
1030 // delayed cache expire
1031 std::map
<CDir
*, expiremap
> delayed_expire
; // subtree root -> expire msg
1033 /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
1034 std::set
<CInode
*> export_pin_queue
;
1035 std::set
<CInode
*> export_pin_delayed_queue
;
1036 std::set
<CInode
*> export_ephemeral_pins
;
1038 OpenFileTable open_file_table
;
1040 double export_ephemeral_random_max
= 0.0;
1043 // track leader requests whose peers haven't acknowledged commit
1046 std::set
<mds_rank_t
> peers
;
1047 LogSegment
*ls
= nullptr;
1048 MDSContext::vec waiters
;
1050 bool committing
= false;
1051 bool recovering
= false;
1057 LogSegment
*ls
= nullptr;
1058 MDPeerUpdate
*su
= nullptr;
1059 MDSContext::vec waiters
;
1062 struct open_ino_info_t
{
1063 open_ino_info_t() {}
1064 std::vector
<inode_backpointer_t
> ancestors
;
1065 std::set
<mds_rank_t
> checked
;
1066 mds_rank_t checking
= MDS_RANK_NONE
;
1067 mds_rank_t auth_hint
= MDS_RANK_NONE
;
1068 bool check_peers
= true;
1069 bool fetch_backtrace
= true;
1070 bool discover
= false;
1071 bool want_replica
= false;
1072 bool want_xlocked
= false;
1076 MDSContext::vec waiters
;
1079 ceph_tid_t open_ino_last_tid
= 0;
1080 std::map
<inodeno_t
,open_ino_info_t
> opening_inodes
;
1082 bool open_ino_batch
= false;
1083 std::map
<CDir
*, std::pair
<std::vector
<std::string
>, MDSContext::vec
> > open_ino_batched_fetch
;
1085 friend struct C_MDC_OpenInoTraverseDir
;
1086 friend struct C_MDC_OpenInoParentOpened
;
1087 friend struct C_MDC_RetryScanStray
;
1089 friend class C_IO_MDC_OpenInoBacktraceFetched
;
1090 friend class C_MDC_Join
;
1091 friend class C_MDC_RespondInternalRequest
;
1093 friend class EPeerUpdate
;
1094 friend class ECommitted
;
1096 void set_readonly() { readonly
= true; }
1098 void handle_resolve(const cref_t
<MMDSResolve
> &m
);
1099 void handle_resolve_ack(const cref_t
<MMDSResolveAck
> &m
);
1100 void process_delayed_resolve();
1101 void discard_delayed_resolve(mds_rank_t who
);
1102 void maybe_resolve_finish();
1103 void disambiguate_my_imports();
1104 void disambiguate_other_imports();
1105 void trim_unlinked_inodes();
1107 void send_peer_resolves();
1108 void send_subtree_resolves();
1109 void maybe_finish_peer_resolve();
1111 void rejoin_walk(CDir
*dir
, const ref_t
<MMDSCacheRejoin
> &rejoin
);
1112 void handle_cache_rejoin(const cref_t
<MMDSCacheRejoin
> &m
);
1113 void handle_cache_rejoin_weak(const cref_t
<MMDSCacheRejoin
> &m
);
1114 CInode
* rejoin_invent_inode(inodeno_t ino
, snapid_t last
);
1115 CDir
* rejoin_invent_dirfrag(dirfrag_t df
);
1116 void handle_cache_rejoin_strong(const cref_t
<MMDSCacheRejoin
> &m
);
1117 void rejoin_scour_survivor_replicas(mds_rank_t from
, const cref_t
<MMDSCacheRejoin
> &ack
,
1118 std::set
<vinodeno_t
>& acked_inodes
,
1119 std::set
<SimpleLock
*>& gather_locks
);
1120 void handle_cache_rejoin_ack(const cref_t
<MMDSCacheRejoin
> &m
);
1121 void rejoin_send_acks();
1122 void rejoin_trim_undef_inodes();
1123 void maybe_send_pending_rejoins() {
1124 if (rejoins_pending
)
1125 rejoin_send_rejoins();
1128 void touch_inode(CInode
*in
) {
1129 if (in
->get_parent_dn())
1130 touch_dentry(in
->get_projected_parent_dn());
1133 void inode_remove_replica(CInode
*in
, mds_rank_t rep
, bool rejoin
,
1134 std::set
<SimpleLock
*>& gather_locks
);
1135 void dentry_remove_replica(CDentry
*dn
, mds_rank_t rep
, std::set
<SimpleLock
*>& gather_locks
);
1137 void rename_file(CDentry
*srcdn
, CDentry
*destdn
);
1139 void _open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
);
1140 void _open_ino_parent_opened(inodeno_t ino
, int ret
);
1141 void _open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1142 void _open_ino_fetch_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
, bool parent
,
1143 CDir
*dir
, std::string_view dname
);
1144 int open_ino_traverse_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
,
1145 const std::vector
<inode_backpointer_t
>& ancestors
,
1146 bool discover
, bool want_xlocked
, mds_rank_t
*hint
);
1147 void open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1148 void do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1149 void do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
);
1150 void handle_open_ino(const cref_t
<MMDSOpenIno
> &m
, int err
=0);
1151 void handle_open_ino_reply(const cref_t
<MMDSOpenInoReply
> &m
);
1153 void scan_stray_dir(dirfrag_t next
=dirfrag_t());
1155 void handle_discover(const cref_t
<MDiscover
> &dis
);
1156 void handle_discover_reply(const cref_t
<MDiscoverReply
> &m
);
1157 void handle_dentry_link(const cref_t
<MDentryLink
> &m
);
1158 void handle_dentry_unlink(const cref_t
<MDentryUnlink
> &m
);
1159 void handle_dentry_unlink_ack(const cref_t
<MDentryUnlinkAck
> &m
);
1161 int dump_cache(std::string_view fn
, Formatter
*f
, double timeout
);
1163 void flush_dentry_work(MDRequestRef
& mdr
);
1165 * Resolve path to a dentry and pass it onto the ScrubStack.
1167 * TODO: return enough information to the original mdr formatter
1168 * and completion that they can subsequeuntly check the progress of
1169 * this scrub (we won't block them on a whole scrub as it can take a very
1172 void enqueue_scrub_work(MDRequestRef
& mdr
);
1173 void repair_inode_stats_work(MDRequestRef
& mdr
);
1174 void repair_dirfrag_stats_work(MDRequestRef
& mdr
);
1175 void rdlock_dirfrags_stats_work(MDRequestRef
& mdr
);
1177 ceph::unordered_map
<inodeno_t
,CInode
*> inode_map
; // map of head inodes by ino
1178 std::map
<vinodeno_t
, CInode
*> snap_inode_map
; // map of snap inodes by ino
1179 CInode
*root
= nullptr; // root inode
1180 CInode
*myin
= nullptr; // .ceph/mds%d dir
1182 bool readonly
= false;
1184 int stray_index
= 0;
1185 int stray_fragmenting_index
= -1;
1187 std::set
<CInode
*> base_inodes
;
1189 std::unique_ptr
<PerfCounters
> logger
;
1192 std::array
<xlist
<ClientLease
*>, client_lease_pools
> client_leases
{};
1194 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
1195 std::map
<CDir
*,std::set
<CDir
*> > subtrees
;
1196 std::map
<CInode
*,std::list
<std::pair
<CDir
*,CDir
*> > > projected_subtree_renames
; // renamed ino -> target dir
1199 ceph::unordered_map
<metareqid_t
, MDRequestRef
> active_requests
;
1202 std::set
<mds_rank_t
> recovery_set
;
1205 // from EImportStart w/o EImportFinish during journal replay
1206 std::map
<dirfrag_t
, std::vector
<dirfrag_t
> > my_ambiguous_imports
;
1207 // from MMDSResolves
1208 std::map
<mds_rank_t
, std::map
<dirfrag_t
, std::vector
<dirfrag_t
> > > other_ambiguous_imports
;
1210 std::map
<CInode
*, int> uncommitted_peer_rename_olddir
; // peer: preserve the non-auth dir until seeing commit.
1211 std::map
<CInode
*, int> uncommitted_peer_unlink
; // peer: preserve the unlinked inode until seeing commit.
1213 std::map
<metareqid_t
, uleader
> uncommitted_leaders
; // leader: req -> peer set
1214 std::map
<metareqid_t
, upeer
> uncommitted_peers
; // peer: preserve the peer req until seeing commit.
1216 std::set
<metareqid_t
> pending_leaders
;
1217 std::map
<int, std::set
<metareqid_t
> > ambiguous_peer_updates
;
1219 bool resolves_pending
= false;
1220 std::set
<mds_rank_t
> resolve_gather
; // nodes i need resolves from
1221 std::set
<mds_rank_t
> resolve_ack_gather
; // nodes i need a resolve_ack from
1222 std::set
<version_t
> resolve_snapclient_commits
;
1223 std::map
<metareqid_t
, mds_rank_t
> resolve_need_rollback
; // rollbacks i'm writing to the journal
1224 std::map
<mds_rank_t
, cref_t
<MMDSResolve
>> delayed_resolve
;
1227 bool rejoins_pending
= false;
1228 std::set
<mds_rank_t
> rejoin_gather
; // nodes from whom i need a rejoin
1229 std::set
<mds_rank_t
> rejoin_sent
; // nodes i sent a rejoin to
1230 std::set
<mds_rank_t
> rejoin_ack_sent
; // nodes i sent a rejoin to
1231 std::set
<mds_rank_t
> rejoin_ack_gather
; // nodes from whom i need a rejoin ack
1232 std::map
<mds_rank_t
,std::map
<inodeno_t
,std::map
<client_t
,Capability::Import
> > > rejoin_imported_caps
;
1233 std::map
<inodeno_t
,std::pair
<mds_rank_t
,std::map
<client_t
,Capability::Export
> > > rejoin_peer_exports
;
1235 std::map
<client_t
,entity_inst_t
> rejoin_client_map
;
1236 std::map
<client_t
,client_metadata_t
> rejoin_client_metadata_map
;
1237 std::map
<client_t
,std::pair
<Session
*,uint64_t> > rejoin_session_map
;
1239 std::map
<inodeno_t
,std::pair
<mds_rank_t
,std::map
<client_t
,cap_reconnect_t
> > > cap_exports
; // ino -> target, client -> capex
1241 std::map
<inodeno_t
,std::map
<client_t
,std::map
<mds_rank_t
,cap_reconnect_t
> > > cap_imports
; // ino -> client -> frommds -> capex
1242 std::set
<inodeno_t
> cap_imports_missing
;
1243 std::map
<inodeno_t
, MDSContext::vec
> cap_reconnect_waiters
;
1244 int cap_imports_num_opening
= 0;
1246 std::set
<CInode
*> rejoin_undef_inodes
;
1247 std::set
<CInode
*> rejoin_potential_updated_scatterlocks
;
1248 std::set
<CDir
*> rejoin_undef_dirfrags
;
1249 std::map
<mds_rank_t
, std::set
<CInode
*> > rejoin_unlinked_inodes
;
1251 std::vector
<CInode
*> rejoin_recover_q
, rejoin_check_q
;
1252 std::list
<SimpleLock
*> rejoin_eval_locks
;
1253 MDSContext::vec rejoin_waiters
;
1255 std::unique_ptr
<MDSContext
> rejoin_done
;
1256 std::unique_ptr
<MDSContext
> resolve_done
;
1258 StrayManager stray_manager
;
1261 std::set
<inodeno_t
> replay_taken_inos
; // the inos have been taken when replaying
1263 // -- fragmenting --
1267 bool committed
= false;
1268 LogSegment
*ls
= nullptr;
1269 MDSContext::vec waiters
;
1270 frag_vec_t old_frags
;
1271 bufferlist rollback
;
1274 struct fragment_info_t
{
1275 fragment_info_t() {}
1276 bool is_fragmenting() { return !resultfrags
.empty(); }
1277 uint64_t get_tid() { return mdr
? mdr
->reqid
.tid
: 0; }
1279 std::vector
<CDir
*> dirs
;
1280 std::vector
<CDir
*> resultfrags
;
1282 std::set
<mds_rank_t
> notify_ack_waiting
;
1283 bool finishing
= false;
1285 // for deadlock detection
1286 bool all_frozen
= false;
1287 utime_t last_cum_auth_pins_change
;
1288 int last_cum_auth_pins
= 0;
1289 int num_remote_waiters
= 0; // number of remote authpin waiters
1292 typedef std::map
<dirfrag_t
,fragment_info_t
>::iterator fragment_info_iterator
;
1294 friend class EFragment
;
1295 friend class C_MDC_FragmentFrozen
;
1296 friend class C_MDC_FragmentMarking
;
1297 friend class C_MDC_FragmentPrep
;
1298 friend class C_MDC_FragmentStore
;
1299 friend class C_MDC_FragmentCommit
;
1300 friend class C_MDC_FragmentRollback
;
1301 friend class C_IO_MDC_FragmentPurgeOld
;
1304 static const unsigned int SUBTREES_COUNT_THRESHOLD
= 5;
1305 static const unsigned int SUBTREES_DEPTH_THRESHOLD
= 5;
1307 CInode
*get_stray() {
1308 return strays
[stray_index
];
1311 void identify_files_to_recover();
1313 std::pair
<bool, uint64_t> trim_lru(uint64_t count
, expiremap
& expiremap
);
1314 bool trim_dentry(CDentry
*dn
, expiremap
& expiremap
);
1315 void trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
);
1316 bool trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
&);
1317 void send_expire_messages(expiremap
& expiremap
);
1318 void trim_non_auth(); // trim out trimmable non-auth items
1320 void adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
1321 std::vector
<CDir
*>* frags
, MDSContext::vec
& waiters
, bool replay
);
1322 void adjust_dir_fragments(CInode
*diri
,
1323 const std::vector
<CDir
*>& srcfrags
,
1324 frag_t basefrag
, int bits
,
1325 std::vector
<CDir
*>* resultfrags
,
1326 MDSContext::vec
& waiters
,
1328 CDir
*force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
=true);
1329 void get_force_dirfrag_bound_set(const std::vector
<dirfrag_t
>& dfs
, std::set
<CDir
*>& bounds
);
1331 bool can_fragment(CInode
*diri
, const std::vector
<CDir
*>& dirs
);
1332 void fragment_freeze_dirs(const std::vector
<CDir
*>& dirs
);
1333 void fragment_mark_and_complete(MDRequestRef
& mdr
);
1334 void fragment_frozen(MDRequestRef
& mdr
, int r
);
1335 void fragment_unmark_unfreeze_dirs(const std::vector
<CDir
*>& dirs
);
1336 void fragment_drop_locks(fragment_info_t
&info
);
1337 void fragment_maybe_finish(const fragment_info_iterator
& it
);
1338 void dispatch_fragment_dir(MDRequestRef
& mdr
);
1339 void _fragment_logged(MDRequestRef
& mdr
);
1340 void _fragment_stored(MDRequestRef
& mdr
);
1341 void _fragment_committed(dirfrag_t f
, const MDRequestRef
& mdr
);
1342 void _fragment_old_purged(dirfrag_t f
, int bits
, const MDRequestRef
& mdr
);
1344 void handle_fragment_notify(const cref_t
<MMDSFragmentNotify
> &m
);
1345 void handle_fragment_notify_ack(const cref_t
<MMDSFragmentNotifyAck
> &m
);
1347 void add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frag
,
1348 LogSegment
*ls
, bufferlist
*rollback
=NULL
);
1349 void finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
);
1350 void rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
);
1352 void upkeep_main(void);
1354 uint64_t cache_memory_limit
;
1355 double cache_reservation
;
1356 double cache_health_threshold
;
1357 std::array
<CInode
*, NUM_STRAY
> strays
{}; // my stray dir
1359 bool export_ephemeral_distributed_config
;
1360 bool export_ephemeral_random_config
;
1361 unsigned export_ephemeral_dist_frag_bits
;
1363 // Stores the symlink target on the file object's head
1364 bool symlink_recovery
;
1366 // File size recovery
1367 RecoveryQueue recovery_queue
;
1370 std::set
<inodeno_t
> shutdown_exporting_strays
;
1371 std::pair
<dirfrag_t
, std::string
> shutdown_export_next
;
1373 bool opening_root
= false, open
= false;
1374 MDSContext::vec waiting_for_open
;
1377 SnapRealm
*global_snaprealm
= nullptr;
1379 std::map
<dirfrag_t
, ufragment
> uncommitted_fragments
;
1381 std::map
<dirfrag_t
,fragment_info_t
> fragments
;
1383 DecayCounter trim_counter
;
1385 std::thread upkeeper
;
1386 ceph::mutex upkeep_mutex
= ceph::make_mutex("MDCache::upkeep_mutex");
1387 ceph::condition_variable upkeep_cvar
;
1388 time upkeep_last_trim
= time::min();
1389 time upkeep_last_release
= time::min();
1390 std::atomic
<bool> upkeep_trim_shutdown
{false};
1393 class C_MDS_RetryRequest
: public MDSInternalContext
{
1397 C_MDS_RetryRequest(MDCache
*c
, MDRequestRef
& r
) :
1398 MDSInternalContext(c
->mds
), cache(c
), mdr(r
) {}
1399 void finish(int r
) override
;
1402 class CF_MDS_RetryRequestFactory
: public MDSContextFactory
{
1404 CF_MDS_RetryRequestFactory(MDCache
*cache
, MDRequestRef
&mdr
, bool dl
) :
1405 mdcache(cache
), mdr(mdr
), drop_locks(dl
) {}
1406 MDSContext
*build() override
;
1414 * Only for contexts called back from an I/O completion
1416 * Note: duplication of members wrt MDCacheContext, because
1417 * it'ls the lesser of two evils compared with introducing
1418 * yet another piece of (multiple) inheritance.
1420 class MDCacheIOContext
: public virtual MDSIOContextBase
{
1423 MDSRank
*get_mds() override
1425 ceph_assert(mdcache
!= NULL
);
1426 return mdcache
->mds
;
1429 explicit MDCacheIOContext(MDCache
*mdc_
, bool track
=true) :
1430 MDSIOContextBase(track
), mdcache(mdc_
) {}