1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
14 #ifndef CEPH_MDCACHE_H
15 #define CEPH_MDCACHE_H
18 #include <string_view>
21 #include "common/DecayCounter.h"
22 #include "include/common_fwd.h"
23 #include "include/types.h"
24 #include "include/filepath.h"
25 #include "include/elist.h"
27 #include "messages/MCacheExpire.h"
28 #include "messages/MClientQuota.h"
29 #include "messages/MClientRequest.h"
30 #include "messages/MClientSnap.h"
31 #include "messages/MDentryLink.h"
32 #include "messages/MDentryUnlink.h"
33 #include "messages/MDirUpdate.h"
34 #include "messages/MDiscover.h"
35 #include "messages/MDiscoverReply.h"
36 #include "messages/MGatherCaps.h"
37 #include "messages/MGenericMessage.h"
38 #include "messages/MInodeFileCaps.h"
39 #include "messages/MLock.h"
40 #include "messages/MMDSCacheRejoin.h"
41 #include "messages/MMDSFindIno.h"
42 #include "messages/MMDSFindInoReply.h"
43 #include "messages/MMDSFragmentNotify.h"
44 #include "messages/MMDSFragmentNotifyAck.h"
45 #include "messages/MMDSOpenIno.h"
46 #include "messages/MMDSOpenInoReply.h"
47 #include "messages/MMDSResolve.h"
48 #include "messages/MMDSResolveAck.h"
49 #include "messages/MMDSPeerRequest.h"
50 #include "messages/MMDSSnapUpdate.h"
52 #include "osdc/Filer.h"
56 #include "include/Context.h"
57 #include "events/EMetaBlob.h"
58 #include "RecoveryQueue.h"
59 #include "StrayManager.h"
60 #include "OpenFileTable.h"
61 #include "MDSContext.h"
75 // How many inodes currently in stray dentries
77 // How many stray dentries are currently delayed for purge due to refs
78 l_mdc_num_strays_delayed
,
79 // How many stray dentries are currently being enqueued for purge
80 l_mdc_num_strays_enqueuing
,
82 // How many dentries have ever been added to stray dir
84 // How many dentries have been passed on to PurgeQueue
85 l_mdc_strays_enqueued
,
86 // How many strays have been reintegrated?
87 l_mdc_strays_reintegrated
,
88 // How many strays have been migrated?
89 l_mdc_strays_migrated
,
91 // How many inode sizes currently being recovered
92 l_mdc_num_recovering_processing
,
93 // How many inodes currently waiting to have size recovered
94 l_mdc_num_recovering_enqueued
,
95 // How many inodes waiting with elevated priority for recovery
96 l_mdc_num_recovering_prioritized
,
97 // How many inodes ever started size recovery
98 l_mdc_recovery_started
,
99 // How many inodes ever completed size recovery
100 l_mdc_recovery_completed
,
102 l_mdss_ireq_enqueue_scrub
,
103 l_mdss_ireq_exportdir
,
105 l_mdss_ireq_fragmentdir
,
106 l_mdss_ireq_fragstats
,
107 l_mdss_ireq_inodestats
,
112 // flags for path_traverse();
113 static const int MDS_TRAVERSE_DISCOVER
= (1 << 0);
114 static const int MDS_TRAVERSE_PATH_LOCKED
= (1 << 1);
115 static const int MDS_TRAVERSE_WANT_DENTRY
= (1 << 2);
116 static const int MDS_TRAVERSE_WANT_AUTH
= (1 << 3);
117 static const int MDS_TRAVERSE_RDLOCK_SNAP
= (1 << 4);
118 static const int MDS_TRAVERSE_RDLOCK_SNAP2
= (1 << 5);
119 static const int MDS_TRAVERSE_WANT_DIRLAYOUT
= (1 << 6);
120 static const int MDS_TRAVERSE_RDLOCK_PATH
= (1 << 7);
121 static const int MDS_TRAVERSE_XLOCK_DENTRY
= (1 << 8);
122 static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK
= (1 << 9);
123 static const int MDS_TRAVERSE_CHECK_LOCKCACHE
= (1 << 10);
126 // flags for predirty_journal_parents()
127 static const int PREDIRTY_PRIMARY
= 1; // primary dn, adjust nested accounting
128 static const int PREDIRTY_DIR
= 2; // update parent dir mtime/size
129 static const int PREDIRTY_SHALLOW
= 4; // only go to immediate parent (for easier rollback)
133 typedef std::map
<mds_rank_t
, ref_t
<MCacheExpire
>> expiremap
;
135 using clock
= ceph::coarse_mono_clock
;
136 using time
= ceph::coarse_mono_time
;
139 struct discover_info_t
{
143 basei
->put(MDSCacheObject::PIN_DISCOVERBASE
);
145 void pin_base(CInode
*b
) {
147 basei
->get(MDSCacheObject::PIN_DISCOVERBASE
);
154 snapid_t snap
= CEPH_NOSNAP
;
156 CInode
*basei
= nullptr;
157 bool want_base_dir
= false;
158 bool path_locked
= false;
161 // [reconnect/rejoin caps]
162 struct reconnected_cap_info_t
{
163 reconnected_cap_info_t() {}
164 inodeno_t realm_ino
= 0;
165 snapid_t snap_follows
= 0;
170 // -- find_ino_peer --
171 struct find_ino_peer_info_t
{
172 find_ino_peer_info_t() {}
175 MDSContext
*fin
= nullptr;
176 bool path_locked
= false;
177 mds_rank_t hint
= MDS_RANK_NONE
;
178 mds_rank_t checking
= MDS_RANK_NONE
;
179 set
<mds_rank_t
> checked
;
182 friend class C_MDC_RejoinOpenInoFinish
;
183 friend class C_MDC_RejoinSessionsOpened
;
186 friend class Migrator
;
187 friend class MDBalancer
;
189 // StrayManager needs to be able to remove_inode() from us
190 // when it is done purging
191 friend class StrayManager
;
193 explicit MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
);
196 uint64_t cache_limit_memory(void) {
197 return cache_memory_limit
;
199 double cache_toofull_ratio(void) const {
200 double memory_reserve
= cache_memory_limit
*(1.0-cache_reservation
);
201 return fmax(0.0, (cache_size()-memory_reserve
)/memory_reserve
);
203 bool cache_toofull(void) const {
204 return cache_toofull_ratio() > 0.0;
206 uint64_t cache_size(void) const {
207 return mempool::get_pool(mempool::mds_co::id
).allocated_bytes();
209 bool cache_overfull(void) const {
210 return cache_size() > cache_memory_limit
*cache_health_threshold
;
213 void advance_stray();
215 unsigned get_ephemeral_dist_frag_bits() const {
216 return export_ephemeral_dist_frag_bits
;
218 bool get_export_ephemeral_distributed_config(void) const {
219 return export_ephemeral_distributed_config
;
222 bool get_export_ephemeral_random_config(void) const {
223 return export_ephemeral_random_config
;
227 * Call this when you know that a CDentry is ready to be passed
228 * on to StrayManager (i.e. this is a stray you've just created)
230 void notify_stray(CDentry
*dn
) {
231 ceph_assert(dn
->get_dir()->get_inode()->is_stray());
232 if (dn
->state_test(CDentry::STATE_PURGING
))
235 stray_manager
.eval_stray(dn
);
238 mds_rank_t
hash_into_rank_bucket(inodeno_t ino
, frag_t fg
=0);
240 void maybe_eval_stray(CInode
*in
, bool delay
=false);
241 void clear_dirty_bits_for_stray(CInode
* diri
);
243 bool is_readonly() { return readonly
; }
244 void force_readonly();
246 static file_layout_t
gen_default_file_layout(const MDSMap
&mdsmap
);
247 static file_layout_t
gen_default_log_layout(const MDSMap
&mdsmap
);
249 void register_perfcounters();
251 void touch_client_lease(ClientLease
*r
, int pool
, utime_t ttl
) {
252 client_leases
[pool
].push_back(&r
->item_lease
);
256 void notify_stray_removed()
258 stray_manager
.notify_stray_removed();
261 void notify_stray_created()
263 stray_manager
.notify_stray_created();
266 void eval_remote(CDentry
*dn
)
268 stray_manager
.eval_remote(dn
);
271 void _send_discover(discover_info_t
& dis
);
272 discover_info_t
& _create_discover(mds_rank_t mds
) {
273 ceph_tid_t t
= ++discover_last_tid
;
274 discover_info_t
& d
= discovers
[t
];
280 void discover_base_ino(inodeno_t want_ino
, MDSContext
*onfinish
, mds_rank_t from
=MDS_RANK_NONE
);
281 void discover_dir_frag(CInode
*base
, frag_t approx_fg
, MDSContext
*onfinish
,
282 mds_rank_t from
=MDS_RANK_NONE
);
283 void discover_path(CInode
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
284 bool path_locked
=false, mds_rank_t from
=MDS_RANK_NONE
);
285 void discover_path(CDir
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
286 bool path_locked
=false);
287 void kick_discovers(mds_rank_t who
); // after a failure.
289 // adjust subtree auth specification
291 // imports/exports/nested_exports
292 // join/split subtrees as appropriate
293 bool is_subtrees() { return !subtrees
.empty(); }
295 void get_subtrees(T
& c
) {
296 if constexpr (std::is_same_v
<T
, std::vector
<CDir
*>>)
297 c
.reserve(c
.size() + subtrees
.size());
298 for (const auto& p
: subtrees
) {
299 c
.push_back(p
.first
);
302 void adjust_subtree_auth(CDir
*root
, mds_authority_t auth
, bool adjust_pop
=true);
303 void adjust_subtree_auth(CDir
*root
, mds_rank_t a
, mds_rank_t b
=CDIR_AUTH_UNKNOWN
) {
304 adjust_subtree_auth(root
, mds_authority_t(a
,b
));
306 void adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
);
307 void adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_rank_t a
) {
308 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
310 void adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bounds
, const mds_authority_t
&auth
);
311 void adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bounds
, mds_rank_t a
) {
312 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
314 void map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
);
315 void try_subtree_merge(CDir
*root
);
316 void try_subtree_merge_at(CDir
*root
, set
<CInode
*> *to_eval
, bool adjust_pop
=true);
317 void eval_subtree_root(CInode
*diri
);
318 CDir
*get_subtree_root(CDir
*dir
);
319 CDir
*get_projected_subtree_root(CDir
*dir
);
320 bool is_leaf_subtree(CDir
*dir
) {
321 ceph_assert(subtrees
.count(dir
));
322 return subtrees
[dir
].empty();
324 void remove_subtree(CDir
*dir
);
325 bool is_subtree(CDir
*root
) {
326 return subtrees
.count(root
);
328 void get_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
329 void get_wouldbe_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
330 void verify_subtree_bounds(CDir
*root
, const set
<CDir
*>& bounds
);
331 void verify_subtree_bounds(CDir
*root
, const list
<dirfrag_t
>& bounds
);
333 void project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
);
334 void adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
);
336 auto get_auth_subtrees() {
337 std::vector
<CDir
*> c
;
338 for (auto& p
: subtrees
) {
339 auto& root
= p
.first
;
340 if (root
->is_auth()) {
347 auto get_fullauth_subtrees() {
348 std::vector
<CDir
*> c
;
349 for (auto& p
: subtrees
) {
350 auto& root
= p
.first
;
351 if (root
->is_full_dir_auth()) {
357 auto num_subtrees_fullauth() const {
359 for (auto& p
: subtrees
) {
360 auto& root
= p
.first
;
361 if (root
->is_full_dir_auth()) {
368 auto num_subtrees_fullnonauth() const {
370 for (auto& p
: subtrees
) {
371 auto& root
= p
.first
;
372 if (root
->is_full_dir_nonauth()) {
379 auto num_subtrees() const {
380 return subtrees
.size();
383 int get_num_client_requests();
385 MDRequestRef
request_start(const cref_t
<MClientRequest
>& req
);
386 MDRequestRef
request_start_peer(metareqid_t rid
, __u32 attempt
, const cref_t
<Message
> &m
);
387 MDRequestRef
request_start_internal(int op
);
388 bool have_request(metareqid_t rid
) {
389 return active_requests
.count(rid
);
391 MDRequestRef
request_get(metareqid_t rid
);
392 void request_pin_ref(MDRequestRef
& r
, CInode
*ref
, vector
<CDentry
*>& trace
);
393 void request_finish(MDRequestRef
& mdr
);
394 void request_forward(MDRequestRef
& mdr
, mds_rank_t mds
, int port
=0);
395 void dispatch_request(MDRequestRef
& mdr
);
396 void request_drop_foreign_locks(MDRequestRef
& mdr
);
397 void request_drop_non_rdlocks(MDRequestRef
& r
);
398 void request_drop_locks(MDRequestRef
& r
);
399 void request_cleanup(MDRequestRef
& r
);
401 void request_kill(MDRequestRef
& r
); // called when session closes
403 // journal/snap helpers
404 CInode
*pick_inode_snap(CInode
*in
, snapid_t follows
);
405 CInode
*cow_inode(CInode
*in
, snapid_t last
);
406 void journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
, CDentry
*dn
,
407 snapid_t follows
=CEPH_NOSNAP
,
408 CInode
**pcow_inode
=0, CDentry::linkage_t
*dnl
=0);
409 void journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
);
411 void project_rstat_inode_to_frag(const MutationRef
& mut
,
412 CInode
*cur
, CDir
*parent
, snapid_t first
,
413 int linkunlink
, SnapRealm
*prealm
);
414 void _project_rstat_inode_to_frag(const CInode::mempool_inode
* inode
, snapid_t ofirst
, snapid_t last
,
415 CDir
*parent
, int linkunlink
, bool update_inode
);
416 void project_rstat_frag_to_inode(const nest_info_t
& rstat
, const nest_info_t
& accounted_rstat
,
417 snapid_t ofirst
, snapid_t last
, CInode
*pin
, bool cow_head
);
418 void broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
= -1, bool quota_change
= false);
419 void predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
420 CInode
*in
, CDir
*parent
,
421 int flags
, int linkunlink
=0,
422 snapid_t follows
=CEPH_NOSNAP
);
425 void add_uncommitted_leader(metareqid_t reqid
, LogSegment
*ls
, set
<mds_rank_t
> &peers
, bool safe
=false) {
426 uncommitted_leaders
[reqid
].ls
= ls
;
427 uncommitted_leaders
[reqid
].peers
= peers
;
428 uncommitted_leaders
[reqid
].safe
= safe
;
430 void wait_for_uncommitted_leader(metareqid_t reqid
, MDSContext
*c
) {
431 uncommitted_leaders
[reqid
].waiters
.push_back(c
);
433 bool have_uncommitted_leader(metareqid_t reqid
, mds_rank_t from
) {
434 auto p
= uncommitted_leaders
.find(reqid
);
435 return p
!= uncommitted_leaders
.end() && p
->second
.peers
.count(from
) > 0;
437 void log_leader_commit(metareqid_t reqid
);
438 void logged_leader_update(metareqid_t reqid
);
439 void _logged_leader_commit(metareqid_t reqid
);
440 void committed_leader_peer(metareqid_t r
, mds_rank_t from
);
441 void finish_committed_leaders();
443 void add_uncommitted_peer(metareqid_t reqid
, LogSegment
*, mds_rank_t
, MDPeerUpdate
*su
=nullptr);
444 void wait_for_uncommitted_peer(metareqid_t reqid
, MDSContext
*c
) {
445 uncommitted_peers
.at(reqid
).waiters
.push_back(c
);
447 void finish_uncommitted_peer(metareqid_t reqid
, bool assert_exist
=true);
448 MDPeerUpdate
* get_uncommitted_peer(metareqid_t reqid
, mds_rank_t leader
);
449 void _logged_peer_commit(mds_rank_t from
, metareqid_t reqid
);
451 void set_recovery_set(set
<mds_rank_t
>& s
);
452 void handle_mds_failure(mds_rank_t who
);
453 void handle_mds_recovery(mds_rank_t who
);
455 void recalc_auth_bits(bool replay
);
456 void remove_inode_recursive(CInode
*in
);
458 bool is_ambiguous_peer_update(metareqid_t reqid
, mds_rank_t leader
) {
459 auto p
= ambiguous_peer_updates
.find(leader
);
460 return p
!= ambiguous_peer_updates
.end() && p
->second
.count(reqid
);
462 void add_ambiguous_peer_update(metareqid_t reqid
, mds_rank_t leader
) {
463 ambiguous_peer_updates
[leader
].insert(reqid
);
465 void remove_ambiguous_peer_update(metareqid_t reqid
, mds_rank_t leader
) {
466 auto p
= ambiguous_peer_updates
.find(leader
);
467 auto q
= p
->second
.find(reqid
);
468 ceph_assert(q
!= p
->second
.end());
470 if (p
->second
.empty())
471 ambiguous_peer_updates
.erase(p
);
474 void add_rollback(metareqid_t reqid
, mds_rank_t leader
) {
475 resolve_need_rollback
[reqid
] = leader
;
477 void finish_rollback(metareqid_t reqid
, MDRequestRef
& mdr
);
480 void add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
);
481 void add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
);
482 bool have_ambiguous_import(dirfrag_t base
) {
483 return my_ambiguous_imports
.count(base
);
485 void get_ambiguous_import_bounds(dirfrag_t base
, vector
<dirfrag_t
>& bounds
) {
486 ceph_assert(my_ambiguous_imports
.count(base
));
487 bounds
= my_ambiguous_imports
[base
];
489 void cancel_ambiguous_import(CDir
*);
490 void finish_ambiguous_import(dirfrag_t dirino
);
491 void resolve_start(MDSContext
*resolve_done_
);
492 void send_resolves();
493 void maybe_send_pending_resolves() {
494 if (resolves_pending
)
495 send_subtree_resolves();
498 void _move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
499 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
);
500 ESubtreeMap
*create_subtree_map();
502 void clean_open_file_lists();
503 void dump_openfiles(Formatter
*f
);
504 bool dump_inode(Formatter
*f
, uint64_t number
);
506 void rejoin_start(MDSContext
*rejoin_done_
);
507 void rejoin_gather_finish();
508 void rejoin_send_rejoins();
509 void rejoin_export_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
510 int target
=-1, bool drop_path
=false) {
511 auto& ex
= cap_exports
[ino
];
513 auto &_icr
= ex
.second
[client
] = icr
;
517 void rejoin_recovered_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
518 mds_rank_t frommds
=MDS_RANK_NONE
, bool drop_path
=false) {
519 auto &_icr
= cap_imports
[ino
][client
][frommds
] = icr
;
523 void rejoin_recovered_client(client_t client
, const entity_inst_t
& inst
) {
524 rejoin_client_map
.emplace(client
, inst
);
526 bool rejoin_has_cap_reconnect(inodeno_t ino
) const {
527 return cap_imports
.count(ino
);
529 void add_replay_ino_alloc(inodeno_t ino
) {
530 cap_imports_missing
.insert(ino
); // avoid opening ino during cache rejoin
532 const cap_reconnect_t
*get_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
533 if (cap_imports
.count(ino
) &&
534 cap_imports
[ino
].count(client
) &&
535 cap_imports
[ino
][client
].count(MDS_RANK_NONE
)) {
536 return &cap_imports
[ino
][client
][MDS_RANK_NONE
];
540 void remove_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
541 ceph_assert(cap_imports
[ino
].size() == 1);
542 ceph_assert(cap_imports
[ino
][client
].size() == 1);
543 cap_imports
.erase(ino
);
545 void wait_replay_cap_reconnect(inodeno_t ino
, MDSContext
*c
) {
546 cap_reconnect_waiters
[ino
].push_back(c
);
549 void add_reconnected_cap(client_t client
, inodeno_t ino
, const cap_reconnect_t
& icr
) {
550 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
551 info
.realm_ino
= inodeno_t(icr
.capinfo
.snaprealm
);
552 info
.snap_follows
= icr
.snap_follows
;
554 void set_reconnected_dirty_caps(client_t client
, inodeno_t ino
, int dirty
, bool snapflush
) {
555 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
556 info
.dirty_caps
|= dirty
;
558 info
.snapflush
= snapflush
;
560 void add_reconnected_snaprealm(client_t client
, inodeno_t ino
, snapid_t seq
) {
561 reconnected_snaprealms
[ino
][client
] = seq
;
564 void rejoin_open_ino_finish(inodeno_t ino
, int ret
);
565 void rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
);
566 void rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
);
567 bool process_imported_caps();
568 void choose_lock_states_and_reconnect_caps();
569 void prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
570 map
<client_t
,ref_t
<MClientSnap
>>& splits
);
571 void prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
, map
<client_t
,ref_t
<MClientSnap
>>& splits
);
572 void send_snaps(map
<client_t
,ref_t
<MClientSnap
>>& splits
);
573 Capability
* rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
);
574 void finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
575 map
<client_t
,ref_t
<MClientSnap
>>& updates
);
576 Capability
* try_reconnect_cap(CInode
*in
, Session
*session
);
577 void export_remaining_imported_caps();
579 void do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
580 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
581 int peer
, int p_flags
);
582 void do_delayed_cap_imports();
583 void rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
, client_t client
,
584 snapid_t snap_follows
);
585 void open_snaprealms();
587 bool open_undef_inodes_dirfrags();
588 void opened_undef_inode(CInode
*in
);
589 void opened_undef_dirfrag(CDir
*dir
) {
590 rejoin_undef_dirfrags
.erase(dir
);
593 void reissue_all_caps();
595 void start_files_to_recover();
596 void do_file_recover();
597 void queue_file_recover(CInode
*in
);
598 void _queued_file_recover_cow(CInode
*in
, MutationRef
& mut
);
600 void handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
);
606 CInode
*get_root() { return root
; }
607 CInode
*get_myin() { return myin
; }
609 size_t get_cache_size() { return lru
.lru_get_size(); }
612 std::pair
<bool, uint64_t> trim(uint64_t count
=0);
614 bool trim_non_auth_subtree(CDir
*directory
);
615 void standby_trim_segment(LogSegment
*ls
);
616 void try_trim_non_auth_subtree(CDir
*dir
);
617 bool can_trim_non_auth_dirfrag(CDir
*dir
) {
618 return my_ambiguous_imports
.count((dir
)->dirfrag()) == 0 &&
619 uncommitted_peer_rename_olddir
.count(dir
->inode
) == 0;
623 * For all unreferenced inodes, dirs, dentries below an inode, compose
624 * expiry messages. This is used when giving up all replicas of entities
625 * for an MDS peer in the 'stopping' state, such that the peer can
626 * empty its cache and finish shutting down.
628 * We have to make sure we're only expiring un-referenced items to
629 * avoid interfering with ongoing stray-movement (we can't distinguish
630 * between the "moving my strays" and "waiting for my cache to empty"
631 * phases within 'stopping')
633 * @return false if we completed cleanly, true if caller should stop
634 * expiring because we hit something with refs.
636 bool expire_recursive(CInode
*in
, expiremap
& expiremap
);
638 void trim_client_leases();
639 void check_memory_usage();
641 void shutdown_start();
642 void shutdown_check();
643 bool shutdown_pass();
644 bool shutdown(); // clear cache (ie at shutodwn)
645 bool shutdown_export_strays();
646 void shutdown_export_stray_finish(inodeno_t ino
) {
647 if (shutdown_exporting_strays
.erase(ino
))
648 shutdown_export_strays();
652 bool have_inode(vinodeno_t vino
) {
653 if (vino
.snapid
== CEPH_NOSNAP
)
654 return inode_map
.count(vino
.ino
) ? true : false;
656 return snap_inode_map
.count(vino
) ? true : false;
658 bool have_inode(inodeno_t ino
, snapid_t snap
=CEPH_NOSNAP
) {
659 return have_inode(vinodeno_t(ino
, snap
));
661 CInode
* get_inode(vinodeno_t vino
) {
662 if (vino
.snapid
== CEPH_NOSNAP
) {
663 auto p
= inode_map
.find(vino
.ino
);
664 if (p
!= inode_map
.end())
667 auto p
= snap_inode_map
.find(vino
);
668 if (p
!= snap_inode_map
.end())
673 CInode
* get_inode(inodeno_t ino
, snapid_t s
=CEPH_NOSNAP
) {
674 return get_inode(vinodeno_t(ino
, s
));
676 CInode
* lookup_snap_inode(vinodeno_t vino
) {
677 auto p
= snap_inode_map
.lower_bound(vino
);
678 if (p
!= snap_inode_map
.end() &&
679 p
->second
->ino() == vino
.ino
&& p
->second
->first
<= vino
.snapid
)
684 CDir
* get_dirfrag(dirfrag_t df
) {
685 CInode
*in
= get_inode(df
.ino
);
688 return in
->get_dirfrag(df
.frag
);
690 CDir
* get_dirfrag(inodeno_t ino
, std::string_view dn
) {
691 CInode
*in
= get_inode(ino
);
694 frag_t fg
= in
->pick_dirfrag(dn
);
695 return in
->get_dirfrag(fg
);
697 CDir
* get_force_dirfrag(dirfrag_t df
, bool replay
) {
698 CInode
*diri
= get_inode(df
.ino
);
701 CDir
*dir
= force_dir_fragment(diri
, df
.frag
, replay
);
703 dir
= diri
->get_dirfrag(df
.frag
);
707 MDSCacheObject
*get_object(const MDSCacheObjectInfo
&info
);
709 void add_inode(CInode
*in
);
711 void remove_inode(CInode
*in
);
713 void touch_dentry(CDentry
*dn
) {
714 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
)) {
715 bottom_lru
.lru_midtouch(dn
);
720 lru
.lru_midtouch(dn
);
723 void touch_dentry_bottom(CDentry
*dn
) {
724 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
726 lru
.lru_bottouch(dn
);
730 void truncate_inode(CInode
*in
, LogSegment
*ls
);
731 void _truncate_inode(CInode
*in
, LogSegment
*ls
);
732 void truncate_inode_finish(CInode
*in
, LogSegment
*ls
);
733 void truncate_inode_logged(CInode
*in
, MutationRef
& mut
);
735 void add_recovered_truncate(CInode
*in
, LogSegment
*ls
);
736 void remove_recovered_truncate(CInode
*in
, LogSegment
*ls
);
737 void start_recovered_truncates();
739 // purge unsafe inodes
740 void start_purge_inodes();
741 void purge_inodes(const interval_set
<inodeno_t
>& i
, LogSegment
*ls
);
743 CDir
*get_auth_container(CDir
*in
);
744 CDir
*get_export_container(CDir
*dir
);
745 void find_nested_exports(CDir
*dir
, set
<CDir
*>& s
);
746 void find_nested_exports_under(CDir
*import
, CDir
*dir
, set
<CDir
*>& s
);
749 void create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
751 CInode
*create_system_inode(inodeno_t ino
, int mode
);
752 CInode
*create_root_inode();
754 void create_empty_hierarchy(MDSGather
*gather
);
755 void create_mydir_hierarchy(MDSGather
*gather
);
757 bool is_open() { return open
; }
758 void wait_for_open(MDSContext
*c
) {
759 waiting_for_open
.push_back(c
);
762 void open_root_inode(MDSContext
*c
);
764 void open_mydir_inode(MDSContext
*c
);
765 void open_mydir_frag(MDSContext
*c
);
766 void populate_mydir();
768 void _create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
);
769 void _create_system_file_finish(MutationRef
& mut
, CDentry
*dn
,
770 version_t dpv
, MDSContext
*fin
);
772 void open_foreign_mdsdir(inodeno_t ino
, MDSContext
*c
);
773 CDir
*get_stray_dir(CInode
*in
);
776 * Find the given dentry (and whether it exists or not), its ancestors,
777 * and get them all into memory and usable on this MDS. This function
778 * makes a best-effort attempt to load everything; if it needs to
779 * go away and do something then it will put the request on a waitlist.
780 * It prefers the mdr, then the req, then the fin. (At least one of these
783 * At least one of the params mdr, req, and fin must be non-null.
785 * @param mdr The MDRequest associated with the path. Can be null.
786 * @param cf A MDSContextFactory for waiter building.
787 * @param path The path to traverse to.
789 * @param flags Specifies different lookup behaviors.
790 * By default, path_traverse() forwards the request to the auth MDS if that
791 * is appropriate (ie, if it doesn't know the contents of a directory).
792 * MDS_TRAVERSE_DISCOVER: Instead of forwarding request, path_traverse()
793 * attempts to look up the path from a different MDS (and bring them into
794 * its cache as replicas).
795 * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when xlocked
796 * dentry is encountered.
797 * MDS_TRAVERSE_WANT_DENTRY: Caller wants tail dentry. Add a null dentry if
798 * tail dentry does not exist. return 0 even tail dentry is null.
799 * MDS_TRAVERSE_WANT_AUTH: Always forward request to auth MDS of target inode
800 * or auth MDS of tail dentry (MDS_TRAVERSE_WANT_DENTRY is set).
802 * @param pdnvec Data return parameter -- on success, contains a
803 * vector of dentries. On failure, is either empty or contains the
804 * full trace of traversable dentries.
805 * @param pin Data return parameter -- if successful, points to the inode
806 * associated with filepath. If unsuccessful, is null.
808 * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
809 * If it returns 1, the requester associated with this call has been placed
810 * on the appropriate waitlist, and it should unwind itself and back out.
811 * If it returns 2 the request has been forwarded, and again the requester
812 * should unwind itself and back out.
814 int path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
815 const filepath
& path
, int flags
,
816 vector
<CDentry
*> *pdnvec
, CInode
**pin
=nullptr);
818 CInode
*cache_traverse(const filepath
& path
);
820 void open_remote_dirfrag(CInode
*diri
, frag_t fg
, MDSContext
*fin
);
821 CInode
*get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
=false);
823 bool parallel_fetch(map
<inodeno_t
,filepath
>& pathmap
, set
<inodeno_t
>& missing
);
824 bool parallel_fetch_traverse_dir(inodeno_t ino
, filepath
& path
,
825 set
<CDir
*>& fetch_queue
, set
<inodeno_t
>& missing
,
826 C_GatherBuilder
&gather_bld
);
828 void open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
,
829 bool want_xlocked
=false);
830 void _open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
831 bool want_xlocked
, int r
);
833 void make_trace(vector
<CDentry
*>& trace
, CInode
*in
);
835 void kick_open_ino_peers(mds_rank_t who
);
836 void open_ino(inodeno_t ino
, int64_t pool
, MDSContext
*fin
,
837 bool want_replica
=true, bool want_xlocked
=false,
838 vector
<inode_backpointer_t
> *ancestors_hint
=nullptr,
839 mds_rank_t auth_hint
=MDS_RANK_NONE
);
841 void find_ino_peers(inodeno_t ino
, MDSContext
*c
,
842 mds_rank_t hint
=MDS_RANK_NONE
, bool path_locked
=false);
843 void _do_find_ino_peer(find_ino_peer_info_t
& fip
);
844 void handle_find_ino(const cref_t
<MMDSFindIno
> &m
);
845 void handle_find_ino_reply(const cref_t
<MMDSFindInoReply
> &m
);
846 void kick_find_ino_peers(mds_rank_t who
);
848 SnapRealm
*get_global_snaprealm() const { return global_snaprealm
; }
849 void create_global_snaprealm();
850 void do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
=true);
851 void send_snap_update(CInode
*in
, version_t stid
, int snap_op
);
852 void handle_snap_update(const cref_t
<MMDSSnapUpdate
> &m
);
853 void notify_global_snaprealm_update(int snap_op
);
856 void fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
);
857 uint64_t get_num_strays() const { return stray_manager
.get_num_strays(); }
860 void dispatch(const cref_t
<Message
> &m
);
862 void encode_replica_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
);
863 void encode_replica_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
);
864 void encode_replica_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
867 void decode_replica_dir(CDir
*&dir
, bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
, MDSContext::vec
& finished
);
868 void decode_replica_dentry(CDentry
*&dn
, bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
);
869 void decode_replica_inode(CInode
*&in
, bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
);
871 void encode_replica_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
);
872 void decode_replica_stray(CDentry
*&straydn
, const bufferlist
&bl
, mds_rank_t from
);
875 void encode_remote_dentry_link(CDentry::linkage_t
*dnl
, bufferlist
& bl
);
876 void decode_remote_dentry_link(CDir
*dir
, CDentry
*dn
, bufferlist::const_iterator
& p
);
877 void send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
);
878 void send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
);
880 void wait_for_uncommitted_fragment(dirfrag_t dirfrag
, MDSContext
*c
) {
881 uncommitted_fragments
.at(dirfrag
).waiters
.push_back(c
);
883 bool is_any_uncommitted_fragment() const {
884 return !uncommitted_fragments
.empty();
886 void wait_for_uncommitted_fragments(MDSContext
* finisher
);
887 void rollback_uncommitted_fragments();
889 void split_dir(CDir
*dir
, int byn
);
890 void merge_dir(CInode
*diri
, frag_t fg
);
892 void find_stale_fragment_freeze();
893 void fragment_freeze_inc_num_waiters(CDir
*dir
);
894 bool fragment_are_all_frozen(CDir
*dir
);
895 int get_num_fragmenting_dirs() { return fragments
.size(); }
898 //int send_inode_updates(CInode *in);
899 //void handle_inode_update(MInodeUpdate *m);
901 int send_dir_updates(CDir
*in
, bool bcast
=false);
902 void handle_dir_update(const cref_t
<MDirUpdate
> &m
);
904 // -- cache expiration --
905 void handle_cache_expire(const cref_t
<MCacheExpire
> &m
);
906 void process_delayed_expire(CDir
*dir
);
907 void discard_delayed_expire(CDir
*dir
);
910 void handle_mdsmap(const MDSMap
&mdsmap
, const MDSMap
&oldmap
);
912 int dump_cache() { return dump_cache({}, nullptr); }
913 int dump_cache(std::string_view filename
);
914 int dump_cache(Formatter
*f
);
915 void dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
);
917 void cache_status(Formatter
*f
);
919 void dump_resolve_status(Formatter
*f
) const;
920 void dump_rejoin_status(Formatter
*f
) const;
924 void show_subtrees(int dbl
=10, bool force_print
=false);
926 CInode
*hack_pick_random_inode() {
927 ceph_assert(!inode_map
.empty());
928 int n
= rand() % inode_map
.size();
929 auto p
= inode_map
.begin();
934 void flush_dentry(std::string_view path
, Context
*fin
);
936 * Create and start an OP_ENQUEUE_SCRUB
938 void enqueue_scrub(std::string_view path
, std::string_view tag
,
939 bool force
, bool recursive
, bool repair
,
940 Formatter
*f
, Context
*fin
);
941 void repair_inode_stats(CInode
*diri
);
942 void repair_dirfrag_stats(CDir
*dir
);
943 void rdlock_dirfrags_stats(CInode
*diri
, MDSInternalContext
*fin
);
949 LRU lru
; // dentry lru for expiring items from cache
950 LRU bottom_lru
; // dentries that should be trimmed ASAP
954 int num_shadow_inodes
= 0;
956 int num_inodes_with_caps
= 0;
958 unsigned max_dir_commit_size
;
960 file_layout_t default_file_layout
;
961 file_layout_t default_log_layout
;
963 // -- client leases --
964 static constexpr std::size_t client_lease_pools
= 3;
965 std::array
<float, client_lease_pools
> client_lease_durations
{5.0, 30.0, 300.0};
968 uint64_t last_cap_id
= 0;
970 map
<ceph_tid_t
, discover_info_t
> discovers
;
971 ceph_tid_t discover_last_tid
= 0;
974 map
<int, map
<inodeno_t
, MDSContext::vec
> > waiting_for_base_ino
;
976 map
<inodeno_t
,map
<client_t
, reconnected_cap_info_t
> > reconnected_caps
; // inode -> client -> snap_follows,realmino
977 map
<inodeno_t
,map
<client_t
, snapid_t
> > reconnected_snaprealms
; // realmino -> client -> realmseq
980 set
<CInode
*> rejoin_pending_snaprealms
;
981 // cap imports. delayed snap parent opens.
982 map
<client_t
,set
<CInode
*> > delayed_imported_caps
;
985 std::unique_ptr
<Migrator
> migrator
;
987 bool did_shutdown_log_cap
= false;
989 map
<ceph_tid_t
, find_ino_peer_info_t
> find_ino_peer
;
990 ceph_tid_t find_ino_peer_last_tid
= 0;
992 // delayed cache expire
993 map
<CDir
*, expiremap
> delayed_expire
; // subtree root -> expire msg
995 /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
996 std::set
<CInode
*> export_pin_queue
;
997 std::set
<CInode
*> export_pin_delayed_queue
;
998 std::set
<CInode
*> export_ephemeral_pins
;
1000 OpenFileTable open_file_table
;
1002 double export_ephemeral_random_max
= 0.0;
1005 // track leader requests whose peers haven't acknowledged commit
1008 set
<mds_rank_t
> peers
;
1009 LogSegment
*ls
= nullptr;
1010 MDSContext::vec waiters
;
1012 bool committing
= false;
1013 bool recovering
= false;
1019 LogSegment
*ls
= nullptr;
1020 MDPeerUpdate
*su
= nullptr;
1021 MDSContext::vec waiters
;
1024 struct open_ino_info_t
{
1025 open_ino_info_t() {}
1026 vector
<inode_backpointer_t
> ancestors
;
1027 set
<mds_rank_t
> checked
;
1028 mds_rank_t checking
= MDS_RANK_NONE
;
1029 mds_rank_t auth_hint
= MDS_RANK_NONE
;
1030 bool check_peers
= true;
1031 bool fetch_backtrace
= true;
1032 bool discover
= false;
1033 bool want_replica
= false;
1034 bool want_xlocked
= false;
1038 MDSContext::vec waiters
;
1041 friend struct C_MDC_OpenInoTraverseDir
;
1042 friend struct C_MDC_OpenInoParentOpened
;
1043 friend struct C_MDC_RetryScanStray
;
1045 friend class C_IO_MDC_OpenInoBacktraceFetched
;
1046 friend class C_MDC_Join
;
1047 friend class C_MDC_RespondInternalRequest
;
1049 friend class EPeerUpdate
;
1050 friend class ECommitted
;
1052 void set_readonly() { readonly
= true; }
1054 void handle_resolve(const cref_t
<MMDSResolve
> &m
);
1055 void handle_resolve_ack(const cref_t
<MMDSResolveAck
> &m
);
1056 void process_delayed_resolve();
1057 void discard_delayed_resolve(mds_rank_t who
);
1058 void maybe_resolve_finish();
1059 void disambiguate_my_imports();
1060 void disambiguate_other_imports();
1061 void trim_unlinked_inodes();
1063 void send_peer_resolves();
1064 void send_subtree_resolves();
1065 void maybe_finish_peer_resolve();
1067 void rejoin_walk(CDir
*dir
, const ref_t
<MMDSCacheRejoin
> &rejoin
);
1068 void handle_cache_rejoin(const cref_t
<MMDSCacheRejoin
> &m
);
1069 void handle_cache_rejoin_weak(const cref_t
<MMDSCacheRejoin
> &m
);
1070 CInode
* rejoin_invent_inode(inodeno_t ino
, snapid_t last
);
1071 CDir
* rejoin_invent_dirfrag(dirfrag_t df
);
1072 void handle_cache_rejoin_strong(const cref_t
<MMDSCacheRejoin
> &m
);
1073 void rejoin_scour_survivor_replicas(mds_rank_t from
, const cref_t
<MMDSCacheRejoin
> &ack
,
1074 set
<vinodeno_t
>& acked_inodes
,
1075 set
<SimpleLock
*>& gather_locks
);
1076 void handle_cache_rejoin_ack(const cref_t
<MMDSCacheRejoin
> &m
);
1077 void rejoin_send_acks();
1078 void rejoin_trim_undef_inodes();
1079 void maybe_send_pending_rejoins() {
1080 if (rejoins_pending
)
1081 rejoin_send_rejoins();
1084 void touch_inode(CInode
*in
) {
1085 if (in
->get_parent_dn())
1086 touch_dentry(in
->get_projected_parent_dn());
1089 void inode_remove_replica(CInode
*in
, mds_rank_t rep
, bool rejoin
,
1090 set
<SimpleLock
*>& gather_locks
);
1091 void dentry_remove_replica(CDentry
*dn
, mds_rank_t rep
, set
<SimpleLock
*>& gather_locks
);
1093 void rename_file(CDentry
*srcdn
, CDentry
*destdn
);
1095 void _open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
);
1096 void _open_ino_parent_opened(inodeno_t ino
, int ret
);
1097 void _open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1098 void _open_ino_fetch_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
, CDir
*dir
, bool parent
);
1099 int open_ino_traverse_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
,
1100 const vector
<inode_backpointer_t
>& ancestors
,
1101 bool discover
, bool want_xlocked
, mds_rank_t
*hint
);
1102 void open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1103 void do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1104 void do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
);
1105 void handle_open_ino(const cref_t
<MMDSOpenIno
> &m
, int err
=0);
1106 void handle_open_ino_reply(const cref_t
<MMDSOpenInoReply
> &m
);
1108 void scan_stray_dir(dirfrag_t next
=dirfrag_t());
1110 void handle_discover(const cref_t
<MDiscover
> &dis
);
1111 void handle_discover_reply(const cref_t
<MDiscoverReply
> &m
);
1112 void handle_dentry_link(const cref_t
<MDentryLink
> &m
);
1113 void handle_dentry_unlink(const cref_t
<MDentryUnlink
> &m
);
1115 int dump_cache(std::string_view fn
, Formatter
*f
);
1117 void flush_dentry_work(MDRequestRef
& mdr
);
1119 * Resolve path to a dentry and pass it onto the ScrubStack.
1121 * TODO: return enough information to the original mdr formatter
1122 * and completion that they can subsequeuntly check the progress of
1123 * this scrub (we won't block them on a whole scrub as it can take a very
1126 void enqueue_scrub_work(MDRequestRef
& mdr
);
1127 void repair_inode_stats_work(MDRequestRef
& mdr
);
1128 void repair_dirfrag_stats_work(MDRequestRef
& mdr
);
1129 void rdlock_dirfrags_stats_work(MDRequestRef
& mdr
);
1131 ceph::unordered_map
<inodeno_t
,CInode
*> inode_map
; // map of head inodes by ino
1132 map
<vinodeno_t
, CInode
*> snap_inode_map
; // map of snap inodes by ino
1133 CInode
*root
= nullptr; // root inode
1134 CInode
*myin
= nullptr; // .ceph/mds%d dir
1136 bool readonly
= false;
1138 int stray_index
= 0;
1139 int stray_fragmenting_index
= -1;
1141 set
<CInode
*> base_inodes
;
1143 std::unique_ptr
<PerfCounters
> logger
;
1146 std::array
<xlist
<ClientLease
*>, client_lease_pools
> client_leases
{};
1148 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
1149 map
<CDir
*,set
<CDir
*> > subtrees
;
1150 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > > projected_subtree_renames
; // renamed ino -> target dir
1153 ceph::unordered_map
<metareqid_t
, MDRequestRef
> active_requests
;
1156 set
<mds_rank_t
> recovery_set
;
1159 // from EImportStart w/o EImportFinish during journal replay
1160 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambiguous_imports
;
1161 // from MMDSResolves
1162 map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > > other_ambiguous_imports
;
1164 map
<CInode
*, int> uncommitted_peer_rename_olddir
; // peer: preserve the non-auth dir until seeing commit.
1165 map
<CInode
*, int> uncommitted_peer_unlink
; // peer: preserve the unlinked inode until seeing commit.
1167 map
<metareqid_t
, uleader
> uncommitted_leaders
; // leader: req -> peer set
1168 map
<metareqid_t
, upeer
> uncommitted_peers
; // peer: preserve the peer req until seeing commit.
1170 set
<metareqid_t
> pending_leaders
;
1171 map
<int, set
<metareqid_t
> > ambiguous_peer_updates
;
1173 bool resolves_pending
= false;
1174 set
<mds_rank_t
> resolve_gather
; // nodes i need resolves from
1175 set
<mds_rank_t
> resolve_ack_gather
; // nodes i need a resolve_ack from
1176 set
<version_t
> resolve_snapclient_commits
;
1177 map
<metareqid_t
, mds_rank_t
> resolve_need_rollback
; // rollbacks i'm writing to the journal
1178 map
<mds_rank_t
, cref_t
<MMDSResolve
>> delayed_resolve
;
1181 bool rejoins_pending
= false;
1182 set
<mds_rank_t
> rejoin_gather
; // nodes from whom i need a rejoin
1183 set
<mds_rank_t
> rejoin_sent
; // nodes i sent a rejoin to
1184 set
<mds_rank_t
> rejoin_ack_sent
; // nodes i sent a rejoin to
1185 set
<mds_rank_t
> rejoin_ack_gather
; // nodes from whom i need a rejoin ack
1186 map
<mds_rank_t
,map
<inodeno_t
,map
<client_t
,Capability::Import
> > > rejoin_imported_caps
;
1187 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > > rejoin_peer_exports
;
1189 map
<client_t
,entity_inst_t
> rejoin_client_map
;
1190 map
<client_t
,client_metadata_t
> rejoin_client_metadata_map
;
1191 map
<client_t
,pair
<Session
*,uint64_t> > rejoin_session_map
;
1193 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,cap_reconnect_t
> > > cap_exports
; // ino -> target, client -> capex
1195 map
<inodeno_t
,map
<client_t
,map
<mds_rank_t
,cap_reconnect_t
> > > cap_imports
; // ino -> client -> frommds -> capex
1196 set
<inodeno_t
> cap_imports_missing
;
1197 map
<inodeno_t
, MDSContext::vec
> cap_reconnect_waiters
;
1198 int cap_imports_num_opening
= 0;
1200 set
<CInode
*> rejoin_undef_inodes
;
1201 set
<CInode
*> rejoin_potential_updated_scatterlocks
;
1202 set
<CDir
*> rejoin_undef_dirfrags
;
1203 map
<mds_rank_t
, set
<CInode
*> > rejoin_unlinked_inodes
;
1205 vector
<CInode
*> rejoin_recover_q
, rejoin_check_q
;
1206 list
<SimpleLock
*> rejoin_eval_locks
;
1207 MDSContext::vec rejoin_waiters
;
1209 std::unique_ptr
<MDSContext
> rejoin_done
;
1210 std::unique_ptr
<MDSContext
> resolve_done
;
1212 ceph_tid_t open_ino_last_tid
= 0;
1213 map
<inodeno_t
,open_ino_info_t
> opening_inodes
;
1215 StrayManager stray_manager
;
1218 // -- fragmenting --
1222 bool committed
= false;
1223 LogSegment
*ls
= nullptr;
1224 MDSContext::vec waiters
;
1225 frag_vec_t old_frags
;
1226 bufferlist rollback
;
1229 struct fragment_info_t
{
1230 fragment_info_t() {}
1231 bool is_fragmenting() { return !resultfrags
.empty(); }
1232 uint64_t get_tid() { return mdr
? mdr
->reqid
.tid
: 0; }
1234 std::vector
<CDir
*> dirs
;
1235 std::vector
<CDir
*> resultfrags
;
1237 set
<mds_rank_t
> notify_ack_waiting
;
1238 bool finishing
= false;
1240 // for deadlock detection
1241 bool all_frozen
= false;
1242 utime_t last_cum_auth_pins_change
;
1243 int last_cum_auth_pins
= 0;
1244 int num_remote_waiters
= 0; // number of remote authpin waiters
1247 typedef map
<dirfrag_t
,fragment_info_t
>::iterator fragment_info_iterator
;
1249 friend class EFragment
;
1250 friend class C_MDC_FragmentFrozen
;
1251 friend class C_MDC_FragmentMarking
;
1252 friend class C_MDC_FragmentPrep
;
1253 friend class C_MDC_FragmentStore
;
1254 friend class C_MDC_FragmentCommit
;
1255 friend class C_MDC_FragmentRollback
;
1256 friend class C_IO_MDC_FragmentPurgeOld
;
1259 static const unsigned int SUBTREES_COUNT_THRESHOLD
= 5;
1260 static const unsigned int SUBTREES_DEPTH_THRESHOLD
= 5;
1262 CInode
*get_stray() {
1263 return strays
[stray_index
];
1266 void identify_files_to_recover();
1268 std::pair
<bool, uint64_t> trim_lru(uint64_t count
, expiremap
& expiremap
);
1269 bool trim_dentry(CDentry
*dn
, expiremap
& expiremap
);
1270 void trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
);
1271 bool trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
&);
1272 void send_expire_messages(expiremap
& expiremap
);
1273 void trim_non_auth(); // trim out trimmable non-auth items
1275 void adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
1276 std::vector
<CDir
*>* frags
, MDSContext::vec
& waiters
, bool replay
);
1277 void adjust_dir_fragments(CInode
*diri
,
1278 const std::vector
<CDir
*>& srcfrags
,
1279 frag_t basefrag
, int bits
,
1280 std::vector
<CDir
*>* resultfrags
,
1281 MDSContext::vec
& waiters
,
1283 CDir
*force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
=true);
1284 void get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
);
1286 bool can_fragment(CInode
*diri
, const std::vector
<CDir
*>& dirs
);
1287 void fragment_freeze_dirs(const std::vector
<CDir
*>& dirs
);
1288 void fragment_mark_and_complete(MDRequestRef
& mdr
);
1289 void fragment_frozen(MDRequestRef
& mdr
, int r
);
1290 void fragment_unmark_unfreeze_dirs(const std::vector
<CDir
*>& dirs
);
1291 void fragment_drop_locks(fragment_info_t
&info
);
1292 void fragment_maybe_finish(const fragment_info_iterator
& it
);
1293 void dispatch_fragment_dir(MDRequestRef
& mdr
);
1294 void _fragment_logged(MDRequestRef
& mdr
);
1295 void _fragment_stored(MDRequestRef
& mdr
);
1296 void _fragment_committed(dirfrag_t f
, const MDRequestRef
& mdr
);
1297 void _fragment_old_purged(dirfrag_t f
, int bits
, const MDRequestRef
& mdr
);
1299 void handle_fragment_notify(const cref_t
<MMDSFragmentNotify
> &m
);
1300 void handle_fragment_notify_ack(const cref_t
<MMDSFragmentNotifyAck
> &m
);
1302 void add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frag
,
1303 LogSegment
*ls
, bufferlist
*rollback
=NULL
);
1304 void finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
);
1305 void rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
);
1307 void upkeep_main(void);
1309 uint64_t cache_memory_limit
;
1310 double cache_reservation
;
1311 double cache_health_threshold
;
1312 std::array
<CInode
*, NUM_STRAY
> strays
{}; // my stray dir
1314 bool export_ephemeral_distributed_config
;
1315 bool export_ephemeral_random_config
;
1316 unsigned export_ephemeral_dist_frag_bits
;
1318 // File size recovery
1319 RecoveryQueue recovery_queue
;
1322 set
<inodeno_t
> shutdown_exporting_strays
;
1323 pair
<dirfrag_t
, string
> shutdown_export_next
;
1325 bool opening_root
= false, open
= false;
1326 MDSContext::vec waiting_for_open
;
1329 SnapRealm
*global_snaprealm
= nullptr;
1331 map
<dirfrag_t
, ufragment
> uncommitted_fragments
;
1333 map
<dirfrag_t
,fragment_info_t
> fragments
;
1335 DecayCounter trim_counter
;
1337 std::thread upkeeper
;
1338 ceph::mutex upkeep_mutex
= ceph::make_mutex("MDCache::upkeep_mutex");
1339 ceph::condition_variable upkeep_cvar
;
1340 time upkeep_last_trim
= time::min();
1341 time upkeep_last_release
= time::min();
1342 std::atomic
<bool> upkeep_trim_shutdown
{false};
1345 class C_MDS_RetryRequest
: public MDSInternalContext
{
1349 C_MDS_RetryRequest(MDCache
*c
, MDRequestRef
& r
) :
1350 MDSInternalContext(c
->mds
), cache(c
), mdr(r
) {}
1351 void finish(int r
) override
;
1354 class CF_MDS_RetryRequestFactory
: public MDSContextFactory
{
1356 CF_MDS_RetryRequestFactory(MDCache
*cache
, MDRequestRef
&mdr
, bool dl
) :
1357 mdcache(cache
), mdr(mdr
), drop_locks(dl
) {}
1358 MDSContext
*build() override
;