1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef CEPH_MDCACHE_H
15 #define CEPH_MDCACHE_H
18 #include <string_view>
21 #include "common/DecayCounter.h"
22 #include "include/common_fwd.h"
23 #include "include/types.h"
24 #include "include/filepath.h"
25 #include "include/elist.h"
27 #include "messages/MCacheExpire.h"
28 #include "messages/MClientQuota.h"
29 #include "messages/MClientRequest.h"
30 #include "messages/MClientSnap.h"
31 #include "messages/MDentryLink.h"
32 #include "messages/MDentryUnlink.h"
33 #include "messages/MDirUpdate.h"
34 #include "messages/MDiscover.h"
35 #include "messages/MDiscoverReply.h"
36 #include "messages/MGatherCaps.h"
37 #include "messages/MGenericMessage.h"
38 #include "messages/MInodeFileCaps.h"
39 #include "messages/MLock.h"
40 #include "messages/MMDSCacheRejoin.h"
41 #include "messages/MMDSFindIno.h"
42 #include "messages/MMDSFindInoReply.h"
43 #include "messages/MMDSFragmentNotify.h"
44 #include "messages/MMDSFragmentNotifyAck.h"
45 #include "messages/MMDSOpenIno.h"
46 #include "messages/MMDSOpenInoReply.h"
47 #include "messages/MMDSResolve.h"
48 #include "messages/MMDSResolveAck.h"
49 #include "messages/MMDSSlaveRequest.h"
50 #include "messages/MMDSSnapUpdate.h"
52 #include "osdc/Filer.h"
56 #include "include/Context.h"
57 #include "events/EMetaBlob.h"
58 #include "RecoveryQueue.h"
59 #include "StrayManager.h"
60 #include "OpenFileTable.h"
61 #include "MDSContext.h"
75 // How many inodes currently in stray dentries
77 // How many stray dentries are currently delayed for purge due to refs
78 l_mdc_num_strays_delayed
,
79 // How many stray dentries are currently being enqueued for purge
80 l_mdc_num_strays_enqueuing
,
82 // How many dentries have ever been added to stray dir
84 // How many dentries have been passed on to PurgeQueue
85 l_mdc_strays_enqueued
,
86 // How many strays have been reintegrated?
87 l_mdc_strays_reintegrated
,
88 // How many strays have been migrated?
89 l_mdc_strays_migrated
,
91 // How many inode sizes currently being recovered
92 l_mdc_num_recovering_processing
,
93 // How many inodes currently waiting to have size recovered
94 l_mdc_num_recovering_enqueued
,
95 // How many inodes waiting with elevated priority for recovery
96 l_mdc_num_recovering_prioritized
,
97 // How many inodes ever started size recovery
98 l_mdc_recovery_started
,
99 // How many inodes ever completed size recovery
100 l_mdc_recovery_completed
,
102 l_mdss_ireq_enqueue_scrub
,
103 l_mdss_ireq_exportdir
,
105 l_mdss_ireq_fragmentdir
,
106 l_mdss_ireq_fragstats
,
107 l_mdss_ireq_inodestats
,
// Flags for path_traverse(). Each flag occupies its own bit.
// DISCOVER: rather than forwarding the request, look the path up from a
// different MDS and bring the results into the local cache as replicas.
static const int MDS_TRAVERSE_DISCOVER        = 1 << 0;
// PATH_LOCKED: keep traversing when an xlocked dentry is encountered.
static const int MDS_TRAVERSE_PATH_LOCKED     = 1 << 1;
// WANT_DENTRY: the caller wants the tail dentry; a null dentry is added if
// the tail dentry does not exist.
static const int MDS_TRAVERSE_WANT_DENTRY     = 1 << 2;
// WANT_AUTH: always forward to the auth MDS of the target inode (or of the
// tail dentry when WANT_DENTRY is also set).
static const int MDS_TRAVERSE_WANT_AUTH       = 1 << 3;
static const int MDS_TRAVERSE_RDLOCK_SNAP     = 1 << 4;
static const int MDS_TRAVERSE_RDLOCK_SNAP2    = 1 << 5;
static const int MDS_TRAVERSE_WANT_DIRLAYOUT  = 1 << 6;
static const int MDS_TRAVERSE_RDLOCK_PATH     = 1 << 7;
static const int MDS_TRAVERSE_XLOCK_DENTRY    = 1 << 8;
static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK = 1 << 9;
static const int MDS_TRAVERSE_CHECK_LOCKCACHE = 1 << 10;
// Flags for predirty_journal_parents(). Each flag occupies its own bit.
static const int PREDIRTY_PRIMARY = 1 << 0;  // primary dn: adjust nested accounting
static const int PREDIRTY_DIR     = 1 << 1;  // update the parent dir's mtime/size
static const int PREDIRTY_SHALLOW = 1 << 2;  // only go to the immediate parent (for easier rollback)
133 typedef std::map
<mds_rank_t
, ref_t
<MCacheExpire
>> expiremap
;
135 using clock
= ceph::coarse_mono_clock
;
136 using time
= ceph::coarse_mono_time
;
139 struct discover_info_t
{
143 basei
->put(MDSCacheObject::PIN_DISCOVERBASE
);
145 void pin_base(CInode
*b
) {
147 basei
->get(MDSCacheObject::PIN_DISCOVERBASE
);
154 snapid_t snap
= CEPH_NOSNAP
;
156 CInode
*basei
= nullptr;
157 bool want_base_dir
= false;
158 bool path_locked
= false;
161 // [reconnect/rejoin caps]
162 struct reconnected_cap_info_t
{
163 reconnected_cap_info_t() {}
164 inodeno_t realm_ino
= 0;
165 snapid_t snap_follows
= 0;
170 // -- find_ino_peer --
171 struct find_ino_peer_info_t
{
172 find_ino_peer_info_t() {}
175 MDSContext
*fin
= nullptr;
176 bool path_locked
= false;
177 mds_rank_t hint
= MDS_RANK_NONE
;
178 mds_rank_t checking
= MDS_RANK_NONE
;
179 set
<mds_rank_t
> checked
;
182 friend class C_MDC_RejoinOpenInoFinish
;
183 friend class C_MDC_RejoinSessionsOpened
;
186 friend class Migrator
;
187 friend class MDBalancer
;
189 // StrayManager needs to be able to remove_inode() from us
190 // when it is done purging
191 friend class StrayManager
;
193 explicit MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
);
196 bool forward_all_reqs_to_auth() const {
197 return forward_all_requests_to_auth
;
199 uint64_t cache_limit_memory(void) {
200 return cache_memory_limit
;
202 double cache_toofull_ratio(void) const {
203 double memory_reserve
= cache_memory_limit
*(1.0-cache_reservation
);
204 return fmax(0.0, (cache_size()-memory_reserve
)/memory_reserve
);
206 bool cache_toofull(void) const {
207 return cache_toofull_ratio() > 0.0;
209 uint64_t cache_size(void) const {
210 return mempool::get_pool(mempool::mds_co::id
).allocated_bytes();
212 bool cache_overfull(void) const {
213 return cache_size() > cache_memory_limit
*cache_health_threshold
;
216 void advance_stray() {
217 stray_index
= (stray_index
+1)%NUM_STRAY
;
221 * Call this when you know that a CDentry is ready to be passed
222 * on to StrayManager (i.e. this is a stray you've just created)
224 void notify_stray(CDentry
*dn
) {
225 ceph_assert(dn
->get_dir()->get_inode()->is_stray());
226 if (dn
->state_test(CDentry::STATE_PURGING
))
229 stray_manager
.eval_stray(dn
);
232 void maybe_eval_stray(CInode
*in
, bool delay
=false);
233 void clear_dirty_bits_for_stray(CInode
* diri
);
235 bool is_readonly() { return readonly
; }
236 void force_readonly();
238 static file_layout_t
gen_default_file_layout(const MDSMap
&mdsmap
);
239 static file_layout_t
gen_default_log_layout(const MDSMap
&mdsmap
);
241 void register_perfcounters();
243 void touch_client_lease(ClientLease
*r
, int pool
, utime_t ttl
) {
244 client_leases
[pool
].push_back(&r
->item_lease
);
248 void notify_stray_removed()
250 stray_manager
.notify_stray_removed();
253 void notify_stray_created()
255 stray_manager
.notify_stray_created();
258 void eval_remote(CDentry
*dn
)
260 stray_manager
.eval_remote(dn
);
263 void _send_discover(discover_info_t
& dis
);
264 discover_info_t
& _create_discover(mds_rank_t mds
) {
265 ceph_tid_t t
= ++discover_last_tid
;
266 discover_info_t
& d
= discovers
[t
];
272 void discover_base_ino(inodeno_t want_ino
, MDSContext
*onfinish
, mds_rank_t from
=MDS_RANK_NONE
);
273 void discover_dir_frag(CInode
*base
, frag_t approx_fg
, MDSContext
*onfinish
,
274 mds_rank_t from
=MDS_RANK_NONE
);
275 void discover_path(CInode
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
276 bool path_locked
=false, mds_rank_t from
=MDS_RANK_NONE
);
277 void discover_path(CDir
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
278 bool path_locked
=false);
279 void kick_discovers(mds_rank_t who
); // after a failure.
281 // adjust subtree auth specification
283 // imports/exports/nested_exports
284 // join/split subtrees as appropriate
285 bool is_subtrees() { return !subtrees
.empty(); }
287 void get_subtrees(T
& c
) {
288 if constexpr (std::is_same_v
<T
, std::vector
<CDir
*>>)
289 c
.reserve(c
.size() + subtrees
.size());
290 for (const auto& p
: subtrees
) {
291 c
.push_back(p
.first
);
294 void adjust_subtree_auth(CDir
*root
, mds_authority_t auth
, bool adjust_pop
=true);
295 void adjust_subtree_auth(CDir
*root
, mds_rank_t a
, mds_rank_t b
=CDIR_AUTH_UNKNOWN
) {
296 adjust_subtree_auth(root
, mds_authority_t(a
,b
));
298 void adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
);
299 void adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_rank_t a
) {
300 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
302 void adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bounds
, const mds_authority_t
&auth
);
303 void adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bounds
, mds_rank_t a
) {
304 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
306 void map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
);
307 void try_subtree_merge(CDir
*root
);
308 void try_subtree_merge_at(CDir
*root
, set
<CInode
*> *to_eval
, bool adjust_pop
=true);
309 void subtree_merge_writebehind_finish(CInode
*in
, MutationRef
& mut
);
310 void eval_subtree_root(CInode
*diri
);
311 CDir
*get_subtree_root(CDir
*dir
);
312 CDir
*get_projected_subtree_root(CDir
*dir
);
313 bool is_leaf_subtree(CDir
*dir
) {
314 ceph_assert(subtrees
.count(dir
));
315 return subtrees
[dir
].empty();
317 void remove_subtree(CDir
*dir
);
318 bool is_subtree(CDir
*root
) {
319 return subtrees
.count(root
);
321 void get_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
322 void get_wouldbe_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
323 void verify_subtree_bounds(CDir
*root
, const set
<CDir
*>& bounds
);
324 void verify_subtree_bounds(CDir
*root
, const list
<dirfrag_t
>& bounds
);
326 void project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
);
327 void adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
);
329 auto get_auth_subtrees() {
330 std::vector
<CDir
*> c
;
331 for (auto& p
: subtrees
) {
332 auto& root
= p
.first
;
333 if (root
->is_auth()) {
340 auto get_fullauth_subtrees() {
341 std::vector
<CDir
*> c
;
342 for (auto& p
: subtrees
) {
343 auto& root
= p
.first
;
344 if (root
->is_full_dir_auth()) {
350 auto num_subtrees_fullauth() const {
352 for (auto& p
: subtrees
) {
353 auto& root
= p
.first
;
354 if (root
->is_full_dir_auth()) {
361 auto num_subtrees_fullnonauth() const {
363 for (auto& p
: subtrees
) {
364 auto& root
= p
.first
;
365 if (root
->is_full_dir_nonauth()) {
372 auto num_subtrees() const {
373 return subtrees
.size();
376 int get_num_client_requests();
378 MDRequestRef
request_start(const cref_t
<MClientRequest
>& req
);
379 MDRequestRef
request_start_slave(metareqid_t rid
, __u32 attempt
, const cref_t
<Message
> &m
);
380 MDRequestRef
request_start_internal(int op
);
381 bool have_request(metareqid_t rid
) {
382 return active_requests
.count(rid
);
384 MDRequestRef
request_get(metareqid_t rid
);
385 void request_pin_ref(MDRequestRef
& r
, CInode
*ref
, vector
<CDentry
*>& trace
);
386 void request_finish(MDRequestRef
& mdr
);
387 void request_forward(MDRequestRef
& mdr
, mds_rank_t mds
, int port
=0);
388 void dispatch_request(MDRequestRef
& mdr
);
389 void request_drop_foreign_locks(MDRequestRef
& mdr
);
390 void request_drop_non_rdlocks(MDRequestRef
& r
);
391 void request_drop_locks(MDRequestRef
& r
);
392 void request_cleanup(MDRequestRef
& r
);
394 void request_kill(MDRequestRef
& r
); // called when session closes
396 // journal/snap helpers
397 CInode
*pick_inode_snap(CInode
*in
, snapid_t follows
);
398 CInode
*cow_inode(CInode
*in
, snapid_t last
);
399 void journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
, CDentry
*dn
,
400 snapid_t follows
=CEPH_NOSNAP
,
401 CInode
**pcow_inode
=0, CDentry::linkage_t
*dnl
=0);
402 void journal_cow_inode(MutationRef
& mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
,
403 CInode
**pcow_inode
=0);
404 void journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
);
406 void project_rstat_inode_to_frag(CInode
*cur
, CDir
*parent
, snapid_t first
,
407 int linkunlink
, SnapRealm
*prealm
);
408 void _project_rstat_inode_to_frag(CInode::mempool_inode
& inode
, snapid_t ofirst
, snapid_t last
,
409 CDir
*parent
, int linkunlink
, bool update_inode
);
410 void project_rstat_frag_to_inode(nest_info_t
& rstat
, nest_info_t
& accounted_rstat
,
411 snapid_t ofirst
, snapid_t last
,
412 CInode
*pin
, bool cow_head
);
413 void broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
= -1, bool quota_change
= false);
414 void predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
415 CInode
*in
, CDir
*parent
,
416 int flags
, int linkunlink
=0,
417 snapid_t follows
=CEPH_NOSNAP
);
420 void add_uncommitted_master(metareqid_t reqid
, LogSegment
*ls
, set
<mds_rank_t
> &slaves
, bool safe
=false) {
421 uncommitted_masters
[reqid
].ls
= ls
;
422 uncommitted_masters
[reqid
].slaves
= slaves
;
423 uncommitted_masters
[reqid
].safe
= safe
;
425 void wait_for_uncommitted_master(metareqid_t reqid
, MDSContext
*c
) {
426 uncommitted_masters
[reqid
].waiters
.push_back(c
);
428 bool have_uncommitted_master(metareqid_t reqid
, mds_rank_t from
) {
429 auto p
= uncommitted_masters
.find(reqid
);
430 return p
!= uncommitted_masters
.end() && p
->second
.slaves
.count(from
) > 0;
432 void log_master_commit(metareqid_t reqid
);
433 void logged_master_update(metareqid_t reqid
);
434 void _logged_master_commit(metareqid_t reqid
);
435 void committed_master_slave(metareqid_t r
, mds_rank_t from
);
436 void finish_committed_masters();
438 void add_uncommitted_slave(metareqid_t reqid
, LogSegment
*, mds_rank_t
, MDSlaveUpdate
*su
=nullptr);
439 void wait_for_uncommitted_slave(metareqid_t reqid
, MDSContext
*c
) {
440 uncommitted_slaves
.at(reqid
).waiters
.push_back(c
);
442 void finish_uncommitted_slave(metareqid_t reqid
, bool assert_exist
=true);
443 MDSlaveUpdate
* get_uncommitted_slave(metareqid_t reqid
, mds_rank_t master
);
444 void _logged_slave_commit(mds_rank_t from
, metareqid_t reqid
);
446 void set_recovery_set(set
<mds_rank_t
>& s
);
447 void handle_mds_failure(mds_rank_t who
);
448 void handle_mds_recovery(mds_rank_t who
);
450 void recalc_auth_bits(bool replay
);
451 void remove_inode_recursive(CInode
*in
);
453 bool is_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
454 auto p
= ambiguous_slave_updates
.find(master
);
455 return p
!= ambiguous_slave_updates
.end() && p
->second
.count(reqid
);
457 void add_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
458 ambiguous_slave_updates
[master
].insert(reqid
);
460 void remove_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
461 auto p
= ambiguous_slave_updates
.find(master
);
462 auto q
= p
->second
.find(reqid
);
463 ceph_assert(q
!= p
->second
.end());
465 if (p
->second
.empty())
466 ambiguous_slave_updates
.erase(p
);
469 void add_rollback(metareqid_t reqid
, mds_rank_t master
) {
470 resolve_need_rollback
[reqid
] = master
;
472 void finish_rollback(metareqid_t reqid
, MDRequestRef
& mdr
);
475 void add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
);
476 void add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
);
477 bool have_ambiguous_import(dirfrag_t base
) {
478 return my_ambiguous_imports
.count(base
);
480 void get_ambiguous_import_bounds(dirfrag_t base
, vector
<dirfrag_t
>& bounds
) {
481 ceph_assert(my_ambiguous_imports
.count(base
));
482 bounds
= my_ambiguous_imports
[base
];
484 void cancel_ambiguous_import(CDir
*);
485 void finish_ambiguous_import(dirfrag_t dirino
);
486 void resolve_start(MDSContext
*resolve_done_
);
487 void send_resolves();
488 void maybe_send_pending_resolves() {
489 if (resolves_pending
)
490 send_subtree_resolves();
493 void _move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
494 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
);
495 ESubtreeMap
*create_subtree_map();
497 void clean_open_file_lists();
498 void dump_openfiles(Formatter
*f
);
499 bool dump_inode(Formatter
*f
, uint64_t number
);
501 void rejoin_start(MDSContext
*rejoin_done_
);
502 void rejoin_gather_finish();
503 void rejoin_send_rejoins();
504 void rejoin_export_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
505 int target
=-1, bool drop_path
=false) {
506 auto& ex
= cap_exports
[ino
];
508 auto &_icr
= ex
.second
[client
] = icr
;
512 void rejoin_recovered_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
513 mds_rank_t frommds
=MDS_RANK_NONE
, bool drop_path
=false) {
514 auto &_icr
= cap_imports
[ino
][client
][frommds
] = icr
;
518 void rejoin_recovered_client(client_t client
, const entity_inst_t
& inst
) {
519 rejoin_client_map
.emplace(client
, inst
);
521 bool rejoin_has_cap_reconnect(inodeno_t ino
) const {
522 return cap_imports
.count(ino
);
524 void add_replay_ino_alloc(inodeno_t ino
) {
525 cap_imports_missing
.insert(ino
); // avoid opening ino during cache rejoin
527 const cap_reconnect_t
*get_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
528 if (cap_imports
.count(ino
) &&
529 cap_imports
[ino
].count(client
) &&
530 cap_imports
[ino
][client
].count(MDS_RANK_NONE
)) {
531 return &cap_imports
[ino
][client
][MDS_RANK_NONE
];
535 void remove_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
536 ceph_assert(cap_imports
[ino
].size() == 1);
537 ceph_assert(cap_imports
[ino
][client
].size() == 1);
538 cap_imports
.erase(ino
);
540 void wait_replay_cap_reconnect(inodeno_t ino
, MDSContext
*c
) {
541 cap_reconnect_waiters
[ino
].push_back(c
);
544 void add_reconnected_cap(client_t client
, inodeno_t ino
, const cap_reconnect_t
& icr
) {
545 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
546 info
.realm_ino
= inodeno_t(icr
.capinfo
.snaprealm
);
547 info
.snap_follows
= icr
.snap_follows
;
549 void set_reconnected_dirty_caps(client_t client
, inodeno_t ino
, int dirty
, bool snapflush
) {
550 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
551 info
.dirty_caps
|= dirty
;
553 info
.snapflush
= snapflush
;
555 void add_reconnected_snaprealm(client_t client
, inodeno_t ino
, snapid_t seq
) {
556 reconnected_snaprealms
[ino
][client
] = seq
;
559 void rejoin_open_ino_finish(inodeno_t ino
, int ret
);
560 void rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
);
561 void rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
);
562 bool process_imported_caps();
563 void choose_lock_states_and_reconnect_caps();
564 void prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
565 map
<client_t
,ref_t
<MClientSnap
>>& splits
);
566 void prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
, map
<client_t
,ref_t
<MClientSnap
>>& splits
);
567 void send_snaps(map
<client_t
,ref_t
<MClientSnap
>>& splits
);
568 Capability
* rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
);
569 void finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
570 map
<client_t
,ref_t
<MClientSnap
>>& updates
);
571 Capability
* try_reconnect_cap(CInode
*in
, Session
*session
);
572 void export_remaining_imported_caps();
574 void do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
575 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
576 int peer
, int p_flags
);
577 void do_delayed_cap_imports();
578 void rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
, client_t client
,
579 snapid_t snap_follows
);
580 void open_snaprealms();
582 bool open_undef_inodes_dirfrags();
583 void opened_undef_inode(CInode
*in
);
584 void opened_undef_dirfrag(CDir
*dir
) {
585 rejoin_undef_dirfrags
.erase(dir
);
588 void reissue_all_caps();
590 void start_files_to_recover();
591 void do_file_recover();
592 void queue_file_recover(CInode
*in
);
593 void _queued_file_recover_cow(CInode
*in
, MutationRef
& mut
);
595 void handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mds_map
);
601 CInode
*get_root() { return root
; }
602 CInode
*get_myin() { return myin
; }
604 size_t get_cache_size() { return lru
.lru_get_size(); }
607 std::pair
<bool, uint64_t> trim(uint64_t count
=0);
609 bool trim_non_auth_subtree(CDir
*directory
);
610 void standby_trim_segment(LogSegment
*ls
);
611 void try_trim_non_auth_subtree(CDir
*dir
);
612 bool can_trim_non_auth_dirfrag(CDir
*dir
) {
613 return my_ambiguous_imports
.count((dir
)->dirfrag()) == 0 &&
614 uncommitted_slave_rename_olddir
.count(dir
->inode
) == 0;
618 * For all unreferenced inodes, dirs, dentries below an inode, compose
619 * expiry messages. This is used when giving up all replicas of entities
620 * for an MDS peer in the 'stopping' state, such that the peer can
621 * empty its cache and finish shutting down.
623 * We have to make sure we're only expiring un-referenced items to
624 * avoid interfering with ongoing stray-movement (we can't distinguish
625 * between the "moving my strays" and "waiting for my cache to empty"
626 * phases within 'stopping')
628 * @return false if we completed cleanly, true if caller should stop
629 * expiring because we hit something with refs.
631 bool expire_recursive(CInode
*in
, expiremap
& expiremap
);
633 void trim_client_leases();
634 void check_memory_usage();
636 void shutdown_start();
637 void shutdown_check();
638 bool shutdown_pass();
639 bool shutdown(); // clear cache (i.e. at shutdown)
640 bool shutdown_export_strays();
641 void shutdown_export_stray_finish(inodeno_t ino
) {
642 if (shutdown_exporting_strays
.erase(ino
))
643 shutdown_export_strays();
647 bool have_inode(vinodeno_t vino
) {
648 if (vino
.snapid
== CEPH_NOSNAP
)
649 return inode_map
.count(vino
.ino
) ? true : false;
651 return snap_inode_map
.count(vino
) ? true : false;
653 bool have_inode(inodeno_t ino
, snapid_t snap
=CEPH_NOSNAP
) {
654 return have_inode(vinodeno_t(ino
, snap
));
656 CInode
* get_inode(vinodeno_t vino
) {
657 if (vino
.snapid
== CEPH_NOSNAP
) {
658 auto p
= inode_map
.find(vino
.ino
);
659 if (p
!= inode_map
.end())
662 auto p
= snap_inode_map
.find(vino
);
663 if (p
!= snap_inode_map
.end())
668 CInode
* get_inode(inodeno_t ino
, snapid_t s
=CEPH_NOSNAP
) {
669 return get_inode(vinodeno_t(ino
, s
));
671 CInode
* lookup_snap_inode(vinodeno_t vino
) {
672 auto p
= snap_inode_map
.lower_bound(vino
);
673 if (p
!= snap_inode_map
.end() &&
674 p
->second
->ino() == vino
.ino
&& p
->second
->first
<= vino
.snapid
)
679 CDir
* get_dirfrag(dirfrag_t df
) {
680 CInode
*in
= get_inode(df
.ino
);
683 return in
->get_dirfrag(df
.frag
);
685 CDir
* get_dirfrag(inodeno_t ino
, std::string_view dn
) {
686 CInode
*in
= get_inode(ino
);
689 frag_t fg
= in
->pick_dirfrag(dn
);
690 return in
->get_dirfrag(fg
);
692 CDir
* get_force_dirfrag(dirfrag_t df
, bool replay
) {
693 CInode
*diri
= get_inode(df
.ino
);
696 CDir
*dir
= force_dir_fragment(diri
, df
.frag
, replay
);
698 dir
= diri
->get_dirfrag(df
.frag
);
702 MDSCacheObject
*get_object(const MDSCacheObjectInfo
&info
);
704 void add_inode(CInode
*in
);
706 void remove_inode(CInode
*in
);
708 void touch_dentry(CDentry
*dn
) {
709 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
)) {
710 bottom_lru
.lru_midtouch(dn
);
715 lru
.lru_midtouch(dn
);
718 void touch_dentry_bottom(CDentry
*dn
) {
719 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
721 lru
.lru_bottouch(dn
);
725 void truncate_inode(CInode
*in
, LogSegment
*ls
);
726 void _truncate_inode(CInode
*in
, LogSegment
*ls
);
727 void truncate_inode_finish(CInode
*in
, LogSegment
*ls
);
728 void truncate_inode_logged(CInode
*in
, MutationRef
& mut
);
730 void add_recovered_truncate(CInode
*in
, LogSegment
*ls
);
731 void remove_recovered_truncate(CInode
*in
, LogSegment
*ls
);
732 void start_recovered_truncates();
734 // purge unsafe inodes
735 void start_purge_inodes();
736 void purge_inodes(const interval_set
<inodeno_t
>& i
, LogSegment
*ls
);
738 CDir
*get_auth_container(CDir
*in
);
739 CDir
*get_export_container(CDir
*dir
);
740 void find_nested_exports(CDir
*dir
, set
<CDir
*>& s
);
741 void find_nested_exports_under(CDir
*import
, CDir
*dir
, set
<CDir
*>& s
);
744 void create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
746 CInode
*create_system_inode(inodeno_t ino
, int mode
);
747 CInode
*create_root_inode();
749 void create_empty_hierarchy(MDSGather
*gather
);
750 void create_mydir_hierarchy(MDSGather
*gather
);
752 bool is_open() { return open
; }
753 void wait_for_open(MDSContext
*c
) {
754 waiting_for_open
.push_back(c
);
757 void open_root_inode(MDSContext
*c
);
759 void open_mydir_inode(MDSContext
*c
);
760 void open_mydir_frag(MDSContext
*c
);
761 void populate_mydir();
763 void _create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
);
764 void _create_system_file_finish(MutationRef
& mut
, CDentry
*dn
,
765 version_t dpv
, MDSContext
*fin
);
767 void open_foreign_mdsdir(inodeno_t ino
, MDSContext
*c
);
768 CDir
*get_stray_dir(CInode
*in
);
769 CDentry
*get_or_create_stray_dentry(CInode
*in
);
772 * Find the given dentry (and whether it exists or not), its ancestors,
773 * and get them all into memory and usable on this MDS. This function
774 * makes a best-effort attempt to load everything; if it needs to
775 * go away and do something then it will put the request on a waitlist.
776 * It prefers the mdr, then the req, then the fin. (At least one of these
779 * At least one of the params mdr, req, and fin must be non-null.
781 * @param mdr The MDRequest associated with the path. Can be null.
782 * @param cf A MDSContextFactory for waiter building.
783 * @param path The path to traverse to.
785 * @param flags Specifies different lookup behaviors.
786 * By default, path_traverse() forwards the request to the auth MDS if that
787 * is appropriate (ie, if it doesn't know the contents of a directory).
788 * MDS_TRAVERSE_DISCOVER: Instead of forwarding request, path_traverse()
789 * attempts to look up the path from a different MDS (and bring them into
790 * its cache as replicas).
791 * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when xlocked
792 * dentry is encountered.
793 * MDS_TRAVERSE_WANT_DENTRY: Caller wants tail dentry. Add a null dentry if
794 * tail dentry does not exist. Return 0 even if the tail dentry is null.
795 * MDS_TRAVERSE_WANT_AUTH: Always forward request to auth MDS of target inode
796 * or auth MDS of tail dentry (if MDS_TRAVERSE_WANT_DENTRY is set).
798 * @param pdnvec Data return parameter -- on success, contains a
799 * vector of dentries. On failure, is either empty or contains the
800 * full trace of traversable dentries.
801 * @param pin Data return parameter -- if successful, points to the inode
802 * associated with filepath. If unsuccessful, is null.
804 * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
805 * If it returns 1, the requester associated with this call has been placed
806 * on the appropriate waitlist, and it should unwind itself and back out.
807 * If it returns 2 the request has been forwarded, and again the requester
808 * should unwind itself and back out.
810 int path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
811 const filepath
& path
, int flags
,
812 vector
<CDentry
*> *pdnvec
, CInode
**pin
=nullptr);
814 CInode
*cache_traverse(const filepath
& path
);
816 void open_remote_dirfrag(CInode
*diri
, frag_t fg
, MDSContext
*fin
);
817 CInode
*get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
=false);
819 bool parallel_fetch(map
<inodeno_t
,filepath
>& pathmap
, set
<inodeno_t
>& missing
);
820 bool parallel_fetch_traverse_dir(inodeno_t ino
, filepath
& path
,
821 set
<CDir
*>& fetch_queue
, set
<inodeno_t
>& missing
,
822 C_GatherBuilder
&gather_bld
);
824 void open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
,
825 bool want_xlocked
=false);
826 void _open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
827 bool want_xlocked
, int r
);
829 void make_trace(vector
<CDentry
*>& trace
, CInode
*in
);
831 void kick_open_ino_peers(mds_rank_t who
);
832 void open_ino(inodeno_t ino
, int64_t pool
, MDSContext
*fin
,
833 bool want_replica
=true, bool want_xlocked
=false);
835 void find_ino_peers(inodeno_t ino
, MDSContext
*c
,
836 mds_rank_t hint
=MDS_RANK_NONE
, bool path_locked
=false);
837 void _do_find_ino_peer(find_ino_peer_info_t
& fip
);
838 void handle_find_ino(const cref_t
<MMDSFindIno
> &m
);
839 void handle_find_ino_reply(const cref_t
<MMDSFindInoReply
> &m
);
840 void kick_find_ino_peers(mds_rank_t who
);
842 SnapRealm
*get_global_snaprealm() const { return global_snaprealm
; }
843 void create_global_snaprealm();
844 void do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
=true);
845 void send_snap_update(CInode
*in
, version_t stid
, int snap_op
);
846 void handle_snap_update(const cref_t
<MMDSSnapUpdate
> &m
);
847 void notify_global_snaprealm_update(int snap_op
);
850 void fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
);
851 uint64_t get_num_strays() const { return stray_manager
.get_num_strays(); }
854 void dispatch(const cref_t
<Message
> &m
);
856 void encode_replica_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
);
857 void encode_replica_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
);
858 void encode_replica_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
861 void decode_replica_dir(CDir
*&dir
, bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
, MDSContext::vec
& finished
);
862 void decode_replica_dentry(CDentry
*&dn
, bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
);
863 void decode_replica_inode(CInode
*&in
, bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
);
865 void encode_replica_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
);
866 void decode_replica_stray(CDentry
*&straydn
, const bufferlist
&bl
, mds_rank_t from
);
869 void encode_remote_dentry_link(CDentry::linkage_t
*dnl
, bufferlist
& bl
);
870 void decode_remote_dentry_link(CDir
*dir
, CDentry
*dn
, bufferlist::const_iterator
& p
);
871 void send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
);
872 void send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
);
874 void wait_for_uncommitted_fragment(dirfrag_t dirfrag
, MDSContext
*c
) {
875 uncommitted_fragments
.at(dirfrag
).waiters
.push_back(c
);
877 bool is_any_uncommitted_fragment() const {
878 return !uncommitted_fragments
.empty();
880 void wait_for_uncommitted_fragments(MDSGather
*gather
);
881 void rollback_uncommitted_fragments();
883 void split_dir(CDir
*dir
, int byn
);
884 void merge_dir(CInode
*diri
, frag_t fg
);
886 void find_stale_fragment_freeze();
887 void fragment_freeze_inc_num_waiters(CDir
*dir
);
888 bool fragment_are_all_frozen(CDir
*dir
);
889 int get_num_fragmenting_dirs() { return fragments
.size(); }
892 //int send_inode_updates(CInode *in);
893 //void handle_inode_update(MInodeUpdate *m);
895 int send_dir_updates(CDir
*in
, bool bcast
=false);
896 void handle_dir_update(const cref_t
<MDirUpdate
> &m
);
898 // -- cache expiration --
899 void handle_cache_expire(const cref_t
<MCacheExpire
> &m
);
900 void process_delayed_expire(CDir
*dir
);
901 void discard_delayed_expire(CDir
*dir
);
904 void handle_mdsmap(const MDSMap
&mdsmap
);
906 int dump_cache() { return dump_cache({}, nullptr); }
907 int dump_cache(std::string_view filename
);
908 int dump_cache(Formatter
*f
);
909 void dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
);
911 void cache_status(Formatter
*f
);
913 void dump_resolve_status(Formatter
*f
) const;
914 void dump_rejoin_status(Formatter
*f
) const;
918 void show_subtrees(int dbl
=10, bool force_print
=false);
920 CInode
*hack_pick_random_inode() {
921 ceph_assert(!inode_map
.empty());
922 int n
= rand() % inode_map
.size();
923 auto p
= inode_map
.begin();
928 void flush_dentry(std::string_view path
, Context
*fin
);
930 * Create and start an OP_ENQUEUE_SCRUB
932 void enqueue_scrub(std::string_view path
, std::string_view tag
,
933 bool force
, bool recursive
, bool repair
,
934 Formatter
*f
, Context
*fin
);
935 void repair_inode_stats(CInode
*diri
);
936 void repair_dirfrag_stats(CDir
*dir
);
937 void upgrade_inode_snaprealm(CInode
*in
);
943 LRU lru
; // dentry lru for expiring items from cache
944 LRU bottom_lru
; // dentries that should be trimmed ASAP
948 int num_shadow_inodes
= 0;
950 int num_inodes_with_caps
= 0;
952 unsigned max_dir_commit_size
;
954 file_layout_t default_file_layout
;
955 file_layout_t default_log_layout
;
957 // -- client leases --
958 static constexpr std::size_t client_lease_pools
= 3;
959 std::array
<float, client_lease_pools
> client_lease_durations
{5.0, 30.0, 300.0};
962 uint64_t last_cap_id
= 0;
964 map
<ceph_tid_t
, discover_info_t
> discovers
;
965 ceph_tid_t discover_last_tid
= 0;
968 map
<int, map
<inodeno_t
, MDSContext::vec
> > waiting_for_base_ino
;
970 map
<inodeno_t
,map
<client_t
, reconnected_cap_info_t
> > reconnected_caps
; // inode -> client -> snap_follows,realmino
971 map
<inodeno_t
,map
<client_t
, snapid_t
> > reconnected_snaprealms
; // realmino -> client -> realmseq
974 set
<CInode
*> rejoin_pending_snaprealms
;
975 // cap imports. delayed snap parent opens.
976 map
<client_t
,set
<CInode
*> > delayed_imported_caps
;
979 std::unique_ptr
<Migrator
> migrator
;
981 bool did_shutdown_log_cap
= false;
983 map
<ceph_tid_t
, find_ino_peer_info_t
> find_ino_peer
;
984 ceph_tid_t find_ino_peer_last_tid
= 0;
986 // delayed cache expire
987 map
<CDir
*, expiremap
> delayed_expire
; // subtree root -> expire msg
989 /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
990 std::set
<CInode
*> export_pin_queue
;
991 std::set
<CInode
*> export_pin_delayed_queue
;
993 OpenFileTable open_file_table
;
996 // track master requests whose slaves haven't acknowledged commit
999 set
<mds_rank_t
> slaves
;
1000 LogSegment
*ls
= nullptr;
1001 MDSContext::vec waiters
;
1003 bool committing
= false;
1004 bool recovering
= false;
1010 LogSegment
*ls
= nullptr;
1011 MDSlaveUpdate
*su
= nullptr;
1012 MDSContext::vec waiters
;
1015 struct open_ino_info_t
{
1016 open_ino_info_t() {}
1017 vector
<inode_backpointer_t
> ancestors
;
1018 set
<mds_rank_t
> checked
;
1019 mds_rank_t checking
= MDS_RANK_NONE
;
1020 mds_rank_t auth_hint
= MDS_RANK_NONE
;
1021 bool check_peers
= true;
1022 bool fetch_backtrace
= true;
1023 bool discover
= false;
1024 bool want_replica
= false;
1025 bool want_xlocked
= false;
1029 MDSContext::vec waiters
;
1032 friend struct C_MDC_OpenInoTraverseDir
;
1033 friend struct C_MDC_OpenInoParentOpened
;
1034 friend struct C_MDC_RetryScanStray
;
1036 friend class C_IO_MDC_OpenInoBacktraceFetched
;
1037 friend class C_MDC_Join
;
1038 friend class C_MDC_RespondInternalRequest
;
1040 friend class ESlaveUpdate
;
1041 friend class ECommitted
;
1043 void set_readonly() { readonly
= true; }
1045 void handle_resolve(const cref_t
<MMDSResolve
> &m
);
1046 void handle_resolve_ack(const cref_t
<MMDSResolveAck
> &m
);
1047 void process_delayed_resolve();
1048 void discard_delayed_resolve(mds_rank_t who
);
1049 void maybe_resolve_finish();
1050 void disambiguate_my_imports();
1051 void disambiguate_other_imports();
1052 void trim_unlinked_inodes();
1054 void send_slave_resolves();
1055 void send_subtree_resolves();
1056 void maybe_finish_slave_resolve();
1058 void rejoin_walk(CDir
*dir
, const ref_t
<MMDSCacheRejoin
> &rejoin
);
1059 void handle_cache_rejoin(const cref_t
<MMDSCacheRejoin
> &m
);
1060 void handle_cache_rejoin_weak(const cref_t
<MMDSCacheRejoin
> &m
);
1061 CInode
* rejoin_invent_inode(inodeno_t ino
, snapid_t last
);
1062 CDir
* rejoin_invent_dirfrag(dirfrag_t df
);
1063 void handle_cache_rejoin_strong(const cref_t
<MMDSCacheRejoin
> &m
);
1064 void rejoin_scour_survivor_replicas(mds_rank_t from
, const cref_t
<MMDSCacheRejoin
> &ack
,
1065 set
<vinodeno_t
>& acked_inodes
,
1066 set
<SimpleLock
*>& gather_locks
);
1067 void handle_cache_rejoin_ack(const cref_t
<MMDSCacheRejoin
> &m
);
1068 void rejoin_send_acks();
1069 void rejoin_trim_undef_inodes();
1070 void maybe_send_pending_rejoins() {
1071 if (rejoins_pending
)
1072 rejoin_send_rejoins();
1075 void touch_inode(CInode
*in
) {
1076 if (in
->get_parent_dn())
1077 touch_dentry(in
->get_projected_parent_dn());
1080 void inode_remove_replica(CInode
*in
, mds_rank_t rep
, bool rejoin
,
1081 set
<SimpleLock
*>& gather_locks
);
1082 void dentry_remove_replica(CDentry
*dn
, mds_rank_t rep
, set
<SimpleLock
*>& gather_locks
);
1084 void rename_file(CDentry
*srcdn
, CDentry
*destdn
);
1086 void _open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
);
1087 void _open_ino_parent_opened(inodeno_t ino
, int ret
);
1088 void _open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1089 void _open_ino_fetch_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
, CDir
*dir
, bool parent
);
1090 int open_ino_traverse_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
,
1091 const vector
<inode_backpointer_t
>& ancestors
,
1092 bool discover
, bool want_xlocked
, mds_rank_t
*hint
);
1093 void open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1094 void do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1095 void do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
);
1096 void handle_open_ino(const cref_t
<MMDSOpenIno
> &m
, int err
=0);
1097 void handle_open_ino_reply(const cref_t
<MMDSOpenInoReply
> &m
);
1099 void scan_stray_dir(dirfrag_t next
=dirfrag_t());
1101 void handle_discover(const cref_t
<MDiscover
> &dis
);
1102 void handle_discover_reply(const cref_t
<MDiscoverReply
> &m
);
1103 void handle_dentry_link(const cref_t
<MDentryLink
> &m
);
1104 void handle_dentry_unlink(const cref_t
<MDentryUnlink
> &m
);
1106 int dump_cache(std::string_view fn
, Formatter
*f
);
1108 void flush_dentry_work(MDRequestRef
& mdr
);
1110 * Resolve path to a dentry and pass it onto the ScrubStack.
1112 * TODO: return enough information to the original mdr formatter
1113 * and completion that they can subsequently check the progress of
1114 * this scrub (we won't block them on a whole scrub as it can take a very
1117 void enqueue_scrub_work(MDRequestRef
& mdr
);
1118 void recursive_scrub_finish(const ScrubHeaderRef
& header
);
1119 void repair_inode_stats_work(MDRequestRef
& mdr
);
1120 void repair_dirfrag_stats_work(MDRequestRef
& mdr
);
1121 void upgrade_inode_snaprealm_work(MDRequestRef
& mdr
);
1123 ceph::unordered_map
<inodeno_t
,CInode
*> inode_map
; // map of head inodes by ino
1124 map
<vinodeno_t
, CInode
*> snap_inode_map
; // map of snap inodes by vinodeno_t
1125 CInode
*root
= nullptr; // root inode
1126 CInode
*myin
= nullptr; // .ceph/mds%d dir
1128 bool readonly
= false;
1130 int stray_index
= 0;
1132 set
<CInode
*> base_inodes
;
1134 std::unique_ptr
<PerfCounters
> logger
;
1137 bool exceeded_size_limit
= false;
1138 std::array
<xlist
<ClientLease
*>, client_lease_pools
> client_leases
{};
1140 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
1141 map
<CDir
*,set
<CDir
*> > subtrees
;
1142 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > > projected_subtree_renames
; // renamed ino -> target dir
1145 ceph::unordered_map
<metareqid_t
, MDRequestRef
> active_requests
;
1148 set
<mds_rank_t
> recovery_set
;
1151 // from EImportStart w/o EImportFinish during journal replay
1152 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambiguous_imports
;
1153 // from MMDSResolves
1154 map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > > other_ambiguous_imports
;
1156 map
<CInode
*, int> uncommitted_slave_rename_olddir
; // slave: preserve the non-auth dir until seeing commit.
1157 map
<CInode
*, int> uncommitted_slave_unlink
; // slave: preserve the unlinked inode until seeing commit.
1159 map
<metareqid_t
, umaster
> uncommitted_masters
; // master: req -> slave set
1160 map
<metareqid_t
, uslave
> uncommitted_slaves
; // slave: preserve the slave req until seeing commit.
1162 set
<metareqid_t
> pending_masters
;
1163 map
<int, set
<metareqid_t
> > ambiguous_slave_updates
;
1165 bool resolves_pending
= false;
1166 set
<mds_rank_t
> resolve_gather
; // nodes i need resolves from
1167 set
<mds_rank_t
> resolve_ack_gather
; // nodes i need a resolve_ack from
1168 set
<version_t
> resolve_snapclient_commits
;
1169 map
<metareqid_t
, mds_rank_t
> resolve_need_rollback
; // rollbacks i'm writing to the journal
1170 map
<mds_rank_t
, cref_t
<MMDSResolve
>> delayed_resolve
;
1173 bool rejoins_pending
= false;
1174 set
<mds_rank_t
> rejoin_gather
; // nodes from whom i need a rejoin
1175 set
<mds_rank_t
> rejoin_sent
; // nodes i sent a rejoin to
1176 set
<mds_rank_t
> rejoin_ack_sent
; // nodes i sent a rejoin ack to
1177 set
<mds_rank_t
> rejoin_ack_gather
; // nodes from whom i need a rejoin ack
1178 map
<mds_rank_t
,map
<inodeno_t
,map
<client_t
,Capability::Import
> > > rejoin_imported_caps
;
1179 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > > rejoin_slave_exports
;
1181 map
<client_t
,entity_inst_t
> rejoin_client_map
;
1182 map
<client_t
,client_metadata_t
> rejoin_client_metadata_map
;
1183 map
<client_t
,pair
<Session
*,uint64_t> > rejoin_session_map
;
1185 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,cap_reconnect_t
> > > cap_exports
; // ino -> target, client -> capex
1187 map
<inodeno_t
,map
<client_t
,map
<mds_rank_t
,cap_reconnect_t
> > > cap_imports
; // ino -> client -> frommds -> capex
1188 set
<inodeno_t
> cap_imports_missing
;
1189 map
<inodeno_t
, MDSContext::vec
> cap_reconnect_waiters
;
1190 int cap_imports_num_opening
= 0;
1192 set
<CInode
*> rejoin_undef_inodes
;
1193 set
<CInode
*> rejoin_potential_updated_scatterlocks
;
1194 set
<CDir
*> rejoin_undef_dirfrags
;
1195 map
<mds_rank_t
, set
<CInode
*> > rejoin_unlinked_inodes
;
1197 vector
<CInode
*> rejoin_recover_q
, rejoin_check_q
;
1198 list
<SimpleLock
*> rejoin_eval_locks
;
1199 MDSContext::vec rejoin_waiters
;
1201 std::unique_ptr
<MDSContext
> rejoin_done
;
1202 std::unique_ptr
<MDSContext
> resolve_done
;
1204 ceph_tid_t open_ino_last_tid
= 0;
1205 map
<inodeno_t
,open_ino_info_t
> opening_inodes
;
1207 StrayManager stray_manager
;
1210 // -- fragmenting --
1214 bool committed
= false;
1215 LogSegment
*ls
= nullptr;
1216 MDSContext::vec waiters
;
1217 frag_vec_t old_frags
;
1218 bufferlist rollback
;
1221 struct fragment_info_t
{
1222 fragment_info_t() {}
1223 bool is_fragmenting() { return !resultfrags
.empty(); }
1224 uint64_t get_tid() { return mdr
? mdr
->reqid
.tid
: 0; }
1226 std::vector
<CDir
*> dirs
;
1227 std::vector
<CDir
*> resultfrags
;
1229 set
<mds_rank_t
> notify_ack_waiting
;
1230 bool finishing
= false;
1232 // for deadlock detection
1233 bool all_frozen
= false;
1234 utime_t last_cum_auth_pins_change
;
1235 int last_cum_auth_pins
= 0;
1236 int num_remote_waiters
= 0; // number of remote authpin waiters
1239 typedef map
<dirfrag_t
,fragment_info_t
>::iterator fragment_info_iterator
;
1241 friend class EFragment
;
1242 friend class C_MDC_FragmentFrozen
;
1243 friend class C_MDC_FragmentMarking
;
1244 friend class C_MDC_FragmentPrep
;
1245 friend class C_MDC_FragmentStore
;
1246 friend class C_MDC_FragmentCommit
;
1247 friend class C_IO_MDC_FragmentPurgeOld
;
1250 static const unsigned int SUBTREES_COUNT_THRESHOLD
= 5;
1251 static const unsigned int SUBTREES_DEPTH_THRESHOLD
= 5;
1253 CInode
*get_stray() {
1254 return strays
[stray_index
];
1257 void identify_files_to_recover();
1259 std::pair
<bool, uint64_t> trim_lru(uint64_t count
, expiremap
& expiremap
);
1260 bool trim_dentry(CDentry
*dn
, expiremap
& expiremap
);
1261 void trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
);
1262 bool trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
&);
1263 void send_expire_messages(expiremap
& expiremap
);
1264 void trim_non_auth(); // trim out trimmable non-auth items
1266 void adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
1267 std::vector
<CDir
*>* frags
, MDSContext::vec
& waiters
, bool replay
);
1268 void adjust_dir_fragments(CInode
*diri
,
1269 const std::vector
<CDir
*>& srcfrags
,
1270 frag_t basefrag
, int bits
,
1271 std::vector
<CDir
*>* resultfrags
,
1272 MDSContext::vec
& waiters
,
1274 CDir
*force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
=true);
1275 void get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
);
1277 bool can_fragment(CInode
*diri
, const std::vector
<CDir
*>& dirs
);
1278 void fragment_freeze_dirs(const std::vector
<CDir
*>& dirs
);
1279 void fragment_mark_and_complete(MDRequestRef
& mdr
);
1280 void fragment_frozen(MDRequestRef
& mdr
, int r
);
1281 void fragment_unmark_unfreeze_dirs(const std::vector
<CDir
*>& dirs
);
1282 void fragment_drop_locks(fragment_info_t
&info
);
1283 void fragment_maybe_finish(const fragment_info_iterator
& it
);
1284 void dispatch_fragment_dir(MDRequestRef
& mdr
);
1285 void _fragment_logged(MDRequestRef
& mdr
);
1286 void _fragment_stored(MDRequestRef
& mdr
);
1287 void _fragment_committed(dirfrag_t f
, const MDRequestRef
& mdr
);
1288 void _fragment_old_purged(dirfrag_t f
, int bits
, const MDRequestRef
& mdr
);
1290 void handle_fragment_notify(const cref_t
<MMDSFragmentNotify
> &m
);
1291 void handle_fragment_notify_ack(const cref_t
<MMDSFragmentNotifyAck
> &m
);
1293 void add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frag
,
1294 LogSegment
*ls
, bufferlist
*rollback
=NULL
);
1295 void finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
);
1296 void rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
);
1298 uint64_t cache_memory_limit
;
1299 double cache_reservation
;
1300 double cache_health_threshold
;
1301 bool forward_all_requests_to_auth
;
1302 std::array
<CInode
*, NUM_STRAY
> strays
{}; // my stray dir
1304 // File size recovery
1305 RecoveryQueue recovery_queue
;
1308 set
<inodeno_t
> shutdown_exporting_strays
;
1309 pair
<dirfrag_t
, string
> shutdown_export_next
;
1311 bool opening_root
= false, open
= false;
1312 MDSContext::vec waiting_for_open
;
1315 SnapRealm
*global_snaprealm
= nullptr;
1317 map
<dirfrag_t
, ufragment
> uncommitted_fragments
;
1319 map
<dirfrag_t
,fragment_info_t
> fragments
;
1321 DecayCounter trim_counter
;
1323 std::thread upkeeper
;
1324 ceph::mutex upkeep_mutex
= ceph::make_mutex("MDCache::upkeep_mutex");
1325 ceph::condition_variable upkeep_cvar
;
1326 time upkeep_last_trim
= time::min();
1327 time upkeep_last_release
= time::min();
1328 std::atomic
<bool> upkeep_trim_shutdown
{false};
1331 class C_MDS_RetryRequest
: public MDSInternalContext
{
1335 C_MDS_RetryRequest(MDCache
*c
, MDRequestRef
& r
);
1336 void finish(int r
) override
;