1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #ifndef CEPH_MDCACHE_H
18 #define CEPH_MDCACHE_H
20 #include <string_view>
22 #include "common/DecayCounter.h"
23 #include "include/types.h"
24 #include "include/filepath.h"
25 #include "include/elist.h"
27 #include "messages/MCacheExpire.h"
28 #include "messages/MClientQuota.h"
29 #include "messages/MClientRequest.h"
30 #include "messages/MClientSnap.h"
31 #include "messages/MDentryLink.h"
32 #include "messages/MDentryUnlink.h"
33 #include "messages/MDirUpdate.h"
34 #include "messages/MDiscover.h"
35 #include "messages/MDiscoverReply.h"
36 #include "messages/MGatherCaps.h"
37 #include "messages/MGenericMessage.h"
38 #include "messages/MInodeFileCaps.h"
39 #include "messages/MLock.h"
40 #include "messages/MMDSCacheRejoin.h"
41 #include "messages/MMDSFindIno.h"
42 #include "messages/MMDSFindInoReply.h"
43 #include "messages/MMDSFragmentNotify.h"
44 #include "messages/MMDSFragmentNotifyAck.h"
45 #include "messages/MMDSOpenIno.h"
46 #include "messages/MMDSOpenInoReply.h"
47 #include "messages/MMDSResolve.h"
48 #include "messages/MMDSResolveAck.h"
49 #include "messages/MMDSSlaveRequest.h"
50 #include "messages/MMDSSnapUpdate.h"
53 #include "osdc/Filer.h"
57 #include "include/Context.h"
58 #include "events/EMetaBlob.h"
59 #include "RecoveryQueue.h"
60 #include "StrayManager.h"
61 #include "OpenFileTable.h"
62 #include "MDSContext.h"
79 // How many inodes currently in stray dentries
81 // How many stray dentries are currently delayed for purge due to refs
82 l_mdc_num_strays_delayed
,
83 // How many stray dentries are currently being enqueued for purge
84 l_mdc_num_strays_enqueuing
,
86 // How many dentries have ever been added to stray dir
88 // How many dentries have been passed on to PurgeQueue
89 l_mdc_strays_enqueued
,
90 // How many strays have been reintegrated?
91 l_mdc_strays_reintegrated
,
92 // How many strays have been migrated?
93 l_mdc_strays_migrated
,
95 // How many inode sizes currently being recovered
96 l_mdc_num_recovering_processing
,
97 // How many inodes currently waiting to have size recovered
98 l_mdc_num_recovering_enqueued
,
99 // How many inodes waiting with elevated priority for recovery
100 l_mdc_num_recovering_prioritized
,
101 // How many inodes ever started size recovery
102 l_mdc_recovery_started
,
103 // How many inodes ever completed size recovery
104 l_mdc_recovery_completed
,
106 l_mdss_ireq_enqueue_scrub
,
107 l_mdss_ireq_exportdir
,
109 l_mdss_ireq_fragmentdir
,
110 l_mdss_ireq_fragstats
,
111 l_mdss_ireq_inodestats
,
// Flags for predirty_journal_parents().  The values are distinct bits
// (1, 2, 4); constexpr guarantees they are compile-time constants.
static constexpr int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
static constexpr int PREDIRTY_DIR = 2;     // update parent dir mtime/size
static constexpr int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
124 using clock
= ceph::coarse_mono_clock
;
125 using time
= ceph::coarse_mono_time
;
127 typedef std::map
<mds_rank_t
, MCacheExpire::ref
> expiremap
;
133 LRU lru
; // dentry lru for expiring items from cache
134 LRU bottom_lru
; // dentries that should be trimmed ASAP
136 ceph::unordered_map
<inodeno_t
,CInode
*> inode_map
; // map of head inodes by ino
137 map
<vinodeno_t
, CInode
*> snap_inode_map
; // map of snap inodes by ino
138 CInode
*root
; // root inode
139 CInode
*myin
; // .ceph/mds%d dir
142 void set_readonly() { readonly
= true; }
144 CInode
*strays
[NUM_STRAY
]; // my stray dir
147 CInode
*get_stray() {
148 return strays
[stray_index
];
151 set
<CInode
*> base_inodes
;
153 std::unique_ptr
<PerfCounters
> logger
;
157 bool exceeded_size_limit
;
160 uint64_t cache_inode_limit
;
161 uint64_t cache_memory_limit
;
162 double cache_reservation
;
163 double cache_health_threshold
;
166 uint64_t cache_limit_inodes(void) {
167 return cache_inode_limit
;
169 uint64_t cache_limit_memory(void) {
170 return cache_memory_limit
;
172 double cache_toofull_ratio(void) const {
173 double inode_reserve
= cache_inode_limit
*(1.0-cache_reservation
);
174 double memory_reserve
= cache_memory_limit
*(1.0-cache_reservation
);
175 return fmax(0.0, fmax((cache_size()-memory_reserve
)/memory_reserve
, cache_inode_limit
== 0 ? 0.0 : (CInode::count()-inode_reserve
)/inode_reserve
));
177 bool cache_toofull(void) const {
178 return cache_toofull_ratio() > 0.0;
180 uint64_t cache_size(void) const {
181 return mempool::get_pool(mempool::mds_co::id
).allocated_bytes();
183 bool cache_overfull(void) const {
184 return (cache_inode_limit
> 0 && CInode::count() > cache_inode_limit
*cache_health_threshold
) || (cache_size() > cache_memory_limit
*cache_health_threshold
);
187 void advance_stray() {
188 stray_index
= (stray_index
+1)%NUM_STRAY
;
192 * Call this when you know that a CDentry is ready to be passed
193 * on to StrayManager (i.e. this is a stray you've just created)
195 void notify_stray(CDentry
*dn
) {
196 ceph_assert(dn
->get_dir()->get_inode()->is_stray());
197 if (dn
->state_test(CDentry::STATE_PURGING
))
200 stray_manager
.eval_stray(dn
);
203 void maybe_eval_stray(CInode
*in
, bool delay
=false);
204 void clear_dirty_bits_for_stray(CInode
* diri
);
206 bool is_readonly() { return readonly
; }
207 void force_readonly();
211 int num_shadow_inodes
;
213 int num_inodes_with_caps
;
215 unsigned max_dir_commit_size
;
217 static file_layout_t
gen_default_file_layout(const MDSMap
&mdsmap
);
218 static file_layout_t
gen_default_log_layout(const MDSMap
&mdsmap
);
220 file_layout_t default_file_layout
;
221 file_layout_t default_log_layout
;
223 void register_perfcounters();
225 // -- client leases --
// Number of client lease pools; used as the bound of the per-pool arrays.
// constexpr (implicitly inline in C++17) keeps the constant usable even when
// it is ODR-used, without needing an out-of-line definition.
static constexpr int client_lease_pools = 3;
228 float client_lease_durations
[client_lease_pools
];
230 xlist
<ClientLease
*> client_leases
[client_lease_pools
];
232 void touch_client_lease(ClientLease
*r
, int pool
, utime_t ttl
) {
233 client_leases
[pool
].push_back(&r
->item_lease
);
237 void notify_stray_removed()
239 stray_manager
.notify_stray_removed();
242 void notify_stray_created()
244 stray_manager
.notify_stray_created();
247 void eval_remote(CDentry
*dn
)
249 stray_manager
.eval_remote(dn
);
253 uint64_t last_cap_id
;
258 struct discover_info_t
{
270 tid(0), mds(-1), snap(CEPH_NOSNAP
), basei(NULL
),
271 want_base_dir(false), want_xlocked(false) {}
274 basei
->put(MDSCacheObject::PIN_DISCOVERBASE
);
276 void pin_base(CInode
*b
) {
278 basei
->get(MDSCacheObject::PIN_DISCOVERBASE
);
282 map
<ceph_tid_t
, discover_info_t
> discovers
;
283 ceph_tid_t discover_last_tid
;
285 void _send_discover(discover_info_t
& dis
);
286 discover_info_t
& _create_discover(mds_rank_t mds
) {
287 ceph_tid_t t
= ++discover_last_tid
;
288 discover_info_t
& d
= discovers
[t
];
295 map
<int, map
<inodeno_t
, MDSContext::vec
> > waiting_for_base_ino
;
297 void discover_base_ino(inodeno_t want_ino
, MDSContext
*onfinish
, mds_rank_t from
=MDS_RANK_NONE
);
298 void discover_dir_frag(CInode
*base
, frag_t approx_fg
, MDSContext
*onfinish
,
299 mds_rank_t from
=MDS_RANK_NONE
);
300 void discover_path(CInode
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
301 bool want_xlocked
=false, mds_rank_t from
=MDS_RANK_NONE
);
302 void discover_path(CDir
*base
, snapid_t snap
, filepath want_path
, MDSContext
*onfinish
,
303 bool want_xlocked
=false);
304 void kick_discovers(mds_rank_t who
); // after a failure.
309 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
310 map
<CDir
*,set
<CDir
*> > subtrees
;
311 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > > projected_subtree_renames
; // renamed ino -> target dir
313 // adjust subtree auth specification
315 // imports/exports/nested_exports
316 // join/split subtrees as appropriate
318 bool is_subtrees() { return !subtrees
.empty(); }
320 void get_subtrees(T
& c
) {
321 if constexpr (std::is_same_v
<T
, std::vector
<CDir
*>>)
322 c
.reserve(c
.size() + subtrees
.size());
323 for (const auto& p
: subtrees
) {
324 c
.push_back(p
.first
);
327 void adjust_subtree_auth(CDir
*root
, mds_authority_t auth
, bool adjust_pop
=true);
328 void adjust_subtree_auth(CDir
*root
, mds_rank_t a
, mds_rank_t b
=CDIR_AUTH_UNKNOWN
) {
329 adjust_subtree_auth(root
, mds_authority_t(a
,b
));
331 void adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
);
332 void adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_rank_t a
) {
333 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
335 void adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bounds
, const mds_authority_t
&auth
);
336 void adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bounds
, mds_rank_t a
) {
337 adjust_bounded_subtree_auth(dir
, bounds
, mds_authority_t(a
, CDIR_AUTH_UNKNOWN
));
339 void map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
);
340 void try_subtree_merge(CDir
*root
);
341 void try_subtree_merge_at(CDir
*root
, set
<CInode
*> *to_eval
, bool adjust_pop
=true);
342 void subtree_merge_writebehind_finish(CInode
*in
, MutationRef
& mut
);
343 void eval_subtree_root(CInode
*diri
);
344 CDir
*get_subtree_root(CDir
*dir
);
345 CDir
*get_projected_subtree_root(CDir
*dir
);
346 bool is_leaf_subtree(CDir
*dir
) {
347 ceph_assert(subtrees
.count(dir
));
348 return subtrees
[dir
].empty();
350 void remove_subtree(CDir
*dir
);
351 bool is_subtree(CDir
*root
) {
352 return subtrees
.count(root
);
354 void get_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
355 void get_wouldbe_subtree_bounds(CDir
*root
, set
<CDir
*>& bounds
);
356 void verify_subtree_bounds(CDir
*root
, const set
<CDir
*>& bounds
);
357 void verify_subtree_bounds(CDir
*root
, const list
<dirfrag_t
>& bounds
);
359 void project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
);
360 void adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
);
362 auto get_auth_subtrees() {
363 std::vector
<CDir
*> c
;
364 for (auto& p
: subtrees
) {
365 auto& root
= p
.first
;
366 if (root
->is_auth()) {
373 auto get_fullauth_subtrees() {
374 std::vector
<CDir
*> c
;
375 for (auto& p
: subtrees
) {
376 auto& root
= p
.first
;
377 if (root
->is_full_dir_auth()) {
383 auto num_subtrees_fullauth() const {
385 for (auto& p
: subtrees
) {
386 auto& root
= p
.first
;
387 if (root
->is_full_dir_auth()) {
394 auto num_subtrees_fullnonauth() const {
396 for (auto& p
: subtrees
) {
397 auto& root
= p
.first
;
398 if (root
->is_full_dir_nonauth()) {
405 auto num_subtrees() const {
406 return subtrees
.size();
412 ceph::unordered_map
<metareqid_t
, MDRequestRef
> active_requests
;
415 int get_num_client_requests();
417 MDRequestRef
request_start(const MClientRequest::const_ref
& req
);
418 MDRequestRef
request_start_slave(metareqid_t rid
, __u32 attempt
, const Message::const_ref
&m
);
419 MDRequestRef
request_start_internal(int op
);
420 bool have_request(metareqid_t rid
) {
421 return active_requests
.count(rid
);
423 MDRequestRef
request_get(metareqid_t rid
);
424 void request_pin_ref(MDRequestRef
& r
, CInode
*ref
, vector
<CDentry
*>& trace
);
425 void request_finish(MDRequestRef
& mdr
);
426 void request_forward(MDRequestRef
& mdr
, mds_rank_t mds
, int port
=0);
427 void dispatch_request(MDRequestRef
& mdr
);
428 void request_drop_foreign_locks(MDRequestRef
& mdr
);
429 void request_drop_non_rdlocks(MDRequestRef
& r
);
430 void request_drop_locks(MDRequestRef
& r
);
431 void request_cleanup(MDRequestRef
& r
);
433 void request_kill(MDRequestRef
& r
); // called when session closes
435 // journal/snap helpers
436 CInode
*pick_inode_snap(CInode
*in
, snapid_t follows
);
437 CInode
*cow_inode(CInode
*in
, snapid_t last
);
438 void journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
, CDentry
*dn
,
439 snapid_t follows
=CEPH_NOSNAP
,
440 CInode
**pcow_inode
=0, CDentry::linkage_t
*dnl
=0);
441 void journal_cow_inode(MutationRef
& mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
,
442 CInode
**pcow_inode
=0);
443 void journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
=CEPH_NOSNAP
);
445 void project_rstat_inode_to_frag(CInode
*cur
, CDir
*parent
, snapid_t first
,
446 int linkunlink
, SnapRealm
*prealm
);
447 void _project_rstat_inode_to_frag(CInode::mempool_inode
& inode
, snapid_t ofirst
, snapid_t last
,
448 CDir
*parent
, int linkunlink
, bool update_inode
);
449 void project_rstat_frag_to_inode(nest_info_t
& rstat
, nest_info_t
& accounted_rstat
,
450 snapid_t ofirst
, snapid_t last
,
451 CInode
*pin
, bool cow_head
);
452 void broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
= -1, bool quota_change
= false);
453 void predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
454 CInode
*in
, CDir
*parent
,
455 int flags
, int linkunlink
=0,
456 snapid_t follows
=CEPH_NOSNAP
);
459 void add_uncommitted_master(metareqid_t reqid
, LogSegment
*ls
, set
<mds_rank_t
> &slaves
, bool safe
=false) {
460 uncommitted_masters
[reqid
].ls
= ls
;
461 uncommitted_masters
[reqid
].slaves
= slaves
;
462 uncommitted_masters
[reqid
].safe
= safe
;
464 void wait_for_uncommitted_master(metareqid_t reqid
, MDSContext
*c
) {
465 uncommitted_masters
[reqid
].waiters
.push_back(c
);
467 bool have_uncommitted_master(metareqid_t reqid
, mds_rank_t from
) {
468 auto p
= uncommitted_masters
.find(reqid
);
469 return p
!= uncommitted_masters
.end() && p
->second
.slaves
.count(from
) > 0;
471 void log_master_commit(metareqid_t reqid
);
472 void logged_master_update(metareqid_t reqid
);
473 void _logged_master_commit(metareqid_t reqid
);
474 void committed_master_slave(metareqid_t r
, mds_rank_t from
);
475 void finish_committed_masters();
477 void _logged_slave_commit(mds_rank_t from
, metareqid_t reqid
);
481 set
<mds_rank_t
> recovery_set
;
484 void set_recovery_set(set
<mds_rank_t
>& s
);
485 void handle_mds_failure(mds_rank_t who
);
486 void handle_mds_recovery(mds_rank_t who
);
490 // from EImportStart w/o EImportFinish during journal replay
491 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambiguous_imports
;
493 map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > > other_ambiguous_imports
;
495 map
<mds_rank_t
, map
<metareqid_t
, MDSlaveUpdate
*> > uncommitted_slave_updates
; // slave: for replay.
496 map
<CInode
*, int> uncommitted_slave_rename_olddir
; // slave: preserve the non-auth dir until seeing commit.
497 map
<CInode
*, int> uncommitted_slave_unlink
; // slave: preserve the unlinked inode until seeing commit.
499 // track master requests whose slaves haven't acknowledged commit
501 set
<mds_rank_t
> slaves
;
503 MDSContext::vec waiters
;
507 umaster() : ls(NULL
), safe(false), committing(false), recovering(false) {}
509 map
<metareqid_t
, umaster
> uncommitted_masters
; // master: req -> slave set
511 set
<metareqid_t
> pending_masters
;
512 map
<int, set
<metareqid_t
> > ambiguous_slave_updates
;
514 friend class ESlaveUpdate
;
515 friend class ECommitted
;
517 bool resolves_pending
;
518 set
<mds_rank_t
> resolve_gather
; // nodes i need resolves from
519 set
<mds_rank_t
> resolve_ack_gather
; // nodes i need a resolve_ack from
520 set
<version_t
> resolve_snapclient_commits
;
521 map
<metareqid_t
, mds_rank_t
> resolve_need_rollback
; // rollbacks i'm writing to the journal
522 map
<mds_rank_t
, MMDSResolve::const_ref
> delayed_resolve
;
524 void handle_resolve(const MMDSResolve::const_ref
&m
);
525 void handle_resolve_ack(const MMDSResolveAck::const_ref
&m
);
526 void process_delayed_resolve();
527 void discard_delayed_resolve(mds_rank_t who
);
528 void maybe_resolve_finish();
529 void disambiguate_my_imports();
530 void disambiguate_other_imports();
531 void trim_unlinked_inodes();
532 void add_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
, MDSlaveUpdate
*);
533 void finish_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
);
534 MDSlaveUpdate
* get_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
);
536 void send_slave_resolves();
537 void send_subtree_resolves();
538 void maybe_finish_slave_resolve();
541 void recalc_auth_bits(bool replay
);
542 void remove_inode_recursive(CInode
*in
);
544 bool is_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
545 auto p
= ambiguous_slave_updates
.find(master
);
546 return p
!= ambiguous_slave_updates
.end() && p
->second
.count(reqid
);
548 void add_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
549 ambiguous_slave_updates
[master
].insert(reqid
);
551 void remove_ambiguous_slave_update(metareqid_t reqid
, mds_rank_t master
) {
552 auto p
= ambiguous_slave_updates
.find(master
);
553 auto q
= p
->second
.find(reqid
);
554 ceph_assert(q
!= p
->second
.end());
556 if (p
->second
.empty())
557 ambiguous_slave_updates
.erase(p
);
560 void add_rollback(metareqid_t reqid
, mds_rank_t master
) {
561 resolve_need_rollback
[reqid
] = master
;
563 void finish_rollback(metareqid_t reqid
);
566 void add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
);
567 void add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
);
568 bool have_ambiguous_import(dirfrag_t base
) {
569 return my_ambiguous_imports
.count(base
);
571 void get_ambiguous_import_bounds(dirfrag_t base
, vector
<dirfrag_t
>& bounds
) {
572 ceph_assert(my_ambiguous_imports
.count(base
));
573 bounds
= my_ambiguous_imports
[base
];
575 void cancel_ambiguous_import(CDir
*);
576 void finish_ambiguous_import(dirfrag_t dirino
);
577 void resolve_start(MDSContext
*resolve_done_
);
578 void send_resolves();
579 void maybe_send_pending_resolves() {
580 if (resolves_pending
)
581 send_subtree_resolves();
584 void _move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
585 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
);
586 ESubtreeMap
*create_subtree_map();
589 void clean_open_file_lists();
590 void dump_openfiles(Formatter
*f
);
591 bool dump_inode(Formatter
*f
, uint64_t number
);
594 bool rejoins_pending
;
595 set
<mds_rank_t
> rejoin_gather
; // nodes from whom i need a rejoin
596 set
<mds_rank_t
> rejoin_sent
; // nodes i sent a rejoin to
597 set
<mds_rank_t
> rejoin_ack_sent
; // nodes i sent a rejoin ack to
598 set
<mds_rank_t
> rejoin_ack_gather
; // nodes from whom i need a rejoin ack
599 map
<mds_rank_t
,map
<inodeno_t
,map
<client_t
,Capability::Import
> > > rejoin_imported_caps
;
600 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > > rejoin_slave_exports
;
602 map
<client_t
,entity_inst_t
> rejoin_client_map
;
603 map
<client_t
,client_metadata_t
> rejoin_client_metadata_map
;
604 map
<client_t
,pair
<Session
*,uint64_t> > rejoin_session_map
;
606 map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,cap_reconnect_t
> > > cap_exports
; // ino -> target, client -> capex
608 map
<inodeno_t
,map
<client_t
,map
<mds_rank_t
,cap_reconnect_t
> > > cap_imports
; // ino -> client -> frommds -> capex
609 set
<inodeno_t
> cap_imports_missing
;
610 map
<inodeno_t
, MDSContext::vec
> cap_reconnect_waiters
;
611 int cap_imports_num_opening
;
613 set
<CInode
*> rejoin_undef_inodes
;
614 set
<CInode
*> rejoin_potential_updated_scatterlocks
;
615 set
<CDir
*> rejoin_undef_dirfrags
;
616 map
<mds_rank_t
, set
<CInode
*> > rejoin_unlinked_inodes
;
618 vector
<CInode
*> rejoin_recover_q
, rejoin_check_q
;
619 list
<SimpleLock
*> rejoin_eval_locks
;
620 MDSContext::vec rejoin_waiters
;
622 void rejoin_walk(CDir
*dir
, const MMDSCacheRejoin::ref
&rejoin
);
623 void handle_cache_rejoin(const MMDSCacheRejoin::const_ref
&m
);
624 void handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref
&m
);
625 CInode
* rejoin_invent_inode(inodeno_t ino
, snapid_t last
);
626 CDir
* rejoin_invent_dirfrag(dirfrag_t df
);
627 void handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref
&m
);
628 void rejoin_scour_survivor_replicas(mds_rank_t from
, const MMDSCacheRejoin::const_ref
&ack
,
629 set
<vinodeno_t
>& acked_inodes
,
630 set
<SimpleLock
*>& gather_locks
);
631 void handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref
&m
);
632 void rejoin_send_acks();
633 void rejoin_trim_undef_inodes();
634 void maybe_send_pending_rejoins() {
636 rejoin_send_rejoins();
638 std::unique_ptr
<MDSContext
> rejoin_done
;
639 std::unique_ptr
<MDSContext
> resolve_done
;
641 void rejoin_start(MDSContext
*rejoin_done_
);
642 void rejoin_gather_finish();
643 void rejoin_send_rejoins();
644 void rejoin_export_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
645 int target
=-1, bool drop_path
=false) {
646 auto& ex
= cap_exports
[ino
];
648 auto &_icr
= ex
.second
[client
] = icr
;
652 void rejoin_recovered_caps(inodeno_t ino
, client_t client
, const cap_reconnect_t
& icr
,
653 mds_rank_t frommds
=MDS_RANK_NONE
, bool drop_path
=false) {
654 auto &_icr
= cap_imports
[ino
][client
][frommds
] = icr
;
658 void rejoin_recovered_client(client_t client
, const entity_inst_t
& inst
) {
659 rejoin_client_map
.emplace(client
, inst
);
661 bool rejoin_has_cap_reconnect(inodeno_t ino
) const {
662 return cap_imports
.count(ino
);
664 void add_replay_ino_alloc(inodeno_t ino
) {
665 cap_imports_missing
.insert(ino
); // avoid opening ino during cache rejoin
667 const cap_reconnect_t
*get_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
668 if (cap_imports
.count(ino
) &&
669 cap_imports
[ino
].count(client
) &&
670 cap_imports
[ino
][client
].count(MDS_RANK_NONE
)) {
671 return &cap_imports
[ino
][client
][MDS_RANK_NONE
];
675 void remove_replay_cap_reconnect(inodeno_t ino
, client_t client
) {
676 ceph_assert(cap_imports
[ino
].size() == 1);
677 ceph_assert(cap_imports
[ino
][client
].size() == 1);
678 cap_imports
.erase(ino
);
680 void wait_replay_cap_reconnect(inodeno_t ino
, MDSContext
*c
) {
681 cap_reconnect_waiters
[ino
].push_back(c
);
684 // [reconnect/rejoin caps]
685 struct reconnected_cap_info_t
{
687 snapid_t snap_follows
;
690 reconnected_cap_info_t() :
691 realm_ino(0), snap_follows(0), dirty_caps(0), snapflush(false) {}
693 map
<inodeno_t
,map
<client_t
, reconnected_cap_info_t
> > reconnected_caps
; // inode -> client -> snap_follows,realmino
694 map
<inodeno_t
,map
<client_t
, snapid_t
> > reconnected_snaprealms
; // realmino -> client -> realmseq
696 void add_reconnected_cap(client_t client
, inodeno_t ino
, const cap_reconnect_t
& icr
) {
697 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
698 info
.realm_ino
= inodeno_t(icr
.capinfo
.snaprealm
);
699 info
.snap_follows
= icr
.snap_follows
;
701 void set_reconnected_dirty_caps(client_t client
, inodeno_t ino
, int dirty
, bool snapflush
) {
702 reconnected_cap_info_t
&info
= reconnected_caps
[ino
][client
];
703 info
.dirty_caps
|= dirty
;
705 info
.snapflush
= snapflush
;
707 void add_reconnected_snaprealm(client_t client
, inodeno_t ino
, snapid_t seq
) {
708 reconnected_snaprealms
[ino
][client
] = seq
;
711 friend class C_MDC_RejoinOpenInoFinish
;
712 friend class C_MDC_RejoinSessionsOpened
;
713 void rejoin_open_ino_finish(inodeno_t ino
, int ret
);
714 void rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
);
715 void rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
);
716 bool process_imported_caps();
717 void choose_lock_states_and_reconnect_caps();
718 void prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
719 map
<client_t
,MClientSnap::ref
>& splits
);
720 void prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
, map
<client_t
,MClientSnap::ref
>& splits
);
721 void send_snaps(map
<client_t
,MClientSnap::ref
>& splits
);
722 Capability
* rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
);
723 void finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
724 map
<client_t
,MClientSnap::ref
>& updates
);
725 Capability
* try_reconnect_cap(CInode
*in
, Session
*session
);
726 void export_remaining_imported_caps();
729 set
<CInode
*> rejoin_pending_snaprealms
;
730 // cap imports. delayed snap parent opens.
731 map
<client_t
,set
<CInode
*> > delayed_imported_caps
;
733 void do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
734 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
735 int peer
, int p_flags
);
736 void do_delayed_cap_imports();
737 void rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
, client_t client
,
738 snapid_t snap_follows
);
739 void open_snaprealms();
741 bool open_undef_inodes_dirfrags();
742 void opened_undef_inode(CInode
*in
);
743 void opened_undef_dirfrag(CDir
*dir
) {
744 rejoin_undef_dirfrags
.erase(dir
);
747 void reissue_all_caps();
751 friend class Migrator
;
752 friend class MDBalancer
;
754 // StrayManager needs to be able to remove_inode() from us
755 // when it is done purging
756 friend class StrayManager
;
758 // File size recovery
760 RecoveryQueue recovery_queue
;
761 void identify_files_to_recover();
763 void start_files_to_recover();
764 void do_file_recover();
765 void queue_file_recover(CInode
*in
);
766 void _queued_file_recover_cow(CInode
*in
, MutationRef
& mut
);
769 std::unique_ptr
<Migrator
> migrator
;
772 explicit MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
);
774 void handle_conf_change(const ConfigProxy
& conf
,
775 const std::set
<std::string
> &changed
,
776 const MDSMap
&mds_map
);
782 CInode
*get_root() { return root
; }
783 CInode
*get_myin() { return myin
; }
785 size_t get_cache_size() { return lru
.lru_get_size(); }
788 std::pair
<bool, uint64_t> trim(uint64_t count
=0);
790 std::pair
<bool, uint64_t> trim_lru(uint64_t count
, expiremap
& expiremap
);
791 bool trim_dentry(CDentry
*dn
, expiremap
& expiremap
);
792 void trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
);
793 bool trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
&);
794 void send_expire_messages(expiremap
& expiremap
);
795 void trim_non_auth(); // trim out trimmable non-auth items
797 bool trim_non_auth_subtree(CDir
*directory
);
798 void standby_trim_segment(LogSegment
*ls
);
799 void try_trim_non_auth_subtree(CDir
*dir
);
800 bool can_trim_non_auth_dirfrag(CDir
*dir
) {
801 return my_ambiguous_imports
.count((dir
)->dirfrag()) == 0 &&
802 uncommitted_slave_rename_olddir
.count(dir
->inode
) == 0;
806 * For all unreferenced inodes, dirs, dentries below an inode, compose
807 * expiry messages. This is used when giving up all replicas of entities
808 * for an MDS peer in the 'stopping' state, such that the peer can
809 * empty its cache and finish shutting down.
811 * We have to make sure we're only expiring un-referenced items to
812 * avoid interfering with ongoing stray-movement (we can't distinguish
813 * between the "moving my strays" and "waiting for my cache to empty"
814 * phases within 'stopping')
816 * @return false if we completed cleanly, true if caller should stop
817 * expiring because we hit something with refs.
819 bool expire_recursive(CInode
*in
, expiremap
& expiremap
);
821 void trim_client_leases();
822 void check_memory_usage();
826 set
<inodeno_t
> shutdown_exporting_strays
;
827 pair
<dirfrag_t
, string
> shutdown_export_next
;
829 void shutdown_start();
830 void shutdown_check();
831 bool shutdown_pass();
832 bool shutdown(); // clear cache (ie at shutdown)
833 bool shutdown_export_strays();
834 void shutdown_export_stray_finish(inodeno_t ino
) {
835 if (shutdown_exporting_strays
.erase(ino
))
836 shutdown_export_strays();
839 bool did_shutdown_log_cap
;
842 bool have_inode(vinodeno_t vino
) {
843 if (vino
.snapid
== CEPH_NOSNAP
)
844 return inode_map
.count(vino
.ino
) ? true : false;
846 return snap_inode_map
.count(vino
) ? true : false;
848 bool have_inode(inodeno_t ino
, snapid_t snap
=CEPH_NOSNAP
) {
849 return have_inode(vinodeno_t(ino
, snap
));
851 CInode
* get_inode(vinodeno_t vino
) {
852 if (vino
.snapid
== CEPH_NOSNAP
) {
853 auto p
= inode_map
.find(vino
.ino
);
854 if (p
!= inode_map
.end())
857 auto p
= snap_inode_map
.find(vino
);
858 if (p
!= snap_inode_map
.end())
863 CInode
* get_inode(inodeno_t ino
, snapid_t s
=CEPH_NOSNAP
) {
864 return get_inode(vinodeno_t(ino
, s
));
866 CInode
* lookup_snap_inode(vinodeno_t vino
) {
867 auto p
= snap_inode_map
.lower_bound(vino
);
868 if (p
!= snap_inode_map
.end() &&
869 p
->second
->ino() == vino
.ino
&& p
->second
->first
<= vino
.snapid
)
874 CDir
* get_dirfrag(dirfrag_t df
) {
875 CInode
*in
= get_inode(df
.ino
);
878 return in
->get_dirfrag(df
.frag
);
880 CDir
* get_dirfrag(inodeno_t ino
, std::string_view dn
) {
881 CInode
*in
= get_inode(ino
);
884 frag_t fg
= in
->pick_dirfrag(dn
);
885 return in
->get_dirfrag(fg
);
887 CDir
* get_force_dirfrag(dirfrag_t df
, bool replay
) {
888 CInode
*diri
= get_inode(df
.ino
);
891 CDir
*dir
= force_dir_fragment(diri
, df
.frag
, replay
);
893 dir
= diri
->get_dirfrag(df
.frag
);
897 MDSCacheObject
*get_object(const MDSCacheObjectInfo
&info
);
902 void add_inode(CInode
*in
);
904 void remove_inode(CInode
*in
);
906 void touch_inode(CInode
*in
) {
907 if (in
->get_parent_dn())
908 touch_dentry(in
->get_projected_parent_dn());
911 void touch_dentry(CDentry
*dn
) {
912 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
)) {
913 bottom_lru
.lru_midtouch(dn
);
918 lru
.lru_midtouch(dn
);
921 void touch_dentry_bottom(CDentry
*dn
) {
922 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
924 lru
.lru_bottouch(dn
);
928 void inode_remove_replica(CInode
*in
, mds_rank_t rep
, bool rejoin
,
929 set
<SimpleLock
*>& gather_locks
);
930 void dentry_remove_replica(CDentry
*dn
, mds_rank_t rep
, set
<SimpleLock
*>& gather_locks
);
932 void rename_file(CDentry
*srcdn
, CDentry
*destdn
);
936 void truncate_inode(CInode
*in
, LogSegment
*ls
);
937 void _truncate_inode(CInode
*in
, LogSegment
*ls
);
938 void truncate_inode_finish(CInode
*in
, LogSegment
*ls
);
939 void truncate_inode_logged(CInode
*in
, MutationRef
& mut
);
941 void add_recovered_truncate(CInode
*in
, LogSegment
*ls
);
942 void remove_recovered_truncate(CInode
*in
, LogSegment
*ls
);
943 void start_recovered_truncates();
947 CDir
*get_auth_container(CDir
*in
);
948 CDir
*get_export_container(CDir
*dir
);
949 void find_nested_exports(CDir
*dir
, set
<CDir
*>& s
);
950 void find_nested_exports_under(CDir
*import
, CDir
*dir
, set
<CDir
*>& s
);
954 bool opening_root
, open
;
955 MDSContext::vec waiting_for_open
;
959 void create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
961 CInode
*create_system_inode(inodeno_t ino
, int mode
);
962 CInode
*create_root_inode();
964 void create_empty_hierarchy(MDSGather
*gather
);
965 void create_mydir_hierarchy(MDSGather
*gather
);
967 bool is_open() { return open
; }
968 void wait_for_open(MDSContext
*c
) {
969 waiting_for_open
.push_back(c
);
972 void open_root_inode(MDSContext
*c
);
974 void open_mydir_inode(MDSContext
*c
);
975 void open_mydir_frag(MDSContext
*c
);
976 void populate_mydir();
978 void _create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
);
979 void _create_system_file_finish(MutationRef
& mut
, CDentry
*dn
,
980 version_t dpv
, MDSContext
*fin
);
982 void open_foreign_mdsdir(inodeno_t ino
, MDSContext
*c
);
983 CDir
*get_stray_dir(CInode
*in
);
984 CDentry
*get_or_create_stray_dentry(CInode
*in
);
987 * Find the given dentry (and whether it exists or not), its ancestors,
988 * and get them all into memory and usable on this MDS. This function
989 * makes a best-effort attempt to load everything; if it needs to
990 * go away and do something then it will put the request on a waitlist.
991 * It prefers the mdr, then the req, then the fin. (At least one of these
994 * At least one of the params mdr, req, and fin must be non-null.
996 * @param mdr The MDRequest associated with the path. Can be null.
997 * @param cf A MDSContextFactory for waiter building.
998 * @param path The path to traverse to.
999 * @param pdnvec Data return parameter -- on success, contains a
1000 * vector of dentries. On failure, is either empty or contains the
1001 * full trace of traversable dentries.
1002 * @param pin Data return parameter -- if successful, points to the inode
1003 * associated with filepath. If unsuccessful, is null.
1004 * @param onfail Specifies different lookup failure behaviors. If set to
1005 * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
1006 * dentries (instead of returning -ENOENT). If set to
1007 * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
1008 * MDS if that becomes appropriate (ie, if it doesn't know the contents
1009 * of a directory). If set to MDS_TRAVERSE_DISCOVER, it
1010 * will attempt to look up the path from a different MDS (and bring them
1011 * into its cache as replicas).
1013 * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
1014 * If it returns 1, the requester associated with this call has been placed
1015 * on the appropriate waitlist, and it should unwind itself and back out.
1016 * If it returns 2 the request has been forwarded, and again the requester
1017 * should unwind itself and back out.
1019 int path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
, const filepath
& path
,
1020 vector
<CDentry
*> *pdnvec
, CInode
**pin
, int onfail
);
1022 CInode
*cache_traverse(const filepath
& path
);
1024 void open_remote_dirfrag(CInode
*diri
, frag_t fg
, MDSContext
*fin
);
1025 CInode
*get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
=false);
1027 bool parallel_fetch(map
<inodeno_t
,filepath
>& pathmap
, set
<inodeno_t
>& missing
);
1028 bool parallel_fetch_traverse_dir(inodeno_t ino
, filepath
& path
,
1029 set
<CDir
*>& fetch_queue
, set
<inodeno_t
>& missing
,
1030 C_GatherBuilder
&gather_bld
);
1032 void open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
,
1033 bool want_xlocked
=false);
1034 void _open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
1035 bool want_xlocked
, int r
);
1037 void make_trace(vector
<CDentry
*>& trace
, CInode
*in
);
1040 struct open_ino_info_t
{
1041 vector
<inode_backpointer_t
> ancestors
;
1042 set
<mds_rank_t
> checked
;
1043 mds_rank_t checking
;
1044 mds_rank_t auth_hint
;
1046 bool fetch_backtrace
;
1053 MDSContext::vec waiters
;
1054 open_ino_info_t() : checking(MDS_RANK_NONE
), auth_hint(MDS_RANK_NONE
),
1055 check_peers(true), fetch_backtrace(true), discover(false),
1056 want_replica(false), want_xlocked(false), tid(0), pool(-1),
1059 ceph_tid_t open_ino_last_tid
;
1060 map
<inodeno_t
,open_ino_info_t
> opening_inodes
;
1062 void _open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
);
1063 void _open_ino_parent_opened(inodeno_t ino
, int ret
);
1064 void _open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1065 void _open_ino_fetch_dir(inodeno_t ino
, const MMDSOpenIno::const_ref
&m
, CDir
*dir
, bool parent
);
1066 int open_ino_traverse_dir(inodeno_t ino
, const MMDSOpenIno::const_ref
&m
,
1067 const vector
<inode_backpointer_t
>& ancestors
,
1068 bool discover
, bool want_xlocked
, mds_rank_t
*hint
);
1069 void open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1070 void do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
);
1071 void do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
);
1072 void handle_open_ino(const MMDSOpenIno::const_ref
&m
, int err
=0);
1073 void handle_open_ino_reply(const MMDSOpenInoReply::const_ref
&m
);
1074 friend class C_IO_MDC_OpenInoBacktraceFetched
;
1075 friend struct C_MDC_OpenInoTraverseDir
;
1076 friend struct C_MDC_OpenInoParentOpened
;
1079 void kick_open_ino_peers(mds_rank_t who
);
1080 void open_ino(inodeno_t ino
, int64_t pool
, MDSContext
*fin
,
1081 bool want_replica
=true, bool want_xlocked
=false);
1083 // -- find_ino_peer --
1084 struct find_ino_peer_info_t
{
1089 mds_rank_t checking
;
1090 set
<mds_rank_t
> checked
;
1092 find_ino_peer_info_t() : tid(0), fin(NULL
), hint(MDS_RANK_NONE
), checking(MDS_RANK_NONE
) {}
1095 map
<ceph_tid_t
, find_ino_peer_info_t
> find_ino_peer
;
1096 ceph_tid_t find_ino_peer_last_tid
;
1098 void find_ino_peers(inodeno_t ino
, MDSContext
*c
, mds_rank_t hint
=MDS_RANK_NONE
);
1099 void _do_find_ino_peer(find_ino_peer_info_t
& fip
);
1100 void handle_find_ino(const MMDSFindIno::const_ref
&m
);
1101 void handle_find_ino_reply(const MMDSFindInoReply::const_ref
&m
);
1102 void kick_find_ino_peers(mds_rank_t who
);
1106 SnapRealm
*global_snaprealm
;
1108 SnapRealm
*get_global_snaprealm() const { return global_snaprealm
; }
1109 void create_global_snaprealm();
1110 void do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
=true);
1111 void send_snap_update(CInode
*in
, version_t stid
, int snap_op
);
1112 void handle_snap_update(const MMDSSnapUpdate::const_ref
&m
);
1113 void notify_global_snaprealm_update(int snap_op
);
1117 void fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
);
1118 uint64_t get_num_strays() const { return stray_manager
.get_num_strays(); }
1121 void scan_stray_dir(dirfrag_t next
=dirfrag_t());
1122 StrayManager stray_manager
;
1123 friend struct C_MDC_RetryScanStray
;
1127 void dispatch(const Message::const_ref
&m
);
1131 void handle_discover(const MDiscover::const_ref
&dis
);
1132 void handle_discover_reply(const MDiscoverReply::const_ref
&m
);
1133 friend class C_MDC_Join
;
1136 void replicate_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
);
1137 void replicate_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
);
1138 void replicate_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
1141 CDir
* add_replica_dir(bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
, MDSContext::vec
& finished
);
1142 CDentry
*add_replica_dentry(bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
);
1143 CInode
*add_replica_inode(bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
);
1145 void replicate_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
);
1146 CDentry
*add_replica_stray(const bufferlist
&bl
, mds_rank_t from
);
1150 void send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
);
1151 void send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
);
1153 void handle_dentry_link(const MDentryLink::const_ref
&m
);
1154 void handle_dentry_unlink(const MDentryUnlink::const_ref
&m
);
1157 // -- fragmenting --
1163 MDSContext::vec waiters
;
1164 frag_vec_t old_frags
;
1165 bufferlist rollback
;
1166 ufragment() : bits(0), committed(false), ls(NULL
) {}
1168 map
<dirfrag_t
, ufragment
> uncommitted_fragments
;
1170 struct fragment_info_t
{
1173 list
<CDir
*> resultfrags
;
1175 set
<mds_rank_t
> notify_ack_waiting
;
1176 bool finishing
= false;
1178 // for deadlock detection
1179 bool all_frozen
= false;
1180 utime_t last_cum_auth_pins_change
;
1181 int last_cum_auth_pins
= 0;
1182 int num_remote_waiters
= 0; // number of remote authpin waiters
1183 fragment_info_t() {}
1184 bool is_fragmenting() { return !resultfrags
.empty(); }
1185 uint64_t get_tid() { return mdr
? mdr
->reqid
.tid
: 0; }
1187 map
<dirfrag_t
,fragment_info_t
> fragments
;
1188 typedef map
<dirfrag_t
,fragment_info_t
>::iterator fragment_info_iterator
;
1190 void adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
1191 list
<CDir
*>& frags
, MDSContext::vec
& waiters
, bool replay
);
1192 void adjust_dir_fragments(CInode
*diri
,
1193 list
<CDir
*>& srcfrags
,
1194 frag_t basefrag
, int bits
,
1195 list
<CDir
*>& resultfrags
,
1196 MDSContext::vec
& waiters
,
1198 CDir
*force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
=true);
1199 void get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
);
1201 bool can_fragment(CInode
*diri
, list
<CDir
*>& dirs
);
1202 void fragment_freeze_dirs(list
<CDir
*>& dirs
);
1203 void fragment_mark_and_complete(MDRequestRef
& mdr
);
1204 void fragment_frozen(MDRequestRef
& mdr
, int r
);
1205 void fragment_unmark_unfreeze_dirs(list
<CDir
*>& dirs
);
1206 void fragment_drop_locks(fragment_info_t
&info
);
1207 void fragment_maybe_finish(const fragment_info_iterator
& it
);
1208 void dispatch_fragment_dir(MDRequestRef
& mdr
);
1209 void _fragment_logged(MDRequestRef
& mdr
);
1210 void _fragment_stored(MDRequestRef
& mdr
);
1211 void _fragment_committed(dirfrag_t f
, const MDRequestRef
& mdr
);
1212 void _fragment_old_purged(dirfrag_t f
, int bits
, const MDRequestRef
& mdr
);
1214 friend class EFragment
;
1215 friend class C_MDC_FragmentFrozen
;
1216 friend class C_MDC_FragmentMarking
;
1217 friend class C_MDC_FragmentPrep
;
1218 friend class C_MDC_FragmentStore
;
1219 friend class C_MDC_FragmentCommit
;
1220 friend class C_IO_MDC_FragmentPurgeOld
;
1222 void handle_fragment_notify(const MMDSFragmentNotify::const_ref
&m
);
1223 void handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref
&m
);
1225 void add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frag
,
1226 LogSegment
*ls
, bufferlist
*rollback
=NULL
);
1227 void finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
);
1228 void rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
);
1231 DecayCounter trim_counter
;
1234 void wait_for_uncommitted_fragment(dirfrag_t dirfrag
, MDSContext
*c
) {
1235 ceph_assert(uncommitted_fragments
.count(dirfrag
));
1236 uncommitted_fragments
[dirfrag
].waiters
.push_back(c
);
1238 void split_dir(CDir
*dir
, int byn
);
1239 void merge_dir(CInode
*diri
, frag_t fg
);
1240 void rollback_uncommitted_fragments();
1242 void find_stale_fragment_freeze();
1243 void fragment_freeze_inc_num_waiters(CDir
*dir
);
1244 bool fragment_are_all_frozen(CDir
*dir
);
1245 int get_num_fragmenting_dirs() { return fragments
.size(); }
1248 //int send_inode_updates(CInode *in);
1249 //void handle_inode_update(MInodeUpdate *m);
1251 int send_dir_updates(CDir
*in
, bool bcast
=false);
1252 void handle_dir_update(const MDirUpdate::const_ref
&m
);
1254 // -- cache expiration --
1255 void handle_cache_expire(const MCacheExpire::const_ref
&m
);
1256 // delayed cache expire
1257 map
<CDir
*, expiremap
> delayed_expire
; // subtree root -> expire msg
1258 void process_delayed_expire(CDir
*dir
);
1259 void discard_delayed_expire(CDir
*dir
);
1262 int dump_cache(std::string_view fn
, Formatter
*f
);
1264 int dump_cache() { return dump_cache(NULL
, NULL
); }
1265 int dump_cache(std::string_view filename
);
1266 int dump_cache(Formatter
*f
);
1267 void dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
);
1269 void cache_status(Formatter
*f
);
1271 void dump_resolve_status(Formatter
*f
) const;
1272 void dump_rejoin_status(Formatter
*f
) const;
1277 void show_subtrees(int dbl
=10);
1279 CInode
*hack_pick_random_inode() {
1280 ceph_assert(!inode_map
.empty());
1281 int n
= rand() % inode_map
.size();
1282 auto p
= inode_map
.begin();
1288 void flush_dentry_work(MDRequestRef
& mdr
);
/*
 * Resolve path to a dentry and pass it onto the ScrubStack.
 *
 * TODO: return enough information to the original mdr formatter
 * and completion that they can subsequently check the progress of
 * this scrub (we won't block them on a whole scrub as it can take a very
 * long time)
 */
1297 void enqueue_scrub_work(MDRequestRef
& mdr
);
1298 void recursive_scrub_finish(const ScrubHeaderRef
& header
);
1299 void repair_inode_stats_work(MDRequestRef
& mdr
);
1300 void repair_dirfrag_stats_work(MDRequestRef
& mdr
);
1301 void upgrade_inode_snaprealm_work(MDRequestRef
& mdr
);
1302 friend class C_MDC_RespondInternalRequest
;
1304 void flush_dentry(std::string_view path
, Context
*fin
);
1306 * Create and start an OP_ENQUEUE_SCRUB
1308 void enqueue_scrub(std::string_view path
, std::string_view tag
,
1309 bool force
, bool recursive
, bool repair
,
1310 Formatter
*f
, Context
*fin
);
1311 void repair_inode_stats(CInode
*diri
);
1312 void repair_dirfrag_stats(CDir
*dir
);
1313 void upgrade_inode_snaprealm(CInode
*in
);
1316 /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
1317 std::set
<CInode
*> export_pin_queue
;
1319 OpenFileTable open_file_table
;
1322 class C_MDS_RetryRequest
: public MDSInternalContext
{
1326 C_MDS_RetryRequest(MDCache
*c
, MDRequestRef
& r
);
1327 void finish(int r
) override
;