1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
18 #include <string_view>
26 #include "MDBalancer.h"
28 #include "ScrubStack.h"
30 #include "SnapClient.h"
39 #include "include/ceph_fs.h"
40 #include "include/filepath.h"
41 #include "include/util.h"
43 #include "messages/MClientCaps.h"
45 #include "msg/Message.h"
46 #include "msg/Messenger.h"
48 #include "common/MemoryModel.h"
49 #include "common/errno.h"
50 #include "common/perf_counters.h"
51 #include "common/safe_io.h"
53 #include "osdc/Journaler.h"
54 #include "osdc/Filer.h"
56 #include "events/ESubtreeMap.h"
57 #include "events/EUpdate.h"
58 #include "events/EPeerUpdate.h"
59 #include "events/EImportFinish.h"
60 #include "events/EFragment.h"
61 #include "events/ECommitted.h"
62 #include "events/EPurged.h"
63 #include "events/ESessions.h"
67 #include "common/Timer.h"
69 #include "perfglue/heap_profiler.h"
72 #include "common/config.h"
73 #include "include/ceph_assert.h"
75 #define dout_context g_ceph_context
76 #define dout_subsys ceph_subsys_mds
78 #define dout_prefix _prefix(_dout, mds)
79 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
80 return *_dout
<< "mds." << mds
->get_nodeid() << ".cache ";
83 set
<int> SimpleLock::empty_gather_set
;
/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
90 class MDCacheContext
: public virtual MDSContext
{
93 MDSRank
*get_mds() override
95 ceph_assert(mdcache
!= NULL
);
99 explicit MDCacheContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
110 class MDCacheIOContext
: public virtual MDSIOContextBase
{
113 MDSRank
*get_mds() override
115 ceph_assert(mdcache
!= NULL
);
119 explicit MDCacheIOContext(MDCache
*mdc_
, bool track
=true) :
120 MDSIOContextBase(track
), mdcache(mdc_
) {}
123 class MDCacheLogContext
: public virtual MDSLogContextBase
{
126 MDSRank
*get_mds() override
128 ceph_assert(mdcache
!= NULL
);
132 explicit MDCacheLogContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
135 MDCache::MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
) :
138 filer(m
->objecter
, m
->finisher
),
139 stray_manager(m
, purge_queue_
),
141 trim_counter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"))
143 migrator
.reset(new Migrator(mds
, this));
145 max_dir_commit_size
= g_conf()->mds_dir_max_commit_size
?
146 (g_conf()->mds_dir_max_commit_size
<< 20) :
147 (0.9 *(g_conf()->osd_max_write_size
<< 20));
149 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
150 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
151 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
153 export_ephemeral_distributed_config
= g_conf().get_val
<bool>("mds_export_ephemeral_distributed");
154 export_ephemeral_random_config
= g_conf().get_val
<bool>("mds_export_ephemeral_random");
155 export_ephemeral_random_max
= g_conf().get_val
<double>("mds_export_ephemeral_random_max");
157 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
159 bottom_lru
.lru_set_midpoint(0);
161 decayrate
.set_halflife(g_conf()->mds_decay_halflife
);
163 upkeeper
= std::thread(&MDCache::upkeep_main
, this);
169 g_ceph_context
->get_perfcounters_collection()->remove(logger
.get());
171 if (upkeeper
.joinable())
175 void MDCache::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mdsmap
)
177 dout(20) << "config changes: " << changed
<< dendl
;
178 if (changed
.count("mds_cache_memory_limit"))
179 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
180 if (changed
.count("mds_cache_reservation"))
181 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
183 bool ephemeral_pin_config_changed
= false;
184 if (changed
.count("mds_export_ephemeral_distributed")) {
185 export_ephemeral_distributed_config
= g_conf().get_val
<bool>("mds_export_ephemeral_distributed");
186 dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl
;
187 /* copy to vector to avoid removals during iteration */
188 ephemeral_pin_config_changed
= true;
190 if (changed
.count("mds_export_ephemeral_random")) {
191 export_ephemeral_random_config
= g_conf().get_val
<bool>("mds_export_ephemeral_random");
192 dout(10) << "Migrating any ephemeral random pinned inodes" << dendl
;
193 /* copy to vector to avoid removals during iteration */
194 ephemeral_pin_config_changed
= true;
196 if (ephemeral_pin_config_changed
) {
197 std::vector
<CInode
*> migrate
;
198 migrate
.assign(export_ephemeral_pins
.begin(), export_ephemeral_pins
.end());
199 for (auto& in
: migrate
) {
200 in
->maybe_export_pin(true);
203 if (changed
.count("mds_export_ephemeral_random_max")) {
204 export_ephemeral_random_max
= g_conf().get_val
<double>("mds_export_ephemeral_random_max");
206 if (changed
.count("mds_health_cache_threshold"))
207 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
208 if (changed
.count("mds_cache_mid"))
209 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
210 if (changed
.count("mds_cache_trim_decay_rate")) {
211 trim_counter
= DecayCounter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"));
214 migrator
->handle_conf_change(changed
, mdsmap
);
215 mds
->balancer
->handle_conf_change(changed
, mdsmap
);
218 void MDCache::log_stat()
220 mds
->logger
->set(l_mds_inodes
, lru
.lru_get_size());
221 mds
->logger
->set(l_mds_inodes_pinned
, lru
.lru_get_num_pinned());
222 mds
->logger
->set(l_mds_inodes_top
, lru
.lru_get_top());
223 mds
->logger
->set(l_mds_inodes_bottom
, lru
.lru_get_bot());
224 mds
->logger
->set(l_mds_inodes_pin_tail
, lru
.lru_get_pintail());
225 mds
->logger
->set(l_mds_inodes_with_caps
, num_inodes_with_caps
);
226 mds
->logger
->set(l_mds_caps
, Capability::count());
228 mds
->logger
->set(l_mds_root_rfiles
, root
->get_inode()->rstat
.rfiles
);
229 mds
->logger
->set(l_mds_root_rbytes
, root
->get_inode()->rstat
.rbytes
);
230 mds
->logger
->set(l_mds_root_rsnaps
, root
->get_inode()->rstat
.rsnaps
);
237 bool MDCache::shutdown()
240 std::scoped_lock
lock(upkeep_mutex
);
241 upkeep_trim_shutdown
= true;
242 upkeep_cvar
.notify_one();
244 if (lru
.lru_get_size() > 0) {
245 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl
;
254 // ====================================================================
255 // some inode functions
257 void MDCache::add_inode(CInode
*in
)
259 // add to lru, inode map
260 if (in
->last
== CEPH_NOSNAP
) {
261 auto &p
= inode_map
[in
->ino()];
262 ceph_assert(!p
); // should be no dup inos!
265 auto &p
= snap_inode_map
[in
->vino()];
266 ceph_assert(!p
); // should be no dup inos!
270 if (in
->ino() < MDS_INO_SYSTEM_BASE
) {
271 if (in
->ino() == CEPH_INO_ROOT
)
273 else if (in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
275 else if (in
->is_stray()) {
276 if (MDS_INO_STRAY_OWNER(in
->ino()) == mds
->get_nodeid()) {
277 strays
[MDS_INO_STRAY_INDEX(in
->ino())] = in
;
281 base_inodes
.insert(in
);
285 void MDCache::remove_inode(CInode
*o
)
287 dout(14) << "remove_inode " << *o
<< dendl
;
289 if (o
->get_parent_dn()) {
290 // FIXME: multiple parents?
291 CDentry
*dn
= o
->get_parent_dn();
292 ceph_assert(!dn
->is_dirty());
293 dn
->dir
->unlink_inode(dn
); // leave dentry ... FIXME?
298 if (o
->is_dirty_parent())
299 o
->clear_dirty_parent();
301 o
->clear_scatter_dirty();
303 o
->clear_clientwriteable();
305 o
->item_open_file
.remove_myself();
307 if (o
->state_test(CInode::STATE_QUEUEDEXPORTPIN
))
308 export_pin_queue
.erase(o
);
310 if (o
->state_test(CInode::STATE_DELAYEDEXPORTPIN
))
311 export_pin_delayed_queue
.erase(o
);
313 o
->clear_ephemeral_pin(true, true);
315 // remove from inode map
316 if (o
->last
== CEPH_NOSNAP
) {
317 inode_map
.erase(o
->ino());
319 o
->item_caps
.remove_myself();
320 snap_inode_map
.erase(o
->vino());
323 if (o
->ino() < MDS_INO_SYSTEM_BASE
) {
324 if (o
== root
) root
= 0;
325 if (o
== myin
) myin
= 0;
327 if (MDS_INO_STRAY_OWNER(o
->ino()) == mds
->get_nodeid()) {
328 strays
[MDS_INO_STRAY_INDEX(o
->ino())] = 0;
332 base_inodes
.erase(o
);
336 ceph_assert(o
->get_num_ref() == 0);
340 file_layout_t
MDCache::gen_default_file_layout(const MDSMap
&mdsmap
)
342 file_layout_t result
= file_layout_t::get_default();
343 result
.pool_id
= mdsmap
.get_first_data_pool();
347 file_layout_t
MDCache::gen_default_log_layout(const MDSMap
&mdsmap
)
349 file_layout_t result
= file_layout_t::get_default();
350 result
.pool_id
= mdsmap
.get_metadata_pool();
351 if (g_conf()->mds_log_segment_size
> 0) {
352 result
.object_size
= g_conf()->mds_log_segment_size
;
353 result
.stripe_unit
= g_conf()->mds_log_segment_size
;
358 void MDCache::init_layouts()
360 default_file_layout
= gen_default_file_layout(*(mds
->mdsmap
));
361 default_log_layout
= gen_default_log_layout(*(mds
->mdsmap
));
364 void MDCache::create_unlinked_system_inode(CInode
*in
, inodeno_t ino
, int mode
) const
366 auto _inode
= in
->_get_inode();
369 _inode
->xattr_version
= 1;
370 _inode
->mode
= 0500 | mode
;
372 _inode
->ctime
= _inode
->mtime
= _inode
->btime
= ceph_clock_now();
374 _inode
->truncate_size
= -1ull;
375 _inode
->change_attr
= 0;
376 _inode
->export_pin
= MDS_RANK_NONE
;
378 // FIPS zeroization audit 20191117: this memset is not security related.
379 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
380 if (_inode
->is_dir()) {
381 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
382 _inode
->rstat
.rsubdirs
= 1; /* itself */
383 _inode
->rstat
.rctime
= in
->get_inode()->ctime
;
385 _inode
->layout
= default_file_layout
;
386 ++_inode
->rstat
.rfiles
;
388 _inode
->accounted_rstat
= _inode
->rstat
;
392 in
->inode_auth
= mds_authority_t(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
);
394 in
->inode_auth
= mds_authority_t(mds_rank_t(in
->ino() - MDS_INO_MDSDIR_OFFSET
), CDIR_AUTH_UNKNOWN
);
395 in
->open_snaprealm(); // empty snaprealm
396 ceph_assert(!in
->snaprealm
->parent
); // created its own
397 in
->snaprealm
->srnode
.seq
= 1;
401 CInode
*MDCache::create_system_inode(inodeno_t ino
, int mode
)
403 dout(0) << "creating system inode with ino:" << ino
<< dendl
;
404 CInode
*in
= new CInode(this);
405 create_unlinked_system_inode(in
, ino
, mode
);
410 CInode
*MDCache::create_root_inode()
412 CInode
*in
= create_system_inode(CEPH_INO_ROOT
, S_IFDIR
|0755);
413 auto _inode
= in
->_get_inode();
414 _inode
->uid
= g_conf()->mds_root_ino_uid
;
415 _inode
->gid
= g_conf()->mds_root_ino_gid
;
416 _inode
->layout
= default_file_layout
;
417 _inode
->layout
.pool_id
= mds
->mdsmap
->get_first_data_pool();
421 void MDCache::create_empty_hierarchy(MDSGather
*gather
)
424 CInode
*root
= create_root_inode();
426 // force empty root dir
427 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
428 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
429 rootdir
->dir_rep
= CDir::REP_ALL
; //NONE;
431 ceph_assert(rootdir
->get_fnode()->accounted_fragstat
== rootdir
->get_fnode()->fragstat
);
432 ceph_assert(rootdir
->get_fnode()->fragstat
== root
->get_inode()->dirstat
);
433 ceph_assert(rootdir
->get_fnode()->accounted_rstat
== rootdir
->get_fnode()->rstat
);
434 /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
435 * assume version 0 is stale/invalid.
438 rootdir
->mark_complete();
439 rootdir
->_get_fnode()->version
= rootdir
->pre_dirty();
440 rootdir
->mark_dirty(mds
->mdlog
->get_current_segment());
441 rootdir
->commit(0, gather
->new_sub());
443 root
->store(gather
->new_sub());
444 root
->mark_dirty_parent(mds
->mdlog
->get_current_segment(), true);
445 root
->store_backtrace(gather
->new_sub());
448 void MDCache::create_mydir_hierarchy(MDSGather
*gather
)
451 CInode
*my
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
);
453 CDir
*mydir
= my
->get_or_open_dirfrag(this, frag_t());
454 auto mydir_fnode
= mydir
->_get_fnode();
456 adjust_subtree_auth(mydir
, mds
->get_nodeid());
458 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
461 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
462 CInode
*stray
= create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
);
463 CDir
*straydir
= stray
->get_or_open_dirfrag(this, frag_t());
464 CachedStackStringStream css
;
465 *css
<< "stray" << i
;
466 CDentry
*sdn
= mydir
->add_primary_dentry(css
->str(), stray
, "");
467 sdn
->_mark_dirty(mds
->mdlog
->get_current_segment());
469 stray
->_get_inode()->dirstat
= straydir
->get_fnode()->fragstat
;
471 mydir_fnode
->rstat
.add(stray
->get_inode()->rstat
);
472 mydir_fnode
->fragstat
.nsubdirs
++;
474 straydir
->mark_complete();
475 straydir
->_get_fnode()->version
= straydir
->pre_dirty();
476 straydir
->mark_dirty(ls
);
477 straydir
->commit(0, gather
->new_sub());
478 stray
->mark_dirty_parent(ls
, true);
479 stray
->store_backtrace(gather
->new_sub());
482 mydir_fnode
->accounted_fragstat
= mydir
->get_fnode()->fragstat
;
483 mydir_fnode
->accounted_rstat
= mydir
->get_fnode()->rstat
;
485 auto inode
= myin
->_get_inode();
486 inode
->dirstat
= mydir
->get_fnode()->fragstat
;
487 inode
->rstat
= mydir
->get_fnode()->rstat
;
488 ++inode
->rstat
.rsubdirs
;
489 inode
->accounted_rstat
= inode
->rstat
;
491 mydir
->mark_complete();
492 mydir_fnode
->version
= mydir
->pre_dirty();
493 mydir
->mark_dirty(ls
);
494 mydir
->commit(0, gather
->new_sub());
496 myin
->store(gather
->new_sub());
499 struct C_MDC_CreateSystemFile
: public MDCacheLogContext
{
504 C_MDC_CreateSystemFile(MDCache
*c
, MutationRef
& mu
, CDentry
*d
, version_t v
, MDSContext
*f
) :
505 MDCacheLogContext(c
), mut(mu
), dn(d
), dpv(v
), fin(f
) {}
506 void finish(int r
) override
{
507 mdcache
->_create_system_file_finish(mut
, dn
, dpv
, fin
);
511 void MDCache::_create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
)
513 dout(10) << "_create_system_file " << name
<< " in " << *dir
<< dendl
;
514 CDentry
*dn
= dir
->add_null_dentry(name
);
516 dn
->push_projected_linkage(in
);
517 version_t dpv
= dn
->pre_dirty();
520 auto inode
= in
->_get_inode();
522 inode
->rstat
.rsubdirs
= 1;
524 mdir
= in
->get_or_open_dirfrag(this, frag_t());
525 mdir
->mark_complete();
526 mdir
->_get_fnode()->version
= mdir
->pre_dirty();
528 inode
->rstat
.rfiles
= 1;
531 inode
->version
= dn
->pre_dirty();
533 SnapRealm
*realm
= dir
->get_inode()->find_snaprealm();
534 dn
->first
= in
->first
= realm
->get_newest_seq() + 1;
536 MutationRef
mut(new MutationImpl());
538 // force some locks. hacky.
539 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, mut
);
540 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, mut
);
542 mut
->ls
= mds
->mdlog
->get_current_segment();
543 EUpdate
*le
= new EUpdate(mds
->mdlog
, "create system file");
544 mds
->mdlog
->start_entry(le
);
546 if (!in
->is_mdsdir()) {
547 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
548 le
->metablob
.add_primary_dentry(dn
, in
, true);
550 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_DIR
, 1);
551 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
552 dn
->push_projected_linkage(in
->ino(), in
->d_type());
553 le
->metablob
.add_remote_dentry(dn
, true, in
->ino(), in
->d_type());
554 le
->metablob
.add_root(true, in
);
557 le
->metablob
.add_new_dir(mdir
); // dirty AND complete AND new
559 mds
->mdlog
->submit_entry(le
, new C_MDC_CreateSystemFile(this, mut
, dn
, dpv
, fin
));
563 void MDCache::_create_system_file_finish(MutationRef
& mut
, CDentry
*dn
, version_t dpv
, MDSContext
*fin
)
565 dout(10) << "_create_system_file_finish " << *dn
<< dendl
;
567 dn
->pop_projected_linkage();
568 dn
->mark_dirty(dpv
, mut
->ls
);
570 CInode
*in
= dn
->get_linkage()->get_inode();
571 in
->mark_dirty(mut
->ls
);
574 CDir
*dir
= in
->get_dirfrag(frag_t());
576 dir
->mark_dirty(mut
->ls
);
577 dir
->mark_new(mut
->ls
);
581 mds
->locker
->drop_locks(mut
.get());
586 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
587 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
592 struct C_MDS_RetryOpenRoot
: public MDSInternalContext
{
594 explicit C_MDS_RetryOpenRoot(MDCache
*c
) : MDSInternalContext(c
->mds
), cache(c
) {}
595 void finish(int r
) override
{
597 // If we can't open root, something disastrous has happened: mark
598 // this rank damaged for operator intervention. Note that
599 // it is not okay to call suicide() here because we are in
600 // a Finisher callback.
601 cache
->mds
->damaged();
602 ceph_abort(); // damaged should never return
609 void MDCache::open_root_inode(MDSContext
*c
)
611 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
613 in
= create_system_inode(CEPH_INO_ROOT
, S_IFDIR
|0755); // initially inaccurate!
616 discover_base_ino(CEPH_INO_ROOT
, c
, mds
->mdsmap
->get_root());
620 void MDCache::open_mydir_inode(MDSContext
*c
)
622 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
626 void MDCache::open_mydir_frag(MDSContext
*c
)
629 new MDSInternalContextWrapper(mds
,
630 new LambdaContext([this, c
](int r
) {
635 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
637 adjust_subtree_auth(mydir
, mds
->get_nodeid());
644 void MDCache::open_root()
646 dout(10) << "open_root" << dendl
;
649 open_root_inode(new C_MDS_RetryOpenRoot(this));
652 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
653 ceph_assert(root
->is_auth());
654 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
655 ceph_assert(rootdir
);
656 if (!rootdir
->is_subtree_root())
657 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
658 if (!rootdir
->is_complete()) {
659 rootdir
->fetch(new C_MDS_RetryOpenRoot(this));
663 ceph_assert(!root
->is_auth());
664 CDir
*rootdir
= root
->get_dirfrag(frag_t());
666 open_remote_dirfrag(root
, frag_t(), new C_MDS_RetryOpenRoot(this));
672 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
673 in
->fetch(new C_MDS_RetryOpenRoot(this));
676 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
678 adjust_subtree_auth(mydir
, mds
->get_nodeid());
683 void MDCache::advance_stray() {
684 // check whether the directory has been fragmented
685 if (stray_fragmenting_index
>= 0) {
686 auto&& dfs
= strays
[stray_fragmenting_index
]->get_dirfrags();
687 bool any_fragmenting
= false;
688 for (const auto& dir
: dfs
) {
689 if (dir
->state_test(CDir::STATE_FRAGMENTING
) ||
690 mds
->balancer
->is_fragment_pending(dir
->dirfrag())) {
691 any_fragmenting
= true;
695 if (!any_fragmenting
)
696 stray_fragmenting_index
= -1;
699 for (int i
= 1; i
< NUM_STRAY
; i
++){
700 stray_index
= (stray_index
+ i
) % NUM_STRAY
;
701 if (stray_index
!= stray_fragmenting_index
)
705 if (stray_fragmenting_index
== -1 && is_open()) {
706 // Fragment later stray dir in advance. We don't choose past
707 // stray dir because in-flight requests may still use it.
708 stray_fragmenting_index
= (stray_index
+ 3) % NUM_STRAY
;
709 auto&& dfs
= strays
[stray_fragmenting_index
]->get_dirfrags();
710 bool any_fragmenting
= false;
711 for (const auto& dir
: dfs
) {
712 if (dir
->should_split()) {
713 mds
->balancer
->queue_split(dir
, true);
714 any_fragmenting
= true;
715 } else if (dir
->should_merge()) {
716 mds
->balancer
->queue_merge(dir
);
717 any_fragmenting
= true;
720 if (!any_fragmenting
)
721 stray_fragmenting_index
= -1;
724 dout(10) << "advance_stray to index " << stray_index
725 << " fragmenting index " << stray_fragmenting_index
<< dendl
;
728 void MDCache::populate_mydir()
731 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
734 dout(10) << "populate_mydir " << *mydir
<< dendl
;
736 if (!mydir
->is_complete()) {
737 mydir
->fetch(new C_MDS_RetryOpenRoot(this));
741 if (mydir
->get_version() == 0 && mydir
->state_test(CDir::STATE_BADFRAG
)) {
742 // A missing dirfrag, we will recreate it. Before that, we must dirty
743 // it before dirtying any of the strays we create within it.
744 mds
->clog
->warn() << "fragment " << mydir
->dirfrag() << " was unreadable, "
746 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
747 mydir
->state_clear(CDir::STATE_BADFRAG
);
748 mydir
->mark_complete();
749 mydir
->_get_fnode()->version
= mydir
->pre_dirty();
750 mydir
->mark_dirty(ls
);
753 // open or create stray
754 uint64_t num_strays
= 0;
755 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
756 CachedStackStringStream css
;
757 *css
<< "stray" << i
;
758 CDentry
*straydn
= mydir
->lookup(css
->str());
760 // allow for older fs's with stray instead of stray0
761 if (straydn
== NULL
&& i
== 0)
762 straydn
= mydir
->lookup("stray");
764 if (!straydn
|| !straydn
->get_linkage()->get_inode()) {
765 _create_system_file(mydir
, css
->strv(), create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
),
766 new C_MDS_RetryOpenRoot(this));
769 ceph_assert(straydn
);
770 ceph_assert(strays
[i
]);
771 // we make multiple passes through this method; make sure we only pin each stray once.
772 if (!strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
773 strays
[i
]->get(CInode::PIN_STRAY
);
774 strays
[i
]->state_set(CInode::STATE_STRAYPINNED
);
775 strays
[i
]->get_stickydirs();
777 dout(20) << " stray num " << i
<< " is " << *strays
[i
] << dendl
;
781 strays
[i
]->dirfragtree
.get_leaves(leaves
);
782 for (const auto& leaf
: leaves
) {
783 CDir
*dir
= strays
[i
]->get_dirfrag(leaf
);
785 dir
= strays
[i
]->get_or_open_dirfrag(this, leaf
);
788 // DamageTable applies special handling to strays: it will
789 // have damaged() us out if one is damaged.
790 ceph_assert(!dir
->state_test(CDir::STATE_BADFRAG
));
792 if (dir
->get_version() == 0) {
793 dir
->fetch(new C_MDS_RetryOpenRoot(this));
797 if (dir
->get_frag_size() > 0)
798 num_strays
+= dir
->get_frag_size();
803 dout(10) << "populate_mydir done" << dendl
;
806 mds
->queue_waiters(waiting_for_open
);
808 stray_manager
.set_num_strays(num_strays
);
809 stray_manager
.activate();
814 void MDCache::open_foreign_mdsdir(inodeno_t ino
, MDSContext
*fin
)
816 discover_base_ino(ino
, fin
, mds_rank_t(ino
& (MAX_MDS
-1)));
819 CDir
*MDCache::get_stray_dir(CInode
*in
)
822 in
->name_stray_dentry(straydname
);
824 CInode
*strayi
= get_stray();
826 frag_t fg
= strayi
->pick_dirfrag(straydname
);
827 CDir
*straydir
= strayi
->get_dirfrag(fg
);
828 ceph_assert(straydir
);
832 MDSCacheObject
*MDCache::get_object(const MDSCacheObjectInfo
&info
)
836 return get_inode(info
.ino
, info
.snapid
);
839 CDir
*dir
= get_dirfrag(info
.dirfrag
);
842 if (info
.dname
.length())
843 return dir
->lookup(info
.dname
, info
.snapid
);
849 // ====================================================================
850 // consistent hash ring
853 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
855 mds_rank_t
MDCache::hash_into_rank_bucket(inodeno_t ino
, frag_t fg
)
857 const mds_rank_t max_mds
= mds
->mdsmap
->get_max_mds();
858 uint64_t hash
= rjhash64(ino
);
860 hash
= rjhash64(hash
+ rjhash64(fg
.value()));
862 int64_t b
= -1, j
= 0;
863 while (j
< max_mds
) {
865 hash
= hash
*2862933555777941757ULL + 1;
866 j
= (b
+ 1) * (double(1LL << 31) / double((hash
>> 33) + 1));
868 // verify bounds before returning
869 auto result
= mds_rank_t(b
);
870 ceph_assert(result
>= 0 && result
< max_mds
);
875 // ====================================================================
876 // subtree management
879 * adjust the dir_auth of a subtree.
880 * merge with parent and/or child subtrees, if is it appropriate.
881 * merge can ONLY happen if both parent and child have unambiguous auth.
883 void MDCache::adjust_subtree_auth(CDir
*dir
, mds_authority_t auth
, bool adjust_pop
)
885 dout(7) << "adjust_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
886 << " on " << *dir
<< dendl
;
891 if (dir
->inode
->is_base()) {
892 root
= dir
; // bootstrap hack.
893 if (subtrees
.count(root
) == 0) {
895 root
->get(CDir::PIN_SUBTREE
);
898 root
= get_subtree_root(dir
); // subtree root
901 ceph_assert(subtrees
.count(root
));
902 dout(7) << " current root is " << *root
<< dendl
;
905 // i am already a subtree.
906 dir
->set_dir_auth(auth
);
908 // i am a new subtree.
909 dout(10) << " new subtree at " << *dir
<< dendl
;
910 ceph_assert(subtrees
.count(dir
) == 0);
911 subtrees
[dir
]; // create empty subtree bounds list for me.
912 dir
->get(CDir::PIN_SUBTREE
);
915 dir
->set_dir_auth(auth
);
917 // move items nested beneath me, under me.
918 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
919 while (p
!= subtrees
[root
].end()) {
920 set
<CDir
*>::iterator next
= p
;
922 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
924 dout(10) << " claiming child bound " << **p
<< dendl
;
925 subtrees
[dir
].insert(*p
);
926 subtrees
[root
].erase(p
);
931 // i am a bound of the parent subtree.
932 subtrees
[root
].insert(dir
);
934 // i am now the subtree root.
937 // adjust recursive pop counters
938 if (adjust_pop
&& dir
->is_auth()) {
939 CDir
*p
= dir
->get_parent_dir();
941 p
->pop_auth_subtree
.sub(dir
->pop_auth_subtree
);
942 if (p
->is_subtree_root()) break;
943 p
= p
->inode
->get_parent_dir();
952 void MDCache::try_subtree_merge(CDir
*dir
)
954 dout(7) << "try_subtree_merge " << *dir
<< dendl
;
955 // record my old bounds
956 auto oldbounds
= subtrees
.at(dir
);
958 set
<CInode
*> to_eval
;
959 // try merge at my root
960 try_subtree_merge_at(dir
, &to_eval
);
962 // try merge at my old bounds
963 for (auto bound
: oldbounds
)
964 try_subtree_merge_at(bound
, &to_eval
);
966 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
967 for(auto in
: to_eval
)
968 eval_subtree_root(in
);
972 void MDCache::try_subtree_merge_at(CDir
*dir
, set
<CInode
*> *to_eval
, bool adjust_pop
)
974 dout(10) << "try_subtree_merge_at " << *dir
<< dendl
;
976 if (dir
->dir_auth
.second
!= CDIR_AUTH_UNKNOWN
||
977 dir
->state_test(CDir::STATE_EXPORTBOUND
) ||
978 dir
->state_test(CDir::STATE_AUXSUBTREE
))
981 auto it
= subtrees
.find(dir
);
982 ceph_assert(it
!= subtrees
.end());
984 // merge with parent?
986 if (!dir
->inode
->is_base())
987 parent
= get_subtree_root(dir
->get_parent_dir());
989 if (parent
!= dir
&& // we have a parent,
990 parent
->dir_auth
== dir
->dir_auth
) { // auth matches,
991 // merge with parent.
992 dout(10) << " subtree merge at " << *dir
<< dendl
;
993 dir
->set_dir_auth(CDIR_AUTH_DEFAULT
);
995 // move our bounds under the parent
996 subtrees
[parent
].insert(it
->second
.begin(), it
->second
.end());
998 // we are no longer a subtree or bound
999 dir
->put(CDir::PIN_SUBTREE
);
1001 subtrees
[parent
].erase(dir
);
1003 // adjust popularity?
1004 if (adjust_pop
&& dir
->is_auth()) {
1006 CDir
*p
= dir
->get_parent_dir();
1008 p
->pop_auth_subtree
.add(dir
->pop_auth_subtree
);
1009 p
->pop_lru_subdirs
.push_front(&cur
->get_inode()->item_pop_lru
);
1010 if (p
->is_subtree_root()) break;
1012 p
= p
->inode
->get_parent_dir();
1016 if (to_eval
&& dir
->get_inode()->is_auth())
1017 to_eval
->insert(dir
->get_inode());
1023 void MDCache::eval_subtree_root(CInode
*diri
)
1025 // evaluate subtree inode filelock?
1026 // (we should scatter the filelock on subtree bounds)
1027 ceph_assert(diri
->is_auth());
1028 mds
->locker
->try_eval(diri
, CEPH_LOCK_IFILE
| CEPH_LOCK_INEST
);
1032 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
)
1034 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1036 << " bounds " << bounds
1042 if (dir
->ino() == CEPH_INO_ROOT
) {
1043 root
= dir
; // bootstrap hack.
1044 if (subtrees
.count(root
) == 0) {
1046 root
->get(CDir::PIN_SUBTREE
);
1049 root
= get_subtree_root(dir
); // subtree root
1052 ceph_assert(subtrees
.count(root
));
1053 dout(7) << " current root is " << *root
<< dendl
;
1055 mds_authority_t oldauth
= dir
->authority();
1058 // i am already a subtree.
1059 dir
->set_dir_auth(auth
);
1061 // i am a new subtree.
1062 dout(10) << " new subtree at " << *dir
<< dendl
;
1063 ceph_assert(subtrees
.count(dir
) == 0);
1064 subtrees
[dir
]; // create empty subtree bounds list for me.
1065 dir
->get(CDir::PIN_SUBTREE
);
1068 dir
->set_dir_auth(auth
);
1070 // move items nested beneath me, under me.
1071 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1072 while (p
!= subtrees
[root
].end()) {
1073 set
<CDir
*>::iterator next
= p
;
1075 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
1077 dout(10) << " claiming child bound " << **p
<< dendl
;
1078 subtrees
[dir
].insert(*p
);
1079 subtrees
[root
].erase(p
);
1084 // i am a bound of the parent subtree.
1085 subtrees
[root
].insert(dir
);
1087 // i am now the subtree root.
1091 set
<CInode
*> to_eval
;
1093 // verify/adjust bounds.
1094 // - these may be new, or
1095 // - beneath existing ambiguous bounds (which will be collapsed),
1096 // - but NOT beneath unambiguous bounds.
1097 for (const auto& bound
: bounds
) {
1099 if (subtrees
[dir
].count(bound
) == 0) {
1100 if (get_subtree_root(bound
) == dir
) {
1101 dout(10) << " new bound " << *bound
<< ", adjusting auth back to old " << oldauth
<< dendl
;
1102 adjust_subtree_auth(bound
, oldauth
); // otherwise, adjust at bound.
1105 dout(10) << " want bound " << *bound
<< dendl
;
1106 CDir
*t
= get_subtree_root(bound
->get_parent_dir());
1107 if (subtrees
[t
].count(bound
) == 0) {
1108 ceph_assert(t
!= dir
);
1109 dout(10) << " new bound " << *bound
<< dendl
;
1110 adjust_subtree_auth(bound
, t
->authority());
1112 // make sure it's nested beneath ambiguous subtree(s)
1114 while (subtrees
[dir
].count(t
) == 0)
1115 t
= get_subtree_root(t
->get_parent_dir());
1116 dout(10) << " swallowing intervening subtree at " << *t
<< dendl
;
1117 adjust_subtree_auth(t
, auth
);
1118 try_subtree_merge_at(t
, &to_eval
);
1119 t
= get_subtree_root(bound
->get_parent_dir());
1120 if (t
== dir
) break;
1125 dout(10) << " already have bound " << *bound
<< dendl
;
1128 // merge stray bounds?
1129 while (!subtrees
[dir
].empty()) {
1130 set
<CDir
*> copy
= subtrees
[dir
];
1131 for (set
<CDir
*>::iterator p
= copy
.begin(); p
!= copy
.end(); ++p
) {
1132 if (bounds
.count(*p
) == 0) {
1134 dout(10) << " swallowing extra subtree at " << *stray
<< dendl
;
1135 adjust_subtree_auth(stray
, auth
);
1136 try_subtree_merge_at(stray
, &to_eval
);
1139 // swallowing subtree may add new subtree bounds
1140 if (copy
== subtrees
[dir
])
1144 // bound should now match.
1145 verify_subtree_bounds(dir
, bounds
);
1149 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
1150 for(auto in
: to_eval
)
1151 eval_subtree_root(in
);
1157 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1158 * fragmentation as necessary to get an equivalent bounding set. That is, only
1159 * split if one of our frags spans the provided bounding set. Never merge.
1161 void MDCache::get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
)
1163 dout(10) << "get_force_dirfrag_bound_set " << dfs
<< dendl
;
1166 map
<inodeno_t
, fragset_t
> byino
;
1167 for (auto& frag
: dfs
) {
1168 byino
[frag
.ino
].insert_raw(frag
.frag
);
1170 dout(10) << " by ino: " << byino
<< dendl
;
1172 for (map
<inodeno_t
,fragset_t
>::iterator p
= byino
.begin(); p
!= byino
.end(); ++p
) {
1173 p
->second
.simplify();
1174 CInode
*diri
= get_inode(p
->first
);
1177 dout(10) << " checking fragset " << p
->second
.get() << " on " << *diri
<< dendl
;
1180 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
1181 tmpdft
.force_to_leaf(g_ceph_context
, *q
);
1183 for (const auto& fg
: p
->second
) {
1185 diri
->dirfragtree
.get_leaves_under(fg
, leaves
);
1186 if (leaves
.empty()) {
1187 frag_t approx_fg
= diri
->dirfragtree
[fg
.value()];
1188 frag_vec_t approx_leaves
;
1189 tmpdft
.get_leaves_under(approx_fg
, approx_leaves
);
1190 for (const auto& leaf
: approx_leaves
) {
1191 if (p
->second
.get().count(leaf
) == 0) {
1192 // not bound, so the resolve message is from auth MDS of the dirfrag
1193 force_dir_fragment(diri
, leaf
);
1198 auto&& [complete
, sibs
] = diri
->get_dirfrags_under(fg
);
1199 for (const auto& sib
: sibs
)
1205 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bound_dfs
, const mds_authority_t
&auth
)
1207 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1208 << " on " << *dir
<< " bound_dfs " << bound_dfs
<< dendl
;
1211 get_force_dirfrag_bound_set(bound_dfs
, bounds
);
1212 adjust_bounded_subtree_auth(dir
, bounds
, auth
);
1215 void MDCache::map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
)
1217 dout(10) << "map_dirfrag_set " << dfs
<< dendl
;
1220 map
<inodeno_t
, fragset_t
> ino_fragset
;
1221 for (const auto &df
: dfs
) {
1222 ino_fragset
[df
.ino
].insert_raw(df
.frag
);
1225 for (map
<inodeno_t
, fragset_t
>::iterator p
= ino_fragset
.begin();
1226 p
!= ino_fragset
.end();
1228 p
->second
.simplify();
1229 CInode
*in
= get_inode(p
->first
);
1234 for (const auto& fg
: p
->second
) {
1235 in
->dirfragtree
.get_leaves_under(fg
, fgs
);
1238 dout(15) << "map_dirfrag_set " << p
->second
<< " -> " << fgs
1239 << " on " << *in
<< dendl
;
1241 for (const auto& fg
: fgs
) {
1242 CDir
*dir
= in
->get_dirfrag(fg
);
1251 CDir
*MDCache::get_subtree_root(CDir
*dir
)
1253 // find the underlying dir that delegates (or is about to delegate) auth
1255 if (dir
->is_subtree_root())
1257 dir
= dir
->get_inode()->get_parent_dir();
1263 CDir
*MDCache::get_projected_subtree_root(CDir
*dir
)
1265 // find the underlying dir that delegates (or is about to delegate) auth
1267 if (dir
->is_subtree_root())
1269 dir
= dir
->get_inode()->get_projected_parent_dir();
1275 void MDCache::remove_subtree(CDir
*dir
)
1277 dout(10) << "remove_subtree " << *dir
<< dendl
;
1278 auto it
= subtrees
.find(dir
);
1279 ceph_assert(it
!= subtrees
.end());
1281 dir
->put(CDir::PIN_SUBTREE
);
1282 if (dir
->get_parent_dir()) {
1283 CDir
*p
= get_subtree_root(dir
->get_parent_dir());
1284 auto it
= subtrees
.find(p
);
1285 ceph_assert(it
!= subtrees
.end());
1286 auto count
= it
->second
.erase(dir
);
1287 ceph_assert(count
== 1);
1291 void MDCache::get_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1293 ceph_assert(subtrees
.count(dir
));
1294 bounds
= subtrees
[dir
];
1297 void MDCache::get_wouldbe_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1299 if (subtrees
.count(dir
)) {
1300 // just copy them, dir is a subtree.
1301 get_subtree_bounds(dir
, bounds
);
1304 CDir
*root
= get_subtree_root(dir
);
1305 for (set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1306 p
!= subtrees
[root
].end();
1310 t
= t
->get_parent_dir();
1321 void MDCache::verify_subtree_bounds(CDir
*dir
, const set
<CDir
*>& bounds
)
1323 // for debugging only.
1324 ceph_assert(subtrees
.count(dir
));
1325 if (bounds
!= subtrees
[dir
]) {
1326 dout(0) << "verify_subtree_bounds failed" << dendl
;
1327 set
<CDir
*> b
= bounds
;
1328 for (auto &cd
: subtrees
[dir
]) {
1329 if (bounds
.count(cd
)) {
1333 dout(0) << " missing bound " << *cd
<< dendl
;
1335 for (const auto &cd
: b
)
1336 dout(0) << " extra bound " << *cd
<< dendl
;
1338 ceph_assert(bounds
== subtrees
[dir
]);
1341 void MDCache::verify_subtree_bounds(CDir
*dir
, const list
<dirfrag_t
>& bounds
)
1343 // for debugging only.
1344 ceph_assert(subtrees
.count(dir
));
1346 // make sure that any bounds i do have are properly noted as such.
1348 for (const auto &fg
: bounds
) {
1349 CDir
*bd
= get_dirfrag(fg
);
1351 if (subtrees
[dir
].count(bd
) == 0) {
1352 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd
<< dendl
;
1356 ceph_assert(failed
== 0);
1359 void MDCache::project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
)
1361 dout(10) << "project_subtree_rename " << *diri
<< " from " << *olddir
1362 << " to " << *newdir
<< dendl
;
1363 projected_subtree_renames
[diri
].push_back(pair
<CDir
*,CDir
*>(olddir
, newdir
));
1366 void MDCache::adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
)
1368 dout(10) << "adjust_subtree_after_rename " << *diri
<< " from " << *olddir
<< dendl
;
1370 CDir
*newdir
= diri
->get_parent_dir();
1373 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > >::iterator p
= projected_subtree_renames
.find(diri
);
1374 ceph_assert(p
!= projected_subtree_renames
.end());
1375 ceph_assert(!p
->second
.empty());
1376 ceph_assert(p
->second
.front().first
== olddir
);
1377 ceph_assert(p
->second
.front().second
== newdir
);
1378 p
->second
.pop_front();
1379 if (p
->second
.empty())
1380 projected_subtree_renames
.erase(p
);
1383 // adjust total auth pin of freezing subtree
1384 if (olddir
!= newdir
) {
1385 auto&& dfls
= diri
->get_nested_dirfrags();
1386 for (const auto& dir
: dfls
)
1387 olddir
->adjust_freeze_after_rename(dir
);
1391 // N.B. make sure subtree dirfrags are at the front of the list
1392 auto dfls
= diri
->get_subtree_dirfrags();
1393 diri
->get_nested_dirfrags(dfls
);
1394 for (const auto& dir
: dfls
) {
1395 dout(10) << "dirfrag " << *dir
<< dendl
;
1396 CDir
*oldparent
= get_subtree_root(olddir
);
1397 dout(10) << " old parent " << *oldparent
<< dendl
;
1398 CDir
*newparent
= get_subtree_root(newdir
);
1399 dout(10) << " new parent " << *newparent
<< dendl
;
1401 auto& oldbounds
= subtrees
[oldparent
];
1402 auto& newbounds
= subtrees
[newparent
];
1404 if (olddir
!= newdir
)
1405 mds
->balancer
->adjust_pop_for_rename(olddir
, dir
, false);
1407 if (oldparent
== newparent
) {
1408 dout(10) << "parent unchanged for " << *dir
<< " at " << *oldparent
<< dendl
;
1409 } else if (dir
->is_subtree_root()) {
1410 // children are fine. change parent.
1411 dout(10) << "moving " << *dir
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1413 auto n
= oldbounds
.erase(dir
);
1414 ceph_assert(n
== 1);
1416 newbounds
.insert(dir
);
1417 // caller is responsible for 'eval diri'
1418 try_subtree_merge_at(dir
, NULL
, false);
1422 // see if any old bounds move to the new parent.
1423 std::vector
<CDir
*> tomove
;
1424 for (const auto& bound
: oldbounds
) {
1425 CDir
*broot
= get_subtree_root(bound
->get_parent_dir());
1426 if (broot
!= oldparent
) {
1427 ceph_assert(broot
== newparent
);
1428 tomove
.push_back(bound
);
1431 for (const auto& bound
: tomove
) {
1432 dout(10) << "moving bound " << *bound
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1433 oldbounds
.erase(bound
);
1434 newbounds
.insert(bound
);
1438 if (oldparent
->authority() != newparent
->authority()) {
1439 adjust_subtree_auth(dir
, oldparent
->authority(), false);
1440 // caller is responsible for 'eval diri'
1441 try_subtree_merge_at(dir
, NULL
, false);
1445 if (olddir
!= newdir
)
1446 mds
->balancer
->adjust_pop_for_rename(newdir
, dir
, true);
1452 // ===================================
1453 // journal and snap/cow helpers
1457 * find first inode in cache that follows given snapid. otherwise, return current.
1459 CInode
*MDCache::pick_inode_snap(CInode
*in
, snapid_t follows
)
1461 dout(10) << "pick_inode_snap follows " << follows
<< " on " << *in
<< dendl
;
1462 ceph_assert(in
->last
== CEPH_NOSNAP
);
1464 auto p
= snap_inode_map
.upper_bound(vinodeno_t(in
->ino(), follows
));
1465 if (p
!= snap_inode_map
.end() && p
->second
->ino() == in
->ino()) {
1466 dout(10) << "pick_inode_snap found " << *p
->second
<< dendl
;
1475 * note: i'm currently cheating wrt dirty and inode.version on cow
1476 * items. instead of doing a full dir predirty, i just take the
1477 * original item's version, and set the dirty flag (via
1478 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1479 * means a special case in the dir commit clean sweep assertions.
1482 CInode
*MDCache::cow_inode(CInode
*in
, snapid_t last
)
1484 ceph_assert(last
>= in
->first
);
1486 CInode
*oldin
= new CInode(this, true, in
->first
, last
);
1487 auto _inode
= CInode::allocate_inode(*in
->get_previous_projected_inode());
1488 _inode
->trim_client_ranges(last
);
1489 oldin
->reset_inode(std::move(_inode
));
1490 auto _xattrs
= in
->get_previous_projected_xattrs();
1491 oldin
->reset_xattrs(std::move(_xattrs
));
1493 oldin
->symlink
= in
->symlink
;
1495 if (in
->first
< in
->oldest_snap
)
1496 in
->oldest_snap
= in
->first
;
1500 dout(10) << "cow_inode " << *in
<< " to " << *oldin
<< dendl
;
1503 if (in
->last
!= CEPH_NOSNAP
) {
1504 CInode
*head_in
= get_inode(in
->ino());
1505 ceph_assert(head_in
);
1506 auto ret
= head_in
->split_need_snapflush(oldin
, in
);
1508 oldin
->client_snap_caps
= in
->client_snap_caps
;
1509 if (!oldin
->client_snap_caps
.empty()) {
1510 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1511 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1513 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1514 ceph_assert(lock
->is_stable());
1515 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1516 oldin
->auth_pin(lock
);
1518 lock
->get_wrlock(true);
1523 auto client_snap_caps
= std::move(in
->client_snap_caps
);
1524 in
->client_snap_caps
.clear();
1525 in
->item_open_file
.remove_myself();
1526 in
->item_caps
.remove_myself();
1528 if (!client_snap_caps
.empty()) {
1529 MDSContext::vec finished
;
1530 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1531 SimpleLock
*lock
= in
->get_lock(cinode_lock_info
[i
].lock
);
1533 ceph_assert(lock
->get_state() == LOCK_SNAP_SYNC
); // gathering
1535 if (!lock
->get_num_wrlocks()) {
1536 lock
->set_state(LOCK_SYNC
);
1537 lock
->take_waiting(SimpleLock::WAIT_STABLE
|SimpleLock::WAIT_RD
, finished
);
1538 in
->auth_unpin(lock
);
1541 mds
->queue_waiters(finished
);
1547 if (!in
->client_caps
.empty()) {
1548 const set
<snapid_t
>& snaps
= in
->find_snaprealm()->get_snaps();
1550 for (auto &p
: in
->client_caps
) {
1551 client_t client
= p
.first
;
1552 Capability
*cap
= &p
.second
;
1553 int issued
= cap
->need_snapflush() ? CEPH_CAP_ANY_WR
: cap
->issued();
1554 if ((issued
& CEPH_CAP_ANY_WR
) &&
1555 cap
->client_follows
< last
) {
1556 dout(10) << " client." << client
<< " cap " << ccap_string(issued
) << dendl
;
1557 oldin
->client_snap_caps
.insert(client
);
1558 cap
->client_follows
= last
;
1560 // we need snapflushes for any intervening snaps
1561 dout(10) << " snaps " << snaps
<< dendl
;
1562 for (auto q
= snaps
.lower_bound(oldin
->first
);
1563 q
!= snaps
.end() && *q
<= last
;
1565 in
->add_need_snapflush(oldin
, *q
, client
);
1568 dout(10) << " ignoring client." << client
<< " cap follows " << cap
->client_follows
<< dendl
;
1572 if (!oldin
->client_snap_caps
.empty()) {
1573 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1574 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1576 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1577 ceph_assert(lock
->is_stable());
1578 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1579 oldin
->auth_pin(lock
);
1581 lock
->get_wrlock(true);
1588 void MDCache::journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
,
1589 CDentry
*dn
, snapid_t follows
,
1590 CInode
**pcow_inode
, CDentry::linkage_t
*dnl
)
1593 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl
;
1596 dout(10) << "journal_cow_dentry follows " << follows
<< " on " << *dn
<< dendl
;
1597 ceph_assert(dn
->is_auth());
1599 // nothing to cow on a null dentry, fix caller
1601 dnl
= dn
->get_projected_linkage();
1602 ceph_assert(!dnl
->is_null());
1604 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
1605 bool cow_head
= false;
1606 if (in
&& in
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
1607 ceph_assert(in
->is_frozen_inode());
1610 if (in
&& (in
->is_multiversion() || cow_head
)) {
1611 // multiversion inode.
1612 SnapRealm
*realm
= NULL
;
1614 if (in
->get_projected_parent_dn() != dn
) {
1615 ceph_assert(follows
== CEPH_NOSNAP
);
1616 realm
= dn
->dir
->inode
->find_snaprealm();
1617 snapid_t dir_follows
= get_global_snaprealm()->get_newest_seq();
1618 ceph_assert(dir_follows
>= realm
->get_newest_seq());
1620 if (dir_follows
+1 > dn
->first
) {
1621 snapid_t oldfirst
= dn
->first
;
1622 dn
->first
= dir_follows
+1;
1623 if (realm
->has_snaps_in_range(oldfirst
, dir_follows
)) {
1624 CDir
*dir
= dn
->dir
;
1625 CDentry
*olddn
= dir
->add_remote_dentry(dn
->get_name(), in
->ino(), in
->d_type(), dn
->alternate_name
, oldfirst
, dir_follows
);
1626 dout(10) << " olddn " << *olddn
<< dendl
;
1627 ceph_assert(dir
->is_projected());
1628 olddn
->set_projected_version(dir
->get_projected_version());
1629 metablob
->add_remote_dentry(olddn
, true);
1630 mut
->add_cow_dentry(olddn
);
1631 // FIXME: adjust link count here? hmm.
1633 if (dir_follows
+1 > in
->first
)
1634 in
->cow_old_inode(dir_follows
, cow_head
);
1638 follows
= dir_follows
;
1639 if (in
->snaprealm
) {
1640 realm
= in
->snaprealm
;
1641 ceph_assert(follows
>= realm
->get_newest_seq());
1644 realm
= in
->find_snaprealm();
1645 if (follows
== CEPH_NOSNAP
) {
1646 follows
= get_global_snaprealm()->get_newest_seq();
1647 ceph_assert(follows
>= realm
->get_newest_seq());
1652 if (follows
< in
->first
) {
1653 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *in
<< dendl
;
1657 if (!realm
->has_snaps_in_range(in
->first
, follows
)) {
1658 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *in
<< dendl
;
1659 in
->first
= follows
+ 1;
1663 in
->cow_old_inode(follows
, cow_head
);
1666 SnapRealm
*realm
= dn
->dir
->inode
->find_snaprealm();
1667 if (follows
== CEPH_NOSNAP
) {
1668 follows
= get_global_snaprealm()->get_newest_seq();
1669 ceph_assert(follows
>= realm
->get_newest_seq());
1673 if (follows
< dn
->first
) {
1674 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *dn
<< dendl
;
1678 // update dn.first before adding old dentry to cdir's map
1679 snapid_t oldfirst
= dn
->first
;
1680 dn
->first
= follows
+1;
1682 if (!realm
->has_snaps_in_range(oldfirst
, follows
)) {
1683 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *dn
<< dendl
;
1685 in
->first
= follows
+1;
1689 dout(10) << " dn " << *dn
<< dendl
;
1690 CDir
*dir
= dn
->get_dir();
1691 ceph_assert(dir
->is_projected());
1694 CInode
*oldin
= cow_inode(in
, follows
);
1695 ceph_assert(in
->is_projected());
1696 mut
->add_cow_inode(oldin
);
1698 *pcow_inode
= oldin
;
1699 CDentry
*olddn
= dir
->add_primary_dentry(dn
->get_name(), oldin
, dn
->alternate_name
, oldfirst
, follows
);
1700 dout(10) << " olddn " << *olddn
<< dendl
;
1701 bool need_snapflush
= !oldin
->client_snap_caps
.empty();
1702 if (need_snapflush
) {
1703 mut
->ls
->open_files
.push_back(&oldin
->item_open_file
);
1704 mds
->locker
->mark_need_snapflush_inode(oldin
);
1706 olddn
->set_projected_version(dir
->get_projected_version());
1707 metablob
->add_primary_dentry(olddn
, 0, true, false, false, need_snapflush
);
1708 mut
->add_cow_dentry(olddn
);
1710 ceph_assert(dnl
->is_remote());
1711 CDentry
*olddn
= dir
->add_remote_dentry(dn
->get_name(), dnl
->get_remote_ino(), dnl
->get_remote_d_type(), dn
->alternate_name
, oldfirst
, follows
);
1712 dout(10) << " olddn " << *olddn
<< dendl
;
1714 olddn
->set_projected_version(dir
->get_projected_version());
1715 metablob
->add_remote_dentry(olddn
, true);
1716 mut
->add_cow_dentry(olddn
);
1721 void MDCache::journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
)
1723 if (in
->is_base()) {
1724 metablob
->add_root(true, in
);
1726 if (follows
== CEPH_NOSNAP
&& in
->last
!= CEPH_NOSNAP
)
1727 follows
= in
->first
- 1;
1728 CDentry
*dn
= in
->get_projected_parent_dn();
1729 if (!dn
->get_projected_linkage()->is_null()) // no need to cow a null dentry
1730 journal_cow_dentry(mut
, metablob
, dn
, follows
);
1731 if (in
->get_projected_inode()->is_backtrace_updated()) {
1732 bool dirty_pool
= in
->get_projected_inode()->layout
.pool_id
!=
1733 in
->get_previous_projected_inode()->layout
.pool_id
;
1734 metablob
->add_primary_dentry(dn
, in
, true, true, dirty_pool
);
1736 metablob
->add_primary_dentry(dn
, in
, true);
1743 // nested ---------------------------------------------------------------
1745 void MDCache::project_rstat_inode_to_frag(const MutationRef
& mut
,
1746 CInode
*cur
, CDir
*parent
, snapid_t first
,
1747 int linkunlink
, SnapRealm
*prealm
)
1749 CDentry
*parentdn
= cur
->get_projected_parent_dn();
1751 if (cur
->first
> first
)
1754 dout(10) << "projected_rstat_inode_to_frag first " << first
<< " linkunlink " << linkunlink
1755 << " " << *cur
<< dendl
;
1756 dout(20) << " frag head is [" << parent
->first
<< ",head] " << dendl
;
1757 dout(20) << " inode update is [" << first
<< "," << cur
->last
<< "]" << dendl
;
1760 * FIXME. this incompletely propagates rstats to _old_ parents
1761 * (i.e. shortly after a directory rename). but we need full
1762 * blown hard link backpointers to make this work properly...
1764 snapid_t floor
= parentdn
->first
;
1765 dout(20) << " floor of " << floor
<< " from parent dn " << *parentdn
<< dendl
;
1768 prealm
= parent
->inode
->find_snaprealm();
1769 const set
<snapid_t
> snaps
= prealm
->get_snaps();
1771 if (cur
->last
!= CEPH_NOSNAP
) {
1772 ceph_assert(cur
->dirty_old_rstats
.empty());
1773 set
<snapid_t
>::const_iterator q
= snaps
.lower_bound(std::max(first
, floor
));
1774 if (q
== snaps
.end() || *q
> cur
->last
)
1778 if (cur
->last
>= floor
) {
1780 if (cur
->state_test(CInode::STATE_AMBIGUOUSAUTH
) && cur
->is_auth()) {
1781 // rename src inode is not projected in the peer rename prep case. so we should
1782 // avoid updateing the inode.
1783 ceph_assert(linkunlink
< 0);
1784 ceph_assert(cur
->is_frozen_inode());
1788 const CInode::mempool_inode
*pi
;
1789 if (update
&& mut
->is_projected(cur
)) {
1790 pi
= cur
->_get_projected_inode();
1792 pi
= cur
->get_projected_inode().get();
1795 ceph_assert(pi
->rstat
== pi
->accounted_rstat
);
1799 _project_rstat_inode_to_frag(pi
, std::max(first
, floor
), cur
->last
, parent
,
1800 linkunlink
, update
);
1803 if (g_conf()->mds_snap_rstat
) {
1804 for (const auto &p
: cur
->dirty_old_rstats
) {
1805 const auto &old
= cur
->get_old_inodes()->at(p
);
1806 snapid_t ofirst
= std::max(old
.first
, floor
);
1807 auto it
= snaps
.lower_bound(ofirst
);
1808 if (it
== snaps
.end() || *it
> p
)
1811 _project_rstat_inode_to_frag(&old
.inode
, ofirst
, p
, parent
, 0, false);
1814 cur
->dirty_old_rstats
.clear();
1818 void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode
* inode
, snapid_t ofirst
, snapid_t last
,
1819 CDir
*parent
, int linkunlink
, bool update_inode
)
1821 dout(10) << "_project_rstat_inode_to_frag [" << ofirst
<< "," << last
<< "]" << dendl
;
1822 dout(20) << " inode rstat " << inode
->rstat
<< dendl
;
1823 dout(20) << " inode accounted_rstat " << inode
->accounted_rstat
<< dendl
;
1825 if (linkunlink
== 0) {
1826 delta
.add(inode
->rstat
);
1827 delta
.sub(inode
->accounted_rstat
);
1828 } else if (linkunlink
< 0) {
1829 delta
.sub(inode
->accounted_rstat
);
1831 delta
.add(inode
->rstat
);
1833 dout(20) << " delta " << delta
<< dendl
;
1836 while (last
>= ofirst
) {
1838 * pick fnode version to update. at each iteration, we want to
1839 * pick a segment ending in 'last' to update. split as necessary
1840 * to make that work. then, adjust first up so that we only
1841 * update one segment at a time. then loop to cover the whole
1842 * [ofirst,last] interval.
1844 nest_info_t
*prstat
;
1846 auto pf
= parent
->_get_projected_fnode();
1847 if (last
== CEPH_NOSNAP
) {
1848 if (g_conf()->mds_snap_rstat
)
1849 first
= std::max(ofirst
, parent
->first
);
1851 first
= parent
->first
;
1852 prstat
= &pf
->rstat
;
1853 dout(20) << " projecting to head [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1855 if (first
> parent
->first
&&
1856 !(pf
->rstat
== pf
->accounted_rstat
)) {
1857 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1858 << parent
->first
<< "," << (first
-1) << "] "
1859 << " " << *prstat
<< "/" << pf
->accounted_rstat
1861 parent
->dirty_old_rstat
[first
-1].first
= parent
->first
;
1862 parent
->dirty_old_rstat
[first
-1].rstat
= pf
->rstat
;
1863 parent
->dirty_old_rstat
[first
-1].accounted_rstat
= pf
->accounted_rstat
;
1865 parent
->first
= first
;
1866 } else if (!g_conf()->mds_snap_rstat
) {
1867 // drop snapshots' rstats
1869 } else if (last
>= parent
->first
) {
1870 first
= parent
->first
;
1871 parent
->dirty_old_rstat
[last
].first
= first
;
1872 parent
->dirty_old_rstat
[last
].rstat
= pf
->rstat
;
1873 parent
->dirty_old_rstat
[last
].accounted_rstat
= pf
->accounted_rstat
;
1874 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
1875 dout(10) << " projecting to newly split dirty_old_fnode [" << first
<< "," << last
<< "] "
1876 << " " << *prstat
<< "/" << pf
->accounted_rstat
<< dendl
;
1878 // be careful, dirty_old_rstat is a _sparse_ map.
1879 // sorry, this is ugly.
1882 // find any intersection with last
1883 auto it
= parent
->dirty_old_rstat
.lower_bound(last
);
1884 if (it
== parent
->dirty_old_rstat
.end()) {
1885 dout(20) << " no dirty_old_rstat with last >= last " << last
<< dendl
;
1886 if (!parent
->dirty_old_rstat
.empty() && parent
->dirty_old_rstat
.rbegin()->first
>= first
) {
1887 dout(20) << " last dirty_old_rstat ends at " << parent
->dirty_old_rstat
.rbegin()->first
<< dendl
;
1888 first
= parent
->dirty_old_rstat
.rbegin()->first
+1;
1891 // *it last is >= last
1892 if (it
->second
.first
<= last
) {
1893 // *it intersects [first,last]
1894 if (it
->second
.first
< first
) {
1895 dout(10) << " splitting off left bit [" << it
->second
.first
<< "," << first
-1 << "]" << dendl
;
1896 parent
->dirty_old_rstat
[first
-1] = it
->second
;
1897 it
->second
.first
= first
;
1899 if (it
->second
.first
> first
)
1900 first
= it
->second
.first
;
1901 if (last
< it
->first
) {
1902 dout(10) << " splitting off right bit [" << last
+1 << "," << it
->first
<< "]" << dendl
;
1903 parent
->dirty_old_rstat
[last
] = it
->second
;
1904 it
->second
.first
= last
+1;
1907 // *it is to the _right_ of [first,last]
1908 it
= parent
->dirty_old_rstat
.lower_bound(first
);
1909 // new *it last is >= first
1910 if (it
->second
.first
<= last
&& // new *it isn't also to the right, and
1911 it
->first
>= first
) { // it intersects our first bit,
1912 dout(10) << " staying to the right of [" << it
->second
.first
<< "," << it
->first
<< "]..." << dendl
;
1913 first
= it
->first
+1;
1915 dout(10) << " projecting to new dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1918 dout(20) << " projecting to dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1919 parent
->dirty_old_rstat
[last
].first
= first
;
1920 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
1924 dout(20) << " project to [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1925 ceph_assert(last
>= first
);
1927 dout(20) << " result [" << first
<< "," << last
<< "] " << *prstat
<< " " << *parent
<< dendl
;
1933 auto _inode
= const_cast<CInode::mempool_inode
*>(inode
);
1934 _inode
->accounted_rstat
= _inode
->rstat
;
1938 void MDCache::project_rstat_frag_to_inode(const nest_info_t
& rstat
,
1939 const nest_info_t
& accounted_rstat
,
1940 snapid_t ofirst
, snapid_t last
,
1941 CInode
*pin
, bool cow_head
)
1943 dout(10) << "project_rstat_frag_to_inode [" << ofirst
<< "," << last
<< "]" << dendl
;
1944 dout(20) << " frag rstat " << rstat
<< dendl
;
1945 dout(20) << " frag accounted_rstat " << accounted_rstat
<< dendl
;
1946 nest_info_t delta
= rstat
;
1947 delta
.sub(accounted_rstat
);
1948 dout(20) << " delta " << delta
<< dendl
;
1950 CInode::old_inode_map_ptr _old_inodes
;
1951 while (last
>= ofirst
) {
1952 CInode::mempool_inode
*pi
;
1954 if (last
== pin
->last
) {
1955 pi
= pin
->_get_projected_inode();
1956 first
= std::max(ofirst
, pin
->first
);
1957 if (first
> pin
->first
) {
1958 auto& old
= pin
->cow_old_inode(first
-1, cow_head
);
1959 dout(20) << " cloned old_inode rstat is " << old
.inode
.rstat
<< dendl
;
1963 _old_inodes
= CInode::allocate_old_inode_map();
1964 if (pin
->is_any_old_inodes())
1965 *_old_inodes
= *pin
->get_old_inodes();
1967 if (last
>= pin
->first
) {
1969 pin
->cow_old_inode(last
, cow_head
);
1971 // our life is easier here because old_inodes is not sparse
1972 // (although it may not begin at snapid 1)
1973 auto it
= _old_inodes
->lower_bound(last
);
1974 if (it
== _old_inodes
->end()) {
1975 dout(10) << " no old_inode <= " << last
<< ", done." << dendl
;
1978 first
= it
->second
.first
;
1980 dout(10) << " oldest old_inode is [" << first
<< "," << it
->first
<< "], done." << dendl
;
1981 //assert(p == pin->old_inodes.begin());
1984 if (it
->first
> last
) {
1985 dout(10) << " splitting right old_inode [" << first
<< "," << it
->first
<< "] to ["
1986 << (last
+1) << "," << it
->first
<< "]" << dendl
;
1987 (*_old_inodes
)[last
] = it
->second
;
1988 it
->second
.first
= last
+1;
1989 pin
->dirty_old_rstats
.insert(it
->first
);
1992 if (first
< ofirst
) {
1993 dout(10) << " splitting left old_inode [" << first
<< "," << last
<< "] to ["
1994 << first
<< "," << ofirst
-1 << "]" << dendl
;
1995 (*_old_inodes
)[ofirst
-1] = (*_old_inodes
)[last
];
1996 pin
->dirty_old_rstats
.insert(ofirst
-1);
1997 (*_old_inodes
)[last
].first
= first
= ofirst
;
1999 pi
= &(*_old_inodes
)[last
].inode
;
2000 pin
->dirty_old_rstats
.insert(last
);
2002 dout(20) << " projecting to [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
2003 pi
->rstat
.add(delta
);
2004 dout(20) << " result [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
2009 pin
->reset_old_inodes(std::move(_old_inodes
));
2012 void MDCache::broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
, bool quota_change
)
2014 if (!(mds
->is_active() || mds
->is_stopping()))
2017 if (!in
->is_auth() || in
->is_frozen())
2020 const auto& pi
= in
->get_projected_inode();
2021 if (!pi
->quota
.is_enable() && !quota_change
)
2024 // creaete snaprealm for quota inode (quota was set before mimic)
2025 if (!in
->get_projected_srnode())
2026 mds
->server
->create_quota_realm(in
);
2028 for (auto &p
: in
->client_caps
) {
2029 Capability
*cap
= &p
.second
;
2030 if (cap
->is_noquota())
2033 if (exclude_ct
>= 0 && exclude_ct
!= p
.first
)
2036 if (cap
->last_rbytes
== pi
->rstat
.rbytes
&&
2037 cap
->last_rsize
== pi
->rstat
.rsize())
2040 if (pi
->quota
.max_files
> 0) {
2041 if (pi
->rstat
.rsize() >= pi
->quota
.max_files
)
2044 if ((abs(cap
->last_rsize
- pi
->quota
.max_files
) >> 4) <
2045 abs(cap
->last_rsize
- pi
->rstat
.rsize()))
2049 if (pi
->quota
.max_bytes
> 0) {
2050 if (pi
->rstat
.rbytes
> pi
->quota
.max_bytes
- (pi
->quota
.max_bytes
>> 3))
2053 if ((abs(cap
->last_rbytes
- pi
->quota
.max_bytes
) >> 4) <
2054 abs(cap
->last_rbytes
- pi
->rstat
.rbytes
))
2061 cap
->last_rsize
= pi
->rstat
.rsize();
2062 cap
->last_rbytes
= pi
->rstat
.rbytes
;
2064 auto msg
= make_message
<MClientQuota
>();
2065 msg
->ino
= in
->ino();
2066 msg
->rstat
= pi
->rstat
;
2067 msg
->quota
= pi
->quota
;
2068 mds
->send_message_client_counted(msg
, cap
->get_session());
2070 for (const auto &it
: in
->get_replicas()) {
2071 auto msg
= make_message
<MGatherCaps
>();
2072 msg
->ino
= in
->ino();
2073 mds
->send_message_mds(msg
, it
.first
);
2078 * NOTE: we _have_ to delay the scatter if we are called during a
2079 * rejoin, because we can't twiddle locks between when the
2080 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2081 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2082 * (no requests), and a survivor acks immediately. _except_ that
2083 * during rejoin_(weak|strong) processing, we may complete a lock
2084 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2085 * scatterlock state in that case or the lock states will get out of
2086 * sync between the auth and replica.
2088 * the simple solution is to never do the scatter here. instead, put
2089 * the scatterlock on a list if it isn't already wrlockable. this is
2090 * probably the best plan anyway, since we avoid too many
2091 * scatters/locks under normal usage.
2094 * some notes on dirlock/nestlock scatterlock semantics:
2096 * the fragstat (dirlock) will never be updated without
2097 * dirlock+nestlock wrlock held by the caller.
2099 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2100 * data is pushed up the tree. this could be changed with some
2101 * restructuring here, but in its current form we ensure that the
2102 * fragstat+rstat _always_ reflect an accurrate summation over the dir
2103 * frag, which is nice. and, we only need to track frags that need to
2104 * be nudged (and not inodes with pending rstat changes that need to
2105 * be pushed into the frag). a consequence of this is that the
2106 * accounted_rstat on scatterlock sync may not match our current
2107 * rstat. this is normal and expected.
2109 void MDCache::predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
2110 CInode
*in
, CDir
*parent
,
2111 int flags
, int linkunlink
,
2114 bool primary_dn
= flags
& PREDIRTY_PRIMARY
;
2115 bool do_parent_mtime
= flags
& PREDIRTY_DIR
;
2116 bool shallow
= flags
& PREDIRTY_SHALLOW
;
2118 ceph_assert(mds
->mdlog
->entry_is_open());
2120 // make sure stamp is set
2121 if (mut
->get_mds_stamp() == utime_t())
2122 mut
->set_mds_stamp(ceph_clock_now());
2127 dout(10) << "predirty_journal_parents"
2128 << (do_parent_mtime
? " do_parent_mtime":"")
2129 << " linkunlink=" << linkunlink
2130 << (primary_dn
? " primary_dn":" remote_dn")
2131 << (shallow
? " SHALLOW":"")
2132 << " follows " << cfollows
2133 << " " << *in
<< dendl
;
2136 ceph_assert(primary_dn
);
2137 parent
= in
->get_projected_parent_dn()->get_dir();
2140 if (flags
== 0 && linkunlink
== 0) {
2141 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl
;
2142 blob
->add_dir_context(parent
);
2146 // build list of inodes to wrlock, dirty, and update
2149 CDentry
*parentdn
= NULL
;
2152 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2153 ceph_assert(parent
->is_auth());
2155 // opportunistically adjust parent dirfrag
2156 CInode
*pin
= parent
->get_inode();
2159 mut
->auth_pin(parent
);
2161 auto pf
= parent
->project_fnode(mut
);
2162 pf
->version
= parent
->pre_dirty();
2164 if (do_parent_mtime
|| linkunlink
) {
2165 ceph_assert(mut
->is_wrlocked(&pin
->filelock
));
2166 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
));
2167 ceph_assert(cfollows
== CEPH_NOSNAP
);
2169 // update stale fragstat/rstat?
2170 parent
->resync_accounted_fragstat();
2171 parent
->resync_accounted_rstat();
2173 if (do_parent_mtime
) {
2174 pf
->fragstat
.mtime
= mut
->get_op_stamp();
2175 pf
->fragstat
.change_attr
++;
2176 dout(10) << "predirty_journal_parents bumping change_attr to " << pf
->fragstat
.change_attr
<< " on " << parent
<< dendl
;
2177 if (pf
->fragstat
.mtime
> pf
->rstat
.rctime
) {
2178 dout(10) << "predirty_journal_parents updating mtime on " << *parent
<< dendl
;
2179 pf
->rstat
.rctime
= pf
->fragstat
.mtime
;
2181 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent
<< dendl
;
2185 dout(10) << "predirty_journal_parents updating size on " << *parent
<< dendl
;
2187 pf
->fragstat
.nsubdirs
+= linkunlink
;
2188 //pf->rstat.rsubdirs += linkunlink;
2190 pf
->fragstat
.nfiles
+= linkunlink
;
2191 //pf->rstat.rfiles += linkunlink;
2198 // don't update parent this pass
2199 } else if (!linkunlink
&& !(pin
->nestlock
.can_wrlock(-1) &&
2200 pin
->versionlock
.can_wrlock())) {
2201 dout(20) << " unwritable parent nestlock " << pin
->nestlock
2202 << ", marking dirty rstat on " << *cur
<< dendl
;
2203 cur
->mark_dirty_rstat();
2205 // if we don't hold a wrlock reference on this nestlock, take one,
2206 // because we are about to write into the dirfrag fnode and that needs
2207 // to commit before the lock can cycle.
2209 ceph_assert(pin
->nestlock
.get_num_wrlocks() || mut
->is_peer());
2212 if (!mut
->is_wrlocked(&pin
->nestlock
)) {
2213 dout(10) << " taking wrlock on " << pin
->nestlock
<< " on " << *pin
<< dendl
;
2214 mds
->locker
->wrlock_force(&pin
->nestlock
, mut
);
2217 // now we can project the inode rstat diff the dirfrag
2218 SnapRealm
*prealm
= pin
->find_snaprealm();
2220 snapid_t follows
= cfollows
;
2221 if (follows
== CEPH_NOSNAP
)
2222 follows
= prealm
->get_newest_seq();
2224 snapid_t first
= follows
+1;
2226 // first, if the frag is stale, bring it back in sync.
2227 parent
->resync_accounted_rstat();
2229 // now push inode rstats into frag
2230 project_rstat_inode_to_frag(mut
, cur
, parent
, first
, linkunlink
, prealm
);
2231 cur
->clear_dirty_rstat();
2235 if (!pin
->is_auth() || (!mut
->is_auth_pinned(pin
) && !pin
->can_auth_pin())) {
2236 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin
<< dendl
;
2240 // delay propagating until later?
2241 if (!stop
&& !first
&&
2242 g_conf()->mds_dirstat_min_interval
> 0) {
2243 double since_last_prop
= mut
->get_mds_stamp() - pin
->last_dirstat_prop
;
2244 if (since_last_prop
< g_conf()->mds_dirstat_min_interval
) {
2245 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2246 << " < " << g_conf()->mds_dirstat_min_interval
2247 << ", stopping" << dendl
;
2250 dout(10) << "predirty_journal_parents last prop " << since_last_prop
<< " ago, continuing" << dendl
;
2254 // can cast only because i'm passing nowait=true in the sole user
2256 !mut
->is_wrlocked(&pin
->nestlock
) &&
2257 (!pin
->versionlock
.can_wrlock() || // make sure we can take versionlock, too
2258 !mds
->locker
->wrlock_try(&pin
->nestlock
, mut
)
2259 )) { // ** do not initiate.. see above comment **
2260 dout(10) << "predirty_journal_parents can't wrlock one of " << pin
->versionlock
<< " or " << pin
->nestlock
2261 << " on " << *pin
<< dendl
;
2265 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin
<< dendl
;
2266 mds
->locker
->mark_updated_scatterlock(&pin
->nestlock
);
2267 mut
->ls
->dirty_dirfrag_nest
.push_back(&pin
->item_dirty_dirfrag_nest
);
2268 mut
->add_updated_lock(&pin
->nestlock
);
2269 if (do_parent_mtime
|| linkunlink
) {
2270 mds
->locker
->mark_updated_scatterlock(&pin
->filelock
);
2271 mut
->ls
->dirty_dirfrag_dir
.push_back(&pin
->item_dirty_dirfrag_dir
);
2272 mut
->add_updated_lock(&pin
->filelock
);
2276 if (!mut
->is_wrlocked(&pin
->versionlock
))
2277 mds
->locker
->local_wrlock_grab(&pin
->versionlock
, mut
);
2279 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
) || mut
->is_peer());
2281 pin
->last_dirstat_prop
= mut
->get_mds_stamp();
2285 lsi
.push_front(pin
);
2287 pin
->pre_cow_old_inode(); // avoid cow mayhem!
2289 auto pi
= pin
->project_inode(mut
);
2290 pi
.inode
->version
= pin
->pre_dirty();
2293 if (do_parent_mtime
|| linkunlink
) {
2294 dout(20) << "predirty_journal_parents add_delta " << pf
->fragstat
<< dendl
;
2295 dout(20) << "predirty_journal_parents - " << pf
->accounted_fragstat
<< dendl
;
2296 bool touched_mtime
= false, touched_chattr
= false;
2297 pi
.inode
->dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2298 pf
->accounted_fragstat
= pf
->fragstat
;
2300 pi
.inode
->mtime
= pi
.inode
->ctime
= pi
.inode
->dirstat
.mtime
;
2302 pi
.inode
->change_attr
= pi
.inode
->dirstat
.change_attr
;
2303 dout(20) << "predirty_journal_parents gives " << pi
.inode
->dirstat
<< " on " << *pin
<< dendl
;
2305 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2306 if (pi
.inode
->dirstat
.size() < 0)
2307 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter
);
2308 if (pi
.inode
->dirstat
.size() != pf
->fragstat
.size()) {
2309 mds
->clog
->error() << "unmatched fragstat size on single dirfrag "
2310 << parent
->dirfrag() << ", inode has " << pi
.inode
->dirstat
2311 << ", dirfrag has " << pf
->fragstat
;
2313 // trust the dirfrag for now
2314 pi
.inode
->dirstat
= pf
->fragstat
;
2316 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter
);
2322 dout(10) << "predirty_journal_parents frag->inode on " << *parent
<< dendl
;
2324 // first, if the frag is stale, bring it back in sync.
2325 parent
->resync_accounted_rstat();
2327 if (g_conf()->mds_snap_rstat
) {
2328 for (auto &p
: parent
->dirty_old_rstat
) {
2329 project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
, p
.second
.first
,
2330 p
.first
, pin
, true);
2333 parent
->dirty_old_rstat
.clear();
2334 project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
, parent
->first
, CEPH_NOSNAP
, pin
, true);//false);
2336 pf
->accounted_rstat
= pf
->rstat
;
2338 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2339 if (pi
.inode
->rstat
.rbytes
!= pf
->rstat
.rbytes
) {
2340 mds
->clog
->error() << "unmatched rstat rbytes on single dirfrag "
2341 << parent
->dirfrag() << ", inode has " << pi
.inode
->rstat
2342 << ", dirfrag has " << pf
->rstat
;
2344 // trust the dirfrag for now
2345 pi
.inode
->rstat
= pf
->rstat
;
2347 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter
);
2351 parent
->check_rstats();
2352 broadcast_quota_to_client(pin
);
2357 parentdn
= pin
->get_projected_parent_dn();
2358 ceph_assert(parentdn
);
2359 parent
= parentdn
->get_dir();
2361 do_parent_mtime
= false;
2366 // now, stick it in the blob
2367 ceph_assert(parent
);
2368 ceph_assert(parent
->is_auth());
2369 blob
->add_dir_context(parent
);
2370 blob
->add_dir(parent
, true);
2371 for (const auto& in
: lsi
) {
2372 journal_dirty_inode(mut
.get(), blob
, in
);
2381 // ===================================
2386 * some handlers for leader requests with peers. we need to make
2387 * sure leader journal commits before we forget we leadered them and
2388 * remove them from the uncommitted_leaders map (used during recovery
2389 * to commit|abort peers).
/*
 * Log-completion context for a leader commit: once the ECommitted entry
 * for `reqid` has been journaled, notify the cache so it can drop its
 * uncommitted-leader record (see MDCache::_logged_leader_commit).
 *
 * NOTE(review): this excerpt omits the member declaration for `reqid`
 * (metareqid_t) and the struct's closing braces — confirm against the
 * full source.
 */
struct C_MDC_CommittedLeader : public MDCacheLogContext {
  // [member declaration(s) not visible in this excerpt]
  C_MDC_CommittedLeader(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  // Called when the journal write completes; r is the journal result code.
  void finish(int r) override {
    mdcache->_logged_leader_commit(reqid);
  }
};
2399 void MDCache::log_leader_commit(metareqid_t reqid
)
2401 dout(10) << "log_leader_commit " << reqid
<< dendl
;
2402 uncommitted_leaders
[reqid
].committing
= true;
2403 mds
->mdlog
->start_submit_entry(new ECommitted(reqid
),
2404 new C_MDC_CommittedLeader(this, reqid
));
2407 void MDCache::_logged_leader_commit(metareqid_t reqid
)
2409 dout(10) << "_logged_leader_commit " << reqid
<< dendl
;
2410 ceph_assert(uncommitted_leaders
.count(reqid
));
2411 uncommitted_leaders
[reqid
].ls
->uncommitted_leaders
.erase(reqid
);
2412 mds
->queue_waiters(uncommitted_leaders
[reqid
].waiters
);
2413 uncommitted_leaders
.erase(reqid
);
2418 void MDCache::committed_leader_peer(metareqid_t r
, mds_rank_t from
)
2420 dout(10) << "committed_leader_peer mds." << from
<< " on " << r
<< dendl
;
2421 ceph_assert(uncommitted_leaders
.count(r
));
2422 uncommitted_leaders
[r
].peers
.erase(from
);
2423 if (!uncommitted_leaders
[r
].recovering
&& uncommitted_leaders
[r
].peers
.empty())
2424 log_leader_commit(r
);
2427 void MDCache::logged_leader_update(metareqid_t reqid
)
2429 dout(10) << "logged_leader_update " << reqid
<< dendl
;
2430 ceph_assert(uncommitted_leaders
.count(reqid
));
2431 uncommitted_leaders
[reqid
].safe
= true;
2432 auto p
= pending_leaders
.find(reqid
);
2433 if (p
!= pending_leaders
.end()) {
2434 pending_leaders
.erase(p
);
2435 if (pending_leaders
.empty())
2436 process_delayed_resolve();
2441 * Leader may crash after receiving all peers' commit acks, but before journalling
2442 * the final commit. Peers may crash after journalling the peer commit, but before
2443 * sending commit ack to the leader. Commit leaders with no uncommitted peer when
2446 void MDCache::finish_committed_leaders()
2448 for (map
<metareqid_t
, uleader
>::iterator p
= uncommitted_leaders
.begin();
2449 p
!= uncommitted_leaders
.end();
2451 p
->second
.recovering
= false;
2452 if (!p
->second
.committing
&& p
->second
.peers
.empty()) {
2453 dout(10) << "finish_committed_leaders " << p
->first
<< dendl
;
2454 log_leader_commit(p
->first
);
2460 * at end of resolve... we must journal a commit|abort for all peer
2461 * updates, before moving on.
2463 * this is so that the leader can safely journal ECommitted on ops it
2464 * leaders when it reaches up:active (all other recovering nodes must
2465 * complete resolve before that happens).
/*
 * Log-completion context for a peer-side commit: once the peer's commit
 * entry is journaled, notify the cache so it can ack the leader (see
 * MDCache::_logged_peer_commit).
 *
 * NOTE(review): this excerpt omits the member declarations for `from`
 * (mds_rank_t) and `reqid` (metareqid_t) and the struct's closing
 * braces — confirm against the full source.
 */
struct C_MDC_PeerCommit : public MDCacheLogContext {
  // [member declaration(s) not visible in this excerpt]
  C_MDC_PeerCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  // Called when the journal write completes; r is the journal result code.
  void finish(int r) override {
    mdcache->_logged_peer_commit(from, reqid);
  }
};
2476 void MDCache::_logged_peer_commit(mds_rank_t from
, metareqid_t reqid
)
2478 dout(10) << "_logged_peer_commit from mds." << from
<< " " << reqid
<< dendl
;
2481 auto req
= make_message
<MMDSPeerRequest
>(reqid
, 0, MMDSPeerRequest::OP_COMMITTED
);
2482 mds
->send_message_mds(req
, from
);
2490 // ====================================================================
2491 // import map, recovery
/*
 * Move bound `df` from `oldparent`'s bound list to `newparent`'s in the
 * given subtree map (used while building an ESubtreeMap that reflects
 * projected renames).
 *
 * NOTE(review): this excerpt drops several original lines (the erase
 * and append statements, closing braces); gaps are marked inline and
 * framing braces are restored for readability. Confirm against the
 * full source.
 */
void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
				      map<dirfrag_t,vector<dirfrag_t> >& subtrees)
{
  if (subtrees.count(oldparent)) {
    vector<dirfrag_t>& v = subtrees[oldparent];
    dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
    for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
      // [gap in excerpt: loop body — presumably erases df from v; confirm]
    // [gap in excerpt: closing brace]
  }
  if (subtrees.count(newparent)) {
    vector<dirfrag_t>& v = subtrees[newparent];
    dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
    // [gap in excerpt: presumably appends df to v; confirm]
  }
}
/*
 * Build an ESubtreeMap journal event describing this rank's auth
 * subtrees, their bounds, and any ambiguous (mid-migration) imports,
 * adjusted for projected renames, then simplify nested claims so that
 * replay sees a map comparable to the live one.
 *
 * NOTE(review): this excerpt drops a number of original lines (early
 * `continue`s, some statements and arguments, closing braces, the
 * final `return`). Gaps are marked inline; framing braces are restored
 * for readability. Confirm against the full source.
 */
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
	   << num_subtrees_fullauth() << " fullauth"
  // [gap in excerpt: remainder of this dout statement and any preceding setup]

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  map<dirfrag_t, CDir*> dirs_to_add;
  // [gap in excerpt]
  CDir* mydir = myin->get_dirfrag(frag_t());
  dirs_to_add[mydir->dirfrag()] = mydir;

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (auto& [dir, bounds] : subtrees) {
    // journal subtree as "ours" if we are
    // me, !me (may be importing and ambiguous!)
    if (dir->get_dir_auth().first != mds->get_nodeid())
      // [gap in excerpt: presumably `continue;`]
    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
	my_ambiguous_imports.count(dir->dirfrag())) {
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    // [gap in excerpt: presumably `} else {`]
      dout(15) << " auth subtree " << *dir << dendl;
    }
    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();

    // bounds for this subtree
    size_t nbounds = bounds.size();
    // [gap in excerpt]
    dout(15) << " subtree has " << nbounds << " bounds" << dendl;
    for (auto& bound : bounds) {
      dout(15) << " subtree bound " << *bound << dendl;
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames
  for (const auto& [diri, renames] : projected_subtree_renames) {
    for (const auto& [olddir, newdir] : renames) {
      dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;
      auto&& dfls = diri->get_dirfrags();
      for (const auto& dir : dfls) {
	dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
	CDir *oldparent = get_projected_subtree_root(olddir);
	dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
	CDir *newparent = get_projected_subtree_root(newdir);
	dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

	if (oldparent == newparent) {
	  dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
		   << oldparent->dirfrag() << dendl;
	  // [gap in excerpt: presumably `continue;` and closing brace]
	}

	if (dir->is_subtree_root()) {
	  if (le->subtrees.count(newparent->dirfrag()) &&
	      oldparent->get_dir_auth() != newparent->get_dir_auth())
	    dirs_to_add[dir->dirfrag()] = dir;
	  // children are fine. change parent.
	  _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
	  // [gap in excerpt: final argument, then presumably `} else {`]
	  if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
	    dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
	    // if oldparent is auth, subtree is mine; include it.
	    if (le->subtrees.count(oldparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[dir->dirfrag()].clear();
	    }
	    // if newparent is auth, subtree is a new bound
	    if (le->subtrees.count(newparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
	    }
	    // see if any old bounds move to the new parent.
	    for (auto& bound : subtrees.at(oldparent)) {
	      if (dir->contains(bound->get_parent_dir()))
		_move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
	      // [gap in excerpt: final argument and closing braces]
	    }
	  }
	}
      }
    }
  }

  // simplify the journaled map. our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing. but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (auto& [frag, bfrags] : le->subtrees) {
    if (le->ambiguous_subtrees.count(frag))
      // [gap in excerpt: presumably `continue;` and the declaration of `i`]
    while (i < bfrags.size()) {
      dirfrag_t b = bfrags[i];
      if (le->subtrees.count(b) &&
	  le->ambiguous_subtrees.count(b) == 0) {
	auto& bb = le->subtrees.at(b);
	dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
	for (auto& r : bb) {
	  bfrags.push_back(r);
	}
	dirs_to_add.erase(b);
	le->subtrees.erase(b);
	bfrags.erase(bfrags.begin() + i);
      // [gap in excerpt: presumably `} else { ++i; }` and loop close]
    }
  }

  // journal enough dirfrag context to tie everything to the root
  for (auto &p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  // [gap in excerpt: presumably `return le;`]
}
2663 void MDCache::dump_resolve_status(Formatter
*f
) const
2665 f
->open_object_section("resolve_status");
2666 f
->dump_stream("resolve_gather") << resolve_gather
;
2667 f
->dump_stream("resolve_ack_gather") << resolve_gather
;
/*
 * Enter the resolve phase: remember the completion context, demote our
 * view of the root subtree's authority if we are not the root rank, and
 * snapshot the peers (and snapclient tids) we must gather resolves for.
 *
 * NOTE(review): this excerpt drops some guard lines around the rootdir
 * handling; gaps are marked inline. Confirm against the full source.
 */
void MDCache::resolve_start(MDSContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  ceph_assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN. during
    // resolve we want mds0 to explicit claim the portion of it that
    // it owns, so that anything beyond its bounds get left as
    // [gap in excerpt: presumably a null-check guard]
    CDir *rootdir = root->get_dirfrag(frag_t());
    // [gap in excerpt: presumably `if (rootdir)`]
    adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  resolve_gather = recovery_set;

  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
}
/*
 * Send resolve messages: first per-request peer resolves, then (once
 * the snap cache is synced and all acks/rollbacks are in) the subtree
 * resolves.
 *
 * NOTE(review): this excerpt drops the early-return statements after
 * each "still waiting" branch and the lambda's closing tokens; gaps are
 * marked inline. Confirm against the full source.
 */
void MDCache::send_resolves()
{
  send_peer_resolves();

  if (!resolve_done) {
    // I'm survivor: refresh snap cache
    mds->snapclient->sync(
	new MDSInternalContextWrapper(mds,
	  new LambdaContext([this](int r) {
	    maybe_finish_peer_resolve();
	  // [gap in excerpt: lambda/context close — presumably `})));`]
    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
    // [gap in excerpt: presumably `return;`]
  }
  if (!resolve_ack_gather.empty()) {
    dout(10) << "send_resolves still waiting for resolve ack from ("
	     << resolve_ack_gather << ")" << dendl;
    // [gap in excerpt: presumably `return;`]
  }
  if (!resolve_need_rollback.empty()) {
    dout(10) << "send_resolves still waiting for rollback to commit on ("
	     << resolve_need_rollback << ")" << dendl;
    // [gap in excerpt: presumably `return;`]
  }
  send_subtree_resolves();
}
/*
 * Send per-request resolve messages to leader ranks: if we are in the
 * resolve state, report every uncommitted peer update; if we are a
 * survivor, report uncommitted peer requests to leaders that are
 * resolving (re-sending cap exports for rename inode exporters).
 * Every recipient is added to resolve_ack_gather.
 *
 * NOTE(review): this excerpt drops several lines (loop increments,
 * `continue`s, a bufferlist declaration, closing braces); gaps are
 * marked inline and framing braces are restored. Confirm against the
 * full source.
 */
void MDCache::send_peer_resolves()
{
  dout(10) << "send_peer_resolves" << dendl;

  map<mds_rank_t, ref_t<MMDSResolve>> resolves;

  if (mds->is_resolve()) {
    // in resolve: report all uncommitted peer updates
    for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin();
	 p != uncommitted_peers.end();
	 // [gap in excerpt: presumably `++p) {`]
      mds_rank_t leader = p->second.leader;
      auto &m = resolves[leader];
      if (!m) m = make_message<MMDSResolve>();
      m->add_peer_request(p->first, false);
    }
  // [gap in excerpt: presumably `} else {` — survivor path]
    set<mds_rank_t> resolve_set;
    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 // [gap in excerpt: presumably `++p) {`]
      MDRequestRef& mdr = p->second;
      if (!mdr->is_peer())
	// [gap in excerpt: presumably `continue;`]
      if (!mdr->peer_did_prepare() && !mdr->committing) {
	// [gap in excerpt: presumably `continue;` and closing brace]
      mds_rank_t leader = mdr->peer_to_mds;
      if (resolve_set.count(leader) || is_ambiguous_peer_update(p->first, leader)) {
	dout(10) << " including uncommitted " << *mdr << dendl;
	if (!resolves.count(leader))
	  resolves[leader] = make_message<MMDSResolve>();
	if (!mdr->committing &&
	    mdr->has_more() && mdr->more()->is_inode_exporter) {
	  // re-send cap exports
	  CInode *in = mdr->more()->rename_inode;
	  map<client_t, Capability::Export> cap_map;
	  in->export_client_caps(cap_map);
	  // [gap in excerpt: presumably `bufferlist bl;`]
	  MMDSResolve::peer_inode_cap inode_caps(in->ino(), cap_map);
	  encode(inode_caps, bl);
	  resolves[leader]->add_peer_request(p->first, bl);
	// [gap in excerpt: presumably `} else {`]
	  resolves[leader]->add_peer_request(p->first, mdr->committing);
	}
      }
    }
  }

  for (auto &p : resolves) {
    dout(10) << "sending peer resolve to mds." << p.first << dendl;
    mds->send_message_mds(p.second, p.first);
    resolve_ack_gather.insert(p.first);
  }
}
/*
 * Send subtree resolve messages to every recovering/resolving rank:
 * claim our auth subtrees (with bounds), report ambiguous mid-import
 * subtrees separately, simplify nested claims, attach snap-table
 * commit info, and send. Deferred (resolves_pending) while any
 * migration is still in flight.
 *
 * NOTE(review): this excerpt drops several lines (an early `return`,
 * loop increments, `continue`s, a bounds-set declaration, closing
 * braces); gaps are marked inline and framing braces restored.
 * Confirm against the full source.
 */
void MDCache::send_subtree_resolves()
{
  dout(10) << "send_subtree_resolves" << dendl;

  if (migrator->is_exporting() || migrator->is_importing()) {
    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
    migrator->show_importing();
    migrator->show_exporting();
    resolves_pending = true;
    // [gap in excerpt: presumably `return;` and closing brace]
  }

  map<mds_rank_t, ref_t<MMDSResolve>> resolves;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       // [gap in excerpt: presumably `++p) {`]
    if (*p == mds->get_nodeid())
      // [gap in excerpt: presumably `continue;`]
    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
      resolves[*p] = make_message<MMDSResolve>();
  }

  map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
  map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;

  // claims / ambiguous imports from the live subtree map
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       // [gap in excerpt: presumably `++p) {`]
    CDir *dir = p->first;

    // only our subtrees
    if (dir->authority().first != mds->get_nodeid())
      // [gap in excerpt: presumably `continue;`]
    if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
      continue;  // we'll add it below
    if (migrator->is_ambiguous_import(dir->dirfrag())) {
      // ambiguous (mid-import)
      // [gap in excerpt: presumably `set<CDir*> bounds;`]
      get_subtree_bounds(dir, bounds);
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
	dfls.push_back((*q)->dirfrag());
      // [gap in excerpt]
      my_ambig_imports[dir->dirfrag()] = dfls;
      dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
    // [gap in excerpt: presumably `} else {` — unambiguous claim]
      for (auto &q : resolves) {
	resolves[q.first]->add_subtree(dir->dirfrag());
      }
      // bounds too
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = subtrees[dir].begin();
	   q != subtrees[dir].end();
	   // [gap in excerpt: presumably `++q) {` and a `CDir *bound = *q;`]
	dfls.push_back(bound->dirfrag());
      // [gap in excerpt: closing brace]
      my_subtrees[dir->dirfrag()] = dfls;
      dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
    }
  }

  // ambiguous imports remembered across restart
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
       p != my_ambiguous_imports.end();
       // [gap in excerpt: presumably `++p) {`]
    my_ambig_imports[p->first] = p->second;
    dout(10) << " ambig " << p->first << " " << p->second << dendl;
  }

  // simplify the claimed subtree.
  for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
    // [gap in excerpt: presumably `unsigned i = 0;`]
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      if (my_subtrees.count(b)) {
	vector<dirfrag_t>& bb = my_subtrees[b];
	dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
	for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
	  p->second.push_back(*r);
	my_subtrees.erase(b);
	p->second.erase(p->second.begin() + i);
      // [gap in excerpt: presumably `} else { ++i; }` and loop close]
    }
  }

  // send the messages
  for (auto &p : resolves) {
    const ref_t<MMDSResolve> &m = p.second;
    if (mds->is_resolve()) {
      m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
    // [gap in excerpt: presumably `} else {`]
      m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
    }
    m->subtrees = my_subtrees;
    m->ambiguous_imports = my_ambig_imports;
    // NOTE(review): "subtee" typo in the log message below — runtime
    // string preserved unchanged here.
    dout(10) << "sending subtee resolve to mds." << p.first << dendl;
    mds->send_message_mds(m, p.first);
  }
  resolves_pending = false;
}
2886 void MDCache::maybe_finish_peer_resolve() {
2887 if (resolve_ack_gather
.empty() && resolve_need_rollback
.empty()) {
2888 // snap cache get synced or I'm in resolve state
2889 if (mds
->snapclient
->is_synced() || resolve_done
)
2890 send_subtree_resolves();
2891 process_delayed_resolve();
/*
 * React to another rank's failure: restart resolve/rejoin gathering for
 * it, notify the migrator and balancer, clean up or re-arm any
 * leader/peer requests involving it, drop its uncommitted-leader acks,
 * kick ino lookups, and cancel or unwind in-flight dirfrag fragmenting
 * that was waiting on it.
 *
 * NOTE(review): this excerpt drops a number of original lines (loop
 * increments, `else` arms, pops, closing braces); gaps are marked
 * inline and framing braces restored. Confirm against the full source.
 */
void MDCache::handle_mds_failure(mds_rank_t who)
{
  dout(7) << "handle_mds_failure mds." << who << dendl;
  dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;

  // restart resolve/rejoin bookkeeping for the failed rank
  resolve_gather.insert(who);
  discard_delayed_resolve(who);
  ambiguous_peer_updates.erase(who);

  rejoin_gather.insert(who);
  rejoin_sent.erase(who);        // i need to send another
  rejoin_ack_sent.erase(who);    // i need to send another
  rejoin_ack_gather.erase(who);  // i'll need/get another.

  dout(10) << " resolve_gather " << resolve_gather << dendl;
  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
  dout(10) << " rejoin_sent " << rejoin_sent << dendl;
  dout(10) << " rejoin_gather " << rejoin_gather << dendl;
  dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;

  // tell the migrator too.
  migrator->handle_mds_failure_or_stop(who);

  // tell the balancer too.
  mds->balancer->handle_mds_failure(who);

  // clean up any requests peer to/from this node
  list<MDRequestRef> finish;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       // [gap in excerpt: presumably `++p) {`]
    MDRequestRef& mdr = p->second;
    // peer to the failed node?
    if (mdr->peer_to_mds == who) {
      if (mdr->peer_did_prepare()) {
	dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
	if (is_ambiguous_peer_update(p->first, mdr->peer_to_mds))
	  remove_ambiguous_peer_update(p->first, mdr->peer_to_mds);
	// [gap in excerpt]
	if (!mdr->more()->waiting_on_peer.empty()) {
	  ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
	  // will rollback, no need to wait
	  mdr->reset_peer_request();
	  mdr->more()->waiting_on_peer.clear();
	}
      } else if (!mdr->committing) {
	dout(10) << " peer request " << *mdr << " has no prepare, finishing up" << dendl;
	if (mdr->peer_request || mdr->peer_rolling_back())
	  mdr->aborted = true;
	// [gap in excerpt: presumably `else`]
	finish.push_back(mdr);
      }
      // [gap in excerpt: closing brace(s)]
    if (mdr->is_peer() && mdr->peer_did_prepare()) {
      if (mdr->more()->waiting_on_peer.count(who)) {
	ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
	// NOTE(review): "notity" typo below is in the original log string.
	dout(10) << " peer request " << *mdr << " no longer need rename notity ack from mds."
	// [gap in excerpt: rest of this dout statement]
	mdr->more()->waiting_on_peer.erase(who);
	if (mdr->more()->waiting_on_peer.empty() && mdr->peer_request)
	  mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->more()->srcdn_auth_mds == who &&
	  mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->peer_to_mds)) {
	// rename srcdn's auth mds failed, resolve even I'm a survivor.
	dout(10) << " peer request " << *mdr << " uncommitted, will resolve shortly" << dendl;
	add_ambiguous_peer_update(p->first, mdr->peer_to_mds);
      }
    } else if (mdr->peer_request) {
      const cref_t<MMDSPeerRequest> &peer_req = mdr->peer_request;
      // FIXME: Peer rename request can arrive after we notice mds failure.
      // This can cause mds to crash (does not affect integrity of FS).
      if (peer_req->get_op() == MMDSPeerRequest::OP_RENAMEPREP &&
	  peer_req->srcdn_auth == who)
	peer_req->mark_interrupted();
    }

    // failed node is peer?
    if (mdr->is_leader() && !mdr->committing) {
      if (mdr->more()->srcdn_auth_mds == who) {
	dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
		 << who << " to recover" << dendl;
	ceph_assert(mdr->more()->witnessed.count(who) == 0);
	if (mdr->more()->is_ambiguous_auth)
	  mdr->clear_ambiguous_auth();
	// rename srcdn's auth mds failed, all witnesses will rollback
	mdr->more()->witnessed.clear();
	pending_leaders.erase(p->first);
      }

      if (mdr->more()->witnessed.count(who)) {
	mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
	if (srcdn_auth >= 0 && mdr->more()->waiting_on_peer.count(srcdn_auth)) {
	  dout(10) << " leader request " << *mdr << " waiting for rename srcdn's auth mds."
		   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
	  // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
	  // until either the request is committing or the peer also fails.
	  ceph_assert(mdr->more()->waiting_on_peer.size() == 1);
	  pending_leaders.insert(p->first);
	// [gap in excerpt: presumably `} else {`]
	  dout(10) << " leader request " << *mdr << " no longer witnessed by peer mds."
		   << who << " to recover" << dendl;
	  if (srcdn_auth >= 0)
	    ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);

	  // discard this peer's prepare (if any)
	  mdr->more()->witnessed.erase(who);
	}
      }

      if (mdr->more()->waiting_on_peer.count(who)) {
	dout(10) << " leader request " << *mdr << " waiting for peer mds." << who
		 << " to recover" << dendl;
	// retry request when peer recovers
	mdr->more()->waiting_on_peer.erase(who);
	if (mdr->more()->waiting_on_peer.empty())
	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->locking && mdr->locking_target_mds == who)
	mdr->finish_locking(mdr->locking);
    }
  }

  for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin();
       p != uncommitted_leaders.end();
       // [gap in excerpt: presumably `++p) {`]
    // The failed MDS may have already committed the peer update
    if (p->second.peers.count(who)) {
      p->second.recovering = true;
      p->second.peers.erase(who);
    }
  }

  while (!finish.empty()) {
    dout(10) << "cleaning up peer request " << *finish.front() << dendl;
    request_finish(finish.front());
    // [gap in excerpt: presumably `finish.pop_front();` and loop close]

  kick_find_ino_peers(who);
  kick_open_ino_peers(who);

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;

    if (info.is_fragmenting()) {
      if (info.notify_ack_waiting.erase(who) &&
	  info.notify_ack_waiting.empty()) {
	fragment_drop_locks(info);
	fragment_maybe_finish(p++);
      // [gap in excerpt: presumably the `else`/`continue` arms advancing p]
    dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
    std::vector<CDir*> dirs;
    info.dirs.swap(dirs);
    fragments.erase(df);
    fragment_unmark_unfreeze_dirs(dirs);
  }

  // MDCache::shutdown_export_strays() always exports strays to mds.0
  if (who == mds_rank_t(0))
    shutdown_exporting_strays.clear();
  // [gap in excerpt: trailing statements, e.g. a subtree dump, and closing brace]
}
3074 * handle_mds_recovery - called on another node's transition
3075 * from resolve -> active.
/*
 * Called when another rank transitions resolve -> active: walk the
 * subtrees it is authoritative for (that we replicate), collect dir,
 * dentry and inode waiters throughout those subtrees, kick pending ino
 * lookups, and requeue all the gathered waiters.
 *
 * NOTE(review): this excerpt drops several lines (loop increment, an
 * early `continue`, the queue push/pop statements); gaps are marked
 * inline and framing braces restored. Confirm against the full source.
 */
void MDCache::handle_mds_recovery(mds_rank_t who)
{
  dout(7) << "handle_mds_recovery mds." << who << dendl;

  // exclude all discover waiters. kick_discovers() will do the job
  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;

  MDSContext::vec waiters;

  // wake up any waiters in their subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       // [gap in excerpt: presumably `++p) {`]
    CDir *dir = p->first;

    // only subtrees the recovered rank is auth for, replicated here
    if (dir->authority().first != who ||
	dir->authority().second == mds->get_nodeid())
      // [gap in excerpt: presumably `continue;`]
    ceph_assert(!dir->is_auth());

    // breadth-first walk of the subtree, stopping at nested subtree roots
    std::queue<CDir*> q;
    // [gap in excerpt: presumably `q.push(dir);`]
    while (!q.empty()) {
      CDir *d = q.front();
      // [gap in excerpt: presumably `q.pop();`]
      d->take_waiting(d_mask, waiters);

      // inode waiters too
      for (auto &p : d->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();
	if (dnl->is_primary()) {
	  dnl->get_inode()->take_waiting(i_mask, waiters);

	  // descend into the primary inode's dirfrags
	  auto&& ls = dnl->get_inode()->get_dirfrags();
	  for (const auto& subdir : ls) {
	    if (!subdir->is_subtree_root())
	      // [gap in excerpt: presumably `q.push(subdir);`]
	  }
	}
      }
    }
  }

  kick_open_ino_peers(who);
  kick_find_ino_peers(who);

  // queue them up.
  mds->queue_waiters(waiters);
}
/*
 * Record the set of ranks we must gather resolves/rejoins from.
 *
 * NOTE(review): the assignment to the recovery_set member is not
 * visible in this excerpt — confirm against the full source.
 */
void MDCache::set_recovery_set(set<mds_rank_t>& s)
{
  dout(7) << "set_recovery_set " << s << dendl;
  // [gap in excerpt: presumably `recovery_set = s;`]
}
3140 * during resolve state, we share resolves to determine who
3141 * is authoritative for which trees. we expect to get an resolve
3142 * from _everyone_ in the recovery_set (the mds cluster at the time of
3143 * the first failure).
3145 * This functions puts the passed message before returning
// Handle an MMDSResolve message from another MDS during the resolve phase:
// answer its ambiguous peer requests with commit/abort, apply the subtrees it
// claims, note its ambiguous imports and pending table commits, then see if
// resolve can finish.
// NOTE(review): this chunk is extraction-mangled (lines split into fragments,
// brace/else lines dropped) -- comments only, code fragments left byte-identical.
3147 void MDCache::handle_resolve(const cref_t
<MMDSResolve
> &m
)
3149 dout(7) << "handle_resolve from " << m
->get_source() << dendl
;
3150 mds_rank_t from
= mds_rank_t(m
->get_source().num());
// Not yet in (or past) resolve: retry once we get there, or drop.
3152 if (mds
->get_state() < MDSMap::STATE_RESOLVE
) {
3153 if (mds
->get_want_state() == CEPH_MDS_STATE_RESOLVE
) {
3154 mds
->wait_for_resolve(new C_MDS_RetryMessage(mds
, m
));
3157 // wait until we reach the resolve stage!
3161 discard_delayed_resolve(from
);
3163 // ambiguous peer requests?
3164 if (!m
->peer_requests
.empty()) {
3165 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
// Survivor: any leader update that is not yet safe forces us to delay
// answering this peer's resolve until our own updates commit.
3166 for (auto p
= m
->peer_requests
.begin(); p
!= m
->peer_requests
.end(); ++p
) {
3167 if (uncommitted_leaders
.count(p
->first
) && !uncommitted_leaders
[p
->first
].safe
) {
3168 ceph_assert(!p
->second
.committing
);
3169 pending_leaders
.insert(p
->first
);
3173 if (!pending_leaders
.empty()) {
3174 dout(10) << " still have pending updates, delay processing peer resolve" << dendl
;
3175 delayed_resolve
[from
] = m
;
// Build the MMDSResolveAck telling the peer which requests to COMMIT vs ABORT.
3180 auto ack
= make_message
<MMDSResolveAck
>();
3181 for (const auto &p
: m
->peer_requests
) {
3182 if (uncommitted_leaders
.count(p
.first
)) { //mds->sessionmap.have_completed_request(p.first)) {
3184 if (p
.second
.committing
) {
3185 // already committing, waiting for the OP_COMMITTED peer reply
3186 dout(10) << " already committing peer request " << p
<< " noop "<< dendl
;
3188 dout(10) << " ambiguous peer request " << p
<< " will COMMIT" << dendl
;
3189 ack
->add_commit(p
.first
);
3191 uncommitted_leaders
[p
.first
].peers
.insert(from
); // wait for peer OP_COMMITTED before we log ECommitted
3193 if (p
.second
.inode_caps
.length() > 0) {
3194 // peer wants to export caps (rename)
3195 ceph_assert(mds
->is_resolve());
3196 MMDSResolve::peer_inode_cap inode_caps
;
3197 auto q
= p
.second
.inode_caps
.cbegin();
3198 decode(inode_caps
, q
);
3199 inodeno_t ino
= inode_caps
.ino
;
3200 map
<client_t
,Capability::Export
> cap_exports
= inode_caps
.cap_exports
;
3201 ceph_assert(get_inode(ino
));
3203 for (map
<client_t
,Capability::Export
>::iterator q
= cap_exports
.begin();
3204 q
!= cap_exports
.end();
3206 Capability::Import
& im
= rejoin_imported_caps
[from
][ino
][q
->first
];
3207 im
.cap_id
= ++last_cap_id
; // assign a new cap ID
3209 im
.mseq
= q
->second
.mseq
;
3211 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
3213 rejoin_client_map
.emplace(q
->first
, session
->info
.inst
);
3216 // will process these caps in rejoin stage
3217 rejoin_peer_exports
[ino
].first
= from
;
3218 rejoin_peer_exports
[ino
].second
.swap(cap_exports
);
3220 // send information of imported caps back to peer
3221 encode(rejoin_imported_caps
[from
][ino
], ack
->commit
[p
.first
]);
3225 dout(10) << " ambiguous peer request " << p
<< " will ABORT" << dendl
;
3226 ceph_assert(!p
.second
.committing
);
3227 ack
->add_abort(p
.first
);
3230 mds
->send_message(ack
, m
->get_connection());
// Subtree-map portion: defer while we still owe resolve acks or rollbacks.
3234 if (!resolve_ack_gather
.empty() || !resolve_need_rollback
.empty()) {
3235 dout(10) << "delay processing subtree resolve" << dendl
;
3236 delayed_resolve
[from
] = m
;
3240 bool survivor
= false;
3241 // am i a surviving ambiguous importer?
3242 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3244 // check for any import success/failure (from this node)
3245 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
3246 while (p
!= my_ambiguous_imports
.end()) {
3247 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator next
= p
;
3249 CDir
*dir
= get_dirfrag(p
->first
);
3251 dout(10) << "checking ambiguous import " << *dir
<< dendl
;
3252 if (migrator
->is_importing(dir
->dirfrag()) &&
3253 migrator
->get_import_peer(dir
->dirfrag()) == from
) {
3254 ceph_assert(migrator
->get_import_state(dir
->dirfrag()) == Migrator::IMPORT_ACKING
);
3256 // check if sender claims the subtree
3257 bool claimed_by_sender
= false;
3258 for (const auto &q
: m
->subtrees
) {
3259 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3260 CDir
*base
= get_force_dirfrag(q
.first
, false);
3261 if (!base
|| !base
->contains(dir
))
3262 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3266 get_force_dirfrag_bound_set(q
.second
, bounds
);
3267 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
3269 if (bound
->contains(dir
)) {
3270 inside
= false; // nope, bound is dir or parent of dir, not inside.
3275 claimed_by_sender
= true;
3278 my_ambiguous_imports
.erase(p
); // no longer ambiguous.
3279 if (claimed_by_sender
) {
3280 dout(7) << "ambiguous import failed on " << *dir
<< dendl
;
3281 migrator
->import_reverse(dir
);
3283 dout(7) << "ambiguous import succeeded on " << *dir
<< dendl
;
3284 migrator
->import_finish(dir
, true);
3291 // update my dir_auth values
3292 // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
3293 // migrations between other nodes)
3294 for (const auto& p
: m
->subtrees
) {
3295 dout(10) << "peer claims " << p
.first
<< " bounds " << p
.second
<< dendl
;
3296 CDir
*dir
= get_force_dirfrag(p
.first
, !survivor
);
3299 adjust_bounded_subtree_auth(dir
, p
.second
, from
);
3300 try_subtree_merge(dir
);
3305 // note ambiguous imports too
3306 for (const auto& p
: m
->ambiguous_imports
) {
3307 dout(10) << "noting ambiguous import on " << p
.first
<< " bounds " << p
.second
<< dendl
;
3308 other_ambiguous_imports
[from
][p
.first
] = p
.second
;
3311 // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
3312 // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
3313 for (const auto& p
: m
->table_clients
) {
3314 dout(10) << " noting " << get_mdstable_name(p
.type
)
3315 << " pending_commits " << p
.pending_commits
<< dendl
;
3316 MDSTableClient
*client
= mds
->get_table_client(p
.type
);
3317 for (const auto& q
: p
.pending_commits
)
3318 client
->notify_commit(q
);
3321 // did i get them all?
3322 resolve_gather
.erase(from
);
3324 maybe_resolve_finish();
// Re-run handle_resolve() on all parked resolve messages. The map is swapped
// into a local first so that a message that gets re-delayed does not loop.
3327 void MDCache::process_delayed_resolve()
3329 dout(10) << "process_delayed_resolve" << dendl
;
3330 map
<mds_rank_t
, cref_t
<MMDSResolve
>> tmp
;
3331 tmp
.swap(delayed_resolve
);
3332 for (auto &p
: tmp
) {
3333 handle_resolve(p
.second
);
// Drop any resolve message parked for rank 'who' (e.g. it restarted or we
// received a fresh resolve from it).
3337 void MDCache::discard_delayed_resolve(mds_rank_t who
)
3339 delayed_resolve
.erase(who
);
// Once every peer's resolve has arrived (and all acks/rollbacks are done),
// disambiguate imports, finish committed leaders and -- if we are in the
// resolve state -- trim unlinked inodes, recalc auth bits, and complete the
// resolve_done context; otherwise kick pending rejoins.
3342 void MDCache::maybe_resolve_finish()
3344 ceph_assert(resolve_ack_gather
.empty());
3345 ceph_assert(resolve_need_rollback
.empty());
3347 if (!resolve_gather
.empty()) {
3348 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3349 << resolve_gather
<< ")" << dendl
;
3353 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl
;
3354 disambiguate_my_imports();
3355 finish_committed_leaders();
3358 ceph_assert(mds
->is_resolve());
3359 trim_unlinked_inodes();
3360 recalc_auth_bits(false);
3361 resolve_done
.release()->complete(0);
3364 maybe_send_pending_rejoins();
// Handle the leader's MMDSResolveAck: journal/finish each request it tells us
// to COMMIT, and roll back each request it tells us to ABORT. When no
// ambiguous peer updates remain for that leader, retire it from
// resolve_ack_gather and maybe finish the peer resolve.
3368 void MDCache::handle_resolve_ack(const cref_t
<MMDSResolveAck
> &ack
)
3370 dout(10) << "handle_resolve_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
3371 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// Ignore acks we are not waiting for, or from a sender not yet in resolve.
3373 if (!resolve_ack_gather
.count(from
) ||
3374 mds
->mdsmap
->get_state(from
) < MDSMap::STATE_RESOLVE
) {
3378 if (ambiguous_peer_updates
.count(from
)) {
3379 ceph_assert(mds
->mdsmap
->is_clientreplay_or_active_or_stopping(from
));
3380 ceph_assert(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3383 for (const auto &p
: ack
->commit
) {
3384 dout(10) << " commit on peer " << p
.first
<< dendl
;
3386 if (ambiguous_peer_updates
.count(from
)) {
3387 remove_ambiguous_peer_update(p
.first
, from
);
3391 if (mds
->is_resolve()) {
// Recovering: journal the commit for the uncommitted peer update.
3393 MDPeerUpdate
*su
= get_uncommitted_peer(p
.first
, from
);
3397 mds
->mdlog
->start_submit_entry(new EPeerUpdate(mds
->mdlog
, "unknown", p
.first
, from
,
3398 EPeerUpdate::OP_COMMIT
, su
->origop
),
3399 new C_MDC_PeerCommit(this, from
, p
.first
));
3400 mds
->mdlog
->flush();
3402 finish_uncommitted_peer(p
.first
);
3404 MDRequestRef mdr
= request_get(p
.first
);
3405 // information about leader imported caps
3406 if (p
.second
.length() > 0)
3407 mdr
->more()->inode_import
.share(p
.second
);
3409 ceph_assert(mdr
->peer_request
== 0); // shouldn't be doing anything!
3410 request_finish(mdr
);
3414 for (const auto &metareq
: ack
->abort
) {
3415 dout(10) << " abort on peer " << metareq
<< dendl
;
3417 if (mds
->is_resolve()) {
3418 MDPeerUpdate
*su
= get_uncommitted_peer(metareq
, from
);
3421 // perform rollback (and journal a rollback entry)
3422 // note: this will hold up the resolve a bit, until the rollback entries journal.
3423 MDRequestRef null_ref
;
3424 switch (su
->origop
) {
3425 case EPeerUpdate::LINK
:
3426 mds
->server
->do_link_rollback(su
->rollback
, from
, null_ref
);
3428 case EPeerUpdate::RENAME
:
3429 mds
->server
->do_rename_rollback(su
->rollback
, from
, null_ref
);
3431 case EPeerUpdate::RMDIR
:
3432 mds
->server
->do_rmdir_rollback(su
->rollback
, from
, null_ref
);
3438 MDRequestRef mdr
= request_get(metareq
);
3439 mdr
->aborted
= true;
3440 if (mdr
->peer_request
) {
3441 if (mdr
->peer_did_prepare()) // journaling peer prepare ?
3442 add_rollback(metareq
, from
);
3444 request_finish(mdr
);
3449 if (!ambiguous_peer_updates
.count(from
)) {
3450 resolve_ack_gather
.erase(from
);
3451 maybe_finish_peer_resolve();
// Record an uncommitted peer update: insert it into uncommitted_peers (must
// not already exist), link it into the log segment, and -- when rollback state
// 'su' is supplied -- bump refcounts for the olddirs/unlinked inodes it pins.
3455 void MDCache::add_uncommitted_peer(metareqid_t reqid
, LogSegment
*ls
, mds_rank_t leader
, MDPeerUpdate
*su
)
3457 auto const &ret
= uncommitted_peers
.emplace(std::piecewise_construct
,
3458 std::forward_as_tuple(reqid
),
3459 std::forward_as_tuple());
3460 ceph_assert(ret
.second
);
3461 ls
->uncommitted_peers
.insert(reqid
);
3462 upeer
&u
= ret
.first
->second
;
3466 if (su
== nullptr) {
// Pin the rename-source dirs and unlinked inodes referenced by this update.
3469 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
)
3470 uncommitted_peer_rename_olddir
[*p
]++;
3471 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
)
3472 uncommitted_peer_unlink
[*p
]++;
// Retire an uncommitted peer update: wake its waiters, unlink it from its log
// segment, and release the olddir/unlinked refcounts taken by
// add_uncommitted_peer(), trimming now-orphaned non-auth subtrees and inodes
// once the counts drop to zero.
3475 void MDCache::finish_uncommitted_peer(metareqid_t reqid
, bool assert_exist
)
3477 auto it
= uncommitted_peers
.find(reqid
);
3478 if (it
== uncommitted_peers
.end()) {
3479 ceph_assert(!assert_exist
);
3482 upeer
&u
= it
->second
;
3483 MDPeerUpdate
* su
= u
.su
;
3485 if (!u
.waiters
.empty()) {
3486 mds
->queue_waiters(u
.waiters
);
3488 u
.ls
->uncommitted_peers
.erase(reqid
);
3489 uncommitted_peers
.erase(it
);
3491 if (su
== nullptr) {
3494 // discard the non-auth subtree we renamed out of
3495 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
) {
3497 map
<CInode
*, int>::iterator it
= uncommitted_peer_rename_olddir
.find(diri
);
3498 ceph_assert(it
!= uncommitted_peer_rename_olddir
.end());
3500 if (it
->second
== 0) {
3501 uncommitted_peer_rename_olddir
.erase(it
);
3502 auto&& ls
= diri
->get_dirfrags();
3503 for (const auto& dir
: ls
) {
3504 CDir
*root
= get_subtree_root(dir
);
3505 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
3506 try_trim_non_auth_subtree(root
);
3512 ceph_assert(it
->second
> 0);
3514 // remove the inodes that were unlinked by peer update
3515 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
) {
3517 map
<CInode
*, int>::iterator it
= uncommitted_peer_unlink
.find(in
);
3518 ceph_assert(it
!= uncommitted_peer_unlink
.end());
3520 if (it
->second
== 0) {
3521 uncommitted_peer_unlink
.erase(it
);
3522 if (!in
->get_projected_parent_dn())
3523 mds
->mdcache
->remove_inode_recursive(in
);
3525 ceph_assert(it
->second
> 0);
// Look up the MDPeerUpdate recorded for (reqid, leader); returns nullptr when
// none is recorded or the recorded leader does not match.
3530 MDPeerUpdate
* MDCache::get_uncommitted_peer(metareqid_t reqid
, mds_rank_t leader
)
3533 MDPeerUpdate
* su
= nullptr;
3534 auto it
= uncommitted_peers
.find(reqid
);
3535 if (it
!= uncommitted_peers
.end() &&
3536 it
->second
.leader
== leader
) {
// Complete a journaled rollback for 'reqid': finish the corresponding
// uncommitted peer record (strictness depends on whether we are resolving or
// the peer update was journaled), then retire the rollback and maybe finish
// the peer resolve.
3542 void MDCache::finish_rollback(metareqid_t reqid
, MDRequestRef
& mdr
) {
3543 auto p
= resolve_need_rollback
.find(reqid
);
3544 ceph_assert(p
!= resolve_need_rollback
.end());
3545 if (mds
->is_resolve()) {
3546 finish_uncommitted_peer(reqid
, false);
3548 finish_uncommitted_peer(mdr
->reqid
, mdr
->more()->peer_update_journaled
);
3550 resolve_need_rollback
.erase(p
);
3551 maybe_finish_peer_resolve();
// Settle ambiguous imports claimed by OTHER ranks: for each noted import whose
// dirfrag's auth is still ambiguous/undefined, accept the claimant as auth and
// try a subtree merge; otherwise the import did not happen. Clears
// other_ambiguous_imports when done.
3554 void MDCache::disambiguate_other_imports()
3556 dout(10) << "disambiguate_other_imports" << dendl
;
3558 bool recovering
= !(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3559 // other nodes' ambiguous imports
3560 for (map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > >::iterator p
= other_ambiguous_imports
.begin();
3561 p
!= other_ambiguous_imports
.end();
3563 mds_rank_t who
= p
->first
;
3564 dout(10) << "ambiguous imports for mds." << who
<< dendl
;
3566 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= p
->second
.begin();
3567 q
!= p
->second
.end();
3569 dout(10) << " ambiguous import " << q
->first
<< " bounds " << q
->second
<< dendl
;
3570 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3571 CDir
*dir
= get_force_dirfrag(q
->first
, recovering
);
3574 if (dir
->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3575 dir
->authority() == CDIR_AUTH_UNDEF
) { // resolving
3576 dout(10) << " mds." << who
<< " did import " << *dir
<< dendl
;
3577 adjust_bounded_subtree_auth(dir
, q
->second
, who
);
3578 try_subtree_merge(dir
);
3580 dout(10) << " mds." << who
<< " did not import " << *dir
<< dendl
;
3584 other_ambiguous_imports
.clear();
// Settle MY ambiguous imports (only meaningful while resolving): an import
// whose auth is already claimed by someone else is cancelled (journaling a
// failed EImportFinish, trimming any swallowed non-auth subtree); an unclaimed
// one is finished as ours. Afterwards every subtree must have unambiguous auth.
3587 void MDCache::disambiguate_my_imports()
3589 dout(10) << "disambiguate_my_imports" << dendl
;
3591 if (!mds
->is_resolve()) {
3592 ceph_assert(my_ambiguous_imports
.empty());
3596 disambiguate_other_imports();
3598 // my ambiguous imports
3599 mds_authority_t
me_ambig(mds
->get_nodeid(), mds
->get_nodeid());
3600 while (!my_ambiguous_imports
.empty()) {
3601 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= my_ambiguous_imports
.begin();
3603 CDir
*dir
= get_dirfrag(q
->first
);
3606 if (dir
->authority() != me_ambig
) {
3607 dout(10) << "ambiguous import auth known, must not be me " << *dir
<< dendl
;
3608 cancel_ambiguous_import(dir
);
3610 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false));
3612 // subtree may have been swallowed by another node claiming dir
3614 CDir
*root
= get_subtree_root(dir
);
3616 dout(10) << " subtree root is " << *root
<< dendl
;
3617 ceph_assert(root
->dir_auth
.first
!= mds
->get_nodeid()); // no us!
3618 try_trim_non_auth_subtree(root
);
3620 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir
<< dendl
;
3621 finish_ambiguous_import(q
->first
);
3622 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3625 ceph_assert(my_ambiguous_imports
.empty());
3626 mds
->mdlog
->flush();
3628 // verify all my subtrees are unambiguous!
3629 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3630 p
!= subtrees
.end();
3632 CDir
*dir
= p
->first
;
3633 if (dir
->is_ambiguous_dir_auth()) {
3634 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir
<< dendl
;
3636 ceph_assert(!dir
->is_ambiguous_dir_auth());
// Record an ambiguous import of subtree 'base' with the given bound dirfrags;
// must not already be recorded.
3643 void MDCache::add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
)
3645 ceph_assert(my_ambiguous_imports
.count(base
) == 0);
3646 my_ambiguous_imports
[base
] = bounds
;
// Convenience overload: convert the CDir* bound set to dirfrag_t values and
// record via the dirfrag_t overload, replacing any previous entry.
3650 void MDCache::add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
)
3653 vector
<dirfrag_t
> binos
;
3654 for (set
<CDir
*>::iterator p
= bounds
.begin();
3657 binos
.push_back((*p
)->dirfrag());
3659 // note: this can get called twice if the exporter fails during recovery
3660 if (my_ambiguous_imports
.count(base
->dirfrag()))
3661 my_ambiguous_imports
.erase(base
->dirfrag());
3663 add_ambiguous_import(base
->dirfrag(), binos
);
// Forget a recorded ambiguous import (the import did not succeed); entry must
// exist.
3666 void MDCache::cancel_ambiguous_import(CDir
*dir
)
3668 dirfrag_t df
= dir
->dirfrag();
3669 ceph_assert(my_ambiguous_imports
.count(df
));
3670 dout(10) << "cancel_ambiguous_import " << df
3671 << " bounds " << my_ambiguous_imports
[df
]
3674 my_ambiguous_imports
.erase(df
);
// Commit a recorded ambiguous import as ours: take its bounds out of the map,
// then set bounded subtree auth to this node and try a subtree merge.
3677 void MDCache::finish_ambiguous_import(dirfrag_t df
)
3679 ceph_assert(my_ambiguous_imports
.count(df
));
3680 vector
<dirfrag_t
> bounds
;
3681 bounds
.swap(my_ambiguous_imports
[df
]);
3682 my_ambiguous_imports
.erase(df
);
3684 dout(10) << "finish_ambiguous_import " << df
3685 << " bounds " << bounds
3687 CDir
*dir
= get_dirfrag(df
);
3690 // adjust dir_auth, import maps
3691 adjust_bounded_subtree_auth(dir
, bounds
, mds
->get_nodeid());
3692 try_subtree_merge(dir
);
// Tear down an inode and everything beneath it: for each dirfrag, unlink and
// recursively remove primary children, drop dentries, remove the subtree
// record if the dirfrag was a subtree root, then close the dirfrag.
3695 void MDCache::remove_inode_recursive(CInode
*in
)
3697 dout(10) << "remove_inode_recursive " << *in
<< dendl
;
3698 auto&& ls
= in
->get_dirfrags();
3699 for (const auto& subdir
: ls
) {
3700 dout(10) << " removing dirfrag " << *subdir
<< dendl
;
3701 auto it
= subdir
->items
.begin();
3702 while (it
!= subdir
->items
.end()) {
3703 CDentry
*dn
= it
->second
;
3705 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3706 if (dnl
->is_primary()) {
3707 CInode
*tin
= dnl
->get_inode();
3708 subdir
->unlink_inode(dn
, false);
3709 remove_inode_recursive(tin
);
3711 subdir
->remove_dentry(dn
);
3714 if (subdir
->is_subtree_root())
3715 remove_subtree(subdir
);
3716 in
->close_dirfrag(subdir
->dirfrag().frag
);
// Try to expire a non-auth inode and its descendants into 'expiremap'.
// Returns true (abort) when something beneath cannot be expired -- a nested
// subtree root, a linked remote stray, or a non-expireable dentry.
3721 bool MDCache::expire_recursive(CInode
*in
, expiremap
&expiremap
)
3723 ceph_assert(!in
->is_auth());
3725 dout(10) << __func__
<< ":" << *in
<< dendl
;
3727 // Recurse into any dirfrags beneath this inode
3728 auto&& ls
= in
->get_dirfrags();
3729 for (const auto& subdir
: ls
) {
3730 if (!in
->is_mdsdir() && subdir
->is_subtree_root()) {
3731 dout(10) << __func__
<< ": stray still has subtree " << *in
<< dendl
;
3735 for (auto &it
: subdir
->items
) {
3736 CDentry
*dn
= it
.second
;
3737 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3738 if (dnl
->is_primary()) {
3739 CInode
*tin
= dnl
->get_inode();
3741 /* Remote strays with linkage (i.e. hardlinks) should not be
3742 * expired, because they may be the target of
3743 * a rename() as the owning MDS shuts down */
3744 if (!tin
->is_stray() && tin
->get_inode()->nlink
) {
3745 dout(10) << __func__
<< ": stray still has linkage " << *tin
<< dendl
;
3749 const bool abort
= expire_recursive(tin
, expiremap
);
3754 if (dn
->lru_is_expireable()) {
3755 trim_dentry(dn
, expiremap
);
3757 dout(10) << __func__
<< ": stray dn is not expireable " << *dn
<< dendl
;
// After resolve, drop every non-base inode that has no parent dentry (it was
// unlinked while we were down), removing each recursively; heartbeat_reset()
// is called every 1000 iterations to avoid tripping the MDS heartbeat.
3766 void MDCache::trim_unlinked_inodes()
3768 dout(7) << "trim_unlinked_inodes" << dendl
;
3771 for (auto &p
: inode_map
) {
3772 CInode
*in
= p
.second
;
3773 if (in
->get_parent_dn() == NULL
&& !in
->is_base()) {
3774 dout(7) << " will trim from " << *in
<< dendl
;
3778 if (!(++count
% 1000))
3779 mds
->heartbeat_reset();
3781 for (auto& in
: q
) {
3782 remove_inode_recursive(in
);
3784 if (!(++count
% 1000))
3785 mds
->heartbeat_reset();
3789 /** recalc_auth_bits()
3790 * once subtree auth is disambiguated, we need to adjust all the
3791 * auth and dirty bits in our cache before moving on.
3793 void MDCache::recalc_auth_bits(bool replay
)
3795 dout(7) << "recalc_auth_bits " << (replay
? "(replay)" : "") << dendl
;
// Fix up the root inode's auth first.
3798 root
->inode_auth
.first
= mds
->mdsmap
->get_root();
3799 bool auth
= mds
->get_nodeid() == root
->inode_auth
.first
;
3801 root
->state_set(CInode::STATE_AUTH
);
3803 root
->state_clear(CInode::STATE_AUTH
);
3805 root
->state_set(CInode::STATE_REJOINING
);
// Collect the inodes of my own subtree roots so their scatterlocks are
// left alone below.
3809 set
<CInode
*> subtree_inodes
;
3810 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3811 p
!= subtrees
.end();
3813 if (p
->first
->dir_auth
.first
== mds
->get_nodeid())
3814 subtree_inodes
.insert(p
->first
->inode
);
// Walk every subtree, fixing mdsdir inode auth and then every dirfrag,
// dentry and inode beneath via a breadth-first dirfrag queue.
3817 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3818 p
!= subtrees
.end();
3820 if (p
->first
->inode
->is_mdsdir()) {
3821 CInode
*in
= p
->first
->inode
;
3822 bool auth
= in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid());
3824 in
->state_set(CInode::STATE_AUTH
);
3826 in
->state_clear(CInode::STATE_AUTH
);
3828 in
->state_set(CInode::STATE_REJOINING
);
3832 std::queue
<CDir
*> dfq
; // dirfrag queue
3835 bool auth
= p
->first
->authority().first
== mds
->get_nodeid();
3836 dout(10) << " subtree auth=" << auth
<< " for " << *p
->first
<< dendl
;
3838 while (!dfq
.empty()) {
3839 CDir
*dir
= dfq
.front();
3844 dir
->state_set(CDir::STATE_AUTH
);
3846 dir
->state_clear(CDir::STATE_AUTH
);
3848 // close empty non-auth dirfrag
3849 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0) {
3850 dir
->inode
->close_dirfrag(dir
->get_frag());
3853 dir
->state_set(CDir::STATE_REJOINING
);
3854 dir
->state_clear(CDir::STATE_COMPLETE
);
3855 if (dir
->is_dirty())
3860 // dentries in this dir
3861 for (auto &p
: dir
->items
) {
3863 CDentry
*dn
= p
.second
;
3864 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3866 dn
->state_set(CDentry::STATE_AUTH
);
3868 dn
->state_clear(CDentry::STATE_AUTH
);
3870 dn
->state_set(CDentry::STATE_REJOINING
);
3876 if (dnl
->is_primary()) {
3878 CInode
*in
= dnl
->get_inode();
3880 in
->state_set(CInode::STATE_AUTH
);
3882 in
->state_clear(CInode::STATE_AUTH
);
3884 in
->state_set(CInode::STATE_REJOINING
);
3887 if (in
->is_dirty_parent())
3888 in
->clear_dirty_parent();
3889 // avoid touching scatterlocks for our subtree roots!
3890 if (subtree_inodes
.count(in
) == 0)
3891 in
->clear_scatter_dirty();
// Nested dirfrags continue the breadth-first walk.
3896 auto&& dfv
= in
->get_nested_dirfrags();
3897 for (const auto& dir
: dfv
) {
3912 // ===========================================================================
3916 * notes on scatterlock recovery:
3918 * - recovering inode replica sends scatterlock data for any subtree
3919 * roots (the only ones that are possibly dirty).
3921 * - surviving auth incorporates any provided scatterlock data. any
3922 * pending gathers are then finished, as with the other lock types.
3924 * that takes care of surviving auth + (recovering replica)*.
3926 * - surviving replica sends strong_inode, which includes current
3927 * scatterlock state, AND any dirty scatterlock data. this
3928 * provides the recovering auth with everything it might need.
3930 * - recovering auth must pick initial scatterlock state based on
3931 * (weak|strong) rejoins.
3932 * - always assimilate scatterlock data (it can't hurt)
3933 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3934 * - include base inode in ack for all inodes that saw scatterlock content
3936 * also, for scatter gather,
3938 * - auth increments {frag,r}stat.version on completion of any gather.
3940 * - auth incorporates changes in a gather _only_ if the version
3943 * - replica discards changes any time the scatterlock syncs, and
// Emit rejoin progress (gather sets and count of inodes being opened for cap
// imports) to the given Formatter.
3947 void MDCache::dump_rejoin_status(Formatter
*f
) const
3949 f
->open_object_section("rejoin_status");
3950 f
->dump_stream("rejoin_gather") << rejoin_gather
;
3951 f
->dump_stream("rejoin_ack_gather") << rejoin_ack_gather
;
3952 f
->dump_unsigned("num_opening_inodes", cap_imports_num_opening
);
// Begin the rejoin stage: stash the completion context, seed rejoin_gather
// with the recovery set plus ourselves (cap-inode opening must finish before
// we can send cache rejoins), and kick imported-cap processing.
3956 void MDCache::rejoin_start(MDSContext
*rejoin_done_
)
3958 dout(10) << "rejoin_start" << dendl
;
3959 ceph_assert(!rejoin_done
);
3960 rejoin_done
.reset(rejoin_done_
);
3962 rejoin_gather
= recovery_set
;
3963 // need finish opening cap inodes before sending cache rejoins
3964 rejoin_gather
.insert(mds
->get_nodeid());
3965 process_imported_caps();
3971 * this initiates rejoin. it should be called before we get any
3972 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3974 * we start out by sending rejoins to everyone in the recovery set.
3976 * if we are rejoin, send for all regions in our cache.
3977 * if we are active|stopping, send only to nodes that are rejoining.
3979 void MDCache::rejoin_send_rejoins()
3981 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set
<< dendl
;
// Preconditions: imported caps processed and all resolves received;
// otherwise mark rejoins pending and bail.
3983 if (rejoin_gather
.count(mds
->get_nodeid())) {
3984 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl
;
3985 rejoins_pending
= true;
3988 if (!resolve_gather
.empty()) {
3989 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3990 << resolve_gather
<< ")" << dendl
;
3991 rejoins_pending
= true;
3995 ceph_assert(!migrator
->is_importing());
3996 ceph_assert(!migrator
->is_exporting());
3998 if (!mds
->is_rejoin()) {
3999 disambiguate_other_imports();
4002 map
<mds_rank_t
, ref_t
<MMDSCacheRejoin
>> rejoins
;
4005 // if i am rejoining, send a rejoin to everyone.
4006 // otherwise, just send to others who are rejoining.
4007 for (const auto& rank
: recovery_set
) {
4008 if (rank
== mds
->get_nodeid()) continue; // nothing to myself!
4009 if (rejoin_sent
.count(rank
)) continue; // already sent a rejoin to this node!
4010 if (mds
->is_rejoin())
4011 rejoins
[rank
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_WEAK
);
4012 else if (mds
->mdsmap
->is_rejoin(rank
))
4013 rejoins
[rank
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_STRONG
);
// Weak rejoin: attach cap exports and the client map/metadata for the
// sessions that back them (reconnects with no session are dropped).
4016 if (mds
->is_rejoin()) {
4017 map
<client_t
, pair
<Session
*, set
<mds_rank_t
> > > client_exports
;
4018 for (auto& p
: cap_exports
) {
4019 mds_rank_t target
= p
.second
.first
;
4020 if (rejoins
.count(target
) == 0)
4022 for (auto q
= p
.second
.second
.begin(); q
!= p
.second
.second
.end(); ) {
4023 Session
*session
= nullptr;
4024 auto it
= client_exports
.find(q
->first
);
4025 if (it
!= client_exports
.end()) {
4026 session
= it
->second
.first
;
4028 it
->second
.second
.insert(target
);
4030 session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
4031 auto& r
= client_exports
[q
->first
];
4034 r
.second
.insert(target
);
4039 // remove reconnect with no session
4040 p
.second
.second
.erase(q
++);
4043 rejoins
[target
]->cap_exports
[p
.first
] = p
.second
.second
;
4045 for (auto& p
: client_exports
) {
4046 Session
*session
= p
.second
.first
;
4047 for (auto& q
: p
.second
.second
) {
4048 auto rejoin
= rejoins
[q
];
4049 rejoin
->client_map
[p
.first
] = session
->info
.inst
;
4050 rejoin
->client_metadata_map
[p
.first
] = session
->info
.client_metadata
;
4056 // check all subtrees
4057 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
4058 p
!= subtrees
.end();
4060 CDir
*dir
= p
->first
;
4061 ceph_assert(dir
->is_subtree_root());
4062 if (dir
->is_ambiguous_dir_auth()) {
4063 // exporter is recovering, importer is survivor.
4064 ceph_assert(rejoins
.count(dir
->authority().first
));
4065 ceph_assert(!rejoins
.count(dir
->authority().second
));
4071 continue; // skip my own regions!
4073 mds_rank_t auth
= dir
->get_dir_auth().first
;
4074 ceph_assert(auth
>= 0);
4075 if (rejoins
.count(auth
) == 0)
4076 continue; // don't care about this node's subtrees
4078 rejoin_walk(dir
, rejoins
[auth
]);
4081 // rejoin root inodes, too
4082 for (auto &p
: rejoins
) {
4083 if (mds
->is_rejoin()) {
// Weak: declare root and the target's mdsdir as weak inodes.
4085 if (p
.first
== 0 && root
) {
4086 p
.second
->add_weak_inode(root
->vino());
4087 if (root
->is_dirty_scattered()) {
4088 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4089 p
.second
->add_scatterlock_state(root
);
4092 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4094 p
.second
->add_weak_inode(in
->vino());
// Strong: declare them with replica nonce, wanted caps and lock states.
4098 if (p
.first
== 0 && root
) {
4099 p
.second
->add_strong_inode(root
->vino(),
4100 root
->get_replica_nonce(),
4101 root
->get_caps_wanted(),
4102 root
->filelock
.get_state(),
4103 root
->nestlock
.get_state(),
4104 root
->dirfragtreelock
.get_state());
4105 root
->state_set(CInode::STATE_REJOINING
);
4106 if (root
->is_dirty_scattered()) {
4107 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4108 p
.second
->add_scatterlock_state(root
);
4112 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4113 p
.second
->add_strong_inode(in
->vino(),
4114 in
->get_replica_nonce(),
4115 in
->get_caps_wanted(),
4116 in
->filelock
.get_state(),
4117 in
->nestlock
.get_state(),
4118 in
->dirfragtreelock
.get_state());
4119 in
->state_set(CInode::STATE_REJOINING
);
4124 if (!mds
->is_rejoin()) {
4125 // i am survivor. send strong rejoin.
4126 // note request remote_auth_pins, xlocks
4127 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
4128 p
!= active_requests
.end();
4130 MDRequestRef
& mdr
= p
->second
;
4134 for (const auto& q
: mdr
->object_states
) {
4135 if (q
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
4137 if (!q
.first
->is_auth()) {
4138 mds_rank_t target
= q
.second
.remote_auth_pinned
;
4139 ceph_assert(target
== q
.first
->authority().first
);
4140 if (rejoins
.count(target
) == 0) continue;
4141 const auto& rejoin
= rejoins
[target
];
4143 dout(15) << " " << *mdr
<< " authpin on " << *q
.first
<< dendl
;
4144 MDSCacheObjectInfo i
;
4145 q
.first
->set_object_info(i
);
4147 rejoin
->add_inode_authpin(vinodeno_t(i
.ino
, i
.snapid
), mdr
->reqid
, mdr
->attempt
);
4149 rejoin
->add_dentry_authpin(i
.dirfrag
, i
.dname
, i
.snapid
, mdr
->reqid
, mdr
->attempt
);
4151 if (mdr
->has_more() && mdr
->more()->is_remote_frozen_authpin
&&
4152 mdr
->more()->rename_inode
== q
.first
)
4153 rejoin
->add_inode_frozen_authpin(vinodeno_t(i
.ino
, i
.snapid
),
4154 mdr
->reqid
, mdr
->attempt
);
// Re-declare remote xlocks and remote wrlocks held by active requests.
4158 for (const auto& q
: mdr
->locks
) {
4160 auto obj
= lock
->get_parent();
4161 if (q
.is_xlock() && !obj
->is_auth()) {
4162 mds_rank_t who
= obj
->authority().first
;
4163 if (rejoins
.count(who
) == 0) continue;
4164 const auto& rejoin
= rejoins
[who
];
4166 dout(15) << " " << *mdr
<< " xlock on " << *lock
<< " " << *obj
<< dendl
;
4167 MDSCacheObjectInfo i
;
4168 obj
->set_object_info(i
);
4170 rejoin
->add_inode_xlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4171 mdr
->reqid
, mdr
->attempt
);
4173 rejoin
->add_dentry_xlock(i
.dirfrag
, i
.dname
, i
.snapid
,
4174 mdr
->reqid
, mdr
->attempt
);
4175 } else if (q
.is_remote_wrlock()) {
4176 mds_rank_t who
= q
.wrlock_target
;
4177 if (rejoins
.count(who
) == 0) continue;
4178 const auto& rejoin
= rejoins
[who
];
4180 dout(15) << " " << *mdr
<< " wrlock on " << *lock
<< " " << *obj
<< dendl
;
4181 MDSCacheObjectInfo i
;
4182 obj
->set_object_info(i
);
4184 rejoin
->add_inode_wrlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4185 mdr
->reqid
, mdr
->attempt
);
4191 // send the messages
4192 for (auto &p
: rejoins
) {
4193 ceph_assert(rejoin_sent
.count(p
.first
) == 0);
4194 ceph_assert(rejoin_ack_gather
.count(p
.first
) == 0);
4195 rejoin_sent
.insert(p
.first
);
4196 rejoin_ack_gather
.insert(p
.first
);
4197 mds
->send_message_mds(p
.second
, p
.first
);
4199 rejoin_ack_gather
.insert(mds
->get_nodeid()); // we need to complete rejoin_gather_finish, too
4200 rejoins_pending
= false;
4203 if (mds
->is_rejoin() && rejoin_gather
.empty()) {
4204 dout(10) << "nothing to rejoin" << dendl
;
4205 rejoin_gather_finish();
4211 * rejoin_walk - build rejoin declarations for a subtree
4213 * @param dir subtree root
4214 * @param rejoin rejoin message
4216 * from a rejoining node:
4218 * weak dentries (w/ connectivity)
4220 * from a surviving node:
4222 * strong dentries (no connectivity!)
4225 void MDCache::rejoin_walk(CDir
*dir
, const ref_t
<MMDSCacheRejoin
> &rejoin
)
4227 dout(10) << "rejoin_walk " << *dir
<< dendl
;
4229 std::vector
<CDir
*> nested
; // finish this dir, then do nested items
4231 if (mds
->is_rejoin()) {
// Weak declaration: only primary dentries survive recovery trimming here.
4233 rejoin
->add_weak_dirfrag(dir
->dirfrag());
4234 for (auto &p
: dir
->items
) {
4235 CDentry
*dn
= p
.second
;
4236 ceph_assert(dn
->last
== CEPH_NOSNAP
);
4237 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4238 dout(15) << " add_weak_primary_dentry " << *dn
<< dendl
;
4239 ceph_assert(dnl
->is_primary());
4240 CInode
*in
= dnl
->get_inode();
4241 ceph_assert(dnl
->get_inode()->is_dir());
4242 rejoin
->add_weak_primary_dentry(dir
->ino(), dn
->get_name(), dn
->first
, dn
->last
, in
->ino());
4244 auto&& dirs
= in
->get_nested_dirfrags();
4245 nested
.insert(std::end(nested
), std::begin(dirs
), std::end(dirs
));
4247 if (in
->is_dirty_scattered()) {
4248 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4249 rejoin
->add_scatterlock_state(in
);
// Strong declaration (survivor): dirfrag, every dentry with full lock
// state, and every primary inode.
4254 dout(15) << " add_strong_dirfrag " << *dir
<< dendl
;
4255 rejoin
->add_strong_dirfrag(dir
->dirfrag(), dir
->get_replica_nonce(), dir
->get_dir_rep());
4256 dir
->state_set(CDir::STATE_REJOINING
);
4258 for (auto it
= dir
->items
.begin(); it
!= dir
->items
.end(); ) {
4259 CDentry
*dn
= it
->second
;
4261 dn
->state_set(CDentry::STATE_REJOINING
);
4262 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4263 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
4265 // trim snap dentries. because they may have been pruned by
4266 // their auth mds (snap deleted)
4267 if (dn
->last
!= CEPH_NOSNAP
) {
4268 if (in
&& !in
->remote_parents
.empty()) {
4269 // unlink any stale remote snap dentry.
4270 for (auto it2
= in
->remote_parents
.begin(); it2
!= in
->remote_parents
.end(); ) {
4271 CDentry
*remote_dn
= *it2
;
4273 ceph_assert(remote_dn
->last
!= CEPH_NOSNAP
);
4274 remote_dn
->unlink_remote(remote_dn
->get_linkage());
4277 if (dn
->lru_is_expireable()) {
4278 if (!dnl
->is_null())
4279 dir
->unlink_inode(dn
, false);
4282 dir
->remove_dentry(dn
);
4285 // Inventing null/remote dentry shouldn't cause problem
4286 ceph_assert(!dnl
->is_primary());
4290 dout(15) << " add_strong_dentry " << *dn
<< dendl
;
4291 rejoin
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->get_alternate_name(),
4292 dn
->first
, dn
->last
,
4293 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
4294 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
4295 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
4296 dn
->get_replica_nonce(),
4297 dn
->lock
.get_state());
4298 dn
->state_set(CDentry::STATE_REJOINING
);
4299 if (dnl
->is_primary()) {
4300 CInode
*in
= dnl
->get_inode();
4301 dout(15) << " add_strong_inode " << *in
<< dendl
;
4302 rejoin
->add_strong_inode(in
->vino(),
4303 in
->get_replica_nonce(),
4304 in
->get_caps_wanted(),
4305 in
->filelock
.get_state(),
4306 in
->nestlock
.get_state(),
4307 in
->dirfragtreelock
.get_state());
4308 in
->state_set(CInode::STATE_REJOINING
);
4310 auto&& dirs
= in
->get_nested_dirfrags();
4311 nested
.insert(std::end(nested
), std::begin(dirs
), std::end(dirs
));
4313 if (in
->is_dirty_scattered()) {
4314 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4315 rejoin
->add_scatterlock_state(in
);
4321 // recurse into nested dirs
4322 for (const auto& dir
: nested
) {
4323 rejoin_walk(dir
, rejoin
);
4330 * - reply with the lockstate
4332 * if i am active|stopping,
4333 * - remove source from replica list for everything not referenced here.
4335 void MDCache::handle_cache_rejoin(const cref_t
<MMDSCacheRejoin
> &m
)
4337 dout(7) << "handle_cache_rejoin " << *m
<< " from " << m
->get_source()
4338 << " (" << m
->get_payload().length() << " bytes)"
4342 case MMDSCacheRejoin::OP_WEAK
:
4343 handle_cache_rejoin_weak(m
);
4345 case MMDSCacheRejoin::OP_STRONG
:
4346 handle_cache_rejoin_strong(m
);
4348 case MMDSCacheRejoin::OP_ACK
:
4349 handle_cache_rejoin_ack(m
);
4359 * handle_cache_rejoin_weak
4362 * - is recovering from their journal.
4363 * - may have incorrect (out of date) inode contents
4364 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4366 * if the sender didn't trim_non_auth(), they
4367 * - may have incorrect (out of date) dentry/inode linkage
4368 * - may have deleted/purged inodes
4369 * and i may have to go to disk to get accurate inode contents. yuck.
4371 void MDCache::handle_cache_rejoin_weak(const cref_t
<MMDSCacheRejoin
> &weak
)
4373 mds_rank_t from
= mds_rank_t(weak
->get_source().num());
4375 // possible response(s)
4376 ref_t
<MMDSCacheRejoin
> ack
; // if survivor
4377 set
<vinodeno_t
> acked_inodes
; // if survivor
4378 set
<SimpleLock
*> gather_locks
; // if survivor
4379 bool survivor
= false; // am i a survivor?
4381 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
4383 dout(10) << "i am a surivivor, and will ack immediately" << dendl
;
4384 ack
= make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_ACK
);
4386 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
4388 // check cap exports
4389 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4390 CInode
*in
= get_inode(p
->first
);
4391 ceph_assert(!in
|| in
->is_auth());
4392 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4393 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< " on " << *in
<< dendl
;
4394 Capability
*cap
= rejoin_import_cap(in
, q
->first
, q
->second
, from
);
4395 Capability::Import
& im
= imported_caps
[p
->first
][q
->first
];
4397 im
.cap_id
= cap
->get_cap_id();
4398 im
.issue_seq
= cap
->get_last_seq();
4399 im
.mseq
= cap
->get_mseq();
4404 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
4407 encode(imported_caps
, ack
->imported_caps
);
4409 ceph_assert(mds
->is_rejoin());
4411 // we may have already received a strong rejoin from the sender.
4412 rejoin_scour_survivor_replicas(from
, NULL
, acked_inodes
, gather_locks
);
4413 ceph_assert(gather_locks
.empty());
4415 // check cap exports.
4416 rejoin_client_map
.insert(weak
->client_map
.begin(), weak
->client_map
.end());
4417 rejoin_client_metadata_map
.insert(weak
->client_metadata_map
.begin(),
4418 weak
->client_metadata_map
.end());
4420 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4421 CInode
*in
= get_inode(p
->first
);
4422 ceph_assert(!in
|| in
->is_auth());
4424 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4425 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< dendl
;
4426 cap_imports
[p
->first
][q
->first
][from
] = q
->second
;
4431 // assimilate any potentially dirty scatterlock state
4432 for (const auto &p
: weak
->inode_scatterlocks
) {
4433 CInode
*in
= get_inode(p
.first
);
4435 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4436 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4437 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4439 rejoin_potential_updated_scatterlocks
.insert(in
);
4442 // recovering peer may send incorrect dirfrags here. we need to
4443 // infer which dirfrag they meant. the ack will include a
4444 // strong_dirfrag that will set them straight on the fragmentation.
4447 set
<CDir
*> dirs_to_share
;
4448 for (const auto &p
: weak
->weak_dirfrags
) {
4449 CInode
*diri
= get_inode(p
.ino
);
4451 dout(0) << " missing dir ino " << p
.ino
<< dendl
;
4455 if (diri
->dirfragtree
.is_leaf(p
.frag
)) {
4456 leaves
.push_back(p
.frag
);
4458 diri
->dirfragtree
.get_leaves_under(p
.frag
, leaves
);
4460 leaves
.push_back(diri
->dirfragtree
[p
.frag
.value()]);
4462 for (const auto& leaf
: leaves
) {
4463 CDir
*dir
= diri
->get_dirfrag(leaf
);
4465 dout(0) << " missing dir for " << p
.frag
<< " (which maps to " << leaf
<< ") on " << *diri
<< dendl
;
4469 if (dirs_to_share
.count(dir
)) {
4470 dout(10) << " already have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4472 dirs_to_share
.insert(dir
);
4473 unsigned nonce
= dir
->add_replica(from
);
4474 dout(10) << " have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4476 ack
->add_strong_dirfrag(dir
->dirfrag(), nonce
, dir
->dir_rep
);
4477 ack
->add_dirfrag_base(dir
);
4483 for (const auto &p
: weak
->weak
) {
4484 CInode
*diri
= get_inode(p
.first
);
4486 dout(0) << " missing dir ino " << p
.first
<< dendl
;
4491 for (const auto &q
: p
.second
) {
4492 // locate proper dirfrag.
4493 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4494 frag_t fg
= diri
->pick_dirfrag(q
.first
.name
);
4495 if (!dir
|| dir
->get_frag() != fg
) {
4496 dir
= diri
->get_dirfrag(fg
);
4498 dout(0) << " missing dir frag " << fg
<< " on " << *diri
<< dendl
;
4500 ceph_assert(dirs_to_share
.count(dir
));
4504 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
4506 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4507 ceph_assert(dnl
->is_primary());
4509 if (survivor
&& dn
->is_replica(from
))
4510 dentry_remove_replica(dn
, from
, gather_locks
);
4511 unsigned dnonce
= dn
->add_replica(from
);
4512 dout(10) << " have " << *dn
<< dendl
;
4514 ack
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->get_alternate_name(),
4515 dn
->first
, dn
->last
,
4516 dnl
->get_inode()->ino(), inodeno_t(0), 0,
4517 dnonce
, dn
->lock
.get_replica_state());
4520 CInode
*in
= dnl
->get_inode();
4523 if (survivor
&& in
->is_replica(from
))
4524 inode_remove_replica(in
, from
, true, gather_locks
);
4525 unsigned inonce
= in
->add_replica(from
);
4526 dout(10) << " have " << *in
<< dendl
;
4528 // scatter the dirlock, just in case?
4529 if (!survivor
&& in
->is_dir() && in
->has_subtree_root_dirfrag())
4530 in
->filelock
.set_state(LOCK_MIX
);
4533 acked_inodes
.insert(in
->vino());
4534 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4536 in
->_encode_locks_state_for_rejoin(bl
, from
);
4537 ack
->add_inode_locks(in
, inonce
, bl
);
4542 // weak base inodes? (root, stray, etc.)
4543 for (set
<vinodeno_t
>::iterator p
= weak
->weak_inodes
.begin();
4544 p
!= weak
->weak_inodes
.end();
4546 CInode
*in
= get_inode(*p
);
4547 ceph_assert(in
); // hmm fixme wrt stray?
4548 if (survivor
&& in
->is_replica(from
))
4549 inode_remove_replica(in
, from
, true, gather_locks
);
4550 unsigned inonce
= in
->add_replica(from
);
4551 dout(10) << " have base " << *in
<< dendl
;
4554 acked_inodes
.insert(in
->vino());
4555 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4557 in
->_encode_locks_state_for_rejoin(bl
, from
);
4558 ack
->add_inode_locks(in
, inonce
, bl
);
4562 ceph_assert(rejoin_gather
.count(from
));
4563 rejoin_gather
.erase(from
);
4565 // survivor. do everything now.
4566 for (const auto &p
: weak
->inode_scatterlocks
) {
4567 CInode
*in
= get_inode(p
.first
);
4569 dout(10) << " including base inode (due to potential scatterlock update) " << *in
<< dendl
;
4570 acked_inodes
.insert(in
->vino());
4571 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4574 rejoin_scour_survivor_replicas(from
, ack
, acked_inodes
, gather_locks
);
4575 mds
->send_message(ack
, weak
->get_connection());
4577 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
4578 if (!(*p
)->is_stable())
4579 mds
->locker
->eval_gather(*p
);
4583 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4584 rejoin_gather_finish();
4586 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4592 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4594 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4595 * ack, the replica dne, and we can remove it from our replica maps.
4597 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from
, const cref_t
<MMDSCacheRejoin
> &ack
,
4598 set
<vinodeno_t
>& acked_inodes
,
4599 set
<SimpleLock
*>& gather_locks
)
4601 dout(10) << "rejoin_scour_survivor_replicas from mds." << from
<< dendl
;
4603 auto scour_func
= [this, from
, ack
, &acked_inodes
, &gather_locks
] (CInode
*in
) {
4605 if (in
->is_auth() &&
4606 in
->is_replica(from
) &&
4607 (ack
== NULL
|| acked_inodes
.count(in
->vino()) == 0)) {
4608 inode_remove_replica(in
, from
, false, gather_locks
);
4609 dout(10) << " rem " << *in
<< dendl
;
4615 const auto&& dfs
= in
->get_dirfrags();
4616 for (const auto& dir
: dfs
) {
4617 if (!dir
->is_auth())
4620 if (dir
->is_replica(from
) &&
4621 (ack
== NULL
|| ack
->strong_dirfrags
.count(dir
->dirfrag()) == 0)) {
4622 dir
->remove_replica(from
);
4623 dout(10) << " rem " << *dir
<< dendl
;
4627 for (auto &p
: dir
->items
) {
4628 CDentry
*dn
= p
.second
;
4630 if (dn
->is_replica(from
)) {
4632 const auto it
= ack
->strong_dentries
.find(dir
->dirfrag());
4633 if (it
!= ack
->strong_dentries
.end() && it
->second
.count(string_snap_t(dn
->get_name(), dn
->last
)) > 0) {
4637 dentry_remove_replica(dn
, from
, gather_locks
);
4638 dout(10) << " rem " << *dn
<< dendl
;
4644 for (auto &p
: inode_map
)
4645 scour_func(p
.second
);
4646 for (auto &p
: snap_inode_map
)
4647 scour_func(p
.second
);
4651 CInode
*MDCache::rejoin_invent_inode(inodeno_t ino
, snapid_t last
)
4653 CInode
*in
= new CInode(this, true, 2, last
);
4654 in
->_get_inode()->ino
= ino
;
4655 in
->state_set(CInode::STATE_REJOINUNDEF
);
4657 rejoin_undef_inodes
.insert(in
);
4658 dout(10) << " invented " << *in
<< dendl
;
4662 CDir
*MDCache::rejoin_invent_dirfrag(dirfrag_t df
)
4664 CInode
*in
= get_inode(df
.ino
);
4666 in
= rejoin_invent_inode(df
.ino
, CEPH_NOSNAP
);
4667 if (!in
->is_dir()) {
4668 ceph_assert(in
->state_test(CInode::STATE_REJOINUNDEF
));
4669 in
->_get_inode()->mode
= S_IFDIR
;
4670 in
->_get_inode()->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4672 CDir
*dir
= in
->get_or_open_dirfrag(this, df
.frag
);
4673 dir
->state_set(CDir::STATE_REJOINUNDEF
);
4674 rejoin_undef_dirfrags
.insert(dir
);
4675 dout(10) << " invented " << *dir
<< dendl
;
4679 void MDCache::handle_cache_rejoin_strong(const cref_t
<MMDSCacheRejoin
> &strong
)
4681 mds_rank_t from
= mds_rank_t(strong
->get_source().num());
4683 // only a recovering node will get a strong rejoin.
4684 if (!mds
->is_rejoin()) {
4685 if (mds
->get_want_state() == MDSMap::STATE_REJOIN
) {
4686 mds
->wait_for_rejoin(new C_MDS_RetryMessage(mds
, strong
));
4689 ceph_abort_msg("got unexpected rejoin message during recovery");
4692 // assimilate any potentially dirty scatterlock state
4693 for (const auto &p
: strong
->inode_scatterlocks
) {
4694 CInode
*in
= get_inode(p
.first
);
4696 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4697 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4698 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4699 rejoin_potential_updated_scatterlocks
.insert(in
);
4702 rejoin_unlinked_inodes
[from
].clear();
4704 // surviving peer may send incorrect dirfrag here (maybe they didn't
4705 // get the fragment notify, or maybe we rolled back?). we need to
4706 // infer the right frag and get them with the program. somehow.
4707 // we don't normally send ACK.. so we'll need to bundle this with
4708 // MISSING or something.
4710 // strong dirfrags/dentries.
4711 // also process auth_pins, xlocks.
4712 for (const auto &p
: strong
->strong_dirfrags
) {
4713 auto& dirfrag
= p
.first
;
4714 CInode
*diri
= get_inode(dirfrag
.ino
);
4716 diri
= rejoin_invent_inode(dirfrag
.ino
, CEPH_NOSNAP
);
4717 CDir
*dir
= diri
->get_dirfrag(dirfrag
.frag
);
4718 bool refragged
= false;
4720 dout(10) << " have " << *dir
<< dendl
;
4722 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
4723 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), frag_t()));
4724 else if (diri
->dirfragtree
.is_leaf(dirfrag
.frag
))
4725 dir
= rejoin_invent_dirfrag(dirfrag
);
4728 dir
->add_replica(from
, p
.second
.nonce
);
4729 dir
->dir_rep
= p
.second
.dir_rep
;
4731 dout(10) << " frag " << dirfrag
<< " doesn't match dirfragtree " << *diri
<< dendl
;
4733 diri
->dirfragtree
.get_leaves_under(dirfrag
.frag
, leaves
);
4735 leaves
.push_back(diri
->dirfragtree
[dirfrag
.frag
.value()]);
4736 dout(10) << " maps to frag(s) " << leaves
<< dendl
;
4737 for (const auto& leaf
: leaves
) {
4738 CDir
*dir
= diri
->get_dirfrag(leaf
);
4740 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), leaf
));
4742 dout(10) << " have(approx) " << *dir
<< dendl
;
4743 dir
->add_replica(from
, p
.second
.nonce
);
4744 dir
->dir_rep
= p
.second
.dir_rep
;
4749 const auto it
= strong
->strong_dentries
.find(dirfrag
);
4750 if (it
!= strong
->strong_dentries
.end()) {
4751 const auto& dmap
= it
->second
;
4752 for (const auto &q
: dmap
) {
4753 const string_snap_t
& ss
= q
.first
;
4754 const MMDSCacheRejoin::dn_strong
& d
= q
.second
;
4757 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4759 frag_t fg
= diri
->pick_dirfrag(ss
.name
);
4760 dir
= diri
->get_dirfrag(fg
);
4762 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4765 if (d
.is_remote()) {
4766 dn
= dir
->add_remote_dentry(ss
.name
, d
.remote_ino
, d
.remote_d_type
, mempool::mds_co::string(d
.alternate_name
), d
.first
, ss
.snapid
);
4767 } else if (d
.is_null()) {
4768 dn
= dir
->add_null_dentry(ss
.name
, d
.first
, ss
.snapid
);
4770 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4771 if (!in
) in
= rejoin_invent_inode(d
.ino
, ss
.snapid
);
4772 dn
= dir
->add_primary_dentry(ss
.name
, in
, mempool::mds_co::string(d
.alternate_name
), d
.first
, ss
.snapid
);
4774 dout(10) << " invented " << *dn
<< dendl
;
4776 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4779 const auto pinned_it
= strong
->authpinned_dentries
.find(dirfrag
);
4780 if (pinned_it
!= strong
->authpinned_dentries
.end()) {
4781 const auto peer_reqid_it
= pinned_it
->second
.find(ss
);
4782 if (peer_reqid_it
!= pinned_it
->second
.end()) {
4783 for (const auto &r
: peer_reqid_it
->second
) {
4784 dout(10) << " dn authpin by " << r
<< " on " << *dn
<< dendl
;
4786 // get/create peer mdrequest
4788 if (have_request(r
.reqid
))
4789 mdr
= request_get(r
.reqid
);
4791 mdr
= request_start_peer(r
.reqid
, r
.attempt
, strong
);
4798 const auto xlocked_it
= strong
->xlocked_dentries
.find(dirfrag
);
4799 if (xlocked_it
!= strong
->xlocked_dentries
.end()) {
4800 const auto ss_req_it
= xlocked_it
->second
.find(ss
);
4801 if (ss_req_it
!= xlocked_it
->second
.end()) {
4802 const MMDSCacheRejoin::peer_reqid
& r
= ss_req_it
->second
;
4803 dout(10) << " dn xlock by " << r
<< " on " << *dn
<< dendl
;
4804 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4805 ceph_assert(mdr
->is_auth_pinned(dn
));
4806 if (!mdr
->is_xlocked(&dn
->versionlock
)) {
4807 ceph_assert(dn
->versionlock
.can_xlock_local());
4808 dn
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4809 mdr
->emplace_lock(&dn
->versionlock
, MutationImpl::LockOp::XLOCK
);
4811 if (dn
->lock
.is_stable())
4812 dn
->auth_pin(&dn
->lock
);
4813 dn
->lock
.set_state(LOCK_XLOCK
);
4814 dn
->lock
.get_xlock(mdr
, mdr
->get_client());
4815 mdr
->emplace_lock(&dn
->lock
, MutationImpl::LockOp::XLOCK
);
4819 dn
->add_replica(from
, d
.nonce
);
4820 dout(10) << " have " << *dn
<< dendl
;
4822 if (dnl
->is_primary()) {
4823 if (d
.is_primary()) {
4824 if (vinodeno_t(d
.ino
, ss
.snapid
) != dnl
->get_inode()->vino()) {
4825 // the survivor missed MDentryUnlink+MDentryLink messages ?
4826 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4827 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4829 ceph_assert(in
->get_parent_dn());
4830 rejoin_unlinked_inodes
[from
].insert(in
);
4831 dout(7) << " sender has primary dentry but wrong inode" << dendl
;
4834 // the survivor missed MDentryLink message ?
4835 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4836 dout(7) << " sender doesn't have primay dentry" << dendl
;
4839 if (d
.is_primary()) {
4840 // the survivor missed MDentryUnlink message ?
4841 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4843 ceph_assert(in
->get_parent_dn());
4844 rejoin_unlinked_inodes
[from
].insert(in
);
4845 dout(7) << " sender has primary dentry but we don't" << dendl
;
4852 for (const auto &p
: strong
->strong_inodes
) {
4853 CInode
*in
= get_inode(p
.first
);
4855 in
->add_replica(from
, p
.second
.nonce
);
4856 dout(10) << " have " << *in
<< dendl
;
4858 const MMDSCacheRejoin::inode_strong
& is
= p
.second
;
4861 if (is
.caps_wanted
) {
4862 in
->set_mds_caps_wanted(from
, is
.caps_wanted
);
4863 dout(15) << " inode caps_wanted " << ccap_string(is
.caps_wanted
)
4864 << " on " << *in
<< dendl
;
4868 // infer state from replica state:
4869 // * go to MIX if they might have wrlocks
4870 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4871 in
->filelock
.infer_state_from_strong_rejoin(is
.filelock
, !in
->is_dir()); // maybe also go to LOCK
4872 in
->nestlock
.infer_state_from_strong_rejoin(is
.nestlock
, false);
4873 in
->dirfragtreelock
.infer_state_from_strong_rejoin(is
.dftlock
, false);
4876 const auto authpinned_inodes_it
= strong
->authpinned_inodes
.find(in
->vino());
4877 if (authpinned_inodes_it
!= strong
->authpinned_inodes
.end()) {
4878 for (const auto& r
: authpinned_inodes_it
->second
) {
4879 dout(10) << " inode authpin by " << r
<< " on " << *in
<< dendl
;
4881 // get/create peer mdrequest
4883 if (have_request(r
.reqid
))
4884 mdr
= request_get(r
.reqid
);
4886 mdr
= request_start_peer(r
.reqid
, r
.attempt
, strong
);
4887 if (strong
->frozen_authpin_inodes
.count(in
->vino())) {
4888 ceph_assert(!in
->get_num_auth_pins());
4889 mdr
->freeze_auth_pin(in
);
4891 ceph_assert(!in
->is_frozen_auth_pin());
4897 const auto xlocked_inodes_it
= strong
->xlocked_inodes
.find(in
->vino());
4898 if (xlocked_inodes_it
!= strong
->xlocked_inodes
.end()) {
4899 for (const auto &q
: xlocked_inodes_it
->second
) {
4900 SimpleLock
*lock
= in
->get_lock(q
.first
);
4901 dout(10) << " inode xlock by " << q
.second
<< " on " << *lock
<< " on " << *in
<< dendl
;
4902 MDRequestRef mdr
= request_get(q
.second
.reqid
); // should have this from auth_pin above.
4903 ceph_assert(mdr
->is_auth_pinned(in
));
4904 if (!mdr
->is_xlocked(&in
->versionlock
)) {
4905 ceph_assert(in
->versionlock
.can_xlock_local());
4906 in
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4907 mdr
->emplace_lock(&in
->versionlock
, MutationImpl::LockOp::XLOCK
);
4909 if (lock
->is_stable())
4911 lock
->set_state(LOCK_XLOCK
);
4912 if (lock
== &in
->filelock
)
4914 lock
->get_xlock(mdr
, mdr
->get_client());
4915 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
4920 for (const auto &p
: strong
->wrlocked_inodes
) {
4921 CInode
*in
= get_inode(p
.first
);
4922 for (const auto &q
: p
.second
) {
4923 SimpleLock
*lock
= in
->get_lock(q
.first
);
4924 for (const auto &r
: q
.second
) {
4925 dout(10) << " inode wrlock by " << r
<< " on " << *lock
<< " on " << *in
<< dendl
;
4926 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4928 ceph_assert(mdr
->is_auth_pinned(in
));
4929 lock
->set_state(LOCK_MIX
);
4930 if (lock
== &in
->filelock
)
4932 lock
->get_wrlock(true);
4933 mdr
->emplace_lock(lock
, MutationImpl::LockOp::WRLOCK
);
4939 ceph_assert(rejoin_gather
.count(from
));
4940 rejoin_gather
.erase(from
);
4941 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4942 rejoin_gather_finish();
4944 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4948 void MDCache::handle_cache_rejoin_ack(const cref_t
<MMDSCacheRejoin
> &ack
)
4950 dout(7) << "handle_cache_rejoin_ack from " << ack
->get_source() << dendl
;
4951 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
4953 ceph_assert(mds
->get_state() >= MDSMap::STATE_REJOIN
);
4954 bool survivor
= !mds
->is_rejoin();
4956 // for sending cache expire message
4957 set
<CInode
*> isolated_inodes
;
4958 set
<CInode
*> refragged_inodes
;
4959 list
<pair
<CInode
*,int> > updated_realms
;
4962 for (const auto &p
: ack
->strong_dirfrags
) {
4963 // we may have had incorrect dir fragmentation; refragment based
4964 // on what they auth tells us.
4965 CDir
*dir
= get_dirfrag(p
.first
);
4967 dir
= get_force_dirfrag(p
.first
, false);
4969 refragged_inodes
.insert(dir
->get_inode());
4972 CInode
*diri
= get_inode(p
.first
.ino
);
4974 // barebones inode; the full inode loop below will clean up.
4975 diri
= new CInode(this, false);
4976 auto _inode
= diri
->_get_inode();
4977 _inode
->ino
= p
.first
.ino
;
4978 _inode
->mode
= S_IFDIR
;
4979 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4982 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
) {
4983 diri
->inode_auth
= mds_authority_t(from
, CDIR_AUTH_UNKNOWN
);
4984 dout(10) << " add inode " << *diri
<< dendl
;
4986 diri
->inode_auth
= CDIR_AUTH_DEFAULT
;
4987 isolated_inodes
.insert(diri
);
4988 dout(10) << " unconnected dirfrag " << p
.first
<< dendl
;
4991 // barebones dirfrag; the full dirfrag loop below will clean up.
4992 dir
= diri
->add_dirfrag(new CDir(diri
, p
.first
.frag
, this, false));
4993 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
||
4994 (dir
->authority() != CDIR_AUTH_UNDEF
&&
4995 dir
->authority().first
!= from
))
4996 adjust_subtree_auth(dir
, from
);
4997 dout(10) << " add dirfrag " << *dir
<< dendl
;
5000 dir
->set_replica_nonce(p
.second
.nonce
);
5001 dir
->state_clear(CDir::STATE_REJOINING
);
5002 dout(10) << " got " << *dir
<< dendl
;
5005 auto it
= ack
->strong_dentries
.find(p
.first
);
5006 if (it
!= ack
->strong_dentries
.end()) {
5007 for (const auto &q
: it
->second
) {
5008 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
5010 dn
= dir
->add_null_dentry(q
.first
.name
, q
.second
.first
, q
.first
.snapid
);
5012 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5014 ceph_assert(dn
->last
== q
.first
.snapid
);
5015 if (dn
->first
!= q
.second
.first
) {
5016 dout(10) << " adjust dn.first " << dn
->first
<< " -> " << q
.second
.first
<< " on " << *dn
<< dendl
;
5017 dn
->first
= q
.second
.first
;
5020 // may have bad linkage if we missed dentry link/unlink messages
5021 if (dnl
->is_primary()) {
5022 CInode
*in
= dnl
->get_inode();
5023 if (!q
.second
.is_primary() ||
5024 vinodeno_t(q
.second
.ino
, q
.first
.snapid
) != in
->vino()) {
5025 dout(10) << " had bad linkage for " << *dn
<< ", unlinking " << *in
<< dendl
;
5026 dir
->unlink_inode(dn
);
5028 } else if (dnl
->is_remote()) {
5029 if (!q
.second
.is_remote() ||
5030 q
.second
.remote_ino
!= dnl
->get_remote_ino() ||
5031 q
.second
.remote_d_type
!= dnl
->get_remote_d_type()) {
5032 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5033 dir
->unlink_inode(dn
);
5036 if (!q
.second
.is_null())
5037 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5040 // hmm, did we have the proper linkage here?
5041 if (dnl
->is_null() && !q
.second
.is_null()) {
5042 if (q
.second
.is_remote()) {
5043 dn
->dir
->link_remote_inode(dn
, q
.second
.remote_ino
, q
.second
.remote_d_type
);
5045 CInode
*in
= get_inode(q
.second
.ino
, q
.first
.snapid
);
5047 // barebones inode; assume it's dir, the full inode loop below will clean up.
5048 in
= new CInode(this, false, q
.second
.first
, q
.first
.snapid
);
5049 auto _inode
= in
->_get_inode();
5050 _inode
->ino
= q
.second
.ino
;
5051 _inode
->mode
= S_IFDIR
;
5052 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
5054 dout(10) << " add inode " << *in
<< dendl
;
5055 } else if (in
->get_parent_dn()) {
5056 dout(10) << " had bad linkage for " << *(in
->get_parent_dn())
5057 << ", unlinking " << *in
<< dendl
;
5058 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
5060 dn
->dir
->link_primary_inode(dn
, in
);
5061 isolated_inodes
.erase(in
);
5065 dn
->set_replica_nonce(q
.second
.nonce
);
5066 dn
->lock
.set_state_rejoin(q
.second
.lock
, rejoin_waiters
, survivor
);
5067 dn
->state_clear(CDentry::STATE_REJOINING
);
5068 dout(10) << " got " << *dn
<< dendl
;
5073 for (const auto& in
: refragged_inodes
) {
5074 auto&& ls
= in
->get_nested_dirfrags();
5075 for (const auto& dir
: ls
) {
5076 if (dir
->is_auth() || ack
->strong_dirfrags
.count(dir
->dirfrag()))
5078 ceph_assert(dir
->get_num_any() == 0);
5079 in
->close_dirfrag(dir
->get_frag());
5084 for (const auto &p
: ack
->dirfrag_bases
) {
5085 CDir
*dir
= get_dirfrag(p
.first
);
5087 auto q
= p
.second
.cbegin();
5088 dir
->_decode_base(q
);
5089 dout(10) << " got dir replica " << *dir
<< dendl
;
5093 auto p
= ack
->inode_base
.cbegin();
5101 CInode
*in
= get_inode(ino
, last
);
5103 auto q
= basebl
.cbegin();
5106 sseq
= in
->snaprealm
->srnode
.seq
;
5107 in
->_decode_base(q
);
5108 if (in
->snaprealm
&& in
->snaprealm
->srnode
.seq
!= sseq
) {
5109 int snap_op
= sseq
> 0 ? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
;
5110 updated_realms
.push_back(pair
<CInode
*,int>(in
, snap_op
));
5112 dout(10) << " got inode base " << *in
<< dendl
;
5116 p
= ack
->inode_locks
.cbegin();
5117 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5128 CInode
*in
= get_inode(ino
, last
);
5130 in
->set_replica_nonce(nonce
);
5131 auto q
= lockbl
.cbegin();
5132 in
->_decode_locks_rejoin(q
, rejoin_waiters
, rejoin_eval_locks
, survivor
);
5133 in
->state_clear(CInode::STATE_REJOINING
);
5134 dout(10) << " got inode locks " << *in
<< dendl
;
5137 // FIXME: This can happen if entire subtree, together with the inode subtree root
5138 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5139 ceph_assert(isolated_inodes
.empty());
5141 map
<inodeno_t
,map
<client_t
,Capability::Import
> > peer_imported
;
5142 auto bp
= ack
->imported_caps
.cbegin();
5143 decode(peer_imported
, bp
);
5145 for (map
<inodeno_t
,map
<client_t
,Capability::Import
> >::iterator p
= peer_imported
.begin();
5146 p
!= peer_imported
.end();
5148 auto& ex
= cap_exports
.at(p
->first
);
5149 ceph_assert(ex
.first
== from
);
5150 for (map
<client_t
,Capability::Import
>::iterator q
= p
->second
.begin();
5151 q
!= p
->second
.end();
5153 auto r
= ex
.second
.find(q
->first
);
5154 ceph_assert(r
!= ex
.second
.end());
5156 dout(10) << " exporting caps for client." << q
->first
<< " ino " << p
->first
<< dendl
;
5157 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5159 dout(10) << " no session for client." << p
->first
<< dendl
;
5164 // mark client caps stale.
5165 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, p
->first
, 0,
5166 r
->second
.capinfo
.cap_id
, 0,
5167 mds
->get_osd_epoch_barrier());
5168 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
5169 (q
->second
.cap_id
> 0 ? from
: -1), 0);
5170 mds
->send_message_client_counted(m
, session
);
5174 ceph_assert(ex
.second
.empty());
5177 for (auto p
: updated_realms
) {
5178 CInode
*in
= p
.first
;
5179 bool notify_clients
;
5180 if (mds
->is_rejoin()) {
5181 if (!rejoin_pending_snaprealms
.count(in
)) {
5182 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5183 rejoin_pending_snaprealms
.insert(in
);
5185 notify_clients
= false;
5187 // notify clients if I'm survivor
5188 notify_clients
= true;
5190 do_realm_invalidate_and_update_notify(in
, p
.second
, notify_clients
);
5194 ceph_assert(rejoin_ack_gather
.count(from
));
5195 rejoin_ack_gather
.erase(from
);
5197 if (rejoin_gather
.empty()) {
5198 // eval unstable scatter locks after all wrlocks are rejoined.
5199 while (!rejoin_eval_locks
.empty()) {
5200 SimpleLock
*lock
= rejoin_eval_locks
.front();
5201 rejoin_eval_locks
.pop_front();
5202 if (!lock
->is_stable())
5203 mds
->locker
->eval_gather(lock
);
5207 if (rejoin_gather
.empty() && // make sure we've gotten our FULL inodes, too.
5208 rejoin_ack_gather
.empty()) {
5209 // finally, kickstart past snap parent opens
5212 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")"
5213 << ", rejoin_ack from (" << rejoin_ack_gather
<< ")" << dendl
;
5217 mds
->queue_waiters(rejoin_waiters
);
5222 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5224 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5225 * messages that clean these guys up...
5227 void MDCache::rejoin_trim_undef_inodes()
5229 dout(10) << "rejoin_trim_undef_inodes" << dendl
;
5231 while (!rejoin_undef_inodes
.empty()) {
5232 set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
5234 rejoin_undef_inodes
.erase(p
);
5236 in
->clear_replica_map();
5238 // close out dirfrags
5240 const auto&& dfls
= in
->get_dirfrags();
5241 for (const auto& dir
: dfls
) {
5242 dir
->clear_replica_map();
5244 for (auto &p
: dir
->items
) {
5245 CDentry
*dn
= p
.second
;
5246 dn
->clear_replica_map();
5248 dout(10) << " trimming " << *dn
<< dendl
;
5249 dir
->remove_dentry(dn
);
5252 dout(10) << " trimming " << *dir
<< dendl
;
5253 in
->close_dirfrag(dir
->dirfrag().frag
);
5257 CDentry
*dn
= in
->get_parent_dn();
5259 dn
->clear_replica_map();
5260 dout(10) << " trimming " << *dn
<< dendl
;
5261 dn
->dir
->remove_dentry(dn
);
5263 dout(10) << " trimming " << *in
<< dendl
;
5268 ceph_assert(rejoin_undef_inodes
.empty());
5271 void MDCache::rejoin_gather_finish()
5273 dout(10) << "rejoin_gather_finish" << dendl
;
5274 ceph_assert(mds
->is_rejoin());
5275 ceph_assert(rejoin_ack_gather
.count(mds
->get_nodeid()));
5277 if (open_undef_inodes_dirfrags())
5280 if (process_imported_caps())
5283 choose_lock_states_and_reconnect_caps();
5285 identify_files_to_recover();
5288 // signal completion of fetches, rejoin_gather_finish, etc.
5289 rejoin_ack_gather
.erase(mds
->get_nodeid());
5291 // did we already get our acks too?
5292 if (rejoin_ack_gather
.empty()) {
5293 // finally, open snaprealms
5298 class C_MDC_RejoinOpenInoFinish
: public MDCacheContext
{
5301 C_MDC_RejoinOpenInoFinish(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
5302 void finish(int r
) override
{
5303 mdcache
->rejoin_open_ino_finish(ino
, r
);
5307 void MDCache::rejoin_open_ino_finish(inodeno_t ino
, int ret
)
5309 dout(10) << "open_caps_inode_finish ino " << ino
<< " ret " << ret
<< dendl
;
5312 cap_imports_missing
.insert(ino
);
5313 } else if (ret
== mds
->get_nodeid()) {
5314 ceph_assert(get_inode(ino
));
5316 auto p
= cap_imports
.find(ino
);
5317 ceph_assert(p
!= cap_imports
.end());
5318 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5319 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5320 ceph_assert(q
->second
.size() == 1);
5321 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5323 cap_imports
.erase(p
);
5326 ceph_assert(cap_imports_num_opening
> 0);
5327 cap_imports_num_opening
--;
5329 if (cap_imports_num_opening
== 0) {
5330 if (rejoin_gather
.empty())
5331 rejoin_gather_finish();
5332 else if (rejoin_gather
.count(mds
->get_nodeid()))
5333 process_imported_caps();
5337 class C_MDC_RejoinSessionsOpened
: public MDCacheLogContext
{
5339 map
<client_t
,pair
<Session
*,uint64_t> > session_map
;
5340 C_MDC_RejoinSessionsOpened(MDCache
*c
) : MDCacheLogContext(c
) {}
5341 void finish(int r
) override
{
5342 ceph_assert(r
== 0);
5343 mdcache
->rejoin_open_sessions_finish(session_map
);
5347 void MDCache::rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
)
5349 dout(10) << "rejoin_open_sessions_finish" << dendl
;
5350 mds
->server
->finish_force_open_sessions(session_map
);
5351 rejoin_session_map
.swap(session_map
);
5352 if (rejoin_gather
.empty())
5353 rejoin_gather_finish();
5356 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
)
5358 auto p
= cap_imports
.find(ino
);
5359 if (p
!= cap_imports
.end()) {
5360 dout(10) << __func__
<< " ino " << ino
<< " ret " << ret
<< dendl
;
5362 cap_imports_missing
.insert(ino
);
5363 } else if (ret
!= mds
->get_nodeid()) {
5364 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5365 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5366 ceph_assert(q
->second
.size() == 1);
5367 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5369 cap_imports
.erase(p
);
5374 bool MDCache::process_imported_caps()
5376 dout(10) << "process_imported_caps" << dendl
;
5378 if (!open_file_table
.is_prefetched() &&
5379 open_file_table
.prefetch_inodes()) {
5380 open_file_table
.wait_for_prefetch(
5381 new MDSInternalContextWrapper(mds
,
5382 new LambdaContext([this](int r
) {
5383 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5384 process_imported_caps();
5391 for (auto& p
: cap_imports
) {
5392 CInode
*in
= get_inode(p
.first
);
5394 ceph_assert(in
->is_auth());
5395 cap_imports_missing
.erase(p
.first
);
5398 if (cap_imports_missing
.count(p
.first
) > 0)
5401 uint64_t parent_ino
= 0;
5402 std::string_view d_name
;
5403 for (auto& q
: p
.second
) {
5404 for (auto& r
: q
.second
) {
5405 auto &icr
= r
.second
;
5406 if (icr
.capinfo
.pathbase
&&
5407 icr
.path
.length() > 0 &&
5408 icr
.path
.find('/') == string::npos
) {
5409 parent_ino
= icr
.capinfo
.pathbase
;
5418 dout(10) << " opening missing ino " << p
.first
<< dendl
;
5419 cap_imports_num_opening
++;
5420 auto fin
= new C_MDC_RejoinOpenInoFinish(this, p
.first
);
5422 vector
<inode_backpointer_t
> ancestors
;
5423 ancestors
.push_back(inode_backpointer_t(parent_ino
, string
{d_name
}, 0));
5424 open_ino(p
.first
, (int64_t)-1, fin
, false, false, &ancestors
);
5426 open_ino(p
.first
, (int64_t)-1, fin
, false);
5428 if (!(cap_imports_num_opening
% 1000))
5429 mds
->heartbeat_reset();
5432 if (cap_imports_num_opening
> 0)
5435 // called by rejoin_gather_finish() ?
5436 if (rejoin_gather
.count(mds
->get_nodeid()) == 0) {
5437 if (!rejoin_client_map
.empty() &&
5438 rejoin_session_map
.empty()) {
5439 C_MDC_RejoinSessionsOpened
*finish
= new C_MDC_RejoinSessionsOpened(this);
5440 version_t pv
= mds
->server
->prepare_force_open_sessions(rejoin_client_map
,
5441 rejoin_client_metadata_map
,
5442 finish
->session_map
);
5443 ESessions
*le
= new ESessions(pv
, std::move(rejoin_client_map
),
5444 std::move(rejoin_client_metadata_map
));
5445 mds
->mdlog
->start_submit_entry(le
, finish
);
5446 mds
->mdlog
->flush();
5447 rejoin_client_map
.clear();
5448 rejoin_client_metadata_map
.clear();
5452 // process caps that were exported by peer rename
5453 for (map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > >::iterator p
= rejoin_peer_exports
.begin();
5454 p
!= rejoin_peer_exports
.end();
5456 CInode
*in
= get_inode(p
->first
);
5458 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.second
.begin();
5459 q
!= p
->second
.second
.end();
5461 auto r
= rejoin_session_map
.find(q
->first
);
5462 if (r
== rejoin_session_map
.end())
5465 Session
*session
= r
->second
.first
;
5466 Capability
*cap
= in
->get_client_cap(q
->first
);
5468 cap
= in
->add_client_cap(q
->first
, session
);
5469 // add empty item to reconnected_caps
5470 (void)reconnected_caps
[p
->first
][q
->first
];
5472 cap
->merge(q
->second
, true);
5474 Capability::Import
& im
= rejoin_imported_caps
[p
->second
.first
][p
->first
][q
->first
];
5475 ceph_assert(cap
->get_last_seq() == im
.issue_seq
);
5476 ceph_assert(cap
->get_mseq() == im
.mseq
);
5477 cap
->set_cap_id(im
.cap_id
);
5478 // send cap import because we assigned a new cap ID
5479 do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
, q
->second
.mseq
- 1,
5480 p
->second
.first
, CEPH_CAP_FLAG_AUTH
);
5483 rejoin_peer_exports
.clear();
5484 rejoin_imported_caps
.clear();
5486 // process cap imports
5487 // ino -> client -> frommds -> capex
5488 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ) {
5489 CInode
*in
= get_inode(p
->first
);
5491 dout(10) << " still missing ino " << p
->first
5492 << ", will try again after replayed client requests" << dendl
;
5496 ceph_assert(in
->is_auth());
5497 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5500 auto r
= rejoin_session_map
.find(q
->first
);
5501 session
= (r
!= rejoin_session_map
.end() ? r
->second
.first
: nullptr);
5504 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5507 (void)rejoin_imported_caps
[r
->first
][p
->first
][q
->first
]; // all are zero
5511 Capability
*cap
= in
->reconnect_cap(q
->first
, r
->second
, session
);
5512 add_reconnected_cap(q
->first
, in
->ino(), r
->second
);
5513 if (r
->first
>= 0) {
5514 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5516 do_cap_import(session
, in
, cap
, r
->second
.capinfo
.cap_id
, 0, 0, r
->first
, 0);
5518 Capability::Import
& im
= rejoin_imported_caps
[r
->first
][p
->first
][q
->first
];
5519 im
.cap_id
= cap
->get_cap_id();
5520 im
.issue_seq
= cap
->get_last_seq();
5521 im
.mseq
= cap
->get_mseq();
5525 cap_imports
.erase(p
++); // remove and move on
5530 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5531 rejoin_gather
.erase(mds
->get_nodeid());
5532 ceph_assert(!rejoin_ack_gather
.count(mds
->get_nodeid()));
5533 maybe_send_pending_rejoins();
5538 void MDCache::rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
,
5539 client_t client
, snapid_t snap_follows
)
5541 dout(10) << "rebuild_need_snapflush " << snap_follows
<< " on " << *head_in
<< dendl
;
5543 if (!realm
->has_snaps_in_range(snap_follows
+ 1, head_in
->first
- 1))
5546 const set
<snapid_t
>& snaps
= realm
->get_snaps();
5547 snapid_t follows
= snap_follows
;
5550 CInode
*in
= pick_inode_snap(head_in
, follows
);
5554 bool need_snapflush
= false;
5555 for (auto p
= snaps
.lower_bound(std::max
<snapid_t
>(in
->first
, (follows
+ 1)));
5556 p
!= snaps
.end() && *p
<= in
->last
;
5558 head_in
->add_need_snapflush(in
, *p
, client
);
5559 need_snapflush
= true;
5562 if (!need_snapflush
)
5565 dout(10) << " need snapflush from client." << client
<< " on " << *in
<< dendl
;
5567 if (in
->client_snap_caps
.empty()) {
5568 for (int i
= 0; i
< num_cinode_locks
; i
++) {
5569 int lockid
= cinode_lock_info
[i
].lock
;
5570 SimpleLock
*lock
= in
->get_lock(lockid
);
5573 lock
->set_state(LOCK_SNAP_SYNC
);
5574 lock
->get_wrlock(true);
5577 in
->client_snap_caps
.insert(client
);
5578 mds
->locker
->mark_need_snapflush_inode(in
);
5583 * choose lock states based on reconnected caps
5585 void MDCache::choose_lock_states_and_reconnect_caps()
5587 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl
;
5590 for (auto p
: inode_map
) {
5591 CInode
*in
= p
.second
;
5592 if (in
->last
!= CEPH_NOSNAP
)
5595 if (in
->is_auth() && !in
->is_base() && in
->get_inode()->is_dirty_rstat())
5596 in
->mark_dirty_rstat();
5599 auto q
= reconnected_caps
.find(in
->ino());
5600 if (q
!= reconnected_caps
.end()) {
5601 for (const auto &it
: q
->second
)
5602 dirty_caps
|= it
.second
.dirty_caps
;
5604 in
->choose_lock_states(dirty_caps
);
5605 dout(15) << " chose lock states on " << *in
<< dendl
;
5607 if (in
->snaprealm
&& !rejoin_pending_snaprealms
.count(in
)) {
5608 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5609 rejoin_pending_snaprealms
.insert(in
);
5612 if (!(++count
% 1000))
5613 mds
->heartbeat_reset();
5617 void MDCache::prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
5618 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5620 ref_t
<MClientSnap
> snap
;
5621 auto it
= splits
.find(client
);
5622 if (it
!= splits
.end()) {
5624 snap
->head
.op
= CEPH_SNAP_OP_SPLIT
;
5626 snap
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
5627 splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
5628 snap
->head
.split
= realm
->inode
->ino();
5629 snap
->bl
= realm
->get_snap_trace();
5631 for (const auto& child
: realm
->open_children
)
5632 snap
->split_realms
.push_back(child
->inode
->ino());
5634 snap
->split_inos
.push_back(ino
);
5637 void MDCache::prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
,
5638 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5640 ceph_assert(parent_realm
);
5642 vector
<inodeno_t
> split_inos
;
5643 vector
<inodeno_t
> split_realms
;
5645 for (auto p
= realm
->inodes_with_caps
.begin(); !p
.end(); ++p
)
5646 split_inos
.push_back((*p
)->ino());
5647 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
5648 p
!= realm
->open_children
.end();
5650 split_realms
.push_back((*p
)->inode
->ino());
5652 for (const auto& p
: realm
->client_caps
) {
5653 ceph_assert(!p
.second
->empty());
5654 auto em
= splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple());
5656 auto update
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
5657 update
->head
.split
= parent_realm
->inode
->ino();
5658 update
->split_inos
= split_inos
;
5659 update
->split_realms
= split_realms
;
5660 update
->bl
= parent_realm
->get_snap_trace();
5661 em
.first
->second
= std::move(update
);
5666 void MDCache::send_snaps(map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5668 dout(10) << "send_snaps" << dendl
;
5670 for (auto &p
: splits
) {
5671 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
.first
.v
));
5673 dout(10) << " client." << p
.first
5674 << " split " << p
.second
->head
.split
5675 << " inos " << p
.second
->split_inos
5677 mds
->send_message_client_counted(p
.second
, session
);
5679 dout(10) << " no session for client." << p
.first
<< dendl
;
5687 * remove any items from logsegment open_file lists that don't have
5690 void MDCache::clean_open_file_lists()
5692 dout(10) << "clean_open_file_lists" << dendl
;
5694 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
5695 p
!= mds
->mdlog
->segments
.end();
5697 LogSegment
*ls
= p
->second
;
5699 elist
<CInode
*>::iterator q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5703 if (in
->last
== CEPH_NOSNAP
) {
5704 dout(10) << " unlisting unwanted/capless inode " << *in
<< dendl
;
5705 in
->item_open_file
.remove_myself();
5707 if (in
->client_snap_caps
.empty()) {
5708 dout(10) << " unlisting flushed snap inode " << *in
<< dendl
;
5709 in
->item_open_file
.remove_myself();
5716 void MDCache::dump_openfiles(Formatter
*f
)
5718 f
->open_array_section("openfiles");
5719 for (auto p
= mds
->mdlog
->segments
.begin();
5720 p
!= mds
->mdlog
->segments
.end();
5722 LogSegment
*ls
= p
->second
;
5724 auto q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5728 if ((in
->last
== CEPH_NOSNAP
&& !in
->is_any_caps_wanted())
5729 || (in
->last
!= CEPH_NOSNAP
&& in
->client_snap_caps
.empty()))
5731 f
->open_object_section("file");
5732 in
->dump(f
, CInode::DUMP_PATH
| CInode::DUMP_INODE_STORE_BASE
| CInode::DUMP_CAPS
);
5739 Capability
* MDCache::rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
)
5741 dout(10) << "rejoin_import_cap for client." << client
<< " from mds." << frommds
5742 << " on " << *in
<< dendl
;
5743 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
5745 dout(10) << " no session for client." << client
<< dendl
;
5749 Capability
*cap
= in
->reconnect_cap(client
, icr
, session
);
5752 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5754 do_cap_import(session
, in
, cap
, icr
.capinfo
.cap_id
, 0, 0, frommds
, 0);
5760 void MDCache::export_remaining_imported_caps()
5762 dout(10) << "export_remaining_imported_caps" << dendl
;
5764 CachedStackStringStream css
;
5767 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ++p
) {
5768 *css
<< " ino " << p
->first
<< "\n";
5769 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5770 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5772 // mark client caps stale.
5773 auto stale
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, p
->first
,
5775 mds
->get_osd_epoch_barrier());
5776 stale
->set_cap_peer(0, 0, 0, -1, 0);
5777 mds
->send_message_client_counted(stale
, q
->first
);
5781 if (!(++count
% 1000))
5782 mds
->heartbeat_reset();
5785 for (map
<inodeno_t
, MDSContext::vec
>::iterator p
= cap_reconnect_waiters
.begin();
5786 p
!= cap_reconnect_waiters
.end();
5788 mds
->queue_waiters(p
->second
);
5790 cap_imports
.clear();
5791 cap_reconnect_waiters
.clear();
5793 if (css
->strv().length()) {
5794 mds
->clog
->warn() << "failed to reconnect caps for missing inodes:"
5799 Capability
* MDCache::try_reconnect_cap(CInode
*in
, Session
*session
)
5801 client_t client
= session
->info
.get_client();
5802 Capability
*cap
= nullptr;
5803 const cap_reconnect_t
*rc
= get_replay_cap_reconnect(in
->ino(), client
);
5805 cap
= in
->reconnect_cap(client
, *rc
, session
);
5806 dout(10) << "try_reconnect_cap client." << client
5807 << " reconnect wanted " << ccap_string(rc
->capinfo
.wanted
)
5808 << " issue " << ccap_string(rc
->capinfo
.issued
)
5809 << " on " << *in
<< dendl
;
5810 remove_replay_cap_reconnect(in
->ino(), client
);
5812 if (in
->is_replicated()) {
5813 mds
->locker
->try_eval(in
, CEPH_CAP_LOCKS
);
5816 auto p
= reconnected_caps
.find(in
->ino());
5817 if (p
!= reconnected_caps
.end()) {
5818 auto q
= p
->second
.find(client
);
5819 if (q
!= p
->second
.end())
5820 dirty_caps
= q
->second
.dirty_caps
;
5822 in
->choose_lock_states(dirty_caps
);
5823 dout(15) << " chose lock states on " << *in
<< dendl
;
5826 map
<inodeno_t
, MDSContext::vec
>::iterator it
=
5827 cap_reconnect_waiters
.find(in
->ino());
5828 if (it
!= cap_reconnect_waiters
.end()) {
5829 mds
->queue_waiters(it
->second
);
5830 cap_reconnect_waiters
.erase(it
);
5839 // cap imports and delayed snap parent opens
5841 void MDCache::do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
5842 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
5843 int peer
, int p_flags
)
5845 SnapRealm
*realm
= in
->find_snaprealm();
5846 dout(10) << "do_cap_import " << session
->info
.inst
.name
<< " mseq " << cap
->get_mseq() << " on " << *in
<< dendl
;
5847 if (cap
->get_last_seq() == 0) // reconnected cap
5848 cap
->inc_last_seq();
5849 cap
->set_last_issue();
5850 cap
->set_last_issue_stamp(ceph_clock_now());
5852 auto reap
= make_message
<MClientCaps
>(CEPH_CAP_OP_IMPORT
,
5853 in
->ino(), realm
->inode
->ino(), cap
->get_cap_id(),
5854 cap
->get_last_seq(), cap
->pending(), cap
->wanted(),
5855 0, cap
->get_mseq(), mds
->get_osd_epoch_barrier());
5856 in
->encode_cap_message(reap
, cap
);
5857 reap
->snapbl
= realm
->get_snap_trace();
5858 reap
->set_cap_peer(p_cap_id
, p_seq
, p_mseq
, peer
, p_flags
);
5859 mds
->send_message_client_counted(reap
, session
);
5862 void MDCache::do_delayed_cap_imports()
5864 dout(10) << "do_delayed_cap_imports" << dendl
;
5866 ceph_assert(delayed_imported_caps
.empty());
5869 struct C_MDC_OpenSnapRealms
: public MDCacheContext
{
5870 explicit C_MDC_OpenSnapRealms(MDCache
*c
) : MDCacheContext(c
) {}
5871 void finish(int r
) override
{
5872 mdcache
->open_snaprealms();
5876 void MDCache::open_snaprealms()
5878 dout(10) << "open_snaprealms" << dendl
;
5880 auto it
= rejoin_pending_snaprealms
.begin();
5881 while (it
!= rejoin_pending_snaprealms
.end()) {
5883 SnapRealm
*realm
= in
->snaprealm
;
5886 map
<client_t
,ref_t
<MClientSnap
>> splits
;
5887 // finish off client snaprealm reconnects?
5888 auto q
= reconnected_snaprealms
.find(in
->ino());
5889 if (q
!= reconnected_snaprealms
.end()) {
5890 for (const auto& r
: q
->second
)
5891 finish_snaprealm_reconnect(r
.first
, realm
, r
.second
, splits
);
5892 reconnected_snaprealms
.erase(q
);
5895 for (auto p
= realm
->inodes_with_caps
.begin(); !p
.end(); ++p
) {
5897 auto q
= reconnected_caps
.find(child
->ino());
5898 ceph_assert(q
!= reconnected_caps
.end());
5899 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5900 Capability
*cap
= child
->get_client_cap(r
->first
);
5903 if (r
->second
.snap_follows
> 0) {
5904 if (r
->second
.snap_follows
< child
->first
- 1) {
5905 rebuild_need_snapflush(child
, realm
, r
->first
, r
->second
.snap_follows
);
5906 } else if (r
->second
.snapflush
) {
5907 // When processing a cap flush message that is re-sent, it's possble
5908 // that the sender has already released all WR caps. So we should
5909 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5910 cap
->mark_needsnapflush();
5913 // make sure client's cap is in the correct snaprealm.
5914 if (r
->second
.realm_ino
!= in
->ino()) {
5915 prepare_realm_split(realm
, r
->first
, child
->ino(), splits
);
5920 rejoin_pending_snaprealms
.erase(it
++);
5921 in
->put(CInode::PIN_OPENINGSNAPPARENTS
);
5926 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE
);
5928 if (!reconnected_snaprealms
.empty()) {
5929 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl
;
5930 for (auto& p
: reconnected_snaprealms
) {
5931 CachedStackStringStream css
;
5932 *css
<< " " << p
.first
<< " {";
5934 for (auto& q
: p
.second
) {
5937 *css
<< "client." << q
.first
<< "/" << q
.second
;
5940 dout(5) << css
->strv() << dendl
;
5943 ceph_assert(rejoin_waiters
.empty());
5944 ceph_assert(rejoin_pending_snaprealms
.empty());
5945 dout(10) << "open_snaprealms - all open" << dendl
;
5946 do_delayed_cap_imports();
5948 ceph_assert(rejoin_done
);
5949 rejoin_done
.release()->complete(0);
5950 reconnected_caps
.clear();
5953 bool MDCache::open_undef_inodes_dirfrags()
5955 dout(10) << "open_undef_inodes_dirfrags "
5956 << rejoin_undef_inodes
.size() << " inodes "
5957 << rejoin_undef_dirfrags
.size() << " dirfrags" << dendl
;
5959 set
<CDir
*> fetch_queue
= rejoin_undef_dirfrags
;
5961 for (set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
5962 p
!= rejoin_undef_inodes
.end();
5965 ceph_assert(!in
->is_base());
5966 ceph_assert(in
->get_parent_dir());
5967 fetch_queue
.insert(in
->get_parent_dir());
5970 if (fetch_queue
.empty())
5973 MDSGatherBuilder
gather(g_ceph_context
,
5974 new MDSInternalContextWrapper(mds
,
5975 new LambdaContext([this](int r
) {
5976 if (rejoin_gather
.empty())
5977 rejoin_gather_finish();
5982 for (set
<CDir
*>::iterator p
= fetch_queue
.begin();
5983 p
!= fetch_queue
.end();
5986 CInode
*diri
= dir
->get_inode();
5987 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
5989 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
5990 ceph_assert(diri
->dirfragtree
.is_leaf(dir
->get_frag()));
5991 dir
->fetch(gather
.new_sub());
5993 ceph_assert(gather
.has_subs());
5998 void MDCache::opened_undef_inode(CInode
*in
) {
5999 dout(10) << "opened_undef_inode " << *in
<< dendl
;
6000 rejoin_undef_inodes
.erase(in
);
6002 // FIXME: re-hash dentries if necessary
6003 ceph_assert(in
->get_inode()->dir_layout
.dl_dir_hash
== g_conf()->mds_default_dir_hash
);
6004 if (in
->get_num_dirfrags() && !in
->dirfragtree
.is_leaf(frag_t())) {
6005 CDir
*dir
= in
->get_dirfrag(frag_t());
6007 rejoin_undef_dirfrags
.erase(dir
);
6008 in
->force_dirfrags();
6009 auto&& ls
= in
->get_dirfrags();
6010 for (const auto& dir
: ls
) {
6011 rejoin_undef_dirfrags
.insert(dir
);
6017 void MDCache::finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
6018 map
<client_t
,ref_t
<MClientSnap
>>& updates
)
6020 if (seq
< realm
->get_newest_seq()) {
6021 dout(10) << "finish_snaprealm_reconnect client." << client
<< " has old seq " << seq
<< " < "
6022 << realm
->get_newest_seq() << " on " << *realm
<< dendl
;
6023 auto snap
= make_message
<MClientSnap
>(CEPH_SNAP_OP_UPDATE
);
6024 snap
->bl
= realm
->get_snap_trace();
6025 for (const auto& child
: realm
->open_children
)
6026 snap
->split_realms
.push_back(child
->inode
->ino());
6027 updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
6029 dout(10) << "finish_snaprealm_reconnect client." << client
<< " up to date"
6030 << " on " << *realm
<< dendl
;
6036 void MDCache::rejoin_send_acks()
6038 dout(7) << "rejoin_send_acks" << dendl
;
6041 for (map
<mds_rank_t
, set
<CInode
*> >::iterator p
= rejoin_unlinked_inodes
.begin();
6042 p
!= rejoin_unlinked_inodes
.end();
6044 for (set
<CInode
*>::iterator q
= p
->second
.begin();
6045 q
!= p
->second
.end();
6048 dout(7) << " unlinked inode " << *in
<< dendl
;
6050 if (!in
->is_replica(p
->first
))
6053 CDentry
*dn
= in
->get_parent_dn();
6054 if (dn
->is_replica(p
->first
))
6056 dn
->add_replica(p
->first
);
6057 CDir
*dir
= dn
->get_dir();
6058 if (dir
->is_replica(p
->first
))
6060 dir
->add_replica(p
->first
);
6061 in
= dir
->get_inode();
6062 if (in
->is_replica(p
->first
))
6064 in
->add_replica(p
->first
);
6070 rejoin_unlinked_inodes
.clear();
6072 // send acks to everyone in the recovery set
6073 map
<mds_rank_t
,ref_t
<MMDSCacheRejoin
>> acks
;
6074 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
6075 p
!= recovery_set
.end();
6077 if (rejoin_ack_sent
.count(*p
))
6079 acks
[*p
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_ACK
);
6082 rejoin_ack_sent
= recovery_set
;
6085 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
6086 p
!= subtrees
.end();
6088 CDir
*dir
= p
->first
;
6089 if (!dir
->is_auth())
6091 dout(10) << "subtree " << *dir
<< dendl
;
6093 // auth items in this subtree
6094 std::queue
<CDir
*> dq
;
6097 while (!dq
.empty()) {
6098 CDir
*dir
= dq
.front();
6102 for (auto &r
: dir
->get_replicas()) {
6103 auto it
= acks
.find(r
.first
);
6104 if (it
== acks
.end())
6106 it
->second
->add_strong_dirfrag(dir
->dirfrag(), ++r
.second
, dir
->dir_rep
);
6107 it
->second
->add_dirfrag_base(dir
);
6110 for (auto &p
: dir
->items
) {
6111 CDentry
*dn
= p
.second
;
6112 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6116 if (dnl
->is_primary())
6117 in
= dnl
->get_inode();
6120 for (auto &r
: dn
->get_replicas()) {
6121 auto it
= acks
.find(r
.first
);
6122 if (it
== acks
.end())
6124 it
->second
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->get_alternate_name(),
6125 dn
->first
, dn
->last
,
6126 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
6127 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
6128 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
6130 dn
->lock
.get_replica_state());
6131 // peer missed MDentrylink message ?
6132 if (in
&& !in
->is_replica(r
.first
))
6133 in
->add_replica(r
.first
);
6139 for (auto &r
: in
->get_replicas()) {
6140 auto it
= acks
.find(r
.first
);
6141 if (it
== acks
.end())
6143 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6145 in
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6146 it
->second
->add_inode_locks(in
, ++r
.second
, bl
);
6149 // subdirs in this subtree?
6151 auto&& dirs
= in
->get_nested_dirfrags();
6152 for (const auto& dir
: dirs
) {
6161 if (root
&& root
->is_auth())
6162 for (auto &r
: root
->get_replicas()) {
6163 auto it
= acks
.find(r
.first
);
6164 if (it
== acks
.end())
6166 it
->second
->add_inode_base(root
, mds
->mdsmap
->get_up_features());
6168 root
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6169 it
->second
->add_inode_locks(root
, ++r
.second
, bl
);
6172 for (auto &r
: myin
->get_replicas()) {
6173 auto it
= acks
.find(r
.first
);
6174 if (it
== acks
.end())
6176 it
->second
->add_inode_base(myin
, mds
->mdsmap
->get_up_features());
6178 myin
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6179 it
->second
->add_inode_locks(myin
, ++r
.second
, bl
);
6182 // include inode base for any inodes whose scatterlocks may have updated
6183 for (set
<CInode
*>::iterator p
= rejoin_potential_updated_scatterlocks
.begin();
6184 p
!= rejoin_potential_updated_scatterlocks
.end();
6187 for (const auto &r
: in
->get_replicas()) {
6188 auto it
= acks
.find(r
.first
);
6189 if (it
== acks
.end())
6191 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6196 for (auto p
= acks
.begin(); p
!= acks
.end(); ++p
) {
6197 encode(rejoin_imported_caps
[p
->first
], p
->second
->imported_caps
);
6198 mds
->send_message_mds(p
->second
, p
->first
);
6201 rejoin_imported_caps
.clear();
6204 class C_MDC_ReIssueCaps
: public MDCacheContext
{
6207 C_MDC_ReIssueCaps(MDCache
*mdc
, CInode
*i
) :
6208 MDCacheContext(mdc
), in(i
)
6210 in
->get(CInode::PIN_PTRWAITER
);
6212 void finish(int r
) override
{
6213 if (!mdcache
->mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6214 mdcache
->mds
->locker
->issue_caps(in
);
6215 in
->put(CInode::PIN_PTRWAITER
);
6219 void MDCache::reissue_all_caps()
6221 dout(10) << "reissue_all_caps" << dendl
;
6224 for (auto &p
: inode_map
) {
6226 CInode
*in
= p
.second
;
6227 if (in
->is_head() && in
->is_any_caps()) {
6228 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6229 if (in
->is_frozen_inode()) {
6230 in
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDC_ReIssueCaps(this, in
));
6233 if (!mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6234 n
+= mds
->locker
->issue_caps(in
);
6237 if ((count
% 1000) + n
>= 1000)
6238 mds
->heartbeat_reset();
6244 // ===============================================================================
6246 struct C_MDC_QueuedCow
: public MDCacheContext
{
6249 C_MDC_QueuedCow(MDCache
*mdc
, CInode
*i
, MutationRef
& m
) :
6250 MDCacheContext(mdc
), in(i
), mut(m
) {}
6251 void finish(int r
) override
{
6252 mdcache
->_queued_file_recover_cow(in
, mut
);
6257 void MDCache::queue_file_recover(CInode
*in
)
6259 dout(10) << "queue_file_recover " << *in
<< dendl
;
6260 ceph_assert(in
->is_auth());
6264 SnapRealm *realm = in->find_snaprealm();
6265 set<snapid_t> s = realm->get_snaps();
6266 while (!s.empty() && *s.begin() < in->first)
6268 while (!s.empty() && *s.rbegin() > in->last)
6269 s.erase(*s.rbegin());
6270 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6272 auto pi = in->project_inode(mut);
6273 pi.inode.version = in->pre_dirty();
6275 auto mut(std::make_shared<MutationImpl>());
6276 mut->ls = mds->mdlog->get_current_segment();
6277 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6278 mds->mdlog->start_entry(le);
6279 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6281 s.erase(*s.begin());
6282 while (!s.empty()) {
6283 snapid_t snapid = *s.begin();
6284 CInode *cow_inode = 0;
6285 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6286 ceph_assert(cow_inode);
6287 recovery_queue.enqueue(cow_inode);
6288 s.erase(*s.begin());
6291 in->parent->first = in->first;
6292 le->metablob.add_primary_dentry(in->parent, in, true);
6293 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6294 mds->mdlog->flush();
6298 recovery_queue
.enqueue(in
);
6301 void MDCache::_queued_file_recover_cow(CInode
*in
, MutationRef
& mut
)
6304 mds
->locker
->drop_locks(mut
.get());
6310 * called after recovery to recover file sizes for previously opened (for write)
6311 * files. that is, those where max_size > size.
6313 void MDCache::identify_files_to_recover()
6315 dout(10) << "identify_files_to_recover" << dendl
;
6317 for (auto &p
: inode_map
) {
6318 CInode
*in
= p
.second
;
6322 if (in
->last
!= CEPH_NOSNAP
)
6325 // Only normal files need file size recovery
6326 if (!in
->is_file()) {
6330 bool recover
= false;
6331 const auto& client_ranges
= in
->get_projected_inode()->client_ranges
;
6332 if (!client_ranges
.empty()) {
6333 in
->mark_clientwriteable();
6334 for (auto& p
: client_ranges
) {
6335 Capability
*cap
= in
->get_client_cap(p
.first
);
6337 cap
->mark_clientwriteable();
6339 dout(10) << " client." << p
.first
<< " has range " << p
.second
<< " but no cap on " << *in
<< dendl
;
6347 if (in
->filelock
.is_stable()) {
6348 in
->auth_pin(&in
->filelock
);
6350 ceph_assert(in
->filelock
.get_state() == LOCK_XLOCKSNAP
);
6352 in
->filelock
.set_state(LOCK_PRE_SCAN
);
6353 rejoin_recover_q
.push_back(in
);
6355 rejoin_check_q
.push_back(in
);
6358 if (!(++count
% 1000))
6359 mds
->heartbeat_reset();
6363 void MDCache::start_files_to_recover()
6366 for (CInode
*in
: rejoin_check_q
) {
6367 if (in
->filelock
.get_state() == LOCK_XLOCKSNAP
)
6368 mds
->locker
->issue_caps(in
);
6369 mds
->locker
->check_inode_max_size(in
);
6370 if (!(++count
% 1000))
6371 mds
->heartbeat_reset();
6373 rejoin_check_q
.clear();
6374 for (CInode
*in
: rejoin_recover_q
) {
6375 mds
->locker
->file_recover(&in
->filelock
);
6376 if (!(++count
% 1000))
6377 mds
->heartbeat_reset();
6379 if (!rejoin_recover_q
.empty()) {
6380 rejoin_recover_q
.clear();
6385 void MDCache::do_file_recover()
6387 recovery_queue
.advance();
6390 // ===============================================================================
6393 // ----------------------------
6396 class C_MDC_RetryTruncate
: public MDCacheContext
{
6400 C_MDC_RetryTruncate(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6401 MDCacheContext(c
), in(i
), ls(l
) {}
6402 void finish(int r
) override
{
6403 mdcache
->_truncate_inode(in
, ls
);
6407 void MDCache::truncate_inode(CInode
*in
, LogSegment
*ls
)
6409 const auto& pi
= in
->get_projected_inode();
6410 dout(10) << "truncate_inode "
6411 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6415 ls
->truncating_inodes
.insert(in
);
6416 in
->get(CInode::PIN_TRUNCATING
);
6419 if (!in
->client_need_snapflush
.empty() &&
6420 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6421 ceph_assert(in
->filelock
.is_xlocked());
6422 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6423 mds
->locker
->issue_caps(in
);
6427 _truncate_inode(in
, ls
);
6430 struct C_IO_MDC_TruncateFinish
: public MDCacheIOContext
{
6433 C_IO_MDC_TruncateFinish(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6434 MDCacheIOContext(c
, false), in(i
), ls(l
) {
6436 void finish(int r
) override
{
6437 ceph_assert(r
== 0 || r
== -CEPHFS_ENOENT
);
6438 mdcache
->truncate_inode_finish(in
, ls
);
6440 void print(ostream
& out
) const override
{
6441 out
<< "file_truncate(" << in
->ino() << ")";
// Issue the actual object-store truncate for an inode already validated and
// registered by truncate_inode()/start_recovered_truncates().
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  const auto& pi = in->get_inode();
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  // sanity: sizes must be sane 63-bit values and this must be a shrink
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from);


  // choose the snap context: the inode's snaprealm if it has one, else a
  // null context (only valid for head inodes, hence the assert)
  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
  // truncate the [truncate_size, truncate_from) byte range of the file's
  // objects; completion is bounced through the finisher back to the MDS
  auto layout = pi->layout;
  filer.truncate(in->ino(), &layout, *snapc,
		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
		 pi->truncate_seq, ceph::real_time::min(), 0,
		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
				  mds->finisher));
}
// Journal completion for the "truncate finish" EUpdate; forwards to
// MDCache::truncate_inode_logged() to drop locks/pins and wake waiters.
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};
// Called once the RADOS truncate completed: clear the truncating state in a
// projected inode and journal a "truncate finish" event.
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  // the inode must still be registered in the segment it started in
  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();

  // project the cleared truncate state; it becomes visible when the journal
  // entry commits (C_MDC_TruncateLogged)
  auto pi = in->project_inode(mut);
  pi.inode->version = in->pre_dirty();
  pi.inode->truncate_from = 0;
  pi.inode->truncate_pending--;

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);

  predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  journal_dirty_inode(mut.get(), &le->metablob, in);
  // record against the segment the truncate *started* in so replay can
  // match start/finish pairs
  le->metablob.add_truncate_finish(in->ino(), ls->seq);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
// Final stage of a truncate: the "truncate finish" event is journaled, so
// apply the mutation, release locks/pins taken in truncate_inode(), and wake
// anyone waiting for the truncate to complete.
void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  // undo the PIN_TRUNCATING/auth_pin from truncate_inode()
  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  MDSContext::vec waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}
6535 void MDCache::add_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6537 dout(20) << "add_recovered_truncate " << *in
<< " in log segment "
6538 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6539 ls
->truncating_inodes
.insert(in
);
6540 in
->get(CInode::PIN_TRUNCATING
);
6543 void MDCache::remove_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6545 dout(20) << "remove_recovered_truncate " << *in
<< " in log segment "
6546 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6547 // if we have the logseg the truncate started in, it must be in our list.
6548 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6549 ceph_assert(p
!= ls
->truncating_inodes
.end());
6550 ls
->truncating_inodes
.erase(p
);
6551 in
->put(CInode::PIN_TRUNCATING
);
// After recovery, restart every truncate that was in flight when the MDS
// went down (recorded per log segment by add_recovered_truncate()).
void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
	 q != ls->truncating_inodes.end();
	 ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      if (!in->client_need_snapflush.empty() &&
	  (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
	// clients still hold buffered snap data: recreate the xlock state a
	// live truncate would have had and defer until the snapflush lands
	ceph_assert(in->filelock.is_stable());
	in->filelock.set_state(LOCK_XLOCKDONE);
	in->auth_pin(&in->filelock);
	in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
	// start_files_to_recover will revoke caps
	continue;
      }
      _truncate_inode(in, ls);
    }
  }
}
6582 class C_MDS_purge_completed_finish
: public MDCacheLogContext
{
6583 interval_set
<inodeno_t
> inos
;
6585 version_t inotablev
;
6587 C_MDS_purge_completed_finish(MDCache
*m
, const interval_set
<inodeno_t
>& _inos
,
6588 LogSegment
*_ls
, version_t iv
)
6589 : MDCacheLogContext(m
), inos(_inos
), ls(_ls
), inotablev(iv
) {}
6590 void finish(int r
) override
{
6593 get_mds()->inotable
->apply_release_ids(inos
);
6594 assert(get_mds()->inotable
->get_version() == inotablev
);
6596 ls
->purge_inodes_finish(inos
);
6600 void MDCache::start_purge_inodes(){
6601 dout(10) << "start_purge_inodes" << dendl
;
6602 for (auto& p
: mds
->mdlog
->segments
){
6603 LogSegment
*ls
= p
.second
;
6604 if (ls
->purging_inodes
.size()){
6605 purge_inodes(ls
->purging_inodes
, ls
);
6610 void MDCache::purge_inodes(const interval_set
<inodeno_t
>& inos
, LogSegment
*ls
)
6612 dout(10) << __func__
<< " purging inos " << inos
<< " logseg " << ls
->seq
<< dendl
;
6613 // FIXME: handle non-default data pool and namespace
6615 auto cb
= new LambdaContext([this, inos
, ls
](int r
){
6616 assert(r
== 0 || r
== -2);
6617 mds
->inotable
->project_release_ids(inos
);
6618 version_t piv
= mds
->inotable
->get_projected_version();
6620 mds
->mdlog
->start_submit_entry(new EPurged(inos
, ls
->seq
, piv
),
6621 new C_MDS_purge_completed_finish(this, inos
, ls
, piv
));
6622 mds
->mdlog
->flush();
6625 C_GatherBuilder
gather(g_ceph_context
,
6626 new C_OnFinisher(new MDSIOContextWrapper(mds
, cb
), mds
->finisher
));
6627 SnapContext nullsnapc
;
6628 for (const auto& [start
, len
] : inos
) {
6629 for (auto i
= start
; i
< start
+ len
; i
+= 1) {
6630 filer
.purge_range(i
, &default_file_layout
, nullsnapc
, 0, 1,
6631 ceph::real_clock::now(), 0, gather
.new_sub());
6637 // ================================================================================
// Expire up to `count` dentries from the two LRUs (bottom_lru first, then
// lru), accumulating expire messages in `expiremap`.  Trimming is throttled
// by mds_cache_trim_threshold per trim_counter window.
// Returns {throttled, number trimmed}.
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;   // dentries that refused to trim
  uint64_t trimmed = 0;

  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");

  dout(7) << "trim_lru trimming " << count
          << " items from LRU"
          << " size=" << lru.lru_get_size()
          << " mid=" << lru.lru_get_top()
          << " pintail=" << lru.lru_get_pintail()
          << " pinned=" << lru.lru_get_num_pinned()
          << dendl;

  const uint64_t trim_counter_start = trim_counter.get();
  bool throttled = false;
  // first drain bottom_lru (low-value entries) unconditionally
  while (1) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  // put back whatever couldn't be expired
  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  // if mds is in standby_replay and skip trimming the inodes
  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn) {
      break;
    }
    if (is_standby_replay && dn->get_linkage()->inode) {
      // we move the inodes that need to be trimmed to the end of the lru queue.
      // refer to MDCache::standby_trim_segment
      lru.lru_insert_bot(dn);
      break;
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }
  trim_counter.hit(trimmed);

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
  return std::pair<bool, uint64_t>(throttled, trimmed);
}
/**
 * Trim the cache down toward its memory limit.
 *
 * note: only called while MDS is active or stopping... NOT during recovery.
 * however, we may expire a replica whose authority is recovering.
 *
 * @param count is number of dentries to try to expire
 * @return {throttled, trimmed} as produced by trim_lru()
 */
std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
{
  uint64_t used = cache_size();
  uint64_t limit = cache_memory_limit;
  expiremap expiremap;   // per-rank MCacheExpire messages, sent at the end

  dout(7) << "trim bytes_used=" << bytes2str(used)
          << " limit=" << bytes2str(limit)
          << " reservation=" << cache_reservation
          << "% count=" << count << dendl;

  // process delayed eval_stray()
  stray_manager.advance_delayed();

  auto result = trim_lru(count, expiremap);
  auto& trimmed = result.second;

  // trim non-auth, non-bound subtrees
  // (advance the iterator before acting: trim_dirfrag/remove_subtree may
  // erase the current subtrees entry)
  for (auto p = subtrees.begin(); p != subtrees.end();) {
    CDir *dir = p->first;
    ++p;
    CInode *diri = dir->get_inode();
    if (dir->is_auth()) {
      if (diri->is_auth() && !diri->is_base()) {
        /* this situation should correspond to an export pin */
        if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
          /* pinned empty subtree, try to drop */
          if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
            dout(20) << "trimming empty pinned subtree " << *dir << dendl;
            dir->state_clear(CDir::STATE_AUXSUBTREE);
            remove_subtree(dir);
            diri->close_dirfrag(dir->dirfrag().frag);
          }
        }
      } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
        // auth dirfrag under a non-auth inode: hand the empty import back,
        // unless it is busy (exporting/freezing) or we are not active
        if (dir->state_test(CDir::STATE_EXPORTING) ||
            !(mds->is_active() || mds->is_stopping()) ||
            dir->is_freezing() || dir->is_frozen())
          continue;

        migrator->export_empty_import(dir);
        ++trimmed;
      }
    } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
      // only subtree pin
      if (diri->get_num_ref() > diri->get_num_subtree_roots()) {
        continue;
      }

      // don't trim subtree root if its auth MDS is recovering.
      // This simplify the cache rejoin code.
      if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
        continue;
      trim_dirfrag(dir, 0, expiremap);
      ++trimmed;
    }
  }

  // trim root?
  if (mds->is_stopping() && root) {
    auto&& ls = root->get_dirfrags();
    for (const auto& dir : ls) {
      if (dir->get_num_ref() == 1) { // subtree pin
        trim_dirfrag(dir, 0, expiremap);
        ++trimmed;
      }
    }
    if (root->get_num_ref() == 0) {
      trim_inode(0, root, 0, expiremap);
      ++trimmed;
    }
  }

  // try to expire the mdsdirs of ranks that are stopping, so they can finish
  std::set<mds_rank_t> stopping;
  mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
  stopping.erase(mds->get_nodeid());
  for (auto rank : stopping) {
    CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
    if (!mdsdir_in)
      continue;

    auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
    if (em.second) {
      em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
    }

    dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds <<  dendl;

    const bool aborted = expire_recursive(mdsdir_in, expiremap);
    if (!aborted) {
      dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
      auto&& ls = mdsdir_in->get_dirfrags();
      for (auto dir : ls) {
        if (dir->get_num_ref() == 1) {  // subtree pin
          trim_dirfrag(dir, dir, expiremap);
          ++trimmed;
        }
      }
      if (mdsdir_in->get_num_ref() == 0) {
        trim_inode(NULL, mdsdir_in, NULL, expiremap);
        ++trimmed;
      }
    } else {
      dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
    }
  }

  // Other rank's base inodes (when I'm stopping)
  if (mds->is_stopping()) {
    for (set<CInode*>::iterator p = base_inodes.begin();
         p != base_inodes.end();) {
      CInode *base_in = *p;
      ++p;   // advance first: trim_inode may remove *p from base_inodes
      if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
          MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
        dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
        if (base_in->get_num_ref() == 0) {
          trim_inode(NULL, base_in, NULL, expiremap);
          ++trimmed;
        }
      }
    }
  }

  // send any expire messages
  send_expire_messages(expiremap);

  return result;
}
6844 void MDCache::send_expire_messages(expiremap
& expiremap
)
6847 for (const auto &p
: expiremap
) {
6848 if (mds
->is_cluster_degraded() &&
6849 (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
6850 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
6851 rejoin_sent
.count(p
.first
) == 0))) {
6854 dout(7) << "sending cache_expire to " << p
.first
<< dendl
;
6855 mds
->send_message_mds(p
.second
, p
.first
);
// Try to expire one dentry (and, for a primary link, its inode subtree).
// Returns true if the dentry could NOT be trimmed and must be kept,
// false if it was removed from the cache.
bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  ceph_assert(dir);

  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    ceph_assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry. This is because that
  // unconnected replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without effecting
  //       directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (!(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    ceph_assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    ceph_assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.
    mds_authority_t auth = dn->authority();

    // send to both the first and (if different) second auth rank
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  if (dn->last == CEPH_NOSNAP && dir->is_auth())
    dir->add_to_bloom(dn);
  dir->remove_dentry(dn);

  if (clear_complete)
    dir->state_clear(CDir::STATE_COMPLETE);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}
// Expire a single dirfrag with no remaining references: drop it from the
// subtree map if it is a root, notify its authority if we are a replica,
// and close it on its inode.
void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    // an auth subtree root is only trimmable if unreplicated and on a base
    // inode (root/mdsdir)
    ceph_assert(!dir->is_auth() ||
		(!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  ceph_assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();

    // was this an auth delegation?  (if so, slightly modified container)
    dirfrag_t condf;
    if (dir->is_subtree_root()) {
      dout(12) << " subtree root, container is " << *dir << dendl;
      con = dir;
      condf = dir->dirfrag();
    } else
      condf = con->dirfrag();

    // notify the first and (if different) second auth rank
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
    }
  }

  in->close_dirfrag(dir->dirfrag().frag);
}
/**
 * Try trimming an inode from the cache
 *
 * @return true if the inode is still in cache, else false if it was trimmed
 */
bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_inode " << *in << dendl;
  ceph_assert(in->get_num_ref() == 0);

  if (in->is_dir()) {
    // If replica inode's dirfragtreelock is not readable, it's likely
    // some dirfrags of the inode are being fragmented and we will receive
    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
    // dirfrags, so we should avoid trimming these dirfrags' parent inode.
    // This is because that unconnected replicas are problematic for
    // subtree migration.
    //
    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
      return true;
    }

    // DIR: close all of its dirfrags first
    auto&& dfls = in->get_dirfrags();
    for (const auto& dir : dfls) {
      ceph_assert(!dir->is_subtree_root());
      trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p
    }
  }

  // INODE
  if (in->is_auth()) {
    // eval stray after closing dirfrags
    if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
      maybe_eval_stray(in);
      // eval_stray may have started purging (or re-pinned the dentry);
      // in that case the inode must stay in cache
      if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
	return true;
    }
  } else {
    mds_authority_t auth = in->authority();

    dirfrag_t df;
    if (con)
      df = con->dirfrag();
    else
      df = dirfrag_t(0,frag_t()); // must be a root or stray inode.

    // notify the first and (if different) second auth rank
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (con && mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
    }
  }

  // NOTE(review): long-dead instrumentation kept for historical reference;
  // candidate for deletion.
  /*
  if (in->is_auth()) {
    if (in->hack_accessed)
      mds->logger->inc("outt");
    else {
      mds->logger->inc("outut");
      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
    }
  }
  */

  // unlink
  if (dn)
    dn->get_dir()->unlink_inode(dn, false);
  remove_inode(in);
  return false;
}
/**
 * trim_non_auth - remove any non-auth items from our cache
 *
 * this reduces the amount of non-auth metadata in our cache, reducing the
 * load incurred by the rejoin phase.
 *
 * the only non-auth items that remain are those that are needed to
 * attach our own subtrees to the root.
 *
 * when we are done, all dentries will be in the top bit of the lru.
 *
 * why we have to do this:
 *  we may not have accurate linkage for non-auth items.  which means we will
 *  know which subtree it falls into, and can not be sure to declare it to the
 *  correct authority.
 */
void MDCache::trim_non_auth()
{
  dout(7) << "trim_non_auth" << dendl;

  // temporarily pin all subtree roots
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->get(CDir::PIN_SUBTREETEMP);

  list<CDentry*> auth_list;   // auth dentries to re-insert afterwards

  // trim non-auth items from the lru
  // (drain bottom_lru first, then the main lru)
  while (true) {
    CDentry *dn = NULL;
    if (bottom_lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn && lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn)
      break;

    CDentry::linkage_t *dnl = dn->get_linkage();

    if (dn->is_auth()) {
      // add back into lru (at the top)
      auth_list.push_back(dn);

      // but sever links to non-auth inodes
      if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
	dn->unlink_remote(dnl);
    } else {
      // non-auth.  expire.
      CDir *dir = dn->get_dir();
      ceph_assert(dir);

      // unlink the dentry
      dout(10) << " removing " << *dn << dendl;
      if (dnl->is_remote()) {
	dir->unlink_inode(dn, false);
      }
      else if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(10) << " removing " << *in << dendl;
	auto&& ls = in->get_dirfrags();
	for (const auto& subdir : ls) {
	  ceph_assert(!subdir->is_subtree_root());
	  in->close_dirfrag(subdir->dirfrag().frag);
	}
	dir->unlink_inode(dn, false);
	remove_inode(in);
      }
      else {
	ceph_assert(dnl->is_null());
      }

      ceph_assert(!dir->has_bloom());
      dir->remove_dentry(dn);
      // adjust the dir state
      dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete!
      // close empty non-auth dirfrag
      if (!dir->is_subtree_root() && dir->get_num_any() == 0)
	dir->inode->close_dirfrag(dir->get_frag());
    }
  }

  // re-insert the auth dentries we kept
  for (const auto& dn : auth_list) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      bottom_lru.lru_insert_mid(dn);
    else
      lru.lru_insert_top(dn);
  }

  // move everything in the pintail to the top bit of the lru.
  lru.lru_touch_entire_pintail();

  // unpin all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->put(CDir::PIN_SUBTREETEMP);

  if (lru.lru_get_size() == 0 &&
      bottom_lru.lru_get_size() == 0) {
    // root, stray, etc.?
    auto p = inode_map.begin();
    while (p != inode_map.end()) {
      CInode *in = p->second;
      ++p;   // advance first: remove_inode() erases from inode_map
      if (!in->is_auth()) {
	auto&& ls = in->get_dirfrags();
	for (const auto& dir : ls) {
	  dout(10) << " removing " << *dir << dendl;
	  ceph_assert(dir->get_num_ref() == 1); // SUBTREE
	  remove_subtree(dir);
	  in->close_dirfrag(dir->dirfrag().frag);
	}
	dout(10) << " removing " << *in << dendl;
	ceph_assert(!in->get_parent_dn());
	ceph_assert(in->get_num_ref() == 0);
	remove_inode(in);
      }
    }
  }

  show_subtrees();
}
/**
 * Recursively trim the subtree rooted at directory to remove all
 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
 * of those links. This is used to clear invalid data out of the cache.
 * Note that it doesn't clear the passed-in directory, since that's not
 * always safe.
 *
 * @return true if this dir (or something below it) must be kept.
 */
bool MDCache::trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;

  bool keep_dir = !can_trim_non_auth_dirfrag(dir);

  auto j = dir->begin();
  auto i = j;
  while (j != dir->end()) {
    i = j++;   // advance before possibly removing i's dentry
    CDentry *dn = i->second;
    dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
    CDentry::linkage_t *dnl = dn->get_linkage();
    if (dnl->is_primary()) { // check for subdirectories, etc
      CInode *in = dnl->get_inode();
      bool keep_inode = false;
      if (in->is_dir()) {
        auto&& subdirs = in->get_dirfrags();
        for (const auto& subdir : subdirs) {
          if (subdir->is_subtree_root()) {
            // a bound of our subtree: must keep the chain up to it
            keep_inode = true;
            dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
          } else {
            if (trim_non_auth_subtree(subdir))
              keep_inode = true;
            else {
              in->close_dirfrag(subdir->get_frag());
              dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
            }
          }
        }
      }
      if (!keep_inode) { // remove it!
        dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
        dir->unlink_inode(dn, false);
        remove_inode(in);
	ceph_assert(!dir->has_bloom());
        dir->remove_dentry(dn);
      } else {
        dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
	// kept only as an ancestor of a remote link: no longer auth
	dn->state_clear(CDentry::STATE_AUTH);
	in->state_clear(CInode::STATE_AUTH);
      }
    } else if (keep_dir && dnl->is_null()) { // keep null dentry for peer rollback
      dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
    } else { // just remove it
      dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
      if (dnl->is_remote())
        dir->unlink_inode(dn, false);
      dir->remove_dentry(dn);
    }
  }
  dir->state_clear(CDir::STATE_AUTH);
  /**
   * We've now checked all our children and deleted those that need it.
   * Now return to caller, and tell them if *we're* a keeper.
   */
  return keep_dir || dir->get_num_any();
}
/*
 * during replay, when we determine a subtree is no longer ours, we
 * try to trim it from our cache.  because subtrees must be connected
 * to the root, the fact that we can trim this tree may mean that our
 * children or parents can also be trimmed.
 */
void MDCache::try_trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;

  // can we now trim child subtrees?
  set<CDir*> bounds;
  get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
    CDir *bd = *p;
    if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
	bd->get_num_any() == 0 && // and empty
	can_trim_non_auth_dirfrag(bd)) {
      CInode *bi = bd->get_inode();
      dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
      remove_subtree(bd);
      bd->mark_clean();
      bi->close_dirfrag(bd->get_frag());
    }
  }

  if (trim_non_auth_subtree(dir)) {
    // something below is still needed: keep the subtree, but try to merge
    // it with its parent
    try_subtree_merge(dir);
  } else {
    // can we trim this subtree (and possibly our ancestors) too?
    while (true) {
      CInode *diri = dir->get_inode();
      if (diri->is_base()) {
	if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
	  dout(10) << " closing empty non-auth subtree " << *dir << dendl;
	  remove_subtree(dir);
	  dir->mark_clean();
	  diri->close_dirfrag(dir->get_frag());

	  dout(10) << " removing " << *diri << dendl;
	  ceph_assert(!diri->get_parent_dn());
	  ceph_assert(diri->get_num_ref() == 0);
	  remove_inode(diri);
	}
	break;
      }

      CDir *psub = get_subtree_root(diri->get_parent_dir());
      dout(10) << " parent subtree is " << *psub << dendl;
      if (psub->get_dir_auth().first == mds->get_nodeid())
	break; // we are auth, keep.

      dout(10) << " closing empty non-auth subtree " << *dir << dendl;
      remove_subtree(dir);
      dir->mark_clean();
      diri->close_dirfrag(dir->get_frag());

      dout(10) << " parent subtree also non-auth: " << *psub << dendl;
      if (trim_non_auth_subtree(psub))
	break;
      dir = psub;   // walk upward and try again
    }
  }

  show_subtrees();
}
// standby-replay variant of log segment trimming: since a standby never
// journals, it simply drops all dirty state recorded in the segment and
// pushes now-unreferenced dentries to the bottom of the LRU so trim_lru()
// can reclaim them.
void MDCache::standby_trim_segment(LogSegment *ls)
{
  // queue a clean, otherwise-unreferenced inode's parent dentry for trimming
  auto try_trim_inode = [this](CInode *in) {
    if (in->get_num_ref() == 0 &&
	!in->item_open_file.is_on_list() &&
	in->parent != NULL &&
	in->parent->get_num_ref() == 0){
      touch_dentry_bottom(in->parent);
    }
  };

  // queue a clean, unreferenced dentry for trimming
  auto try_trim_dentry = [this](CDentry *dn) {
    if (dn->get_num_ref() > 0)
      return;
    auto in = dn->get_linkage()->inode;
    if(in && in->item_open_file.is_on_list())
      return;
    touch_dentry_bottom(dn);
  };

  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  // mark everything the segment tracked as clean again; each list's
  // front() is removed as a side effect of mark_clean()/remove_dirty()
  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
    if (dir->inode)
      try_trim_inode(dir->inode);
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
    try_trim_inode(in);
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
    try_trim_dentry(dn);
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->truncating_inodes.empty()) {
    auto it = ls->truncating_inodes.begin();
    CInode *in = *it;
    ls->truncating_inodes.erase(it);
    in->put(CInode::PIN_TRUNCATING);
    try_trim_inode(in);
  }
}
// Handle an MCacheExpire from a peer: the peer has dropped its replicas of
// the listed inodes/dirs/dentries, so remove it from our replica maps --
// unless the expire raced with a subtree export, in which case it is
// delayed and replayed via process_delayed_expire().
void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
{
  mds_rank_t from = mds_rank_t(m->get_from());

  dout(7) << "cache_expire from mds." << from << dendl;

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    // too early to process expires; replica state is rebuilt by rejoin
    return;
  }

  // locks whose gather set may now be satisfied; evaluated at the end
  set<SimpleLock *> gather_locks;
  // loop over realms
  for (const auto &p : m->realms) {
    // check container?
    if (p.first.ino > 0) {
      CInode *expired_inode = get_inode(p.first.ino);
      ceph_assert(expired_inode); // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
      ceph_assert(parent_dir);

      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
	export_state = migrator->get_export_state(parent_dir);
	ceph_assert(export_state >= 0);
      }

      // if we are mid-export of this container (or no longer auth), the
      // expire refers to replica state the importer now owns: delay it
      if (!parent_dir->is_auth() ||
	  (export_state != -1 &&
	   ((export_state == Migrator::EXPORT_WARNING &&
	     migrator->export_has_warned(parent_dir,from)) ||
	    export_state == Migrator::EXPORT_EXPORTING ||
	    export_state == Migrator::EXPORT_LOGGINGFINISH ||
	    (export_state == Migrator::EXPORT_NOTIFYING &&
	     !migrator->export_has_notified(parent_dir,from))))) {

	// not auth.
	dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
	ceph_assert(parent_dir->is_frozen_tree_root());

	// make a message container

	auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
	if (em.second)
	  em.first->second = make_message<MCacheExpire>(from); /* new */

	// merge these expires into it
	em.first->second->add_realm(p.first, p.second);
	continue;
      }
      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
		  (export_state == Migrator::EXPORT_WARNING &&
		   !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (const auto &q : p.second.inodes) {
      CInode *in = get_inode(q.first);
      unsigned nonce = q.second;

      if (!in) {
	dout(0) << " inode expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(in);
      }
      ceph_assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;

      // check nonce: a stale nonce means we re-replicated to the peer after
      // it sent this expire, so the expire no longer applies
      if (nonce == in->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " cached_by was " << in->get_replicas() << dendl;
	inode_remove_replica(in, from, false, gather_locks);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " with old nonce " << nonce
		<< " (current " << in->get_replica_nonce(from) << "), dropping"
		<< dendl;
      }
    }

    // DIRS
    for (const auto &q : p.second.dirs) {
      CDir *dir = get_dirfrag(q.first);
      unsigned nonce = q.second;

      if (!dir) {
	CInode *diri = get_inode(q.first.ino);
	if (diri) {
	  if (mds->is_rejoin() &&
	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
	      !diri->is_replica(from)) {
	    // fragmentation during rejoin: the peer's frag may not exist
	    // anymore; expire whatever nested dirfrags it does replicate
	    auto&& ls = diri->get_nested_dirfrags();
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " while rejoining, inode isn't replicated" << dendl;
	    for (const auto& d : ls) {
	      dir = d;
	      if (dir->is_replica(from)) {
		dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
		dir->remove_replica(from);
	      }
	    }
	    continue;
	  }
	  CDir *other = diri->get_approx_dirfrag(q.first.frag);
	  if (other) {
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " have " << *other << ", mismatched frags, dropping" << dendl;
	    continue;
	  }
	}
	dout(0) << " dir expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      ceph_assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " replicas was " << dir->get_replicas() << dendl;
	dir->remove_replica(from);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
		<< "), dropping" << dendl;
      }
    }

    // DENTRIES
    for (const auto &pd : p.second.dentries) {
      dout(10) << " dn expires in dir " << pd.first << dendl;
      CInode *diri = get_inode(pd.first.ino);
      ceph_assert(diri);
      CDir *dir = diri->get_dirfrag(pd.first.frag);

      if (!dir) {
	dout(0) << " dn expires on " << pd.first << " from " << from
		<< ", must have refragmented" << dendl;
      } else {
	ceph_assert(dir->is_auth());
      }

      for (const auto &p : pd.second) {
	unsigned nonce = p.second;
	CDentry *dn;

	if (dir) {
	  dn = dir->lookup(p.first.first, p.first.second);
	} else {
	  // which dirfrag for this dentry?
	  CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
	  ceph_assert(dir);
	  ceph_assert(dir->is_auth());
	  dn = dir->lookup(p.first.first, p.first.second);
	}

	if (!dn) {
	  if (dir)
	    dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
	  else
	    dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
	}
	ceph_assert(dn);

	if (nonce == dn->get_replica_nonce(from)) {
	  dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
	  dentry_remove_replica(dn, from, gather_locks);
	}
	else {
	  dout(7) << " dentry_expire on " << *dn << " from mds." << from
		  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
		  << "), dropping" << dendl;
	}
      }
    }
  }

  // re-evaluate locks whose last straggler replica may just have expired
  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
7595 void MDCache::process_delayed_expire(CDir
*dir
)
7597 dout(7) << "process_delayed_expire on " << *dir
<< dendl
;
7598 for (const auto &p
: delayed_expire
[dir
]) {
7599 handle_cache_expire(p
.second
);
7601 delayed_expire
.erase(dir
);
7604 void MDCache::discard_delayed_expire(CDir
*dir
)
7606 dout(7) << "discard_delayed_expire on " << *dir
<< dendl
;
7607 delayed_expire
.erase(dir
);
7610 void MDCache::inode_remove_replica(CInode
*in
, mds_rank_t from
, bool rejoin
,
7611 set
<SimpleLock
*>& gather_locks
)
7613 in
->remove_replica(from
);
7614 in
->set_mds_caps_wanted(from
, 0);
7616 // note: this code calls _eval more often than it needs to!
7618 if (in
->authlock
.remove_replica(from
)) gather_locks
.insert(&in
->authlock
);
7619 if (in
->linklock
.remove_replica(from
)) gather_locks
.insert(&in
->linklock
);
7620 if (in
->snaplock
.remove_replica(from
)) gather_locks
.insert(&in
->snaplock
);
7621 if (in
->xattrlock
.remove_replica(from
)) gather_locks
.insert(&in
->xattrlock
);
7622 if (in
->flocklock
.remove_replica(from
)) gather_locks
.insert(&in
->flocklock
);
7623 if (in
->policylock
.remove_replica(from
)) gather_locks
.insert(&in
->policylock
);
7625 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7626 // Don't remove the recovering mds from lock's gathering list because
7627 // it may hold rejoined wrlocks.
7628 if (in
->dirfragtreelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->dirfragtreelock
);
7629 if (in
->filelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->filelock
);
7630 if (in
->nestlock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->nestlock
);
7633 void MDCache::dentry_remove_replica(CDentry
*dn
, mds_rank_t from
, set
<SimpleLock
*>& gather_locks
)
7635 dn
->remove_replica(from
);
7638 if (dn
->lock
.remove_replica(from
))
7639 gather_locks
.insert(&dn
->lock
);
7641 // Replicated strays might now be elegible for purge
7642 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7643 if (dnl
->is_primary()) {
7644 maybe_eval_stray(dnl
->get_inode());
/**
 * Expire client dentry leases whose ttl has passed, per lease pool.
 *
 * NOTE(review): this extracted fragment is missing several source lines
 * (braces, and apparently the per-pool bookkeeping — 'pool' is never
 * incremented in what is visible). Only the visible statements are
 * preserved below; confirm against upstream before relying on it.
 */
void MDCache::trim_client_leases()
  utime_t now = ceph_clock_now();
  dout(10) << "trim_client_leases" << dendl;
  std::size_t pool = 0;
  // walk each lease pool; leases are kept in expiry order, so stop at the
  // first lease whose ttl is still in the future.
  for (const auto& list : client_leases) {
    auto before = list.size();
    while (!list.empty()) {
      ClientLease *r = list.front();
      if (r->ttl > now) break;
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
      // drops the lease and lets the locker re-evaluate the dentry lock
      dn->remove_client_lease(r, mds->locker);
    auto after = list.size();
    dout(10) << "trim_client_leases pool " << pool << " trimmed "
	     << (before-after) << " leases, " << after << " left" << dendl;
/**
 * Sample process memory usage, sanity-check cache accounting, and report
 * the numbers to the log and to the mds memory perf counters.
 *
 * NOTE(review): lines are missing from this extracted fragment — in
 * particular the call that samples 'last' (presumably mm.sample(&last))
 * and the terminating dendl of the dout(2) statement. Only visible
 * statements are preserved; confirm against upstream.
 */
void MDCache::check_memory_usage()
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  // baseline is captured once, on the first call
  static MemoryModel::snap baseline = last;
  // check client caps: every tracked CInode must be accounted for
  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
  double caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (double)Capability::count() / (double)CInode::count();
  dout(2) << "Memory usage: "
	  << " total " << last.get_total()
	  << ", rss " << last.get_rss()
	  << ", heap " << last.get_heap()
	  << ", baseline " << baseline.get_heap()
	  << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
	  << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
  // publish to perf counters
  mds->update_mlogger();
  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());
7703 // =========================================================================================
/**
 * Timer callback used while shutting down: periodically invokes
 * MDCache::shutdown_check(), which dumps cache/log state and re-arms
 * this context.
 *
 * NOTE(review): an access-specifier line (presumably "public:") is not
 * visible in this extracted fragment — confirm against upstream.
 */
class C_MDC_ShutdownCheck : public MDCacheContext {
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
/**
 * Periodic shutdown diagnostic: temporarily raise debug_mds, dump cache,
 * LRU, log and objecter state, restore the debug level, and re-arm the
 * timer for the next check.
 *
 * NOTE(review): lines are missing from this extracted fragment — notably
 * the declaration of 'o' (presumably char *o = old_val;) used by
 * g_conf().get_val() below, and the show_cache()/dump call between the
 * apply_changes pair. Only visible statements are preserved.
 */
void MDCache::shutdown_check()
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;
  // save the current debug_mds value so we can restore it afterwards
  char old_val[32] = { 0 };
  g_conf().get_val("debug_mds", &o, sizeof(old_val));
  g_conf().set_val("debug_mds", "10");
  g_conf().apply_changes(nullptr);
  g_conf().set_val("debug_mds", old_val);
  g_conf().apply_changes(nullptr);
  // re-arm: run again after mds_shutdown_check seconds
  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;
  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
7741 void MDCache::shutdown_start()
7743 dout(5) << "shutdown_start" << dendl
;
7745 if (g_conf()->mds_shutdown_check
)
7746 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7748 // g_conf()->debug_mds = 10;
/**
 * One pass of the shutdown state machine: export strays and auth subtrees
 * away (non-rank-0 only), trim the journal and cache, tear down mydir and
 * the global snaprealm. Returns true once shutdown is complete.
 *
 * NOTE(review): many lines (early returns, loop bodies, braces) are missing
 * from this extracted fragment; the visible statements are preserved below
 * as-is. The dangling conditions (e.g. "if (...)" with no visible body)
 * mark places where code was lost in extraction — confirm against upstream.
 */
bool MDCache::shutdown_pass()
  dout(7) << "shutdown_pass" << dendl;
  if (mds->is_stopped()) {
    dout(7) << " already shut down" << dendl;
  // first: kick strays toward rank 0
  bool strays_all_exported = shutdown_export_strays();
  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  // Export all subtrees to another active (usually rank 0) if not rank 0
  int num_auth_subtree = 0;
  if (!subtrees.empty() && mds->get_nodeid() != 0) {
    dout(7) << "looking for subtrees to export" << dendl;
    std::vector<CDir*> ls;
    for (auto& [dir, bounds] : subtrees) {
      dout(10) << "  examining " << *dir << " bounds " << bounds << dendl;
      // skip mdsdir and non-auth subtrees; skip ones we cannot export now
      if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
      if (dir->is_frozen() ||
	  dir->is_freezing() ||
	  dir->is_ambiguous_dir_auth() ||
	  dir->state_test(CDir::STATE_EXPORTING) ||
	  dir->get_inode()->is_ephemerally_pinned()) {
  migrator->clear_export_queue();
  // stopping mds does not call MDBalancer::tick()
  mds->balancer->handle_export_pins();
  for (const auto& dir : ls) {
    mds_rank_t dest = dir->get_inode()->authority().first;
    if (dest > 0 && !mds->mdsmap->is_active(dest))
    dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
    migrator->export_dir_nicely(dir, dest);
  if (!strays_all_exported) {
    dout(7) << "waiting for strays to migrate" << dendl;
  if (num_auth_subtree > 0) {
    ceph_assert(mds->get_nodeid() > 0);
    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
  // close out any sessions (and open files!) before we try to trim the log, etc.
  if (mds->sessionmap.have_unclosed_sessions()) {
    if (!mds->server->terminating_sessions)
      mds->server->terminate_sessions();
  // Fully trim the log so that all objects in cache are clean and may be
  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
  // trim the log such that the cache eventually becomes clean.
  if (mds->mdlog->get_num_segments() > 0) {
    auto ls = mds->mdlog->get_current_segment();
    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
      // Current segment contains events other than subtreemap or
      // there are dirty dirfrags (see CDir::log_mark_dirty())
      mds->mdlog->start_new_segment();
      mds->mdlog->flush();
  mds->mdlog->trim_all();
  if (mds->mdlog->get_num_segments() > 1) {
    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
  // drop our reference to our stray dir inode
  for (int i = 0; i < NUM_STRAY; ++i) {
    strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
      strays[i]->put(CInode::PIN_STRAY);
      strays[i]->put_stickydirs();
  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
  if (mydir && !mydir->is_subtree_root())
  // subtrees map not empty yet?
  if (subtrees.size() > (mydir ? 1 : 0)) {
    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
    migrator->show_importing();
    migrator->show_exporting();
    if (!migrator->is_importing() && !migrator->is_exporting())
  ceph_assert(!migrator->is_exporting());
  ceph_assert(!migrator->is_importing());
  // replicas may dirty scatter locks
  if (myin && myin->is_replicated()) {
    dout(7) << "still have replicated objects" << dendl;
  if ((myin && myin->get_num_auth_pins()) ||
      (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
    dout(7) << "still have auth pinned objects" << dendl;
  // (only do this once!)
  if (!mds->mdlog->is_capped()) {
    dout(7) << "capping the log" << dendl;
  if (!mds->mdlog->empty())
    mds->mdlog->trim(0);
  if (!mds->mdlog->empty()) {
    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
	    << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
  if (!did_shutdown_log_cap) {
    // flush journal header
    dout(7) << "writing header for (now-empty) journal" << dendl;
    ceph_assert(mds->mdlog->empty());
    mds->mdlog->write_head(0);
    // NOTE: filer active checker below will block us until this completes.
    did_shutdown_log_cap = true;
  if (mds->objecter->is_active()) {
    dout(7) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  // trim what we can from the cache
  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  // make mydir subtree go away
  if (mydir->get_num_ref() > 1) { // subtree pin
    dout(7) << "there's still reference to mydir " << *mydir << dendl;
  remove_subtree(mydir);
  myin->close_dirfrag(mydir->get_frag());
  ceph_assert(subtrees.empty());
  if (global_snaprealm) {
    remove_inode(global_snaprealm->inode);
    global_snaprealm = nullptr;
  dout(5) << "shutdown done." << dendl;
/**
 * Incrementally migrate this rank's stray dentries to rank 0 during
 * shutdown. Progress is resumable via shutdown_export_next (a
 * dirfrag/dentry-name cursor); at most MAX_EXPORTING inodes are in flight
 * at once (tracked in shutdown_exporting_strays). Returns true once
 * everything has been exported.
 *
 * NOTE(review): braces, early returns and 'continue's are missing from this
 * extracted fragment; only visible statements are preserved. Confirm the
 * control flow against upstream before relying on it.
 */
bool MDCache::shutdown_export_strays()
  static const unsigned MAX_EXPORTING = 100;
  // rank 0 keeps its strays; nothing to export
  if (mds->get_nodeid() == 0)
  // throttle: don't queue more while most of the window is still in flight
  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
	   << " '" << shutdown_export_next.second << "'" << dendl;
  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
  bool all_exported = false;
  // local cursor; written back to shutdown_export_next before returning
  auto next = shutdown_export_next;
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *strayi = strays[i];
    !strayi->state_test(CInode::STATE_STRAYPINNED))
    // skip stray dirs entirely below the cursor
    if (strayi->ino() < next.first.ino)
    strayi->get_dirfrags(dfls);
    while (!dfls.empty()) {
      CDir *dir = dfls.front();
      if (dir->dirfrag() < next.first)
      if (next.first < dir->dirfrag()) {
	next.first = dir->dirfrag();
	next.second.clear();
      if (!dir->is_complete()) {
	// fetch the dirfrag; retry this function when the fetch completes,
	// but only chain a retry if nothing else is in flight
	MDSContext *fin = nullptr;
	if (shutdown_exporting_strays.empty()) {
	  fin = new MDSInternalContextWrapper(mds,
		  new LambdaContext([this](int r) {
		    shutdown_export_strays();
      // position an iterator at the cursor's dentry name within this dirfrag
      CDir::dentry_key_map::iterator it;
      if (next.second.empty()) {
	auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
	it = dir->lower_bound(dentry_key_t(0, next.second, hash));
      for (; it != dir->end(); ++it) {
	CDentry *dn = it->second;
	CDentry::linkage_t *dnl = dn->get_projected_linkage();
	// if rank 0 isn't active we can't migrate; remember where to resume
	if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
	  next.second = it->first.name;
	auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
	  dout(10) << "already exporting/purging " << *dn << dendl;
	// Don't try to migrate anything that is actually
	// being purged right now
	if (!dn->state_test(CDentry::STATE_PURGING))
	  stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root!
	if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
	  // window full: record resume position (next dentry, next stray
	  // ino, or next dirfrag) and stop queueing
	  if (it != dir->end()) {
	    next.second = it->first.name;
	    next.first.ino.val++;
	  next.first = dfls.front()->dirfrag();
	  next.second.clear();
  if (shutdown_exporting_strays.empty()) {
    // nothing in flight: rewind the cursor to the first stray dirfrag,
    // unless it is already there, in which case we are done
    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
    if (first_df < shutdown_export_next.first ||
	!shutdown_export_next.second.empty()) {
      shutdown_export_next.first = first_df;
      shutdown_export_next.second.clear();
    all_exported = true;
  shutdown_export_next = next;
  return all_exported;
8064 // ========= messaging ==============
/**
 * Route an incoming MDS-to-MDS cache message to its handler based on the
 * message type.
 *
 * NOTE(review): the per-case "break;" statements and the "default:" label
 * are not visible in this extracted fragment (as written here every case
 * would fall through, which cannot be the intended behavior) — confirm
 * against upstream.
 */
void MDCache::dispatch(const cref_t<Message> &m)
  switch (m->get_type()) {
  case MSG_MDS_RESOLVE:
    handle_resolve(ref_cast<MMDSResolve>(m));
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
  case MSG_MDS_DISCOVER:
    handle_discover(ref_cast<MDiscover>(m));
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(ref_cast<MDiscoverReply>(m));
  case MSG_MDS_DIRUPDATE:
    handle_dir_update(ref_cast<MDirUpdate>(m));
  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(ref_cast<MCacheExpire>(m));
  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(ref_cast<MDentryLink>(m));
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
  case MSG_MDS_FRAGMENTNOTIFYACK:
    handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
  case MSG_MDS_FINDINO:
    handle_find_ino(ref_cast<MMDSFindIno>(m));
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
  case MSG_MDS_OPENINO:
    handle_open_ino(ref_cast<MMDSOpenIno>(m));
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
  case MSG_MDS_SNAPUPDATE:
    handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
    // unknown message type: abort (presumably under a default: label)
    derr << "cache unknown message " << m->get_type() << dendl;
    ceph_abort_msg("cache unknown message");
/**
 * Walk @p path component by component from its base ino, resolving
 * dentries, taking the locks requested by @p flags, and discovering or
 * forwarding as needed when we are not auth. Fills @p pdnvec with the
 * dentry trace and @p pin with the final inode. Returns 0 on success, a
 * negative CEPHFS_* error on failure, or (per the callers' convention)
 * waits and retries via @p cf when progress is blocked.
 *
 * NOTE(review): a large number of lines (returns, braces, else-arms) are
 * missing from this extracted fragment. Only the visible statements are
 * preserved; dangling conditions mark lost code. Confirm all control flow
 * against upstream.
 */
int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
			   const filepath& path, int flags,
			   vector<CDentry*> *pdnvec, CInode **pin)
  // decode traversal behavior from flags
  bool discover = (flags & MDS_TRAVERSE_DISCOVER);
  bool forward = !discover;
  bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
  bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
  bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
  bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
  bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
  bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
  bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);
  ceph_assert(mdr); // forward requires a request
  snapid_t snapid = CEPH_NOSNAP;
  mdr->snapid = snapid;
  client_t client = mdr ? mdr->get_client() : -1;
  if (mds->logger) mds->logger->inc(l_mds_traverse);
  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  // base inode not in cache: open mdsdir/stray specially, else ESTALE
  if (MDS_INO_IS_MDSDIR(path.get_ino())) {
    open_foreign_mdsdir(path.get_ino(), cf.build());
  if (MDS_INO_IS_STRAY(path.get_ino())) {
    // restart traversal relative to the owning rank's mdsdir
    mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
    unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
    filepath path(strays[idx]->get_parent_dn()->get_name(),
		  MDS_INO_MDSDIR(rank));
    MDRequestRef null_ref;
    return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
  return -CEPHFS_ESTALE;
  if (cur->state_test(CInode::STATE_PURGING))
    return -CEPHFS_ESTALE;
  if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
    mds->locker->find_and_attach_lock_cache(mdr, cur);
  if (mdr && mdr->lock_cache) {
    if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
      mdr->dir_layout = mdr->lock_cache->get_dir_layout();
  } else if (rdlock_snap) {
    // take the snap/layout rdlock on the base inode, once per request
    int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
    if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
	(n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
      bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
      if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
  MutationImpl::LockOpVec lov;
  // main walk: one iteration per path component (depth advanced inside)
  for (unsigned depth = 0; depth < path.depth(); ) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
	     << "' snapid " << snapid << dendl;
    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
      return -CEPHFS_ENOTDIR;
    // walk into snapdir?
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      if (!mdr || depth > 0) // snapdir must be the first component
	return -CEPHFS_EINVAL;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
    // walk thru snapdir?
    if (snapid == CEPH_SNAPDIR) {
      return -CEPHFS_EINVAL;
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
      return -CEPHFS_ENOENT;
      mdr->snapid = snapid;
    // pick the dirfrag this component hashes into
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (cur->is_auth()) {
      // parent dir frozen_dir?
      if (cur->is_frozen()) {
	dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
	cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
      curdir = cur->get_or_open_dirfrag(this, fg);
      // not auth and dirfrag not cached: discover it
      dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
      discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
      if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
    ceph_assert(curdir);
#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
    // NOTE(review): the following chunk appears to be a stale/duplicated
    // fragment (it references _get_waiter/req/fin/onfinish, which do not
    // exist in this function's current signature) — confirm upstream.
    if (curdir->is_frozen()) {
    // FIXME: traverse is allowed?
    dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
    curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
    if (onfinish) delete onfinish;
    // last component + want auth: make sure we are (unambiguously) auth
    if (want_auth && want_dentry && depth == path.depth() - 1) {
      if (curdir->is_ambiguous_auth()) {
	dout(10) << "waiting for single auth on " << *curdir << dendl;
	curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
      if (!curdir->is_auth()) {
	dout(10) << "fw to auth for " << *curdir << dendl;
	request_forward(mdr, curdir->authority().first);
    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
	      << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
    // dentry lookup (HIT path)
    CDentry *dn = curdir->lookup(path[depth], snapid);
    if (dn->state_test(CDentry::STATE_PURGING))
      return -CEPHFS_ENOENT;
    if (xlock_dentry && depth == path.depth() - 1) {
      // caller wants the terminal dentry xlocked (e.g. create/unlink)
      if (depth > 0 || !mdr->lock_cache) {
	lov.add_wrlock(&cur->filelock);
	lov.add_wrlock(&cur->nestlock);
	if (rdlock_authlock)
	  lov.add_rdlock(&cur->authlock);
      lov.add_xlock(&dn->lock);
      // force client to flush async dir operation if necessary
      if (cur->filelock.is_cached())
	lov.add_wrlock(&cur->filelock);
      lov.add_rdlock(&dn->lock);
      if (!mds->locker->acquire_locks(mdr, lov)) {
	dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
    } else if (!path_locked &&
	       !dn->lock.can_read(client) &&
	       !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
      // dentry lock not readable by this client: wait for it
      dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
      mds->logger->inc(l_mds_traverse_lock);
      if (dn->is_auth() && dn->lock.is_unstable_and_locked())
	mds->mdlog->flush();
    pdnvec->push_back(dn);
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    // can we conclude CEPHFS_ENOENT?
    if (dnl->is_null()) {
      dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
      if (depth == path.depth() - 1) {
      pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
      return -CEPHFS_ENOENT;
    // do we have inode?
    CInode *in = dnl->get_inode();
    ceph_assert(dnl->is_remote());
    in = get_inode(dnl->get_remote_ino());
    dout(7) << "linking in remote in " << *in << dendl;
    dn->link_remote(dnl, in);
    dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
    ceph_assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
    if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
      dout(4) << "traverse: remote dentry points to damaged ino "
    // open the remote inode by ino, then retry
    open_remote_dentry(dn, true, cf.build(),
		       (path_locked && depth == path.depth() - 1));
    if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
    if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
      lov.add_rdlock(&cur->snaplock);
      if (!mds->locker->acquire_locks(mdr, lov)) {
	dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
    // add to trace, continue.
    // MISS. dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;
    if (curdir->is_auth()) {
      // complete dirfrag (or bloom-filter miss) proves ENOENT
      if (curdir->is_complete() ||
	  (snapid == CEPH_NOSNAP &&
	   curdir->has_bloom() &&
	   !curdir->is_in_bloom(path[depth]))) {
	// instantiate a null dn?
	if (depth < path.depth() - 1) {
	  dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
	} else if (snapid < CEPH_MAXSNAP) {
	  dout(20) << " not adding null for snapid " << snapid << dendl;
	} else if (curdir->is_frozen()) {
	  dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
	  curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
	  // create a null dentry
	  dn = curdir->add_null_dentry(path[depth]);
	  dout(20) << " added null " << *dn << dendl;
	// lock the freshly-added null dentry, mirroring the HIT path above
	if (depth > 0 || !mdr->lock_cache) {
	  lov.add_wrlock(&cur->filelock);
	  lov.add_wrlock(&cur->nestlock);
	  if (rdlock_authlock)
	    lov.add_rdlock(&cur->authlock);
	lov.add_xlock(&dn->lock);
	// force client to flush async dir operation if necessary
	if (cur->filelock.is_cached())
	  lov.add_wrlock(&cur->filelock);
	lov.add_rdlock(&dn->lock);
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
	pdnvec->push_back(dn);
	pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
	return -CEPHFS_ENOENT;
      // Check DamageTable for missing fragments before trying to fetch
      if (mds->damage_table.is_dirfrag_damaged(curdir)) {
	dout(4) << "traverse: damaged dirfrag " << *curdir
		<< ", blocking fetch" << dendl;
      // directory isn't complete; reload
      dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
      curdir->fetch(cf.build(), path[depth]);
      if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
      // dirfrag/dentry is not mine.
      mds_authority_t dauth = curdir->authority();
      mdr && mdr->client_request &&
      (int)depth < mdr->client_request->get_num_fwd()){
	dout(7) << "traverse: snap " << snapid << " and depth " << depth
		<< " < fwd " << mdr->client_request->get_num_fwd()
		<< ", discovering instead of forwarding" << dendl;
      dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
      discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
      if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
      dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;
      if (curdir->is_ambiguous_auth()) {
	dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
	curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
      dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;
      request_forward(mdr, dauth.first);
      if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
    ceph_abort(); // i shouldn't get here
  // success epilogue: optionally ensure auth on the final inode
  if (want_auth && !want_dentry) {
    if (cur->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *cur << dendl;
      cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
    if (!cur->is_auth()) {
      dout(10) << "fw to auth for " << *cur << dendl;
      request_forward(mdr, cur->authority().first);
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  ceph_assert(mdr->snapid == snapid);
  // record which locks this traversal left held on the mutation
  if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
    mdr->locking_state |= MutationImpl::SNAP_LOCKED;
  else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
    mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
  mdr->locking_state |= MutationImpl::PATH_LOCKED;
/**
 * Resolve @p fp purely from the in-memory cache (no locks, no discovery,
 * CEPH_NOSNAP only); returns the final CInode* or (presumably) null if any
 * step is not cached.
 *
 * NOTE(review): the declarations of 'in' and 'depth', the base-ino branch
 * structure, and the null-check early returns are missing from this
 * extracted fragment; only visible statements are preserved.
 */
CInode *MDCache::cache_traverse(const filepath& fp)
  dout(10) << "cache_traverse " << fp << dendl;
    in = get_inode(fp.get_ino());
  } else if (fp.depth() > 0 && fp[0] == "~mdsdir") {
  // walk each remaining component through cached dirfrags/dentries
  for (; depth < fp.depth(); depth++) {
    std::string_view dname = fp[depth];
    frag_t fg = in->pick_dirfrag(dname);
    dout(20) << " " << depth << " " << dname << " frag " << fg << " from " << *in << dendl;
    CDir *curdir = in->get_dirfrag(fg);
    CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
    in = dn->get_linkage()->get_inode();
  dout(10) << " got " << *in << dendl;
8586 * open_remote_dir -- open up a remote dirfrag
8588 * @param diri base inode
8589 * @param approxfg approximate fragment.
8590 * @param fin completion callback
8592 void MDCache::open_remote_dirfrag(CInode
*diri
, frag_t approxfg
, MDSContext
*fin
)
8594 dout(10) << "open_remote_dir on " << *diri
<< dendl
;
8595 ceph_assert(diri
->is_dir());
8596 ceph_assert(!diri
->is_auth());
8597 ceph_assert(diri
->get_dirfrag(approxfg
) == 0);
8599 discover_dir_frag(diri
, approxfg
, fin
);
/*
 * get_dentry_inode - get or open inode
 *
 * @param dn the dentry
 * @param mdr current request
 * @param projected use the projected (mid-update) linkage rather than the
 *                  committed one
 *
 * will return inode for primary, or link up/open up remote link's inode as
 * necessary. If it's not available right now, puts mdr on wait list and
 * returns null.
 *
 * NOTE(review): braces, the projected/plain-linkage branch structure and
 * the return statements are missing from this extracted fragment; only
 * visible statements are preserved.
 */
CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
  CDentry::linkage_t *dnl;
  dnl = dn->get_projected_linkage();
  dnl = dn->get_linkage();
  ceph_assert(!dnl->is_null());
  // primary linkage: inode is right there
  if (dnl->is_primary())
  ceph_assert(dnl->is_remote());
  CInode *in = get_inode(dnl->get_remote_ino());
  // remote inode already cached: link it into this dentry
  dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
  dn->link_remote(dnl, in);
  // not cached: open it by ino, retry the request when done
  dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
  open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
/**
 * Completion context for open_remote_dentry(): pins the dentry for the
 * duration of the remote-ino open, then hands the result to
 * MDCache::_open_remote_dentry_finish().
 *
 * NOTE(review): the member declarations (apparently CDentry *dn;
 * inodeno_t ino; bool want_xlocked;) are missing from this extracted
 * fragment — only MDSContext *onfinish is visible. Confirm upstream.
 */
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  MDSContext *onfinish;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    // pin the dentry so it cannot be trimmed while the open is in flight
    dn->get(MDSCacheObject::PIN_PTRWAITER);
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);
/**
 * Open the inode a remote dentry points at, by ino. For directories the
 * backtrace lives in the metadata pool, otherwise pool is -1 (unknown /
 * default data pool).
 *
 * NOTE(review): the head of the open call (presumably
 * open_ino(ino, pool, ...) on the line preceding the visible
 * C_MDC_OpenRemoteDentry argument) is missing from this extracted
 * fragment — confirm upstream.
 */
void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
  dout(10) << "open_remote_dentry " << *dn << dendl;
  CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
  inodeno_t ino = dnl->get_remote_ino();
  // directories keep their backtrace in the metadata pool
  int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->get_metadata_pool() : -1;
	   new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
/**
 * Completion of open_remote_dentry(): on failure, if the dentry still
 * points at the bad remote ino, mark it bad and notify the damage table
 * (which may respawn the daemon); otherwise forward the (clamped) result
 * to the caller's continuation.
 *
 * NOTE(review): the surrounding error-check branch (presumably
 * "if (r < 0) { ... }") and several braces are missing from this
 * extracted fragment; only visible statements are preserved.
 */
void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
					 bool want_xlocked, int r)
  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  // still linked to the ino we failed to open: the dentry itself is bad
  if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
    dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
    dn->state_set(CDentry::STATE_BADREMOTEINO);
    // build a human-readable path for the damage record
    CDir *dir = dn->get_dir();
    dir->get_inode()->make_path_string(path);
    path += dn->get_name();
    bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
    ceph_abort(); // unreachable, damaged() respawns us
  // clamp success codes to 0 for the continuation
  fin->complete(r < 0 ? r : 0);
/**
 * Build the dentry trace from the root down to @p in by recursing up
 * through parent inodes and appending each parent dentry on the way back.
 *
 * NOTE(review): the base-inode early return (the statement belonging to
 * the "empty trace if we're a base inode" comment) is missing from this
 * extracted fragment — confirm upstream.
 */
void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
  // empty trace if we're a base inode
  CInode *parent = in->get_parent_inode();
  ceph_assert(parent);
  // recurse first so ancestors appear before descendants
  make_trace(trace, parent);
  CDentry *dn = in->get_parent_dn();
  dout(15) << "make_trace adding " << *dn << dendl;
  trace.push_back(dn);
// -------------------------------------------------------------------------------
// Open inode by inode number

/**
 * I/O completion for the backtrace-object read issued by open_ino():
 * hands the fetched buffer to MDCache::_open_ino_backtrace_fetched().
 *
 * NOTE(review): the member declarations (apparently inodeno_t ino; and
 * bufferlist bl;) are missing from this extracted fragment; also note the
 * print() output has an unbalanced ")" (no opening "(") — confirm both
 * against upstream.
 */
class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
  C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
    MDCacheIOContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->_open_ino_backtrace_fetched(ino, bl, r);
  void print(ostream& out) const override {
    out << "openino_backtrace_fetch" << ino << ")";
/**
 * Retry context for the open-by-ino directory traversal. On error for a
 * non-parent step it re-drives handle_open_ino() with the error; otherwise
 * it resumes _open_ino_traverse_dir() for the pending open_ino_info_t.
 *
 * NOTE(review): some member declarations (apparently inodeno_t ino; and
 * bool parent;) and braces are missing from this extracted fragment —
 * confirm upstream.
 */
struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
  cref_t<MMDSOpenIno> msg;
  C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
    MDCacheContext(c), ino(i), msg(m), parent(p) {}
  void finish(int r) override {
    // a failed non-parent step answers the originating message with the error
    if (r < 0 && !parent)
      mdcache->handle_open_ino(msg, r);
    auto& info = mdcache->opening_inodes.at(ino);
    mdcache->_open_ino_traverse_dir(ino, info, r);
8745 struct C_MDC_OpenInoParentOpened
: public MDCacheContext
{
8748 C_MDC_OpenInoParentOpened(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
8749 void finish(int r
) override
{
8750 mdcache
->_open_ino_parent_opened(ino
, r
);
8754 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
)
8756 dout(10) << "_open_ino_backtrace_fetched ino " << ino
<< " errno " << err
<< dendl
;
8758 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8760 CInode
*in
= get_inode(ino
);
8762 dout(10) << " found cached " << *in
<< dendl
;
8763 open_ino_finish(ino
, info
, in
->authority().first
);
8767 inode_backtrace_t backtrace
;
8770 decode(backtrace
, bl
);
8771 } catch (const buffer::error
&decode_exc
) {
8772 derr
<< "corrupt backtrace on ino x0" << std::hex
<< ino
8773 << std::dec
<< ": " << decode_exc
.what() << dendl
;
8774 open_ino_finish(ino
, info
, -CEPHFS_EIO
);
8777 if (backtrace
.pool
!= info
.pool
&& backtrace
.pool
!= -1) {
8778 dout(10) << " old object in pool " << info
.pool
8779 << ", retrying pool " << backtrace
.pool
<< dendl
;
8780 info
.pool
= backtrace
.pool
;
8781 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8782 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8783 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8784 new C_OnFinisher(fin
, mds
->finisher
));
8787 } else if (err
== -CEPHFS_ENOENT
) {
8788 int64_t meta_pool
= mds
->get_metadata_pool();
8789 if (info
.pool
!= meta_pool
) {
8790 dout(10) << " no object in pool " << info
.pool
8791 << ", retrying pool " << meta_pool
<< dendl
;
8792 info
.pool
= meta_pool
;
8793 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8794 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8795 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8796 new C_OnFinisher(fin
, mds
->finisher
));
8799 err
= 0; // backtrace.ancestors.empty() is checked below
8803 if (backtrace
.ancestors
.empty()) {
8804 dout(10) << " got empty backtrace " << dendl
;
8805 err
= -CEPHFS_ESTALE
;
8806 } else if (!info
.ancestors
.empty()) {
8807 if (info
.ancestors
[0] == backtrace
.ancestors
[0]) {
8808 dout(10) << " got same parents " << info
.ancestors
[0] << " 2 times" << dendl
;
8809 err
= -CEPHFS_EINVAL
;
8816 dout(0) << " failed to open ino " << ino
<< " err " << err
<< "/" << info
.last_err
<< dendl
;
8818 err
= info
.last_err
;
8819 open_ino_finish(ino
, info
, err
);
8823 dout(10) << " got backtrace " << backtrace
<< dendl
;
8824 info
.ancestors
= backtrace
.ancestors
;
8826 _open_ino_traverse_dir(ino
, info
, 0);
8829 void MDCache::_open_ino_parent_opened(inodeno_t ino
, int ret
)
8831 dout(10) << "_open_ino_parent_opened ino " << ino
<< " ret " << ret
<< dendl
;
8833 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8835 CInode
*in
= get_inode(ino
);
8837 dout(10) << " found cached " << *in
<< dendl
;
8838 open_ino_finish(ino
, info
, in
->authority().first
);
8842 if (ret
== mds
->get_nodeid()) {
8843 _open_ino_traverse_dir(ino
, info
, 0);
8846 mds_rank_t checked_rank
= mds_rank_t(ret
);
8847 info
.check_peers
= true;
8848 info
.auth_hint
= checked_rank
;
8849 info
.checked
.erase(checked_rank
);
8851 do_open_ino(ino
, info
, ret
);
8855 void MDCache::_open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8857 dout(10) << __func__
<< ": ino " << ino
<< " ret " << ret
<< dendl
;
8859 CInode
*in
= get_inode(ino
);
8861 dout(10) << " found cached " << *in
<< dendl
;
8862 open_ino_finish(ino
, info
, in
->authority().first
);
8867 do_open_ino(ino
, info
, ret
);
8871 mds_rank_t hint
= info
.auth_hint
;
8872 ret
= open_ino_traverse_dir(ino
, NULL
, info
.ancestors
,
8873 info
.discover
, info
.want_xlocked
, &hint
);
8876 if (hint
!= mds
->get_nodeid())
8877 info
.auth_hint
= hint
;
8878 do_open_ino(ino
, info
, ret
);
// NOTE(review): damaged extraction -- statements split across lines; the
// leading decimal tokens are original line numbers. Code kept byte-identical;
// braces elided by the extraction.
//
// _open_ino_fetch_dir: fetch dirfrag `dir` from disk and, on completion,
// re-run the open-by-ino traversal for `ino` via C_MDC_OpenInoTraverseDir
// (`parent` == true means this was ancestor index 0 -- see callers passing
// `i == 0`). Bumps the openino dir-fetch perf counter.
8881 void MDCache::_open_ino_fetch_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
, CDir
*dir
, bool parent
)
// A rejoin-undef dirfrag must still be a leaf of its inode's fragtree;
// assert that invariant before fetching.
8883 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
8884 ceph_assert(dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()));
8885 dir
->fetch(new C_MDC_OpenInoTraverseDir(this, ino
, m
, parent
));
8887 mds
->logger
->inc(l_mds_openino_dir_fetch
);
8890 int MDCache::open_ino_traverse_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
,
8891 const vector
<inode_backpointer_t
>& ancestors
,
8892 bool discover
, bool want_xlocked
, mds_rank_t
*hint
)
8894 dout(10) << "open_ino_traverse_dir ino " << ino
<< " " << ancestors
<< dendl
;
8896 for (unsigned i
= 0; i
< ancestors
.size(); i
++) {
8897 const auto& ancestor
= ancestors
.at(i
);
8898 CInode
*diri
= get_inode(ancestor
.dirino
);
8901 if (discover
&& MDS_INO_IS_MDSDIR(ancestor
.dirino
)) {
8902 open_foreign_mdsdir(ancestor
.dirino
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8908 if (diri
->state_test(CInode::STATE_REJOINUNDEF
)) {
8909 CDir
*dir
= diri
->get_parent_dir();
8910 while (dir
->state_test(CDir::STATE_REJOINUNDEF
) &&
8911 dir
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
))
8912 dir
= dir
->get_inode()->get_parent_dir();
8913 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8917 if (!diri
->is_dir()) {
8918 dout(10) << " " << *diri
<< " is not dir" << dendl
;
8920 err
= -CEPHFS_ENOTDIR
;
8924 const string
& name
= ancestor
.dname
;
8925 frag_t fg
= diri
->pick_dirfrag(name
);
8926 CDir
*dir
= diri
->get_dirfrag(fg
);
8928 if (diri
->is_auth()) {
8929 if (diri
->is_frozen()) {
8930 dout(10) << " " << *diri
<< " is frozen, waiting " << dendl
;
8931 diri
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8934 dir
= diri
->get_or_open_dirfrag(this, fg
);
8935 } else if (discover
) {
8936 open_remote_dirfrag(diri
, fg
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8941 inodeno_t next_ino
= i
> 0 ? ancestors
.at(i
-1).dirino
: ino
;
8942 CDentry
*dn
= dir
->lookup(name
);
8943 CDentry::linkage_t
*dnl
= dn
? dn
->get_linkage() : NULL
;
8944 if (dir
->is_auth()) {
8945 if (dnl
&& dnl
->is_primary() &&
8946 dnl
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
)) {
8947 dout(10) << " fetching undef " << *dnl
->get_inode() << dendl
;
8948 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8952 if (!dnl
&& !dir
->is_complete() &&
8953 (!dir
->has_bloom() || dir
->is_in_bloom(name
))) {
8954 dout(10) << " fetching incomplete " << *dir
<< dendl
;
8955 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8959 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
8961 err
= -CEPHFS_ENOENT
;
8962 } else if (discover
) {
8964 filepath
path(name
, 0);
8965 discover_path(dir
, CEPH_NOSNAP
, path
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0),
8966 (i
== 0 && want_xlocked
));
8969 if (dnl
->is_null() && !dn
->lock
.can_read(-1)) {
8970 dout(10) << " null " << *dn
<< " is not readable, waiting" << dendl
;
8971 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8974 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
8976 err
= -CEPHFS_ENOENT
;
8980 *hint
= dir
? dir
->authority().first
: diri
->authority().first
;
// NOTE(review): this chunk is a damaged extraction -- each statement is
// split across lines and the leading decimal tokens (e.g. "8986") are the
// original file's line numbers, not code. Code kept byte-identical below;
// only comments added. Opening/closing braces were elided by the extraction.
//
// open_ino_finish: complete an open-by-ino lookup for `ino`.
// Detaches the waiter list from the tracking entry (swap, so the entry can
// be erased before callbacks run), erases the entry from opening_inodes,
// then completes every waiter with `ret` (an mds rank on success, or a
// negative error code on failure -- see callers passing -CEPHFS_EIO etc.).
8986 void MDCache::open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8988 dout(10) << "open_ino_finish ino " << ino
<< " ret " << ret
<< dendl
;
// Take ownership of the waiters before erasing the map entry, since
// finish_contexts() may re-enter this cache.
8990 MDSContext::vec waiters
;
8991 waiters
.swap(info
.waiters
);
8992 opening_inodes
.erase(ino
);
8993 finish_contexts(g_ceph_context
, waiters
, ret
);
8996 void MDCache::do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
)
8998 if (err
< 0 && err
!= -CEPHFS_EAGAIN
) {
8999 info
.checked
.clear();
9000 info
.checking
= MDS_RANK_NONE
;
9001 info
.check_peers
= true;
9002 info
.fetch_backtrace
= true;
9003 if (info
.discover
) {
9004 info
.discover
= false;
9005 info
.ancestors
.clear();
9007 if (err
!= -CEPHFS_ENOENT
&& err
!= -CEPHFS_ENOTDIR
)
9008 info
.last_err
= err
;
9011 if (info
.check_peers
|| info
.discover
) {
9012 if (info
.discover
) {
9013 // got backtrace from peer, but failed to find inode. re-check peers
9014 info
.discover
= false;
9015 info
.ancestors
.clear();
9016 info
.checked
.clear();
9018 info
.check_peers
= false;
9019 info
.checking
= MDS_RANK_NONE
;
9020 do_open_ino_peer(ino
, info
);
9021 } else if (info
.fetch_backtrace
) {
9022 info
.check_peers
= true;
9023 info
.fetch_backtrace
= false;
9024 info
.checking
= mds
->get_nodeid();
9025 info
.checked
.clear();
9026 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
9027 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
9028 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
9029 new C_OnFinisher(fin
, mds
->finisher
));
9031 ceph_assert(!info
.ancestors
.empty());
9032 info
.checking
= mds
->get_nodeid();
9033 open_ino(info
.ancestors
[0].dirino
, mds
->get_metadata_pool(),
9034 new C_MDC_OpenInoParentOpened(this, ino
), info
.want_replica
);
9038 void MDCache::do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
)
9040 set
<mds_rank_t
> all
, active
;
9041 mds
->mdsmap
->get_mds_set(all
);
9042 if (mds
->get_state() == MDSMap::STATE_REJOIN
)
9043 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_REJOIN
);
9045 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9047 dout(10) << "do_open_ino_peer " << ino
<< " active " << active
9048 << " all " << all
<< " checked " << info
.checked
<< dendl
;
9050 mds_rank_t whoami
= mds
->get_nodeid();
9051 mds_rank_t peer
= MDS_RANK_NONE
;
9052 if (info
.auth_hint
>= 0 && info
.auth_hint
!= whoami
) {
9053 if (active
.count(info
.auth_hint
)) {
9054 peer
= info
.auth_hint
;
9055 info
.auth_hint
= MDS_RANK_NONE
;
9058 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9059 if (*p
!= whoami
&& info
.checked
.count(*p
) == 0) {
9066 if (all
!= info
.checked
) {
9067 dout(10) << " waiting for more peers to be active" << dendl
;
9069 dout(10) << " all MDS peers have been checked " << dendl
;
9070 do_open_ino(ino
, info
, 0);
9073 info
.checking
= peer
;
9074 vector
<inode_backpointer_t
> *pa
= NULL
;
9075 // got backtrace from peer or backtrace just fetched
9076 if (info
.discover
|| !info
.fetch_backtrace
)
9077 pa
= &info
.ancestors
;
9078 mds
->send_message_mds(make_message
<MMDSOpenIno
>(info
.tid
, ino
, pa
), peer
);
9080 mds
->logger
->inc(l_mds_openino_peer_discover
);
9084 void MDCache::handle_open_ino(const cref_t
<MMDSOpenIno
> &m
, int err
)
9086 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
9087 mds
->get_want_state() != CEPH_MDS_STATE_REJOIN
) {
9091 dout(10) << "handle_open_ino " << *m
<< " err " << err
<< dendl
;
9093 auto from
= mds_rank_t(m
->get_source().num());
9094 inodeno_t ino
= m
->ino
;
9095 ref_t
<MMDSOpenInoReply
> reply
;
9096 CInode
*in
= get_inode(ino
);
9098 dout(10) << " have " << *in
<< dendl
;
9099 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, mds_rank_t(0));
9100 if (in
->is_auth()) {
9103 CDentry
*pdn
= in
->get_parent_dn();
9106 CInode
*diri
= pdn
->get_dir()->get_inode();
9107 reply
->ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(),
9108 in
->get_version()));
9112 reply
->hint
= in
->authority().first
;
9114 } else if (err
< 0) {
9115 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, MDS_RANK_NONE
, err
);
9117 mds_rank_t hint
= MDS_RANK_NONE
;
9118 int ret
= open_ino_traverse_dir(ino
, m
, m
->ancestors
, false, false, &hint
);
9121 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, hint
, ret
);
9123 mds
->send_message_mds(reply
, from
);
9126 void MDCache::handle_open_ino_reply(const cref_t
<MMDSOpenInoReply
> &m
)
9128 dout(10) << "handle_open_ino_reply " << *m
<< dendl
;
9130 inodeno_t ino
= m
->ino
;
9131 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9132 auto it
= opening_inodes
.find(ino
);
9133 if (it
!= opening_inodes
.end() && it
->second
.checking
== from
) {
9134 open_ino_info_t
& info
= it
->second
;
9135 info
.checking
= MDS_RANK_NONE
;
9136 info
.checked
.insert(from
);
9138 CInode
*in
= get_inode(ino
);
9140 dout(10) << " found cached " << *in
<< dendl
;
9141 open_ino_finish(ino
, info
, in
->authority().first
);
9142 } else if (!m
->ancestors
.empty()) {
9143 dout(10) << " found ino " << ino
<< " on mds." << from
<< dendl
;
9144 if (!info
.want_replica
) {
9145 open_ino_finish(ino
, info
, from
);
9149 info
.ancestors
= m
->ancestors
;
9150 info
.auth_hint
= from
;
9151 info
.checking
= mds
->get_nodeid();
9152 info
.discover
= true;
9153 _open_ino_traverse_dir(ino
, info
, 0);
9154 } else if (m
->error
) {
9155 dout(10) << " error " << m
->error
<< " from mds." << from
<< dendl
;
9156 do_open_ino(ino
, info
, m
->error
);
9158 if (m
->hint
>= 0 && m
->hint
!= mds
->get_nodeid()) {
9159 info
.auth_hint
= m
->hint
;
9160 info
.checked
.erase(m
->hint
);
9162 do_open_ino_peer(ino
, info
);
9167 void MDCache::kick_open_ino_peers(mds_rank_t who
)
9169 dout(10) << "kick_open_ino_peers mds." << who
<< dendl
;
9171 for (map
<inodeno_t
, open_ino_info_t
>::iterator p
= opening_inodes
.begin();
9172 p
!= opening_inodes
.end();
9174 open_ino_info_t
& info
= p
->second
;
9175 if (info
.checking
== who
) {
9176 dout(10) << " kicking ino " << p
->first
<< " who was checking mds." << who
<< dendl
;
9177 info
.checking
= MDS_RANK_NONE
;
9178 do_open_ino_peer(p
->first
, info
);
9179 } else if (info
.checking
== MDS_RANK_NONE
) {
9180 dout(10) << " kicking ino " << p
->first
<< " who was waiting" << dendl
;
9181 do_open_ino_peer(p
->first
, info
);
9186 void MDCache::open_ino(inodeno_t ino
, int64_t pool
, MDSContext
* fin
,
9187 bool want_replica
, bool want_xlocked
,
9188 vector
<inode_backpointer_t
> *ancestors_hint
,
9189 mds_rank_t auth_hint
)
9191 dout(10) << "open_ino " << ino
<< " pool " << pool
<< " want_replica "
9192 << want_replica
<< dendl
;
9194 auto it
= opening_inodes
.find(ino
);
9195 if (it
!= opening_inodes
.end()) {
9196 open_ino_info_t
& info
= it
->second
;
9198 info
.want_replica
= true;
9199 if (want_xlocked
&& !info
.want_xlocked
) {
9200 if (!info
.ancestors
.empty()) {
9201 CInode
*diri
= get_inode(info
.ancestors
[0].dirino
);
9203 frag_t fg
= diri
->pick_dirfrag(info
.ancestors
[0].dname
);
9204 CDir
*dir
= diri
->get_dirfrag(fg
);
9205 if (dir
&& !dir
->is_auth()) {
9206 filepath
path(info
.ancestors
[0].dname
, 0);
9207 discover_path(dir
, CEPH_NOSNAP
, path
, NULL
, true);
9211 info
.want_xlocked
= true;
9214 info
.waiters
.push_back(fin
);
9216 open_ino_info_t
& info
= opening_inodes
[ino
];
9217 info
.want_replica
= want_replica
;
9218 info
.want_xlocked
= want_xlocked
;
9219 info
.tid
= ++open_ino_last_tid
;
9220 info
.pool
= pool
>= 0 ? pool
: default_file_layout
.pool_id
;
9221 info
.waiters
.push_back(fin
);
9222 if (auth_hint
!= MDS_RANK_NONE
)
9223 info
.auth_hint
= auth_hint
;
9224 if (ancestors_hint
) {
9225 info
.ancestors
= std::move(*ancestors_hint
);
9226 info
.fetch_backtrace
= false;
9227 info
.checking
= mds
->get_nodeid();
9228 _open_ino_traverse_dir(ino
, info
, 0);
9230 do_open_ino(ino
, info
, 0);
9235 /* ---------------------------- */
9238 * search for a given inode on MDS peers. optionally start with the given node.
9242 - recover from mds node failure, recovery
9246 void MDCache::find_ino_peers(inodeno_t ino
, MDSContext
*c
,
9247 mds_rank_t hint
, bool path_locked
)
9249 dout(5) << "find_ino_peers " << ino
<< " hint " << hint
<< dendl
;
9250 CInode
*in
= get_inode(ino
);
9251 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
9252 c
->complete(-CEPHFS_ESTALE
);
9257 ceph_tid_t tid
= ++find_ino_peer_last_tid
;
9258 find_ino_peer_info_t
& fip
= find_ino_peer
[tid
];
9262 fip
.path_locked
= path_locked
;
9264 _do_find_ino_peer(fip
);
9267 void MDCache::_do_find_ino_peer(find_ino_peer_info_t
& fip
)
9269 set
<mds_rank_t
> all
, active
;
9270 mds
->mdsmap
->get_mds_set(all
);
9271 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9273 dout(10) << "_do_find_ino_peer " << fip
.tid
<< " " << fip
.ino
9274 << " active " << active
<< " all " << all
9275 << " checked " << fip
.checked
9278 mds_rank_t m
= MDS_RANK_NONE
;
9279 if (fip
.hint
>= 0) {
9281 fip
.hint
= MDS_RANK_NONE
;
9283 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9284 if (*p
!= mds
->get_nodeid() &&
9285 fip
.checked
.count(*p
) == 0) {
9290 if (m
== MDS_RANK_NONE
) {
9291 all
.erase(mds
->get_nodeid());
9292 if (all
!= fip
.checked
) {
9293 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl
;
9295 dout(10) << "_do_find_ino_peer failed on " << fip
.ino
<< dendl
;
9296 fip
.fin
->complete(-CEPHFS_ESTALE
);
9297 find_ino_peer
.erase(fip
.tid
);
9301 mds
->send_message_mds(make_message
<MMDSFindIno
>(fip
.tid
, fip
.ino
), m
);
9305 void MDCache::handle_find_ino(const cref_t
<MMDSFindIno
> &m
)
9307 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
9311 dout(10) << "handle_find_ino " << *m
<< dendl
;
9312 auto r
= make_message
<MMDSFindInoReply
>(m
->tid
);
9313 CInode
*in
= get_inode(m
->ino
);
9315 in
->make_path(r
->path
);
9316 dout(10) << " have " << r
->path
<< " " << *in
<< dendl
;
9318 mds
->send_message_mds(r
, mds_rank_t(m
->get_source().num()));
9322 void MDCache::handle_find_ino_reply(const cref_t
<MMDSFindInoReply
> &m
)
9324 auto p
= find_ino_peer
.find(m
->tid
);
9325 if (p
!= find_ino_peer
.end()) {
9326 dout(10) << "handle_find_ino_reply " << *m
<< dendl
;
9327 find_ino_peer_info_t
& fip
= p
->second
;
9330 if (get_inode(fip
.ino
)) {
9331 dout(10) << "handle_find_ino_reply successfully found " << fip
.ino
<< dendl
;
9332 mds
->queue_waiter(fip
.fin
);
9333 find_ino_peer
.erase(p
);
9337 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9338 if (fip
.checking
== from
)
9339 fip
.checking
= MDS_RANK_NONE
;
9340 fip
.checked
.insert(from
);
9342 if (!m
->path
.empty()) {
9344 vector
<CDentry
*> trace
;
9345 CF_MDS_RetryMessageFactory
cf(mds
, m
);
9346 MDRequestRef null_ref
;
9347 int flags
= MDS_TRAVERSE_DISCOVER
;
9348 if (fip
.path_locked
)
9349 flags
|= MDS_TRAVERSE_PATH_LOCKED
;
9350 int r
= path_traverse(null_ref
, cf
, m
->path
, flags
, &trace
);
9353 dout(0) << "handle_find_ino_reply failed with " << r
<< " on " << m
->path
9354 << ", retrying" << dendl
;
9355 fip
.checked
.clear();
9356 _do_find_ino_peer(fip
);
9359 _do_find_ino_peer(fip
);
9362 dout(10) << "handle_find_ino_reply tid " << m
->tid
<< " dne" << dendl
;
// NOTE(review): damaged extraction -- statements split across lines; the
// leading decimal tokens are original line numbers. Code kept byte-identical.
// Original line 9371 (presumably the loop's `++p`) and the closing braces
// were elided by the extraction -- gap in the embedded numbering.
//
// kick_find_ino_peers: called when mds.`who` fails/restarts. Walks every
// in-flight find_ino_peer request: if a request was waiting on `who`,
// clear its `checking` rank and re-drive it via _do_find_ino_peer(); also
// re-drive requests that were idle (checking == MDS_RANK_NONE), since the
// mdsmap change may have unblocked them.
9366 void MDCache::kick_find_ino_peers(mds_rank_t who
)
9368 // find_ino_peers requests we should move on from
9369 for (map
<ceph_tid_t
,find_ino_peer_info_t
>::iterator p
= find_ino_peer
.begin();
9370 p
!= find_ino_peer
.end();
9372 find_ino_peer_info_t
& fip
= p
->second
;
9373 if (fip
.checking
== who
) {
9374 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was checking mds." << who
<< dendl
;
9375 fip
.checking
= MDS_RANK_NONE
;
9376 _do_find_ino_peer(fip
);
9377 } else if (fip
.checking
== MDS_RANK_NONE
) {
9378 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was waiting" << dendl
;
9379 _do_find_ino_peer(fip
);
9384 /* ---------------------------- */
9386 int MDCache::get_num_client_requests()
9389 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
9390 p
!= active_requests
.end();
9392 MDRequestRef
& mdr
= p
->second
;
9393 if (mdr
->reqid
.name
.is_client() && !mdr
->is_peer())
9399 MDRequestRef
MDCache::request_start(const cref_t
<MClientRequest
>& req
)
9401 // did we win a forward race against a peer?
9402 if (active_requests
.count(req
->get_reqid())) {
9403 MDRequestRef
& mdr
= active_requests
[req
->get_reqid()];
9405 if (mdr
->is_peer()) {
9406 dout(10) << "request_start already had " << *mdr
<< ", waiting for finish" << dendl
;
9407 mdr
->more()->waiting_for_finish
.push_back(new C_MDS_RetryMessage(mds
, req
));
9409 dout(10) << "request_start already processing " << *mdr
<< ", dropping new msg" << dendl
;
9411 return MDRequestRef();
9414 // register new client request
9415 MDRequestImpl::Params params
;
9416 params
.reqid
= req
->get_reqid();
9417 params
.attempt
= req
->get_num_fwd();
9418 params
.client_req
= req
;
9419 params
.initiated
= req
->get_recv_stamp();
9420 params
.throttled
= req
->get_throttle_stamp();
9421 params
.all_read
= req
->get_recv_complete_stamp();
9422 params
.dispatched
= req
->get_dispatch_stamp();
9425 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9426 active_requests
[params
.reqid
] = mdr
;
9427 mdr
->set_op_stamp(req
->get_stamp());
9428 dout(7) << "request_start " << *mdr
<< dendl
;
9432 MDRequestRef
MDCache::request_start_peer(metareqid_t ri
, __u32 attempt
, const cref_t
<Message
> &m
)
9434 int by
= m
->get_source().num();
9435 MDRequestImpl::Params params
;
9437 params
.attempt
= attempt
;
9438 params
.triggering_peer_req
= m
;
9439 params
.peer_to
= by
;
9440 params
.initiated
= m
->get_recv_stamp();
9441 params
.throttled
= m
->get_throttle_stamp();
9442 params
.all_read
= m
->get_recv_complete_stamp();
9443 params
.dispatched
= m
->get_dispatch_stamp();
9445 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9446 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9447 active_requests
[mdr
->reqid
] = mdr
;
9448 dout(7) << "request_start_peer " << *mdr
<< " by mds." << by
<< dendl
;
9452 MDRequestRef
MDCache::request_start_internal(int op
)
9454 utime_t now
= ceph_clock_now();
9455 MDRequestImpl::Params params
;
9456 params
.reqid
.name
= entity_name_t::MDS(mds
->get_nodeid());
9457 params
.reqid
.tid
= mds
->issue_tid();
9458 params
.initiated
= now
;
9459 params
.throttled
= now
;
9460 params
.all_read
= now
;
9461 params
.dispatched
= now
;
9462 params
.internal_op
= op
;
9464 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9466 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9467 active_requests
[mdr
->reqid
] = mdr
;
9468 dout(7) << "request_start_internal " << *mdr
<< " op " << op
<< dendl
;
// NOTE(review): damaged extraction -- statements split across lines; the
// leading decimal tokens are original line numbers. Code kept byte-identical.
// The function's return statement (original line ~9478, presumably
// `return p->second;`) and braces were elided by the extraction -- gap in
// the embedded numbering; confirm against the real source.
//
// request_get: look up the active MDRequest for reqid `rid`. The request
// MUST exist (asserted); logs and returns it.
9472 MDRequestRef
MDCache::request_get(metareqid_t rid
)
9474 ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.find(rid
);
9475 ceph_assert(p
!= active_requests
.end());
9476 dout(7) << "request_get " << rid
<< " " << *p
->second
<< dendl
;
9480 void MDCache::request_finish(MDRequestRef
& mdr
)
9482 dout(7) << "request_finish " << *mdr
<< dendl
;
9483 mdr
->mark_event("finishing request");
9486 if (mdr
->has_more() && mdr
->more()->peer_commit
) {
9487 Context
*fin
= mdr
->more()->peer_commit
;
9488 mdr
->more()->peer_commit
= 0;
9491 mdr
->aborted
= false;
9493 mdr
->more()->peer_rolling_back
= true;
9496 mdr
->committing
= true;
9498 fin
->complete(ret
); // this must re-call request_finish.
9502 switch(mdr
->internal_op
) {
9503 case CEPH_MDS_OP_FRAGMENTDIR
:
9504 logger
->inc(l_mdss_ireq_fragmentdir
);
9506 case CEPH_MDS_OP_EXPORTDIR
:
9507 logger
->inc(l_mdss_ireq_exportdir
);
9509 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9510 logger
->inc(l_mdss_ireq_enqueue_scrub
);
9512 case CEPH_MDS_OP_FLUSH
:
9513 logger
->inc(l_mdss_ireq_flush
);
9515 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9516 logger
->inc(l_mdss_ireq_fragstats
);
9518 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9519 logger
->inc(l_mdss_ireq_inodestats
);
9523 request_cleanup(mdr
);
9527 void MDCache::request_forward(MDRequestRef
& mdr
, mds_rank_t who
, int port
)
9529 CachedStackStringStream css
;
9530 *css
<< "forwarding request to mds." << who
;
9531 mdr
->mark_event(css
->strv());
9532 if (mdr
->client_request
&& mdr
->client_request
->get_source().is_client()) {
9533 dout(7) << "request_forward " << *mdr
<< " to mds." << who
<< " req "
9534 << *mdr
->client_request
<< dendl
;
9535 if (mdr
->is_batch_head()) {
9536 mdr
->release_batch_op()->forward(who
);
9538 mds
->forward_message_mds(mdr
->release_client_request(), who
);
9540 if (mds
->logger
) mds
->logger
->inc(l_mds_forward
);
9541 } else if (mdr
->internal_op
>= 0) {
9542 dout(10) << "request_forward on internal op; cancelling" << dendl
;
9543 mdr
->internal_op_finish
->complete(-CEPHFS_EXDEV
);
9545 dout(7) << "request_forward drop " << *mdr
<< " req " << *mdr
->client_request
9546 << " was from mds" << dendl
;
9548 request_cleanup(mdr
);
9552 void MDCache::dispatch_request(MDRequestRef
& mdr
)
9554 if (mdr
->client_request
) {
9555 mds
->server
->dispatch_client_request(mdr
);
9556 } else if (mdr
->peer_request
) {
9557 mds
->server
->dispatch_peer_request(mdr
);
9559 switch (mdr
->internal_op
) {
9560 case CEPH_MDS_OP_FRAGMENTDIR
:
9561 dispatch_fragment_dir(mdr
);
9563 case CEPH_MDS_OP_EXPORTDIR
:
9564 migrator
->dispatch_export_dir(mdr
, 0);
9566 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9567 enqueue_scrub_work(mdr
);
9569 case CEPH_MDS_OP_FLUSH
:
9570 flush_dentry_work(mdr
);
9572 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9573 repair_dirfrag_stats_work(mdr
);
9575 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9576 repair_inode_stats_work(mdr
);
9578 case CEPH_MDS_OP_RDLOCK_FRAGSSTATS
:
9579 rdlock_dirfrags_stats_work(mdr
);
9588 void MDCache::request_drop_foreign_locks(MDRequestRef
& mdr
)
9590 if (!mdr
->has_more())
9594 // (will implicitly drop remote dn pins)
9595 for (set
<mds_rank_t
>::iterator p
= mdr
->more()->peers
.begin();
9596 p
!= mdr
->more()->peers
.end();
9598 auto r
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
,
9599 MMDSPeerRequest::OP_FINISH
);
9601 if (mdr
->killed
&& !mdr
->committing
) {
9603 } else if (mdr
->more()->srcdn_auth_mds
== *p
&&
9604 mdr
->more()->inode_import
.length() > 0) {
9605 // information about rename imported caps
9606 r
->inode_export
= std::move(mdr
->more()->inode_import
);
9609 mds
->send_message_mds(r
, *p
);
9612 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9613 * implicitly. Note that we don't call the finishers -- there shouldn't
9614 * be any on a remote lock and the request finish wakes up all
9615 * the waiters anyway! */
9617 for (auto it
= mdr
->locks
.begin(); it
!= mdr
->locks
.end(); ) {
9618 SimpleLock
*lock
= it
->lock
;
9619 if (it
->is_xlock() && !lock
->get_parent()->is_auth()) {
9620 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9621 << " on " << lock
->get_parent() << dendl
;
9623 mdr
->locks
.erase(it
++);
9624 } else if (it
->is_remote_wrlock()) {
9625 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9626 << " on mds." << it
->wrlock_target
<< " on " << *lock
->get_parent() << dendl
;
9627 if (it
->is_wrlock()) {
9628 it
->clear_remote_wrlock();
9631 mdr
->locks
.erase(it
++);
9638 mdr
->more()->peers
.clear(); /* we no longer have requests out to them, and
9639 * leaving them in can cause double-notifies as
9640 * this function can get called more than once */
// NOTE(review): damaged extraction -- statement split across lines; the
// leading decimal tokens are original line numbers. Code kept byte-identical;
// braces elided by the extraction.
//
// request_drop_non_rdlocks: release every lock held by `mdr` except read
// locks. Foreign (remote-mds) locks are dropped first via
// request_drop_foreign_locks(), then the locker drops local non-rdlocks.
9643 void MDCache::request_drop_non_rdlocks(MDRequestRef
& mdr
)
9645 request_drop_foreign_locks(mdr
);
9646 mds
->locker
->drop_non_rdlocks(mdr
.get());
// NOTE(review): damaged extraction -- statement split across lines; the
// leading decimal tokens are original line numbers. Code kept byte-identical;
// braces elided by the extraction.
//
// request_drop_locks: release ALL locks held by `mdr` -- foreign
// (remote-mds) locks first via request_drop_foreign_locks(), then every
// local lock via the locker.
9649 void MDCache::request_drop_locks(MDRequestRef
& mdr
)
9651 request_drop_foreign_locks(mdr
);
9652 mds
->locker
->drop_locks(mdr
.get());
9655 void MDCache::request_cleanup(MDRequestRef
& mdr
)
9657 dout(15) << "request_cleanup " << *mdr
<< dendl
;
9659 if (mdr
->has_more()) {
9660 if (mdr
->more()->is_ambiguous_auth
)
9661 mdr
->clear_ambiguous_auth();
9662 if (!mdr
->more()->waiting_for_finish
.empty())
9663 mds
->queue_waiters(mdr
->more()->waiting_for_finish
);
9666 request_drop_locks(mdr
);
9668 // drop (local) auth pins
9669 mdr
->drop_local_auth_pins();
9672 mdr
->put_stickydirs();
9674 mds
->locker
->kick_cap_releases(mdr
);
9679 // remove from session
9680 mdr
->item_session_request
.remove_myself();
9683 active_requests
.erase(mdr
->reqid
);
9688 mdr
->mark_event("cleaned up request");
9691 void MDCache::request_kill(MDRequestRef
& mdr
)
9693 // rollback peer requests is tricky. just let the request proceed.
9694 if (mdr
->has_more() &&
9695 (!mdr
->more()->witnessed
.empty() || !mdr
->more()->waiting_on_peer
.empty())) {
9696 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
9697 ceph_assert(mdr
->more()->witnessed
.empty());
9698 mdr
->aborted
= true;
9699 dout(10) << "request_kill " << *mdr
<< " -- waiting for peer reply, delaying" << dendl
;
9701 dout(10) << "request_kill " << *mdr
<< " -- already started peer prep, no-op" << dendl
;
9704 ceph_assert(mdr
->used_prealloc_ino
== 0);
9705 ceph_assert(mdr
->prealloc_inos
.empty());
9707 mdr
->session
= NULL
;
9708 mdr
->item_session_request
.remove_myself();
9713 mdr
->mark_event("killing request");
9715 if (mdr
->committing
) {
9716 dout(10) << "request_kill " << *mdr
<< " -- already committing, remove it from sesssion requests" << dendl
;
9717 mdr
->item_session_request
.remove_myself();
9719 dout(10) << "request_kill " << *mdr
<< dendl
;
9720 request_cleanup(mdr
);
9724 // -------------------------------------------------------------------------------
// NOTE(review): damaged extraction -- statements split across lines; the
// leading decimal tokens are original line numbers. Code kept byte-identical;
// braces elided by the extraction.
//
// create_global_snaprealm: build the filesystem-wide snaprealm. Allocates a
// dummy CInode, initializes it as the unlinked system inode
// CEPH_INO_GLOBAL_SNAPREALM (a 0755 directory), and records its snaprealm
// as this cache's global_snaprealm.
// NOTE(review): raw `new` with no visible delete here -- presumably the
// dummy inode's lifetime is tied to the cache/snaprealm; confirm ownership
// before touching this.
9727 void MDCache::create_global_snaprealm()
9729 CInode
*in
= new CInode(this); // dummy inode
9730 create_unlinked_system_inode(in
, CEPH_INO_GLOBAL_SNAPREALM
, S_IFDIR
|0755);
9732 global_snaprealm
= in
->snaprealm
;
9735 void MDCache::do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
)
9737 dout(10) << "do_realm_invalidate_and_update_notify " << *in
->snaprealm
<< " " << *in
<< dendl
;
9739 vector
<inodeno_t
> split_inos
;
9740 vector
<inodeno_t
> split_realms
;
9742 if (notify_clients
) {
9743 if (snapop
== CEPH_SNAP_OP_SPLIT
) {
9744 // notify clients of update|split
9745 for (auto p
= in
->snaprealm
->inodes_with_caps
.begin(); !p
.end(); ++p
)
9746 split_inos
.push_back((*p
)->ino());
9748 for (auto& r
: in
->snaprealm
->open_children
)
9749 split_realms
.push_back(r
->inode
->ino());
9753 map
<client_t
, ref_t
<MClientSnap
>> updates
;
9755 q
.push_back(in
->snaprealm
);
9756 while (!q
.empty()) {
9757 SnapRealm
*realm
= q
.front();
9760 dout(10) << " realm " << *realm
<< " on " << *realm
->inode
<< dendl
;
9761 realm
->invalidate_cached_snaps();
9763 if (notify_clients
) {
9764 for (const auto& p
: realm
->client_caps
) {
9765 const auto& client
= p
.first
;
9766 const auto& caps
= p
.second
;
9767 ceph_assert(!caps
->empty());
9769 auto em
= updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple());
9771 auto update
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
9772 update
->head
.split
= in
->ino();
9773 update
->split_inos
= split_inos
;
9774 update
->split_realms
= split_realms
;
9775 update
->bl
= in
->snaprealm
->get_snap_trace();
9776 em
.first
->second
= std::move(update
);
9781 // notify for active children, too.
9782 dout(10) << " " << realm
<< " open_children are " << realm
->open_children
<< dendl
;
9783 for (auto& r
: realm
->open_children
)
9788 send_snaps(updates
);
9791 void MDCache::send_snap_update(CInode
*in
, version_t stid
, int snap_op
)
9793 dout(10) << __func__
<< " " << *in
<< " stid " << stid
<< dendl
;
9794 ceph_assert(in
->is_auth());
9796 set
<mds_rank_t
> mds_set
;
9798 mds
->mdsmap
->get_mds_set_lower_bound(mds_set
, MDSMap::STATE_RESOLVE
);
9799 mds_set
.erase(mds
->get_nodeid());
9801 in
->list_replicas(mds_set
);
9804 if (!mds_set
.empty()) {
9805 bufferlist snap_blob
;
9806 in
->encode_snap(snap_blob
);
9808 for (auto p
: mds_set
) {
9809 auto m
= make_message
<MMDSSnapUpdate
>(in
->ino(), stid
, snap_op
);
9810 m
->snap_blob
= snap_blob
;
9811 mds
->send_message_mds(m
, p
);
9816 notify_global_snaprealm_update(snap_op
);
9819 void MDCache::handle_snap_update(const cref_t
<MMDSSnapUpdate
> &m
)
9821 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9822 dout(10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
9824 if (mds
->get_state() < MDSMap::STATE_RESOLVE
&&
9825 mds
->get_want_state() != CEPH_MDS_STATE_RESOLVE
) {
9829 // null rejoin_done means open_snaprealms() has already been called
9830 bool notify_clients
= mds
->get_state() > MDSMap::STATE_REJOIN
||
9831 (mds
->is_rejoin() && !rejoin_done
);
9833 if (m
->get_tid() > 0) {
9834 mds
->snapclient
->notify_commit(m
->get_tid());
9836 notify_global_snaprealm_update(m
->get_snap_op());
9839 CInode
*in
= get_inode(m
->get_ino());
9841 ceph_assert(!in
->is_auth());
9842 if (mds
->get_state() > MDSMap::STATE_REJOIN
||
9843 (mds
->is_rejoin() && !in
->is_rejoining())) {
9844 auto p
= m
->snap_blob
.cbegin();
9847 if (!notify_clients
) {
9848 if (!rejoin_pending_snaprealms
.count(in
)) {
9849 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
9850 rejoin_pending_snaprealms
.insert(in
);
9853 do_realm_invalidate_and_update_notify(in
, m
->get_snap_op(), notify_clients
);
9858 void MDCache::notify_global_snaprealm_update(int snap_op
)
9860 if (snap_op
!= CEPH_SNAP_OP_DESTROY
)
9861 snap_op
= CEPH_SNAP_OP_UPDATE
;
9862 set
<Session
*> sessions
;
9863 mds
->sessionmap
.get_client_session_set(sessions
);
9864 for (auto &session
: sessions
) {
9865 if (!session
->is_open() && !session
->is_stale())
9867 auto update
= make_message
<MClientSnap
>(snap_op
);
9868 update
->head
.split
= global_snaprealm
->inode
->ino();
9869 update
->bl
= global_snaprealm
->get_snap_trace();
9870 mds
->send_message_client_counted(update
, session
);
9874 // -------------------------------------------------------------------------------
9877 struct C_MDC_RetryScanStray
: public MDCacheContext
{
9879 C_MDC_RetryScanStray(MDCache
*c
, dirfrag_t n
) : MDCacheContext(c
), next(n
) { }
9880 void finish(int r
) override
{
9881 mdcache
->scan_stray_dir(next
);
9885 void MDCache::scan_stray_dir(dirfrag_t next
)
9887 dout(10) << "scan_stray_dir " << next
<< dendl
;
9890 next
.frag
= strays
[MDS_INO_STRAY_INDEX(next
.ino
)]->dirfragtree
[next
.frag
.value()];
9892 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
9893 if (strays
[i
]->ino() < next
.ino
)
9896 std::vector
<CDir
*> ls
;
9897 strays
[i
]->get_dirfrags(ls
);
9899 for (const auto& dir
: ls
) {
9900 if (dir
->get_frag() < next
.frag
)
9903 if (!dir
->can_auth_pin()) {
9904 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_RetryScanStray(this, dir
->dirfrag()));
9908 if (!dir
->is_complete()) {
9909 dir
->fetch(new C_MDC_RetryScanStray(this, dir
->dirfrag()));
9913 for (auto &p
: dir
->items
) {
9914 CDentry
*dn
= p
.second
;
9915 dn
->state_set(CDentry::STATE_STRAY
);
9916 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
9917 if (dnl
->is_primary()) {
9918 CInode
*in
= dnl
->get_inode();
9919 if (in
->get_inode()->nlink
== 0)
9920 in
->state_set(CInode::STATE_ORPHAN
);
9921 maybe_eval_stray(in
);
9928 void MDCache::fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
)
9930 object_t oid
= CInode::get_object_name(ino
, frag_t(), "");
9931 mds
->objecter
->getxattr(oid
, object_locator_t(pool
), "parent", CEPH_NOSNAP
, &bl
, 0, fin
);
9933 mds
->logger
->inc(l_mds_openino_backtrace_fetch
);
9940 // ========================================================================================
9944 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9945 to the parent metadata object in the cache (pinning it).
9947 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
9951 void MDCache::_send_discover(discover_info_t
& d
)
9953 auto dis
= make_message
<MDiscover
>(d
.ino
, d
.frag
, d
.snap
, d
.want_path
,
9954 d
.want_base_dir
, d
.path_locked
);
9955 dis
->set_tid(d
.tid
);
9956 mds
->send_message_mds(dis
, d
.mds
);
9959 void MDCache::discover_base_ino(inodeno_t want_ino
,
9960 MDSContext
*onfinish
,
9963 dout(7) << "discover_base_ino " << want_ino
<< " from mds." << from
<< dendl
;
9964 if (waiting_for_base_ino
[from
].count(want_ino
) == 0) {
9965 discover_info_t
& d
= _create_discover(from
);
9969 waiting_for_base_ino
[from
][want_ino
].push_back(onfinish
);
9973 void MDCache::discover_dir_frag(CInode
*base
,
9975 MDSContext
*onfinish
,
9979 from
= base
->authority().first
;
9981 dirfrag_t
df(base
->ino(), approx_fg
);
9982 dout(7) << "discover_dir_frag " << df
9983 << " from mds." << from
<< dendl
;
9985 if (!base
->is_waiting_for_dir(approx_fg
) || !onfinish
) {
9986 discover_info_t
& d
= _create_discover(from
);
9988 d
.ino
= base
->ino();
9990 d
.want_base_dir
= true;
9995 base
->add_dir_waiter(approx_fg
, onfinish
);
9998 struct C_MDC_RetryDiscoverPath
: public MDCacheContext
{
10003 C_MDC_RetryDiscoverPath(MDCache
*c
, CInode
*b
, snapid_t s
, filepath
&p
, mds_rank_t f
) :
10004 MDCacheContext(c
), base(b
), snapid(s
), path(p
), from(f
) {}
10005 void finish(int r
) override
{
10006 mdcache
->discover_path(base
, snapid
, path
, 0, from
);
10010 void MDCache::discover_path(CInode
*base
,
10012 filepath want_path
,
10013 MDSContext
*onfinish
,
10018 from
= base
->authority().first
;
10020 dout(7) << "discover_path " << base
->ino() << " " << want_path
<< " snap " << snap
<< " from mds." << from
10021 << (path_locked
? " path_locked":"")
10024 if (base
->is_ambiguous_auth()) {
10025 dout(10) << " waiting for single auth on " << *base
<< dendl
;
10027 onfinish
= new C_MDC_RetryDiscoverPath(this, base
, snap
, want_path
, from
);
10028 base
->add_waiter(CInode::WAIT_SINGLEAUTH
, onfinish
);
10030 } else if (from
== mds
->get_nodeid()) {
10031 MDSContext::vec finished
;
10032 base
->take_waiting(CInode::WAIT_DIR
, finished
);
10033 mds
->queue_waiters(finished
);
10037 frag_t fg
= base
->pick_dirfrag(want_path
[0]);
10038 if ((path_locked
&& want_path
.depth() == 1) ||
10039 !base
->is_waiting_for_dir(fg
) || !onfinish
) {
10040 discover_info_t
& d
= _create_discover(from
);
10041 d
.ino
= base
->ino();
10045 d
.want_path
= want_path
;
10046 d
.want_base_dir
= true;
10047 d
.path_locked
= path_locked
;
10053 base
->add_dir_waiter(fg
, onfinish
);
10056 struct C_MDC_RetryDiscoverPath2
: public MDCacheContext
{
10060 C_MDC_RetryDiscoverPath2(MDCache
*c
, CDir
*b
, snapid_t s
, filepath
&p
) :
10061 MDCacheContext(c
), base(b
), snapid(s
), path(p
) {}
10062 void finish(int r
) override
{
10063 mdcache
->discover_path(base
, snapid
, path
, 0);
10067 void MDCache::discover_path(CDir
*base
,
10069 filepath want_path
,
10070 MDSContext
*onfinish
,
10073 mds_rank_t from
= base
->authority().first
;
10075 dout(7) << "discover_path " << base
->dirfrag() << " " << want_path
<< " snap " << snap
<< " from mds." << from
10076 << (path_locked
? " path_locked":"")
10079 if (base
->is_ambiguous_auth()) {
10080 dout(7) << " waiting for single auth on " << *base
<< dendl
;
10082 onfinish
= new C_MDC_RetryDiscoverPath2(this, base
, snap
, want_path
);
10083 base
->add_waiter(CDir::WAIT_SINGLEAUTH
, onfinish
);
10085 } else if (from
== mds
->get_nodeid()) {
10086 MDSContext::vec finished
;
10087 base
->take_sub_waiting(finished
);
10088 mds
->queue_waiters(finished
);
10092 if ((path_locked
&& want_path
.depth() == 1) ||
10093 !base
->is_waiting_for_dentry(want_path
[0].c_str(), snap
) || !onfinish
) {
10094 discover_info_t
& d
= _create_discover(from
);
10095 d
.ino
= base
->ino();
10096 d
.pin_base(base
->inode
);
10097 d
.frag
= base
->get_frag();
10099 d
.want_path
= want_path
;
10100 d
.want_base_dir
= false;
10101 d
.path_locked
= path_locked
;
10107 base
->add_dentry_waiter(want_path
[0], snap
, onfinish
);
10110 void MDCache::kick_discovers(mds_rank_t who
)
10112 for (map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.begin();
10113 p
!= discovers
.end();
10115 if (p
->second
.mds
!= who
)
10117 _send_discover(p
->second
);
10122 void MDCache::handle_discover(const cref_t
<MDiscover
> &dis
)
10124 mds_rank_t whoami
= mds
->get_nodeid();
10125 mds_rank_t from
= mds_rank_t(dis
->get_source().num());
10127 ceph_assert(from
!= whoami
);
10129 if (mds
->get_state() <= MDSMap::STATE_REJOIN
) {
10130 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
10131 mds
->get_want_state() < CEPH_MDS_STATE_REJOIN
) {
10135 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10136 // delay processing request from survivor because we may not yet choose lock states.
10137 if (!mds
->mdsmap
->is_rejoin(from
)) {
10138 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl
;
10139 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, dis
));
10146 auto reply
= make_message
<MDiscoverReply
>(*dis
);
10148 snapid_t snapid
= dis
->get_snapid();
10151 if (MDS_INO_IS_BASE(dis
->get_base_ino()) &&
10152 !dis
->wants_base_dir() && dis
->get_want().depth() == 0) {
10154 dout(7) << "handle_discover from mds." << from
10155 << " wants base + " << dis
->get_want().get_path()
10156 << " snap " << snapid
10159 cur
= get_inode(dis
->get_base_ino());
10163 reply
->starts_with
= MDiscoverReply::INODE
;
10164 encode_replica_inode(cur
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10165 dout(10) << "added base " << *cur
<< dendl
;
10168 // there's a base inode
10169 cur
= get_inode(dis
->get_base_ino(), snapid
);
10170 if (!cur
&& snapid
!= CEPH_NOSNAP
) {
10171 cur
= get_inode(dis
->get_base_ino());
10172 if (cur
&& !cur
->is_multiversion())
10173 cur
= NULL
; // nope!
10177 dout(7) << "handle_discover mds." << from
10178 << " don't have base ino " << dis
->get_base_ino() << "." << snapid
10180 if (!dis
->wants_base_dir() && dis
->get_want().depth() > 0)
10181 reply
->set_error_dentry(dis
->get_dentry(0));
10182 reply
->set_flag_error_dir();
10183 } else if (dis
->wants_base_dir()) {
10184 dout(7) << "handle_discover mds." << from
10185 << " wants basedir+" << dis
->get_want().get_path()
10189 dout(7) << "handle_discover mds." << from
10190 << " wants " << dis
->get_want().get_path()
10196 ceph_assert(reply
);
10199 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10200 for (unsigned i
= 0;
10201 cur
&& (i
< dis
->get_want().depth() || dis
->get_want().depth() == 0);
10204 // -- figure out the dir
10206 // is *cur even a dir at all?
10207 if (!cur
->is_dir()) {
10208 dout(7) << *cur
<< " not a dir" << dendl
;
10209 reply
->set_flag_error_dir();
10215 if (dis
->get_want().depth()) {
10216 // dentry specifies
10217 fg
= cur
->pick_dirfrag(dis
->get_dentry(i
));
10219 // requester explicity specified the frag
10220 ceph_assert(dis
->wants_base_dir() || MDS_INO_IS_BASE(dis
->get_base_ino()));
10221 fg
= dis
->get_base_dir_frag();
10222 if (!cur
->dirfragtree
.is_leaf(fg
))
10223 fg
= cur
->dirfragtree
[fg
.value()];
10225 CDir
*curdir
= cur
->get_dirfrag(fg
);
10227 if ((!curdir
&& !cur
->is_auth()) ||
10228 (curdir
&& !curdir
->is_auth())) {
10231 * ONLY set flag if empty!!
10232 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10233 * resulting in duplicate discovers in flight,
10234 * which can wreak havoc when discovering rename srcdn (which may move)
10237 if (reply
->is_empty()) {
10238 // only hint if empty.
10239 // someday this could be better, but right now the waiter logic isn't smart enough.
10243 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir
<< dendl
;
10244 reply
->set_dir_auth_hint(curdir
->authority().first
);
10246 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10248 reply
->set_dir_auth_hint(cur
->authority().first
);
10251 // note error dentry, if any
10252 // NOTE: important, as it allows requester to issue an equivalent discover
10253 // to whomever we hint at.
10254 if (dis
->get_want().depth() > i
)
10255 reply
->set_error_dentry(dis
->get_dentry(i
));
10261 if (!curdir
) { // open dir?
10262 if (cur
->is_frozen()) {
10263 if (!reply
->is_empty()) {
10264 dout(7) << *cur
<< " is frozen, non-empty reply, stopping" << dendl
;
10267 dout(7) << *cur
<< " is frozen, empty reply, waiting" << dendl
;
10268 cur
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10271 curdir
= cur
->get_or_open_dirfrag(this, fg
);
10272 } else if (curdir
->is_frozen_tree() ||
10273 (curdir
->is_frozen_dir() && fragment_are_all_frozen(curdir
))) {
10274 if (!reply
->is_empty()) {
10275 dout(7) << *curdir
<< " is frozen, non-empty reply, stopping" << dendl
;
10278 if (dis
->wants_base_dir() && dis
->get_base_dir_frag() != curdir
->get_frag()) {
10279 dout(7) << *curdir
<< " is frozen, dirfrag mismatch, stopping" << dendl
;
10280 reply
->set_flag_error_dir();
10283 dout(7) << *curdir
<< " is frozen, empty reply, waiting" << dendl
;
10284 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10289 if (curdir
->get_version() == 0) {
10290 // fetch newly opened dir
10291 } else if (reply
->is_empty() && !dis
->wants_base_dir()) {
10292 dout(7) << "handle_discover not adding unwanted base dir " << *curdir
<< dendl
;
10293 // make sure the base frag is correct, though, in there was a refragment since the
10294 // original request was sent.
10295 reply
->set_base_dir_frag(curdir
->get_frag());
10297 ceph_assert(!curdir
->is_ambiguous_auth()); // would be frozen.
10298 if (!reply
->trace
.length())
10299 reply
->starts_with
= MDiscoverReply::DIR;
10300 encode_replica_dir(curdir
, from
, reply
->trace
);
10301 dout(7) << "handle_discover added dir " << *curdir
<< dendl
;
10306 if (curdir
->get_version() == 0) {
10307 // fetch newly opened dir
10308 ceph_assert(!curdir
->has_bloom());
10309 } else if (dis
->get_want().depth() > 0) {
10311 dn
= curdir
->lookup(dis
->get_dentry(i
), snapid
);
10317 if (!curdir
->is_complete() &&
10318 !(snapid
== CEPH_NOSNAP
&&
10319 curdir
->has_bloom() &&
10320 !curdir
->is_in_bloom(dis
->get_dentry(i
)))) {
10322 dout(7) << "incomplete dir contents for " << *curdir
<< ", fetching" << dendl
;
10323 if (reply
->is_empty()) {
10325 curdir
->fetch(new C_MDS_RetryMessage(mds
, dis
),
10326 dis
->wants_base_dir() && curdir
->get_version() == 0);
10329 // initiate fetch, but send what we have so far
10335 if (snapid
!= CEPH_NOSNAP
&& !reply
->is_empty()) {
10336 dout(7) << "dentry " << dis
->get_dentry(i
) << " snap " << snapid
10337 << " dne, non-empty reply, stopping" << dendl
;
10341 // send null dentry
10342 dout(7) << "dentry " << dis
->get_dentry(i
) << " dne, returning null in "
10343 << *curdir
<< dendl
;
10344 if (snapid
== CEPH_NOSNAP
)
10345 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
));
10347 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
), snapid
, snapid
);
10351 // don't add replica to purging dentry/inode
10352 if (dn
->state_test(CDentry::STATE_PURGING
)) {
10353 if (reply
->is_empty())
10354 reply
->set_flag_error_dn(dis
->get_dentry(i
));
10358 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10361 // ...always block on non-tail items (they are unrelated)
10362 // ...allow xlocked tail disocvery _only_ if explicitly requested
10363 if (dn
->lock
.is_xlocked()) {
10364 // is this the last (tail) item in the discover traversal?
10365 if (dis
->is_path_locked()) {
10366 dout(7) << "handle_discover allowing discovery of xlocked " << *dn
<< dendl
;
10367 } else if (reply
->is_empty()) {
10368 dout(7) << "handle_discover blocking on xlocked " << *dn
<< dendl
;
10369 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryMessage(mds
, dis
));
10372 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn
<< dendl
;
10378 bool tailitem
= (dis
->get_want().depth() == 0) || (i
== dis
->get_want().depth() - 1);
10379 if (dnl
->is_primary() && dnl
->get_inode()->is_frozen_inode()) {
10380 if (tailitem
&& dis
->is_path_locked()) {
10381 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl
->get_inode() << dendl
;
10382 } else if (reply
->is_empty()) {
10383 dout(7) << *dnl
->get_inode() << " is frozen, empty reply, waiting" << dendl
;
10384 dnl
->get_inode()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10387 dout(7) << *dnl
->get_inode() << " is frozen, non-empty reply, stopping" << dendl
;
10393 if (!reply
->trace
.length())
10394 reply
->starts_with
= MDiscoverReply::DENTRY
;
10395 encode_replica_dentry(dn
, from
, reply
->trace
);
10396 dout(7) << "handle_discover added dentry " << *dn
<< dendl
;
10398 if (!dnl
->is_primary()) break; // stop on null or remote link.
10401 CInode
*next
= dnl
->get_inode();
10402 ceph_assert(next
->is_auth());
10404 encode_replica_inode(next
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10405 dout(7) << "handle_discover added inode " << *next
<< dendl
;
10407 // descend, keep going.
10413 ceph_assert(!reply
->is_empty());
10414 dout(7) << "handle_discover sending result back to asker mds." << from
<< dendl
;
10415 mds
->send_message(reply
, dis
->get_connection());
10418 void MDCache::handle_discover_reply(const cref_t
<MDiscoverReply
> &m
)
10421 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10422 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10426 dout(7) << "discover_reply " << *m
<< dendl
;
10427 if (m
->is_flag_error_dir())
10428 dout(7) << " flag error, dir" << dendl
;
10429 if (m
->is_flag_error_dn())
10430 dout(7) << " flag error, dentry = " << m
->get_error_dentry() << dendl
;
10432 MDSContext::vec finished
, error
;
10433 mds_rank_t from
= mds_rank_t(m
->get_source().num());
10436 CInode
*cur
= get_inode(m
->get_base_ino());
10437 auto p
= m
->trace
.cbegin();
10439 int next
= m
->starts_with
;
10441 // decrement discover counters
10442 if (m
->get_tid()) {
10443 map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.find(m
->get_tid());
10444 if (p
!= discovers
.end()) {
10445 dout(10) << " found tid " << m
->get_tid() << dendl
;
10446 discovers
.erase(p
);
10448 dout(10) << " tid " << m
->get_tid() << " not found, must be dup reply" << dendl
;
10452 // discover may start with an inode
10453 if (!p
.end() && next
== MDiscoverReply::INODE
) {
10454 decode_replica_inode(cur
, p
, NULL
, finished
);
10455 dout(7) << "discover_reply got base inode " << *cur
<< dendl
;
10456 ceph_assert(cur
->is_base());
10458 next
= MDiscoverReply::DIR;
10461 if (cur
->is_base() &&
10462 waiting_for_base_ino
[from
].count(cur
->ino())) {
10463 finished
.swap(waiting_for_base_ino
[from
][cur
->ino()]);
10464 waiting_for_base_ino
[from
].erase(cur
->ino());
10469 // loop over discover results.
10470 // indexes follow each ([[dir] dentry] inode)
10471 // can start, end with any type.
10475 CDir
*curdir
= nullptr;
10476 if (next
== MDiscoverReply::DIR) {
10477 decode_replica_dir(curdir
, p
, cur
, mds_rank_t(m
->get_source().num()), finished
);
10478 if (cur
->ino() == m
->get_base_ino() && curdir
->get_frag() != m
->get_base_dir_frag()) {
10479 ceph_assert(m
->get_wanted_base_dir());
10480 cur
->take_dir_waiting(m
->get_base_dir_frag(), finished
);
10483 // note: this can only happen our first way around this loop.
10484 if (p
.end() && m
->is_flag_error_dn()) {
10485 fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10486 curdir
= cur
->get_dirfrag(fg
);
10488 curdir
= cur
->get_dirfrag(m
->get_base_dir_frag());
10495 CDentry
*dn
= nullptr;
10496 decode_replica_dentry(dn
, p
, curdir
, finished
);
10502 decode_replica_inode(cur
, p
, dn
, finished
);
10504 next
= MDiscoverReply::DIR;
10508 // or dir_auth hint?
10509 if (m
->is_flag_error_dir() && !cur
->is_dir()) {
10511 cur
->take_waiting(CInode::WAIT_DIR
, error
);
10512 } else if (m
->is_flag_error_dir() || m
->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN
) {
10513 mds_rank_t who
= m
->get_dir_auth_hint();
10514 if (who
== mds
->get_nodeid()) who
= -1;
10516 dout(7) << " dir_auth_hint is " << m
->get_dir_auth_hint() << dendl
;
10519 if (m
->get_wanted_base_dir()) {
10520 frag_t fg
= m
->get_base_dir_frag();
10521 CDir
*dir
= cur
->get_dirfrag(fg
);
10523 if (cur
->is_waiting_for_dir(fg
)) {
10524 if (cur
->is_auth())
10525 cur
->take_waiting(CInode::WAIT_DIR
, finished
);
10526 else if (dir
|| !cur
->dirfragtree
.is_leaf(fg
))
10527 cur
->take_dir_waiting(fg
, finished
);
10529 discover_dir_frag(cur
, fg
, 0, who
);
10531 dout(7) << " doing nothing, nobody is waiting for dir" << dendl
;
10535 if (m
->get_error_dentry().length()) {
10536 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10537 CDir
*dir
= cur
->get_dirfrag(fg
);
10539 if (dir
&& dir
->is_waiting_for_dentry(m
->get_error_dentry(), m
->get_wanted_snapid())) {
10540 if (dir
->is_auth() || dir
->lookup(m
->get_error_dentry())) {
10541 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10542 m
->get_wanted_snapid(), finished
);
10544 filepath
relpath(m
->get_error_dentry(), 0);
10545 discover_path(dir
, m
->get_wanted_snapid(), relpath
, 0, m
->is_path_locked());
10548 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10549 << m
->get_error_dentry() << dendl
;
10551 } else if (m
->is_flag_error_dn()) {
10552 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10553 CDir
*dir
= cur
->get_dirfrag(fg
);
10555 if (dir
->is_auth()) {
10556 dir
->take_sub_waiting(finished
);
10558 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10559 m
->get_wanted_snapid(), error
);
10565 finish_contexts(g_ceph_context
, error
, -CEPHFS_ENOENT
); // finish errors directly
10566 mds
->queue_waiters(finished
);
10571 // ----------------------------
10575 void MDCache::encode_replica_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
)
10577 ENCODE_START(1, 1, bl
);
10578 dirfrag_t df
= dir
->dirfrag();
10580 __u32 nonce
= dir
->add_replica(to
);
10582 dir
->_encode_base(bl
);
10586 void MDCache::encode_replica_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
)
10588 ENCODE_START(2, 1, bl
);
10589 encode(dn
->get_name(), bl
);
10590 encode(dn
->last
, bl
);
10592 __u32 nonce
= dn
->add_replica(to
);
10594 encode(dn
->first
, bl
);
10595 encode(dn
->linkage
.remote_ino
, bl
);
10596 encode(dn
->linkage
.remote_d_type
, bl
);
10597 dn
->lock
.encode_state_for_replica(bl
);
10598 bool need_recover
= mds
->get_state() < MDSMap::STATE_ACTIVE
;
10599 encode(need_recover
, bl
);
10600 encode(dn
->alternate_name
, bl
);
10604 void MDCache::encode_replica_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
10607 ceph_assert(in
->is_auth());
10609 ENCODE_START(2, 1, bl
);
10610 encode(in
->ino(), bl
); // bleh, minor assymetry here
10611 encode(in
->last
, bl
);
10613 __u32 nonce
= in
->add_replica(to
);
10616 in
->_encode_base(bl
, features
);
10617 in
->_encode_locks_state_for_replica(bl
, mds
->get_state() < MDSMap::STATE_ACTIVE
);
10619 __u32 state
= in
->state
;
10625 void MDCache::decode_replica_dir(CDir
*&dir
, bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
,
10626 MDSContext::vec
& finished
)
10628 DECODE_START(1, p
);
10632 ceph_assert(diri
->ino() == df
.ino
);
10634 // add it (_replica_)
10635 dir
= diri
->get_dirfrag(df
.frag
);
10638 // had replica. update w/ new nonce.
10641 dir
->set_replica_nonce(nonce
);
10642 dir
->_decode_base(p
);
10643 dout(7) << __func__
<< " had " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10645 // force frag to leaf in the diri tree
10646 if (!diri
->dirfragtree
.is_leaf(df
.frag
)) {
10647 dout(7) << __func__
<< " forcing frag " << df
.frag
<< " to leaf in the fragtree "
10648 << diri
->dirfragtree
<< dendl
;
10649 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, df
.frag
);
10652 dir
= diri
->add_dirfrag( new CDir(diri
, df
.frag
, this, false) );
10655 dir
->set_replica_nonce(nonce
);
10656 dir
->_decode_base(p
);
10657 // is this a dir_auth delegation boundary?
10658 if (from
!= diri
->authority().first
||
10659 diri
->is_ambiguous_auth() ||
10661 adjust_subtree_auth(dir
, from
);
10663 dout(7) << __func__
<< " added " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10665 diri
->take_dir_waiting(df
.frag
, finished
);
10670 void MDCache::decode_replica_dentry(CDentry
*&dn
, bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
)
10672 DECODE_START(1, p
);
10678 dn
= dir
->lookup(name
, last
);
10681 bool is_new
= false;
10684 dout(7) << __func__
<< " had " << *dn
<< dendl
;
10687 dn
= dir
->add_null_dentry(name
, 1 /* this will get updated below */, last
);
10688 dout(7) << __func__
<< " added " << *dn
<< dendl
;
10693 dn
->set_replica_nonce(nonce
);
10694 decode(dn
->first
, p
);
10697 unsigned char rdtype
;
10700 dn
->lock
.decode_state(p
, is_new
);
10703 decode(need_recover
, p
);
10705 mempool::mds_co::string alternate_name
;
10706 if (struct_v
>= 2) {
10707 decode(alternate_name
, p
);
10711 dn
->set_alternate_name(std::move(alternate_name
));
10713 dir
->link_remote_inode(dn
, rino
, rdtype
);
10715 dn
->lock
.mark_need_recover();
10717 ceph_assert(dn
->alternate_name
== alternate_name
);
10720 dir
->take_dentry_waiting(name
, dn
->first
, dn
->last
, finished
);
10724 void MDCache::decode_replica_inode(CInode
*&in
, bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
)
10726 DECODE_START(2, p
);
10733 in
= get_inode(ino
, last
);
10735 in
= new CInode(this, false, 2, last
);
10736 in
->set_replica_nonce(nonce
);
10737 in
->_decode_base(p
);
10738 in
->_decode_locks_state_for_replica(p
, true);
10740 if (in
->ino() == CEPH_INO_ROOT
)
10741 in
->inode_auth
.first
= 0;
10742 else if (in
->is_mdsdir())
10743 in
->inode_auth
.first
= in
->ino() - MDS_INO_MDSDIR_OFFSET
;
10744 dout(10) << __func__
<< " added " << *in
<< dendl
;
10746 ceph_assert(dn
->get_linkage()->is_null());
10747 dn
->dir
->link_primary_inode(dn
, in
);
10750 in
->set_replica_nonce(nonce
);
10751 in
->_decode_base(p
);
10752 in
->_decode_locks_state_for_replica(p
, false);
10753 dout(10) << __func__
<< " had " << *in
<< dendl
;
10757 if (!dn
->get_linkage()->is_primary() || dn
->get_linkage()->get_inode() != in
)
10758 dout(10) << __func__
<< " different linkage in dentry " << *dn
<< dendl
;
10761 if (struct_v
>= 2) {
10764 s
&= CInode::MASK_STATE_REPLICATED
;
10765 if (s
& CInode::STATE_RANDEPHEMERALPIN
) {
10766 dout(10) << "replica inode is random ephemeral pinned" << dendl
;
10767 in
->set_ephemeral_pin(false, true);
10775 void MDCache::encode_replica_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
)
10777 ceph_assert(straydn
->get_num_auth_pins());
10778 ENCODE_START(1, 1, bl
);
10779 uint64_t features
= mds
->mdsmap
->get_up_features();
10780 encode_replica_inode(get_myin(), who
, bl
, features
);
10781 encode_replica_dir(straydn
->get_dir()->inode
->get_parent_dn()->get_dir(), who
, bl
);
10782 encode_replica_dentry(straydn
->get_dir()->inode
->get_parent_dn(), who
, bl
);
10783 encode_replica_inode(straydn
->get_dir()->inode
, who
, bl
, features
);
10784 encode_replica_dir(straydn
->get_dir(), who
, bl
);
10785 encode_replica_dentry(straydn
, who
, bl
);
10789 void MDCache::decode_replica_stray(CDentry
*&straydn
, const bufferlist
&bl
, mds_rank_t from
)
10791 MDSContext::vec finished
;
10792 auto p
= bl
.cbegin();
10794 DECODE_START(1, p
);
10795 CInode
*mdsin
= nullptr;
10796 decode_replica_inode(mdsin
, p
, NULL
, finished
);
10797 CDir
*mdsdir
= nullptr;
10798 decode_replica_dir(mdsdir
, p
, mdsin
, from
, finished
);
10799 CDentry
*straydirdn
= nullptr;
10800 decode_replica_dentry(straydirdn
, p
, mdsdir
, finished
);
10801 CInode
*strayin
= nullptr;
10802 decode_replica_inode(strayin
, p
, straydirdn
, finished
);
10803 CDir
*straydir
= nullptr;
10804 decode_replica_dir(straydir
, p
, strayin
, from
, finished
);
10806 decode_replica_dentry(straydn
, p
, straydir
, finished
);
10807 if (!finished
.empty())
10808 mds
->queue_waiters(finished
);
10813 int MDCache::send_dir_updates(CDir
*dir
, bool bcast
)
10815 // this is an FYI, re: replication
10817 set
<mds_rank_t
> who
;
10819 set
<mds_rank_t
> mds_set
;
10820 mds
->get_mds_map()->get_active_mds_set(mds_set
);
10822 set
<mds_rank_t
> replica_set
;
10823 for (const auto &p
: dir
->get_replicas()) {
10824 replica_set
.insert(p
.first
);
10827 std::set_difference(mds_set
.begin(), mds_set
.end(),
10828 replica_set
.begin(), replica_set
.end(),
10829 std::inserter(who
, who
.end()));
10831 for (const auto &p
: dir
->get_replicas()) {
10832 who
.insert(p
.first
);
10836 dout(7) << "sending dir_update on " << *dir
<< " bcast " << bcast
<< " to " << who
<< dendl
;
10839 dir
->inode
->make_path(path
);
10841 std::set
<int32_t> dir_rep_set
;
10842 for (const auto &r
: dir
->dir_rep_by
) {
10843 dir_rep_set
.insert(r
);
10846 mds_rank_t whoami
= mds
->get_nodeid();
10847 for (set
<mds_rank_t
>::iterator it
= who
.begin();
10850 if (*it
== whoami
) continue;
10851 //if (*it == except) continue;
10852 dout(7) << "sending dir_update on " << *dir
<< " to " << *it
<< dendl
;
10854 mds
->send_message_mds(make_message
<MDirUpdate
>(mds
->get_nodeid(), dir
->dirfrag(), dir
->dir_rep
, dir_rep_set
, path
, bcast
), *it
);
10860 void MDCache::handle_dir_update(const cref_t
<MDirUpdate
> &m
)
10862 dirfrag_t df
= m
->get_dirfrag();
10863 CDir
*dir
= get_dirfrag(df
);
10865 dout(5) << "dir_update on " << df
<< ", don't have it" << dendl
;
10868 if (m
->should_discover()) {
10870 // this is key to avoid a fragtree update race, among other things.
10871 m
->inc_tried_discover();
10872 vector
<CDentry
*> trace
;
10874 filepath path
= m
->get_path();
10875 dout(5) << "trying discover on dir_update for " << path
<< dendl
;
10876 CF_MDS_RetryMessageFactory
cf(mds
, m
);
10877 MDRequestRef null_ref
;
10878 int r
= path_traverse(null_ref
, cf
, path
, MDS_TRAVERSE_DISCOVER
, &trace
, &in
);
10882 in
->ino() == df
.ino
&&
10883 in
->get_approx_dirfrag(df
.frag
) == NULL
) {
10884 open_remote_dirfrag(in
, df
.frag
, new C_MDS_RetryMessage(mds
, m
));
10892 if (!m
->has_tried_discover()) {
10893 // Update if it already exists. Othwerwise it got updated by discover reply.
10894 dout(5) << "dir_update on " << *dir
<< dendl
;
10895 dir
->dir_rep
= m
->get_dir_rep();
10896 dir
->dir_rep_by
.clear();
10897 for (const auto &e
: m
->get_dir_rep_by()) {
10898 dir
->dir_rep_by
.insert(e
);
10909 void MDCache::encode_remote_dentry_link(CDentry::linkage_t
*dnl
, bufferlist
& bl
)
10911 ENCODE_START(1, 1, bl
);
10912 inodeno_t ino
= dnl
->get_remote_ino();
10914 __u8 d_type
= dnl
->get_remote_d_type();
10915 encode(d_type
, bl
);
10919 void MDCache::decode_remote_dentry_link(CDir
*dir
, CDentry
*dn
, bufferlist::const_iterator
& p
)
10921 DECODE_START(1, p
);
10926 dout(10) << __func__
<< " remote " << ino
<< " " << d_type
<< dendl
;
10927 dir
->link_remote_inode(dn
, ino
, d_type
);
10931 void MDCache::send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
)
10933 dout(7) << __func__
<< " " << *dn
<< dendl
;
10935 CDir
*subtree
= get_subtree_root(dn
->get_dir());
10936 for (const auto &p
: dn
->get_replicas()) {
10937 // don't tell (rename) witnesses; they already know
10938 if (mdr
.get() && mdr
->more()->witnessed
.count(p
.first
))
10940 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
10941 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
10942 rejoin_gather
.count(p
.first
)))
10944 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10945 auto m
= make_message
<MDentryLink
>(subtree
->dirfrag(), dn
->get_dir()->dirfrag(), dn
->get_name(), dnl
->is_primary());
10946 if (dnl
->is_primary()) {
10947 dout(10) << __func__
<< " primary " << *dnl
->get_inode() << dendl
;
10948 encode_replica_inode(dnl
->get_inode(), p
.first
, m
->bl
,
10949 mds
->mdsmap
->get_up_features());
10950 } else if (dnl
->is_remote()) {
10951 encode_remote_dentry_link(dnl
, m
->bl
);
10953 ceph_abort(); // aie, bad caller!
10954 mds
->send_message_mds(m
, p
.first
);
10958 void MDCache::handle_dentry_link(const cref_t
<MDentryLink
> &m
)
10960 CDentry
*dn
= NULL
;
10961 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
10963 dout(7) << __func__
<< " don't have dirfrag " << m
->get_dirfrag() << dendl
;
10965 dn
= dir
->lookup(m
->get_dn());
10967 dout(7) << __func__
<< " don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
10969 dout(7) << __func__
<< " on " << *dn
<< dendl
;
10970 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10972 ceph_assert(!dn
->is_auth());
10973 ceph_assert(dnl
->is_null());
10977 auto p
= m
->bl
.cbegin();
10978 MDSContext::vec finished
;
10980 if (m
->get_is_primary()) {
10982 CInode
*in
= nullptr;
10983 decode_replica_inode(in
, p
, dn
, finished
);
10985 // remote link, easy enough.
10986 decode_remote_dentry_link(dir
, dn
, p
);
10992 if (!finished
.empty())
10993 mds
->queue_waiters(finished
);
11001 void MDCache::send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
)
11003 dout(10) << __func__
<< " " << *dn
<< dendl
;
11004 // share unlink news with replicas
11005 set
<mds_rank_t
> replicas
;
11006 dn
->list_replicas(replicas
);
11009 straydn
->list_replicas(replicas
);
11010 CInode
*strayin
= straydn
->get_linkage()->get_inode();
11011 strayin
->encode_snap_blob(snapbl
);
11013 for (set
<mds_rank_t
>::iterator it
= replicas
.begin();
11014 it
!= replicas
.end();
11016 // don't tell (rmdir) witnesses; they already know
11017 if (mdr
.get() && mdr
->more()->witnessed
.count(*it
))
11020 if (mds
->mdsmap
->get_state(*it
) < MDSMap::STATE_REJOIN
||
11021 (mds
->mdsmap
->get_state(*it
) == MDSMap::STATE_REJOIN
&&
11022 rejoin_gather
.count(*it
)))
11025 auto unlink
= make_message
<MDentryUnlink
>(dn
->get_dir()->dirfrag(), dn
->get_name());
11027 encode_replica_stray(straydn
, *it
, unlink
->straybl
);
11028 unlink
->snapbl
= snapbl
;
11030 mds
->send_message_mds(unlink
, *it
);
11034 void MDCache::handle_dentry_unlink(const cref_t
<MDentryUnlink
> &m
)
11037 CDentry
*straydn
= nullptr;
11038 if (m
->straybl
.length())
11039 decode_replica_stray(straydn
, m
->straybl
, mds_rank_t(m
->get_source().num()));
11041 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
11043 dout(7) << __func__
<< " don't have dirfrag " << m
->get_dirfrag() << dendl
;
11045 CDentry
*dn
= dir
->lookup(m
->get_dn());
11047 dout(7) << __func__
<< " don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
11049 dout(7) << __func__
<< " on " << *dn
<< dendl
;
11050 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11053 if (dnl
->is_primary()) {
11054 CInode
*in
= dnl
->get_inode();
11055 dn
->dir
->unlink_inode(dn
);
11056 ceph_assert(straydn
);
11057 straydn
->dir
->link_primary_inode(straydn
, in
);
11059 // in->first is lazily updated on replica; drag it forward so
11060 // that we always keep it in sync with the dnq
11061 ceph_assert(straydn
->first
>= in
->first
);
11062 in
->first
= straydn
->first
;
11064 // update subtree map?
11066 adjust_subtree_after_rename(in
, dir
, false);
11068 if (m
->snapbl
.length()) {
11069 bool hadrealm
= (in
->snaprealm
? true : false);
11070 in
->decode_snap_blob(m
->snapbl
);
11071 ceph_assert(in
->snaprealm
);
11073 do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
11076 // send caps to auth (if we're not already)
11077 if (in
->is_any_caps() &&
11078 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
11079 migrator
->export_caps(in
);
11083 ceph_assert(!straydn
);
11084 ceph_assert(dnl
->is_remote());
11085 dn
->dir
->unlink_inode(dn
);
11087 ceph_assert(dnl
->is_null());
11091 // race with trim_dentry()
11093 ceph_assert(straydn
->get_num_ref() == 0);
11094 ceph_assert(straydn
->get_linkage()->is_null());
11096 trim_dentry(straydn
, ex
);
11097 send_expire_messages(ex
);
11106 // ===================================================================
11110 // ===================================================================
11115 * adjust_dir_fragments -- adjust fragmentation for a directory
11117 * @param diri directory inode
11118 * @param basefrag base fragment
11119 * @param bits bit adjustment. positive for split, negative for merge.
11121 void MDCache::adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
11122 std::vector
<CDir
*>* resultfrags
,
11123 MDSContext::vec
& waiters
,
11126 dout(10) << "adjust_dir_fragments " << basefrag
<< " " << bits
11127 << " on " << *diri
<< dendl
;
11129 auto&& p
= diri
->get_dirfrags_under(basefrag
);
11131 adjust_dir_fragments(diri
, p
.second
, basefrag
, bits
, resultfrags
, waiters
, replay
);
11134 CDir
*MDCache::force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
)
11136 CDir
*dir
= diri
->get_dirfrag(fg
);
11140 dout(10) << "force_dir_fragment " << fg
<< " on " << *diri
<< dendl
;
11142 std::vector
<CDir
*> src
, result
;
11143 MDSContext::vec waiters
;
11146 frag_t parent
= diri
->dirfragtree
.get_branch_or_leaf(fg
);
11148 CDir
*pdir
= diri
->get_dirfrag(parent
);
11150 int split
= fg
.bits() - parent
.bits();
11151 dout(10) << " splitting parent by " << split
<< " " << *pdir
<< dendl
;
11152 src
.push_back(pdir
);
11153 adjust_dir_fragments(diri
, src
, parent
, split
, &result
, waiters
, replay
);
11154 dir
= diri
->get_dirfrag(fg
);
11156 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
11160 if (parent
== frag_t())
11162 frag_t last
= parent
;
11163 parent
= parent
.parent();
11164 dout(10) << " " << last
<< " parent is " << parent
<< dendl
;
11168 // hoover up things under fg?
11170 auto&& p
= diri
->get_dirfrags_under(fg
);
11171 src
.insert(std::end(src
), std::cbegin(p
.second
), std::cend(p
.second
));
11174 dout(10) << "force_dir_fragment no frags under " << fg
<< dendl
;
11176 dout(10) << " will combine frags under " << fg
<< ": " << src
<< dendl
;
11177 adjust_dir_fragments(diri
, src
, fg
, 0, &result
, waiters
, replay
);
11178 dir
= result
.front();
11179 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
11183 mds
->queue_waiters(waiters
);
11187 void MDCache::adjust_dir_fragments(CInode
*diri
,
11188 const std::vector
<CDir
*>& srcfrags
,
11189 frag_t basefrag
, int bits
,
11190 std::vector
<CDir
*>* resultfrags
,
11191 MDSContext::vec
& waiters
,
11194 dout(10) << "adjust_dir_fragments " << basefrag
<< " bits " << bits
11195 << " srcfrags " << srcfrags
11196 << " on " << *diri
<< dendl
;
11199 // yuck. we may have discovered the inode while it was being fragmented.
11200 if (!diri
->dirfragtree
.is_leaf(basefrag
))
11201 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, basefrag
);
11204 diri
->dirfragtree
.split(basefrag
, bits
);
11205 dout(10) << " new fragtree is " << diri
->dirfragtree
<< dendl
;
11207 if (srcfrags
.empty())
11211 CDir
*parent_dir
= diri
->get_parent_dir();
11212 CDir
*parent_subtree
= 0;
11214 parent_subtree
= get_subtree_root(parent_dir
);
11216 ceph_assert(srcfrags
.size() >= 1);
11219 ceph_assert(srcfrags
.size() == 1);
11220 CDir
*dir
= srcfrags
.front();
11222 dir
->split(bits
, resultfrags
, waiters
, replay
);
11224 // did i change the subtree map?
11225 if (dir
->is_subtree_root()) {
11226 // new frags are now separate subtrees
11227 for (const auto& dir
: *resultfrags
) {
11228 subtrees
[dir
].clear(); // new frag is now its own subtree
11232 if (parent_subtree
) {
11233 ceph_assert(subtrees
[parent_subtree
].count(dir
));
11234 subtrees
[parent_subtree
].erase(dir
);
11235 for (const auto& dir
: *resultfrags
) {
11236 ceph_assert(dir
->is_subtree_root());
11237 subtrees
[parent_subtree
].insert(dir
);
11241 // adjust my bounds.
11243 bounds
.swap(subtrees
[dir
]);
11244 subtrees
.erase(dir
);
11245 for (set
<CDir
*>::iterator p
= bounds
.begin();
11248 CDir
*frag
= get_subtree_root((*p
)->get_parent_dir());
11249 subtrees
[frag
].insert(*p
);
11255 diri
->close_dirfrag(dir
->get_frag());
11260 // are my constituent bits subtrees? if so, i will be too.
11261 // (it's all or none, actually.)
11262 bool any_subtree
= false, any_non_subtree
= false;
11263 for (const auto& dir
: srcfrags
) {
11264 if (dir
->is_subtree_root())
11265 any_subtree
= true;
11267 any_non_subtree
= true;
11269 ceph_assert(!any_subtree
|| !any_non_subtree
);
11271 set
<CDir
*> new_bounds
;
11273 for (const auto& dir
: srcfrags
) {
11274 // this simplifies the code that find subtrees underneath the dirfrag
11275 if (!dir
->is_subtree_root()) {
11276 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11277 adjust_subtree_auth(dir
, mds
->get_nodeid());
11281 for (const auto& dir
: srcfrags
) {
11282 ceph_assert(dir
->is_subtree_root());
11283 dout(10) << " taking srcfrag subtree bounds from " << *dir
<< dendl
;
11284 map
<CDir
*, set
<CDir
*> >::iterator q
= subtrees
.find(dir
);
11285 set
<CDir
*>::iterator r
= q
->second
.begin();
11286 while (r
!= subtrees
[dir
].end()) {
11287 new_bounds
.insert(*r
);
11288 subtrees
[dir
].erase(r
++);
11292 // remove myself as my parent's bound
11293 if (parent_subtree
)
11294 subtrees
[parent_subtree
].erase(dir
);
11299 CDir
*f
= new CDir(diri
, basefrag
, this, srcfrags
.front()->is_auth());
11300 f
->merge(srcfrags
, waiters
, replay
);
11303 ceph_assert(f
->is_subtree_root());
11304 subtrees
[f
].swap(new_bounds
);
11305 if (parent_subtree
)
11306 subtrees
[parent_subtree
].insert(f
);
11311 resultfrags
->push_back(f
);
11316 class C_MDC_FragmentFrozen
: public MDSInternalContext
{
11320 C_MDC_FragmentFrozen(MDCache
*m
, MDRequestRef
& r
) :
11321 MDSInternalContext(m
->mds
), mdcache(m
), mdr(r
) {}
11322 void finish(int r
) override
{
11323 mdcache
->fragment_frozen(mdr
, r
);
11327 bool MDCache::can_fragment(CInode
*diri
, const std::vector
<CDir
*>& dirs
)
11329 if (is_readonly()) {
11330 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl
;
11333 if (mds
->is_cluster_degraded()) {
11334 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl
;
11337 if (diri
->get_parent_dir() &&
11338 diri
->get_parent_dir()->get_inode()->is_stray()) {
11339 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl
;
11342 if (diri
->is_mdsdir() || diri
->ino() == CEPH_INO_CEPH
) {
11343 dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl
;
11347 for (const auto& dir
: dirs
) {
11348 if (dir
->scrub_is_in_progress()) {
11349 dout(7) << "can_fragment: scrub in progress " << *dir
<< dendl
;
11353 if (dir
->state_test(CDir::STATE_FRAGMENTING
)) {
11354 dout(7) << "can_fragment: already fragmenting " << *dir
<< dendl
;
11357 if (!dir
->is_auth()) {
11358 dout(7) << "can_fragment: not auth on " << *dir
<< dendl
;
11361 if (dir
->is_bad()) {
11362 dout(7) << "can_fragment: bad dirfrag " << *dir
<< dendl
;
11365 if (dir
->is_frozen() ||
11366 dir
->is_freezing()) {
11367 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl
;
11375 void MDCache::split_dir(CDir
*dir
, int bits
)
11377 dout(7) << __func__
<< " " << *dir
<< " bits " << bits
<< dendl
;
11378 ceph_assert(dir
->is_auth());
11379 CInode
*diri
= dir
->inode
;
11381 std::vector
<CDir
*> dirs
;
11382 dirs
.push_back(dir
);
11384 if (!can_fragment(diri
, dirs
)) {
11385 dout(7) << __func__
<< " cannot fragment right now, dropping" << dendl
;
11389 if (dir
->frag
.bits() + bits
> 24) {
11390 dout(7) << __func__
<< " frag bits > 24, dropping" << dendl
;
11394 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11395 mdr
->more()->fragment_base
= dir
->dirfrag();
11397 ceph_assert(fragments
.count(dir
->dirfrag()) == 0);
11398 fragment_info_t
& info
= fragments
[dir
->dirfrag()];
11400 info
.dirs
.push_back(dir
);
11402 info
.last_cum_auth_pins_change
= ceph_clock_now();
11404 fragment_freeze_dirs(dirs
);
11405 // initial mark+complete pass
11406 fragment_mark_and_complete(mdr
);
11409 void MDCache::merge_dir(CInode
*diri
, frag_t frag
)
11411 dout(7) << "merge_dir to " << frag
<< " on " << *diri
<< dendl
;
11413 auto&& [all
, dirs
] = diri
->get_dirfrags_under(frag
);
11415 dout(7) << "don't have all frags under " << frag
<< " for " << *diri
<< dendl
;
11419 if (diri
->dirfragtree
.is_leaf(frag
)) {
11420 dout(10) << " " << frag
<< " already a leaf for " << *diri
<< dendl
;
11424 if (!can_fragment(diri
, dirs
))
11427 CDir
*first
= dirs
.front();
11428 int bits
= first
->get_frag().bits() - frag
.bits();
11429 dout(10) << " we are merging by " << bits
<< " bits" << dendl
;
11431 dirfrag_t
basedirfrag(diri
->ino(), frag
);
11432 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11433 mdr
->more()->fragment_base
= basedirfrag
;
11435 ceph_assert(fragments
.count(basedirfrag
) == 0);
11436 fragment_info_t
& info
= fragments
[basedirfrag
];
11440 info
.last_cum_auth_pins_change
= ceph_clock_now();
11442 fragment_freeze_dirs(dirs
);
11443 // initial mark+complete pass
11444 fragment_mark_and_complete(mdr
);
11447 void MDCache::fragment_freeze_dirs(const std::vector
<CDir
*>& dirs
)
11449 bool any_subtree
= false, any_non_subtree
= false;
11450 for (const auto& dir
: dirs
) {
11451 dir
->auth_pin(dir
); // until we mark and complete them
11452 dir
->state_set(CDir::STATE_FRAGMENTING
);
11454 ceph_assert(dir
->is_freezing_dir());
11456 if (dir
->is_subtree_root())
11457 any_subtree
= true;
11459 any_non_subtree
= true;
11462 if (any_subtree
&& any_non_subtree
) {
11463 // either all dirfrags are subtree roots or all are not.
11464 for (const auto& dir
: dirs
) {
11465 if (dir
->is_subtree_root()) {
11466 ceph_assert(dir
->state_test(CDir::STATE_AUXSUBTREE
));
11468 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11469 adjust_subtree_auth(dir
, mds
->get_nodeid());
11475 class C_MDC_FragmentMarking
: public MDCacheContext
{
11478 C_MDC_FragmentMarking(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11479 void finish(int r
) override
{
11480 mdcache
->fragment_mark_and_complete(mdr
);
11484 void MDCache::fragment_mark_and_complete(MDRequestRef
& mdr
)
11486 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11487 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11488 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11489 dout(7) << "fragment_mark_and_complete " << basedirfrag
<< " must have aborted" << dendl
;
11490 request_finish(mdr
);
11494 fragment_info_t
& info
= it
->second
;
11495 CInode
*diri
= info
.dirs
.front()->get_inode();
11496 dout(10) << "fragment_mark_and_complete " << info
.dirs
<< " on " << *diri
<< dendl
;
11498 MDSGatherBuilder
gather(g_ceph_context
);
11500 for (const auto& dir
: info
.dirs
) {
11502 if (!dir
->is_complete()) {
11503 dout(15) << " fetching incomplete " << *dir
<< dendl
;
11504 dir
->fetch(gather
.new_sub(), true); // ignore authpinnability
11506 } else if (dir
->get_frag() == frag_t()) {
11507 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11508 // the operation. To avoid CDir::fetch() complaining about missing object,
11509 // we commit new dirfrag first.
11510 if (dir
->state_test(CDir::STATE_CREATING
)) {
11511 dout(15) << " waiting until new dir gets journaled " << *dir
<< dendl
;
11512 dir
->add_waiter(CDir::WAIT_CREATED
, gather
.new_sub());
11514 } else if (dir
->is_new()) {
11515 dout(15) << " committing new " << *dir
<< dendl
;
11516 ceph_assert(dir
->is_dirty());
11517 dir
->commit(0, gather
.new_sub(), true);
11524 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11525 dout(15) << " marking " << *dir
<< dendl
;
11526 for (auto &p
: dir
->items
) {
11527 CDentry
*dn
= p
.second
;
11528 dn
->get(CDentry::PIN_FRAGMENTING
);
11529 ceph_assert(!dn
->state_test(CDentry::STATE_FRAGMENTING
));
11530 dn
->state_set(CDentry::STATE_FRAGMENTING
);
11532 dir
->state_set(CDir::STATE_DNPINNEDFRAG
);
11533 dir
->auth_unpin(dir
);
11535 dout(15) << " already marked " << *dir
<< dendl
;
11538 if (gather
.has_subs()) {
11539 gather
.set_finisher(new C_MDC_FragmentMarking(this, mdr
));
11544 for (const auto& dir
: info
.dirs
) {
11545 if (!dir
->is_frozen_dir()) {
11546 ceph_assert(dir
->is_freezing_dir());
11547 dir
->add_waiter(CDir::WAIT_FROZEN
, gather
.new_sub());
11550 if (gather
.has_subs()) {
11551 gather
.set_finisher(new C_MDC_FragmentFrozen(this, mdr
));
11553 // flush log so that request auth_pins are retired
11554 mds
->mdlog
->flush();
11558 fragment_frozen(mdr
, 0);
11561 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector
<CDir
*>& dirs
)
11563 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs
<< dendl
;
11564 for (const auto& dir
: dirs
) {
11565 dout(10) << " frag " << *dir
<< dendl
;
11567 ceph_assert(dir
->state_test(CDir::STATE_FRAGMENTING
));
11568 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11570 if (dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11571 dir
->state_clear(CDir::STATE_DNPINNEDFRAG
);
11573 for (auto &p
: dir
->items
) {
11574 CDentry
*dn
= p
.second
;
11575 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11576 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11577 dn
->put(CDentry::PIN_FRAGMENTING
);
11580 dir
->auth_unpin(dir
);
11583 dir
->unfreeze_dir();
11587 bool MDCache::fragment_are_all_frozen(CDir
*dir
)
11589 ceph_assert(dir
->is_frozen_dir());
11590 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11591 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11592 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11594 if (p
->first
.frag
.contains(dir
->get_frag()))
11595 return p
->second
.all_frozen
;
11601 void MDCache::fragment_freeze_inc_num_waiters(CDir
*dir
)
11603 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11604 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11605 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11607 if (p
->first
.frag
.contains(dir
->get_frag())) {
11608 p
->second
.num_remote_waiters
++;
11615 void MDCache::find_stale_fragment_freeze()
11617 dout(10) << "find_stale_fragment_freeze" << dendl
;
11618 // see comment in Migrator::find_stale_export_freeze()
11619 utime_t now
= ceph_clock_now();
11620 utime_t cutoff
= now
;
11621 cutoff
-= g_conf()->mds_freeze_tree_timeout
;
11623 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
11624 p
!= fragments
.end(); ) {
11625 dirfrag_t df
= p
->first
;
11626 fragment_info_t
& info
= p
->second
;
11628 if (info
.all_frozen
)
11631 int total_auth_pins
= 0;
11632 for (const auto& d
: info
.dirs
) {
11634 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11635 total_auth_pins
= -1;
11638 if (dir
->is_frozen_dir())
11640 total_auth_pins
+= dir
->get_auth_pins() + dir
->get_dir_auth_pins();
11642 if (total_auth_pins
< 0)
11644 if (info
.last_cum_auth_pins
!= total_auth_pins
) {
11645 info
.last_cum_auth_pins
= total_auth_pins
;
11646 info
.last_cum_auth_pins_change
= now
;
11649 if (info
.last_cum_auth_pins_change
>= cutoff
)
11651 dir
= info
.dirs
.front();
11652 if (info
.num_remote_waiters
> 0 ||
11653 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
11654 dout(10) << " cancel fragmenting " << df
<< " bit " << info
.bits
<< dendl
;
11655 std::vector
<CDir
*> dirs
;
11656 info
.dirs
.swap(dirs
);
11657 fragments
.erase(df
);
11658 fragment_unmark_unfreeze_dirs(dirs
);
11663 class C_MDC_FragmentPrep
: public MDCacheLogContext
{
11666 C_MDC_FragmentPrep(MDCache
*m
, MDRequestRef
& r
) : MDCacheLogContext(m
), mdr(r
) {}
11667 void finish(int r
) override
{
11668 mdcache
->_fragment_logged(mdr
);
11672 class C_MDC_FragmentStore
: public MDCacheContext
{
11675 C_MDC_FragmentStore(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11676 void finish(int r
) override
{
11677 mdcache
->_fragment_stored(mdr
);
11681 class C_MDC_FragmentCommit
: public MDCacheLogContext
{
11682 dirfrag_t basedirfrag
;
11685 C_MDC_FragmentCommit(MDCache
*m
, dirfrag_t df
, const MDRequestRef
& r
) :
11686 MDCacheLogContext(m
), basedirfrag(df
), mdr(r
) {}
11687 void finish(int r
) override
{
11688 mdcache
->_fragment_committed(basedirfrag
, mdr
);
11692 class C_IO_MDC_FragmentPurgeOld
: public MDCacheIOContext
{
11693 dirfrag_t basedirfrag
;
11697 C_IO_MDC_FragmentPurgeOld(MDCache
*m
, dirfrag_t f
, int b
,
11698 const MDRequestRef
& r
) :
11699 MDCacheIOContext(m
), basedirfrag(f
), bits(b
), mdr(r
) {}
11700 void finish(int r
) override
{
11701 ceph_assert(r
== 0 || r
== -CEPHFS_ENOENT
);
11702 mdcache
->_fragment_old_purged(basedirfrag
, bits
, mdr
);
11704 void print(ostream
& out
) const override
{
11705 out
<< "fragment_purge_old(" << basedirfrag
<< ")";
11709 void MDCache::fragment_frozen(MDRequestRef
& mdr
, int r
)
11711 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11712 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11713 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11714 dout(7) << "fragment_frozen " << basedirfrag
<< " must have aborted" << dendl
;
11715 request_finish(mdr
);
11719 ceph_assert(r
== 0);
11720 fragment_info_t
& info
= it
->second
;
11721 dout(10) << "fragment_frozen " << basedirfrag
.frag
<< " by " << info
.bits
11722 << " on " << info
.dirs
.front()->get_inode() << dendl
;
11724 info
.all_frozen
= true;
11725 dispatch_fragment_dir(mdr
);
11728 void MDCache::dispatch_fragment_dir(MDRequestRef
& mdr
)
11730 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11731 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11732 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11733 dout(7) << "dispatch_fragment_dir " << basedirfrag
<< " must have aborted" << dendl
;
11734 request_finish(mdr
);
11738 fragment_info_t
& info
= it
->second
;
11739 CInode
*diri
= info
.dirs
.front()->get_inode();
11741 dout(10) << "dispatch_fragment_dir " << basedirfrag
<< " bits " << info
.bits
11742 << " on " << *diri
<< dendl
;
11744 if (mdr
->more()->peer_error
)
11745 mdr
->aborted
= true;
11747 if (!mdr
->aborted
) {
11748 MutationImpl::LockOpVec lov
;
11749 lov
.add_wrlock(&diri
->dirfragtreelock
);
11750 // prevent a racing gather on any other scatterlocks too
11751 lov
.lock_scatter_gather(&diri
->nestlock
);
11752 lov
.lock_scatter_gather(&diri
->filelock
);
11753 if (!mds
->locker
->acquire_locks(mdr
, lov
, NULL
, true)) {
11759 if (mdr
->aborted
) {
11760 dout(10) << " can't auth_pin " << *diri
<< ", requeuing dir "
11761 << info
.dirs
.front()->dirfrag() << dendl
;
11763 mds
->balancer
->queue_split(info
.dirs
.front(), false);
11765 mds
->balancer
->queue_merge(info
.dirs
.front());
11766 fragment_unmark_unfreeze_dirs(info
.dirs
);
11767 fragments
.erase(it
);
11768 request_finish(mdr
);
11772 mdr
->ls
= mds
->mdlog
->get_current_segment();
11773 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_PREPARE
, basedirfrag
, info
.bits
);
11774 mds
->mdlog
->start_entry(le
);
11776 for (const auto& dir
: info
.dirs
) {
11777 dirfrag_rollback rollback
;
11778 rollback
.fnode
= dir
->fnode
;
11779 le
->add_orig_frag(dir
->get_frag(), &rollback
);
11783 MDSContext::vec waiters
;
11784 adjust_dir_fragments(diri
, info
.dirs
, basedirfrag
.frag
, info
.bits
,
11785 &info
.resultfrags
, waiters
, false);
11786 if (g_conf()->mds_debug_frag
)
11787 diri
->verify_dirfrags();
11788 mds
->queue_waiters(waiters
);
11790 for (const auto& fg
: le
->orig_frags
)
11791 ceph_assert(!diri
->dirfragtree
.is_leaf(fg
));
11793 le
->metablob
.add_dir_context(info
.resultfrags
.front());
11794 for (const auto& dir
: info
.resultfrags
) {
11795 if (diri
->is_auth()) {
11796 le
->metablob
.add_fragmented_dir(dir
, false, false);
11798 dir
->state_set(CDir::STATE_DIRTYDFT
);
11799 le
->metablob
.add_fragmented_dir(dir
, false, true);
11804 if (diri
->is_auth()) {
11805 // journal dirfragtree
11806 auto pi
= diri
->project_inode(mdr
);
11807 pi
.inode
->version
= diri
->pre_dirty();
11808 predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
);
11809 journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11811 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
11812 mdr
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
11813 mdr
->add_updated_lock(&diri
->dirfragtreelock
);
11818 mds->locker->mark_updated_scatterlock(&diri->filelock);
11819 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11820 mut->add_updated_lock(&diri->filelock);
11823 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11824 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11825 mut->add_updated_lock(&diri->nestlock);
11828 add_uncommitted_fragment(basedirfrag
, info
.bits
, le
->orig_frags
, mdr
->ls
);
11829 mds
->server
->submit_mdlog_entry(le
, new C_MDC_FragmentPrep(this, mdr
),
11831 mds
->mdlog
->flush();
11834 void MDCache::_fragment_logged(MDRequestRef
& mdr
)
11836 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11837 auto& info
= fragments
.at(basedirfrag
);
11838 CInode
*diri
= info
.resultfrags
.front()->get_inode();
11840 dout(10) << "fragment_logged " << basedirfrag
<< " bits " << info
.bits
11841 << " on " << *diri
<< dendl
;
11842 mdr
->mark_event("prepare logged");
11844 mdr
->apply(); // mark scatterlock
11846 // store resulting frags
11847 MDSGatherBuilder
gather(g_ceph_context
, new C_MDC_FragmentStore(this, mdr
));
11849 for (const auto& dir
: info
.resultfrags
) {
11850 dout(10) << " storing result frag " << *dir
<< dendl
;
11852 dir
->mark_dirty(mdr
->ls
);
11853 dir
->mark_new(mdr
->ls
);
11855 // freeze and store them too
11856 dir
->auth_pin(this);
11857 dir
->state_set(CDir::STATE_FRAGMENTING
);
11858 dir
->commit(0, gather
.new_sub(), true); // ignore authpinnability
11864 void MDCache::_fragment_stored(MDRequestRef
& mdr
)
11866 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11867 fragment_info_t
&info
= fragments
.at(basedirfrag
);
11868 CDir
*first
= info
.resultfrags
.front();
11869 CInode
*diri
= first
->get_inode();
11871 dout(10) << "fragment_stored " << basedirfrag
<< " bits " << info
.bits
11872 << " on " << *diri
<< dendl
;
11873 mdr
->mark_event("new frags stored");
11876 mds_rank_t diri_auth
= (first
->is_subtree_root() && !diri
->is_auth()) ?
11877 diri
->authority().first
: CDIR_AUTH_UNKNOWN
;
11878 for (const auto &p
: first
->get_replicas()) {
11879 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
11880 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
11881 rejoin_gather
.count(p
.first
)))
11884 auto notify
= make_message
<MMDSFragmentNotify
>(basedirfrag
, info
.bits
, mdr
->reqid
.tid
);
11885 if (diri_auth
!= CDIR_AUTH_UNKNOWN
&& // subtree root
11886 diri_auth
!= p
.first
) { // not auth mds of diri
11888 * In the nornal case, mds does not trim dir inode whose child dirfrags
11889 * are likely being fragmented (see trim_inode()). But when fragmenting
11890 * subtree roots, following race can happen:
11892 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
11893 * mds.c and drops wrlock on dirfragtreelock.
11894 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
11895 * SYNC and send lock message mds.c
11896 * - mds.c receives the lock message and changes dirfragtreelock state
11898 * - mds.c trim dirfrag and dir inode from its cache
11899 * - mds.c receives the fragment_notify message
11901 * So we need to ensure replicas have received the notify, then unlock
11902 * the dirfragtreelock.
11904 notify
->mark_ack_wanted();
11905 info
.notify_ack_waiting
.insert(p
.first
);
11908 // freshly replicate new dirs to peers
11909 for (const auto& dir
: info
.resultfrags
) {
11910 encode_replica_dir(dir
, p
.first
, notify
->basebl
);
11913 mds
->send_message_mds(notify
, p
.first
);
11917 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_COMMIT
, basedirfrag
, info
.bits
);
11918 mds
->mdlog
->start_submit_entry(le
, new C_MDC_FragmentCommit(this, basedirfrag
, mdr
));
11921 // unfreeze resulting frags
11922 for (const auto& dir
: info
.resultfrags
) {
11923 dout(10) << " result frag " << *dir
<< dendl
;
11925 for (auto &p
: dir
->items
) {
11926 CDentry
*dn
= p
.second
;
11927 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11928 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11929 dn
->put(CDentry::PIN_FRAGMENTING
);
11933 dir
->unfreeze_dir();
11936 if (info
.notify_ack_waiting
.empty()) {
11937 fragment_drop_locks(info
);
11939 mds
->locker
->drop_locks_for_fragment_unfreeze(mdr
.get());
11943 void MDCache::_fragment_committed(dirfrag_t basedirfrag
, const MDRequestRef
& mdr
)
11945 dout(10) << "fragment_committed " << basedirfrag
<< dendl
;
11947 mdr
->mark_event("commit logged");
11949 ufragment
&uf
= uncommitted_fragments
.at(basedirfrag
);
11951 // remove old frags
11952 C_GatherBuilder
gather(
11955 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag
, uf
.bits
, mdr
),
11958 SnapContext nullsnapc
;
11959 object_locator_t
oloc(mds
->get_metadata_pool());
11960 for (const auto& fg
: uf
.old_frags
) {
11961 object_t oid
= CInode::get_object_name(basedirfrag
.ino
, fg
, "");
11962 ObjectOperation op
;
11963 if (fg
== frag_t()) {
11964 // backtrace object
11965 dout(10) << " truncate orphan dirfrag " << oid
<< dendl
;
11969 dout(10) << " removing orphan dirfrag " << oid
<< dendl
;
11972 mds
->objecter
->mutate(oid
, oloc
, op
, nullsnapc
,
11973 ceph::real_clock::now(),
11974 0, gather
.new_sub());
11977 ceph_assert(gather
.has_subs());
11981 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag
, int bits
, const MDRequestRef
& mdr
)
11983 dout(10) << "fragment_old_purged " << basedirfrag
<< dendl
;
11985 mdr
->mark_event("old frags purged");
11987 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_FINISH
, basedirfrag
, bits
);
11988 mds
->mdlog
->start_submit_entry(le
);
11990 finish_uncommitted_fragment(basedirfrag
, EFragment::OP_FINISH
);
11994 mds
->logger
->inc(l_mds_dir_split
);
11996 mds
->logger
->inc(l_mds_dir_merge
);
12001 auto it
= fragments
.find(basedirfrag
);
12002 ceph_assert(it
!= fragments
.end());
12003 it
->second
.finishing
= true;
12004 if (it
->second
.notify_ack_waiting
.empty())
12005 fragment_maybe_finish(it
);
12007 mdr
->mark_event("wating for notify acks");
12011 void MDCache::fragment_drop_locks(fragment_info_t
& info
)
12013 mds
->locker
->drop_locks(info
.mdr
.get());
12014 request_finish(info
.mdr
);
12015 //info.mdr.reset();
12018 void MDCache::fragment_maybe_finish(const fragment_info_iterator
& it
)
12020 if (!it
->second
.finishing
)
12023 // unmark & auth_unpin
12024 for (const auto &dir
: it
->second
.resultfrags
) {
12025 dir
->state_clear(CDir::STATE_FRAGMENTING
);
12026 dir
->auth_unpin(this);
12028 // In case the resulting fragments are beyond the split size,
12029 // we might need to split them again right away (they could
12030 // have been taking inserts between unfreezing and getting
12032 mds
->balancer
->maybe_fragment(dir
, false);
12035 fragments
.erase(it
);
12039 void MDCache::handle_fragment_notify_ack(const cref_t
<MMDSFragmentNotifyAck
> &ack
)
12041 dout(10) << "handle_fragment_notify_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
12042 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
12044 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
12048 auto it
= fragments
.find(ack
->get_base_dirfrag());
12049 if (it
== fragments
.end() ||
12050 it
->second
.get_tid() != ack
->get_tid()) {
12051 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl
;
12055 if (it
->second
.notify_ack_waiting
.erase(from
) &&
12056 it
->second
.notify_ack_waiting
.empty()) {
12057 fragment_drop_locks(it
->second
);
12058 fragment_maybe_finish(it
);
12062 void MDCache::handle_fragment_notify(const cref_t
<MMDSFragmentNotify
> ¬ify
)
12064 dout(10) << "handle_fragment_notify " << *notify
<< " from " << notify
->get_source() << dendl
;
12065 mds_rank_t from
= mds_rank_t(notify
->get_source().num());
12067 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
12071 CInode
*diri
= get_inode(notify
->get_ino());
12073 frag_t base
= notify
->get_basefrag();
12074 int bits
= notify
->get_bits();
12077 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12078 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12079 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12080 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
12086 MDSContext::vec waiters
;
12087 std::vector
<CDir
*> resultfrags
;
12088 adjust_dir_fragments(diri
, base
, bits
, &resultfrags
, waiters
, false);
12089 if (g_conf()->mds_debug_frag
)
12090 diri
->verify_dirfrags();
12092 for (const auto& dir
: resultfrags
) {
12093 diri
->take_dir_waiting(dir
->get_frag(), waiters
);
12096 // add new replica dirs values
12097 auto p
= notify
->basebl
.cbegin();
12099 CDir
*tmp_dir
= nullptr;
12100 decode_replica_dir(tmp_dir
, p
, diri
, from
, waiters
);
12103 mds
->queue_waiters(waiters
);
12108 if (notify
->is_ack_wanted()) {
12109 auto ack
= make_message
<MMDSFragmentNotifyAck
>(notify
->get_base_dirfrag(),
12110 notify
->get_bits(), notify
->get_tid());
12111 mds
->send_message_mds(ack
, from
);
12115 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frags
,
12116 LogSegment
*ls
, bufferlist
*rollback
)
12118 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag
<< " bits " << bits
<< dendl
;
12119 ceph_assert(!uncommitted_fragments
.count(basedirfrag
));
12120 ufragment
& uf
= uncommitted_fragments
[basedirfrag
];
12121 uf
.old_frags
= old_frags
;
12124 ls
->uncommitted_fragments
.insert(basedirfrag
);
12126 uf
.rollback
.swap(*rollback
);
12129 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
)
12131 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12132 << " op " << EFragment::op_name(op
) << dendl
;
12133 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
12134 if (it
!= uncommitted_fragments
.end()) {
12135 ufragment
& uf
= it
->second
;
12136 if (op
!= EFragment::OP_FINISH
&& !uf
.old_frags
.empty()) {
12137 uf
.committed
= true;
12139 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
12140 mds
->queue_waiters(uf
.waiters
);
12141 uncommitted_fragments
.erase(it
);
12146 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
)
12148 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12149 << " old_frags (" << old_frags
<< ")" << dendl
;
12150 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
12151 if (it
!= uncommitted_fragments
.end()) {
12152 ufragment
& uf
= it
->second
;
12153 if (!uf
.old_frags
.empty()) {
12154 uf
.old_frags
= std::move(old_frags
);
12155 uf
.committed
= true;
12157 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
12158 uncommitted_fragments
.erase(it
);
12163 void MDCache::wait_for_uncommitted_fragments(MDSContext
* finisher
)
12165 MDSGatherBuilder
gather(g_ceph_context
, finisher
);
12166 for (auto& p
: uncommitted_fragments
) {
12167 p
.second
.waiters
.push_back(gather
.new_sub());
12172 struct C_MDC_FragmentRollback
: public MDCacheLogContext
{
12174 C_MDC_FragmentRollback(MDCache
*c
, MutationRef
& m
) :
12175 MDCacheLogContext(c
), mut(m
) {}
12176 void finish(int r
) override
{
12178 get_mds()->locker
->drop_locks(mut
.get());
12183 void MDCache::rollback_uncommitted_fragments()
12185 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments
.size() << " pending" << dendl
;
12186 for (map
<dirfrag_t
, ufragment
>::iterator p
= uncommitted_fragments
.begin();
12187 p
!= uncommitted_fragments
.end();
12189 ufragment
&uf
= p
->second
;
12190 CInode
*diri
= get_inode(p
->first
.ino
);
12193 if (uf
.committed
) {
12194 _fragment_committed(p
->first
, MDRequestRef());
12198 dout(10) << " rolling back " << p
->first
<< " refragment by " << uf
.bits
<< " bits" << dendl
;
12200 MutationRef
mut(new MutationImpl());
12201 mut
->ls
= mds
->mdlog
->get_current_segment();
12202 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_ROLLBACK
, p
->first
, uf
.bits
);
12203 mds
->mdlog
->start_entry(le
);
12204 bool diri_auth
= (diri
->authority() != CDIR_AUTH_UNDEF
);
12206 frag_vec_t old_frags
;
12207 diri
->dirfragtree
.get_leaves_under(p
->first
.frag
, old_frags
);
12209 std::vector
<CDir
*> resultfrags
;
12210 if (uf
.old_frags
.empty()) {
12211 // created by old format EFragment
12212 MDSContext::vec waiters
;
12213 adjust_dir_fragments(diri
, p
->first
.frag
, -uf
.bits
, &resultfrags
, waiters
, true);
12215 auto bp
= uf
.rollback
.cbegin();
12216 for (const auto& fg
: uf
.old_frags
) {
12217 CDir
*dir
= force_dir_fragment(diri
, fg
);
12218 resultfrags
.push_back(dir
);
12220 dirfrag_rollback rollback
;
12221 decode(rollback
, bp
);
12223 dir
->fnode
= rollback
.fnode
;
12225 dir
->mark_dirty(mut
->ls
);
12227 if (!(dir
->get_fnode()->rstat
== dir
->get_fnode()->accounted_rstat
)) {
12228 dout(10) << " dirty nestinfo on " << *dir
<< dendl
;
12229 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
12230 mut
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
12231 mut
->add_updated_lock(&diri
->nestlock
);
12233 if (!(dir
->get_fnode()->fragstat
== dir
->get_fnode()->accounted_fragstat
)) {
12234 dout(10) << " dirty fragstat on " << *dir
<< dendl
;
12235 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
12236 mut
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
12237 mut
->add_updated_lock(&diri
->filelock
);
12240 le
->add_orig_frag(dir
->get_frag());
12241 le
->metablob
.add_dir_context(dir
);
12243 le
->metablob
.add_fragmented_dir(dir
, true, false);
12245 dout(10) << " dirty dirfragtree on " << *dir
<< dendl
;
12246 dir
->state_set(CDir::STATE_DIRTYDFT
);
12247 le
->metablob
.add_fragmented_dir(dir
, true, true);
12253 auto pi
= diri
->project_inode(mut
);
12254 pi
.inode
->version
= diri
->pre_dirty();
12255 predirty_journal_parents(mut
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
);
12256 le
->metablob
.add_primary_dentry(diri
->get_projected_parent_dn(), diri
, true);
12258 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
12259 mut
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
12260 mut
->add_updated_lock(&diri
->dirfragtreelock
);
12263 if (g_conf()->mds_debug_frag
)
12264 diri
->verify_dirfrags();
12266 for (const auto& leaf
: old_frags
) {
12267 ceph_assert(!diri
->dirfragtree
.is_leaf(leaf
));
12270 mds
->mdlog
->submit_entry(le
, new C_MDC_FragmentRollback(this, mut
));
12272 uf
.old_frags
.swap(old_frags
);
12273 _fragment_committed(p
->first
, MDRequestRef());
12277 void MDCache::force_readonly()
12282 dout(1) << "force file system read-only" << dendl
;
12283 mds
->clog
->warn() << "force file system read-only";
12287 mds
->server
->force_clients_readonly();
12289 // revoke write caps
12291 for (auto &p
: inode_map
) {
12292 CInode
*in
= p
.second
;
12294 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
12295 if (!(++count
% 1000))
12296 mds
->heartbeat_reset();
12299 mds
->mdlog
->flush();
12303 // ==============================================================
12306 void MDCache::show_subtrees(int dbl
, bool force_print
)
12308 if (g_conf()->mds_thrash_exports
)
12311 //dout(10) << "show_subtrees" << dendl;
12313 if (!g_conf()->subsys
.should_gather(ceph_subsys_mds
, dbl
))
12314 return; // i won't print anything.
12316 if (subtrees
.empty()) {
12317 dout(ceph::dout::need_dynamic(dbl
)) << "show_subtrees - no subtrees"
12322 if (!force_print
&& subtrees
.size() > SUBTREES_COUNT_THRESHOLD
&&
12323 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12324 dout(ceph::dout::need_dynamic(dbl
)) << "number of subtrees = " << subtrees
.size() << "; not "
12325 "printing subtrees" << dendl
;
12330 std::vector
<CDir
*> basefrags
;
12331 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
12332 p
!= base_inodes
.end();
12334 (*p
)->get_dirfrags(basefrags
);
12335 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12336 dout(15) << "show_subtrees" << dendl
;
12339 list
<pair
<CDir
*,int> > q
;
12344 for (const auto& dir
: basefrags
) {
12345 q
.emplace_back(dir
, 0);
12348 set
<CDir
*> subtrees_seen
;
12350 unsigned int depth
= 0;
12351 while (!q
.empty()) {
12352 CDir
*dir
= q
.front().first
;
12353 unsigned int d
= q
.front().second
;
12356 if (subtrees
.count(dir
) == 0) continue;
12358 subtrees_seen
.insert(dir
);
12360 if (d
> depth
) depth
= d
;
12363 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12364 if (seen
.count(dir
)) dout(0) << "aah, already seen " << *dir
<< dendl
;
12365 ceph_assert(seen
.count(dir
) == 0);
12369 if (!subtrees
[dir
].empty()) {
12370 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12371 p
!= subtrees
[dir
].end();
12373 //dout(25) << " saw sub " << **p << dendl;
12374 q
.push_front(pair
<CDir
*,int>(*p
, d
+1));
12379 if (!force_print
&& depth
> SUBTREES_DEPTH_THRESHOLD
&&
12380 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12381 dout(ceph::dout::need_dynamic(dbl
)) << "max depth among subtrees = " << depth
<< "; not printing "
12382 "subtrees" << dendl
;
12387 for (const auto& dir
: basefrags
) {
12388 q
.emplace_back(dir
, 0);
12391 while (!q
.empty()) {
12392 CDir
*dir
= q
.front().first
;
12393 int d
= q
.front().second
;
12396 if (subtrees
.count(dir
) == 0) continue;
12399 while ((unsigned)d
< indent
.size())
12403 string pad
= "______________________________________";
12404 pad
.resize(depth
*2+1-indent
.size());
12405 if (!subtrees
[dir
].empty())
12406 pad
[0] = '.'; // parent
12410 if (dir
->is_auth())
12416 if (dir
->get_dir_auth().second
== CDIR_AUTH_UNKNOWN
)
12417 snprintf(s
, sizeof(s
), "%2d ", int(dir
->get_dir_auth().first
));
12419 snprintf(s
, sizeof(s
), "%2d,%2d", int(dir
->get_dir_auth().first
), int(dir
->get_dir_auth().second
));
12422 dout(ceph::dout::need_dynamic(dbl
)) << indent
<< "|_" << pad
<< s
12423 << " " << auth
<< *dir
<< dendl
;
12425 if (dir
->ino() == CEPH_INO_ROOT
)
12426 ceph_assert(dir
->inode
== root
);
12427 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
12428 ceph_assert(dir
->inode
== myin
);
12429 if (dir
->inode
->is_stray() && (MDS_INO_STRAY_OWNER(dir
->ino()) == mds
->get_nodeid()))
12430 ceph_assert(strays
[MDS_INO_STRAY_INDEX(dir
->ino())] == dir
->inode
);
12433 if (!subtrees
[dir
].empty()) {
12434 // more at my level?
12435 if (!q
.empty() && q
.front().second
== d
)
12440 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12441 p
!= subtrees
[dir
].end();
12443 q
.push_front(pair
<CDir
*,int>(*p
, d
+2));
12447 // verify there isn't stray crap in subtree map
12449 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
12450 p
!= subtrees
.end();
12452 if (subtrees_seen
.count(p
->first
)) continue;
12453 dout(10) << "*** stray/lost entry in subtree map: " << *p
->first
<< dendl
;
12456 ceph_assert(lost
== 0);
12459 void MDCache::show_cache()
12461 if (!g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 7>())
12463 dout(7) << "show_cache" << dendl
;
12465 auto show_func
= [this](CInode
*in
) {
12468 dout(7) << " unlinked " << *in
<< dendl
;
12471 auto&& dfs
= in
->get_dirfrags();
12472 for (const auto& dir
: dfs
) {
12473 dout(7) << " dirfrag " << *dir
<< dendl
;
12475 for (auto &p
: dir
->items
) {
12476 CDentry
*dn
= p
.second
;
12477 dout(7) << " dentry " << *dn
<< dendl
;
12478 CDentry::linkage_t
*dnl
= dn
->get_linkage();
12479 if (dnl
->is_primary() && dnl
->get_inode())
12480 dout(7) << " inode " << *dnl
->get_inode() << dendl
;
12485 for (auto &p
: inode_map
)
12486 show_func(p
.second
);
12487 for (auto &p
: snap_inode_map
)
12488 show_func(p
.second
);
12491 void MDCache::cache_status(Formatter
*f
)
12493 f
->open_object_section("cache");
12495 f
->open_object_section("pool");
12496 mempool::get_pool(mempool::mds_co::id
).dump(f
);
12497 f
->close_section();
12499 f
->close_section();
12502 void MDCache::dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
)
12505 if ((max_depth
>= 0) && (cur_depth
> max_depth
)) {
12508 auto&& ls
= in
->get_dirfrags();
12509 for (const auto &subdir
: ls
) {
12510 for (const auto &p
: subdir
->items
) {
12511 CDentry
*dn
= p
.second
;
12512 CInode
*in
= dn
->get_linkage()->get_inode();
12514 dump_tree(in
, cur_depth
+ 1, max_depth
, f
);
12518 f
->open_object_section("inode");
12519 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_DIRFRAGS
);
12520 f
->close_section();
12523 int MDCache::dump_cache(std::string_view file_name
)
12525 return dump_cache(file_name
, NULL
);
12528 int MDCache::dump_cache(Formatter
*f
)
12530 return dump_cache(std::string_view(""), f
);
12534 * Dump the metadata cache, either to a Formatter, if
12535 * provided, else to a plain text file.
12537 int MDCache::dump_cache(std::string_view fn
, Formatter
*f
)
12541 // dumping large caches may cause mds to hang or worse get killed.
12542 // so, disallow the dump if the cache size exceeds the configured
12543 // threshold, which is 1G for formatter and unlimited for file (note
12544 // that this can be jacked up by the admin... and is nothing but foot
12545 // shooting, but the option itself is for devs and hence dangerous to
12546 // tune). TODO: remove this when fixed.
12547 uint64_t threshold
= f
?
12548 g_conf().get_val
<Option::size_t>("mds_dump_cache_threshold_formatter") :
12549 g_conf().get_val
<Option::size_t>("mds_dump_cache_threshold_file");
12551 if (threshold
&& cache_size() > threshold
) {
12553 CachedStackStringStream css
;
12554 *css
<< "cache usage exceeds dump threshold";
12555 f
->open_object_section("result");
12556 f
->dump_string("error", css
->strv());
12557 f
->close_section();
12559 derr
<< "cache usage exceeds dump threshold" << dendl
;
12560 r
= -CEPHFS_EINVAL
;
12569 f
->open_array_section("inodes");
12571 char path
[PATH_MAX
] = "";
12573 snprintf(path
, sizeof path
, "%s", fn
.data());
12575 snprintf(path
, sizeof path
, "cachedump.%d.mds%d", (int)mds
->mdsmap
->get_epoch(), int(mds
->get_nodeid()));
12578 dout(1) << "dump_cache to " << path
<< dendl
;
12580 fd
= ::open(path
, O_WRONLY
|O_CREAT
|O_EXCL
|O_CLOEXEC
, 0600);
12582 derr
<< "failed to open " << path
<< ": " << cpp_strerror(errno
) << dendl
;
12587 auto dump_func
= [fd
, f
](CInode
*in
) {
12590 f
->open_object_section("inode");
12591 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_DIRFRAGS
);
12592 f
->close_section();
12595 CachedStackStringStream css
;
12596 *css
<< *in
<< std::endl
;
12597 auto sv
= css
->strv();
12598 r
= safe_write(fd
, sv
.data(), sv
.size());
12601 auto&& dfs
= in
->get_dirfrags();
12602 for (auto &dir
: dfs
) {
12603 CachedStackStringStream css2
;
12604 *css2
<< " " << *dir
<< std::endl
;
12605 auto sv
= css2
->strv();
12606 r
= safe_write(fd
, sv
.data(), sv
.size());
12609 for (auto &p
: dir
->items
) {
12610 CDentry
*dn
= p
.second
;
12611 CachedStackStringStream css3
;
12612 *css3
<< " " << *dn
<< std::endl
;
12613 auto sv
= css3
->strv();
12614 r
= safe_write(fd
, sv
.data(), sv
.size());
12618 dir
->check_rstats();
12623 for (auto &p
: inode_map
) {
12624 r
= dump_func(p
.second
);
12628 for (auto &p
: snap_inode_map
) {
12629 r
= dump_func(p
.second
);
12637 f
->close_section(); // inodes
12644 void C_MDS_RetryRequest::finish(int r
)
12647 cache
->dispatch_request(mdr
);
12650 MDSContext
*CF_MDS_RetryRequestFactory::build()
12653 mdcache
->mds
->locker
->drop_locks(mdr
.get(), nullptr);
12654 mdr
->drop_local_auth_pins();
12656 return new C_MDS_RetryRequest(mdcache
, mdr
);
12659 class C_MDS_EnqueueScrub
: public Context
12662 Formatter
*formatter
;
12663 Context
*on_finish
;
12665 ScrubHeaderRef header
;
12666 C_MDS_EnqueueScrub(std::string_view tag
, Formatter
*f
, Context
*fin
) :
12667 tag(tag
), formatter(f
), on_finish(fin
), header(nullptr) {}
12669 void finish(int r
) override
{
12670 formatter
->open_object_section("results");
12671 formatter
->dump_int("return_code", r
);
12673 formatter
->dump_string("scrub_tag", tag
);
12674 formatter
->dump_string("mode", "asynchronous");
12676 formatter
->close_section();
12680 on_finish
->complete(r
);
12684 void MDCache::enqueue_scrub(
12685 std::string_view path
,
12686 std::string_view tag
,
12687 bool force
, bool recursive
, bool repair
,
12688 Formatter
*f
, Context
*fin
)
12690 dout(10) << __func__
<< " " << path
<< dendl
;
12693 if (path
.compare(0, 4, "~mds") == 0) {
12695 if (path
== "~mdsdir") {
12696 rank
= mds
->get_nodeid();
12699 rank
= strict_strtoll(path
.substr(4), 10, &err
);
12701 rank
= MDS_RANK_NONE
;
12703 if (rank
>= 0 && rank
< MAX_MDS
)
12704 fp
.set_path("", MDS_INO_MDSDIR(rank
));
12706 if (fp
.get_ino() == inodeno_t(0))
12709 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB
);
12710 mdr
->set_filepath(fp
);
12712 bool is_internal
= false;
12713 std::string
tag_str(tag
);
12714 if (tag_str
.empty()) {
12716 uuid_gen
.generate_random();
12717 tag_str
= uuid_gen
.to_string();
12718 is_internal
= true;
12721 C_MDS_EnqueueScrub
*cs
= new C_MDS_EnqueueScrub(tag_str
, f
, fin
);
12722 cs
->header
= std::make_shared
<ScrubHeader
>(tag_str
, is_internal
, force
, recursive
, repair
);
12724 mdr
->internal_op_finish
= cs
;
12725 enqueue_scrub_work(mdr
);
12728 void MDCache::enqueue_scrub_work(MDRequestRef
& mdr
)
12731 CF_MDS_RetryRequestFactory
cf(this, mdr
, true);
12732 int r
= path_traverse(mdr
, cf
, mdr
->get_filepath(),
12733 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_RDLOCK_PATH
,
12738 mds
->server
->respond_to_request(mdr
, r
);
12742 // Cannot scrub same dentry twice at same time
12743 if (in
->scrub_is_in_progress()) {
12744 mds
->server
->respond_to_request(mdr
, -CEPHFS_EBUSY
);
12750 C_MDS_EnqueueScrub
*cs
= static_cast<C_MDS_EnqueueScrub
*>(mdr
->internal_op_finish
);
12751 ScrubHeaderRef
& header
= cs
->header
;
12753 r
= mds
->scrubstack
->enqueue(in
, header
, !header
->get_recursive());
12755 mds
->server
->respond_to_request(mdr
, r
);
12758 struct C_MDC_RespondInternalRequest
: public MDCacheLogContext
{
12760 C_MDC_RespondInternalRequest(MDCache
*c
, MDRequestRef
& m
) :
12761 MDCacheLogContext(c
), mdr(m
) {}
12762 void finish(int r
) override
{
12764 get_mds()->server
->respond_to_request(mdr
, r
);
12768 struct C_MDC_ScrubRepaired
: public MDCacheContext
{
12769 ScrubHeaderRef header
;
12771 C_MDC_ScrubRepaired(MDCache
*m
, const ScrubHeaderRef
& h
)
12772 : MDCacheContext(m
), header(h
) {
12773 header
->inc_num_pending();
12775 void finish(int r
) override
{
12776 header
->dec_num_pending();
12780 void MDCache::repair_dirfrag_stats(CDir
*dir
)
12782 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS
);
12784 mdr
->internal_op_private
= dir
;
12785 if (dir
->scrub_is_in_progress())
12786 mdr
->internal_op_finish
= new C_MDC_ScrubRepaired(this, dir
->get_scrub_header());
12788 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
12789 repair_dirfrag_stats_work(mdr
);
12792 void MDCache::repair_dirfrag_stats_work(MDRequestRef
& mdr
)
12794 CDir
*dir
= static_cast<CDir
*>(mdr
->internal_op_private
);
12795 dout(10) << __func__
<< " " << *dir
<< dendl
;
12797 if (!dir
->is_auth()) {
12798 mds
->server
->respond_to_request(mdr
, -CEPHFS_ESTALE
);
12802 if (!mdr
->is_auth_pinned(dir
) && !dir
->can_auth_pin()) {
12803 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(this, mdr
));
12805 mds
->locker
->drop_locks(mdr
.get());
12806 mdr
->drop_local_auth_pins();
12807 if (mdr
->is_any_remote_auth_pin())
12808 mds
->locker
->notify_freeze_waiter(dir
);
12812 mdr
->auth_pin(dir
);
12814 MutationImpl::LockOpVec lov
;
12815 CInode
*diri
= dir
->inode
;
12816 lov
.add_rdlock(&diri
->dirfragtreelock
);
12817 lov
.add_wrlock(&diri
->nestlock
);
12818 lov
.add_wrlock(&diri
->filelock
);
12819 if (!mds
->locker
->acquire_locks(mdr
, lov
))
12822 if (!dir
->is_complete()) {
12823 dir
->fetch(new C_MDS_RetryRequest(this, mdr
));
12827 frag_info_t frag_info
;
12828 nest_info_t nest_info
;
12829 for (auto it
= dir
->begin(); it
!= dir
->end(); ++it
) {
12830 CDentry
*dn
= it
->second
;
12831 if (dn
->last
!= CEPH_NOSNAP
)
12833 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
12834 if (dnl
->is_primary()) {
12835 CInode
*in
= dnl
->get_inode();
12836 nest_info
.add(in
->get_projected_inode()->accounted_rstat
);
12838 frag_info
.nsubdirs
++;
12840 frag_info
.nfiles
++;
12841 } else if (dnl
->is_remote())
12842 frag_info
.nfiles
++;
12845 auto pf
= dir
->get_projected_fnode();
12846 bool good_fragstat
= frag_info
.same_sums(pf
->fragstat
);
12847 bool good_rstat
= nest_info
.same_sums(pf
->rstat
);
12848 if (good_fragstat
&& good_rstat
) {
12849 dout(10) << __func__
<< " no corruption found" << dendl
;
12850 mds
->server
->respond_to_request(mdr
, 0);
12854 auto _pf
= dir
->project_fnode(mdr
);
12855 _pf
->version
= dir
->pre_dirty();
12858 mdr
->ls
= mds
->mdlog
->get_current_segment();
12859 EUpdate
*le
= new EUpdate(mds
->mdlog
, "repair_dirfrag");
12860 mds
->mdlog
->start_entry(le
);
12862 if (!good_fragstat
) {
12863 if (pf
->fragstat
.mtime
> frag_info
.mtime
)
12864 frag_info
.mtime
= pf
->fragstat
.mtime
;
12865 if (pf
->fragstat
.change_attr
> frag_info
.change_attr
)
12866 frag_info
.change_attr
= pf
->fragstat
.change_attr
;
12867 _pf
->fragstat
= frag_info
;
12868 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
12869 mdr
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
12870 mdr
->add_updated_lock(&diri
->filelock
);
12874 if (pf
->rstat
.rctime
> nest_info
.rctime
)
12875 nest_info
.rctime
= pf
->rstat
.rctime
;
12876 _pf
->rstat
= nest_info
;
12877 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
12878 mdr
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
12879 mdr
->add_updated_lock(&diri
->nestlock
);
12882 le
->metablob
.add_dir_context(dir
);
12883 le
->metablob
.add_dir(dir
, true);
12885 mds
->mdlog
->submit_entry(le
, new C_MDC_RespondInternalRequest(this, mdr
));
12888 void MDCache::repair_inode_stats(CInode
*diri
)
12890 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS
);
12891 mdr
->auth_pin(diri
); // already auth pinned by CInode::validate_disk_state()
12892 mdr
->internal_op_private
= diri
;
12893 if (diri
->scrub_is_in_progress())
12894 mdr
->internal_op_finish
= new C_MDC_ScrubRepaired(this, diri
->get_scrub_header());
12896 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
12897 repair_inode_stats_work(mdr
);
12900 void MDCache::repair_inode_stats_work(MDRequestRef
& mdr
)
12902 CInode
*diri
= static_cast<CInode
*>(mdr
->internal_op_private
);
12903 dout(10) << __func__
<< " " << *diri
<< dendl
;
12905 if (!diri
->is_auth()) {
12906 mds
->server
->respond_to_request(mdr
, -CEPHFS_ESTALE
);
12909 if (!diri
->is_dir()) {
12910 mds
->server
->respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
12914 MutationImpl::LockOpVec lov
;
12916 if (mdr
->ls
) // already marked filelock/nestlock dirty ?
12919 lov
.add_rdlock(&diri
->dirfragtreelock
);
12920 lov
.add_wrlock(&diri
->nestlock
);
12921 lov
.add_wrlock(&diri
->filelock
);
12922 if (!mds
->locker
->acquire_locks(mdr
, lov
))
12925 // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger
12926 // the scatter-gather process, which will fix any fragstat/rstat errors.
12929 diri
->dirfragtree
.get_leaves(leaves
);
12930 for (const auto& leaf
: leaves
) {
12931 CDir
*dir
= diri
->get_dirfrag(leaf
);
12933 ceph_assert(mdr
->is_auth_pinned(diri
));
12934 dir
= diri
->get_or_open_dirfrag(this, leaf
);
12936 if (dir
->get_version() == 0) {
12937 ceph_assert(dir
->is_auth());
12938 dir
->fetch(new C_MDS_RetryRequest(this, mdr
));
12944 diri
->state_set(CInode::STATE_REPAIRSTATS
);
12945 mdr
->ls
= mds
->mdlog
->get_current_segment();
12946 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
12947 mdr
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
12948 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
12949 mdr
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
12951 mds
->locker
->drop_locks(mdr
.get());
12954 // force the scatter-gather process
12956 lov
.add_rdlock(&diri
->dirfragtreelock
);
12957 lov
.add_rdlock(&diri
->nestlock
);
12958 lov
.add_rdlock(&diri
->filelock
);
12959 if (!mds
->locker
->acquire_locks(mdr
, lov
))
12962 diri
->state_clear(CInode::STATE_REPAIRSTATS
);
12964 frag_info_t dir_info
;
12965 nest_info_t nest_info
;
12966 nest_info
.rsubdirs
= 1; // it gets one to account for self
12967 if (const sr_t
*srnode
= diri
->get_projected_srnode(); srnode
)
12968 nest_info
.rsnaps
= srnode
->snaps
.size();
12972 diri
->dirfragtree
.get_leaves(leaves
);
12973 for (const auto& leaf
: leaves
) {
12974 CDir
*dir
= diri
->get_dirfrag(leaf
);
12976 ceph_assert(dir
->get_version() > 0);
12977 dir_info
.add(dir
->get_fnode()->accounted_fragstat
);
12978 nest_info
.add(dir
->get_fnode()->accounted_rstat
);
12982 if (!dir_info
.same_sums(diri
->get_inode()->dirstat
) ||
12983 !nest_info
.same_sums(diri
->get_inode()->rstat
)) {
12984 dout(10) << __func__
<< " failed to fix fragstat/rstat on "
12988 mds
->server
->respond_to_request(mdr
, 0);
12991 void MDCache::rdlock_dirfrags_stats(CInode
*diri
, MDSInternalContext
* fin
)
12993 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS
);
12994 mdr
->auth_pin(diri
); // already auth pinned by CInode::validate_disk_state()
12995 mdr
->internal_op_private
= diri
;
12996 mdr
->internal_op_finish
= fin
;
12997 return rdlock_dirfrags_stats_work(mdr
);
13000 void MDCache::rdlock_dirfrags_stats_work(MDRequestRef
& mdr
)
13002 CInode
*diri
= static_cast<CInode
*>(mdr
->internal_op_private
);
13003 dout(10) << __func__
<< " " << *diri
<< dendl
;
13004 if (!diri
->is_auth()) {
13005 mds
->server
->respond_to_request(mdr
, -CEPHFS_ESTALE
);
13008 if (!diri
->is_dir()) {
13009 mds
->server
->respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
13013 MutationImpl::LockOpVec lov
;
13014 lov
.add_rdlock(&diri
->dirfragtreelock
);
13015 lov
.add_rdlock(&diri
->nestlock
);
13016 lov
.add_rdlock(&diri
->filelock
);
13017 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13019 dout(10) << __func__
<< " start dirfrags : " << *diri
<< dendl
;
13021 mds
->server
->respond_to_request(mdr
, 0);
13025 void MDCache::flush_dentry(std::string_view path
, Context
*fin
)
13027 if (is_readonly()) {
13028 dout(10) << __func__
<< ": read-only FS" << dendl
;
13029 fin
->complete(-CEPHFS_EROFS
);
13032 dout(10) << "flush_dentry " << path
<< dendl
;
13033 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FLUSH
);
13035 mdr
->set_filepath(fp
);
13036 mdr
->internal_op_finish
= fin
;
13037 flush_dentry_work(mdr
);
13040 class C_FinishIOMDR
: public MDSContext
{
13044 MDSRank
*get_mds() override
{ return mds
; }
13046 C_FinishIOMDR(MDSRank
*mds_
, MDRequestRef
& mdr_
) : mds(mds_
), mdr(mdr_
) {}
13047 void finish(int r
) override
{ mds
->server
->respond_to_request(mdr
, r
); }
13050 void MDCache::flush_dentry_work(MDRequestRef
& mdr
)
13052 MutationImpl::LockOpVec lov
;
13053 CInode
*in
= mds
->server
->rdlock_path_pin_ref(mdr
, true);
13057 ceph_assert(in
->is_auth());
13058 in
->flush(new C_FinishIOMDR(mds
, mdr
));
13063 * Initialize performance counters with global perfcounter
13066 void MDCache::register_perfcounters()
13068 PerfCountersBuilder
pcb(g_ceph_context
, "mds_cache", l_mdc_first
, l_mdc_last
);
13070 // Stray/purge statistics
13071 pcb
.add_u64(l_mdc_num_strays
, "num_strays", "Stray dentries", "stry",
13072 PerfCountersBuilder::PRIO_INTERESTING
);
13073 pcb
.add_u64(l_mdc_num_recovering_enqueued
,
13074 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13075 PerfCountersBuilder::PRIO_INTERESTING
);
13076 pcb
.add_u64_counter(l_mdc_recovery_completed
,
13077 "recovery_completed", "File recoveries completed", "recd",
13078 PerfCountersBuilder::PRIO_INTERESTING
);
13080 // useful recovery queue statistics
13081 pcb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
13082 pcb
.add_u64(l_mdc_num_recovering_processing
, "num_recovering_processing",
13083 "Files currently being recovered");
13084 pcb
.add_u64(l_mdc_num_recovering_prioritized
, "num_recovering_prioritized",
13085 "Files waiting for recovery with elevated priority");
13086 pcb
.add_u64_counter(l_mdc_recovery_started
, "recovery_started",
13087 "File recoveries started");
13089 // along with other stray dentries stats
13090 pcb
.add_u64(l_mdc_num_strays_delayed
, "num_strays_delayed",
13091 "Stray dentries delayed");
13092 pcb
.add_u64(l_mdc_num_strays_enqueuing
, "num_strays_enqueuing",
13093 "Stray dentries enqueuing for purge");
13094 pcb
.add_u64_counter(l_mdc_strays_created
, "strays_created",
13095 "Stray dentries created");
13096 pcb
.add_u64_counter(l_mdc_strays_enqueued
, "strays_enqueued",
13097 "Stray dentries enqueued for purge");
13098 pcb
.add_u64_counter(l_mdc_strays_reintegrated
, "strays_reintegrated",
13099 "Stray dentries reintegrated");
13100 pcb
.add_u64_counter(l_mdc_strays_migrated
, "strays_migrated",
13101 "Stray dentries migrated");
13103 // low prio internal request stats
13104 pcb
.add_u64_counter(l_mdss_ireq_enqueue_scrub
, "ireq_enqueue_scrub",
13105 "Internal Request type enqueue scrub");
13106 pcb
.add_u64_counter(l_mdss_ireq_exportdir
, "ireq_exportdir",
13107 "Internal Request type export dir");
13108 pcb
.add_u64_counter(l_mdss_ireq_flush
, "ireq_flush",
13109 "Internal Request type flush");
13110 pcb
.add_u64_counter(l_mdss_ireq_fragmentdir
, "ireq_fragmentdir",
13111 "Internal Request type fragmentdir");
13112 pcb
.add_u64_counter(l_mdss_ireq_fragstats
, "ireq_fragstats",
13113 "Internal Request type frag stats");
13114 pcb
.add_u64_counter(l_mdss_ireq_inodestats
, "ireq_inodestats",
13115 "Internal Request type inode stats");
13117 logger
.reset(pcb
.create_perf_counters());
13118 g_ceph_context
->get_perfcounters_collection()->add(logger
.get());
13119 recovery_queue
.set_logger(logger
.get());
13120 stray_manager
.set_logger(logger
.get());
13124 * Call this when putting references to an inode/dentry or
13125 * when attempting to trim it.
13127 * If this inode is no longer linked by anyone, and this MDS
13128 * rank holds the primary dentry, and that dentry is in a stray
13129 * directory, then give up the dentry to the StrayManager, never
13130 * to be seen again by MDCache.
13132 * @param delay if true, then purgeable inodes are stashed til
13133 * the next trim(), rather than being purged right
13136 void MDCache::maybe_eval_stray(CInode
*in
, bool delay
) {
13137 if (in
->get_inode()->nlink
> 0 || in
->is_base() || is_readonly() ||
13138 mds
->get_state() <= MDSMap::STATE_REJOIN
)
13141 CDentry
*dn
= in
->get_projected_parent_dn();
13143 if (dn
->state_test(CDentry::STATE_PURGING
)) {
13144 /* We have already entered the purging process, no need
13145 * to re-evaluate me ! */
13149 if (dn
->get_dir()->get_inode()->is_stray()) {
13151 stray_manager
.queue_delayed(dn
);
13153 stray_manager
.eval_stray(dn
);
13157 void MDCache::clear_dirty_bits_for_stray(CInode
* diri
) {
13158 dout(10) << __func__
<< " " << *diri
<< dendl
;
13159 ceph_assert(diri
->get_projected_parent_dir()->inode
->is_stray());
13160 auto&& ls
= diri
->get_dirfrags();
13161 for (auto &p
: ls
) {
13162 if (p
->is_auth() && !(p
->is_frozen() || p
->is_freezing()))
13163 p
->try_remove_dentries_for_stray();
13165 if (!diri
->snaprealm
) {
13166 if (diri
->is_auth())
13167 diri
->clear_dirty_rstat();
13168 diri
->clear_scatter_dirty();
13172 bool MDCache::dump_inode(Formatter
*f
, uint64_t number
) {
13173 CInode
*in
= get_inode(number
);
13177 f
->open_object_section("inode");
13178 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_PATH
);
13179 f
->close_section();
13183 void MDCache::handle_mdsmap(const MDSMap
&mdsmap
, const MDSMap
&oldmap
) {
13184 const mds_rank_t max_mds
= mdsmap
.get_max_mds();
13186 // process export_pin_delayed_queue whenever a new MDSMap received
13187 auto &q
= export_pin_delayed_queue
;
13188 for (auto it
= q
.begin(); it
!= q
.end(); ) {
13190 mds_rank_t export_pin
= in
->get_export_pin(false);
13191 dout(10) << " delayed export_pin=" << export_pin
<< " on " << *in
13192 << " max_mds=" << max_mds
<< dendl
;
13193 if (export_pin
>= mdsmap
.get_max_mds()) {
13198 in
->state_clear(CInode::STATE_DELAYEDEXPORTPIN
);
13200 in
->queue_export_pin(export_pin
);
13203 if (mdsmap
.get_max_mds() != oldmap
.get_max_mds()) {
13204 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl
;
13205 /* copy to vector to avoid removals during iteration */
13206 std::vector
<CInode
*> migrate
;
13207 migrate
.assign(export_ephemeral_pins
.begin(), export_ephemeral_pins
.end());
13208 for (auto& in
: migrate
) {
13209 in
->maybe_export_pin();
13213 if (max_mds
<= 1) {
13214 export_ephemeral_dist_frag_bits
= 0;
13216 double want
= g_conf().get_val
<double>("mds_export_ephemeral_distributed_factor");
13219 while ((1U << n
) < (unsigned)want
)
13221 export_ephemeral_dist_frag_bits
= n
;
13225 void MDCache::upkeep_main(void)
13227 std::unique_lock
lock(upkeep_mutex
);
13228 while (!upkeep_trim_shutdown
.load()) {
13229 auto now
= clock::now();
13230 auto since
= now
-upkeep_last_trim
;
13231 auto trim_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_trim_interval"));
13232 if (since
>= trim_interval
*.90) {
13233 lock
.unlock(); /* mds_lock -> upkeep_mutex */
13234 std::scoped_lock
mds_lock(mds
->mds_lock
);
13236 if (upkeep_trim_shutdown
.load())
13238 check_memory_usage();
13239 if (mds
->is_cache_trimmable()) {
13240 dout(20) << "upkeep thread trimming cache; last trim " << since
<< " ago" << dendl
;
13241 bool active_with_clients
= mds
->is_active() || mds
->is_clientreplay() || mds
->is_stopping();
13242 if (active_with_clients
) {
13243 trim_client_leases();
13246 if (active_with_clients
) {
13247 auto recall_flags
= Server::RecallFlags::ENFORCE_MAX
|Server::RecallFlags::ENFORCE_LIVENESS
;
13248 if (cache_toofull()) {
13249 recall_flags
= recall_flags
|Server::RecallFlags::TRIM
;
13251 mds
->server
->recall_client_state(nullptr, recall_flags
);
13253 upkeep_last_trim
= now
= clock::now();
13255 dout(10) << "cache not ready for trimming" << dendl
;
13258 trim_interval
-= since
;
13260 since
= now
-upkeep_last_release
;
13261 auto release_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_release_free_interval"));
13262 if (since
>= release_interval
*.90) {
13263 /* XXX not necessary once MDCache uses PriorityCache */
13264 dout(10) << "releasing free memory" << dendl
;
13265 ceph_heap_release_free_memory();
13266 upkeep_last_release
= clock::now();
13268 release_interval
-= since
;
13270 auto interval
= std::min(release_interval
, trim_interval
);
13271 dout(20) << "upkeep thread waiting interval " << interval
<< dendl
;
13272 upkeep_cvar
.wait_for(lock
, interval
);