1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
18 #include <string_view>
26 #include "MDBalancer.h"
28 #include "ScrubStack.h"
30 #include "SnapClient.h"
39 #include "include/ceph_fs.h"
40 #include "include/filepath.h"
41 #include "include/util.h"
43 #include "messages/MClientCaps.h"
45 #include "msg/Message.h"
46 #include "msg/Messenger.h"
48 #include "common/MemoryModel.h"
49 #include "common/errno.h"
50 #include "common/perf_counters.h"
51 #include "common/safe_io.h"
53 #include "osdc/Journaler.h"
54 #include "osdc/Filer.h"
56 #include "events/ESubtreeMap.h"
57 #include "events/EUpdate.h"
58 #include "events/EPeerUpdate.h"
59 #include "events/EImportFinish.h"
60 #include "events/EFragment.h"
61 #include "events/ECommitted.h"
62 #include "events/EPurged.h"
63 #include "events/ESessions.h"
68 #include "common/Timer.h"
70 #include "perfglue/heap_profiler.h"
73 #include "common/config.h"
74 #include "include/ceph_assert.h"
76 #define dout_context g_ceph_context
77 #define dout_subsys ceph_subsys_mds
79 #define dout_prefix _prefix(_dout, mds)
83 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
84 return *_dout
<< "mds." << mds
->get_nodeid() << ".cache ";
// Out-of-line definition of SimpleLock's static member (declared in the
// lock header). NOTE(review): presumably a shared "no gatherers" default
// set -- confirm against SimpleLock's declaration.
set<int> SimpleLock::empty_gather_set;
91 * All non-I/O contexts that require a reference
92 * to an MDCache instance descend from this.
94 class MDCacheContext
: public virtual MDSContext
{
97 MDSRank
*get_mds() override
99 ceph_assert(mdcache
!= NULL
);
103 explicit MDCacheContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
106 class MDCacheLogContext
: public virtual MDSLogContextBase
{
109 MDSRank
*get_mds() override
111 ceph_assert(mdcache
!= NULL
);
115 explicit MDCacheLogContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
118 MDCache::MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
) :
121 filer(m
->objecter
, m
->finisher
),
122 stray_manager(m
, purge_queue_
),
124 trim_counter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"))
126 migrator
.reset(new Migrator(mds
, this));
128 max_dir_commit_size
= g_conf()->mds_dir_max_commit_size
?
129 (g_conf()->mds_dir_max_commit_size
<< 20) :
130 (0.9 *(g_conf()->osd_max_write_size
<< 20));
132 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
133 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
134 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
136 export_ephemeral_distributed_config
= g_conf().get_val
<bool>("mds_export_ephemeral_distributed");
137 export_ephemeral_random_config
= g_conf().get_val
<bool>("mds_export_ephemeral_random");
138 export_ephemeral_random_max
= g_conf().get_val
<double>("mds_export_ephemeral_random_max");
140 symlink_recovery
= g_conf().get_val
<bool>("mds_symlink_recovery");
142 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
144 bottom_lru
.lru_set_midpoint(0);
146 decayrate
.set_halflife(g_conf()->mds_decay_halflife
);
148 upkeeper
= std::thread(&MDCache::upkeep_main
, this);
154 g_ceph_context
->get_perfcounters_collection()->remove(logger
.get());
156 if (upkeeper
.joinable())
160 void MDCache::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mdsmap
)
162 dout(20) << "config changes: " << changed
<< dendl
;
163 if (changed
.count("mds_cache_memory_limit"))
164 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
165 if (changed
.count("mds_cache_reservation"))
166 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
168 bool ephemeral_pin_config_changed
= false;
169 if (changed
.count("mds_export_ephemeral_distributed")) {
170 export_ephemeral_distributed_config
= g_conf().get_val
<bool>("mds_export_ephemeral_distributed");
171 dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl
;
172 /* copy to vector to avoid removals during iteration */
173 ephemeral_pin_config_changed
= true;
175 if (changed
.count("mds_export_ephemeral_random")) {
176 export_ephemeral_random_config
= g_conf().get_val
<bool>("mds_export_ephemeral_random");
177 dout(10) << "Migrating any ephemeral random pinned inodes" << dendl
;
178 /* copy to vector to avoid removals during iteration */
179 ephemeral_pin_config_changed
= true;
181 if (ephemeral_pin_config_changed
) {
182 std::vector
<CInode
*> migrate
;
183 migrate
.assign(export_ephemeral_pins
.begin(), export_ephemeral_pins
.end());
184 for (auto& in
: migrate
) {
185 in
->maybe_export_pin(true);
188 if (changed
.count("mds_export_ephemeral_random_max")) {
189 export_ephemeral_random_max
= g_conf().get_val
<double>("mds_export_ephemeral_random_max");
191 if (changed
.count("mds_health_cache_threshold"))
192 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
193 if (changed
.count("mds_cache_mid"))
194 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
195 if (changed
.count("mds_cache_trim_decay_rate")) {
196 trim_counter
= DecayCounter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"));
198 if (changed
.count("mds_symlink_recovery")) {
199 symlink_recovery
= g_conf().get_val
<bool>("mds_symlink_recovery");
200 dout(10) << "Storing symlink targets on file object's head " << symlink_recovery
<< dendl
;
203 migrator
->handle_conf_change(changed
, mdsmap
);
204 mds
->balancer
->handle_conf_change(changed
, mdsmap
);
207 void MDCache::log_stat()
209 mds
->logger
->set(l_mds_inodes
, lru
.lru_get_size());
210 mds
->logger
->set(l_mds_inodes_pinned
, lru
.lru_get_num_pinned());
211 mds
->logger
->set(l_mds_inodes_top
, lru
.lru_get_top());
212 mds
->logger
->set(l_mds_inodes_bottom
, lru
.lru_get_bot());
213 mds
->logger
->set(l_mds_inodes_pin_tail
, lru
.lru_get_pintail());
214 mds
->logger
->set(l_mds_inodes_with_caps
, num_inodes_with_caps
);
215 mds
->logger
->set(l_mds_caps
, Capability::count());
217 mds
->logger
->set(l_mds_root_rfiles
, root
->get_inode()->rstat
.rfiles
);
218 mds
->logger
->set(l_mds_root_rbytes
, root
->get_inode()->rstat
.rbytes
);
219 mds
->logger
->set(l_mds_root_rsnaps
, root
->get_inode()->rstat
.rsnaps
);
226 bool MDCache::shutdown()
229 std::scoped_lock
lock(upkeep_mutex
);
230 upkeep_trim_shutdown
= true;
231 upkeep_cvar
.notify_one();
233 if (lru
.lru_get_size() > 0) {
234 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl
;
243 // ====================================================================
244 // some inode functions
246 void MDCache::add_inode(CInode
*in
)
249 if (in
->last
== CEPH_NOSNAP
) {
250 auto &p
= inode_map
[in
->ino()];
251 ceph_assert(!p
); // should be no dup inos!
254 auto &p
= snap_inode_map
[in
->vino()];
255 ceph_assert(!p
); // should be no dup inos!
259 if (in
->ino() < MDS_INO_SYSTEM_BASE
) {
260 if (in
->ino() == CEPH_INO_ROOT
)
262 else if (in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
264 else if (in
->is_stray()) {
265 if (MDS_INO_STRAY_OWNER(in
->ino()) == mds
->get_nodeid()) {
266 strays
[MDS_INO_STRAY_INDEX(in
->ino())] = in
;
270 base_inodes
.insert(in
);
274 void MDCache::remove_inode(CInode
*o
)
276 dout(14) << "remove_inode " << *o
<< dendl
;
278 if (o
->get_parent_dn()) {
279 // FIXME: multiple parents?
280 CDentry
*dn
= o
->get_parent_dn();
281 ceph_assert(!dn
->is_dirty());
282 dn
->dir
->unlink_inode(dn
); // leave dentry ... FIXME?
287 if (o
->is_dirty_parent())
288 o
->clear_dirty_parent();
290 o
->clear_scatter_dirty();
292 o
->clear_clientwriteable();
294 o
->item_open_file
.remove_myself();
296 if (o
->state_test(CInode::STATE_QUEUEDEXPORTPIN
))
297 export_pin_queue
.erase(o
);
299 if (o
->state_test(CInode::STATE_DELAYEDEXPORTPIN
))
300 export_pin_delayed_queue
.erase(o
);
302 o
->clear_ephemeral_pin(true, true);
304 // remove from inode map
305 if (o
->last
== CEPH_NOSNAP
) {
306 inode_map
.erase(o
->ino());
308 o
->item_caps
.remove_myself();
309 snap_inode_map
.erase(o
->vino());
312 clear_taken_inos(o
->ino());
314 if (o
->ino() < MDS_INO_SYSTEM_BASE
) {
315 if (o
== root
) root
= 0;
316 if (o
== myin
) myin
= 0;
318 if (MDS_INO_STRAY_OWNER(o
->ino()) == mds
->get_nodeid()) {
319 strays
[MDS_INO_STRAY_INDEX(o
->ino())] = 0;
323 base_inodes
.erase(o
);
327 ceph_assert(o
->get_num_ref() == 0);
331 file_layout_t
MDCache::gen_default_file_layout(const MDSMap
&mdsmap
)
333 file_layout_t result
= file_layout_t::get_default();
334 result
.pool_id
= mdsmap
.get_first_data_pool();
338 file_layout_t
MDCache::gen_default_log_layout(const MDSMap
&mdsmap
)
340 file_layout_t result
= file_layout_t::get_default();
341 result
.pool_id
= mdsmap
.get_metadata_pool();
342 if (g_conf()->mds_log_segment_size
> 0) {
343 result
.object_size
= g_conf()->mds_log_segment_size
;
344 result
.stripe_unit
= g_conf()->mds_log_segment_size
;
349 void MDCache::init_layouts()
351 default_file_layout
= gen_default_file_layout(*(mds
->mdsmap
));
352 default_log_layout
= gen_default_log_layout(*(mds
->mdsmap
));
355 void MDCache::create_unlinked_system_inode(CInode
*in
, inodeno_t ino
, int mode
) const
357 auto _inode
= in
->_get_inode();
360 _inode
->xattr_version
= 1;
361 _inode
->mode
= 0500 | mode
;
363 _inode
->ctime
= _inode
->mtime
= _inode
->btime
= ceph_clock_now();
365 _inode
->truncate_size
= -1ull;
366 _inode
->change_attr
= 0;
367 _inode
->export_pin
= MDS_RANK_NONE
;
369 // FIPS zeroization audit 20191117: this memset is not security related.
370 memset(&_inode
->dir_layout
, 0, sizeof(_inode
->dir_layout
));
371 if (_inode
->is_dir()) {
372 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
373 _inode
->rstat
.rsubdirs
= 1; /* itself */
374 _inode
->rstat
.rctime
= in
->get_inode()->ctime
;
376 _inode
->layout
= default_file_layout
;
377 ++_inode
->rstat
.rfiles
;
379 _inode
->accounted_rstat
= _inode
->rstat
;
383 in
->inode_auth
= mds_authority_t(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
);
385 in
->inode_auth
= mds_authority_t(mds_rank_t(in
->ino() - MDS_INO_MDSDIR_OFFSET
), CDIR_AUTH_UNKNOWN
);
386 in
->open_snaprealm(); // empty snaprealm
387 ceph_assert(!in
->snaprealm
->parent
); // created its own
388 in
->snaprealm
->srnode
.seq
= 1;
392 CInode
*MDCache::create_system_inode(inodeno_t ino
, int mode
)
394 dout(0) << "creating system inode with ino:" << ino
<< dendl
;
395 CInode
*in
= new CInode(this);
396 create_unlinked_system_inode(in
, ino
, mode
);
401 CInode
*MDCache::create_root_inode()
403 CInode
*in
= create_system_inode(CEPH_INO_ROOT
, S_IFDIR
|0755);
404 auto _inode
= in
->_get_inode();
405 _inode
->uid
= g_conf()->mds_root_ino_uid
;
406 _inode
->gid
= g_conf()->mds_root_ino_gid
;
407 _inode
->layout
= default_file_layout
;
408 _inode
->layout
.pool_id
= mds
->mdsmap
->get_first_data_pool();
412 void MDCache::create_empty_hierarchy(MDSGather
*gather
)
415 CInode
*root
= create_root_inode();
417 // force empty root dir
418 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
419 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
420 rootdir
->dir_rep
= CDir::REP_ALL
; //NONE;
422 ceph_assert(rootdir
->get_fnode()->accounted_fragstat
== rootdir
->get_fnode()->fragstat
);
423 ceph_assert(rootdir
->get_fnode()->fragstat
== root
->get_inode()->dirstat
);
424 ceph_assert(rootdir
->get_fnode()->accounted_rstat
== rootdir
->get_fnode()->rstat
);
425 /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
426 * assume version 0 is stale/invalid.
429 rootdir
->mark_complete();
430 rootdir
->_get_fnode()->version
= rootdir
->pre_dirty();
431 rootdir
->mark_dirty(mds
->mdlog
->get_current_segment());
432 rootdir
->commit(0, gather
->new_sub());
434 root
->store(gather
->new_sub());
435 root
->mark_dirty_parent(mds
->mdlog
->get_current_segment(), true);
436 root
->store_backtrace(gather
->new_sub());
439 void MDCache::create_mydir_hierarchy(MDSGather
*gather
)
442 CInode
*my
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
);
444 CDir
*mydir
= my
->get_or_open_dirfrag(this, frag_t());
445 auto mydir_fnode
= mydir
->_get_fnode();
447 adjust_subtree_auth(mydir
, mds
->get_nodeid());
449 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
452 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
453 CInode
*stray
= create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
);
454 CDir
*straydir
= stray
->get_or_open_dirfrag(this, frag_t());
455 CachedStackStringStream css
;
456 *css
<< "stray" << i
;
457 CDentry
*sdn
= mydir
->add_primary_dentry(css
->str(), stray
, "");
458 sdn
->_mark_dirty(mds
->mdlog
->get_current_segment());
460 stray
->_get_inode()->dirstat
= straydir
->get_fnode()->fragstat
;
462 mydir_fnode
->rstat
.add(stray
->get_inode()->rstat
);
463 mydir_fnode
->fragstat
.nsubdirs
++;
465 straydir
->mark_complete();
466 straydir
->_get_fnode()->version
= straydir
->pre_dirty();
467 straydir
->mark_dirty(ls
);
468 straydir
->commit(0, gather
->new_sub());
469 stray
->mark_dirty_parent(ls
, true);
470 stray
->store_backtrace(gather
->new_sub());
473 mydir_fnode
->accounted_fragstat
= mydir
->get_fnode()->fragstat
;
474 mydir_fnode
->accounted_rstat
= mydir
->get_fnode()->rstat
;
476 auto inode
= myin
->_get_inode();
477 inode
->dirstat
= mydir
->get_fnode()->fragstat
;
478 inode
->rstat
= mydir
->get_fnode()->rstat
;
479 ++inode
->rstat
.rsubdirs
;
480 inode
->accounted_rstat
= inode
->rstat
;
482 mydir
->mark_complete();
483 mydir_fnode
->version
= mydir
->pre_dirty();
484 mydir
->mark_dirty(ls
);
485 mydir
->commit(0, gather
->new_sub());
487 myin
->store(gather
->new_sub());
490 struct C_MDC_CreateSystemFile
: public MDCacheLogContext
{
495 C_MDC_CreateSystemFile(MDCache
*c
, MutationRef
& mu
, CDentry
*d
, version_t v
, MDSContext
*f
) :
496 MDCacheLogContext(c
), mut(mu
), dn(d
), dpv(v
), fin(f
) {}
497 void finish(int r
) override
{
498 mdcache
->_create_system_file_finish(mut
, dn
, dpv
, fin
);
502 void MDCache::_create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
)
504 dout(10) << "_create_system_file " << name
<< " in " << *dir
<< dendl
;
505 CDentry
*dn
= dir
->add_null_dentry(name
);
507 dn
->push_projected_linkage(in
);
508 version_t dpv
= dn
->pre_dirty();
511 auto inode
= in
->_get_inode();
513 inode
->rstat
.rsubdirs
= 1;
515 mdir
= in
->get_or_open_dirfrag(this, frag_t());
516 mdir
->mark_complete();
517 mdir
->_get_fnode()->version
= mdir
->pre_dirty();
519 inode
->rstat
.rfiles
= 1;
522 inode
->version
= dn
->pre_dirty();
524 SnapRealm
*realm
= dir
->get_inode()->find_snaprealm();
525 dn
->first
= in
->first
= realm
->get_newest_seq() + 1;
527 MutationRef
mut(new MutationImpl());
529 // force some locks. hacky.
530 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, mut
);
531 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, mut
);
533 mut
->ls
= mds
->mdlog
->get_current_segment();
534 EUpdate
*le
= new EUpdate(mds
->mdlog
, "create system file");
535 mds
->mdlog
->start_entry(le
);
537 if (!in
->is_mdsdir()) {
538 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
539 le
->metablob
.add_primary_dentry(dn
, in
, true);
541 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_DIR
, 1);
542 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
543 dn
->push_projected_linkage(in
->ino(), in
->d_type());
544 le
->metablob
.add_remote_dentry(dn
, true, in
->ino(), in
->d_type());
545 le
->metablob
.add_root(true, in
);
548 le
->metablob
.add_new_dir(mdir
); // dirty AND complete AND new
550 mds
->mdlog
->submit_entry(le
, new C_MDC_CreateSystemFile(this, mut
, dn
, dpv
, fin
));
554 void MDCache::_create_system_file_finish(MutationRef
& mut
, CDentry
*dn
, version_t dpv
, MDSContext
*fin
)
556 dout(10) << "_create_system_file_finish " << *dn
<< dendl
;
558 dn
->pop_projected_linkage();
559 dn
->mark_dirty(dpv
, mut
->ls
);
561 CInode
*in
= dn
->get_linkage()->get_inode();
562 in
->mark_dirty(mut
->ls
);
565 CDir
*dir
= in
->get_dirfrag(frag_t());
567 dir
->mark_dirty(mut
->ls
);
568 dir
->mark_new(mut
->ls
);
572 mds
->locker
->drop_locks(mut
.get());
577 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
578 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
583 struct C_MDS_RetryOpenRoot
: public MDSInternalContext
{
585 explicit C_MDS_RetryOpenRoot(MDCache
*c
) : MDSInternalContext(c
->mds
), cache(c
) {}
586 void finish(int r
) override
{
588 // If we can't open root, something disastrous has happened: mark
589 // this rank damaged for operator intervention. Note that
590 // it is not okay to call suicide() here because we are in
591 // a Finisher callback.
592 cache
->mds
->damaged();
593 ceph_abort(); // damaged should never return
600 void MDCache::open_root_inode(MDSContext
*c
)
602 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
604 in
= create_system_inode(CEPH_INO_ROOT
, S_IFDIR
|0755); // initially inaccurate!
607 discover_base_ino(CEPH_INO_ROOT
, c
, mds
->mdsmap
->get_root());
611 void MDCache::open_mydir_inode(MDSContext
*c
)
613 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
617 void MDCache::open_mydir_frag(MDSContext
*c
)
620 new MDSInternalContextWrapper(mds
,
621 new LambdaContext([this, c
](int r
) {
626 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
628 adjust_subtree_auth(mydir
, mds
->get_nodeid());
635 void MDCache::open_root()
637 dout(10) << "open_root" << dendl
;
640 open_root_inode(new C_MDS_RetryOpenRoot(this));
643 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
644 ceph_assert(root
->is_auth());
645 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
646 ceph_assert(rootdir
);
647 if (!rootdir
->is_subtree_root())
648 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
649 if (!rootdir
->is_complete()) {
650 rootdir
->fetch(new C_MDS_RetryOpenRoot(this));
654 ceph_assert(!root
->is_auth());
655 CDir
*rootdir
= root
->get_dirfrag(frag_t());
657 open_remote_dirfrag(root
, frag_t(), new C_MDS_RetryOpenRoot(this));
663 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
664 in
->fetch(new C_MDS_RetryOpenRoot(this));
667 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
669 adjust_subtree_auth(mydir
, mds
->get_nodeid());
674 void MDCache::advance_stray() {
675 // check whether the directory has been fragmented
676 if (stray_fragmenting_index
>= 0) {
677 auto&& dfs
= strays
[stray_fragmenting_index
]->get_dirfrags();
678 bool any_fragmenting
= false;
679 for (const auto& dir
: dfs
) {
680 if (dir
->state_test(CDir::STATE_FRAGMENTING
) ||
681 mds
->balancer
->is_fragment_pending(dir
->dirfrag())) {
682 any_fragmenting
= true;
686 if (!any_fragmenting
)
687 stray_fragmenting_index
= -1;
690 for (int i
= 1; i
< NUM_STRAY
; i
++){
691 stray_index
= (stray_index
+ i
) % NUM_STRAY
;
692 if (stray_index
!= stray_fragmenting_index
)
696 if (stray_fragmenting_index
== -1 && is_open()) {
697 // Fragment later stray dir in advance. We don't choose past
698 // stray dir because in-flight requests may still use it.
699 stray_fragmenting_index
= (stray_index
+ 3) % NUM_STRAY
;
700 auto&& dfs
= strays
[stray_fragmenting_index
]->get_dirfrags();
701 bool any_fragmenting
= false;
702 for (const auto& dir
: dfs
) {
703 if (dir
->should_split()) {
704 mds
->balancer
->queue_split(dir
, true);
705 any_fragmenting
= true;
706 } else if (dir
->should_merge()) {
707 mds
->balancer
->queue_merge(dir
);
708 any_fragmenting
= true;
711 if (!any_fragmenting
)
712 stray_fragmenting_index
= -1;
715 dout(10) << "advance_stray to index " << stray_index
716 << " fragmenting index " << stray_fragmenting_index
<< dendl
;
719 void MDCache::populate_mydir()
722 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
725 dout(10) << "populate_mydir " << *mydir
<< dendl
;
727 if (!mydir
->is_complete()) {
728 mydir
->fetch(new C_MDS_RetryOpenRoot(this));
732 if (mydir
->get_version() == 0 && mydir
->state_test(CDir::STATE_BADFRAG
)) {
733 // A missing dirfrag, we will recreate it. Before that, we must dirty
734 // it before dirtying any of the strays we create within it.
735 mds
->clog
->warn() << "fragment " << mydir
->dirfrag() << " was unreadable, "
737 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
738 mydir
->state_clear(CDir::STATE_BADFRAG
);
739 mydir
->mark_complete();
740 mydir
->_get_fnode()->version
= mydir
->pre_dirty();
741 mydir
->mark_dirty(ls
);
744 // open or create stray
745 uint64_t num_strays
= 0;
746 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
747 CachedStackStringStream css
;
748 *css
<< "stray" << i
;
749 CDentry
*straydn
= mydir
->lookup(css
->str());
751 // allow for older fs's with stray instead of stray0
752 if (straydn
== NULL
&& i
== 0)
753 straydn
= mydir
->lookup("stray");
755 if (!straydn
|| !straydn
->get_linkage()->get_inode()) {
756 _create_system_file(mydir
, css
->strv(), create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
),
757 new C_MDS_RetryOpenRoot(this));
760 ceph_assert(straydn
);
761 ceph_assert(strays
[i
]);
762 // we make multiple passes through this method; make sure we only pin each stray once.
763 if (!strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
764 strays
[i
]->get(CInode::PIN_STRAY
);
765 strays
[i
]->state_set(CInode::STATE_STRAYPINNED
);
766 strays
[i
]->get_stickydirs();
768 dout(20) << " stray num " << i
<< " is " << *strays
[i
] << dendl
;
772 strays
[i
]->dirfragtree
.get_leaves(leaves
);
773 for (const auto& leaf
: leaves
) {
774 CDir
*dir
= strays
[i
]->get_dirfrag(leaf
);
776 dir
= strays
[i
]->get_or_open_dirfrag(this, leaf
);
779 // DamageTable applies special handling to strays: it will
780 // have damaged() us out if one is damaged.
781 ceph_assert(!dir
->state_test(CDir::STATE_BADFRAG
));
783 if (dir
->get_version() == 0) {
784 dir
->fetch_keys({}, new C_MDS_RetryOpenRoot(this));
788 if (dir
->get_frag_size() > 0)
789 num_strays
+= dir
->get_frag_size();
794 dout(10) << "populate_mydir done" << dendl
;
797 mds
->queue_waiters(waiting_for_open
);
799 stray_manager
.set_num_strays(num_strays
);
800 stray_manager
.activate();
805 void MDCache::open_foreign_mdsdir(inodeno_t ino
, MDSContext
*fin
)
807 discover_base_ino(ino
, fin
, mds_rank_t(ino
& (MAX_MDS
-1)));
810 CDir
*MDCache::get_stray_dir(CInode
*in
)
813 in
->name_stray_dentry(straydname
);
815 CInode
*strayi
= get_stray();
817 frag_t fg
= strayi
->pick_dirfrag(straydname
);
818 CDir
*straydir
= strayi
->get_dirfrag(fg
);
819 ceph_assert(straydir
);
823 MDSCacheObject
*MDCache::get_object(const MDSCacheObjectInfo
&info
)
827 return get_inode(info
.ino
, info
.snapid
);
830 CDir
*dir
= get_dirfrag(info
.dirfrag
);
833 if (info
.dname
.length())
834 return dir
->lookup(info
.dname
, info
.snapid
);
840 // ====================================================================
841 // consistent hash ring
844 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
846 mds_rank_t
MDCache::hash_into_rank_bucket(inodeno_t ino
, frag_t fg
)
848 const mds_rank_t max_mds
= mds
->mdsmap
->get_max_mds();
849 uint64_t hash
= rjhash64(ino
);
851 hash
= rjhash64(hash
+ rjhash64(fg
.value()));
853 int64_t b
= -1, j
= 0;
854 while (j
< max_mds
) {
856 hash
= hash
*2862933555777941757ULL + 1;
857 j
= (b
+ 1) * (double(1LL << 31) / double((hash
>> 33) + 1));
859 // verify bounds before returning
860 auto result
= mds_rank_t(b
);
861 ceph_assert(result
>= 0 && result
< max_mds
);
866 // ====================================================================
867 // subtree management
870 * adjust the dir_auth of a subtree.
871 * merge with parent and/or child subtrees, if is it appropriate.
872 * merge can ONLY happen if both parent and child have unambiguous auth.
874 void MDCache::adjust_subtree_auth(CDir
*dir
, mds_authority_t auth
, bool adjust_pop
)
876 dout(7) << "adjust_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
877 << " on " << *dir
<< dendl
;
882 if (dir
->inode
->is_base()) {
883 root
= dir
; // bootstrap hack.
884 if (subtrees
.count(root
) == 0) {
886 root
->get(CDir::PIN_SUBTREE
);
889 root
= get_subtree_root(dir
); // subtree root
892 ceph_assert(subtrees
.count(root
));
893 dout(7) << " current root is " << *root
<< dendl
;
896 // i am already a subtree.
897 dir
->set_dir_auth(auth
);
899 // i am a new subtree.
900 dout(10) << " new subtree at " << *dir
<< dendl
;
901 ceph_assert(subtrees
.count(dir
) == 0);
902 subtrees
[dir
]; // create empty subtree bounds list for me.
903 dir
->get(CDir::PIN_SUBTREE
);
906 dir
->set_dir_auth(auth
);
908 // move items nested beneath me, under me.
909 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
910 while (p
!= subtrees
[root
].end()) {
911 set
<CDir
*>::iterator next
= p
;
913 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
915 dout(10) << " claiming child bound " << **p
<< dendl
;
916 subtrees
[dir
].insert(*p
);
917 subtrees
[root
].erase(p
);
922 // i am a bound of the parent subtree.
923 subtrees
[root
].insert(dir
);
925 // i am now the subtree root.
928 // adjust recursive pop counters
929 if (adjust_pop
&& dir
->is_auth()) {
930 CDir
*p
= dir
->get_parent_dir();
932 p
->pop_auth_subtree
.sub(dir
->pop_auth_subtree
);
933 if (p
->is_subtree_root()) break;
934 p
= p
->inode
->get_parent_dir();
943 void MDCache::try_subtree_merge(CDir
*dir
)
945 dout(7) << "try_subtree_merge " << *dir
<< dendl
;
946 // record my old bounds
947 auto oldbounds
= subtrees
.at(dir
);
949 set
<CInode
*> to_eval
;
950 // try merge at my root
951 try_subtree_merge_at(dir
, &to_eval
);
953 // try merge at my old bounds
954 for (auto bound
: oldbounds
)
955 try_subtree_merge_at(bound
, &to_eval
);
957 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
958 for(auto in
: to_eval
)
959 eval_subtree_root(in
);
963 void MDCache::try_subtree_merge_at(CDir
*dir
, set
<CInode
*> *to_eval
, bool adjust_pop
)
965 dout(10) << "try_subtree_merge_at " << *dir
<< dendl
;
967 if (dir
->dir_auth
.second
!= CDIR_AUTH_UNKNOWN
||
968 dir
->state_test(CDir::STATE_EXPORTBOUND
) ||
969 dir
->state_test(CDir::STATE_AUXSUBTREE
))
972 auto it
= subtrees
.find(dir
);
973 ceph_assert(it
!= subtrees
.end());
975 // merge with parent?
977 if (!dir
->inode
->is_base())
978 parent
= get_subtree_root(dir
->get_parent_dir());
980 if (parent
!= dir
&& // we have a parent,
981 parent
->dir_auth
== dir
->dir_auth
) { // auth matches,
982 // merge with parent.
983 dout(10) << " subtree merge at " << *dir
<< dendl
;
984 dir
->set_dir_auth(CDIR_AUTH_DEFAULT
);
986 // move our bounds under the parent
987 subtrees
[parent
].insert(it
->second
.begin(), it
->second
.end());
989 // we are no longer a subtree or bound
990 dir
->put(CDir::PIN_SUBTREE
);
992 subtrees
[parent
].erase(dir
);
994 // adjust popularity?
995 if (adjust_pop
&& dir
->is_auth()) {
997 CDir
*p
= dir
->get_parent_dir();
999 p
->pop_auth_subtree
.add(dir
->pop_auth_subtree
);
1000 p
->pop_lru_subdirs
.push_front(&cur
->get_inode()->item_pop_lru
);
1001 if (p
->is_subtree_root()) break;
1003 p
= p
->inode
->get_parent_dir();
1007 if (to_eval
&& dir
->get_inode()->is_auth())
1008 to_eval
->insert(dir
->get_inode());
1014 void MDCache::eval_subtree_root(CInode
*diri
)
1016 // evaluate subtree inode filelock?
1017 // (we should scatter the filelock on subtree bounds)
1018 ceph_assert(diri
->is_auth());
1019 mds
->locker
->try_eval(diri
, CEPH_LOCK_IFILE
| CEPH_LOCK_INEST
);
1023 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
)
1025 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1027 << " bounds " << bounds
1033 if (dir
->ino() == CEPH_INO_ROOT
) {
1034 root
= dir
; // bootstrap hack.
1035 if (subtrees
.count(root
) == 0) {
1037 root
->get(CDir::PIN_SUBTREE
);
1040 root
= get_subtree_root(dir
); // subtree root
1043 ceph_assert(subtrees
.count(root
));
1044 dout(7) << " current root is " << *root
<< dendl
;
1046 mds_authority_t oldauth
= dir
->authority();
1049 // i am already a subtree.
1050 dir
->set_dir_auth(auth
);
1052 // i am a new subtree.
1053 dout(10) << " new subtree at " << *dir
<< dendl
;
1054 ceph_assert(subtrees
.count(dir
) == 0);
1055 subtrees
[dir
]; // create empty subtree bounds list for me.
1056 dir
->get(CDir::PIN_SUBTREE
);
1059 dir
->set_dir_auth(auth
);
1061 // move items nested beneath me, under me.
1062 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1063 while (p
!= subtrees
[root
].end()) {
1064 set
<CDir
*>::iterator next
= p
;
1066 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
1068 dout(10) << " claiming child bound " << **p
<< dendl
;
1069 subtrees
[dir
].insert(*p
);
1070 subtrees
[root
].erase(p
);
1075 // i am a bound of the parent subtree.
1076 subtrees
[root
].insert(dir
);
1078 // i am now the subtree root.
1082 set
<CInode
*> to_eval
;
1084 // verify/adjust bounds.
1085 // - these may be new, or
1086 // - beneath existing ambiguous bounds (which will be collapsed),
1087 // - but NOT beneath unambiguous bounds.
1088 for (const auto& bound
: bounds
) {
1090 if (subtrees
[dir
].count(bound
) == 0) {
1091 if (get_subtree_root(bound
) == dir
) {
1092 dout(10) << " new bound " << *bound
<< ", adjusting auth back to old " << oldauth
<< dendl
;
1093 adjust_subtree_auth(bound
, oldauth
); // otherwise, adjust at bound.
1096 dout(10) << " want bound " << *bound
<< dendl
;
1097 CDir
*t
= get_subtree_root(bound
->get_parent_dir());
1098 if (subtrees
[t
].count(bound
) == 0) {
1099 ceph_assert(t
!= dir
);
1100 dout(10) << " new bound " << *bound
<< dendl
;
1101 adjust_subtree_auth(bound
, t
->authority());
1103 // make sure it's nested beneath ambiguous subtree(s)
1105 while (subtrees
[dir
].count(t
) == 0)
1106 t
= get_subtree_root(t
->get_parent_dir());
1107 dout(10) << " swallowing intervening subtree at " << *t
<< dendl
;
1108 adjust_subtree_auth(t
, auth
);
1109 try_subtree_merge_at(t
, &to_eval
);
1110 t
= get_subtree_root(bound
->get_parent_dir());
1111 if (t
== dir
) break;
1116 dout(10) << " already have bound " << *bound
<< dendl
;
1119 // merge stray bounds?
1120 while (!subtrees
[dir
].empty()) {
1121 set
<CDir
*> copy
= subtrees
[dir
];
1122 for (set
<CDir
*>::iterator p
= copy
.begin(); p
!= copy
.end(); ++p
) {
1123 if (bounds
.count(*p
) == 0) {
1125 dout(10) << " swallowing extra subtree at " << *stray
<< dendl
;
1126 adjust_subtree_auth(stray
, auth
);
1127 try_subtree_merge_at(stray
, &to_eval
);
1130 // swallowing subtree may add new subtree bounds
1131 if (copy
== subtrees
[dir
])
1135 // bound should now match.
1136 verify_subtree_bounds(dir
, bounds
);
1140 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
1141 for(auto in
: to_eval
)
1142 eval_subtree_root(in
);
1148 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1149 * fragmentation as necessary to get an equivalent bounding set. That is, only
1150 * split if one of our frags spans the provided bounding set. Never merge.
1152 void MDCache::get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
)
1154 dout(10) << "get_force_dirfrag_bound_set " << dfs
<< dendl
;
1157 map
<inodeno_t
, fragset_t
> byino
;
1158 for (auto& frag
: dfs
) {
1159 byino
[frag
.ino
].insert_raw(frag
.frag
);
1161 dout(10) << " by ino: " << byino
<< dendl
;
1163 for (map
<inodeno_t
,fragset_t
>::iterator p
= byino
.begin(); p
!= byino
.end(); ++p
) {
1164 p
->second
.simplify();
1165 CInode
*diri
= get_inode(p
->first
);
1168 dout(10) << " checking fragset " << p
->second
.get() << " on " << *diri
<< dendl
;
1171 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
1172 tmpdft
.force_to_leaf(g_ceph_context
, *q
);
1174 for (const auto& fg
: p
->second
) {
1176 diri
->dirfragtree
.get_leaves_under(fg
, leaves
);
1177 if (leaves
.empty()) {
1178 frag_t approx_fg
= diri
->dirfragtree
[fg
.value()];
1179 frag_vec_t approx_leaves
;
1180 tmpdft
.get_leaves_under(approx_fg
, approx_leaves
);
1181 for (const auto& leaf
: approx_leaves
) {
1182 if (p
->second
.get().count(leaf
) == 0) {
1183 // not bound, so the resolve message is from auth MDS of the dirfrag
1184 force_dir_fragment(diri
, leaf
);
1189 auto&& [complete
, sibs
] = diri
->get_dirfrags_under(fg
);
1190 for (const auto& sib
: sibs
)
1196 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bound_dfs
, const mds_authority_t
&auth
)
1198 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1199 << " on " << *dir
<< " bound_dfs " << bound_dfs
<< dendl
;
1202 get_force_dirfrag_bound_set(bound_dfs
, bounds
);
1203 adjust_bounded_subtree_auth(dir
, bounds
, auth
);
1206 void MDCache::map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
)
1208 dout(10) << "map_dirfrag_set " << dfs
<< dendl
;
1211 map
<inodeno_t
, fragset_t
> ino_fragset
;
1212 for (const auto &df
: dfs
) {
1213 ino_fragset
[df
.ino
].insert_raw(df
.frag
);
1216 for (map
<inodeno_t
, fragset_t
>::iterator p
= ino_fragset
.begin();
1217 p
!= ino_fragset
.end();
1219 p
->second
.simplify();
1220 CInode
*in
= get_inode(p
->first
);
1225 for (const auto& fg
: p
->second
) {
1226 in
->dirfragtree
.get_leaves_under(fg
, fgs
);
1229 dout(15) << "map_dirfrag_set " << p
->second
<< " -> " << fgs
1230 << " on " << *in
<< dendl
;
1232 for (const auto& fg
: fgs
) {
1233 CDir
*dir
= in
->get_dirfrag(fg
);
1242 CDir
*MDCache::get_subtree_root(CDir
*dir
)
1244 // find the underlying dir that delegates (or is about to delegate) auth
1246 if (dir
->is_subtree_root())
1248 dir
= dir
->get_inode()->get_parent_dir();
1254 CDir
*MDCache::get_projected_subtree_root(CDir
*dir
)
1256 // find the underlying dir that delegates (or is about to delegate) auth
1258 if (dir
->is_subtree_root())
1260 dir
= dir
->get_inode()->get_projected_parent_dir();
1266 void MDCache::remove_subtree(CDir
*dir
)
1268 dout(10) << "remove_subtree " << *dir
<< dendl
;
1269 auto it
= subtrees
.find(dir
);
1270 ceph_assert(it
!= subtrees
.end());
1272 dir
->put(CDir::PIN_SUBTREE
);
1273 if (dir
->get_parent_dir()) {
1274 CDir
*p
= get_subtree_root(dir
->get_parent_dir());
1275 auto it
= subtrees
.find(p
);
1276 ceph_assert(it
!= subtrees
.end());
1277 auto count
= it
->second
.erase(dir
);
1278 ceph_assert(count
== 1);
1282 void MDCache::get_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1284 ceph_assert(subtrees
.count(dir
));
1285 bounds
= subtrees
[dir
];
1288 void MDCache::get_wouldbe_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1290 if (subtrees
.count(dir
)) {
1291 // just copy them, dir is a subtree.
1292 get_subtree_bounds(dir
, bounds
);
1295 CDir
*root
= get_subtree_root(dir
);
1296 for (set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1297 p
!= subtrees
[root
].end();
1301 t
= t
->get_parent_dir();
1312 void MDCache::verify_subtree_bounds(CDir
*dir
, const set
<CDir
*>& bounds
)
1314 // for debugging only.
1315 ceph_assert(subtrees
.count(dir
));
1316 if (bounds
!= subtrees
[dir
]) {
1317 dout(0) << "verify_subtree_bounds failed" << dendl
;
1318 set
<CDir
*> b
= bounds
;
1319 for (auto &cd
: subtrees
[dir
]) {
1320 if (bounds
.count(cd
)) {
1324 dout(0) << " missing bound " << *cd
<< dendl
;
1326 for (const auto &cd
: b
)
1327 dout(0) << " extra bound " << *cd
<< dendl
;
1329 ceph_assert(bounds
== subtrees
[dir
]);
1332 void MDCache::verify_subtree_bounds(CDir
*dir
, const list
<dirfrag_t
>& bounds
)
1334 // for debugging only.
1335 ceph_assert(subtrees
.count(dir
));
1337 // make sure that any bounds i do have are properly noted as such.
1339 for (const auto &fg
: bounds
) {
1340 CDir
*bd
= get_dirfrag(fg
);
1342 if (subtrees
[dir
].count(bd
) == 0) {
1343 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd
<< dendl
;
1347 ceph_assert(failed
== 0);
1350 void MDCache::project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
)
1352 dout(10) << "project_subtree_rename " << *diri
<< " from " << *olddir
1353 << " to " << *newdir
<< dendl
;
1354 projected_subtree_renames
[diri
].push_back(pair
<CDir
*,CDir
*>(olddir
, newdir
));
1357 void MDCache::adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
)
1359 dout(10) << "adjust_subtree_after_rename " << *diri
<< " from " << *olddir
<< dendl
;
1361 CDir
*newdir
= diri
->get_parent_dir();
1364 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > >::iterator p
= projected_subtree_renames
.find(diri
);
1365 ceph_assert(p
!= projected_subtree_renames
.end());
1366 ceph_assert(!p
->second
.empty());
1367 ceph_assert(p
->second
.front().first
== olddir
);
1368 ceph_assert(p
->second
.front().second
== newdir
);
1369 p
->second
.pop_front();
1370 if (p
->second
.empty())
1371 projected_subtree_renames
.erase(p
);
1374 // adjust total auth pin of freezing subtree
1375 if (olddir
!= newdir
) {
1376 auto&& dfls
= diri
->get_nested_dirfrags();
1377 for (const auto& dir
: dfls
)
1378 olddir
->adjust_freeze_after_rename(dir
);
1382 // N.B. make sure subtree dirfrags are at the front of the list
1383 auto dfls
= diri
->get_subtree_dirfrags();
1384 diri
->get_nested_dirfrags(dfls
);
1385 for (const auto& dir
: dfls
) {
1386 dout(10) << "dirfrag " << *dir
<< dendl
;
1387 CDir
*oldparent
= get_subtree_root(olddir
);
1388 dout(10) << " old parent " << *oldparent
<< dendl
;
1389 CDir
*newparent
= get_subtree_root(newdir
);
1390 dout(10) << " new parent " << *newparent
<< dendl
;
1392 auto& oldbounds
= subtrees
[oldparent
];
1393 auto& newbounds
= subtrees
[newparent
];
1395 if (olddir
!= newdir
)
1396 mds
->balancer
->adjust_pop_for_rename(olddir
, dir
, false);
1398 if (oldparent
== newparent
) {
1399 dout(10) << "parent unchanged for " << *dir
<< " at " << *oldparent
<< dendl
;
1400 } else if (dir
->is_subtree_root()) {
1401 // children are fine. change parent.
1402 dout(10) << "moving " << *dir
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1404 auto n
= oldbounds
.erase(dir
);
1405 ceph_assert(n
== 1);
1407 newbounds
.insert(dir
);
1408 // caller is responsible for 'eval diri'
1409 try_subtree_merge_at(dir
, NULL
, false);
1413 // see if any old bounds move to the new parent.
1414 std::vector
<CDir
*> tomove
;
1415 for (const auto& bound
: oldbounds
) {
1416 CDir
*broot
= get_subtree_root(bound
->get_parent_dir());
1417 if (broot
!= oldparent
) {
1418 ceph_assert(broot
== newparent
);
1419 tomove
.push_back(bound
);
1422 for (const auto& bound
: tomove
) {
1423 dout(10) << "moving bound " << *bound
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1424 oldbounds
.erase(bound
);
1425 newbounds
.insert(bound
);
1429 if (oldparent
->authority() != newparent
->authority()) {
1430 adjust_subtree_auth(dir
, oldparent
->authority(), false);
1431 // caller is responsible for 'eval diri'
1432 try_subtree_merge_at(dir
, NULL
, false);
1436 if (olddir
!= newdir
)
1437 mds
->balancer
->adjust_pop_for_rename(newdir
, dir
, true);
1443 // ===================================
1444 // journal and snap/cow helpers
1448 * find first inode in cache that follows given snapid. otherwise, return current.
1450 CInode
*MDCache::pick_inode_snap(CInode
*in
, snapid_t follows
)
1452 dout(10) << "pick_inode_snap follows " << follows
<< " on " << *in
<< dendl
;
1453 ceph_assert(in
->last
== CEPH_NOSNAP
);
1455 auto p
= snap_inode_map
.upper_bound(vinodeno_t(in
->ino(), follows
));
1456 if (p
!= snap_inode_map
.end() && p
->second
->ino() == in
->ino()) {
1457 dout(10) << "pick_inode_snap found " << *p
->second
<< dendl
;
1466 * note: i'm currently cheating wrt dirty and inode.version on cow
1467 * items. instead of doing a full dir predirty, i just take the
1468 * original item's version, and set the dirty flag (via
1469 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1470 * means a special case in the dir commit clean sweep assertions.
1473 CInode
*MDCache::cow_inode(CInode
*in
, snapid_t last
)
1475 ceph_assert(last
>= in
->first
);
1477 CInode
*oldin
= new CInode(this, true, in
->first
, last
);
1478 auto _inode
= CInode::allocate_inode(*in
->get_previous_projected_inode());
1479 _inode
->trim_client_ranges(last
);
1480 oldin
->reset_inode(std::move(_inode
));
1481 auto _xattrs
= in
->get_previous_projected_xattrs();
1482 oldin
->reset_xattrs(std::move(_xattrs
));
1484 oldin
->symlink
= in
->symlink
;
1486 if (in
->first
< in
->oldest_snap
)
1487 in
->oldest_snap
= in
->first
;
1491 dout(10) << "cow_inode " << *in
<< " to " << *oldin
<< dendl
;
1494 if (in
->last
!= CEPH_NOSNAP
) {
1495 CInode
*head_in
= get_inode(in
->ino());
1496 ceph_assert(head_in
);
1497 auto ret
= head_in
->split_need_snapflush(oldin
, in
);
1499 oldin
->client_snap_caps
= in
->client_snap_caps
;
1500 if (!oldin
->client_snap_caps
.empty()) {
1501 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1502 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1504 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1505 ceph_assert(lock
->is_stable());
1506 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1507 oldin
->auth_pin(lock
);
1509 lock
->get_wrlock(true);
1514 auto client_snap_caps
= std::move(in
->client_snap_caps
);
1515 in
->client_snap_caps
.clear();
1516 in
->item_open_file
.remove_myself();
1517 in
->item_caps
.remove_myself();
1519 if (!client_snap_caps
.empty()) {
1520 MDSContext::vec finished
;
1521 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1522 SimpleLock
*lock
= in
->get_lock(cinode_lock_info
[i
].lock
);
1524 ceph_assert(lock
->get_state() == LOCK_SNAP_SYNC
); // gathering
1526 if (!lock
->get_num_wrlocks()) {
1527 lock
->set_state(LOCK_SYNC
);
1528 lock
->take_waiting(SimpleLock::WAIT_STABLE
|SimpleLock::WAIT_RD
, finished
);
1529 in
->auth_unpin(lock
);
1532 mds
->queue_waiters(finished
);
1538 if (!in
->client_caps
.empty()) {
1539 const set
<snapid_t
>& snaps
= in
->find_snaprealm()->get_snaps();
1541 for (auto &p
: in
->client_caps
) {
1542 client_t client
= p
.first
;
1543 Capability
*cap
= &p
.second
;
1544 int issued
= cap
->need_snapflush() ? CEPH_CAP_ANY_WR
: cap
->issued();
1545 if ((issued
& CEPH_CAP_ANY_WR
) &&
1546 cap
->client_follows
< last
) {
1547 dout(10) << " client." << client
<< " cap " << ccap_string(issued
) << dendl
;
1548 oldin
->client_snap_caps
.insert(client
);
1549 cap
->client_follows
= last
;
1551 // we need snapflushes for any intervening snaps
1552 dout(10) << " snaps " << snaps
<< dendl
;
1553 for (auto q
= snaps
.lower_bound(oldin
->first
);
1554 q
!= snaps
.end() && *q
<= last
;
1556 in
->add_need_snapflush(oldin
, *q
, client
);
1559 dout(10) << " ignoring client." << client
<< " cap follows " << cap
->client_follows
<< dendl
;
1563 if (!oldin
->client_snap_caps
.empty()) {
1564 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1565 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1567 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1568 ceph_assert(lock
->is_stable());
1569 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1570 oldin
->auth_pin(lock
);
1572 lock
->get_wrlock(true);
1579 void MDCache::journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
,
1580 CDentry
*dn
, snapid_t follows
,
1581 CInode
**pcow_inode
, CDentry::linkage_t
*dnl
)
1584 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl
;
1587 dout(10) << "journal_cow_dentry follows " << follows
<< " on " << *dn
<< dendl
;
1588 ceph_assert(dn
->is_auth());
1590 // nothing to cow on a null dentry, fix caller
1592 dnl
= dn
->get_projected_linkage();
1593 ceph_assert(!dnl
->is_null());
1595 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
1596 bool cow_head
= false;
1597 if (in
&& in
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
1598 ceph_assert(in
->is_frozen_inode());
1601 if (in
&& (in
->is_multiversion() || cow_head
)) {
1602 // multiversion inode.
1603 SnapRealm
*realm
= NULL
;
1605 if (in
->get_projected_parent_dn() != dn
) {
1606 ceph_assert(follows
== CEPH_NOSNAP
);
1607 realm
= dn
->dir
->inode
->find_snaprealm();
1608 snapid_t dir_follows
= get_global_snaprealm()->get_newest_seq();
1609 ceph_assert(dir_follows
>= realm
->get_newest_seq());
1611 if (dir_follows
+1 > dn
->first
) {
1612 snapid_t oldfirst
= dn
->first
;
1613 dn
->first
= dir_follows
+1;
1614 if (realm
->has_snaps_in_range(oldfirst
, dir_follows
)) {
1615 CDir
*dir
= dn
->dir
;
1616 CDentry
*olddn
= dir
->add_remote_dentry(dn
->get_name(), in
->ino(), in
->d_type(), dn
->alternate_name
, oldfirst
, dir_follows
);
1617 dout(10) << " olddn " << *olddn
<< dendl
;
1618 ceph_assert(dir
->is_projected());
1619 olddn
->set_projected_version(dir
->get_projected_version());
1620 metablob
->add_remote_dentry(olddn
, true);
1621 mut
->add_cow_dentry(olddn
);
1622 // FIXME: adjust link count here? hmm.
1624 if (dir_follows
+1 > in
->first
)
1625 in
->cow_old_inode(dir_follows
, cow_head
);
1629 follows
= dir_follows
;
1630 if (in
->snaprealm
) {
1631 realm
= in
->snaprealm
;
1632 ceph_assert(follows
>= realm
->get_newest_seq());
1635 realm
= in
->find_snaprealm();
1636 if (follows
== CEPH_NOSNAP
) {
1637 follows
= get_global_snaprealm()->get_newest_seq();
1638 ceph_assert(follows
>= realm
->get_newest_seq());
1643 if (follows
< in
->first
) {
1644 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *in
<< dendl
;
1648 if (!realm
->has_snaps_in_range(in
->first
, follows
)) {
1649 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *in
<< dendl
;
1650 in
->first
= follows
+ 1;
1654 in
->cow_old_inode(follows
, cow_head
);
1657 SnapRealm
*realm
= dn
->dir
->inode
->find_snaprealm();
1658 if (follows
== CEPH_NOSNAP
) {
1659 follows
= get_global_snaprealm()->get_newest_seq();
1660 ceph_assert(follows
>= realm
->get_newest_seq());
1664 if (follows
< dn
->first
) {
1665 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *dn
<< dendl
;
1669 // update dn.first before adding old dentry to cdir's map
1670 snapid_t oldfirst
= dn
->first
;
1671 dn
->first
= follows
+1;
1673 if (!realm
->has_snaps_in_range(oldfirst
, follows
)) {
1674 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *dn
<< dendl
;
1676 in
->first
= follows
+1;
1680 dout(10) << " dn " << *dn
<< dendl
;
1681 CDir
*dir
= dn
->get_dir();
1682 ceph_assert(dir
->is_projected());
1685 CInode
*oldin
= cow_inode(in
, follows
);
1686 ceph_assert(in
->is_projected());
1687 mut
->add_cow_inode(oldin
);
1689 *pcow_inode
= oldin
;
1690 CDentry
*olddn
= dir
->add_primary_dentry(dn
->get_name(), oldin
, dn
->alternate_name
, oldfirst
, follows
);
1691 dout(10) << " olddn " << *olddn
<< dendl
;
1692 bool need_snapflush
= !oldin
->client_snap_caps
.empty();
1693 if (need_snapflush
) {
1694 mut
->ls
->open_files
.push_back(&oldin
->item_open_file
);
1695 mds
->locker
->mark_need_snapflush_inode(oldin
);
1697 olddn
->set_projected_version(dir
->get_projected_version());
1698 metablob
->add_primary_dentry(olddn
, 0, true, false, false, need_snapflush
);
1699 mut
->add_cow_dentry(olddn
);
1701 ceph_assert(dnl
->is_remote());
1702 CDentry
*olddn
= dir
->add_remote_dentry(dn
->get_name(), dnl
->get_remote_ino(), dnl
->get_remote_d_type(), dn
->alternate_name
, oldfirst
, follows
);
1703 dout(10) << " olddn " << *olddn
<< dendl
;
1705 olddn
->set_projected_version(dir
->get_projected_version());
1706 metablob
->add_remote_dentry(olddn
, true);
1707 mut
->add_cow_dentry(olddn
);
1712 void MDCache::journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
)
1714 if (in
->is_base()) {
1715 metablob
->add_root(true, in
);
1717 if (follows
== CEPH_NOSNAP
&& in
->last
!= CEPH_NOSNAP
)
1718 follows
= in
->first
- 1;
1719 CDentry
*dn
= in
->get_projected_parent_dn();
1720 if (!dn
->get_projected_linkage()->is_null()) // no need to cow a null dentry
1721 journal_cow_dentry(mut
, metablob
, dn
, follows
);
1722 if (in
->get_projected_inode()->is_backtrace_updated()) {
1723 bool dirty_pool
= in
->get_projected_inode()->layout
.pool_id
!=
1724 in
->get_previous_projected_inode()->layout
.pool_id
;
1725 metablob
->add_primary_dentry(dn
, in
, true, true, dirty_pool
);
1727 metablob
->add_primary_dentry(dn
, in
, true);
1734 // nested ---------------------------------------------------------------
1736 void MDCache::project_rstat_inode_to_frag(const MutationRef
& mut
,
1737 CInode
*cur
, CDir
*parent
, snapid_t first
,
1738 int linkunlink
, SnapRealm
*prealm
)
1740 CDentry
*parentdn
= cur
->get_projected_parent_dn();
1742 if (cur
->first
> first
)
1745 dout(10) << "projected_rstat_inode_to_frag first " << first
<< " linkunlink " << linkunlink
1746 << " " << *cur
<< dendl
;
1747 dout(20) << " frag head is [" << parent
->first
<< ",head] " << dendl
;
1748 dout(20) << " inode update is [" << first
<< "," << cur
->last
<< "]" << dendl
;
1751 * FIXME. this incompletely propagates rstats to _old_ parents
1752 * (i.e. shortly after a directory rename). but we need full
1753 * blown hard link backpointers to make this work properly...
1755 snapid_t floor
= parentdn
->first
;
1756 dout(20) << " floor of " << floor
<< " from parent dn " << *parentdn
<< dendl
;
1759 prealm
= parent
->inode
->find_snaprealm();
1760 const set
<snapid_t
> snaps
= prealm
->get_snaps();
1762 if (cur
->last
!= CEPH_NOSNAP
) {
1763 ceph_assert(cur
->dirty_old_rstats
.empty());
1764 set
<snapid_t
>::const_iterator q
= snaps
.lower_bound(std::max(first
, floor
));
1765 if (q
== snaps
.end() || *q
> cur
->last
)
1769 if (cur
->last
>= floor
) {
1771 if (cur
->state_test(CInode::STATE_AMBIGUOUSAUTH
) && cur
->is_auth()) {
1772 // rename src inode is not projected in the peer rename prep case. so we should
1773 * avoid updating the inode.
1774 ceph_assert(linkunlink
< 0);
1775 ceph_assert(cur
->is_frozen_inode());
1779 const CInode::mempool_inode
*pi
;
1780 if (update
&& mut
->is_projected(cur
)) {
1781 pi
= cur
->_get_projected_inode();
1783 pi
= cur
->get_projected_inode().get();
1786 ceph_assert(pi
->rstat
== pi
->accounted_rstat
);
1790 _project_rstat_inode_to_frag(pi
, std::max(first
, floor
), cur
->last
, parent
,
1791 linkunlink
, update
);
1794 if (g_conf()->mds_snap_rstat
) {
1795 for (const auto &p
: cur
->dirty_old_rstats
) {
1796 const auto &old
= cur
->get_old_inodes()->at(p
);
1797 snapid_t ofirst
= std::max(old
.first
, floor
);
1798 auto it
= snaps
.lower_bound(ofirst
);
1799 if (it
== snaps
.end() || *it
> p
)
1802 _project_rstat_inode_to_frag(&old
.inode
, ofirst
, p
, parent
, 0, false);
1805 cur
->dirty_old_rstats
.clear();
1809 void MDCache::_project_rstat_inode_to_frag(const CInode::mempool_inode
* inode
, snapid_t ofirst
, snapid_t last
,
1810 CDir
*parent
, int linkunlink
, bool update_inode
)
1812 dout(10) << "_project_rstat_inode_to_frag [" << ofirst
<< "," << last
<< "]" << dendl
;
1813 dout(20) << " inode rstat " << inode
->rstat
<< dendl
;
1814 dout(20) << " inode accounted_rstat " << inode
->accounted_rstat
<< dendl
;
1816 if (linkunlink
== 0) {
1817 delta
.add(inode
->rstat
);
1818 delta
.sub(inode
->accounted_rstat
);
1819 } else if (linkunlink
< 0) {
1820 delta
.sub(inode
->accounted_rstat
);
1822 delta
.add(inode
->rstat
);
1824 dout(20) << " delta " << delta
<< dendl
;
1827 while (last
>= ofirst
) {
1829 * pick fnode version to update. at each iteration, we want to
1830 * pick a segment ending in 'last' to update. split as necessary
1831 * to make that work. then, adjust first up so that we only
1832 * update one segment at a time. then loop to cover the whole
1833 * [ofirst,last] interval.
1835 nest_info_t
*prstat
;
1837 auto pf
= parent
->_get_projected_fnode();
1838 if (last
== CEPH_NOSNAP
) {
1839 if (g_conf()->mds_snap_rstat
)
1840 first
= std::max(ofirst
, parent
->first
);
1842 first
= parent
->first
;
1843 prstat
= &pf
->rstat
;
1844 dout(20) << " projecting to head [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1846 if (first
> parent
->first
&&
1847 !(pf
->rstat
== pf
->accounted_rstat
)) {
1848 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1849 << parent
->first
<< "," << (first
-1) << "] "
1850 << " " << *prstat
<< "/" << pf
->accounted_rstat
1852 parent
->dirty_old_rstat
[first
-1].first
= parent
->first
;
1853 parent
->dirty_old_rstat
[first
-1].rstat
= pf
->rstat
;
1854 parent
->dirty_old_rstat
[first
-1].accounted_rstat
= pf
->accounted_rstat
;
1856 parent
->first
= first
;
1857 } else if (!g_conf()->mds_snap_rstat
) {
1858 // drop snapshots' rstats
1860 } else if (last
>= parent
->first
) {
1861 first
= parent
->first
;
1862 parent
->dirty_old_rstat
[last
].first
= first
;
1863 parent
->dirty_old_rstat
[last
].rstat
= pf
->rstat
;
1864 parent
->dirty_old_rstat
[last
].accounted_rstat
= pf
->accounted_rstat
;
1865 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
1866 dout(10) << " projecting to newly split dirty_old_fnode [" << first
<< "," << last
<< "] "
1867 << " " << *prstat
<< "/" << pf
->accounted_rstat
<< dendl
;
1869 // be careful, dirty_old_rstat is a _sparse_ map.
1870 // sorry, this is ugly.
1873 // find any intersection with last
1874 auto it
= parent
->dirty_old_rstat
.lower_bound(last
);
1875 if (it
== parent
->dirty_old_rstat
.end()) {
1876 dout(20) << " no dirty_old_rstat with last >= last " << last
<< dendl
;
1877 if (!parent
->dirty_old_rstat
.empty() && parent
->dirty_old_rstat
.rbegin()->first
>= first
) {
1878 dout(20) << " last dirty_old_rstat ends at " << parent
->dirty_old_rstat
.rbegin()->first
<< dendl
;
1879 first
= parent
->dirty_old_rstat
.rbegin()->first
+1;
1882 // *it last is >= last
1883 if (it
->second
.first
<= last
) {
1884 // *it intersects [first,last]
1885 if (it
->second
.first
< first
) {
1886 dout(10) << " splitting off left bit [" << it
->second
.first
<< "," << first
-1 << "]" << dendl
;
1887 parent
->dirty_old_rstat
[first
-1] = it
->second
;
1888 it
->second
.first
= first
;
1890 if (it
->second
.first
> first
)
1891 first
= it
->second
.first
;
1892 if (last
< it
->first
) {
1893 dout(10) << " splitting off right bit [" << last
+1 << "," << it
->first
<< "]" << dendl
;
1894 parent
->dirty_old_rstat
[last
] = it
->second
;
1895 it
->second
.first
= last
+1;
1898 // *it is to the _right_ of [first,last]
1899 it
= parent
->dirty_old_rstat
.lower_bound(first
);
1900 // new *it last is >= first
1901 if (it
->second
.first
<= last
&& // new *it isn't also to the right, and
1902 it
->first
>= first
) { // it intersects our first bit,
1903 dout(10) << " staying to the right of [" << it
->second
.first
<< "," << it
->first
<< "]..." << dendl
;
1904 first
= it
->first
+1;
1906 dout(10) << " projecting to new dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1909 dout(20) << " projecting to dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1910 parent
->dirty_old_rstat
[last
].first
= first
;
1911 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
1915 dout(20) << " project to [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1916 ceph_assert(last
>= first
);
1918 dout(20) << " result [" << first
<< "," << last
<< "] " << *prstat
<< " " << *parent
<< dendl
;
1924 auto _inode
= const_cast<CInode::mempool_inode
*>(inode
);
1925 _inode
->accounted_rstat
= _inode
->rstat
;
1929 void MDCache::project_rstat_frag_to_inode(const nest_info_t
& rstat
,
1930 const nest_info_t
& accounted_rstat
,
1931 snapid_t ofirst
, snapid_t last
,
1932 CInode
*pin
, bool cow_head
)
1934 dout(10) << "project_rstat_frag_to_inode [" << ofirst
<< "," << last
<< "]" << dendl
;
1935 dout(20) << " frag rstat " << rstat
<< dendl
;
1936 dout(20) << " frag accounted_rstat " << accounted_rstat
<< dendl
;
1937 nest_info_t delta
= rstat
;
1938 delta
.sub(accounted_rstat
);
1939 dout(20) << " delta " << delta
<< dendl
;
1941 CInode::old_inode_map_ptr _old_inodes
;
1942 while (last
>= ofirst
) {
1943 CInode::mempool_inode
*pi
;
1945 if (last
== pin
->last
) {
1946 pi
= pin
->_get_projected_inode();
1947 first
= std::max(ofirst
, pin
->first
);
1948 if (first
> pin
->first
) {
1949 auto& old
= pin
->cow_old_inode(first
-1, cow_head
);
1950 dout(20) << " cloned old_inode rstat is " << old
.inode
.rstat
<< dendl
;
1954 _old_inodes
= CInode::allocate_old_inode_map();
1955 if (pin
->is_any_old_inodes())
1956 *_old_inodes
= *pin
->get_old_inodes();
1958 if (last
>= pin
->first
) {
1960 pin
->cow_old_inode(last
, cow_head
);
1962 // our life is easier here because old_inodes is not sparse
1963 // (although it may not begin at snapid 1)
1964 auto it
= _old_inodes
->lower_bound(last
);
1965 if (it
== _old_inodes
->end()) {
1966 dout(10) << " no old_inode <= " << last
<< ", done." << dendl
;
1969 first
= it
->second
.first
;
1971 dout(10) << " oldest old_inode is [" << first
<< "," << it
->first
<< "], done." << dendl
;
1972 //assert(p == pin->old_inodes.begin());
1975 if (it
->first
> last
) {
1976 dout(10) << " splitting right old_inode [" << first
<< "," << it
->first
<< "] to ["
1977 << (last
+1) << "," << it
->first
<< "]" << dendl
;
1978 (*_old_inodes
)[last
] = it
->second
;
1979 it
->second
.first
= last
+1;
1980 pin
->dirty_old_rstats
.insert(it
->first
);
1983 if (first
< ofirst
) {
1984 dout(10) << " splitting left old_inode [" << first
<< "," << last
<< "] to ["
1985 << first
<< "," << ofirst
-1 << "]" << dendl
;
1986 (*_old_inodes
)[ofirst
-1] = (*_old_inodes
)[last
];
1987 pin
->dirty_old_rstats
.insert(ofirst
-1);
1988 (*_old_inodes
)[last
].first
= first
= ofirst
;
1990 pi
= &(*_old_inodes
)[last
].inode
;
1991 pin
->dirty_old_rstats
.insert(last
);
1993 dout(20) << " projecting to [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
1994 pi
->rstat
.add(delta
);
1995 dout(20) << " result [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
2000 pin
->reset_old_inodes(std::move(_old_inodes
));
2003 void MDCache::broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
, bool quota_change
)
2005 if (!(mds
->is_active() || mds
->is_stopping()))
2008 if (!in
->is_auth() || in
->is_frozen())
2011 const auto& pi
= in
->get_projected_inode();
2012 if (!pi
->quota
.is_enabled() && !quota_change
)
2015 // creaete snaprealm for quota inode (quota was set before mimic)
2016 if (!in
->get_projected_srnode())
2017 mds
->server
->create_quota_realm(in
);
2019 for (auto &p
: in
->client_caps
) {
2020 Capability
*cap
= &p
.second
;
2021 if (cap
->is_noquota())
2024 if (exclude_ct
>= 0 && exclude_ct
!= p
.first
)
2027 if (cap
->last_rbytes
== pi
->rstat
.rbytes
&&
2028 cap
->last_rsize
== pi
->rstat
.rsize())
2031 if (pi
->quota
.max_files
> 0) {
2032 if (pi
->rstat
.rsize() >= pi
->quota
.max_files
)
2035 if ((abs(cap
->last_rsize
- pi
->quota
.max_files
) >> 4) <
2036 abs(cap
->last_rsize
- pi
->rstat
.rsize()))
2040 if (pi
->quota
.max_bytes
> 0) {
2041 if (pi
->rstat
.rbytes
> pi
->quota
.max_bytes
- (pi
->quota
.max_bytes
>> 3))
2044 if ((abs(cap
->last_rbytes
- pi
->quota
.max_bytes
) >> 4) <
2045 abs(cap
->last_rbytes
- pi
->rstat
.rbytes
))
2052 cap
->last_rsize
= pi
->rstat
.rsize();
2053 cap
->last_rbytes
= pi
->rstat
.rbytes
;
2055 auto msg
= make_message
<MClientQuota
>();
2056 msg
->ino
= in
->ino();
2057 msg
->rstat
= pi
->rstat
;
2058 msg
->quota
= pi
->quota
;
2059 mds
->send_message_client_counted(msg
, cap
->get_session());
2061 for (const auto &it
: in
->get_replicas()) {
2062 auto msg
= make_message
<MGatherCaps
>();
2063 msg
->ino
= in
->ino();
2064 mds
->send_message_mds(msg
, it
.first
);
2069 * NOTE: we _have_ to delay the scatter if we are called during a
2070 * rejoin, because we can't twiddle locks between when the
2071 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2072 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2073 * (no requests), and a survivor acks immediately. _except_ that
2074 * during rejoin_(weak|strong) processing, we may complete a lock
2075 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2076 * scatterlock state in that case or the lock states will get out of
2077 * sync between the auth and replica.
2079 * the simple solution is to never do the scatter here. instead, put
2080 * the scatterlock on a list if it isn't already wrlockable. this is
2081 * probably the best plan anyway, since we avoid too many
2082 * scatters/locks under normal usage.
2085 * some notes on dirlock/nestlock scatterlock semantics:
2087 * the fragstat (dirlock) will never be updated without
2088 * dirlock+nestlock wrlock held by the caller.
2090 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2091 * data is pushed up the tree. this could be changed with some
2092 * restructuring here, but in its current form we ensure that the
2093 * fragstat+rstat _always_ reflect an accurate summation over the dir
2094 * frag, which is nice. and, we only need to track frags that need to
2095 * be nudged (and not inodes with pending rstat changes that need to
2096 * be pushed into the frag). a consequence of this is that the
2097 * accounted_rstat on scatterlock sync may not match our current
2098 * rstat. this is normal and expected.
2100 void MDCache::predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
2101 CInode
*in
, CDir
*parent
,
2102 int flags
, int linkunlink
,
2105 bool primary_dn
= flags
& PREDIRTY_PRIMARY
;
2106 bool do_parent_mtime
= flags
& PREDIRTY_DIR
;
2107 bool shallow
= flags
& PREDIRTY_SHALLOW
;
2109 ceph_assert(mds
->mdlog
->entry_is_open());
2111 // make sure stamp is set
2112 if (mut
->get_mds_stamp() == utime_t())
2113 mut
->set_mds_stamp(ceph_clock_now());
2118 dout(10) << "predirty_journal_parents"
2119 << (do_parent_mtime
? " do_parent_mtime":"")
2120 << " linkunlink=" << linkunlink
2121 << (primary_dn
? " primary_dn":" remote_dn")
2122 << (shallow
? " SHALLOW":"")
2123 << " follows " << cfollows
2124 << " " << *in
<< dendl
;
2127 ceph_assert(primary_dn
);
2128 parent
= in
->get_projected_parent_dn()->get_dir();
2131 if (flags
== 0 && linkunlink
== 0) {
2132 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl
;
2133 blob
->add_dir_context(parent
);
2137 // build list of inodes to wrlock, dirty, and update
2140 CDentry
*parentdn
= NULL
;
2143 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2144 ceph_assert(parent
->is_auth());
2146 // opportunistically adjust parent dirfrag
2147 CInode
*pin
= parent
->get_inode();
2150 mut
->auth_pin(parent
);
2152 auto pf
= parent
->project_fnode(mut
);
2153 pf
->version
= parent
->pre_dirty();
2155 if (do_parent_mtime
|| linkunlink
) {
2156 ceph_assert(mut
->is_wrlocked(&pin
->filelock
));
2157 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
));
2158 ceph_assert(cfollows
== CEPH_NOSNAP
);
2160 // update stale fragstat/rstat?
2161 parent
->resync_accounted_fragstat();
2162 parent
->resync_accounted_rstat();
2164 if (do_parent_mtime
) {
2165 pf
->fragstat
.mtime
= mut
->get_op_stamp();
2166 pf
->fragstat
.change_attr
++;
2167 dout(10) << "predirty_journal_parents bumping fragstat change_attr to " << pf
->fragstat
.change_attr
<< " on " << parent
<< dendl
;
2168 if (pf
->fragstat
.mtime
> pf
->rstat
.rctime
) {
2169 dout(10) << "predirty_journal_parents updating mtime on " << *parent
<< dendl
;
2170 pf
->rstat
.rctime
= pf
->fragstat
.mtime
;
2172 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent
<< dendl
;
2176 dout(10) << "predirty_journal_parents updating size on " << *parent
<< dendl
;
2178 pf
->fragstat
.nsubdirs
+= linkunlink
;
2179 //pf->rstat.rsubdirs += linkunlink;
2181 pf
->fragstat
.nfiles
+= linkunlink
;
2182 //pf->rstat.rfiles += linkunlink;
2189 // don't update parent this pass
2190 } else if (!linkunlink
&& !(pin
->nestlock
.can_wrlock(-1) &&
2191 pin
->versionlock
.can_wrlock())) {
2192 dout(20) << " unwritable parent nestlock " << pin
->nestlock
2193 << ", marking dirty rstat on " << *cur
<< dendl
;
2194 cur
->mark_dirty_rstat();
2196 // if we don't hold a wrlock reference on this nestlock, take one,
2197 // because we are about to write into the dirfrag fnode and that needs
2198 // to commit before the lock can cycle.
2200 ceph_assert(pin
->nestlock
.get_num_wrlocks() || mut
->is_peer());
2203 if (!mut
->is_wrlocked(&pin
->nestlock
)) {
2204 dout(10) << " taking wrlock on " << pin
->nestlock
<< " on " << *pin
<< dendl
;
2205 mds
->locker
->wrlock_force(&pin
->nestlock
, mut
);
2208 // now we can project the inode rstat diff the dirfrag
2209 SnapRealm
*prealm
= pin
->find_snaprealm();
2211 snapid_t follows
= cfollows
;
2212 if (follows
== CEPH_NOSNAP
)
2213 follows
= prealm
->get_newest_seq();
2215 snapid_t first
= follows
+1;
2217 // first, if the frag is stale, bring it back in sync.
2218 parent
->resync_accounted_rstat();
2220 // now push inode rstats into frag
2221 project_rstat_inode_to_frag(mut
, cur
, parent
, first
, linkunlink
, prealm
);
2222 cur
->clear_dirty_rstat();
2226 if (!pin
->is_auth() || (!mut
->is_auth_pinned(pin
) && !pin
->can_auth_pin())) {
2227 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin
<< dendl
;
2231 // delay propagating until later?
2232 if (!stop
&& !first
&&
2233 g_conf()->mds_dirstat_min_interval
> 0) {
2234 double since_last_prop
= mut
->get_mds_stamp() - pin
->last_dirstat_prop
;
2235 if (since_last_prop
< g_conf()->mds_dirstat_min_interval
) {
2236 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2237 << " < " << g_conf()->mds_dirstat_min_interval
2238 << ", stopping" << dendl
;
2241 dout(10) << "predirty_journal_parents last prop " << since_last_prop
<< " ago, continuing" << dendl
;
2245 // can cast only because i'm passing nowait=true in the sole user
2247 !mut
->is_wrlocked(&pin
->nestlock
) &&
2248 (!pin
->versionlock
.can_wrlock() || // make sure we can take versionlock, too
2249 !mds
->locker
->wrlock_try(&pin
->nestlock
, mut
)
2250 )) { // ** do not initiate.. see above comment **
2251 dout(10) << "predirty_journal_parents can't wrlock one of " << pin
->versionlock
<< " or " << pin
->nestlock
2252 << " on " << *pin
<< dendl
;
2256 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin
<< dendl
;
2257 mds
->locker
->mark_updated_scatterlock(&pin
->nestlock
);
2258 mut
->ls
->dirty_dirfrag_nest
.push_back(&pin
->item_dirty_dirfrag_nest
);
2259 mut
->add_updated_lock(&pin
->nestlock
);
2260 if (do_parent_mtime
|| linkunlink
) {
2261 mds
->locker
->mark_updated_scatterlock(&pin
->filelock
);
2262 mut
->ls
->dirty_dirfrag_dir
.push_back(&pin
->item_dirty_dirfrag_dir
);
2263 mut
->add_updated_lock(&pin
->filelock
);
2267 if (!mut
->is_wrlocked(&pin
->versionlock
))
2268 mds
->locker
->local_wrlock_grab(&pin
->versionlock
, mut
);
2270 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
) || mut
->is_peer());
2272 pin
->last_dirstat_prop
= mut
->get_mds_stamp();
2276 lsi
.push_front(pin
);
2278 pin
->pre_cow_old_inode(); // avoid cow mayhem!
2280 auto pi
= pin
->project_inode(mut
);
2281 pi
.inode
->version
= pin
->pre_dirty();
2284 if (do_parent_mtime
|| linkunlink
) {
2285 dout(20) << "predirty_journal_parents add_delta " << pf
->fragstat
<< dendl
;
2286 dout(20) << "predirty_journal_parents - " << pf
->accounted_fragstat
<< dendl
;
2287 bool touched_mtime
= false, touched_chattr
= false;
2288 pi
.inode
->dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2289 pf
->accounted_fragstat
= pf
->fragstat
;
2291 pi
.inode
->mtime
= pi
.inode
->ctime
= pi
.inode
->dirstat
.mtime
;
2293 pi
.inode
->change_attr
++;
2294 dout(20) << "predirty_journal_parents gives " << pi
.inode
->dirstat
<< " on " << *pin
<< dendl
;
2296 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2297 if (pi
.inode
->dirstat
.size() < 0)
2298 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter
);
2299 if (pi
.inode
->dirstat
.size() != pf
->fragstat
.size()) {
2300 mds
->clog
->error() << "unmatched fragstat size on single dirfrag "
2301 << parent
->dirfrag() << ", inode has " << pi
.inode
->dirstat
2302 << ", dirfrag has " << pf
->fragstat
;
2304 // trust the dirfrag for now
2305 pi
.inode
->dirstat
= pf
->fragstat
;
2307 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter
);
2313 dout(10) << "predirty_journal_parents frag->inode on " << *parent
<< dendl
;
2315 // first, if the frag is stale, bring it back in sync.
2316 parent
->resync_accounted_rstat();
2318 if (g_conf()->mds_snap_rstat
) {
2319 for (auto &p
: parent
->dirty_old_rstat
) {
2320 project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
, p
.second
.first
,
2321 p
.first
, pin
, true);
2324 parent
->dirty_old_rstat
.clear();
2325 project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
, parent
->first
, CEPH_NOSNAP
, pin
, true);//false);
2327 pf
->accounted_rstat
= pf
->rstat
;
2329 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2330 if (pi
.inode
->rstat
.rbytes
!= pf
->rstat
.rbytes
) {
2331 mds
->clog
->error() << "unmatched rstat rbytes on single dirfrag "
2332 << parent
->dirfrag() << ", inode has " << pi
.inode
->rstat
2333 << ", dirfrag has " << pf
->rstat
;
2335 // trust the dirfrag for now
2336 pi
.inode
->rstat
= pf
->rstat
;
2338 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter
);
2342 parent
->check_rstats();
2343 broadcast_quota_to_client(pin
);
2348 parentdn
= pin
->get_projected_parent_dn();
2349 ceph_assert(parentdn
);
2350 parent
= parentdn
->get_dir();
2352 do_parent_mtime
= false;
2357 // now, stick it in the blob
2358 ceph_assert(parent
);
2359 ceph_assert(parent
->is_auth());
2360 blob
->add_dir_context(parent
);
2361 blob
->add_dir(parent
, true);
2362 for (const auto& in
: lsi
) {
2363 journal_dirty_inode(mut
.get(), blob
, in
);
// ===================================

/*
 * some handlers for leader requests with peers. we need to make
 * sure leader journal commits before we forget we leadered them and
 * remove them from the uncommitted_leaders map (used during recovery
 * to commit|abort peers).
 */
2382 struct C_MDC_CommittedLeader
: public MDCacheLogContext
{
2384 C_MDC_CommittedLeader(MDCache
*s
, metareqid_t r
) : MDCacheLogContext(s
), reqid(r
) {}
2385 void finish(int r
) override
{
2386 mdcache
->_logged_leader_commit(reqid
);
2390 void MDCache::log_leader_commit(metareqid_t reqid
)
2392 dout(10) << "log_leader_commit " << reqid
<< dendl
;
2393 uncommitted_leaders
[reqid
].committing
= true;
2394 mds
->mdlog
->start_submit_entry(new ECommitted(reqid
),
2395 new C_MDC_CommittedLeader(this, reqid
));
2398 void MDCache::_logged_leader_commit(metareqid_t reqid
)
2400 dout(10) << "_logged_leader_commit " << reqid
<< dendl
;
2401 ceph_assert(uncommitted_leaders
.count(reqid
));
2402 uncommitted_leaders
[reqid
].ls
->uncommitted_leaders
.erase(reqid
);
2403 mds
->queue_waiters(uncommitted_leaders
[reqid
].waiters
);
2404 uncommitted_leaders
.erase(reqid
);
2409 void MDCache::committed_leader_peer(metareqid_t r
, mds_rank_t from
)
2411 dout(10) << "committed_leader_peer mds." << from
<< " on " << r
<< dendl
;
2412 ceph_assert(uncommitted_leaders
.count(r
));
2413 uncommitted_leaders
[r
].peers
.erase(from
);
2414 if (!uncommitted_leaders
[r
].recovering
&& uncommitted_leaders
[r
].peers
.empty())
2415 log_leader_commit(r
);
2418 void MDCache::logged_leader_update(metareqid_t reqid
)
2420 dout(10) << "logged_leader_update " << reqid
<< dendl
;
2421 ceph_assert(uncommitted_leaders
.count(reqid
));
2422 uncommitted_leaders
[reqid
].safe
= true;
2423 auto p
= pending_leaders
.find(reqid
);
2424 if (p
!= pending_leaders
.end()) {
2425 pending_leaders
.erase(p
);
2426 if (pending_leaders
.empty())
2427 process_delayed_resolve();
/*
 * Leader may crash after receiving all peers' commit acks, but before journalling
 * the final commit. Peers may crash after journalling the peer commit, but before
 * sending the commit ack to the leader. Commit leaders with no uncommitted peer
 * when resolve finishes.
 */
2437 void MDCache::finish_committed_leaders()
2439 for (map
<metareqid_t
, uleader
>::iterator p
= uncommitted_leaders
.begin();
2440 p
!= uncommitted_leaders
.end();
2442 p
->second
.recovering
= false;
2443 if (!p
->second
.committing
&& p
->second
.peers
.empty()) {
2444 dout(10) << "finish_committed_leaders " << p
->first
<< dendl
;
2445 log_leader_commit(p
->first
);
/*
 * at end of resolve... we must journal a commit|abort for all peer
 * updates, before moving on.
 *
 * this is so that the leader can safely journal ECommitted on ops it
 * leaders when it reaches up:active (all other recovering nodes must
 * complete resolve before that happens).
 */
2458 struct C_MDC_PeerCommit
: public MDCacheLogContext
{
2461 C_MDC_PeerCommit(MDCache
*c
, int f
, metareqid_t r
) : MDCacheLogContext(c
), from(f
), reqid(r
) {}
2462 void finish(int r
) override
{
2463 mdcache
->_logged_peer_commit(from
, reqid
);
2467 void MDCache::_logged_peer_commit(mds_rank_t from
, metareqid_t reqid
)
2469 dout(10) << "_logged_peer_commit from mds." << from
<< " " << reqid
<< dendl
;
2472 auto req
= make_message
<MMDSPeerRequest
>(reqid
, 0, MMDSPeerRequest::OP_COMMITTED
);
2473 mds
->send_message_mds(req
, from
);
2481 // ====================================================================
2482 // import map, recovery
2484 void MDCache::_move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
2485 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
)
2487 if (subtrees
.count(oldparent
)) {
2488 vector
<dirfrag_t
>& v
= subtrees
[oldparent
];
2489 dout(10) << " removing " << df
<< " from " << oldparent
<< " bounds " << v
<< dendl
;
2490 for (vector
<dirfrag_t
>::iterator it
= v
.begin(); it
!= v
.end(); ++it
)
2496 if (subtrees
.count(newparent
)) {
2497 vector
<dirfrag_t
>& v
= subtrees
[newparent
];
2498 dout(10) << " adding " << df
<< " to " << newparent
<< " bounds " << v
<< dendl
;
2503 ESubtreeMap
*MDCache::create_subtree_map()
2505 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2506 << num_subtrees_fullauth() << " fullauth"
2511 ESubtreeMap
*le
= new ESubtreeMap();
2512 mds
->mdlog
->_start_entry(le
);
2514 map
<dirfrag_t
, CDir
*> dirs_to_add
;
2517 CDir
* mydir
= myin
->get_dirfrag(frag_t());
2518 dirs_to_add
[mydir
->dirfrag()] = mydir
;
2521 // include all auth subtrees, and their bounds.
2522 // and a spanning tree to tie it to the root.
2523 for (auto& [dir
, bounds
] : subtrees
) {
2524 // journal subtree as "ours" if we are
2527 // me, !me (may be importing and ambiguous!)
2531 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2534 if (migrator
->is_ambiguous_import(dir
->dirfrag()) ||
2535 my_ambiguous_imports
.count(dir
->dirfrag())) {
2536 dout(15) << " ambig subtree " << *dir
<< dendl
;
2537 le
->ambiguous_subtrees
.insert(dir
->dirfrag());
2539 dout(15) << " auth subtree " << *dir
<< dendl
;
2542 dirs_to_add
[dir
->dirfrag()] = dir
;
2543 le
->subtrees
[dir
->dirfrag()].clear();
2546 size_t nbounds
= bounds
.size();
2548 dout(15) << " subtree has " << nbounds
<< " bounds" << dendl
;
2550 for (auto& bound
: bounds
) {
2552 dout(15) << " subtree bound " << *bound
<< dendl
;
2554 dirs_to_add
[bound
->dirfrag()] = bound
;
2555 le
->subtrees
[dir
->dirfrag()].push_back(bound
->dirfrag());
2559 // apply projected renames
2560 for (const auto& [diri
, renames
] : projected_subtree_renames
) {
2561 for (const auto& [olddir
, newdir
] : renames
) {
2562 dout(15) << " adjusting for projected rename of " << *diri
<< " to " << *newdir
<< dendl
;
2564 auto&& dfls
= diri
->get_dirfrags();
2565 for (const auto& dir
: dfls
) {
2566 dout(15) << "dirfrag " << dir
->dirfrag() << " " << *dir
<< dendl
;
2567 CDir
*oldparent
= get_projected_subtree_root(olddir
);
2568 dout(15) << " old parent " << oldparent
->dirfrag() << " " << *oldparent
<< dendl
;
2569 CDir
*newparent
= get_projected_subtree_root(newdir
);
2570 dout(15) << " new parent " << newparent
->dirfrag() << " " << *newparent
<< dendl
;
2572 if (oldparent
== newparent
) {
2573 dout(15) << "parent unchanged for " << dir
->dirfrag() << " at "
2574 << oldparent
->dirfrag() << dendl
;
2578 if (dir
->is_subtree_root()) {
2579 if (le
->subtrees
.count(newparent
->dirfrag()) &&
2580 oldparent
->get_dir_auth() != newparent
->get_dir_auth())
2581 dirs_to_add
[dir
->dirfrag()] = dir
;
2582 // children are fine. change parent.
2583 _move_subtree_map_bound(dir
->dirfrag(), oldparent
->dirfrag(), newparent
->dirfrag(),
2588 if (oldparent
->get_dir_auth() != newparent
->get_dir_auth()) {
2589 dout(10) << " creating subtree for " << dir
->dirfrag() << dendl
;
2590 // if oldparent is auth, subtree is mine; include it.
2591 if (le
->subtrees
.count(oldparent
->dirfrag())) {
2592 dirs_to_add
[dir
->dirfrag()] = dir
;
2593 le
->subtrees
[dir
->dirfrag()].clear();
2595 // if newparent is auth, subtree is a new bound
2596 if (le
->subtrees
.count(newparent
->dirfrag())) {
2597 dirs_to_add
[dir
->dirfrag()] = dir
;
2598 le
->subtrees
[newparent
->dirfrag()].push_back(dir
->dirfrag()); // newparent is auth; new bound
2603 // see if any old bounds move to the new parent.
2604 for (auto& bound
: subtrees
.at(oldparent
)) {
2605 if (dir
->contains(bound
->get_parent_dir()))
2606 _move_subtree_map_bound(bound
->dirfrag(), oldparent
->dirfrag(), newparent
->dirfrag(),
2614 // simplify the journaled map. our in memory map may have more
2615 // subtrees than needed due to migrations that are just getting
2616 // started or just completing. but on replay, the "live" map will
2617 // be simple and we can do a straight comparison.
2618 for (auto& [frag
, bfrags
] : le
->subtrees
) {
2619 if (le
->ambiguous_subtrees
.count(frag
))
2622 while (i
< bfrags
.size()) {
2623 dirfrag_t b
= bfrags
[i
];
2624 if (le
->subtrees
.count(b
) &&
2625 le
->ambiguous_subtrees
.count(b
) == 0) {
2626 auto& bb
= le
->subtrees
.at(b
);
2627 dout(10) << "simplify: " << frag
<< " swallowing " << b
<< " with bounds " << bb
<< dendl
;
2628 for (auto& r
: bb
) {
2629 bfrags
.push_back(r
);
2631 dirs_to_add
.erase(b
);
2632 le
->subtrees
.erase(b
);
2633 bfrags
.erase(bfrags
.begin() + i
);
2640 for (auto &p
: dirs_to_add
) {
2641 CDir
*dir
= p
.second
;
2642 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
2643 le
->metablob
.add_dir(dir
, false);
2646 dout(15) << " subtrees " << le
->subtrees
<< dendl
;
2647 dout(15) << " ambiguous_subtrees " << le
->ambiguous_subtrees
<< dendl
;
2649 //le->metablob.print(cout);
2650 le
->expire_pos
= mds
->mdlog
->journaler
->get_expire_pos();
2654 void MDCache::dump_resolve_status(Formatter
*f
) const
2656 f
->open_object_section("resolve_status");
2657 f
->dump_stream("resolve_gather") << resolve_gather
;
2658 f
->dump_stream("resolve_ack_gather") << resolve_gather
;
2662 void MDCache::resolve_start(MDSContext
*resolve_done_
)
2664 dout(10) << "resolve_start" << dendl
;
2665 ceph_assert(!resolve_done
);
2666 resolve_done
.reset(resolve_done_
);
2668 if (mds
->mdsmap
->get_root() != mds
->get_nodeid()) {
2669 // if we don't have the root dir, adjust it to UNKNOWN. during
2670 // resolve we want mds0 to explicit claim the portion of it that
2671 // it owns, so that anything beyond its bounds get left as
2673 CDir
*rootdir
= root
->get_dirfrag(frag_t());
2675 adjust_subtree_auth(rootdir
, CDIR_AUTH_UNKNOWN
);
2677 resolve_gather
= recovery_set
;
2679 resolve_snapclient_commits
= mds
->snapclient
->get_journaled_tids();
2682 void MDCache::send_resolves()
2684 send_peer_resolves();
2686 if (!resolve_done
) {
2687 // I'm survivor: refresh snap cache
2688 mds
->snapclient
->sync(
2689 new MDSInternalContextWrapper(mds
,
2690 new LambdaContext([this](int r
) {
2691 maybe_finish_peer_resolve();
2695 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl
;
2698 if (!resolve_ack_gather
.empty()) {
2699 dout(10) << "send_resolves still waiting for resolve ack from ("
2700 << resolve_ack_gather
<< ")" << dendl
;
2703 if (!resolve_need_rollback
.empty()) {
2704 dout(10) << "send_resolves still waiting for rollback to commit on ("
2705 << resolve_need_rollback
<< ")" << dendl
;
2709 send_subtree_resolves();
2712 void MDCache::send_peer_resolves()
2714 dout(10) << "send_peer_resolves" << dendl
;
2716 map
<mds_rank_t
, ref_t
<MMDSResolve
>> resolves
;
2718 if (mds
->is_resolve()) {
2719 for (map
<metareqid_t
, upeer
>::iterator p
= uncommitted_peers
.begin();
2720 p
!= uncommitted_peers
.end();
2722 mds_rank_t leader
= p
->second
.leader
;
2723 auto &m
= resolves
[leader
];
2724 if (!m
) m
= make_message
<MMDSResolve
>();
2725 m
->add_peer_request(p
->first
, false);
2728 set
<mds_rank_t
> resolve_set
;
2729 mds
->mdsmap
->get_mds_set(resolve_set
, MDSMap::STATE_RESOLVE
);
2730 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2731 p
!= active_requests
.end();
2733 MDRequestRef
& mdr
= p
->second
;
2734 if (!mdr
->is_peer())
2736 if (!mdr
->peer_did_prepare() && !mdr
->committing
) {
2739 mds_rank_t leader
= mdr
->peer_to_mds
;
2740 if (resolve_set
.count(leader
) || is_ambiguous_peer_update(p
->first
, leader
)) {
2741 dout(10) << " including uncommitted " << *mdr
<< dendl
;
2742 if (!resolves
.count(leader
))
2743 resolves
[leader
] = make_message
<MMDSResolve
>();
2744 if (!mdr
->committing
&&
2745 mdr
->has_more() && mdr
->more()->is_inode_exporter
) {
2746 // re-send cap exports
2747 CInode
*in
= mdr
->more()->rename_inode
;
2748 map
<client_t
, Capability::Export
> cap_map
;
2749 in
->export_client_caps(cap_map
);
2751 MMDSResolve::peer_inode_cap
inode_caps(in
->ino(), cap_map
);
2752 encode(inode_caps
, bl
);
2753 resolves
[leader
]->add_peer_request(p
->first
, bl
);
2755 resolves
[leader
]->add_peer_request(p
->first
, mdr
->committing
);
2761 for (auto &p
: resolves
) {
2762 dout(10) << "sending peer resolve to mds." << p
.first
<< dendl
;
2763 mds
->send_message_mds(p
.second
, p
.first
);
2764 resolve_ack_gather
.insert(p
.first
);
2768 void MDCache::send_subtree_resolves()
2770 dout(10) << "send_subtree_resolves" << dendl
;
2772 if (migrator
->is_exporting() || migrator
->is_importing()) {
2773 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl
;
2774 migrator
->show_importing();
2775 migrator
->show_exporting();
2776 resolves_pending
= true;
2780 map
<mds_rank_t
, ref_t
<MMDSResolve
>> resolves
;
2781 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
2782 p
!= recovery_set
.end();
2784 if (*p
== mds
->get_nodeid())
2786 if (mds
->is_resolve() || mds
->mdsmap
->is_resolve(*p
))
2787 resolves
[*p
] = make_message
<MMDSResolve
>();
2790 map
<dirfrag_t
, vector
<dirfrag_t
> > my_subtrees
;
2791 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambig_imports
;
2794 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
2795 p
!= subtrees
.end();
2797 CDir
*dir
= p
->first
;
2799 // only our subtrees
2800 if (dir
->authority().first
!= mds
->get_nodeid())
2803 if (mds
->is_resolve() && my_ambiguous_imports
.count(dir
->dirfrag()))
2804 continue; // we'll add it below
2806 if (migrator
->is_ambiguous_import(dir
->dirfrag())) {
2807 // ambiguous (mid-import)
2809 get_subtree_bounds(dir
, bounds
);
2810 vector
<dirfrag_t
> dfls
;
2811 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
)
2812 dfls
.push_back((*q
)->dirfrag());
2814 my_ambig_imports
[dir
->dirfrag()] = dfls
;
2815 dout(10) << " ambig " << dir
->dirfrag() << " " << dfls
<< dendl
;
2818 for (auto &q
: resolves
) {
2819 resolves
[q
.first
]->add_subtree(dir
->dirfrag());
2822 vector
<dirfrag_t
> dfls
;
2823 for (set
<CDir
*>::iterator q
= subtrees
[dir
].begin();
2824 q
!= subtrees
[dir
].end();
2827 dfls
.push_back(bound
->dirfrag());
2830 my_subtrees
[dir
->dirfrag()] = dfls
;
2831 dout(10) << " claim " << dir
->dirfrag() << " " << dfls
<< dendl
;
2836 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
2837 p
!= my_ambiguous_imports
.end();
2839 my_ambig_imports
[p
->first
] = p
->second
;
2840 dout(10) << " ambig " << p
->first
<< " " << p
->second
<< dendl
;
2843 // simplify the claimed subtree.
2844 for (auto p
= my_subtrees
.begin(); p
!= my_subtrees
.end(); ++p
) {
2846 while (i
< p
->second
.size()) {
2847 dirfrag_t b
= p
->second
[i
];
2848 if (my_subtrees
.count(b
)) {
2849 vector
<dirfrag_t
>& bb
= my_subtrees
[b
];
2850 dout(10) << " simplify: " << p
->first
<< " swallowing " << b
<< " with bounds " << bb
<< dendl
;
2851 for (vector
<dirfrag_t
>::iterator r
= bb
.begin(); r
!= bb
.end(); ++r
)
2852 p
->second
.push_back(*r
);
2853 my_subtrees
.erase(b
);
2854 p
->second
.erase(p
->second
.begin() + i
);
2862 for (auto &p
: resolves
) {
2863 const ref_t
<MMDSResolve
> &m
= p
.second
;
2864 if (mds
->is_resolve()) {
2865 m
->add_table_commits(TABLE_SNAP
, resolve_snapclient_commits
);
2867 m
->add_table_commits(TABLE_SNAP
, mds
->snapclient
->get_journaled_tids());
2869 m
->subtrees
= my_subtrees
;
2870 m
->ambiguous_imports
= my_ambig_imports
;
2871 dout(10) << "sending subtee resolve to mds." << p
.first
<< dendl
;
2872 mds
->send_message_mds(m
, p
.first
);
2874 resolves_pending
= false;
2877 void MDCache::maybe_finish_peer_resolve() {
2878 if (resolve_ack_gather
.empty() && resolve_need_rollback
.empty()) {
2879 // snap cache get synced or I'm in resolve state
2880 if (mds
->snapclient
->is_synced() || resolve_done
)
2881 send_subtree_resolves();
2882 process_delayed_resolve();
2886 void MDCache::handle_mds_failure(mds_rank_t who
)
2888 dout(7) << "handle_mds_failure mds." << who
<< dendl
;
2890 dout(1) << "handle_mds_failure mds." << who
<< " : recovery peers are " << recovery_set
<< dendl
;
2892 resolve_gather
.insert(who
);
2893 discard_delayed_resolve(who
);
2894 ambiguous_peer_updates
.erase(who
);
2896 rejoin_gather
.insert(who
);
2897 rejoin_sent
.erase(who
); // i need to send another
2898 rejoin_ack_sent
.erase(who
); // i need to send another
2899 rejoin_ack_gather
.erase(who
); // i'll need/get another.
2901 dout(10) << " resolve_gather " << resolve_gather
<< dendl
;
2902 dout(10) << " resolve_ack_gather " << resolve_ack_gather
<< dendl
;
2903 dout(10) << " rejoin_sent " << rejoin_sent
<< dendl
;
2904 dout(10) << " rejoin_gather " << rejoin_gather
<< dendl
;
2905 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather
<< dendl
;
2908 // tell the migrator too.
2909 migrator
->handle_mds_failure_or_stop(who
);
2911 // tell the balancer too.
2912 mds
->balancer
->handle_mds_failure(who
);
2914 // clean up any requests peer to/from this node
2915 list
<MDRequestRef
> finish
;
2916 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2917 p
!= active_requests
.end();
2919 MDRequestRef
& mdr
= p
->second
;
2920 // peer to the failed node?
2921 if (mdr
->peer_to_mds
== who
) {
2922 if (mdr
->peer_did_prepare()) {
2923 dout(10) << " peer request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2924 if (is_ambiguous_peer_update(p
->first
, mdr
->peer_to_mds
))
2925 remove_ambiguous_peer_update(p
->first
, mdr
->peer_to_mds
);
2927 if (!mdr
->more()->waiting_on_peer
.empty()) {
2928 ceph_assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2929 // will rollback, no need to wait
2930 mdr
->reset_peer_request();
2931 mdr
->more()->waiting_on_peer
.clear();
2933 } else if (!mdr
->committing
) {
2934 dout(10) << " peer request " << *mdr
<< " has no prepare, finishing up" << dendl
;
2935 if (mdr
->peer_request
|| mdr
->peer_rolling_back())
2936 mdr
->aborted
= true;
2938 finish
.push_back(mdr
);
2942 if (mdr
->is_peer() && mdr
->peer_did_prepare()) {
2943 if (mdr
->more()->waiting_on_peer
.count(who
)) {
2944 ceph_assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2945 dout(10) << " peer request " << *mdr
<< " no longer need rename notity ack from mds."
2947 mdr
->more()->waiting_on_peer
.erase(who
);
2948 if (mdr
->more()->waiting_on_peer
.empty() && mdr
->peer_request
)
2949 mds
->queue_waiter(new C_MDS_RetryRequest(this, mdr
));
2952 if (mdr
->more()->srcdn_auth_mds
== who
&&
2953 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(mdr
->peer_to_mds
)) {
2954 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2955 dout(10) << " peer request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2956 add_ambiguous_peer_update(p
->first
, mdr
->peer_to_mds
);
2958 } else if (mdr
->peer_request
) {
2959 const cref_t
<MMDSPeerRequest
> &peer_req
= mdr
->peer_request
;
2960 // FIXME: Peer rename request can arrive after we notice mds failure.
2961 // This can cause mds to crash (does not affect integrity of FS).
2962 if (peer_req
->get_op() == MMDSPeerRequest::OP_RENAMEPREP
&&
2963 peer_req
->srcdn_auth
== who
)
2964 peer_req
->mark_interrupted();
2967 // failed node is peer?
2968 if (mdr
->is_leader() && !mdr
->committing
) {
2969 if (mdr
->more()->srcdn_auth_mds
== who
) {
2970 dout(10) << " leader request " << *mdr
<< " waiting for rename srcdn's auth mds."
2971 << who
<< " to recover" << dendl
;
2972 ceph_assert(mdr
->more()->witnessed
.count(who
) == 0);
2973 if (mdr
->more()->is_ambiguous_auth
)
2974 mdr
->clear_ambiguous_auth();
2975 // rename srcdn's auth mds failed, all witnesses will rollback
2976 mdr
->more()->witnessed
.clear();
2977 pending_leaders
.erase(p
->first
);
2980 if (mdr
->more()->witnessed
.count(who
)) {
2981 mds_rank_t srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
2982 if (srcdn_auth
>= 0 && mdr
->more()->waiting_on_peer
.count(srcdn_auth
)) {
2983 dout(10) << " leader request " << *mdr
<< " waiting for rename srcdn's auth mds."
2984 << mdr
->more()->srcdn_auth_mds
<< " to reply" << dendl
;
2985 // waiting for the peer (rename srcdn's auth mds), delay sending resolve ack
2986 // until either the request is committing or the peer also fails.
2987 ceph_assert(mdr
->more()->waiting_on_peer
.size() == 1);
2988 pending_leaders
.insert(p
->first
);
2990 dout(10) << " leader request " << *mdr
<< " no longer witnessed by peer mds."
2991 << who
<< " to recover" << dendl
;
2992 if (srcdn_auth
>= 0)
2993 ceph_assert(mdr
->more()->witnessed
.count(srcdn_auth
) == 0);
2995 // discard this peer's prepare (if any)
2996 mdr
->more()->witnessed
.erase(who
);
3000 if (mdr
->more()->waiting_on_peer
.count(who
)) {
3001 dout(10) << " leader request " << *mdr
<< " waiting for peer mds." << who
3002 << " to recover" << dendl
;
3003 // retry request when peer recovers
3004 mdr
->more()->waiting_on_peer
.erase(who
);
3005 if (mdr
->more()->waiting_on_peer
.empty())
3006 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(this, mdr
));
3009 if (mdr
->locking
&& mdr
->locking_target_mds
== who
)
3010 mdr
->finish_locking(mdr
->locking
);
3014 for (map
<metareqid_t
, uleader
>::iterator p
= uncommitted_leaders
.begin();
3015 p
!= uncommitted_leaders
.end();
3017 // The failed MDS may have already committed the peer update
3018 if (p
->second
.peers
.count(who
)) {
3019 p
->second
.recovering
= true;
3020 p
->second
.peers
.erase(who
);
3024 while (!finish
.empty()) {
3025 dout(10) << "cleaning up peer request " << *finish
.front() << dendl
;
3026 request_finish(finish
.front());
3030 kick_find_ino_peers(who
);
3031 kick_open_ino_peers(who
);
3033 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
3034 p
!= fragments
.end(); ) {
3035 dirfrag_t df
= p
->first
;
3036 fragment_info_t
& info
= p
->second
;
3038 if (info
.is_fragmenting()) {
3039 if (info
.notify_ack_waiting
.erase(who
) &&
3040 info
.notify_ack_waiting
.empty()) {
3041 fragment_drop_locks(info
);
3042 fragment_maybe_finish(p
++);
3050 dout(10) << "cancelling fragment " << df
<< " bit " << info
.bits
<< dendl
;
3051 std::vector
<CDir
*> dirs
;
3052 info
.dirs
.swap(dirs
);
3053 fragments
.erase(df
);
3054 fragment_unmark_unfreeze_dirs(dirs
);
3057 // MDCache::shutdown_export_strays() always exports strays to mds.0
3058 if (who
== mds_rank_t(0))
3059 shutdown_exporting_strays
.clear();
/*
 * handle_mds_recovery - called on another node's transition
 * from resolve -> active.
 */
3068 void MDCache::handle_mds_recovery(mds_rank_t who
)
3070 dout(7) << "handle_mds_recovery mds." << who
<< dendl
;
3072 // exclude all discover waiters. kick_discovers() will do the job
3073 static const uint64_t i_mask
= CInode::WAIT_ANY_MASK
& ~CInode::WAIT_DIR
;
3074 static const uint64_t d_mask
= CDir::WAIT_ANY_MASK
& ~CDir::WAIT_DENTRY
;
3076 MDSContext::vec waiters
;
3078 // wake up any waiters in their subtrees
3079 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3080 p
!= subtrees
.end();
3082 CDir
*dir
= p
->first
;
3084 if (dir
->authority().first
!= who
||
3085 dir
->authority().second
== mds
->get_nodeid())
3087 ceph_assert(!dir
->is_auth());
3090 std::queue
<CDir
*> q
;
3093 while (!q
.empty()) {
3094 CDir
*d
= q
.front();
3096 d
->take_waiting(d_mask
, waiters
);
3098 // inode waiters too
3099 for (auto &p
: d
->items
) {
3100 CDentry
*dn
= p
.second
;
3101 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3102 if (dnl
->is_primary()) {
3103 dnl
->get_inode()->take_waiting(i_mask
, waiters
);
3106 auto&& ls
= dnl
->get_inode()->get_dirfrags();
3107 for (const auto& subdir
: ls
) {
3108 if (!subdir
->is_subtree_root())
3116 kick_open_ino_peers(who
);
3117 kick_find_ino_peers(who
);
3120 mds
->queue_waiters(waiters
);
3123 void MDCache::set_recovery_set(set
<mds_rank_t
>& s
)
3125 dout(7) << "set_recovery_set " << s
<< dendl
;
/*
 * during resolve state, we share resolves to determine who
 * is authoritative for which trees. we expect to get a resolve
 * from _everyone_ in the recovery_set (the mds cluster at the time of
 * the first failure).
 *
 * This function puts the passed message before returning.
 */
3138 void MDCache::handle_resolve(const cref_t
<MMDSResolve
> &m
)
3140 dout(7) << "handle_resolve from " << m
->get_source() << dendl
;
3141 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3143 if (mds
->get_state() < MDSMap::STATE_RESOLVE
) {
3144 if (mds
->get_want_state() == CEPH_MDS_STATE_RESOLVE
) {
3145 mds
->wait_for_resolve(new C_MDS_RetryMessage(mds
, m
));
3148 // wait until we reach the resolve stage!
3152 discard_delayed_resolve(from
);
3154 // ambiguous peer requests?
3155 if (!m
->peer_requests
.empty()) {
3156 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3157 for (auto p
= m
->peer_requests
.begin(); p
!= m
->peer_requests
.end(); ++p
) {
3158 if (uncommitted_leaders
.count(p
->first
) && !uncommitted_leaders
[p
->first
].safe
) {
3159 ceph_assert(!p
->second
.committing
);
3160 pending_leaders
.insert(p
->first
);
3164 if (!pending_leaders
.empty()) {
3165 dout(10) << " still have pending updates, delay processing peer resolve" << dendl
;
3166 delayed_resolve
[from
] = m
;
3171 auto ack
= make_message
<MMDSResolveAck
>();
3172 for (const auto &p
: m
->peer_requests
) {
3173 if (uncommitted_leaders
.count(p
.first
)) { //mds->sessionmap.have_completed_request(p.first)) {
3175 if (p
.second
.committing
) {
3176 // already committing, waiting for the OP_COMMITTED peer reply
3177 dout(10) << " already committing peer request " << p
<< " noop "<< dendl
;
3179 dout(10) << " ambiguous peer request " << p
<< " will COMMIT" << dendl
;
3180 ack
->add_commit(p
.first
);
3182 uncommitted_leaders
[p
.first
].peers
.insert(from
); // wait for peer OP_COMMITTED before we log ECommitted
3184 if (p
.second
.inode_caps
.length() > 0) {
3185 // peer wants to export caps (rename)
3186 ceph_assert(mds
->is_resolve());
3187 MMDSResolve::peer_inode_cap inode_caps
;
3188 auto q
= p
.second
.inode_caps
.cbegin();
3189 decode(inode_caps
, q
);
3190 inodeno_t ino
= inode_caps
.ino
;
3191 map
<client_t
,Capability::Export
> cap_exports
= inode_caps
.cap_exports
;
3192 ceph_assert(get_inode(ino
));
3194 for (map
<client_t
,Capability::Export
>::iterator q
= cap_exports
.begin();
3195 q
!= cap_exports
.end();
3197 Capability::Import
& im
= rejoin_imported_caps
[from
][ino
][q
->first
];
3198 im
.cap_id
= ++last_cap_id
; // assign a new cap ID
3200 im
.mseq
= q
->second
.mseq
;
3202 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
3204 rejoin_client_map
.emplace(q
->first
, session
->info
.inst
);
3207 // will process these caps in rejoin stage
3208 rejoin_peer_exports
[ino
].first
= from
;
3209 rejoin_peer_exports
[ino
].second
.swap(cap_exports
);
3211 // send information of imported caps back to peer
3212 encode(rejoin_imported_caps
[from
][ino
], ack
->commit
[p
.first
]);
3216 dout(10) << " ambiguous peer request " << p
<< " will ABORT" << dendl
;
3217 ceph_assert(!p
.second
.committing
);
3218 ack
->add_abort(p
.first
);
3221 mds
->send_message(ack
, m
->get_connection());
3225 if (!resolve_ack_gather
.empty() || !resolve_need_rollback
.empty()) {
3226 dout(10) << "delay processing subtree resolve" << dendl
;
3227 delayed_resolve
[from
] = m
;
3231 bool survivor
= false;
3232 // am i a surviving ambiguous importer?
3233 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3235 // check for any import success/failure (from this node)
3236 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
3237 while (p
!= my_ambiguous_imports
.end()) {
3238 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator next
= p
;
3240 CDir
*dir
= get_dirfrag(p
->first
);
3242 dout(10) << "checking ambiguous import " << *dir
<< dendl
;
3243 if (migrator
->is_importing(dir
->dirfrag()) &&
3244 migrator
->get_import_peer(dir
->dirfrag()) == from
) {
3245 ceph_assert(migrator
->get_import_state(dir
->dirfrag()) == Migrator::IMPORT_ACKING
);
3247 // check if sender claims the subtree
3248 bool claimed_by_sender
= false;
3249 for (const auto &q
: m
->subtrees
) {
3250 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3251 CDir
*base
= get_force_dirfrag(q
.first
, false);
3252 if (!base
|| !base
->contains(dir
))
3253 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3257 get_force_dirfrag_bound_set(q
.second
, bounds
);
3258 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
3260 if (bound
->contains(dir
)) {
3261 inside
= false; // nope, bound is dir or parent of dir, not inside.
3266 claimed_by_sender
= true;
3269 my_ambiguous_imports
.erase(p
); // no longer ambiguous.
3270 if (claimed_by_sender
) {
3271 dout(7) << "ambiguous import failed on " << *dir
<< dendl
;
3272 migrator
->import_reverse(dir
);
3274 dout(7) << "ambiguous import succeeded on " << *dir
<< dendl
;
3275 migrator
->import_finish(dir
, true);
3282 // update my dir_auth values
3283 // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
3284 // migrations between other nodes)
3285 for (const auto& p
: m
->subtrees
) {
3286 dout(10) << "peer claims " << p
.first
<< " bounds " << p
.second
<< dendl
;
3287 CDir
*dir
= get_force_dirfrag(p
.first
, !survivor
);
3290 adjust_bounded_subtree_auth(dir
, p
.second
, from
);
3291 try_subtree_merge(dir
);
3296 // note ambiguous imports too
3297 for (const auto& p
: m
->ambiguous_imports
) {
3298 dout(10) << "noting ambiguous import on " << p
.first
<< " bounds " << p
.second
<< dendl
;
3299 other_ambiguous_imports
[from
][p
.first
] = p
.second
;
3302 // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
3303 // snaptable cache from snapserver. This way, the snaptable cache gets synced among all mds
3304 for (const auto& p
: m
->table_clients
) {
3305 dout(10) << " noting " << get_mdstable_name(p
.type
)
3306 << " pending_commits " << p
.pending_commits
<< dendl
;
3307 MDSTableClient
*client
= mds
->get_table_client(p
.type
);
3308 for (const auto& q
: p
.pending_commits
)
3309 client
->notify_commit(q
);
3312 // did i get them all?
3313 resolve_gather
.erase(from
);
3315 maybe_resolve_finish();
3318 void MDCache::process_delayed_resolve()
3320 dout(10) << "process_delayed_resolve" << dendl
;
3321 map
<mds_rank_t
, cref_t
<MMDSResolve
>> tmp
;
3322 tmp
.swap(delayed_resolve
);
3323 for (auto &p
: tmp
) {
3324 handle_resolve(p
.second
);
3328 void MDCache::discard_delayed_resolve(mds_rank_t who
)
3330 delayed_resolve
.erase(who
);
3333 void MDCache::maybe_resolve_finish()
3335 ceph_assert(resolve_ack_gather
.empty());
3336 ceph_assert(resolve_need_rollback
.empty());
3338 if (!resolve_gather
.empty()) {
3339 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3340 << resolve_gather
<< ")" << dendl
;
3344 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl
;
3345 disambiguate_my_imports();
3346 finish_committed_leaders();
3349 ceph_assert(mds
->is_resolve());
3350 trim_unlinked_inodes();
3351 recalc_auth_bits(false);
3352 resolve_done
.release()->complete(0);
3355 maybe_send_pending_rejoins();
3359 void MDCache::handle_resolve_ack(const cref_t
<MMDSResolveAck
> &ack
)
3361 dout(10) << "handle_resolve_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
3362 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3364 if (!resolve_ack_gather
.count(from
) ||
3365 mds
->mdsmap
->get_state(from
) < MDSMap::STATE_RESOLVE
) {
3369 if (ambiguous_peer_updates
.count(from
)) {
3370 ceph_assert(mds
->mdsmap
->is_clientreplay_or_active_or_stopping(from
));
3371 ceph_assert(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3374 for (const auto &p
: ack
->commit
) {
3375 dout(10) << " commit on peer " << p
.first
<< dendl
;
3377 if (ambiguous_peer_updates
.count(from
)) {
3378 remove_ambiguous_peer_update(p
.first
, from
);
3382 if (mds
->is_resolve()) {
3384 MDPeerUpdate
*su
= get_uncommitted_peer(p
.first
, from
);
3388 mds
->mdlog
->start_submit_entry(new EPeerUpdate(mds
->mdlog
, "unknown", p
.first
, from
,
3389 EPeerUpdate::OP_COMMIT
, su
->origop
),
3390 new C_MDC_PeerCommit(this, from
, p
.first
));
3391 mds
->mdlog
->flush();
3393 finish_uncommitted_peer(p
.first
);
3395 MDRequestRef mdr
= request_get(p
.first
);
3396 // information about leader imported caps
3397 if (p
.second
.length() > 0)
3398 mdr
->more()->inode_import
.share(p
.second
);
3400 ceph_assert(mdr
->peer_request
== 0); // shouldn't be doing anything!
3401 request_finish(mdr
);
3405 for (const auto &metareq
: ack
->abort
) {
3406 dout(10) << " abort on peer " << metareq
<< dendl
;
3408 if (mds
->is_resolve()) {
3409 MDPeerUpdate
*su
= get_uncommitted_peer(metareq
, from
);
3412 // perform rollback (and journal a rollback entry)
3413 // note: this will hold up the resolve a bit, until the rollback entries journal.
3414 MDRequestRef null_ref
;
3415 switch (su
->origop
) {
3416 case EPeerUpdate::LINK
:
3417 mds
->server
->do_link_rollback(su
->rollback
, from
, null_ref
);
3419 case EPeerUpdate::RENAME
:
3420 mds
->server
->do_rename_rollback(su
->rollback
, from
, null_ref
);
3422 case EPeerUpdate::RMDIR
:
3423 mds
->server
->do_rmdir_rollback(su
->rollback
, from
, null_ref
);
3429 MDRequestRef mdr
= request_get(metareq
);
3430 mdr
->aborted
= true;
3431 if (mdr
->peer_request
) {
3432 if (mdr
->peer_did_prepare()) // journaling peer prepare ?
3433 add_rollback(metareq
, from
);
3435 request_finish(mdr
);
3440 if (!ambiguous_peer_updates
.count(from
)) {
3441 resolve_ack_gather
.erase(from
);
3442 maybe_finish_peer_resolve();
3446 void MDCache::add_uncommitted_peer(metareqid_t reqid
, LogSegment
*ls
, mds_rank_t leader
, MDPeerUpdate
*su
)
3448 auto const &ret
= uncommitted_peers
.emplace(std::piecewise_construct
,
3449 std::forward_as_tuple(reqid
),
3450 std::forward_as_tuple());
3451 ceph_assert(ret
.second
);
3452 ls
->uncommitted_peers
.insert(reqid
);
3453 upeer
&u
= ret
.first
->second
;
3457 if (su
== nullptr) {
3460 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
)
3461 uncommitted_peer_rename_olddir
[*p
]++;
3462 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
)
3463 uncommitted_peer_unlink
[*p
]++;
3466 void MDCache::finish_uncommitted_peer(metareqid_t reqid
, bool assert_exist
)
3468 auto it
= uncommitted_peers
.find(reqid
);
3469 if (it
== uncommitted_peers
.end()) {
3470 ceph_assert(!assert_exist
);
3473 upeer
&u
= it
->second
;
3474 MDPeerUpdate
* su
= u
.su
;
3476 if (!u
.waiters
.empty()) {
3477 mds
->queue_waiters(u
.waiters
);
3479 u
.ls
->uncommitted_peers
.erase(reqid
);
3480 uncommitted_peers
.erase(it
);
3482 if (su
== nullptr) {
3485 // discard the non-auth subtree we renamed out of
3486 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
) {
3488 map
<CInode
*, int>::iterator it
= uncommitted_peer_rename_olddir
.find(diri
);
3489 ceph_assert(it
!= uncommitted_peer_rename_olddir
.end());
3491 if (it
->second
== 0) {
3492 uncommitted_peer_rename_olddir
.erase(it
);
3493 auto&& ls
= diri
->get_dirfrags();
3494 for (const auto& dir
: ls
) {
3495 CDir
*root
= get_subtree_root(dir
);
3496 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
3497 try_trim_non_auth_subtree(root
);
3503 ceph_assert(it
->second
> 0);
3505 // removed the inodes that were unlinked by peer update
3506 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
) {
3508 map
<CInode
*, int>::iterator it
= uncommitted_peer_unlink
.find(in
);
3509 ceph_assert(it
!= uncommitted_peer_unlink
.end());
3511 if (it
->second
== 0) {
3512 uncommitted_peer_unlink
.erase(it
);
3513 if (!in
->get_projected_parent_dn())
3514 mds
->mdcache
->remove_inode_recursive(in
);
3516 ceph_assert(it
->second
> 0);
3521 MDPeerUpdate
* MDCache::get_uncommitted_peer(metareqid_t reqid
, mds_rank_t leader
)
3524 MDPeerUpdate
* su
= nullptr;
3525 auto it
= uncommitted_peers
.find(reqid
);
3526 if (it
!= uncommitted_peers
.end() &&
3527 it
->second
.leader
== leader
) {
3533 void MDCache::finish_rollback(metareqid_t reqid
, MDRequestRef
& mdr
) {
3534 auto p
= resolve_need_rollback
.find(reqid
);
3535 ceph_assert(p
!= resolve_need_rollback
.end());
3536 if (mds
->is_resolve()) {
3537 finish_uncommitted_peer(reqid
, false);
3539 finish_uncommitted_peer(mdr
->reqid
, mdr
->more()->peer_update_journaled
);
3541 resolve_need_rollback
.erase(p
);
3542 maybe_finish_peer_resolve();
3545 void MDCache::disambiguate_other_imports()
3547 dout(10) << "disambiguate_other_imports" << dendl
;
3549 bool recovering
= !(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3550 // other nodes' ambiguous imports
3551 for (map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > >::iterator p
= other_ambiguous_imports
.begin();
3552 p
!= other_ambiguous_imports
.end();
3554 mds_rank_t who
= p
->first
;
3555 dout(10) << "ambiguous imports for mds." << who
<< dendl
;
3557 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= p
->second
.begin();
3558 q
!= p
->second
.end();
3560 dout(10) << " ambiguous import " << q
->first
<< " bounds " << q
->second
<< dendl
;
3561 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3562 CDir
*dir
= get_force_dirfrag(q
->first
, recovering
);
3565 if (dir
->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3566 dir
->authority() == CDIR_AUTH_UNDEF
) { // resolving
3567 dout(10) << " mds." << who
<< " did import " << *dir
<< dendl
;
3568 adjust_bounded_subtree_auth(dir
, q
->second
, who
);
3569 try_subtree_merge(dir
);
3571 dout(10) << " mds." << who
<< " did not import " << *dir
<< dendl
;
3575 other_ambiguous_imports
.clear();
3578 void MDCache::disambiguate_my_imports()
3580 dout(10) << "disambiguate_my_imports" << dendl
;
3582 if (!mds
->is_resolve()) {
3583 ceph_assert(my_ambiguous_imports
.empty());
3587 disambiguate_other_imports();
3589 // my ambiguous imports
3590 mds_authority_t
me_ambig(mds
->get_nodeid(), mds
->get_nodeid());
3591 while (!my_ambiguous_imports
.empty()) {
3592 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= my_ambiguous_imports
.begin();
3594 CDir
*dir
= get_dirfrag(q
->first
);
3597 if (dir
->authority() != me_ambig
) {
3598 dout(10) << "ambiguous import auth known, must not be me " << *dir
<< dendl
;
3599 cancel_ambiguous_import(dir
);
3601 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false));
3603 // subtree may have been swallowed by another node claiming dir
3605 CDir
*root
= get_subtree_root(dir
);
3607 dout(10) << " subtree root is " << *root
<< dendl
;
3608 ceph_assert(root
->dir_auth
.first
!= mds
->get_nodeid()); // no us!
3609 try_trim_non_auth_subtree(root
);
3611 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir
<< dendl
;
3612 finish_ambiguous_import(q
->first
);
3613 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3616 ceph_assert(my_ambiguous_imports
.empty());
3617 mds
->mdlog
->flush();
3619 // verify all my subtrees are unambiguous!
3620 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3621 p
!= subtrees
.end();
3623 CDir
*dir
= p
->first
;
3624 if (dir
->is_ambiguous_dir_auth()) {
3625 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir
<< dendl
;
3627 ceph_assert(!dir
->is_ambiguous_dir_auth());
3634 void MDCache::add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
)
3636 ceph_assert(my_ambiguous_imports
.count(base
) == 0);
3637 my_ambiguous_imports
[base
] = bounds
;
3641 void MDCache::add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
)
3644 vector
<dirfrag_t
> binos
;
3645 for (set
<CDir
*>::iterator p
= bounds
.begin();
3648 binos
.push_back((*p
)->dirfrag());
3650 // note: this can get called twice if the exporter fails during recovery
3651 if (my_ambiguous_imports
.count(base
->dirfrag()))
3652 my_ambiguous_imports
.erase(base
->dirfrag());
3654 add_ambiguous_import(base
->dirfrag(), binos
);
3657 void MDCache::cancel_ambiguous_import(CDir
*dir
)
3659 dirfrag_t df
= dir
->dirfrag();
3660 ceph_assert(my_ambiguous_imports
.count(df
));
3661 dout(10) << "cancel_ambiguous_import " << df
3662 << " bounds " << my_ambiguous_imports
[df
]
3665 my_ambiguous_imports
.erase(df
);
3668 void MDCache::finish_ambiguous_import(dirfrag_t df
)
3670 ceph_assert(my_ambiguous_imports
.count(df
));
3671 vector
<dirfrag_t
> bounds
;
3672 bounds
.swap(my_ambiguous_imports
[df
]);
3673 my_ambiguous_imports
.erase(df
);
3675 dout(10) << "finish_ambiguous_import " << df
3676 << " bounds " << bounds
3678 CDir
*dir
= get_dirfrag(df
);
3681 // adjust dir_auth, import maps
3682 adjust_bounded_subtree_auth(dir
, bounds
, mds
->get_nodeid());
3683 try_subtree_merge(dir
);
3686 void MDCache::remove_inode_recursive(CInode
*in
)
3688 dout(10) << "remove_inode_recursive " << *in
<< dendl
;
3689 auto&& ls
= in
->get_dirfrags();
3690 for (const auto& subdir
: ls
) {
3691 dout(10) << " removing dirfrag " << *subdir
<< dendl
;
3692 auto it
= subdir
->items
.begin();
3693 while (it
!= subdir
->items
.end()) {
3694 CDentry
*dn
= it
->second
;
3696 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3697 if (dnl
->is_primary()) {
3698 CInode
*tin
= dnl
->get_inode();
3699 subdir
->unlink_inode(dn
, false);
3700 remove_inode_recursive(tin
);
3702 subdir
->remove_dentry(dn
);
3705 if (subdir
->is_subtree_root())
3706 remove_subtree(subdir
);
3707 in
->close_dirfrag(subdir
->dirfrag().frag
);
3712 bool MDCache::expire_recursive(CInode
*in
, expiremap
&expiremap
)
3714 ceph_assert(!in
->is_auth());
3716 dout(10) << __func__
<< ":" << *in
<< dendl
;
3718 // Recurse into any dirfrags beneath this inode
3719 auto&& ls
= in
->get_dirfrags();
3720 for (const auto& subdir
: ls
) {
3721 if (!in
->is_mdsdir() && subdir
->is_subtree_root()) {
3722 dout(10) << __func__
<< ": stray still has subtree " << *in
<< dendl
;
3726 for (auto it
= subdir
->items
.begin(); it
!= subdir
->items
.end();) {
3727 CDentry
*dn
= it
->second
;
3729 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3730 if (dnl
->is_primary()) {
3731 CInode
*tin
= dnl
->get_inode();
3733 /* Remote strays with linkage (i.e. hardlinks) should not be
3734 * expired, because they may be the target of
3735 * a rename() as the owning MDS shuts down */
3736 if (!tin
->is_stray() && tin
->get_inode()->nlink
) {
3737 dout(10) << __func__
<< ": stray still has linkage " << *tin
<< dendl
;
3741 const bool abort
= expire_recursive(tin
, expiremap
);
3746 if (dn
->lru_is_expireable()) {
3747 trim_dentry(dn
, expiremap
);
3749 dout(10) << __func__
<< ": stray dn is not expireable " << *dn
<< dendl
;
3758 void MDCache::trim_unlinked_inodes()
3760 dout(7) << "trim_unlinked_inodes" << dendl
;
3763 for (auto &p
: inode_map
) {
3764 CInode
*in
= p
.second
;
3765 if (in
->get_parent_dn() == NULL
&& !in
->is_base()) {
3766 dout(7) << " will trim from " << *in
<< dendl
;
3770 if (!(++count
% mds
->heartbeat_reset_grace()))
3771 mds
->heartbeat_reset();
3773 for (auto& in
: q
) {
3774 remove_inode_recursive(in
);
3776 if (!(++count
% mds
->heartbeat_reset_grace()))
3777 mds
->heartbeat_reset();
3781 /** recalc_auth_bits()
3782 * once subtree auth is disambiguated, we need to adjust all the
3783 * auth and dirty bits in our cache before moving on.
3785 void MDCache::recalc_auth_bits(bool replay
)
3787 dout(7) << "recalc_auth_bits " << (replay
? "(replay)" : "") << dendl
;
3790 root
->inode_auth
.first
= mds
->mdsmap
->get_root();
3791 bool auth
= mds
->get_nodeid() == root
->inode_auth
.first
;
3793 root
->state_set(CInode::STATE_AUTH
);
3795 root
->state_clear(CInode::STATE_AUTH
);
3797 root
->state_set(CInode::STATE_REJOINING
);
3801 set
<CInode
*> subtree_inodes
;
3802 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3803 p
!= subtrees
.end();
3805 if (p
->first
->dir_auth
.first
== mds
->get_nodeid())
3806 subtree_inodes
.insert(p
->first
->inode
);
3809 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3810 p
!= subtrees
.end();
3812 if (p
->first
->inode
->is_mdsdir()) {
3813 CInode
*in
= p
->first
->inode
;
3814 bool auth
= in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid());
3816 in
->state_set(CInode::STATE_AUTH
);
3818 in
->state_clear(CInode::STATE_AUTH
);
3820 in
->state_set(CInode::STATE_REJOINING
);
3824 std::queue
<CDir
*> dfq
; // dirfrag queue
3827 bool auth
= p
->first
->authority().first
== mds
->get_nodeid();
3828 dout(10) << " subtree auth=" << auth
<< " for " << *p
->first
<< dendl
;
3830 while (!dfq
.empty()) {
3831 CDir
*dir
= dfq
.front();
3836 dir
->state_set(CDir::STATE_AUTH
);
3838 dir
->state_clear(CDir::STATE_AUTH
);
3840 // close empty non-auth dirfrag
3841 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0) {
3842 dir
->inode
->close_dirfrag(dir
->get_frag());
3845 dir
->state_set(CDir::STATE_REJOINING
);
3846 dir
->state_clear(CDir::STATE_COMPLETE
);
3847 if (dir
->is_dirty())
3852 // dentries in this dir
3853 for (auto &p
: dir
->items
) {
3855 CDentry
*dn
= p
.second
;
3856 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3862 dn
->state_set(CDentry::STATE_REJOINING
);
3868 if (dnl
->is_primary()) {
3870 CInode
*in
= dnl
->get_inode();
3872 in
->state_set(CInode::STATE_AUTH
);
3874 in
->state_clear(CInode::STATE_AUTH
);
3876 in
->state_set(CInode::STATE_REJOINING
);
3879 if (in
->is_dirty_parent())
3880 in
->clear_dirty_parent();
3881 // avoid touching scatterlocks for our subtree roots!
3882 if (subtree_inodes
.count(in
) == 0)
3883 in
->clear_scatter_dirty();
3888 auto&& dfv
= in
->get_nested_dirfrags();
3889 for (const auto& dir
: dfv
) {
3904 // ===========================================================================
3908 * notes on scatterlock recovery:
3910 * - recovering inode replica sends scatterlock data for any subtree
3911 * roots (the only ones that are possibly dirty).
3913 * - surviving auth incorporates any provided scatterlock data. any
3914 * pending gathers are then finished, as with the other lock types.
3916 * that takes care of surviving auth + (recovering replica)*.
3918 * - surviving replica sends strong_inode, which includes current
3919 * scatterlock state, AND any dirty scatterlock data. this
3920 * provides the recovering auth with everything it might need.
3922 * - recovering auth must pick initial scatterlock state based on
3923 * (weak|strong) rejoins.
3924 * - always assimilate scatterlock data (it can't hurt)
3925 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3926 * - include base inode in ack for all inodes that saw scatterlock content
3928 * also, for scatter gather,
3930 * - auth increments {frag,r}stat.version on completion of any gather.
3932 * - auth incorporates changes in a gather _only_ if the version
3935 * - replica discards changes any time the scatterlock syncs, and
3939 void MDCache::dump_rejoin_status(Formatter
*f
) const
3941 f
->open_object_section("rejoin_status");
3942 f
->dump_stream("rejoin_gather") << rejoin_gather
;
3943 f
->dump_stream("rejoin_ack_gather") << rejoin_ack_gather
;
3944 f
->dump_unsigned("num_opening_inodes", cap_imports_num_opening
);
3948 void MDCache::rejoin_start(MDSContext
*rejoin_done_
)
3950 dout(10) << "rejoin_start" << dendl
;
3951 ceph_assert(!rejoin_done
);
3952 rejoin_done
.reset(rejoin_done_
);
3954 rejoin_gather
= recovery_set
;
3955 // need finish opening cap inodes before sending cache rejoins
3956 rejoin_gather
.insert(mds
->get_nodeid());
3957 process_imported_caps();
3963 * this initiates rejoin. it should be called before we get any
3964 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3966 * we start out by sending rejoins to everyone in the recovery set.
3968 * if we are rejoin, send for all regions in our cache.
3969 * if we are active|stopping, send only to nodes that are rejoining.
3971 void MDCache::rejoin_send_rejoins()
3973 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set
<< dendl
;
3975 if (rejoin_gather
.count(mds
->get_nodeid())) {
3976 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl
;
3977 rejoins_pending
= true;
3980 if (!resolve_gather
.empty()) {
3981 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3982 << resolve_gather
<< ")" << dendl
;
3983 rejoins_pending
= true;
3987 ceph_assert(!migrator
->is_importing());
3988 ceph_assert(!migrator
->is_exporting());
3990 if (!mds
->is_rejoin()) {
3991 disambiguate_other_imports();
3994 map
<mds_rank_t
, ref_t
<MMDSCacheRejoin
>> rejoins
;
3997 // if i am rejoining, send a rejoin to everyone.
3998 // otherwise, just send to others who are rejoining.
3999 for (const auto& rank
: recovery_set
) {
4000 if (rank
== mds
->get_nodeid()) continue; // nothing to myself!
4001 if (rejoin_sent
.count(rank
)) continue; // already sent a rejoin to this node!
4002 if (mds
->is_rejoin())
4003 rejoins
[rank
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_WEAK
);
4004 else if (mds
->mdsmap
->is_rejoin(rank
))
4005 rejoins
[rank
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_STRONG
);
4008 if (mds
->is_rejoin()) {
4009 map
<client_t
, pair
<Session
*, set
<mds_rank_t
> > > client_exports
;
4010 for (auto& p
: cap_exports
) {
4011 mds_rank_t target
= p
.second
.first
;
4012 if (rejoins
.count(target
) == 0)
4014 for (auto q
= p
.second
.second
.begin(); q
!= p
.second
.second
.end(); ) {
4015 Session
*session
= nullptr;
4016 auto it
= client_exports
.find(q
->first
);
4017 if (it
!= client_exports
.end()) {
4018 session
= it
->second
.first
;
4020 it
->second
.second
.insert(target
);
4022 session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
4023 auto& r
= client_exports
[q
->first
];
4026 r
.second
.insert(target
);
4031 // remove reconnect with no session
4032 p
.second
.second
.erase(q
++);
4035 rejoins
[target
]->cap_exports
[p
.first
] = p
.second
.second
;
4037 for (auto& p
: client_exports
) {
4038 Session
*session
= p
.second
.first
;
4039 for (auto& q
: p
.second
.second
) {
4040 auto rejoin
= rejoins
[q
];
4041 rejoin
->client_map
[p
.first
] = session
->info
.inst
;
4042 rejoin
->client_metadata_map
[p
.first
] = session
->info
.client_metadata
;
4048 // check all subtrees
4049 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
4050 p
!= subtrees
.end();
4052 CDir
*dir
= p
->first
;
4053 ceph_assert(dir
->is_subtree_root());
4054 if (dir
->is_ambiguous_dir_auth()) {
4055 // exporter is recovering, importer is survivor.
4056 ceph_assert(rejoins
.count(dir
->authority().first
));
4057 ceph_assert(!rejoins
.count(dir
->authority().second
));
4063 continue; // skip my own regions!
4065 mds_rank_t auth
= dir
->get_dir_auth().first
;
4066 ceph_assert(auth
>= 0);
4067 if (rejoins
.count(auth
) == 0)
4068 continue; // don't care about this node's subtrees
4070 rejoin_walk(dir
, rejoins
[auth
]);
4073 // rejoin root inodes, too
4074 for (auto &p
: rejoins
) {
4075 if (mds
->is_rejoin()) {
4077 if (p
.first
== 0 && root
) {
4078 p
.second
->add_weak_inode(root
->vino());
4079 if (root
->is_dirty_scattered()) {
4080 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4081 p
.second
->add_scatterlock_state(root
);
4084 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4086 p
.second
->add_weak_inode(in
->vino());
4090 if (p
.first
== 0 && root
) {
4091 p
.second
->add_strong_inode(root
->vino(),
4092 root
->get_replica_nonce(),
4093 root
->get_caps_wanted(),
4094 root
->filelock
.get_state(),
4095 root
->nestlock
.get_state(),
4096 root
->dirfragtreelock
.get_state());
4097 root
->state_set(CInode::STATE_REJOINING
);
4098 if (root
->is_dirty_scattered()) {
4099 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4100 p
.second
->add_scatterlock_state(root
);
4104 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4105 p
.second
->add_strong_inode(in
->vino(),
4106 in
->get_replica_nonce(),
4107 in
->get_caps_wanted(),
4108 in
->filelock
.get_state(),
4109 in
->nestlock
.get_state(),
4110 in
->dirfragtreelock
.get_state());
4111 in
->state_set(CInode::STATE_REJOINING
);
4116 if (!mds
->is_rejoin()) {
4117 // i am survivor. send strong rejoin.
4118 // note request remote_auth_pins, xlocks
4119 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
4120 p
!= active_requests
.end();
4122 MDRequestRef
& mdr
= p
->second
;
4126 for (const auto& q
: mdr
->object_states
) {
4127 if (q
.second
.remote_auth_pinned
== MDS_RANK_NONE
)
4129 if (!q
.first
->is_auth()) {
4130 mds_rank_t target
= q
.second
.remote_auth_pinned
;
4131 ceph_assert(target
== q
.first
->authority().first
);
4132 if (rejoins
.count(target
) == 0) continue;
4133 const auto& rejoin
= rejoins
[target
];
4135 dout(15) << " " << *mdr
<< " authpin on " << *q
.first
<< dendl
;
4136 MDSCacheObjectInfo i
;
4137 q
.first
->set_object_info(i
);
4139 rejoin
->add_inode_authpin(vinodeno_t(i
.ino
, i
.snapid
), mdr
->reqid
, mdr
->attempt
);
4141 rejoin
->add_dentry_authpin(i
.dirfrag
, i
.dname
, i
.snapid
, mdr
->reqid
, mdr
->attempt
);
4143 if (mdr
->has_more() && mdr
->more()->is_remote_frozen_authpin
&&
4144 mdr
->more()->rename_inode
== q
.first
)
4145 rejoin
->add_inode_frozen_authpin(vinodeno_t(i
.ino
, i
.snapid
),
4146 mdr
->reqid
, mdr
->attempt
);
4150 for (const auto& q
: mdr
->locks
) {
4152 auto obj
= lock
->get_parent();
4153 if (q
.is_xlock() && !obj
->is_auth()) {
4154 mds_rank_t who
= obj
->authority().first
;
4155 if (rejoins
.count(who
) == 0) continue;
4156 const auto& rejoin
= rejoins
[who
];
4158 dout(15) << " " << *mdr
<< " xlock on " << *lock
<< " " << *obj
<< dendl
;
4159 MDSCacheObjectInfo i
;
4160 obj
->set_object_info(i
);
4162 rejoin
->add_inode_xlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4163 mdr
->reqid
, mdr
->attempt
);
4165 rejoin
->add_dentry_xlock(i
.dirfrag
, i
.dname
, i
.snapid
,
4166 mdr
->reqid
, mdr
->attempt
);
4167 } else if (q
.is_remote_wrlock()) {
4168 mds_rank_t who
= q
.wrlock_target
;
4169 if (rejoins
.count(who
) == 0) continue;
4170 const auto& rejoin
= rejoins
[who
];
4172 dout(15) << " " << *mdr
<< " wrlock on " << *lock
<< " " << *obj
<< dendl
;
4173 MDSCacheObjectInfo i
;
4174 obj
->set_object_info(i
);
4176 rejoin
->add_inode_wrlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4177 mdr
->reqid
, mdr
->attempt
);
4183 // send the messages
4184 for (auto &p
: rejoins
) {
4185 ceph_assert(rejoin_sent
.count(p
.first
) == 0);
4186 ceph_assert(rejoin_ack_gather
.count(p
.first
) == 0);
4187 rejoin_sent
.insert(p
.first
);
4188 rejoin_ack_gather
.insert(p
.first
);
4189 mds
->send_message_mds(p
.second
, p
.first
);
4191 rejoin_ack_gather
.insert(mds
->get_nodeid()); // we need to complete rejoin_gather_finish, too
4192 rejoins_pending
= false;
4195 if (mds
->is_rejoin() && rejoin_gather
.empty()) {
4196 dout(10) << "nothing to rejoin" << dendl
;
4197 rejoin_gather_finish();
4203 * rejoin_walk - build rejoin declarations for a subtree
4205 * @param dir subtree root
4206 * @param rejoin rejoin message
4208 * from a rejoining node:
4210 * weak dentries (w/ connectivity)
4212 * from a surviving node:
4214 * strong dentries (no connectivity!)
4217 void MDCache::rejoin_walk(CDir
*dir
, const ref_t
<MMDSCacheRejoin
> &rejoin
)
4219 dout(10) << "rejoin_walk " << *dir
<< dendl
;
4221 std::vector
<CDir
*> nested
; // finish this dir, then do nested items
4223 if (mds
->is_rejoin()) {
4225 rejoin
->add_weak_dirfrag(dir
->dirfrag());
4226 for (auto &p
: dir
->items
) {
4227 CDentry
*dn
= p
.second
;
4228 ceph_assert(dn
->last
== CEPH_NOSNAP
);
4229 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4230 dout(15) << " add_weak_primary_dentry " << *dn
<< dendl
;
4231 ceph_assert(dnl
->is_primary());
4232 CInode
*in
= dnl
->get_inode();
4233 ceph_assert(dnl
->get_inode()->is_dir());
4234 rejoin
->add_weak_primary_dentry(dir
->ino(), dn
->get_name(), dn
->first
, dn
->last
, in
->ino());
4236 auto&& dirs
= in
->get_nested_dirfrags();
4237 nested
.insert(std::end(nested
), std::begin(dirs
), std::end(dirs
));
4239 if (in
->is_dirty_scattered()) {
4240 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4241 rejoin
->add_scatterlock_state(in
);
4246 dout(15) << " add_strong_dirfrag " << *dir
<< dendl
;
4247 rejoin
->add_strong_dirfrag(dir
->dirfrag(), dir
->get_replica_nonce(), dir
->get_dir_rep());
4248 dir
->state_set(CDir::STATE_REJOINING
);
4250 for (auto it
= dir
->items
.begin(); it
!= dir
->items
.end(); ) {
4251 CDentry
*dn
= it
->second
;
4253 dn
->state_set(CDentry::STATE_REJOINING
);
4254 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4255 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
4257 // trim snap dentries. because they may have been pruned by
4258 // their auth mds (snap deleted)
4259 if (dn
->last
!= CEPH_NOSNAP
) {
4260 if (in
&& !in
->remote_parents
.empty()) {
4261 // unlink any stale remote snap dentry.
4262 for (auto it2
= in
->remote_parents
.begin(); it2
!= in
->remote_parents
.end(); ) {
4263 CDentry
*remote_dn
= *it2
;
4265 ceph_assert(remote_dn
->last
!= CEPH_NOSNAP
);
4266 remote_dn
->unlink_remote(remote_dn
->get_linkage());
4269 if (dn
->lru_is_expireable()) {
4270 if (!dnl
->is_null())
4271 dir
->unlink_inode(dn
, false);
4274 dir
->remove_dentry(dn
);
4277 // Inventing null/remote dentry shouldn't cause problem
4278 ceph_assert(!dnl
->is_primary());
4282 dout(15) << " add_strong_dentry " << *dn
<< dendl
;
4283 rejoin
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->get_alternate_name(),
4284 dn
->first
, dn
->last
,
4285 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
4286 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
4287 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
4288 dn
->get_replica_nonce(),
4289 dn
->lock
.get_state());
4290 dn
->state_set(CDentry::STATE_REJOINING
);
4291 if (dnl
->is_primary()) {
4292 CInode
*in
= dnl
->get_inode();
4293 dout(15) << " add_strong_inode " << *in
<< dendl
;
4294 rejoin
->add_strong_inode(in
->vino(),
4295 in
->get_replica_nonce(),
4296 in
->get_caps_wanted(),
4297 in
->filelock
.get_state(),
4298 in
->nestlock
.get_state(),
4299 in
->dirfragtreelock
.get_state());
4300 in
->state_set(CInode::STATE_REJOINING
);
4302 auto&& dirs
= in
->get_nested_dirfrags();
4303 nested
.insert(std::end(nested
), std::begin(dirs
), std::end(dirs
));
4305 if (in
->is_dirty_scattered()) {
4306 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4307 rejoin
->add_scatterlock_state(in
);
4313 // recurse into nested dirs
4314 for (const auto& dir
: nested
) {
4315 rejoin_walk(dir
, rejoin
);
4322 * - reply with the lockstate
4324 * if i am active|stopping,
4325 * - remove source from replica list for everything not referenced here.
4327 void MDCache::handle_cache_rejoin(const cref_t
<MMDSCacheRejoin
> &m
)
4329 dout(7) << "handle_cache_rejoin " << *m
<< " from " << m
->get_source()
4330 << " (" << m
->get_payload().length() << " bytes)"
4334 case MMDSCacheRejoin::OP_WEAK
:
4335 handle_cache_rejoin_weak(m
);
4337 case MMDSCacheRejoin::OP_STRONG
:
4338 handle_cache_rejoin_strong(m
);
4340 case MMDSCacheRejoin::OP_ACK
:
4341 handle_cache_rejoin_ack(m
);
4351 * handle_cache_rejoin_weak
4354 * - is recovering from their journal.
4355 * - may have incorrect (out of date) inode contents
4356 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4358 * if the sender didn't trim_non_auth(), they
4359 * - may have incorrect (out of date) dentry/inode linkage
4360 * - may have deleted/purged inodes
4361 * and i may have to go to disk to get accurate inode contents. yuck.
4363 void MDCache::handle_cache_rejoin_weak(const cref_t
<MMDSCacheRejoin
> &weak
)
4365 mds_rank_t from
= mds_rank_t(weak
->get_source().num());
4367 // possible response(s)
4368 ref_t
<MMDSCacheRejoin
> ack
; // if survivor
4369 set
<vinodeno_t
> acked_inodes
; // if survivor
4370 set
<SimpleLock
*> gather_locks
; // if survivor
4371 bool survivor
= false; // am i a survivor?
4373 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
4375 dout(10) << "i am a surivivor, and will ack immediately" << dendl
;
4376 ack
= make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_ACK
);
4378 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
4380 // check cap exports
4381 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4382 CInode
*in
= get_inode(p
->first
);
4383 ceph_assert(!in
|| in
->is_auth());
4384 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4385 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< " on " << *in
<< dendl
;
4386 Capability
*cap
= rejoin_import_cap(in
, q
->first
, q
->second
, from
);
4387 Capability::Import
& im
= imported_caps
[p
->first
][q
->first
];
4389 im
.cap_id
= cap
->get_cap_id();
4390 im
.issue_seq
= cap
->get_last_seq();
4391 im
.mseq
= cap
->get_mseq();
4396 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
4399 encode(imported_caps
, ack
->imported_caps
);
4401 ceph_assert(mds
->is_rejoin());
4403 // we may have already received a strong rejoin from the sender.
4404 rejoin_scour_survivor_replicas(from
, NULL
, acked_inodes
, gather_locks
);
4405 ceph_assert(gather_locks
.empty());
4407 // check cap exports.
4408 rejoin_client_map
.insert(weak
->client_map
.begin(), weak
->client_map
.end());
4409 rejoin_client_metadata_map
.insert(weak
->client_metadata_map
.begin(),
4410 weak
->client_metadata_map
.end());
4412 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4413 CInode
*in
= get_inode(p
->first
);
4414 ceph_assert(!in
|| in
->is_auth());
4416 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4417 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< dendl
;
4418 cap_imports
[p
->first
][q
->first
][from
] = q
->second
;
4423 // assimilate any potentially dirty scatterlock state
4424 for (const auto &p
: weak
->inode_scatterlocks
) {
4425 CInode
*in
= get_inode(p
.first
);
4427 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4428 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4429 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4431 rejoin_potential_updated_scatterlocks
.insert(in
);
4434 // recovering peer may send incorrect dirfrags here. we need to
4435 // infer which dirfrag they meant. the ack will include a
4436 // strong_dirfrag that will set them straight on the fragmentation.
4439 set
<CDir
*> dirs_to_share
;
4440 for (const auto &p
: weak
->weak_dirfrags
) {
4441 CInode
*diri
= get_inode(p
.ino
);
4443 dout(0) << " missing dir ino " << p
.ino
<< dendl
;
4447 if (diri
->dirfragtree
.is_leaf(p
.frag
)) {
4448 leaves
.push_back(p
.frag
);
4450 diri
->dirfragtree
.get_leaves_under(p
.frag
, leaves
);
4452 leaves
.push_back(diri
->dirfragtree
[p
.frag
.value()]);
4454 for (const auto& leaf
: leaves
) {
4455 CDir
*dir
= diri
->get_dirfrag(leaf
);
4457 dout(0) << " missing dir for " << p
.frag
<< " (which maps to " << leaf
<< ") on " << *diri
<< dendl
;
4461 if (dirs_to_share
.count(dir
)) {
4462 dout(10) << " already have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4464 dirs_to_share
.insert(dir
);
4465 unsigned nonce
= dir
->add_replica(from
);
4466 dout(10) << " have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4468 ack
->add_strong_dirfrag(dir
->dirfrag(), nonce
, dir
->dir_rep
);
4469 ack
->add_dirfrag_base(dir
);
4475 for (const auto &p
: weak
->weak
) {
4476 CInode
*diri
= get_inode(p
.first
);
4478 dout(0) << " missing dir ino " << p
.first
<< dendl
;
4483 for (const auto &q
: p
.second
) {
4484 // locate proper dirfrag.
4485 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4486 frag_t fg
= diri
->pick_dirfrag(q
.first
.name
);
4487 if (!dir
|| dir
->get_frag() != fg
) {
4488 dir
= diri
->get_dirfrag(fg
);
4490 dout(0) << " missing dir frag " << fg
<< " on " << *diri
<< dendl
;
4492 ceph_assert(dirs_to_share
.count(dir
));
4496 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
4498 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4499 ceph_assert(dnl
->is_primary());
4501 if (survivor
&& dn
->is_replica(from
))
4502 dentry_remove_replica(dn
, from
, gather_locks
);
4503 unsigned dnonce
= dn
->add_replica(from
);
4504 dout(10) << " have " << *dn
<< dendl
;
4506 ack
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->get_alternate_name(),
4507 dn
->first
, dn
->last
,
4508 dnl
->get_inode()->ino(), inodeno_t(0), 0,
4509 dnonce
, dn
->lock
.get_replica_state());
4512 CInode
*in
= dnl
->get_inode();
4515 if (survivor
&& in
->is_replica(from
))
4516 inode_remove_replica(in
, from
, true, gather_locks
);
4517 unsigned inonce
= in
->add_replica(from
);
4518 dout(10) << " have " << *in
<< dendl
;
4520 // scatter the dirlock, just in case?
4521 if (!survivor
&& in
->is_dir() && in
->has_subtree_root_dirfrag())
4522 in
->filelock
.set_state(LOCK_MIX
);
4525 acked_inodes
.insert(in
->vino());
4526 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4528 in
->_encode_locks_state_for_rejoin(bl
, from
);
4529 ack
->add_inode_locks(in
, inonce
, bl
);
4534 // weak base inodes? (root, stray, etc.)
4535 for (set
<vinodeno_t
>::iterator p
= weak
->weak_inodes
.begin();
4536 p
!= weak
->weak_inodes
.end();
4538 CInode
*in
= get_inode(*p
);
4539 ceph_assert(in
); // hmm fixme wrt stray?
4540 if (survivor
&& in
->is_replica(from
))
4541 inode_remove_replica(in
, from
, true, gather_locks
);
4542 unsigned inonce
= in
->add_replica(from
);
4543 dout(10) << " have base " << *in
<< dendl
;
4546 acked_inodes
.insert(in
->vino());
4547 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4549 in
->_encode_locks_state_for_rejoin(bl
, from
);
4550 ack
->add_inode_locks(in
, inonce
, bl
);
4554 ceph_assert(rejoin_gather
.count(from
));
4555 rejoin_gather
.erase(from
);
4557 // survivor. do everything now.
4558 for (const auto &p
: weak
->inode_scatterlocks
) {
4559 CInode
*in
= get_inode(p
.first
);
4561 dout(10) << " including base inode (due to potential scatterlock update) " << *in
<< dendl
;
4562 acked_inodes
.insert(in
->vino());
4563 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4566 rejoin_scour_survivor_replicas(from
, ack
, acked_inodes
, gather_locks
);
4567 mds
->send_message(ack
, weak
->get_connection());
4569 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
4570 if (!(*p
)->is_stable())
4571 mds
->locker
->eval_gather(*p
);
4575 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4576 rejoin_gather_finish();
4578 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4584 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4586 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4587 * ack, the replica dne, and we can remove it from our replica maps.
4589 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from
, const cref_t
<MMDSCacheRejoin
> &ack
,
4590 set
<vinodeno_t
>& acked_inodes
,
4591 set
<SimpleLock
*>& gather_locks
)
4593 dout(10) << "rejoin_scour_survivor_replicas from mds." << from
<< dendl
;
4595 auto scour_func
= [this, from
, ack
, &acked_inodes
, &gather_locks
] (CInode
*in
) {
4597 if (in
->is_auth() &&
4598 in
->is_replica(from
) &&
4599 (ack
== NULL
|| acked_inodes
.count(in
->vino()) == 0)) {
4600 inode_remove_replica(in
, from
, false, gather_locks
);
4601 dout(10) << " rem " << *in
<< dendl
;
4607 const auto&& dfs
= in
->get_dirfrags();
4608 for (const auto& dir
: dfs
) {
4609 if (!dir
->is_auth())
4612 if (dir
->is_replica(from
) &&
4613 (ack
== NULL
|| ack
->strong_dirfrags
.count(dir
->dirfrag()) == 0)) {
4614 dir
->remove_replica(from
);
4615 dout(10) << " rem " << *dir
<< dendl
;
4619 for (auto &p
: dir
->items
) {
4620 CDentry
*dn
= p
.second
;
4622 if (dn
->is_replica(from
)) {
4624 const auto it
= ack
->strong_dentries
.find(dir
->dirfrag());
4625 if (it
!= ack
->strong_dentries
.end() && it
->second
.count(string_snap_t(dn
->get_name(), dn
->last
)) > 0) {
4629 dentry_remove_replica(dn
, from
, gather_locks
);
4630 dout(10) << " rem " << *dn
<< dendl
;
4636 for (auto &p
: inode_map
)
4637 scour_func(p
.second
);
4638 for (auto &p
: snap_inode_map
)
4639 scour_func(p
.second
);
4643 CInode
*MDCache::rejoin_invent_inode(inodeno_t ino
, snapid_t last
)
4645 CInode
*in
= new CInode(this, true, 2, last
);
4646 in
->_get_inode()->ino
= ino
;
4647 in
->state_set(CInode::STATE_REJOINUNDEF
);
4649 rejoin_undef_inodes
.insert(in
);
4650 dout(10) << " invented " << *in
<< dendl
;
4654 CDir
*MDCache::rejoin_invent_dirfrag(dirfrag_t df
)
4656 CInode
*in
= get_inode(df
.ino
);
4658 in
= rejoin_invent_inode(df
.ino
, CEPH_NOSNAP
);
4659 if (!in
->is_dir()) {
4660 ceph_assert(in
->state_test(CInode::STATE_REJOINUNDEF
));
4661 in
->_get_inode()->mode
= S_IFDIR
;
4662 in
->_get_inode()->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4664 CDir
*dir
= in
->get_or_open_dirfrag(this, df
.frag
);
4665 dir
->state_set(CDir::STATE_REJOINUNDEF
);
4666 rejoin_undef_dirfrags
.insert(dir
);
4667 dout(10) << " invented " << *dir
<< dendl
;
4671 void MDCache::handle_cache_rejoin_strong(const cref_t
<MMDSCacheRejoin
> &strong
)
4673 mds_rank_t from
= mds_rank_t(strong
->get_source().num());
4675 // only a recovering node will get a strong rejoin.
4676 if (!mds
->is_rejoin()) {
4677 if (mds
->get_want_state() == MDSMap::STATE_REJOIN
) {
4678 mds
->wait_for_rejoin(new C_MDS_RetryMessage(mds
, strong
));
4681 ceph_abort_msg("got unexpected rejoin message during recovery");
4684 // assimilate any potentially dirty scatterlock state
4685 for (const auto &p
: strong
->inode_scatterlocks
) {
4686 CInode
*in
= get_inode(p
.first
);
4688 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4689 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4690 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4691 rejoin_potential_updated_scatterlocks
.insert(in
);
4694 rejoin_unlinked_inodes
[from
].clear();
4696 // surviving peer may send incorrect dirfrag here (maybe they didn't
4697 // get the fragment notify, or maybe we rolled back?). we need to
4698 // infer the right frag and get them with the program. somehow.
4699 // we don't normally send ACK.. so we'll need to bundle this with
4700 // MISSING or something.
4702 // strong dirfrags/dentries.
4703 // also process auth_pins, xlocks.
4704 for (const auto &p
: strong
->strong_dirfrags
) {
4705 auto& dirfrag
= p
.first
;
4706 CInode
*diri
= get_inode(dirfrag
.ino
);
4708 diri
= rejoin_invent_inode(dirfrag
.ino
, CEPH_NOSNAP
);
4709 CDir
*dir
= diri
->get_dirfrag(dirfrag
.frag
);
4710 bool refragged
= false;
4712 dout(10) << " have " << *dir
<< dendl
;
4714 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
4715 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), frag_t()));
4716 else if (diri
->dirfragtree
.is_leaf(dirfrag
.frag
))
4717 dir
= rejoin_invent_dirfrag(dirfrag
);
4720 dir
->add_replica(from
, p
.second
.nonce
);
4721 dir
->dir_rep
= p
.second
.dir_rep
;
4723 dout(10) << " frag " << dirfrag
<< " doesn't match dirfragtree " << *diri
<< dendl
;
4725 diri
->dirfragtree
.get_leaves_under(dirfrag
.frag
, leaves
);
4727 leaves
.push_back(diri
->dirfragtree
[dirfrag
.frag
.value()]);
4728 dout(10) << " maps to frag(s) " << leaves
<< dendl
;
4729 for (const auto& leaf
: leaves
) {
4730 CDir
*dir
= diri
->get_dirfrag(leaf
);
4732 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), leaf
));
4734 dout(10) << " have(approx) " << *dir
<< dendl
;
4735 dir
->add_replica(from
, p
.second
.nonce
);
4736 dir
->dir_rep
= p
.second
.dir_rep
;
4741 const auto it
= strong
->strong_dentries
.find(dirfrag
);
4742 if (it
!= strong
->strong_dentries
.end()) {
4743 const auto& dmap
= it
->second
;
4744 for (const auto &q
: dmap
) {
4745 const string_snap_t
& ss
= q
.first
;
4746 const MMDSCacheRejoin::dn_strong
& d
= q
.second
;
4749 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4751 frag_t fg
= diri
->pick_dirfrag(ss
.name
);
4752 dir
= diri
->get_dirfrag(fg
);
4754 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4757 if (d
.is_remote()) {
4758 dn
= dir
->add_remote_dentry(ss
.name
, d
.remote_ino
, d
.remote_d_type
, mempool::mds_co::string(d
.alternate_name
), d
.first
, ss
.snapid
);
4759 } else if (d
.is_null()) {
4760 dn
= dir
->add_null_dentry(ss
.name
, d
.first
, ss
.snapid
);
4762 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4763 if (!in
) in
= rejoin_invent_inode(d
.ino
, ss
.snapid
);
4764 dn
= dir
->add_primary_dentry(ss
.name
, in
, mempool::mds_co::string(d
.alternate_name
), d
.first
, ss
.snapid
);
4766 dout(10) << " invented " << *dn
<< dendl
;
4768 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4771 const auto pinned_it
= strong
->authpinned_dentries
.find(dirfrag
);
4772 if (pinned_it
!= strong
->authpinned_dentries
.end()) {
4773 const auto peer_reqid_it
= pinned_it
->second
.find(ss
);
4774 if (peer_reqid_it
!= pinned_it
->second
.end()) {
4775 for (const auto &r
: peer_reqid_it
->second
) {
4776 dout(10) << " dn authpin by " << r
<< " on " << *dn
<< dendl
;
4778 // get/create peer mdrequest
4780 if (have_request(r
.reqid
))
4781 mdr
= request_get(r
.reqid
);
4783 mdr
= request_start_peer(r
.reqid
, r
.attempt
, strong
);
4790 const auto xlocked_it
= strong
->xlocked_dentries
.find(dirfrag
);
4791 if (xlocked_it
!= strong
->xlocked_dentries
.end()) {
4792 const auto ss_req_it
= xlocked_it
->second
.find(ss
);
4793 if (ss_req_it
!= xlocked_it
->second
.end()) {
4794 const MMDSCacheRejoin::peer_reqid
& r
= ss_req_it
->second
;
4795 dout(10) << " dn xlock by " << r
<< " on " << *dn
<< dendl
;
4796 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4797 ceph_assert(mdr
->is_auth_pinned(dn
));
4798 if (!mdr
->is_xlocked(&dn
->versionlock
)) {
4799 ceph_assert(dn
->versionlock
.can_xlock_local());
4800 dn
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4801 mdr
->emplace_lock(&dn
->versionlock
, MutationImpl::LockOp::XLOCK
);
4803 if (dn
->lock
.is_stable())
4804 dn
->auth_pin(&dn
->lock
);
4805 dn
->lock
.set_state(LOCK_XLOCK
);
4806 dn
->lock
.get_xlock(mdr
, mdr
->get_client());
4807 mdr
->emplace_lock(&dn
->lock
, MutationImpl::LockOp::XLOCK
);
4811 dn
->add_replica(from
, d
.nonce
);
4812 dout(10) << " have " << *dn
<< dendl
;
4814 if (dnl
->is_primary()) {
4815 if (d
.is_primary()) {
4816 if (vinodeno_t(d
.ino
, ss
.snapid
) != dnl
->get_inode()->vino()) {
4817 // the survivor missed MDentryUnlink+MDentryLink messages ?
4818 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4819 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4821 ceph_assert(in
->get_parent_dn());
4822 rejoin_unlinked_inodes
[from
].insert(in
);
4823 dout(7) << " sender has primary dentry but wrong inode" << dendl
;
4826 // the survivor missed MDentryLink message ?
4827 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4828 dout(7) << " sender doesn't have primay dentry" << dendl
;
4831 if (d
.is_primary()) {
4832 // the survivor missed MDentryUnlink message ?
4833 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4835 ceph_assert(in
->get_parent_dn());
4836 rejoin_unlinked_inodes
[from
].insert(in
);
4837 dout(7) << " sender has primary dentry but we don't" << dendl
;
4844 for (const auto &p
: strong
->strong_inodes
) {
4845 CInode
*in
= get_inode(p
.first
);
4847 in
->add_replica(from
, p
.second
.nonce
);
4848 dout(10) << " have " << *in
<< dendl
;
4850 const MMDSCacheRejoin::inode_strong
& is
= p
.second
;
4853 if (is
.caps_wanted
) {
4854 in
->set_mds_caps_wanted(from
, is
.caps_wanted
);
4855 dout(15) << " inode caps_wanted " << ccap_string(is
.caps_wanted
)
4856 << " on " << *in
<< dendl
;
4860 // infer state from replica state:
4861 // * go to MIX if they might have wrlocks
4862 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4863 in
->filelock
.infer_state_from_strong_rejoin(is
.filelock
, !in
->is_dir()); // maybe also go to LOCK
4864 in
->nestlock
.infer_state_from_strong_rejoin(is
.nestlock
, false);
4865 in
->dirfragtreelock
.infer_state_from_strong_rejoin(is
.dftlock
, false);
4868 const auto authpinned_inodes_it
= strong
->authpinned_inodes
.find(in
->vino());
4869 if (authpinned_inodes_it
!= strong
->authpinned_inodes
.end()) {
4870 for (const auto& r
: authpinned_inodes_it
->second
) {
4871 dout(10) << " inode authpin by " << r
<< " on " << *in
<< dendl
;
4873 // get/create peer mdrequest
4875 if (have_request(r
.reqid
))
4876 mdr
= request_get(r
.reqid
);
4878 mdr
= request_start_peer(r
.reqid
, r
.attempt
, strong
);
4879 if (strong
->frozen_authpin_inodes
.count(in
->vino())) {
4880 ceph_assert(!in
->get_num_auth_pins());
4881 mdr
->freeze_auth_pin(in
);
4883 ceph_assert(!in
->is_frozen_auth_pin());
4889 const auto xlocked_inodes_it
= strong
->xlocked_inodes
.find(in
->vino());
4890 if (xlocked_inodes_it
!= strong
->xlocked_inodes
.end()) {
4891 for (const auto &q
: xlocked_inodes_it
->second
) {
4892 SimpleLock
*lock
= in
->get_lock(q
.first
);
4893 dout(10) << " inode xlock by " << q
.second
<< " on " << *lock
<< " on " << *in
<< dendl
;
4894 MDRequestRef mdr
= request_get(q
.second
.reqid
); // should have this from auth_pin above.
4895 ceph_assert(mdr
->is_auth_pinned(in
));
4896 if (!mdr
->is_xlocked(&in
->versionlock
)) {
4897 ceph_assert(in
->versionlock
.can_xlock_local());
4898 in
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4899 mdr
->emplace_lock(&in
->versionlock
, MutationImpl::LockOp::XLOCK
);
4901 if (lock
->is_stable())
4903 lock
->set_state(LOCK_XLOCK
);
4904 if (lock
== &in
->filelock
)
4906 lock
->get_xlock(mdr
, mdr
->get_client());
4907 mdr
->emplace_lock(lock
, MutationImpl::LockOp::XLOCK
);
4912 for (const auto &p
: strong
->wrlocked_inodes
) {
4913 CInode
*in
= get_inode(p
.first
);
4914 for (const auto &q
: p
.second
) {
4915 SimpleLock
*lock
= in
->get_lock(q
.first
);
4916 for (const auto &r
: q
.second
) {
4917 dout(10) << " inode wrlock by " << r
<< " on " << *lock
<< " on " << *in
<< dendl
;
4918 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4920 ceph_assert(mdr
->is_auth_pinned(in
));
4921 lock
->set_state(LOCK_MIX
);
4922 if (lock
== &in
->filelock
)
4924 lock
->get_wrlock(true);
4925 mdr
->emplace_lock(lock
, MutationImpl::LockOp::WRLOCK
);
4931 ceph_assert(rejoin_gather
.count(from
));
4932 rejoin_gather
.erase(from
);
4933 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4934 rejoin_gather_finish();
4936 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4940 void MDCache::handle_cache_rejoin_ack(const cref_t
<MMDSCacheRejoin
> &ack
)
4942 dout(7) << "handle_cache_rejoin_ack from " << ack
->get_source() << dendl
;
4943 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
4945 ceph_assert(mds
->get_state() >= MDSMap::STATE_REJOIN
);
4946 bool survivor
= !mds
->is_rejoin();
4948 // for sending cache expire message
4949 set
<CInode
*> isolated_inodes
;
4950 set
<CInode
*> refragged_inodes
;
4951 list
<pair
<CInode
*,int> > updated_realms
;
4954 for (const auto &p
: ack
->strong_dirfrags
) {
4955 // we may have had incorrect dir fragmentation; refragment based
4956 // on what they auth tells us.
4957 CDir
*dir
= get_dirfrag(p
.first
);
4959 dir
= get_force_dirfrag(p
.first
, false);
4961 refragged_inodes
.insert(dir
->get_inode());
4964 CInode
*diri
= get_inode(p
.first
.ino
);
4966 // barebones inode; the full inode loop below will clean up.
4967 diri
= new CInode(this, false);
4968 auto _inode
= diri
->_get_inode();
4969 _inode
->ino
= p
.first
.ino
;
4970 _inode
->mode
= S_IFDIR
;
4971 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4974 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
) {
4975 diri
->inode_auth
= mds_authority_t(from
, CDIR_AUTH_UNKNOWN
);
4976 dout(10) << " add inode " << *diri
<< dendl
;
4978 diri
->inode_auth
= CDIR_AUTH_DEFAULT
;
4979 isolated_inodes
.insert(diri
);
4980 dout(10) << " unconnected dirfrag " << p
.first
<< dendl
;
4983 // barebones dirfrag; the full dirfrag loop below will clean up.
4984 dir
= diri
->add_dirfrag(new CDir(diri
, p
.first
.frag
, this, false));
4985 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
||
4986 (dir
->authority() != CDIR_AUTH_UNDEF
&&
4987 dir
->authority().first
!= from
))
4988 adjust_subtree_auth(dir
, from
);
4989 dout(10) << " add dirfrag " << *dir
<< dendl
;
4992 dir
->set_replica_nonce(p
.second
.nonce
);
4993 dir
->state_clear(CDir::STATE_REJOINING
);
4994 dout(10) << " got " << *dir
<< dendl
;
4997 auto it
= ack
->strong_dentries
.find(p
.first
);
4998 if (it
!= ack
->strong_dentries
.end()) {
4999 for (const auto &q
: it
->second
) {
5000 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
5002 dn
= dir
->add_null_dentry(q
.first
.name
, q
.second
.first
, q
.first
.snapid
);
5004 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5006 ceph_assert(dn
->last
== q
.first
.snapid
);
5007 if (dn
->first
!= q
.second
.first
) {
5008 dout(10) << " adjust dn.first " << dn
->first
<< " -> " << q
.second
.first
<< " on " << *dn
<< dendl
;
5009 dn
->first
= q
.second
.first
;
5012 // may have bad linkage if we missed dentry link/unlink messages
5013 if (dnl
->is_primary()) {
5014 CInode
*in
= dnl
->get_inode();
5015 if (!q
.second
.is_primary() ||
5016 vinodeno_t(q
.second
.ino
, q
.first
.snapid
) != in
->vino()) {
5017 dout(10) << " had bad linkage for " << *dn
<< ", unlinking " << *in
<< dendl
;
5018 dir
->unlink_inode(dn
);
5020 } else if (dnl
->is_remote()) {
5021 if (!q
.second
.is_remote() ||
5022 q
.second
.remote_ino
!= dnl
->get_remote_ino() ||
5023 q
.second
.remote_d_type
!= dnl
->get_remote_d_type()) {
5024 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5025 dir
->unlink_inode(dn
);
5028 if (!q
.second
.is_null())
5029 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5032 // hmm, did we have the proper linkage here?
5033 if (dnl
->is_null() && !q
.second
.is_null()) {
5034 if (q
.second
.is_remote()) {
5035 dn
->dir
->link_remote_inode(dn
, q
.second
.remote_ino
, q
.second
.remote_d_type
);
5037 CInode
*in
= get_inode(q
.second
.ino
, q
.first
.snapid
);
5039 // barebones inode; assume it's dir, the full inode loop below will clean up.
5040 in
= new CInode(this, false, q
.second
.first
, q
.first
.snapid
);
5041 auto _inode
= in
->_get_inode();
5042 _inode
->ino
= q
.second
.ino
;
5043 _inode
->mode
= S_IFDIR
;
5044 _inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
5046 dout(10) << " add inode " << *in
<< dendl
;
5047 } else if (in
->get_parent_dn()) {
5048 dout(10) << " had bad linkage for " << *(in
->get_parent_dn())
5049 << ", unlinking " << *in
<< dendl
;
5050 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
5052 dn
->dir
->link_primary_inode(dn
, in
);
5053 isolated_inodes
.erase(in
);
5057 dn
->set_replica_nonce(q
.second
.nonce
);
5058 dn
->lock
.set_state_rejoin(q
.second
.lock
, rejoin_waiters
, survivor
);
5059 dn
->state_clear(CDentry::STATE_REJOINING
);
5060 dout(10) << " got " << *dn
<< dendl
;
5065 for (const auto& in
: refragged_inodes
) {
5066 auto&& ls
= in
->get_nested_dirfrags();
5067 for (const auto& dir
: ls
) {
5068 if (dir
->is_auth() || ack
->strong_dirfrags
.count(dir
->dirfrag()))
5070 ceph_assert(dir
->get_num_any() == 0);
5071 in
->close_dirfrag(dir
->get_frag());
5076 for (const auto &p
: ack
->dirfrag_bases
) {
5077 CDir
*dir
= get_dirfrag(p
.first
);
5079 auto q
= p
.second
.cbegin();
5080 dir
->_decode_base(q
);
5081 dout(10) << " got dir replica " << *dir
<< dendl
;
5085 auto p
= ack
->inode_base
.cbegin();
5093 CInode
*in
= get_inode(ino
, last
);
5095 auto q
= basebl
.cbegin();
5098 sseq
= in
->snaprealm
->srnode
.seq
;
5099 in
->_decode_base(q
);
5100 if (in
->snaprealm
&& in
->snaprealm
->srnode
.seq
!= sseq
) {
5101 int snap_op
= sseq
> 0 ? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
;
5102 updated_realms
.push_back(pair
<CInode
*,int>(in
, snap_op
));
5104 dout(10) << " got inode base " << *in
<< dendl
;
5108 p
= ack
->inode_locks
.cbegin();
5109 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5120 CInode
*in
= get_inode(ino
, last
);
5122 in
->set_replica_nonce(nonce
);
5123 auto q
= lockbl
.cbegin();
5124 in
->_decode_locks_rejoin(q
, rejoin_waiters
, rejoin_eval_locks
, survivor
);
5125 in
->state_clear(CInode::STATE_REJOINING
);
5126 dout(10) << " got inode locks " << *in
<< dendl
;
5129 // FIXME: This can happen if entire subtree, together with the inode subtree root
5130 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5131 ceph_assert(isolated_inodes
.empty());
5133 map
<inodeno_t
,map
<client_t
,Capability::Import
> > peer_imported
;
5134 auto bp
= ack
->imported_caps
.cbegin();
5135 decode(peer_imported
, bp
);
5137 for (map
<inodeno_t
,map
<client_t
,Capability::Import
> >::iterator p
= peer_imported
.begin();
5138 p
!= peer_imported
.end();
5140 auto& ex
= cap_exports
.at(p
->first
);
5141 ceph_assert(ex
.first
== from
);
5142 for (map
<client_t
,Capability::Import
>::iterator q
= p
->second
.begin();
5143 q
!= p
->second
.end();
5145 auto r
= ex
.second
.find(q
->first
);
5146 ceph_assert(r
!= ex
.second
.end());
5148 dout(10) << " exporting caps for client." << q
->first
<< " ino " << p
->first
<< dendl
;
5149 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5151 dout(10) << " no session for client." << p
->first
<< dendl
;
5156 // mark client caps stale.
5157 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, p
->first
, 0,
5158 r
->second
.capinfo
.cap_id
, 0,
5159 mds
->get_osd_epoch_barrier());
5160 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
5161 (q
->second
.cap_id
> 0 ? from
: -1), 0);
5162 mds
->send_message_client_counted(m
, session
);
5166 ceph_assert(ex
.second
.empty());
5169 for (auto p
: updated_realms
) {
5170 CInode
*in
= p
.first
;
5171 bool notify_clients
;
5172 if (mds
->is_rejoin()) {
5173 if (!rejoin_pending_snaprealms
.count(in
)) {
5174 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5175 rejoin_pending_snaprealms
.insert(in
);
5177 notify_clients
= false;
5179 // notify clients if I'm survivor
5180 notify_clients
= true;
5182 do_realm_invalidate_and_update_notify(in
, p
.second
, notify_clients
);
5186 ceph_assert(rejoin_ack_gather
.count(from
));
5187 rejoin_ack_gather
.erase(from
);
5189 if (rejoin_gather
.empty()) {
5190 // eval unstable scatter locks after all wrlocks are rejoined.
5191 while (!rejoin_eval_locks
.empty()) {
5192 SimpleLock
*lock
= rejoin_eval_locks
.front();
5193 rejoin_eval_locks
.pop_front();
5194 if (!lock
->is_stable())
5195 mds
->locker
->eval_gather(lock
);
5199 if (rejoin_gather
.empty() && // make sure we've gotten our FULL inodes, too.
5200 rejoin_ack_gather
.empty()) {
5201 // finally, kickstart past snap parent opens
5204 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")"
5205 << ", rejoin_ack from (" << rejoin_ack_gather
<< ")" << dendl
;
5209 mds
->queue_waiters(rejoin_waiters
);
5214 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5216 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5217 * messages that clean these guys up...
5219 void MDCache::rejoin_trim_undef_inodes()
5221 dout(10) << "rejoin_trim_undef_inodes" << dendl
;
5223 while (!rejoin_undef_inodes
.empty()) {
5224 set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
5226 rejoin_undef_inodes
.erase(p
);
5228 in
->clear_replica_map();
5230 // close out dirfrags
5232 const auto&& dfls
= in
->get_dirfrags();
5233 for (const auto& dir
: dfls
) {
5234 dir
->clear_replica_map();
5236 for (auto &p
: dir
->items
) {
5237 CDentry
*dn
= p
.second
;
5238 dn
->clear_replica_map();
5240 dout(10) << " trimming " << *dn
<< dendl
;
5241 dir
->remove_dentry(dn
);
5244 dout(10) << " trimming " << *dir
<< dendl
;
5245 in
->close_dirfrag(dir
->dirfrag().frag
);
5249 CDentry
*dn
= in
->get_parent_dn();
5251 dn
->clear_replica_map();
5252 dout(10) << " trimming " << *dn
<< dendl
;
5253 dn
->dir
->remove_dentry(dn
);
5255 dout(10) << " trimming " << *in
<< dendl
;
5260 ceph_assert(rejoin_undef_inodes
.empty());
5263 void MDCache::rejoin_gather_finish()
5265 dout(10) << "rejoin_gather_finish" << dendl
;
5266 ceph_assert(mds
->is_rejoin());
5267 ceph_assert(rejoin_ack_gather
.count(mds
->get_nodeid()));
5269 if (open_undef_inodes_dirfrags())
5272 if (process_imported_caps())
5275 choose_lock_states_and_reconnect_caps();
5277 identify_files_to_recover();
5280 // signal completion of fetches, rejoin_gather_finish, etc.
5281 rejoin_ack_gather
.erase(mds
->get_nodeid());
5283 // did we already get our acks too?
5284 if (rejoin_ack_gather
.empty()) {
5285 // finally, open snaprealms
5290 class C_MDC_RejoinOpenInoFinish
: public MDCacheContext
{
5293 C_MDC_RejoinOpenInoFinish(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
5294 void finish(int r
) override
{
5295 mdcache
->rejoin_open_ino_finish(ino
, r
);
5299 void MDCache::rejoin_open_ino_finish(inodeno_t ino
, int ret
)
5301 dout(10) << "open_caps_inode_finish ino " << ino
<< " ret " << ret
<< dendl
;
5304 cap_imports_missing
.insert(ino
);
5305 } else if (ret
== mds
->get_nodeid()) {
5306 ceph_assert(get_inode(ino
));
5308 auto p
= cap_imports
.find(ino
);
5309 ceph_assert(p
!= cap_imports
.end());
5310 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5311 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5312 ceph_assert(q
->second
.size() == 1);
5313 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5315 cap_imports
.erase(p
);
5318 ceph_assert(cap_imports_num_opening
> 0);
5319 cap_imports_num_opening
--;
5321 if (cap_imports_num_opening
== 0) {
5322 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid()))
5323 rejoin_gather_finish();
5324 else if (rejoin_gather
.count(mds
->get_nodeid()))
5325 process_imported_caps();
5329 class C_MDC_RejoinSessionsOpened
: public MDCacheLogContext
{
5331 map
<client_t
,pair
<Session
*,uint64_t> > session_map
;
5332 C_MDC_RejoinSessionsOpened(MDCache
*c
) : MDCacheLogContext(c
) {}
5333 void finish(int r
) override
{
5334 ceph_assert(r
== 0);
5335 mdcache
->rejoin_open_sessions_finish(session_map
);
5339 void MDCache::rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
)
5341 dout(10) << "rejoin_open_sessions_finish" << dendl
;
5342 mds
->server
->finish_force_open_sessions(session_map
);
5343 rejoin_session_map
.swap(session_map
);
5344 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid()))
5345 rejoin_gather_finish();
5348 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
)
5350 auto p
= cap_imports
.find(ino
);
5351 if (p
!= cap_imports
.end()) {
5352 dout(10) << __func__
<< " ino " << ino
<< " ret " << ret
<< dendl
;
5354 cap_imports_missing
.insert(ino
);
5355 } else if (ret
!= mds
->get_nodeid()) {
5356 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5357 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5358 ceph_assert(q
->second
.size() == 1);
5359 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5361 cap_imports
.erase(p
);
5366 bool MDCache::process_imported_caps()
5368 dout(10) << "process_imported_caps" << dendl
;
5370 if (!open_file_table
.is_prefetched() &&
5371 open_file_table
.prefetch_inodes()) {
5372 open_file_table
.wait_for_prefetch(
5373 new MDSInternalContextWrapper(mds
,
5374 new LambdaContext([this](int r
) {
5375 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5376 process_imported_caps();
5383 open_ino_batch_start();
5385 for (auto& p
: cap_imports
) {
5386 CInode
*in
= get_inode(p
.first
);
5388 ceph_assert(in
->is_auth());
5389 cap_imports_missing
.erase(p
.first
);
5392 if (cap_imports_missing
.count(p
.first
) > 0)
5395 uint64_t parent_ino
= 0;
5396 std::string_view d_name
;
5397 for (auto& q
: p
.second
) {
5398 for (auto& r
: q
.second
) {
5399 auto &icr
= r
.second
;
5400 if (icr
.capinfo
.pathbase
&&
5401 icr
.path
.length() > 0 &&
5402 icr
.path
.find('/') == string::npos
) {
5403 parent_ino
= icr
.capinfo
.pathbase
;
5412 dout(10) << " opening missing ino " << p
.first
<< dendl
;
5413 cap_imports_num_opening
++;
5414 auto fin
= new C_MDC_RejoinOpenInoFinish(this, p
.first
);
5416 vector
<inode_backpointer_t
> ancestors
;
5417 ancestors
.push_back(inode_backpointer_t(parent_ino
, string
{d_name
}, 0));
5418 open_ino(p
.first
, (int64_t)-1, fin
, false, false, &ancestors
);
5420 open_ino(p
.first
, (int64_t)-1, fin
, false);
5422 if (!(cap_imports_num_opening
% mds
->heartbeat_reset_grace()))
5423 mds
->heartbeat_reset();
5426 open_ino_batch_submit();
5428 if (cap_imports_num_opening
> 0)
5431 // called by rejoin_gather_finish() ?
5432 if (rejoin_gather
.count(mds
->get_nodeid()) == 0) {
5433 if (!rejoin_client_map
.empty() &&
5434 rejoin_session_map
.empty()) {
5435 C_MDC_RejoinSessionsOpened
*finish
= new C_MDC_RejoinSessionsOpened(this);
5436 version_t pv
= mds
->server
->prepare_force_open_sessions(rejoin_client_map
,
5437 rejoin_client_metadata_map
,
5438 finish
->session_map
);
5439 ESessions
*le
= new ESessions(pv
, std::move(rejoin_client_map
),
5440 std::move(rejoin_client_metadata_map
));
5441 mds
->mdlog
->start_submit_entry(le
, finish
);
5442 mds
->mdlog
->flush();
5443 rejoin_client_map
.clear();
5444 rejoin_client_metadata_map
.clear();
5448 // process caps that were exported by peer rename
5449 for (map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > >::iterator p
= rejoin_peer_exports
.begin();
5450 p
!= rejoin_peer_exports
.end();
5452 CInode
*in
= get_inode(p
->first
);
5454 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.second
.begin();
5455 q
!= p
->second
.second
.end();
5457 auto r
= rejoin_session_map
.find(q
->first
);
5458 if (r
== rejoin_session_map
.end())
5461 Session
*session
= r
->second
.first
;
5462 Capability
*cap
= in
->get_client_cap(q
->first
);
5464 cap
= in
->add_client_cap(q
->first
, session
);
5465 // add empty item to reconnected_caps
5466 (void)reconnected_caps
[p
->first
][q
->first
];
5468 cap
->merge(q
->second
, true);
5470 Capability::Import
& im
= rejoin_imported_caps
[p
->second
.first
][p
->first
][q
->first
];
5471 ceph_assert(cap
->get_last_seq() == im
.issue_seq
);
5472 ceph_assert(cap
->get_mseq() == im
.mseq
);
5473 cap
->set_cap_id(im
.cap_id
);
5474 // send cap import because we assigned a new cap ID
5475 do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
, q
->second
.mseq
- 1,
5476 p
->second
.first
, CEPH_CAP_FLAG_AUTH
);
5479 rejoin_peer_exports
.clear();
5480 rejoin_imported_caps
.clear();
5482 // process cap imports
5483 // ino -> client -> frommds -> capex
5484 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ) {
5485 CInode
*in
= get_inode(p
->first
);
5487 dout(10) << " still missing ino " << p
->first
5488 << ", will try again after replayed client requests" << dendl
;
5492 ceph_assert(in
->is_auth());
5493 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5496 auto r
= rejoin_session_map
.find(q
->first
);
5497 session
= (r
!= rejoin_session_map
.end() ? r
->second
.first
: nullptr);
5500 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5503 (void)rejoin_imported_caps
[r
->first
][p
->first
][q
->first
]; // all are zero
5507 Capability
*cap
= in
->reconnect_cap(q
->first
, r
->second
, session
);
5508 add_reconnected_cap(q
->first
, in
->ino(), r
->second
);
5509 if (r
->first
>= 0) {
5510 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5512 do_cap_import(session
, in
, cap
, r
->second
.capinfo
.cap_id
, 0, 0, r
->first
, 0);
5514 Capability::Import
& im
= rejoin_imported_caps
[r
->first
][p
->first
][q
->first
];
5515 im
.cap_id
= cap
->get_cap_id();
5516 im
.issue_seq
= cap
->get_last_seq();
5517 im
.mseq
= cap
->get_mseq();
5521 cap_imports
.erase(p
++); // remove and move on
5526 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5527 rejoin_gather
.erase(mds
->get_nodeid());
5528 ceph_assert(!rejoin_ack_gather
.count(mds
->get_nodeid()));
5529 maybe_send_pending_rejoins();
5534 void MDCache::rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
,
5535 client_t client
, snapid_t snap_follows
)
5537 dout(10) << "rebuild_need_snapflush " << snap_follows
<< " on " << *head_in
<< dendl
;
5539 if (!realm
->has_snaps_in_range(snap_follows
+ 1, head_in
->first
- 1))
5542 const set
<snapid_t
>& snaps
= realm
->get_snaps();
5543 snapid_t follows
= snap_follows
;
5546 CInode
*in
= pick_inode_snap(head_in
, follows
);
5550 bool need_snapflush
= false;
5551 for (auto p
= snaps
.lower_bound(std::max
<snapid_t
>(in
->first
, (follows
+ 1)));
5552 p
!= snaps
.end() && *p
<= in
->last
;
5554 head_in
->add_need_snapflush(in
, *p
, client
);
5555 need_snapflush
= true;
5558 if (!need_snapflush
)
5561 dout(10) << " need snapflush from client." << client
<< " on " << *in
<< dendl
;
5563 if (in
->client_snap_caps
.empty()) {
5564 for (int i
= 0; i
< num_cinode_locks
; i
++) {
5565 int lockid
= cinode_lock_info
[i
].lock
;
5566 SimpleLock
*lock
= in
->get_lock(lockid
);
5569 lock
->set_state(LOCK_SNAP_SYNC
);
5570 lock
->get_wrlock(true);
5573 in
->client_snap_caps
.insert(client
);
5574 mds
->locker
->mark_need_snapflush_inode(in
);
5579 * choose lock states based on reconnected caps
5581 void MDCache::choose_lock_states_and_reconnect_caps()
5583 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl
;
5586 for (auto p
: inode_map
) {
5587 CInode
*in
= p
.second
;
5588 if (in
->last
!= CEPH_NOSNAP
)
5591 if (in
->is_auth() && !in
->is_base() && in
->get_inode()->is_dirty_rstat())
5592 in
->mark_dirty_rstat();
5595 auto q
= reconnected_caps
.find(in
->ino());
5596 if (q
!= reconnected_caps
.end()) {
5597 for (const auto &it
: q
->second
)
5598 dirty_caps
|= it
.second
.dirty_caps
;
5600 in
->choose_lock_states(dirty_caps
);
5601 dout(15) << " chose lock states on " << *in
<< dendl
;
5603 if (in
->snaprealm
&& !rejoin_pending_snaprealms
.count(in
)) {
5604 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5605 rejoin_pending_snaprealms
.insert(in
);
5608 if (!(++count
% mds
->heartbeat_reset_grace()))
5609 mds
->heartbeat_reset();
5613 void MDCache::prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
5614 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5616 ref_t
<MClientSnap
> snap
;
5617 auto it
= splits
.find(client
);
5618 if (it
!= splits
.end()) {
5620 snap
->head
.op
= CEPH_SNAP_OP_SPLIT
;
5622 snap
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
5623 splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
5624 snap
->head
.split
= realm
->inode
->ino();
5625 snap
->bl
= mds
->server
->get_snap_trace(client
, realm
);
5627 for (const auto& child
: realm
->open_children
)
5628 snap
->split_realms
.push_back(child
->inode
->ino());
5630 snap
->split_inos
.push_back(ino
);
5633 void MDCache::prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
,
5634 map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5636 ceph_assert(parent_realm
);
5638 vector
<inodeno_t
> split_inos
;
5639 vector
<inodeno_t
> split_realms
;
5641 for (auto p
= realm
->inodes_with_caps
.begin(); !p
.end(); ++p
)
5642 split_inos
.push_back((*p
)->ino());
5643 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
5644 p
!= realm
->open_children
.end();
5646 split_realms
.push_back((*p
)->inode
->ino());
5648 for (const auto& p
: realm
->client_caps
) {
5649 ceph_assert(!p
.second
->empty());
5650 auto em
= splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple());
5652 auto update
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
5653 update
->head
.split
= parent_realm
->inode
->ino();
5654 update
->split_inos
= split_inos
;
5655 update
->split_realms
= split_realms
;
5656 update
->bl
= mds
->server
->get_snap_trace(p
.first
, parent_realm
);
5657 em
.first
->second
= std::move(update
);
5662 void MDCache::send_snaps(map
<client_t
,ref_t
<MClientSnap
>>& splits
)
5664 dout(10) << "send_snaps" << dendl
;
5666 for (auto &p
: splits
) {
5667 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
.first
.v
));
5669 dout(10) << " client." << p
.first
5670 << " split " << p
.second
->head
.split
5671 << " inos " << p
.second
->split_inos
5673 mds
->send_message_client_counted(p
.second
, session
);
5675 dout(10) << " no session for client." << p
.first
<< dendl
;
5683 * remove any items from logsegment open_file lists that don't have
5686 void MDCache::clean_open_file_lists()
5688 dout(10) << "clean_open_file_lists" << dendl
;
5690 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
5691 p
!= mds
->mdlog
->segments
.end();
5693 LogSegment
*ls
= p
->second
;
5695 elist
<CInode
*>::iterator q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5699 if (in
->last
== CEPH_NOSNAP
) {
5700 dout(10) << " unlisting unwanted/capless inode " << *in
<< dendl
;
5701 in
->item_open_file
.remove_myself();
5703 if (in
->client_snap_caps
.empty()) {
5704 dout(10) << " unlisting flushed snap inode " << *in
<< dendl
;
5705 in
->item_open_file
.remove_myself();
5712 void MDCache::dump_openfiles(Formatter
*f
)
5714 f
->open_array_section("openfiles");
5715 for (auto p
= mds
->mdlog
->segments
.begin();
5716 p
!= mds
->mdlog
->segments
.end();
5718 LogSegment
*ls
= p
->second
;
5720 auto q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5724 if ((in
->last
== CEPH_NOSNAP
&& !in
->is_any_caps_wanted())
5725 || (in
->last
!= CEPH_NOSNAP
&& in
->client_snap_caps
.empty()))
5727 f
->open_object_section("file");
5728 in
->dump(f
, CInode::DUMP_PATH
| CInode::DUMP_INODE_STORE_BASE
| CInode::DUMP_CAPS
);
5735 Capability
* MDCache::rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
)
5737 dout(10) << "rejoin_import_cap for client." << client
<< " from mds." << frommds
5738 << " on " << *in
<< dendl
;
5739 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
5741 dout(10) << " no session for client." << client
<< dendl
;
5745 Capability
*cap
= in
->reconnect_cap(client
, icr
, session
);
5748 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5750 do_cap_import(session
, in
, cap
, icr
.capinfo
.cap_id
, 0, 0, frommds
, 0);
5756 void MDCache::export_remaining_imported_caps()
5758 dout(10) << "export_remaining_imported_caps" << dendl
;
5760 CachedStackStringStream css
;
5763 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ++p
) {
5764 *css
<< " ino " << p
->first
<< "\n";
5765 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5766 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5768 // mark client caps stale.
5769 auto stale
= make_message
<MClientCaps
>(CEPH_CAP_OP_EXPORT
, p
->first
,
5771 mds
->get_osd_epoch_barrier());
5772 stale
->set_cap_peer(0, 0, 0, -1, 0);
5773 mds
->send_message_client_counted(stale
, q
->first
);
5777 if (!(++count
% mds
->heartbeat_reset_grace()))
5778 mds
->heartbeat_reset();
5781 for (map
<inodeno_t
, MDSContext::vec
>::iterator p
= cap_reconnect_waiters
.begin();
5782 p
!= cap_reconnect_waiters
.end();
5784 mds
->queue_waiters(p
->second
);
5786 cap_imports
.clear();
5787 cap_reconnect_waiters
.clear();
5789 if (css
->strv().length()) {
5790 mds
->clog
->warn() << "failed to reconnect caps for missing inodes:"
5795 Capability
* MDCache::try_reconnect_cap(CInode
*in
, Session
*session
)
5797 client_t client
= session
->info
.get_client();
5798 Capability
*cap
= nullptr;
5799 const cap_reconnect_t
*rc
= get_replay_cap_reconnect(in
->ino(), client
);
5801 cap
= in
->reconnect_cap(client
, *rc
, session
);
5802 dout(10) << "try_reconnect_cap client." << client
5803 << " reconnect wanted " << ccap_string(rc
->capinfo
.wanted
)
5804 << " issue " << ccap_string(rc
->capinfo
.issued
)
5805 << " on " << *in
<< dendl
;
5806 remove_replay_cap_reconnect(in
->ino(), client
);
5808 if (in
->is_replicated()) {
5809 mds
->locker
->try_eval(in
, CEPH_CAP_LOCKS
);
5812 auto p
= reconnected_caps
.find(in
->ino());
5813 if (p
!= reconnected_caps
.end()) {
5814 auto q
= p
->second
.find(client
);
5815 if (q
!= p
->second
.end())
5816 dirty_caps
= q
->second
.dirty_caps
;
5818 in
->choose_lock_states(dirty_caps
);
5819 dout(15) << " chose lock states on " << *in
<< dendl
;
5822 map
<inodeno_t
, MDSContext::vec
>::iterator it
=
5823 cap_reconnect_waiters
.find(in
->ino());
5824 if (it
!= cap_reconnect_waiters
.end()) {
5825 mds
->queue_waiters(it
->second
);
5826 cap_reconnect_waiters
.erase(it
);
5835 // cap imports and delayed snap parent opens
5837 void MDCache::do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
5838 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
5839 int peer
, int p_flags
)
5841 SnapRealm
*realm
= in
->find_snaprealm();
5842 dout(10) << "do_cap_import " << session
->info
.inst
.name
<< " mseq " << cap
->get_mseq() << " on " << *in
<< dendl
;
5843 if (cap
->get_last_seq() == 0) // reconnected cap
5844 cap
->inc_last_seq();
5845 cap
->set_last_issue();
5846 cap
->set_last_issue_stamp(ceph_clock_now());
5848 auto reap
= make_message
<MClientCaps
>(CEPH_CAP_OP_IMPORT
,
5849 in
->ino(), realm
->inode
->ino(), cap
->get_cap_id(),
5850 cap
->get_last_seq(), cap
->pending(), cap
->wanted(),
5851 0, cap
->get_mseq(), mds
->get_osd_epoch_barrier());
5852 in
->encode_cap_message(reap
, cap
);
5853 reap
->snapbl
= mds
->server
->get_snap_trace(session
, realm
);
5854 reap
->set_cap_peer(p_cap_id
, p_seq
, p_mseq
, peer
, p_flags
);
5855 mds
->send_message_client_counted(reap
, session
);
5858 void MDCache::do_delayed_cap_imports()
5860 dout(10) << "do_delayed_cap_imports" << dendl
;
5862 ceph_assert(delayed_imported_caps
.empty());
5865 struct C_MDC_OpenSnapRealms
: public MDCacheContext
{
5866 explicit C_MDC_OpenSnapRealms(MDCache
*c
) : MDCacheContext(c
) {}
5867 void finish(int r
) override
{
5868 mdcache
->open_snaprealms();
5872 void MDCache::open_snaprealms()
5874 dout(10) << "open_snaprealms" << dendl
;
5876 auto it
= rejoin_pending_snaprealms
.begin();
5877 while (it
!= rejoin_pending_snaprealms
.end()) {
5879 SnapRealm
*realm
= in
->snaprealm
;
5882 map
<client_t
,ref_t
<MClientSnap
>> splits
;
5883 // finish off client snaprealm reconnects?
5884 auto q
= reconnected_snaprealms
.find(in
->ino());
5885 if (q
!= reconnected_snaprealms
.end()) {
5886 for (const auto& r
: q
->second
)
5887 finish_snaprealm_reconnect(r
.first
, realm
, r
.second
, splits
);
5888 reconnected_snaprealms
.erase(q
);
5891 for (auto p
= realm
->inodes_with_caps
.begin(); !p
.end(); ++p
) {
5893 auto q
= reconnected_caps
.find(child
->ino());
5894 ceph_assert(q
!= reconnected_caps
.end());
5895 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5896 Capability
*cap
= child
->get_client_cap(r
->first
);
5899 if (r
->second
.snap_follows
> 0) {
5900 if (r
->second
.snap_follows
< child
->first
- 1) {
5901 rebuild_need_snapflush(child
, realm
, r
->first
, r
->second
.snap_follows
);
5902 } else if (r
->second
.snapflush
) {
5903 // When processing a cap flush message that is re-sent, it's possble
5904 // that the sender has already released all WR caps. So we should
5905 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5906 cap
->mark_needsnapflush();
5909 // make sure client's cap is in the correct snaprealm.
5910 if (r
->second
.realm_ino
!= in
->ino()) {
5911 prepare_realm_split(realm
, r
->first
, child
->ino(), splits
);
5916 rejoin_pending_snaprealms
.erase(it
++);
5917 in
->put(CInode::PIN_OPENINGSNAPPARENTS
);
5922 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE
);
5924 if (!reconnected_snaprealms
.empty()) {
5925 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl
;
5926 for (auto& p
: reconnected_snaprealms
) {
5927 CachedStackStringStream css
;
5928 *css
<< " " << p
.first
<< " {";
5930 for (auto& q
: p
.second
) {
5933 *css
<< "client." << q
.first
<< "/" << q
.second
;
5936 dout(5) << css
->strv() << dendl
;
5939 ceph_assert(rejoin_waiters
.empty());
5940 ceph_assert(rejoin_pending_snaprealms
.empty());
5941 dout(10) << "open_snaprealms - all open" << dendl
;
5942 do_delayed_cap_imports();
5944 ceph_assert(rejoin_done
);
5945 rejoin_done
.release()->complete(0);
5946 reconnected_caps
.clear();
5949 bool MDCache::open_undef_inodes_dirfrags()
5951 dout(10) << "open_undef_inodes_dirfrags "
5952 << rejoin_undef_inodes
.size() << " inodes "
5953 << rejoin_undef_dirfrags
.size() << " dirfrags" << dendl
;
5955 // dirfrag -> (fetch_complete, keys_to_fetch)
5956 map
<CDir
*, pair
<bool, std::vector
<dentry_key_t
> > > fetch_queue
;
5957 for (auto& dir
: rejoin_undef_dirfrags
) {
5958 ceph_assert(dir
->get_version() == 0);
5959 fetch_queue
.emplace(std::piecewise_construct
, std::make_tuple(dir
), std::make_tuple());
5962 if (g_conf().get_val
<bool>("mds_dir_prefetch")) {
5963 for (auto& in
: rejoin_undef_inodes
) {
5964 ceph_assert(!in
->is_base());
5965 ceph_assert(in
->get_parent_dir());
5966 fetch_queue
.emplace(std::piecewise_construct
, std::make_tuple(in
->get_parent_dir()), std::make_tuple());
5969 for (auto& in
: rejoin_undef_inodes
) {
5970 assert(!in
->is_base());
5971 CDentry
*dn
= in
->get_parent_dn();
5972 auto& p
= fetch_queue
[dn
->get_dir()];
5974 if (dn
->last
!= CEPH_NOSNAP
) {
5977 } else if (!p
.first
) {
5978 p
.second
.push_back(dn
->key());
5983 if (fetch_queue
.empty())
5986 MDSGatherBuilder
gather(g_ceph_context
,
5987 new MDSInternalContextWrapper(mds
,
5988 new LambdaContext([this](int r
) {
5989 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid()))
5990 rejoin_gather_finish();
5995 for (auto& p
: fetch_queue
) {
5996 CDir
*dir
= p
.first
;
5997 CInode
*diri
= dir
->get_inode();
5998 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
6000 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
6001 ceph_assert(diri
->dirfragtree
.is_leaf(dir
->get_frag()));
6002 if (p
.second
.first
|| p
.second
.second
.empty()) {
6003 dir
->fetch(gather
.new_sub());
6005 dir
->fetch_keys(p
.second
.second
, gather
.new_sub());
6008 ceph_assert(gather
.has_subs());
6013 void MDCache::opened_undef_inode(CInode
*in
) {
6014 dout(10) << "opened_undef_inode " << *in
<< dendl
;
6015 rejoin_undef_inodes
.erase(in
);
6017 // FIXME: re-hash dentries if necessary
6018 ceph_assert(in
->get_inode()->dir_layout
.dl_dir_hash
== g_conf()->mds_default_dir_hash
);
6019 if (in
->get_num_dirfrags() && !in
->dirfragtree
.is_leaf(frag_t())) {
6020 CDir
*dir
= in
->get_dirfrag(frag_t());
6022 rejoin_undef_dirfrags
.erase(dir
);
6023 in
->force_dirfrags();
6024 auto&& ls
= in
->get_dirfrags();
6025 for (const auto& dir
: ls
) {
6026 rejoin_undef_dirfrags
.insert(dir
);
6032 void MDCache::finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
6033 map
<client_t
,ref_t
<MClientSnap
>>& updates
)
6035 if (seq
< realm
->get_newest_seq()) {
6036 dout(10) << "finish_snaprealm_reconnect client." << client
<< " has old seq " << seq
<< " < "
6037 << realm
->get_newest_seq() << " on " << *realm
<< dendl
;
6038 auto snap
= make_message
<MClientSnap
>(CEPH_SNAP_OP_UPDATE
);
6039 snap
->bl
= mds
->server
->get_snap_trace(client
, realm
);
6040 updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
6042 dout(10) << "finish_snaprealm_reconnect client." << client
<< " up to date"
6043 << " on " << *realm
<< dendl
;
6049 void MDCache::rejoin_send_acks()
6051 dout(7) << "rejoin_send_acks" << dendl
;
6054 for (map
<mds_rank_t
, set
<CInode
*> >::iterator p
= rejoin_unlinked_inodes
.begin();
6055 p
!= rejoin_unlinked_inodes
.end();
6057 for (set
<CInode
*>::iterator q
= p
->second
.begin();
6058 q
!= p
->second
.end();
6061 dout(7) << " unlinked inode " << *in
<< dendl
;
6063 if (!in
->is_replica(p
->first
))
6066 CDentry
*dn
= in
->get_parent_dn();
6067 if (dn
->is_replica(p
->first
))
6069 dn
->add_replica(p
->first
);
6070 CDir
*dir
= dn
->get_dir();
6071 if (dir
->is_replica(p
->first
))
6073 dir
->add_replica(p
->first
);
6074 in
= dir
->get_inode();
6075 if (in
->is_replica(p
->first
))
6077 in
->add_replica(p
->first
);
6083 rejoin_unlinked_inodes
.clear();
6085 // send acks to everyone in the recovery set
6086 map
<mds_rank_t
,ref_t
<MMDSCacheRejoin
>> acks
;
6087 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
6088 p
!= recovery_set
.end();
6090 if (rejoin_ack_sent
.count(*p
))
6092 acks
[*p
] = make_message
<MMDSCacheRejoin
>(MMDSCacheRejoin::OP_ACK
);
6095 rejoin_ack_sent
= recovery_set
;
6098 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
6099 p
!= subtrees
.end();
6101 CDir
*dir
= p
->first
;
6102 if (!dir
->is_auth())
6104 dout(10) << "subtree " << *dir
<< dendl
;
6106 // auth items in this subtree
6107 std::queue
<CDir
*> dq
;
6110 while (!dq
.empty()) {
6111 CDir
*dir
= dq
.front();
6115 for (auto &r
: dir
->get_replicas()) {
6116 auto it
= acks
.find(r
.first
);
6117 if (it
== acks
.end())
6119 it
->second
->add_strong_dirfrag(dir
->dirfrag(), ++r
.second
, dir
->dir_rep
);
6120 it
->second
->add_dirfrag_base(dir
);
6123 for (auto &p
: dir
->items
) {
6124 CDentry
*dn
= p
.second
;
6125 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6129 if (dnl
->is_primary())
6130 in
= dnl
->get_inode();
6133 for (auto &r
: dn
->get_replicas()) {
6134 auto it
= acks
.find(r
.first
);
6135 if (it
== acks
.end())
6137 it
->second
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->get_alternate_name(),
6138 dn
->first
, dn
->last
,
6139 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
6140 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
6141 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
6143 dn
->lock
.get_replica_state());
6144 // peer missed MDentrylink message ?
6145 if (in
&& !in
->is_replica(r
.first
))
6146 in
->add_replica(r
.first
);
6152 for (auto &r
: in
->get_replicas()) {
6153 auto it
= acks
.find(r
.first
);
6154 if (it
== acks
.end())
6156 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6158 in
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6159 it
->second
->add_inode_locks(in
, ++r
.second
, bl
);
6162 // subdirs in this subtree?
6164 auto&& dirs
= in
->get_nested_dirfrags();
6165 for (const auto& dir
: dirs
) {
6174 if (root
&& root
->is_auth())
6175 for (auto &r
: root
->get_replicas()) {
6176 auto it
= acks
.find(r
.first
);
6177 if (it
== acks
.end())
6179 it
->second
->add_inode_base(root
, mds
->mdsmap
->get_up_features());
6181 root
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6182 it
->second
->add_inode_locks(root
, ++r
.second
, bl
);
6185 for (auto &r
: myin
->get_replicas()) {
6186 auto it
= acks
.find(r
.first
);
6187 if (it
== acks
.end())
6189 it
->second
->add_inode_base(myin
, mds
->mdsmap
->get_up_features());
6191 myin
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6192 it
->second
->add_inode_locks(myin
, ++r
.second
, bl
);
6195 // include inode base for any inodes whose scatterlocks may have updated
6196 for (set
<CInode
*>::iterator p
= rejoin_potential_updated_scatterlocks
.begin();
6197 p
!= rejoin_potential_updated_scatterlocks
.end();
6200 for (const auto &r
: in
->get_replicas()) {
6201 auto it
= acks
.find(r
.first
);
6202 if (it
== acks
.end())
6204 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6209 for (auto p
= acks
.begin(); p
!= acks
.end(); ++p
) {
6210 encode(rejoin_imported_caps
[p
->first
], p
->second
->imported_caps
);
6211 mds
->send_message_mds(p
->second
, p
->first
);
6214 rejoin_imported_caps
.clear();
6217 class C_MDC_ReIssueCaps
: public MDCacheContext
{
6220 C_MDC_ReIssueCaps(MDCache
*mdc
, CInode
*i
) :
6221 MDCacheContext(mdc
), in(i
)
6223 in
->get(CInode::PIN_PTRWAITER
);
6225 void finish(int r
) override
{
6226 if (!mdcache
->mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6227 mdcache
->mds
->locker
->issue_caps(in
);
6228 in
->put(CInode::PIN_PTRWAITER
);
6232 void MDCache::reissue_all_caps()
6234 dout(10) << "reissue_all_caps" << dendl
;
6237 for (auto &p
: inode_map
) {
6239 CInode
*in
= p
.second
;
6240 if (in
->is_head() && in
->is_any_caps()) {
6241 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6242 if (in
->is_frozen_inode()) {
6243 in
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDC_ReIssueCaps(this, in
));
6246 if (!mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6247 n
+= mds
->locker
->issue_caps(in
);
6250 if ((count
% mds
->heartbeat_reset_grace()) + n
>= mds
->heartbeat_reset_grace())
6251 mds
->heartbeat_reset();
6257 // ===============================================================================
6259 struct C_MDC_QueuedCow
: public MDCacheContext
{
6262 C_MDC_QueuedCow(MDCache
*mdc
, CInode
*i
, MutationRef
& m
) :
6263 MDCacheContext(mdc
), in(i
), mut(m
) {}
6264 void finish(int r
) override
{
6265 mdcache
->_queued_file_recover_cow(in
, mut
);
6270 void MDCache::queue_file_recover(CInode
*in
)
6272 dout(10) << "queue_file_recover " << *in
<< dendl
;
6273 ceph_assert(in
->is_auth());
6277 SnapRealm *realm = in->find_snaprealm();
6278 set<snapid_t> s = realm->get_snaps();
6279 while (!s.empty() && *s.begin() < in->first)
6281 while (!s.empty() && *s.rbegin() > in->last)
6282 s.erase(*s.rbegin());
6283 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6285 auto pi = in->project_inode(mut);
6286 pi.inode.version = in->pre_dirty();
6288 auto mut(std::make_shared<MutationImpl>());
6289 mut->ls = mds->mdlog->get_current_segment();
6290 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6291 mds->mdlog->start_entry(le);
6292 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6294 s.erase(*s.begin());
6295 while (!s.empty()) {
6296 snapid_t snapid = *s.begin();
6297 CInode *cow_inode = 0;
6298 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6299 ceph_assert(cow_inode);
6300 recovery_queue.enqueue(cow_inode);
6301 s.erase(*s.begin());
6304 in->parent->first = in->first;
6305 le->metablob.add_primary_dentry(in->parent, in, true);
6306 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6307 mds->mdlog->flush();
6311 recovery_queue
.enqueue(in
);
6314 void MDCache::_queued_file_recover_cow(CInode
*in
, MutationRef
& mut
)
6317 mds
->locker
->drop_locks(mut
.get());
6323 * called after recovery to recover file sizes for previously opened (for write)
6324 * files. that is, those where max_size > size.
6326 void MDCache::identify_files_to_recover()
6328 dout(10) << "identify_files_to_recover" << dendl
;
6331 // Clear the recover and check queues in case the monitor sends rejoin mdsmap twice.
6332 rejoin_recover_q
.clear();
6333 rejoin_check_q
.clear();
6335 for (auto &p
: inode_map
) {
6336 CInode
*in
= p
.second
;
6340 if (in
->last
!= CEPH_NOSNAP
)
6343 // Only normal files need file size recovery
6344 if (!in
->is_file()) {
6348 bool recover
= false;
6349 const auto& client_ranges
= in
->get_projected_inode()->client_ranges
;
6350 if (!client_ranges
.empty()) {
6351 in
->mark_clientwriteable();
6352 for (auto& p
: client_ranges
) {
6353 Capability
*cap
= in
->get_client_cap(p
.first
);
6355 cap
->mark_clientwriteable();
6357 dout(10) << " client." << p
.first
<< " has range " << p
.second
<< " but no cap on " << *in
<< dendl
;
6365 if (in
->filelock
.is_stable()) {
6366 in
->auth_pin(&in
->filelock
);
6368 ceph_assert(in
->filelock
.get_state() == LOCK_XLOCKSNAP
);
6370 in
->filelock
.set_state(LOCK_PRE_SCAN
);
6371 rejoin_recover_q
.push_back(in
);
6373 rejoin_check_q
.push_back(in
);
6376 if (!(++count
% mds
->heartbeat_reset_grace()))
6377 mds
->heartbeat_reset();
6381 void MDCache::start_files_to_recover()
6384 for (CInode
*in
: rejoin_check_q
) {
6385 if (in
->filelock
.get_state() == LOCK_XLOCKSNAP
)
6386 mds
->locker
->issue_caps(in
);
6387 mds
->locker
->check_inode_max_size(in
);
6388 if (!(++count
% mds
->heartbeat_reset_grace()))
6389 mds
->heartbeat_reset();
6391 rejoin_check_q
.clear();
6392 for (CInode
*in
: rejoin_recover_q
) {
6393 mds
->locker
->file_recover(&in
->filelock
);
6394 if (!(++count
% mds
->heartbeat_reset_grace()))
6395 mds
->heartbeat_reset();
6397 if (!rejoin_recover_q
.empty()) {
6398 rejoin_recover_q
.clear();
6403 void MDCache::do_file_recover()
6405 recovery_queue
.advance();
6408 // ===============================================================================
6411 // ----------------------------
6414 class C_MDC_RetryTruncate
: public MDCacheContext
{
6418 C_MDC_RetryTruncate(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6419 MDCacheContext(c
), in(i
), ls(l
) {}
6420 void finish(int r
) override
{
6421 mdcache
->_truncate_inode(in
, ls
);
6425 void MDCache::truncate_inode(CInode
*in
, LogSegment
*ls
)
6427 const auto& pi
= in
->get_projected_inode();
6428 dout(10) << "truncate_inode "
6429 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6433 ls
->truncating_inodes
.insert(in
);
6434 in
->get(CInode::PIN_TRUNCATING
);
6437 if (!in
->client_need_snapflush
.empty() &&
6438 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6439 ceph_assert(in
->filelock
.is_xlocked());
6440 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6441 mds
->locker
->issue_caps(in
);
6445 _truncate_inode(in
, ls
);
// IO completion for the fscrypt last-block rewrite issued from
// _truncate_inode: tolerates ENOENT (object already gone) and then
// proceeds to truncate_inode_write_finish with the recorded block size.
6448 struct C_IO_MDC_TruncateWriteFinish
: public MDCacheIOContext
{
6451 uint32_t block_size
;
6452 C_IO_MDC_TruncateWriteFinish(MDCache
*c
, CInode
*i
, LogSegment
*l
, uint32_t bs
) :
6453 MDCacheIOContext(c
, false), in(i
), ls(l
), block_size(bs
) {
6455 void finish(int r
) override
{
// ENOENT is fine: the backing object may simply not exist.
6456 ceph_assert(r
== 0 || r
== -CEPHFS_ENOENT
);
6457 mdcache
->truncate_inode_write_finish(in
, ls
, block_size
);
6459 void print(ostream
& out
) const override
{
6460 out
<< "file_truncate_write(" << in
->ino() << ")";
// IO completion for the object-store truncate: tolerates ENOENT and then
// finishes the truncate bookkeeping via truncate_inode_finish.
6464 struct C_IO_MDC_TruncateFinish
: public MDCacheIOContext
{
6467 C_IO_MDC_TruncateFinish(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6468 MDCacheIOContext(c
, false), in(i
), ls(l
) {
6470 void finish(int r
) override
{
// ENOENT is fine: the backing object may simply not exist.
6471 ceph_assert(r
== 0 || r
== -CEPHFS_ENOENT
);
6472 mdcache
->truncate_inode_finish(in
, ls
);
6474 void print(ostream
& out
) const override
{
6475 out
<< "file_truncate(" << in
->ino() << ")";
// Perform the object-store side of an inode truncate.
// Two paths:
//  - fscrypt: if fscrypt_last_block carries an encrypted last-block image
//    (and it is not a file hole), first rewrite that block via filer.write,
//    completing in C_IO_MDC_TruncateWriteFinish;
//  - otherwise issue filer.truncate over [truncate_size, truncate_from),
//    padding the length by header.block_size when the fscrypt last block
//    landed in a hole, completing in C_IO_MDC_TruncateFinish.
// Snap context comes from the inode's SnapRealm (or a null context when
// there is no realm, which is only legal for head inodes).
6479 void MDCache::_truncate_inode(CInode
*in
, LogSegment
*ls
)
6481 const auto& pi
= in
->get_inode();
6482 dout(10) << "_truncate_inode "
6483 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6484 << " fscrypt last block length is " << pi
->fscrypt_last_block
.length()
6485 << " on " << *in
<< dendl
;
// Sanity: a truncate must be pending, sizes must be sane, and
// size == from is only allowed in the fscrypt last-block case.
6487 ceph_assert(pi
->is_truncating());
6488 ceph_assert(pi
->truncate_size
< (1ULL << 63));
6489 ceph_assert(pi
->truncate_from
< (1ULL << 63));
6490 ceph_assert(pi
->truncate_size
< pi
->truncate_from
||
6491 (pi
->truncate_size
== pi
->truncate_from
&&
6492 pi
->fscrypt_last_block
.length()));
// Pick the snap context: the realm's if one exists, else a null context
// (only valid for a CEPH_NOSNAP head inode).
6495 SnapRealm
*realm
= in
->find_snaprealm();
6496 SnapContext nullsnap
;
6497 const SnapContext
*snapc
;
6499 dout(10) << " realm " << *realm
<< dendl
;
6500 snapc
= &realm
->get_snap_context();
6502 dout(10) << " NO realm, using null context" << dendl
;
6504 ceph_assert(in
->last
== CEPH_NOSNAP
);
6506 dout(10) << "_truncate_inode snapc " << snapc
<< " on " << *in
6507 << " fscrypt_last_block length is " << pi
->fscrypt_last_block
.length()
6509 auto layout
= pi
->layout
;
// Decode the fscrypt last-block header (change_attr, file_offset,
// block_size) if an encrypted last block was recorded by the client.
6510 struct ceph_fscrypt_last_block_header header
;
6511 memset(&header
, 0, sizeof(header
));
6513 if (pi
->fscrypt_last_block
.length()) {
6514 auto bl
= pi
->fscrypt_last_block
.cbegin();
6515 DECODE_START(1, bl
);
6516 decode(header
.change_attr
, bl
);
6517 decode(header
.file_offset
, bl
);
6518 decode(header
.block_size
, bl
);
6521 * The block_size will be in unit of KB, so if the last block is not
6522 * located in a file hole, the struct_len should be larger than the
6523 * header.block_size.
6525 if (struct_len
> header
.block_size
) {
6526 bl
.copy(header
.block_size
, data
);
// Path 1: rewrite the (re-encrypted) last block, then continue in
// truncate_inode_write_finish.
6531 if (data
.length()) {
6532 dout(10) << "_truncate_inode write on inode " << *in
<< " change_attr: "
6533 << header
.change_attr
<< " offset: " << header
.file_offset
<< " blen: "
6534 << header
.block_size
<< dendl
;
6535 filer
.write(in
->ino(), &layout
, *snapc
, header
.file_offset
, header
.block_size
,
6536 data
, ceph::real_time::min(), 0,
6537 new C_OnFinisher(new C_IO_MDC_TruncateWriteFinish(this, in
, ls
,
// Path 2: plain truncate (last block absent or in a hole).
6540 } else { // located in file hole.
6541 uint64_t length
= pi
->truncate_from
- pi
->truncate_size
;
6544 * When the fscrypt is enabled the truncate_from and truncate_size
6545 * possibly equal and both are aligned up to header.block_size. In
6546 * this case we will always request a larger length to make sure the
6547 * OSD won't miss truncating the last object.
6549 if (pi
->fscrypt_last_block
.length()) {
6550 dout(10) << "_truncate_inode truncate on inode " << *in
<< " hits a hole!" << dendl
;
6551 length
+= header
.block_size
;
6553 ceph_assert(length
);
6555 dout(10) << "_truncate_inode truncate on inode " << *in
<< dendl
;
6556 filer
.truncate(in
->ino(), &layout
, *snapc
, pi
->truncate_size
, length
,
6557 pi
->truncate_seq
, ceph::real_time::min(), 0,
6558 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in
, ls
),
6564 struct C_MDC_TruncateLogged
: public MDCacheLogContext
{
6567 C_MDC_TruncateLogged(MDCache
*m
, CInode
*i
, MutationRef
& mu
) :
6568 MDCacheLogContext(m
), in(i
), mut(mu
) {}
6569 void finish(int r
) override
{
6570 mdcache
->truncate_inode_logged(in
, mut
);
// Continuation after the fscrypt last-block rewrite completed: now issue
// the actual object truncate. The requested length is padded by
// block_size because with fscrypt truncate_from and truncate_size may be
// equal (both block-aligned) and the OSD must still truncate the last
// object. Completes in C_IO_MDC_TruncateFinish.
6574 void MDCache::truncate_inode_write_finish(CInode
*in
, LogSegment
*ls
,
6575 uint32_t block_size
)
6577 const auto& pi
= in
->get_inode();
6578 dout(10) << "_truncate_inode_write "
6579 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6580 << " on " << *in
<< dendl
;
// Same invariants as _truncate_inode.
6582 ceph_assert(pi
->is_truncating());
6583 ceph_assert(pi
->truncate_size
< (1ULL << 63));
6584 ceph_assert(pi
->truncate_from
< (1ULL << 63));
6585 ceph_assert(pi
->truncate_size
< pi
->truncate_from
||
6586 (pi
->truncate_size
== pi
->truncate_from
&&
6587 pi
->fscrypt_last_block
.length()));
// Snap context selection mirrors _truncate_inode.
6590 SnapRealm
*realm
= in
->find_snaprealm();
6591 SnapContext nullsnap
;
6592 const SnapContext
*snapc
;
6594 dout(10) << " realm " << *realm
<< dendl
;
6595 snapc
= &realm
->get_snap_context();
6597 dout(10) << " NO realm, using null context" << dendl
;
6599 ceph_assert(in
->last
== CEPH_NOSNAP
);
6601 dout(10) << "_truncate_inode_write snapc " << snapc
<< " on " << *in
6602 << " fscrypt_last_block length is " << pi
->fscrypt_last_block
.length()
6604 auto layout
= pi
->layout
;
6606 * When the fscrypt is enabled the truncate_from and truncate_size
6607 * possibly equal and both are aligned up to header.block_size. In
6608 * this case we will always request a larger length to make sure the
6609 * OSD won't miss truncating the last object.
6611 uint64_t length
= pi
->truncate_from
- pi
->truncate_size
+ block_size
;
6612 filer
.truncate(in
->ino(), &layout
, *snapc
, pi
->truncate_size
, length
,
6613 pi
->truncate_seq
, ceph::real_time::min(), 0,
6614 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in
, ls
),
// Object-store truncate done: remove the inode from the log segment's
// truncating set, project a new inode version that clears the truncate
// state (truncate_from, truncate_pending, fscrypt_last_block), journal a
// "truncate finish" EUpdate, and complete in C_MDC_TruncateLogged. The
// log is flushed eagerly if readers/writers are blocked on the truncate.
6618 void MDCache::truncate_inode_finish(CInode
*in
, LogSegment
*ls
)
6620 dout(10) << "truncate_inode_finish " << *in
<< dendl
;
// The inode must have been registered by truncate_inode /
// add_recovered_truncate in this segment.
6622 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6623 ceph_assert(p
!= ls
->truncating_inodes
.end());
6624 ls
->truncating_inodes
.erase(p
);
6626 MutationRef
mut(new MutationImpl());
6627 mut
->ls
= mds
->mdlog
->get_current_segment();
// Project the post-truncate inode state.
6630 auto pi
= in
->project_inode(mut
);
6631 pi
.inode
->version
= in
->pre_dirty();
6632 pi
.inode
->truncate_from
= 0;
6633 pi
.inode
->truncate_pending
--;
6634 pi
.inode
->fscrypt_last_block
= bufferlist();
6636 EUpdate
*le
= new EUpdate(mds
->mdlog
, "truncate finish");
6637 mds
->mdlog
->start_entry(le
);
6639 predirty_journal_parents(mut
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
6640 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
// Record which segment's truncate this completes (ls->seq).
6641 le
->metablob
.add_truncate_finish(in
->ino(), ls
->seq
);
6642 mds
->mdlog
->submit_entry(le
, new C_MDC_TruncateLogged(this, in
, mut
));
6644 // flush immediately if there are readers/writers waiting
6645 if (in
->is_waiter_for(CInode::WAIT_TRUNC
) ||
6646 (in
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
6647 mds
->mdlog
->flush();
// Final step after the truncate-finish event is journaled: drop the
// mutation's locks, release the PIN_TRUNCATING ref and auth pin, and wake
// anything waiting on WAIT_TRUNC.
6650 void MDCache::truncate_inode_logged(CInode
*in
, MutationRef
& mut
)
6652 dout(10) << "truncate_inode_logged " << *in
<< dendl
;
6654 mds
->locker
->drop_locks(mut
.get());
6657 in
->put(CInode::PIN_TRUNCATING
);
6658 in
->auth_unpin(this);
6660 MDSContext::vec waiters
;
6661 in
->take_waiting(CInode::WAIT_TRUNC
, waiters
);
6662 mds
->queue_waiters(waiters
);
// Re-register an in-flight truncate discovered during journal replay:
// record the inode in the segment's truncating set and pin it, mirroring
// what truncate_inode does on the live path.
6666 void MDCache::add_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6668 dout(20) << "add_recovered_truncate " << *in
<< " in log segment "
6669 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6670 ls
->truncating_inodes
.insert(in
);
6671 in
->get(CInode::PIN_TRUNCATING
);
// Inverse of add_recovered_truncate: a replayed truncate-finish event
// removes the inode from the segment's truncating set and drops the pin.
6674 void MDCache::remove_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6676 dout(20) << "remove_recovered_truncate " << *in
<< " in log segment "
6677 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6678 // if we have the logseg the truncate started in, it must be in our list.
6679 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6680 ceph_assert(p
!= ls
->truncating_inodes
.end());
6681 ls
->truncating_inodes
.erase(p
);
6682 in
->put(CInode::PIN_TRUNCATING
);
// After replay/recovery, restart every truncate recorded in any log
// segment's truncating_inodes set. If clients still owe snap flushes for
// an inode (and hold FILE_BUFFER caps), park the filelock in
// LOCK_XLOCKDONE and defer via C_MDC_RetryTruncate; otherwise go straight
// to _truncate_inode.
6685 void MDCache::start_recovered_truncates()
6687 dout(10) << "start_recovered_truncates" << dendl
;
6688 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
6689 p
!= mds
->mdlog
->segments
.end();
6691 LogSegment
*ls
= p
->second
;
6692 for (set
<CInode
*>::iterator q
= ls
->truncating_inodes
.begin();
6693 q
!= ls
->truncating_inodes
.end();
// Must wait for client snap data before truncating objects (same
// reasoning as in truncate_inode).
6698 if (!in
->client_need_snapflush
.empty() &&
6699 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6700 ceph_assert(in
->filelock
.is_stable());
6701 in
->filelock
.set_state(LOCK_XLOCKDONE
);
6702 in
->auth_pin(&in
->filelock
);
6703 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6704 // start_files_to_recover will revoke caps
6707 _truncate_inode(in
, ls
);
// Log completion for an EPurged event: applies the projected inode-number
// release to the InoTable (verifying the table version matches what was
// projected) and tells the log segment the purge of these inos finished.
6713 class C_MDS_purge_completed_finish
: public MDCacheLogContext
{
6714 interval_set
<inodeno_t
> inos
;
6716 version_t inotablev
;
6718 C_MDS_purge_completed_finish(MDCache
*m
, const interval_set
<inodeno_t
>& _inos
,
6719 LogSegment
*_ls
, version_t iv
)
6720 : MDCacheLogContext(m
), inos(_inos
), ls(_ls
), inotablev(iv
) {}
6721 void finish(int r
) override
{
6722 ceph_assert(r
== 0);
6724 get_mds()->inotable
->apply_release_ids(inos
);
// The applied version must match the version projected when the EPurged
// entry was submitted.
6725 ceph_assert(get_mds()->inotable
->get_version() == inotablev
);
6727 ls
->purge_inodes_finish(inos
);
// After replay, resume purging any inode ranges recorded in each log
// segment's purging_inodes set.
6731 void MDCache::start_purge_inodes(){
6732 dout(10) << "start_purge_inodes" << dendl
;
6733 for (auto& p
: mds
->mdlog
->segments
){
6734 LogSegment
*ls
= p
.second
;
6735 if (ls
->purging_inodes
.size()){
6736 purge_inodes(ls
->purging_inodes
, ls
);
// Purge the objects of a set of inode-number ranges, then (via the gather
// callback) project the ino releases, journal an EPurged entry, and apply
// the release in C_MDS_purge_completed_finish once it is durable.
// NOTE(review): per the FIXME below, only the default data pool/namespace
// is handled here.
6741 void MDCache::purge_inodes(const interval_set
<inodeno_t
>& inos
, LogSegment
*ls
)
6743 dout(10) << __func__
<< " purging inos " << inos
<< " logseg " << ls
->seq
<< dendl
;
6744 // FIXME: handle non-default data pool and namespace
// Runs once every purge_range below has completed (r == 0, or -2/ENOENT
// when objects were already absent).
6746 auto cb
= new LambdaContext([this, inos
, ls
](int r
){
6747 ceph_assert(r
== 0 || r
== -2);
6748 mds
->inotable
->project_release_ids(inos
);
6749 version_t piv
= mds
->inotable
->get_projected_version();
6750 ceph_assert(piv
!= 0);
6751 mds
->mdlog
->start_submit_entry(new EPurged(inos
, ls
->seq
, piv
),
6752 new C_MDS_purge_completed_finish(this, inos
, ls
, piv
));
6753 mds
->mdlog
->flush();
// Fan out one purge_range per inode number; the gather fires cb when all
// sub-ops complete, bounced through the finisher for MDS-lock safety.
6756 C_GatherBuilder
gather(g_ceph_context
,
6757 new C_OnFinisher(new MDSIOContextWrapper(mds
, cb
), mds
->finisher
));
6758 SnapContext nullsnapc
;
6759 for (const auto& [start
, len
] : inos
) {
6760 for (auto i
= start
; i
< start
+ len
; i
+= 1) {
6761 filer
.purge_range(i
, &default_file_layout
, nullsnapc
, 0, 1,
6762 ceph::real_clock::now(), 0, gather
.new_sub());
6768 // ================================================================================
// Trim up to `count` dentries from the two LRUs (bottom_lru first, then
// the main lru), collecting expire messages into `expiremap`.
// Trimming is throttled by mds_cache_trim_threshold per trim-counter
// window; returns {throttled, number_trimmed}. Unexpirable dentries are
// re-inserted mid-LRU. In standby-replay, dentries with inodes are pushed
// to the LRU bottom instead of trimmed (see standby_trim_segment).
std::pair
<bool, uint64_t> MDCache::trim_lru(uint64_t count
, expiremap
& expiremap
)
6773 bool is_standby_replay
= mds
->is_standby_replay();
6774 std::vector
<CDentry
*> unexpirables
;
6775 uint64_t trimmed
= 0;
6777 auto trim_threshold
= g_conf().get_val
<Option::size_t>("mds_cache_trim_threshold");
6779 dout(7) << "trim_lru trimming " << count
6780 << " items from LRU"
6781 << " size=" << lru
.lru_get_size()
6782 << " mid=" << lru
.lru_get_top()
6783 << " pintail=" << lru
.lru_get_pintail()
6784 << " pinned=" << lru
.lru_get_num_pinned()
// Throttle baseline: work already counted in this decay window.
6787 const uint64_t trim_counter_start
= trim_counter
.get();
6788 bool throttled
= false;
// First drain bottom_lru (strays/low-value entries).
// NOTE(review): the loop header for this first drain is not visible in
// this view of the file.
6790 throttled
|= trim_counter_start
+trimmed
>= trim_threshold
;
6791 if (throttled
) break;
6792 CDentry
*dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
6795 if (trim_dentry(dn
, expiremap
)) {
6796 unexpirables
.push_back(dn
);
// Put back anything that could not be expired.
6802 for (auto &dn
: unexpirables
) {
6803 bottom_lru
.lru_insert_mid(dn
);
6805 unexpirables
.clear();
6807 // trim dentries from the LRU until count is reached
6808 // if mds is in standby_replay and skip trimming the inodes
6809 while (!throttled
&& (cache_toofull() || count
> 0 || is_standby_replay
)) {
6810 throttled
|= trim_counter_start
+trimmed
>= trim_threshold
;
6811 if (throttled
) break;
6812 CDentry
*dn
= static_cast<CDentry
*>(lru
.lru_expire());
6816 if (is_standby_replay
&& dn
->get_linkage()->inode
) {
6817 // we move the inodes that need to be trimmed to the end of the lru queue.
6818 // refer to MDCache::standby_trim_segment
6819 lru
.lru_insert_bot(dn
);
6821 } else if (trim_dentry(dn
, expiremap
)) {
6822 unexpirables
.push_back(dn
);
6825 if (count
> 0) count
--;
// Record work done toward the decaying trim throttle.
6828 trim_counter
.hit(trimmed
);
6830 for (auto &dn
: unexpirables
) {
6831 lru
.lru_insert_mid(dn
);
6833 unexpirables
.clear();
6835 dout(7) << "trim_lru trimmed " << trimmed
<< " items" << dendl
;
6836 return std::pair
<bool, uint64_t>(throttled
, trimmed
);
// Top-level cache trim entry point. Runs trim_lru, then trims non-auth /
// empty subtrees, and when stopping also tries to expire other ranks'
// mdsdirs and base inodes. All generated expirations are batched into an
// expiremap and sent at the end via send_expire_messages.
6840 * note: only called while MDS is active or stopping... NOT during recovery.
6841 * however, we may expire a replica whose authority is recovering.
6843 * @param count is number of dentries to try to expire
6845 std::pair
<bool, uint64_t> MDCache::trim(uint64_t count
)
6847 uint64_t used
= cache_size();
6848 uint64_t limit
= cache_memory_limit
;
6849 expiremap expiremap
;
6851 dout(7) << "trim bytes_used=" << bytes2str(used
)
6852 << " limit=" << bytes2str(limit
)
6853 << " reservation=" << cache_reservation
6854 << "% count=" << count
<< dendl
;
6856 // process delayed eval_stray()
6857 stray_manager
.advance_delayed();
6859 auto result
= trim_lru(count
, expiremap
);
6860 auto& trimmed
= result
.second
;
6862 // trim non-auth, non-bound subtrees
6863 for (auto p
= subtrees
.begin(); p
!= subtrees
.end();) {
6864 CDir
*dir
= p
->first
;
6866 CInode
*diri
= dir
->get_inode();
6867 if (dir
->is_auth()) {
6868 if (diri
->is_auth() && !diri
->is_base()) {
6869 /* this situation should correspond to an export pin */
6870 if (dir
->get_num_head_items() == 0 && dir
->get_num_ref() == 1) {
6871 /* pinned empty subtree, try to drop */
6872 if (dir
->state_test(CDir::STATE_AUXSUBTREE
)) {
6873 dout(20) << "trimming empty pinned subtree " << *dir
<< dendl
;
6874 dir
->state_clear(CDir::STATE_AUXSUBTREE
);
6875 remove_subtree(dir
);
6876 diri
->close_dirfrag(dir
->dirfrag().frag
);
// Auth dirfrag under a non-auth inode: if empty and quiescent, hand it
// back to the importing rank instead of trimming.
6879 } else if (!diri
->is_auth() && !diri
->is_base() && dir
->get_num_head_items() == 0) {
6880 if (dir
->state_test(CDir::STATE_EXPORTING
) ||
6881 !(mds
->is_active() || mds
->is_stopping()) ||
6882 dir
->is_freezing() || dir
->is_frozen())
6885 migrator
->export_empty_import(dir
);
// Fully non-auth subtree root with no extra refs: trim the dirfrag.
6888 } else if (!diri
->is_auth() && dir
->get_num_ref() <= 1) {
6890 if (diri
->get_num_ref() > diri
->get_num_subtree_roots()) {
6894 // don't trim subtree root if its auth MDS is recovering.
6895 // This simplify the cache rejoin code.
6896 if (dir
->is_subtree_root() && rejoin_ack_gather
.count(dir
->get_dir_auth().first
))
6898 trim_dirfrag(dir
, 0, expiremap
);
// When stopping, also release root's dirfrags and (if unreferenced) root.
6904 if (mds
->is_stopping() && root
) {
6905 auto&& ls
= root
->get_dirfrags();
6906 for (const auto& dir
: ls
) {
6907 if (dir
->get_num_ref() == 1) { // subtree pin
6908 trim_dirfrag(dir
, 0, expiremap
);
6912 if (root
->get_num_ref() == 0) {
6913 trim_inode(0, root
, 0, expiremap
);
// Try to expire the mdsdirs of other ranks that are stopping so they can
// shut down cleanly.
6918 std::set
<mds_rank_t
> stopping
;
6919 mds
->mdsmap
->get_mds_set(stopping
, MDSMap::STATE_STOPPING
);
6920 stopping
.erase(mds
->get_nodeid());
6921 for (auto rank
: stopping
) {
6922 CInode
* mdsdir_in
= get_inode(MDS_INO_MDSDIR(rank
));
6926 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(rank
), std::forward_as_tuple());
6928 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid());
6931 dout(20) << __func__
<< ": try expiring " << *mdsdir_in
<< " for stopping mds." << mds
->get_nodeid() << dendl
;
6933 const bool aborted
= expire_recursive(mdsdir_in
, expiremap
);
6935 dout(20) << __func__
<< ": successfully expired mdsdir" << dendl
;
6936 auto&& ls
= mdsdir_in
->get_dirfrags();
6937 for (auto dir
: ls
) {
6938 if (dir
->get_num_ref() == 1) { // subtree pin
6939 trim_dirfrag(dir
, dir
, expiremap
);
6943 if (mdsdir_in
->get_num_ref() == 0) {
6944 trim_inode(NULL
, mdsdir_in
, NULL
, expiremap
);
6948 dout(20) << __func__
<< ": some unexpirable contents in mdsdir" << dendl
;
6952 // Other rank's base inodes (when I'm stopping)
6953 if (mds
->is_stopping()) {
6954 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
6955 p
!= base_inodes
.end();) {
6956 CInode
*base_in
= *p
;
6958 if (MDS_INO_IS_MDSDIR(base_in
->ino()) &&
6959 MDS_INO_MDSDIR_OWNER(base_in
->ino()) != mds
->get_nodeid()) {
6960 dout(20) << __func__
<< ": maybe trimming base: " << *base_in
<< dendl
;
6961 if (base_in
->get_num_ref() == 0) {
6962 trim_inode(NULL
, base_in
, NULL
, expiremap
);
6969 // send any expire messages
6970 send_expire_messages(expiremap
);
// Send one MCacheExpire per peer rank from the accumulated expiremap,
// skipping peers that have not yet reached REJOIN (or to whom we have not
// sent our rejoin) while the cluster is degraded.
6975 void MDCache::send_expire_messages(expiremap
& expiremap
)
6978 for (const auto &p
: expiremap
) {
6979 if (mds
->is_cluster_degraded() &&
6980 (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
6981 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
6982 rejoin_sent
.count(p
.first
) == 0))) {
6985 dout(7) << "sending cache_expire to " << p
.first
<< dendl
;
6986 mds
->send_message_mds(p
.second
, p
.first
);
// Expire a single dentry from the cache. Unlinks remote/primary linkage
// (recursing into trim_inode for primary), queues expire notifications to
// the dentry's authority when we hold a replica, maintains the dirfrag's
// COMPLETE flag and bloom filter, and finally removes the dentry.
// Returns true when the dentry could NOT be removed (e.g. the primary
// inode turned into a purging stray) so the caller re-inserts it.
6992 bool MDCache::trim_dentry(CDentry
*dn
, expiremap
& expiremap
)
6994 dout(12) << "trim_dentry " << *dn
<< dendl
;
6996 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6998 CDir
*dir
= dn
->get_dir();
7001 CDir
*con
= get_subtree_root(dir
);
7003 dout(12) << " in container " << *con
<< dendl
;
7005 dout(12) << " no container; under a not-yet-linked dir" << dendl
;
7006 ceph_assert(dn
->is_auth());
7009 // If replica dentry is not readable, it's likely we will receive
7010 // MDentryLink/MDentryUnlink message soon (It's possible we first
7011 // receive a MDentryUnlink message, then MDentryLink message)
7012 // MDentryLink message only replicates an inode, so we should
7013 // avoid trimming the inode's parent dentry. This is because that
7014 // unconnected replicas are problematic for subtree migration.
7015 if (!dn
->is_auth() && !dn
->lock
.can_read(-1) &&
7016 !dn
->get_dir()->get_inode()->is_stray())
7019 // adjust the dir state
7020 // NOTE: we can safely remove a clean, null dentry without effecting
7021 // directory completeness.
7022 // (check this _before_ we unlink the inode, below!)
7023 bool clear_complete
= false;
7024 if (dn
->is_auth() && !(dnl
->is_null() && dn
->is_clean()))
7025 clear_complete
= true;
7027 // unlink the dentry
7028 if (dnl
->is_remote()) {
7030 dir
->unlink_inode(dn
, false);
7031 } else if (dnl
->is_primary()) {
7032 // expire the inode, too.
7033 CInode
*in
= dnl
->get_inode();
7035 if (trim_inode(dn
, in
, con
, expiremap
))
7036 return true; // purging stray instead of trimming
7038 ceph_assert(dnl
->is_null());
// Replica dentry: tell both (possibly two, during migration) authority
// ranks that we dropped it, unless we are mid-import of the container.
7041 if (!dn
->is_auth()) {
7042 // notify dentry authority.
7043 mds_authority_t auth
= dn
->authority();
7045 for (int p
=0; p
<2; p
++) {
7046 mds_rank_t a
= auth
.first
;
7047 if (p
) a
= auth
.second
;
7048 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
7049 if (mds
->get_nodeid() == auth
.second
&&
7050 con
->is_importing()) break; // don't send any expire while importing.
7051 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
7053 dout(12) << " sending expire to mds." << a
<< " on " << *dn
<< dendl
;
7054 ceph_assert(a
!= mds
->get_nodeid());
7055 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
7057 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid());
7058 em
.first
->second
->add_dentry(con
->dirfrag(), dir
->dirfrag(), dn
->get_name(), dn
->last
, dn
->get_replica_nonce());
// Removing a (non-null or dirty) head dentry breaks completeness; feed
// the name to the bloom filter so lookups can still avoid RADOS reads.
7062 if (clear_complete
) {
7063 if (dn
->last
== CEPH_NOSNAP
)
7064 dir
->add_to_bloom(dn
);
7065 dir
->state_clear(CDir::STATE_COMPLETE
);
7069 dir
->remove_dentry(dn
);
7071 if (mds
->logger
) mds
->logger
->inc(l_mds_inodes_expired
);
// Expire a dirfrag: detach it from the subtree map when it is a subtree
// root, notify its authority rank(s) if we only hold a replica (using the
// subtree root itself as expire container when applicable), and close the
// frag on its inode.
7076 void MDCache::trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
)
7078 dout(15) << "trim_dirfrag " << *dir
<< dendl
;
7080 if (dir
->is_subtree_root()) {
// Only an unreplicated base-inode dirfrag may be trimmed while auth.
7081 ceph_assert(!dir
->is_auth() ||
7082 (!dir
->is_replicated() && dir
->inode
->is_base()));
7083 remove_subtree(dir
); // remove from subtree map
7085 ceph_assert(dir
->get_num_ref() == 0);
7087 CInode
*in
= dir
->get_inode();
7089 if (!dir
->is_auth()) {
7090 mds_authority_t auth
= dir
->authority();
7092 // was this an auth delegation? (if so, slightly modified container)
7094 if (dir
->is_subtree_root()) {
7095 dout(12) << " subtree root, container is " << *dir
<< dendl
;
7097 condf
= dir
->dirfrag();
7099 condf
= con
->dirfrag();
// Notify both authority ranks (two during migration), unless importing.
7102 for (int p
=0; p
<2; p
++) {
7103 mds_rank_t a
= auth
.first
;
7104 if (p
) a
= auth
.second
;
7105 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
7106 if (mds
->get_nodeid() == auth
.second
&&
7107 con
->is_importing()) break; // don't send any expire while importing.
7108 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
7110 dout(12) << " sending expire to mds." << a
<< " on " << *dir
<< dendl
;
7111 ceph_assert(a
!= mds
->get_nodeid());
7112 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
7114 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid()); /* new */
7115 em
.first
->second
->add_dir(condf
, dir
->dirfrag(), dir
->replica_nonce
);
7119 in
->close_dirfrag(dir
->dirfrag().frag
);
7123 * Try trimming an inode from the cache
7125 * @return true if the inode is still in cache, else false if it was trimmed
// Trims all of the inode's dirfrags first, then for auth inodes gives the
// stray-evaluation machinery a chance to purge instead (in which case the
// inode stays); for replicas, queues expire notifications to the
// authority rank(s). Finally unlinks the inode from its parent dentry.
7127 bool MDCache::trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
& expiremap
)
7129 dout(15) << "trim_inode " << *in
<< dendl
;
7130 ceph_assert(in
->get_num_ref() == 0);
7133 // If replica inode's dirfragtreelock is not readable, it's likely
7134 // some dirfrags of the inode are being fragmented and we will receive
7135 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
7136 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
7137 // This is because that unconnected replicas are problematic for
7138 // subtree migration.
7140 if (!in
->is_auth() && !mds
->locker
->rdlock_try(&in
->dirfragtreelock
, -1)) {
// Trim all child dirfrags first; none may be a subtree root here.
7145 auto&& dfls
= in
->get_dirfrags();
7146 for (const auto& dir
: dfls
) {
7147 ceph_assert(!dir
->is_subtree_root());
7148 trim_dirfrag(dir
, con
? con
:dir
, expiremap
); // if no container (e.g. root dirfrag), use *p
7153 if (in
->is_auth()) {
7154 // eval stray after closing dirfrags
7155 if (dn
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
7156 maybe_eval_stray(in
);
// Stray evaluation may have started purging or re-pinned the dentry; in
// that case the inode must stay in cache (caller gets `true`).
7157 if (dn
->state_test(CDentry::STATE_PURGING
) || dn
->get_num_ref() > 0)
// Replica inode: notify authority rank(s), as in trim_dentry/trim_dirfrag.
7161 mds_authority_t auth
= in
->authority();
7165 df
= con
->dirfrag();
7167 df
= dirfrag_t(0,frag_t()); // must be a root or stray inode.
7169 for (int p
=0; p
<2; p
++) {
7170 mds_rank_t a
= auth
.first
;
7171 if (p
) a
= auth
.second
;
7172 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
7173 if (con
&& mds
->get_nodeid() == auth
.second
&&
7174 con
->is_importing()) break; // don't send any expire while importing.
7175 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
7177 dout(12) << " sending expire to mds." << a
<< " on " << *in
<< dendl
;
7178 ceph_assert(a
!= mds
->get_nodeid());
7179 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
7181 em
.first
->second
= make_message
<MCacheExpire
>(mds
->get_nodeid()); /* new */
7182 em
.first
->second
->add_inode(df
, in
->vino(), in
->get_replica_nonce());
// NOTE(review): the block below ("outt"/"outut"/"oututl", hack_accessed,
// hack_load_stamp) looks like legacy instrumentation; presumably disabled
// or removed upstream — confirm against the full file before relying on it.
7187 if (in->is_auth()) {
7188 if (in->hack_accessed)
7189 mds->logger->inc("outt");
7191 mds->logger->inc("outut");
7192 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
// Detach the inode from its parent dentry (it will be deleted with it).
7199 dn
->get_dir()->unlink_inode(dn
, false);
7206 * trim_non_auth - remove any non-auth items from our cache
7208 * this reduces the amount of non-auth metadata in our cache, reducing the
7209 * load incurred by the rejoin phase.
7211 * the only non-auth items that remain are those that are needed to
7212 * attach our own subtrees to the root.
7214 * when we are done, all dentries will be in the top bit of the lru.
7216 * why we have to do this:
7217 * we may not have accurate linkage for non-auth items. which means we will
7218 * know which subtree it falls into, and can not be sure to declare it to the
7219 * correct authority.
7221 void MDCache::trim_non_auth()
7223 dout(7) << "trim_non_auth" << dendl
;
7225 // temporarily pin all subtree roots
// Pinning keeps subtree roots from being expired out from under the scan.
7226 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
7227 p
!= subtrees
.end();
7229 p
->first
->get(CDir::PIN_SUBTREETEMP
);
7231 list
<CDentry
*> auth_list
;
7233 // trim non-auth items from the lru
// Drain bottom_lru first, then the main lru; auth dentries are collected
// in auth_list and re-inserted afterwards.
7236 if (bottom_lru
.lru_get_size() > 0)
7237 dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
7238 if (!dn
&& lru
.lru_get_size() > 0)
7239 dn
= static_cast<CDentry
*>(lru
.lru_expire());
7243 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7245 if (dn
->is_auth()) {
7246 // add back into lru (at the top)
7247 auth_list
.push_back(dn
);
// Auth dentry pointing at a non-auth inode via remote link: break the
// link so the inode can be dropped.
7249 if (dnl
->is_remote() && dnl
->get_inode() && !dnl
->get_inode()->is_auth())
7250 dn
->unlink_remote(dnl
);
7252 // non-auth. expire.
7253 CDir
*dir
= dn
->get_dir();
7256 // unlink the dentry
7257 dout(10) << " removing " << *dn
<< dendl
;
7258 if (dnl
->is_remote()) {
7259 dir
->unlink_inode(dn
, false);
7261 else if (dnl
->is_primary()) {
7262 CInode
*in
= dnl
->get_inode();
7263 dout(10) << " removing " << *in
<< dendl
;
7264 auto&& ls
= in
->get_dirfrags();
7265 for (const auto& subdir
: ls
) {
7266 ceph_assert(!subdir
->is_subtree_root());
7267 in
->close_dirfrag(subdir
->dirfrag().frag
);
7269 dir
->unlink_inode(dn
, false);
7273 ceph_assert(dnl
->is_null());
7276 ceph_assert(!dir
->has_bloom());
7277 dir
->remove_dentry(dn
);
7278 // adjust the dir state
7279 dir
->state_clear(CDir::STATE_COMPLETE
); // dir incomplete!
7280 // close empty non-auth dirfrag
7281 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0)
7282 dir
->inode
->close_dirfrag(dir
->get_frag());
// Restore the surviving auth dentries to their proper LRU.
7286 for (const auto& dn
: auth_list
) {
7287 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
7288 bottom_lru
.lru_insert_mid(dn
);
7290 lru
.lru_insert_top(dn
);
7293 // move everything in the pintail to the top bit of the lru.
7294 lru
.lru_touch_entire_pintail();
7296 // unpin all subtrees
7297 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
7298 p
!= subtrees
.end();
7300 p
->first
->put(CDir::PIN_SUBTREETEMP
);
// If the LRUs are now empty, sweep any remaining unlinked non-auth base
// inodes (root, strays of other ranks, etc.) out of the inode map.
7302 if (lru
.lru_get_size() == 0 &&
7303 bottom_lru
.lru_get_size() == 0) {
7304 // root, stray, etc.?
7305 auto p
= inode_map
.begin();
7306 while (p
!= inode_map
.end()) {
7307 CInode
*in
= p
->second
;
7309 if (!in
->is_auth()) {
7310 auto&& ls
= in
->get_dirfrags();
7311 for (const auto& dir
: ls
) {
7312 dout(10) << " removing " << *dir
<< dendl
;
7313 ceph_assert(dir
->get_num_ref() == 1); // SUBTREE
7314 remove_subtree(dir
);
7315 in
->close_dirfrag(dir
->dirfrag().frag
);
7317 dout(10) << " removing " << *in
<< dendl
;
7318 ceph_assert(!in
->get_parent_dn());
7319 ceph_assert(in
->get_num_ref() == 0);
7329 * Recursively trim the subtree rooted at directory to remove all
7330 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7331 * of those links. This is used to clear invalid data out of the cache.
7332 * Note that it doesn't clear the passed-in directory, since that's not
// Returns whether this dirfrag must be kept (it is uncancellable for
// peer-update rollback, or still has children after trimming).
7335 bool MDCache::trim_non_auth_subtree(CDir
*dir
)
7337 dout(10) << "trim_non_auth_subtree(" << dir
<< ") " << *dir
<< dendl
;
// keep_dir: dirfrag may be needed for an uncommitted peer (slave) update.
7339 bool keep_dir
= !can_trim_non_auth_dirfrag(dir
);
7341 auto j
= dir
->begin();
7343 while (j
!= dir
->end()) {
7345 CDentry
*dn
= i
->second
;
7346 dout(10) << "trim_non_auth_subtree(" << dir
<< ") Checking dentry " << dn
<< dendl
;
7347 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7348 if (dnl
->is_primary()) { // check for subdirectories, etc
7349 CInode
*in
= dnl
->get_inode();
7350 bool keep_inode
= false;
// Recurse into child dirfrags; a nested subtree root is always kept,
// otherwise keep only if the recursive call says so.
7352 auto&& subdirs
= in
->get_dirfrags();
7353 for (const auto& subdir
: subdirs
) {
7354 if (subdir
->is_subtree_root()) {
7356 dout(10) << "trim_non_auth_subtree(" << dir
<< ") keeping " << *subdir
<< dendl
;
7358 if (trim_non_auth_subtree(subdir
))
7361 in
->close_dirfrag(subdir
->get_frag());
7362 dir
->state_clear(CDir::STATE_COMPLETE
); // now incomplete!
7368 if (!keep_inode
) { // remove it!
7369 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing inode " << in
<< " with dentry" << dn
<< dendl
;
7370 dir
->unlink_inode(dn
, false);
7372 ceph_assert(!dir
->has_bloom());
7373 dir
->remove_dentry(dn
);
7375 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping inode " << in
<< " with dentry " << dn
<<dendl
;
// Kept inode is no longer authoritative here.
7377 in
->state_clear(CInode::STATE_AUTH
);
7379 } else if (keep_dir
&& dnl
->is_null()) { // keep null dentry for peer rollback
7380 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping dentry " << dn
<<dendl
;
7381 } else { // just remove it
7382 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing dentry " << dn
<< dendl
;
7383 if (dnl
->is_remote())
7384 dir
->unlink_inode(dn
, false);
7385 dir
->remove_dentry(dn
);
// This dirfrag is no longer auth either.
7388 dir
->state_clear(CDir::STATE_AUTH
);
7390 * We've now checked all our children and deleted those that need it.
7391 * Now return to caller, and tell them if *we're* a keeper.
7393 return keep_dir
|| dir
->get_num_any();
7397 * during replay, when we determine a subtree is no longer ours, we
7398 * try to trim it from our cache. because subtrees must be connected
7399 * to the root, the fact that we can trim this tree may mean that our
7400 * children or parents can also be trimmed.
7402 void MDCache::try_trim_non_auth_subtree(CDir
*dir
)
7404 dout(10) << "try_trim_nonauth_subtree " << *dir
<< dendl
;
7406 // can we now trim child subtrees?
// First close any empty, trimmable non-auth bound (child) subtrees.
7408 get_subtree_bounds(dir
, bounds
);
7409 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
7411 if (bd
->get_dir_auth().first
!= mds
->get_nodeid() && // we are not auth
7412 bd
->get_num_any() == 0 && // and empty
7413 can_trim_non_auth_dirfrag(bd
)) {
7414 CInode
*bi
= bd
->get_inode();
7415 dout(10) << " closing empty non-auth child subtree " << *bd
<< dendl
;
7418 bi
->close_dirfrag(bd
->get_frag());
// If the subtree itself trimmed clean, walk up: close this subtree and
// possibly its ancestors, merging subtrees where appropriate.
7422 if (trim_non_auth_subtree(dir
)) {
7424 try_subtree_merge(dir
);
7426 // can we trim this subtree (and possibly our ancestors) too?
7428 CInode
*diri
= dir
->get_inode();
7429 if (diri
->is_base()) {
7430 if (!diri
->is_root() && diri
->authority().first
!= mds
->get_nodeid()) {
7431 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7432 remove_subtree(dir
);
7434 diri
->close_dirfrag(dir
->get_frag());
7436 dout(10) << " removing " << *diri
<< dendl
;
7437 ceph_assert(!diri
->get_parent_dn());
7438 ceph_assert(diri
->get_num_ref() == 0);
// Non-base inode: stop at the first auth parent subtree, else keep
// climbing and trimming.
7444 CDir
*psub
= get_subtree_root(diri
->get_parent_dir());
7445 dout(10) << " parent subtree is " << *psub
<< dendl
;
7446 if (psub
->get_dir_auth().first
== mds
->get_nodeid())
7447 break; // we are auth, keep.
7449 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7450 remove_subtree(dir
);
7452 diri
->close_dirfrag(dir
->get_frag());
7454 dout(10) << " parent subtree also non-auth: " << *psub
<< dendl
;
7455 if (trim_non_auth_subtree(psub
))
// Standby-replay: when a log segment is trimmed, clear out all of its
// dirty-item lists and push now-unreferenced inodes/dentries to the
// bottom of the LRU so trim_lru can reclaim them.
7464 void MDCache::standby_trim_segment(LogSegment
*ls
)
// Demote an inode's parent dentry to the LRU bottom when nothing
// references the inode (and it is not an open file).
7466 auto try_trim_inode
= [this](CInode
*in
) {
7467 if (in
->get_num_ref() == 0 &&
7468 !in
->item_open_file
.is_on_list() &&
7469 in
->parent
!= NULL
&&
7470 in
->parent
->get_num_ref() == 0){
7471 touch_dentry_bottom(in
->parent
);
// Same for a dentry, skipping referenced dentries and open-file inodes.
7475 auto try_trim_dentry
= [this](CDentry
*dn
) {
7476 if (dn
->get_num_ref() > 0)
7478 auto in
= dn
->get_linkage()->inode
;
7479 if(in
&& in
->item_open_file
.is_on_list())
7481 touch_dentry_bottom(dn
);
7484 ls
->new_dirfrags
.clear_list();
7485 ls
->open_files
.clear_list();
// Drain each dirty list; front items are removed as they are processed.
7487 while (!ls
->dirty_dirfrags
.empty()) {
7488 CDir
*dir
= ls
->dirty_dirfrags
.front();
7491 try_trim_inode(dir
->inode
);
7493 while (!ls
->dirty_inodes
.empty()) {
7494 CInode
*in
= ls
->dirty_inodes
.front();
7498 while (!ls
->dirty_dentries
.empty()) {
7499 CDentry
*dn
= ls
->dirty_dentries
.front();
7501 try_trim_dentry(dn
);
7503 while (!ls
->dirty_parent_inodes
.empty()) {
7504 CInode
*in
= ls
->dirty_parent_inodes
.front();
7505 in
->clear_dirty_parent();
7508 while (!ls
->dirty_dirfrag_dir
.empty()) {
7509 CInode
*in
= ls
->dirty_dirfrag_dir
.front();
7510 in
->filelock
.remove_dirty();
7513 while (!ls
->dirty_dirfrag_nest
.empty()) {
7514 CInode
*in
= ls
->dirty_dirfrag_nest
.front();
7515 in
->nestlock
.remove_dirty();
7518 while (!ls
->dirty_dirfrag_dirfragtree
.empty()) {
7519 CInode
*in
= ls
->dirty_dirfrag_dirfragtree
.front();
7520 in
->dirfragtreelock
.remove_dirty();
// Release the PIN_TRUNCATING refs taken for this segment's truncates.
7523 while (!ls
->truncating_inodes
.empty()) {
7524 auto it
= ls
->truncating_inodes
.begin();
7526 ls
->truncating_inodes
.erase(it
);
7527 in
->put(CInode::PIN_TRUNCATING
);
7532 void MDCache::handle_cache_expire(const cref_t
<MCacheExpire
> &m
)
7534 mds_rank_t from
= mds_rank_t(m
->get_from());
7536 dout(7) << "cache_expire from mds." << from
<< dendl
;
7538 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
7542 set
<SimpleLock
*> gather_locks
;
7544 for (const auto &p
: m
->realms
) {
7546 if (p
.first
.ino
> 0) {
7547 CInode
*expired_inode
= get_inode(p
.first
.ino
);
7548 ceph_assert(expired_inode
); // we had better have this.
7549 CDir
*parent_dir
= expired_inode
->get_approx_dirfrag(p
.first
.frag
);
7550 ceph_assert(parent_dir
);
7552 int export_state
= -1;
7553 if (parent_dir
->is_auth() && parent_dir
->is_exporting()) {
7554 export_state
= migrator
->get_export_state(parent_dir
);
7555 ceph_assert(export_state
>= 0);
7558 if (!parent_dir
->is_auth() ||
7559 (export_state
!= -1 &&
7560 ((export_state
== Migrator::EXPORT_WARNING
&&
7561 migrator
->export_has_warned(parent_dir
,from
)) ||
7562 export_state
== Migrator::EXPORT_EXPORTING
||
7563 export_state
== Migrator::EXPORT_LOGGINGFINISH
||
7564 (export_state
== Migrator::EXPORT_NOTIFYING
&&
7565 !migrator
->export_has_notified(parent_dir
,from
))))) {
7568 dout(7) << "delaying nonauth|warned expires for " << *parent_dir
<< dendl
;
7569 ceph_assert(parent_dir
->is_frozen_tree_root());
7571 // make a message container
7573 auto em
= delayed_expire
[parent_dir
].emplace(std::piecewise_construct
, std::forward_as_tuple(from
), std::forward_as_tuple());
7575 em
.first
->second
= make_message
<MCacheExpire
>(from
); /* new */
7577 // merge these expires into it
7578 em
.first
->second
->add_realm(p
.first
, p
.second
);
7581 ceph_assert(export_state
<= Migrator::EXPORT_PREPPING
||
7582 (export_state
== Migrator::EXPORT_WARNING
&&
7583 !migrator
->export_has_warned(parent_dir
, from
)));
7585 dout(7) << "expires for " << *parent_dir
<< dendl
;
7587 dout(7) << "containerless expires (root, stray inodes)" << dendl
;
7591 for (const auto &q
: p
.second
.inodes
) {
7592 CInode
*in
= get_inode(q
.first
);
7593 unsigned nonce
= q
.second
;
7596 dout(0) << " inode expire on " << q
.first
<< " from " << from
7597 << ", don't have it" << dendl
;
7600 ceph_assert(in
->is_auth());
7601 dout(20) << __func__
<< ": expiring inode " << *in
<< dendl
;
7604 if (nonce
== in
->get_replica_nonce(from
)) {
7605 // remove from our cached_by
7606 dout(7) << " inode expire on " << *in
<< " from mds." << from
7607 << " cached_by was " << in
->get_replicas() << dendl
;
7608 inode_remove_replica(in
, from
, false, gather_locks
);
7611 // this is an old nonce, ignore expire.
7612 dout(7) << " inode expire on " << *in
<< " from mds." << from
7613 << " with old nonce " << nonce
7614 << " (current " << in
->get_replica_nonce(from
) << "), dropping"
7620 for (const auto &q
: p
.second
.dirs
) {
7621 CDir
*dir
= get_dirfrag(q
.first
);
7622 unsigned nonce
= q
.second
;
7625 CInode
*diri
= get_inode(q
.first
.ino
);
7627 if (mds
->is_rejoin() &&
7628 rejoin_ack_gather
.count(mds
->get_nodeid()) && // haven't sent rejoin ack yet
7629 !diri
->is_replica(from
)) {
7630 auto&& ls
= diri
->get_nested_dirfrags();
7631 dout(7) << " dir expire on dirfrag " << q
.first
<< " from mds." << from
7632 << " while rejoining, inode isn't replicated" << dendl
;
7633 for (const auto& d
: ls
) {
7635 if (dir
->is_replica(from
)) {
7636 dout(7) << " dir expire on " << *dir
<< " from mds." << from
<< dendl
;
7637 dir
->remove_replica(from
);
7642 CDir
*other
= diri
->get_approx_dirfrag(q
.first
.frag
);
7644 dout(7) << " dir expire on dirfrag " << q
.first
<< " from mds." << from
7645 << " have " << *other
<< ", mismatched frags, dropping" << dendl
;
7649 dout(0) << " dir expire on " << q
.first
<< " from " << from
7650 << ", don't have it" << dendl
;
7653 dout(20) << __func__
<< ": expiring dirfrag " << *dir
<< dendl
;
7655 ceph_assert(dir
->is_auth());
7658 if (nonce
== dir
->get_replica_nonce(from
)) {
7659 // remove from our cached_by
7660 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7661 << " replicas was " << dir
->get_replicas() << dendl
;
7662 dir
->remove_replica(from
);
7665 // this is an old nonce, ignore expire.
7666 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7667 << " with old nonce " << nonce
<< " (current " << dir
->get_replica_nonce(from
)
7668 << "), dropping" << dendl
;
7673 for (const auto &pd
: p
.second
.dentries
) {
7674 dout(10) << " dn expires in dir " << pd
.first
<< dendl
;
7675 CInode
*diri
= get_inode(pd
.first
.ino
);
7677 CDir
*dir
= diri
->get_dirfrag(pd
.first
.frag
);
7680 dout(0) << " dn expires on " << pd
.first
<< " from " << from
7681 << ", must have refragmented" << dendl
;
7683 ceph_assert(dir
->is_auth());
7686 for (const auto &p
: pd
.second
) {
7687 unsigned nonce
= p
.second
;
7691 dn
= dir
->lookup(p
.first
.first
, p
.first
.second
);
7693 // which dirfrag for this dentry?
7694 CDir
*dir
= diri
->get_dirfrag(diri
->pick_dirfrag(p
.first
.first
));
7696 ceph_assert(dir
->is_auth());
7697 dn
= dir
->lookup(p
.first
.first
, p
.first
.second
);
7702 dout(0) << " missing dentry for " << p
.first
.first
<< " snap " << p
.first
.second
<< " in " << *dir
<< dendl
;
7704 dout(0) << " missing dentry for " << p
.first
.first
<< " snap " << p
.first
.second
<< dendl
;
7708 if (nonce
== dn
->get_replica_nonce(from
)) {
7709 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
<< dendl
;
7710 dentry_remove_replica(dn
, from
, gather_locks
);
7713 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
7714 << " with old nonce " << nonce
<< " (current " << dn
->get_replica_nonce(from
)
7715 << "), dropping" << dendl
;
7721 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
7722 if (!(*p
)->is_stable())
7723 mds
->locker
->eval_gather(*p
);
7727 void MDCache::process_delayed_expire(CDir
*dir
)
7729 dout(7) << "process_delayed_expire on " << *dir
<< dendl
;
7730 for (const auto &p
: delayed_expire
[dir
]) {
7731 handle_cache_expire(p
.second
);
7733 delayed_expire
.erase(dir
);
7736 void MDCache::discard_delayed_expire(CDir
*dir
)
7738 dout(7) << "discard_delayed_expire on " << *dir
<< dendl
;
7739 delayed_expire
.erase(dir
);
7742 void MDCache::inode_remove_replica(CInode
*in
, mds_rank_t from
, bool rejoin
,
7743 set
<SimpleLock
*>& gather_locks
)
7745 in
->remove_replica(from
);
7746 in
->set_mds_caps_wanted(from
, 0);
7748 // note: this code calls _eval more often than it needs to!
7750 if (in
->authlock
.remove_replica(from
)) gather_locks
.insert(&in
->authlock
);
7751 if (in
->linklock
.remove_replica(from
)) gather_locks
.insert(&in
->linklock
);
7752 if (in
->snaplock
.remove_replica(from
)) gather_locks
.insert(&in
->snaplock
);
7753 if (in
->xattrlock
.remove_replica(from
)) gather_locks
.insert(&in
->xattrlock
);
7754 if (in
->flocklock
.remove_replica(from
)) gather_locks
.insert(&in
->flocklock
);
7755 if (in
->policylock
.remove_replica(from
)) gather_locks
.insert(&in
->policylock
);
7757 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7758 // Don't remove the recovering mds from lock's gathering list because
7759 // it may hold rejoined wrlocks.
7760 if (in
->dirfragtreelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->dirfragtreelock
);
7761 if (in
->filelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->filelock
);
7762 if (in
->nestlock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->nestlock
);
7765 void MDCache::dentry_remove_replica(CDentry
*dn
, mds_rank_t from
, set
<SimpleLock
*>& gather_locks
)
7767 dn
->remove_replica(from
);
7770 if (dn
->lock
.remove_replica(from
))
7771 gather_locks
.insert(&dn
->lock
);
7773 // Replicated strays might now be elegible for purge
7774 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7775 if (dnl
->is_primary()) {
7776 maybe_eval_stray(dnl
->get_inode());
7780 void MDCache::trim_client_leases()
7782 utime_t now
= ceph_clock_now();
7784 dout(10) << "trim_client_leases" << dendl
;
7786 std::size_t pool
= 0;
7787 for (const auto& list
: client_leases
) {
7792 auto before
= list
.size();
7793 while (!list
.empty()) {
7794 ClientLease
*r
= list
.front();
7795 if (r
->ttl
> now
) break;
7796 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
7797 dout(10) << " expiring client." << r
->client
<< " lease of " << *dn
<< dendl
;
7798 dn
->remove_client_lease(r
, mds
->locker
);
7800 auto after
= list
.size();
7801 dout(10) << "trim_client_leases pool " << pool
<< " trimmed "
7802 << (before
-after
) << " leases, " << after
<< " left" << dendl
;
7806 void MDCache::check_memory_usage()
7808 static MemoryModel
mm(g_ceph_context
);
7809 static MemoryModel::snap last
;
7811 static MemoryModel::snap baseline
= last
;
7813 // check client caps
7814 ceph_assert(CInode::count() == inode_map
.size() + snap_inode_map
.size() + num_shadow_inodes
);
7815 double caps_per_inode
= 0.0;
7816 if (CInode::count())
7817 caps_per_inode
= (double)Capability::count() / (double)CInode::count();
7819 dout(2) << "Memory usage: "
7820 << " total " << last
.get_total()
7821 << ", rss " << last
.get_rss()
7822 << ", heap " << last
.get_heap()
7823 << ", baseline " << baseline
.get_heap()
7824 << ", " << num_inodes_with_caps
<< " / " << CInode::count() << " inodes have caps"
7825 << ", " << Capability::count() << " caps, " << caps_per_inode
<< " caps per inode"
7828 mds
->update_mlogger();
7829 mds
->mlogger
->set(l_mdm_rss
, last
.get_rss());
7830 mds
->mlogger
->set(l_mdm_heap
, last
.get_heap());
7835 // =========================================================================================
7838 class C_MDC_ShutdownCheck
: public MDCacheContext
{
7840 explicit C_MDC_ShutdownCheck(MDCache
*m
) : MDCacheContext(m
) {}
7841 void finish(int) override
{
7842 mdcache
->shutdown_check();
7846 void MDCache::shutdown_check()
7848 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl
;
7851 char old_val
[32] = { 0 };
7853 g_conf().get_val("debug_mds", &o
, sizeof(old_val
));
7854 g_conf().set_val("debug_mds", "10");
7855 g_conf().apply_changes(nullptr);
7857 g_conf().set_val("debug_mds", old_val
);
7858 g_conf().apply_changes(nullptr);
7859 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7862 dout(0) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7863 dout(0) << "log len " << mds
->mdlog
->get_num_events() << dendl
;
7866 if (mds
->objecter
->is_active()) {
7867 dout(0) << "objecter still active" << dendl
;
7868 mds
->objecter
->dump_active();
7873 void MDCache::shutdown_start()
7875 dout(5) << "shutdown_start" << dendl
;
7877 if (g_conf()->mds_shutdown_check
)
7878 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7880 // g_conf()->debug_mds = 10;
7885 bool MDCache::shutdown_pass()
7887 dout(7) << "shutdown_pass" << dendl
;
7889 if (mds
->is_stopped()) {
7890 dout(7) << " already shut down" << dendl
;
7897 bool strays_all_exported
= shutdown_export_strays();
7901 dout(5) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7903 // Export all subtrees to another active (usually rank 0) if not rank 0
7904 int num_auth_subtree
= 0;
7905 if (!subtrees
.empty() && mds
->get_nodeid() != 0) {
7906 dout(7) << "looking for subtrees to export" << dendl
;
7907 std::vector
<CDir
*> ls
;
7908 for (auto& [dir
, bounds
] : subtrees
) {
7909 dout(10) << " examining " << *dir
<< " bounds " << bounds
<< dendl
;
7910 if (dir
->get_inode()->is_mdsdir() || !dir
->is_auth())
7913 if (dir
->is_frozen() ||
7914 dir
->is_freezing() ||
7915 dir
->is_ambiguous_dir_auth() ||
7916 dir
->state_test(CDir::STATE_EXPORTING
) ||
7917 dir
->get_inode()->is_ephemerally_pinned()) {
7923 migrator
->clear_export_queue();
7924 // stopping mds does not call MDBalancer::tick()
7925 mds
->balancer
->handle_export_pins();
7926 for (const auto& dir
: ls
) {
7927 mds_rank_t dest
= dir
->get_inode()->authority().first
;
7928 if (dest
> 0 && !mds
->mdsmap
->is_active(dest
))
7930 dout(7) << "sending " << *dir
<< " back to mds." << dest
<< dendl
;
7931 migrator
->export_dir_nicely(dir
, dest
);
7935 if (!strays_all_exported
) {
7936 dout(7) << "waiting for strays to migrate" << dendl
;
7940 if (num_auth_subtree
> 0) {
7941 ceph_assert(mds
->get_nodeid() > 0);
7942 dout(7) << "still have " << num_auth_subtree
<< " auth subtrees" << dendl
;
7947 // close out any sessions (and open files!) before we try to trim the log, etc.
7948 if (mds
->sessionmap
.have_unclosed_sessions()) {
7949 if (!mds
->server
->terminating_sessions
)
7950 mds
->server
->terminate_sessions();
7954 // Fully trim the log so that all objects in cache are clean and may be
7955 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7956 // trim the log such that the cache eventually becomes clean.
7957 if (mds
->mdlog
->get_num_segments() > 0) {
7958 auto ls
= mds
->mdlog
->get_current_segment();
7959 if (ls
->num_events
> 1 || !ls
->dirty_dirfrags
.empty()) {
7960 // Current segment contains events other than subtreemap or
7961 // there are dirty dirfrags (see CDir::log_mark_dirty())
7962 mds
->mdlog
->start_new_segment();
7963 mds
->mdlog
->flush();
7966 mds
->mdlog
->trim_all();
7967 if (mds
->mdlog
->get_num_segments() > 1) {
7968 dout(7) << "still >1 segments, waiting for log to trim" << dendl
;
7972 // drop our reference to our stray dir inode
7973 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
7975 strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
7976 strays
[i
]->state_clear(CInode::STATE_STRAYPINNED
);
7977 strays
[i
]->put(CInode::PIN_STRAY
);
7978 strays
[i
]->put_stickydirs();
7982 CDir
*mydir
= myin
? myin
->get_dirfrag(frag_t()) : NULL
;
7983 if (mydir
&& !mydir
->is_subtree_root())
7986 // subtrees map not empty yet?
7987 if (subtrees
.size() > (mydir
? 1 : 0)) {
7988 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl
;
7990 migrator
->show_importing();
7991 migrator
->show_exporting();
7992 if (!migrator
->is_importing() && !migrator
->is_exporting())
7996 ceph_assert(!migrator
->is_exporting());
7997 ceph_assert(!migrator
->is_importing());
7999 // replicas may dirty scatter locks
8000 if (myin
&& myin
->is_replicated()) {
8001 dout(7) << "still have replicated objects" << dendl
;
8005 if ((myin
&& myin
->get_num_auth_pins()) ||
8006 (mydir
&& (mydir
->get_auth_pins() || mydir
->get_dir_auth_pins()))) {
8007 dout(7) << "still have auth pinned objects" << dendl
;
8011 // (only do this once!)
8012 if (!mds
->mdlog
->is_capped()) {
8013 dout(7) << "capping the mdlog" << dendl
;
8017 if (!mds
->mdlog
->empty())
8018 mds
->mdlog
->trim(0);
8020 if (!mds
->mdlog
->empty()) {
8021 dout(7) << "waiting for log to flush.. " << mds
->mdlog
->get_num_events()
8022 << " in " << mds
->mdlog
->get_num_segments() << " segments" << dendl
;
8026 if (!did_shutdown_log_cap
) {
8027 // flush journal header
8028 dout(7) << "writing header for (now-empty) journal" << dendl
;
8029 ceph_assert(mds
->mdlog
->empty());
8030 mds
->mdlog
->write_head(0);
8031 // NOTE: filer active checker below will block us until this completes.
8032 did_shutdown_log_cap
= true;
8037 if (mds
->objecter
->is_active()) {
8038 dout(7) << "objecter still active" << dendl
;
8039 mds
->objecter
->dump_active();
8043 // trim what we can from the cache
8044 if (lru
.lru_get_size() > 0 || bottom_lru
.lru_get_size() > 0) {
8045 dout(7) << "there's still stuff in the cache: " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
8051 // make mydir subtree go away
8053 if (mydir
->get_num_ref() > 1) { // subtree pin
8054 dout(7) << "there's still reference to mydir " << *mydir
<< dendl
;
8059 remove_subtree(mydir
);
8060 myin
->close_dirfrag(mydir
->get_frag());
8062 ceph_assert(subtrees
.empty());
8069 if (global_snaprealm
) {
8070 remove_inode(global_snaprealm
->inode
);
8071 global_snaprealm
= nullptr;
8075 dout(5) << "shutdown done." << dendl
;
8079 bool MDCache::shutdown_export_strays()
8081 static const unsigned MAX_EXPORTING
= 100;
8083 if (mds
->get_nodeid() == 0)
8086 if (shutdown_exporting_strays
.size() * 3 >= MAX_EXPORTING
* 2)
8089 dout(10) << "shutdown_export_strays " << shutdown_export_next
.first
8090 << " '" << shutdown_export_next
.second
<< "'" << dendl
;
8092 bool mds0_active
= mds
->mdsmap
->is_active(mds_rank_t(0));
8093 bool all_exported
= false;
8096 auto next
= shutdown_export_next
;
8098 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
8099 CInode
*strayi
= strays
[i
];
8101 !strayi
->state_test(CInode::STATE_STRAYPINNED
))
8103 if (strayi
->ino() < next
.first
.ino
)
8107 strayi
->get_dirfrags(dfls
);
8109 while (!dfls
.empty()) {
8110 CDir
*dir
= dfls
.front();
8113 if (dir
->dirfrag() < next
.first
)
8115 if (next
.first
< dir
->dirfrag()) {
8116 next
.first
= dir
->dirfrag();
8117 next
.second
.clear();
8120 if (!dir
->is_complete()) {
8121 MDSContext
*fin
= nullptr;
8122 if (shutdown_exporting_strays
.empty()) {
8123 fin
= new MDSInternalContextWrapper(mds
,
8124 new LambdaContext([this](int r
) {
8125 shutdown_export_strays();
8133 CDir::dentry_key_map::iterator it
;
8134 if (next
.second
.empty()) {
8137 auto hash
= ceph_frag_value(strayi
->hash_dentry_name(next
.second
));
8138 it
= dir
->lower_bound(dentry_key_t(0, next
.second
, hash
));
8141 for (; it
!= dir
->end(); ++it
) {
8142 CDentry
*dn
= it
->second
;
8143 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8147 if (!mds0_active
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
8148 next
.second
= it
->first
.name
;
8152 auto ret
= shutdown_exporting_strays
.insert(dnl
->get_inode()->ino());
8154 dout(10) << "already exporting/purging " << *dn
<< dendl
;
8158 // Don't try to migrate anything that is actually
8159 // being purged right now
8160 if (!dn
->state_test(CDentry::STATE_PURGING
))
8161 stray_manager
.migrate_stray(dn
, mds_rank_t(0)); // send to root!
8163 if (shutdown_exporting_strays
.size() >= MAX_EXPORTING
) {
8165 if (it
!= dir
->end()) {
8166 next
.second
= it
->first
.name
;
8169 next
.first
.ino
.val
++;
8171 next
.first
= dfls
.front()->dirfrag();
8172 next
.second
.clear();
8180 if (shutdown_exporting_strays
.empty()) {
8181 dirfrag_t
first_df(MDS_INO_STRAY(mds
->get_nodeid(), 0), 0);
8182 if (first_df
< shutdown_export_next
.first
||
8183 !shutdown_export_next
.second
.empty()) {
8184 shutdown_export_next
.first
= first_df
;
8185 shutdown_export_next
.second
.clear();
8188 all_exported
= true;
8192 shutdown_export_next
= next
;
8193 return all_exported
;
8196 // ========= messaging ==============
8198 void MDCache::dispatch(const cref_t
<Message
> &m
)
8200 switch (m
->get_type()) {
8203 case MSG_MDS_RESOLVE
:
8204 handle_resolve(ref_cast
<MMDSResolve
>(m
));
8206 case MSG_MDS_RESOLVEACK
:
8207 handle_resolve_ack(ref_cast
<MMDSResolveAck
>(m
));
8211 case MSG_MDS_CACHEREJOIN
:
8212 handle_cache_rejoin(ref_cast
<MMDSCacheRejoin
>(m
));
8215 case MSG_MDS_DISCOVER
:
8216 handle_discover(ref_cast
<MDiscover
>(m
));
8218 case MSG_MDS_DISCOVERREPLY
:
8219 handle_discover_reply(ref_cast
<MDiscoverReply
>(m
));
8222 case MSG_MDS_DIRUPDATE
:
8223 handle_dir_update(ref_cast
<MDirUpdate
>(m
));
8226 case MSG_MDS_CACHEEXPIRE
:
8227 handle_cache_expire(ref_cast
<MCacheExpire
>(m
));
8230 case MSG_MDS_DENTRYLINK
:
8231 handle_dentry_link(ref_cast
<MDentryLink
>(m
));
8233 case MSG_MDS_DENTRYUNLINK
:
8234 handle_dentry_unlink(ref_cast
<MDentryUnlink
>(m
));
8237 case MSG_MDS_FRAGMENTNOTIFY
:
8238 handle_fragment_notify(ref_cast
<MMDSFragmentNotify
>(m
));
8240 case MSG_MDS_FRAGMENTNOTIFYACK
:
8241 handle_fragment_notify_ack(ref_cast
<MMDSFragmentNotifyAck
>(m
));
8244 case MSG_MDS_FINDINO
:
8245 handle_find_ino(ref_cast
<MMDSFindIno
>(m
));
8247 case MSG_MDS_FINDINOREPLY
:
8248 handle_find_ino_reply(ref_cast
<MMDSFindInoReply
>(m
));
8251 case MSG_MDS_OPENINO
:
8252 handle_open_ino(ref_cast
<MMDSOpenIno
>(m
));
8254 case MSG_MDS_OPENINOREPLY
:
8255 handle_open_ino_reply(ref_cast
<MMDSOpenInoReply
>(m
));
8258 case MSG_MDS_SNAPUPDATE
:
8259 handle_snap_update(ref_cast
<MMDSSnapUpdate
>(m
));
8263 derr
<< "cache unknown message " << m
->get_type() << dendl
;
8264 ceph_abort_msg("cache unknown message");
8268 int MDCache::path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
8269 const filepath
& path
, int flags
,
8270 vector
<CDentry
*> *pdnvec
, CInode
**pin
)
8272 bool discover
= (flags
& MDS_TRAVERSE_DISCOVER
);
8273 bool forward
= !discover
;
8274 bool path_locked
= (flags
& MDS_TRAVERSE_PATH_LOCKED
);
8275 bool want_dentry
= (flags
& MDS_TRAVERSE_WANT_DENTRY
);
8276 bool want_inode
= (flags
& MDS_TRAVERSE_WANT_INODE
);
8277 bool want_auth
= (flags
& MDS_TRAVERSE_WANT_AUTH
);
8278 bool rdlock_snap
= (flags
& (MDS_TRAVERSE_RDLOCK_SNAP
| MDS_TRAVERSE_RDLOCK_SNAP2
));
8279 bool rdlock_path
= (flags
& MDS_TRAVERSE_RDLOCK_PATH
);
8280 bool xlock_dentry
= (flags
& MDS_TRAVERSE_XLOCK_DENTRY
);
8281 bool rdlock_authlock
= (flags
& MDS_TRAVERSE_RDLOCK_AUTHLOCK
);
8284 ceph_assert(mdr
); // forward requires a request
8286 snapid_t snapid
= CEPH_NOSNAP
;
8288 mdr
->snapid
= snapid
;
8290 client_t client
= mdr
? mdr
->get_client() : -1;
8292 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse
);
8294 dout(7) << "traverse: opening base ino " << path
.get_ino() << " snap " << snapid
<< dendl
;
8295 CInode
*cur
= get_inode(path
.get_ino());
8297 if (MDS_INO_IS_MDSDIR(path
.get_ino())) {
8298 open_foreign_mdsdir(path
.get_ino(), cf
.build());
8301 if (MDS_INO_IS_STRAY(path
.get_ino())) {
8302 mds_rank_t rank
= MDS_INO_STRAY_OWNER(path
.get_ino());
8303 unsigned idx
= MDS_INO_STRAY_INDEX(path
.get_ino());
8304 filepath
path(strays
[idx
]->get_parent_dn()->get_name(),
8305 MDS_INO_MDSDIR(rank
));
8306 MDRequestRef null_ref
;
8307 return path_traverse(null_ref
, cf
, path
, MDS_TRAVERSE_DISCOVER
, nullptr);
8309 return -CEPHFS_ESTALE
;
8311 if (cur
->state_test(CInode::STATE_PURGING
))
8312 return -CEPHFS_ESTALE
;
8314 if (flags
& MDS_TRAVERSE_CHECK_LOCKCACHE
)
8315 mds
->locker
->find_and_attach_lock_cache(mdr
, cur
);
8317 if (mdr
&& mdr
->lock_cache
) {
8318 if (flags
& MDS_TRAVERSE_WANT_DIRLAYOUT
)
8319 mdr
->dir_layout
= mdr
->lock_cache
->get_dir_layout();
8320 } else if (rdlock_snap
) {
8321 int n
= (flags
& MDS_TRAVERSE_RDLOCK_SNAP2
) ? 1 : 0;
8322 if ((n
== 0 && !(mdr
->locking_state
& MutationImpl::SNAP_LOCKED
)) ||
8323 (n
== 1 && !(mdr
->locking_state
& MutationImpl::SNAP2_LOCKED
))) {
8324 bool want_layout
= (flags
& MDS_TRAVERSE_WANT_DIRLAYOUT
);
8325 if (!mds
->locker
->try_rdlock_snap_layout(cur
, mdr
, n
, want_layout
))
8336 CInode
*target_inode
= nullptr;
8337 MutationImpl::LockOpVec lov
;
8340 for (unsigned depth
= 0; depth
< path
.depth(); ) {
8341 dout(12) << "traverse: path seg depth " << depth
<< " '" << path
[depth
]
8342 << "' snapid " << snapid
<< dendl
;
8344 if (!cur
->is_dir()) {
8345 dout(7) << "traverse: " << *cur
<< " not a dir " << dendl
;
8346 return -CEPHFS_ENOTDIR
;
8349 // walk into snapdir?
8350 if (path
[depth
].length() == 0) {
8351 dout(10) << "traverse: snapdir" << dendl
;
8352 if (!mdr
|| depth
> 0) // snapdir must be the first component
8353 return -CEPHFS_EINVAL
;
8354 snapid
= CEPH_SNAPDIR
;
8355 mdr
->snapid
= snapid
;
8359 // walk thru snapdir?
8360 if (snapid
== CEPH_SNAPDIR
) {
8362 return -CEPHFS_EINVAL
;
8363 SnapRealm
*realm
= cur
->find_snaprealm();
8364 snapid
= realm
->resolve_snapname(path
[depth
], cur
->ino());
8365 dout(10) << "traverse: snap " << path
[depth
] << " -> " << snapid
<< dendl
;
8368 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8369 return -CEPHFS_ENOENT
;
8371 if (depth
== path
.depth() - 1)
8373 mdr
->snapid
= snapid
;
8379 frag_t fg
= cur
->pick_dirfrag(path
[depth
]);
8380 CDir
*curdir
= cur
->get_dirfrag(fg
);
8382 if (cur
->is_auth()) {
8383 // parent dir frozen_dir?
8384 if (cur
->is_frozen()) {
8385 dout(7) << "traverse: " << *cur
<< " is frozen, waiting" << dendl
;
8386 cur
->add_waiter(CDir::WAIT_UNFREEZE
, cf
.build());
8389 curdir
= cur
->get_or_open_dirfrag(this, fg
);
8392 dout(10) << "traverse: need dirfrag " << fg
<< ", doing discover from " << *cur
<< dendl
;
8393 discover_path(cur
, snapid
, path
.postfixpath(depth
), cf
.build(),
8395 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_discover
);
8399 ceph_assert(curdir
);
8401 #ifdef MDS_VERIFY_FRAGSTAT
8402 if (curdir
->is_complete())
8403 curdir
->verify_fragstat();
8408 if (curdir->is_frozen()) {
8410 // FIXME: traverse is allowed?
8411 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8412 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8413 if (onfinish) delete onfinish;
8418 // Defer the auth check until the target inode is determined not to exist
8419 // if want_inode is true.
8420 if (want_auth
&& want_dentry
&& !want_inode
&& depth
== path
.depth() - 1 &&
8421 (r
= maybe_request_forward_to_auth(mdr
, cf
, curdir
)) != 0)
8424 // Before doing dirfrag->dn lookup, compare with DamageTable's
8425 // record of which dentries were unreadable
8426 if (mds
->damage_table
.is_dentry_damaged(curdir
, path
[depth
], snapid
)) {
8427 dout(4) << "traverse: stopped lookup at damaged dentry "
8428 << *curdir
<< "/" << path
[depth
] << " snap=" << snapid
<< dendl
;
8433 CDentry
*dn
= curdir
->lookup(path
[depth
], snapid
);
8435 if (dn
->state_test(CDentry::STATE_PURGING
))
8436 return -CEPHFS_ENOENT
;
8438 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8439 // If an auth check was deferred before and the target inode is found
8440 // not to exist now, do the auth check here if necessary.
8441 if (want_auth
&& want_dentry
&& want_inode
&& depth
== path
.depth() - 1 &&
8442 dnl
->is_null() && (r
= maybe_request_forward_to_auth(mdr
, cf
, dn
)) != 0)
8447 // do not xlock the tail dentry if target inode exists and caller wants it
8448 if (xlock_dentry
&& (dnl
->is_null() || !want_inode
) &&
8449 depth
== path
.depth() - 1) {
8450 ceph_assert(dn
->is_auth());
8451 if (depth
> 0 || !mdr
->lock_cache
) {
8452 lov
.add_wrlock(&cur
->filelock
);
8453 lov
.add_wrlock(&cur
->nestlock
);
8454 if (rdlock_authlock
)
8455 lov
.add_rdlock(&cur
->authlock
);
8457 lov
.add_xlock(&dn
->lock
);
8459 // force client to flush async dir operation if necessary
8460 if (cur
->filelock
.is_cached())
8461 lov
.add_wrlock(&cur
->filelock
);
8462 lov
.add_rdlock(&dn
->lock
);
8464 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
8465 dout(10) << "traverse: failed to rdlock " << dn
->lock
<< " " << *dn
<< dendl
;
8468 } else if (!path_locked
&&
8469 !dn
->lock
.can_read(client
) &&
8470 !(dn
->lock
.is_xlocked() && dn
->lock
.get_xlock_by() == mdr
)) {
8471 dout(10) << "traverse: non-readable dentry at " << *dn
<< dendl
;
8472 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, cf
.build());
8474 mds
->logger
->inc(l_mds_traverse_lock
);
8475 if (dn
->is_auth() && dn
->lock
.is_unstable_and_locked())
8476 mds
->mdlog
->flush();
8481 pdnvec
->push_back(dn
);
8483 // can we conclude CEPHFS_ENOENT?
8484 if (dnl
->is_null()) {
8485 dout(10) << "traverse: null+readable dentry at " << *dn
<< dendl
;
8486 if (depth
== path
.depth() - 1) {
8491 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8493 return -CEPHFS_ENOENT
;
8496 // do we have inode?
8497 CInode
*in
= dnl
->get_inode();
8499 ceph_assert(dnl
->is_remote());
8501 in
= get_inode(dnl
->get_remote_ino());
8503 dout(7) << "linking in remote in " << *in
<< dendl
;
8504 dn
->link_remote(dnl
, in
);
8506 dout(7) << "remote link to " << dnl
->get_remote_ino() << ", which i don't have" << dendl
;
8507 ceph_assert(mdr
); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8508 if (mds
->damage_table
.is_remote_damaged(dnl
->get_remote_ino())) {
8509 dout(4) << "traverse: remote dentry points to damaged ino "
8513 open_remote_dentry(dn
, true, cf
.build(),
8514 (path_locked
&& depth
== path
.depth() - 1));
8515 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_remote_ino
);
8522 if (rdlock_snap
&& !(want_dentry
&& !want_inode
&& depth
== path
.depth() - 1)) {
8524 lov
.add_rdlock(&cur
->snaplock
);
8525 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
8526 dout(10) << "traverse: failed to rdlock " << cur
->snaplock
<< " " << *cur
<< dendl
;
8531 if (depth
== path
.depth() - 1)
8534 // add to trace, continue.
8544 // MISS. dentry doesn't exist.
8545 dout(12) << "traverse: miss on dentry " << path
[depth
] << " in " << *curdir
<< dendl
;
8547 if (curdir
->is_auth()) {
8549 if (curdir
->is_complete() ||
8550 (snapid
== CEPH_NOSNAP
&&
8551 curdir
->has_bloom() &&
8552 !curdir
->is_in_bloom(path
[depth
]))) {
8555 // instantiate a null dn?
8556 if (depth
< path
.depth() - 1) {
8557 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl
;
8558 } else if (snapid
< CEPH_MAXSNAP
) {
8559 dout(20) << " not adding null for snapid " << snapid
<< dendl
;
8560 } else if (curdir
->is_frozen()) {
8561 dout(7) << "traverse: " << *curdir
<< " is frozen, waiting" << dendl
;
8562 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, cf
.build());
8565 // create a null dentry
8566 dn
= curdir
->add_null_dentry(path
[depth
]);
8567 dout(20) << " added null " << *dn
<< dendl
;
8572 if (depth
> 0 || !mdr
->lock_cache
) {
8573 lov
.add_wrlock(&cur
->filelock
);
8574 lov
.add_wrlock(&cur
->nestlock
);
8575 if (rdlock_authlock
)
8576 lov
.add_rdlock(&cur
->authlock
);
8578 lov
.add_xlock(&dn
->lock
);
8580 // force client to flush async dir operation if necessary
8581 if (cur
->filelock
.is_cached())
8582 lov
.add_wrlock(&cur
->filelock
);
8583 lov
.add_rdlock(&dn
->lock
);
8585 if (!mds
->locker
->acquire_locks(mdr
, lov
)) {
8586 dout(10) << "traverse: failed to rdlock " << dn
->lock
<< " " << *dn
<< dendl
;
8592 pdnvec
->push_back(dn
);
8596 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8599 return -CEPHFS_ENOENT
;
8602 // Check DamageTable for missing fragments before trying to fetch
8604 if (mds
->damage_table
.is_dirfrag_damaged(curdir
)) {
8605 dout(4) << "traverse: damaged dirfrag " << *curdir
8606 << ", blocking fetch" << dendl
;
8610 // directory isn't complete; reload
8611 dout(7) << "traverse: incomplete dir contents for " << *cur
<< ", fetching" << dendl
;
8613 curdir
->fetch(path
[depth
], snapid
, cf
.build());
8614 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_dir_fetch
);
8618 // dirfrag/dentry is not mine.
8621 mdr
&& mdr
->client_request
&&
8622 (int)depth
< mdr
->client_request
->get_num_fwd()){
8623 dout(7) << "traverse: snap " << snapid
<< " and depth " << depth
8624 << " < fwd " << mdr
->client_request
->get_num_fwd()
8625 << ", discovering instead of forwarding" << dendl
;
8630 dout(7) << "traverse: discover from " << path
[depth
] << " from " << *curdir
<< dendl
;
8631 discover_path(curdir
, snapid
, path
.postfixpath(depth
), cf
.build(),
8633 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_discover
);
8638 dout(7) << "traverse: not auth for " << path
<< " in " << *curdir
<< dendl
;
8640 r
= maybe_request_forward_to_auth(mdr
, cf
, curdir
);
8641 ceph_assert(r
!= 0);
8643 if (r
== 2 && mds
->logger
)
8644 mds
->logger
->inc(l_mds_traverse_forward
);
8650 ceph_abort(); // i shouldn't get here
8653 if (path
.depth() == 0) {
8654 dout(7) << "no tail dentry, base " << *cur
<< dendl
;
8655 if (want_dentry
&& !want_inode
) {
8656 return -CEPHFS_ENOENT
;
8662 dout(7) << "found target " << *target_inode
<< dendl
;
8663 if (want_auth
&& !(want_dentry
&& !want_inode
) &&
8664 (r
= maybe_request_forward_to_auth(mdr
, cf
, target_inode
)) != 0)
8669 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_hit
);
8670 dout(10) << "path_traverse finish on snapid " << snapid
<< dendl
;
8672 ceph_assert(mdr
->snapid
== snapid
);
8674 if (flags
& MDS_TRAVERSE_RDLOCK_SNAP
)
8675 mdr
->locking_state
|= MutationImpl::SNAP_LOCKED
;
8676 else if (flags
& MDS_TRAVERSE_RDLOCK_SNAP2
)
8677 mdr
->locking_state
|= MutationImpl::SNAP2_LOCKED
;
8680 mdr
->locking_state
|= MutationImpl::PATH_LOCKED
;
8685 int MDCache::maybe_request_forward_to_auth(MDRequestRef
& mdr
, MDSContextFactory
& cf
,
8688 if (p
->is_ambiguous_auth()) {
8689 dout(7) << "waiting for single auth on " << *p
<< dendl
;
8690 p
->add_waiter(CInode::WAIT_SINGLEAUTH
, cf
.build());
8693 if (!p
->is_auth()) {
8694 dout(7) << "fw to auth for " << *p
<< dendl
;
8695 request_forward(mdr
, p
->authority().first
);
8701 CInode
*MDCache::cache_traverse(const filepath
& fp
)
8703 dout(10) << "cache_traverse " << fp
<< dendl
;
8707 char mdsdir_name
[16];
8708 sprintf(mdsdir_name
, "~mds%d", mds
->get_nodeid());
8711 in
= get_inode(fp
.get_ino());
8712 } else if (fp
.depth() > 0 && (fp
[0] == "~mdsdir" || fp
[0] == mdsdir_name
)) {
8721 for (; depth
< fp
.depth(); depth
++) {
8722 std::string_view dname
= fp
[depth
];
8723 frag_t fg
= in
->pick_dirfrag(dname
);
8724 dout(20) << " " << depth
<< " " << dname
<< " frag " << fg
<< " from " << *in
<< dendl
;
8725 CDir
*curdir
= in
->get_dirfrag(fg
);
8728 CDentry
*dn
= curdir
->lookup(dname
, CEPH_NOSNAP
);
8731 in
= dn
->get_linkage()->get_inode();
8735 dout(10) << " got " << *in
<< dendl
;
8741 * open_remote_dir -- open up a remote dirfrag
8743 * @param diri base inode
8744 * @param approxfg approximate fragment.
8745 * @param fin completion callback
8747 void MDCache::open_remote_dirfrag(CInode
*diri
, frag_t approxfg
, MDSContext
*fin
)
8749 dout(10) << "open_remote_dir on " << *diri
<< dendl
;
8750 ceph_assert(diri
->is_dir());
8751 ceph_assert(!diri
->is_auth());
8752 ceph_assert(diri
->get_dirfrag(approxfg
) == 0);
8754 discover_dir_frag(diri
, approxfg
, fin
);
8759 * get_dentry_inode - get or open inode
8761 * @param dn the dentry
8762 * @param mdr current request
8764 * will return inode for primary, or link up/open up remote link's inode as necessary.
8765 * If it's not available right now, puts mdr on wait list and returns null.
8767 CInode
*MDCache::get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
)
8769 CDentry::linkage_t
*dnl
;
8771 dnl
= dn
->get_projected_linkage();
8773 dnl
= dn
->get_linkage();
8775 ceph_assert(!dnl
->is_null());
8777 if (dnl
->is_primary())
8780 ceph_assert(dnl
->is_remote());
8781 CInode
*in
= get_inode(dnl
->get_remote_ino());
8783 dout(7) << "get_dentry_inode linking in remote in " << *in
<< dendl
;
8784 dn
->link_remote(dnl
, in
);
8787 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn
<< dendl
;
8788 open_remote_dentry(dn
, projected
, new C_MDS_RetryRequest(this, mdr
));
8793 struct C_MDC_OpenRemoteDentry
: public MDCacheContext
{
8796 MDSContext
*onfinish
;
8798 C_MDC_OpenRemoteDentry(MDCache
*m
, CDentry
*d
, inodeno_t i
, MDSContext
*f
, bool wx
) :
8799 MDCacheContext(m
), dn(d
), ino(i
), onfinish(f
), want_xlocked(wx
) {
8800 dn
->get(MDSCacheObject::PIN_PTRWAITER
);
8802 void finish(int r
) override
{
8803 mdcache
->_open_remote_dentry_finish(dn
, ino
, onfinish
, want_xlocked
, r
);
8804 dn
->put(MDSCacheObject::PIN_PTRWAITER
);
8808 void MDCache::open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
, bool want_xlocked
)
8810 dout(10) << "open_remote_dentry " << *dn
<< dendl
;
8811 CDentry::linkage_t
*dnl
= projected
? dn
->get_projected_linkage() : dn
->get_linkage();
8812 inodeno_t ino
= dnl
->get_remote_ino();
8813 int64_t pool
= dnl
->get_remote_d_type() == DT_DIR
? mds
->get_metadata_pool() : -1;
8815 new C_MDC_OpenRemoteDentry(this, dn
, ino
, fin
, want_xlocked
), true, want_xlocked
); // backtrace
8818 void MDCache::_open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
8819 bool want_xlocked
, int r
)
8822 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8823 if (dnl
->is_remote() && dnl
->get_remote_ino() == ino
) {
8824 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn
<< dendl
;
8825 dn
->state_set(CDentry::STATE_BADREMOTEINO
);
8828 CDir
*dir
= dn
->get_dir();
8830 dir
->get_inode()->make_path_string(path
);
8832 path
+= dn
->get_name();
8835 bool fatal
= mds
->damage_table
.notify_remote_damaged(ino
, path
);
8838 ceph_abort(); // unreachable, damaged() respawns us
8844 fin
->complete(r
< 0 ? r
: 0);
8848 void MDCache::make_trace(vector
<CDentry
*>& trace
, CInode
*in
)
8850 // empty trace if we're a base inode
8854 CInode
*parent
= in
->get_parent_inode();
8855 ceph_assert(parent
);
8856 make_trace(trace
, parent
);
8858 CDentry
*dn
= in
->get_parent_dn();
8859 dout(15) << "make_trace adding " << *dn
<< dendl
;
8860 trace
.push_back(dn
);
8864 // -------------------------------------------------------------------------------
8865 // Open inode by inode number
8867 class C_IO_MDC_OpenInoBacktraceFetched
: public MDCacheIOContext
{
8871 C_IO_MDC_OpenInoBacktraceFetched(MDCache
*c
, inodeno_t i
) :
8872 MDCacheIOContext(c
), ino(i
) {}
8873 void finish(int r
) override
{
8874 mdcache
->_open_ino_backtrace_fetched(ino
, bl
, r
);
8876 void print(ostream
& out
) const override
{
8877 out
<< "openino_backtrace_fetch" << ino
<< ")";
8881 struct C_MDC_OpenInoTraverseDir
: public MDCacheContext
{
8883 cref_t
<MMDSOpenIno
> msg
;
8886 C_MDC_OpenInoTraverseDir(MDCache
*c
, inodeno_t i
, const cref_t
<MMDSOpenIno
> &m
, bool p
) :
8887 MDCacheContext(c
), ino(i
), msg(m
), parent(p
) {}
8888 void finish(int r
) override
{
8889 if (r
< 0 && !parent
)
8892 mdcache
->handle_open_ino(msg
, r
);
8895 auto& info
= mdcache
->opening_inodes
.at(ino
);
8896 mdcache
->_open_ino_traverse_dir(ino
, info
, r
);
8900 struct C_MDC_OpenInoParentOpened
: public MDCacheContext
{
8903 C_MDC_OpenInoParentOpened(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
8904 void finish(int r
) override
{
8905 mdcache
->_open_ino_parent_opened(ino
, r
);
8909 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
)
8911 dout(10) << "_open_ino_backtrace_fetched ino " << ino
<< " errno " << err
<< dendl
;
8913 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8915 CInode
*in
= get_inode(ino
);
8917 dout(10) << " found cached " << *in
<< dendl
;
8918 open_ino_finish(ino
, info
, in
->authority().first
);
8922 inode_backtrace_t backtrace
;
8925 decode(backtrace
, bl
);
8926 } catch (const buffer::error
&decode_exc
) {
8927 derr
<< "corrupt backtrace on ino x0" << std::hex
<< ino
8928 << std::dec
<< ": " << decode_exc
.what() << dendl
;
8929 open_ino_finish(ino
, info
, -CEPHFS_EIO
);
8932 if (backtrace
.pool
!= info
.pool
&& backtrace
.pool
!= -1) {
8933 dout(10) << " old object in pool " << info
.pool
8934 << ", retrying pool " << backtrace
.pool
<< dendl
;
8935 info
.pool
= backtrace
.pool
;
8936 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8937 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8938 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8939 new C_OnFinisher(fin
, mds
->finisher
));
8942 } else if (err
== -CEPHFS_ENOENT
) {
8943 int64_t meta_pool
= mds
->get_metadata_pool();
8944 if (info
.pool
!= meta_pool
) {
8945 dout(10) << " no object in pool " << info
.pool
8946 << ", retrying pool " << meta_pool
<< dendl
;
8947 info
.pool
= meta_pool
;
8948 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8949 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8950 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8951 new C_OnFinisher(fin
, mds
->finisher
));
8954 err
= 0; // backtrace.ancestors.empty() is checked below
8958 if (backtrace
.ancestors
.empty()) {
8959 dout(10) << " got empty backtrace " << dendl
;
8960 err
= -CEPHFS_ESTALE
;
8961 } else if (!info
.ancestors
.empty()) {
8962 if (info
.ancestors
[0] == backtrace
.ancestors
[0]) {
8963 dout(10) << " got same parents " << info
.ancestors
[0] << " 2 times" << dendl
;
8964 err
= -CEPHFS_EINVAL
;
8971 dout(0) << " failed to open ino " << ino
<< " err " << err
<< "/" << info
.last_err
<< dendl
;
8973 err
= info
.last_err
;
8974 open_ino_finish(ino
, info
, err
);
8978 dout(10) << " got backtrace " << backtrace
<< dendl
;
8979 info
.ancestors
= backtrace
.ancestors
;
8981 _open_ino_traverse_dir(ino
, info
, 0);
8984 void MDCache::_open_ino_parent_opened(inodeno_t ino
, int ret
)
8986 dout(10) << "_open_ino_parent_opened ino " << ino
<< " ret " << ret
<< dendl
;
8988 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8990 CInode
*in
= get_inode(ino
);
8992 dout(10) << " found cached " << *in
<< dendl
;
8993 open_ino_finish(ino
, info
, in
->authority().first
);
8997 if (ret
== mds
->get_nodeid()) {
8998 _open_ino_traverse_dir(ino
, info
, 0);
9001 mds_rank_t checked_rank
= mds_rank_t(ret
);
9002 info
.check_peers
= true;
9003 info
.auth_hint
= checked_rank
;
9004 info
.checked
.erase(checked_rank
);
9006 do_open_ino(ino
, info
, ret
);
9010 void MDCache::_open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
9012 dout(10) << __func__
<< ": ino " << ino
<< " ret " << ret
<< dendl
;
9014 CInode
*in
= get_inode(ino
);
9016 dout(10) << " found cached " << *in
<< dendl
;
9017 open_ino_finish(ino
, info
, in
->authority().first
);
9022 do_open_ino(ino
, info
, ret
);
9026 mds_rank_t hint
= info
.auth_hint
;
9027 ret
= open_ino_traverse_dir(ino
, NULL
, info
.ancestors
,
9028 info
.discover
, info
.want_xlocked
, &hint
);
9031 if (hint
!= mds
->get_nodeid())
9032 info
.auth_hint
= hint
;
9033 do_open_ino(ino
, info
, ret
);
9036 void MDCache::_open_ino_fetch_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
, bool parent
,
9037 CDir
*dir
, std::string_view dname
)
9039 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
9040 ceph_assert(dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()));
9042 auto fin
= new C_MDC_OpenInoTraverseDir(this, ino
, m
, parent
);
9043 if (open_ino_batch
&& !dname
.empty()) {
9044 auto& p
= open_ino_batched_fetch
[dir
];
9045 p
.first
.emplace_back(dname
);
9046 p
.second
.emplace_back(fin
);
9050 dir
->fetch(dname
, CEPH_NOSNAP
, fin
);
9052 mds
->logger
->inc(l_mds_openino_dir_fetch
);
9055 int MDCache::open_ino_traverse_dir(inodeno_t ino
, const cref_t
<MMDSOpenIno
> &m
,
9056 const vector
<inode_backpointer_t
>& ancestors
,
9057 bool discover
, bool want_xlocked
, mds_rank_t
*hint
)
9059 dout(10) << "open_ino_traverse_dir ino " << ino
<< " " << ancestors
<< dendl
;
9061 for (unsigned i
= 0; i
< ancestors
.size(); i
++) {
9062 const auto& ancestor
= ancestors
.at(i
);
9063 CInode
*diri
= get_inode(ancestor
.dirino
);
9066 if (discover
&& MDS_INO_IS_MDSDIR(ancestor
.dirino
)) {
9067 open_foreign_mdsdir(ancestor
.dirino
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9073 if (diri
->state_test(CInode::STATE_REJOINUNDEF
)) {
9074 CDentry
*dn
= diri
->get_parent_dn();
9075 CDir
*dir
= dn
->get_dir();
9076 while (dir
->state_test(CDir::STATE_REJOINUNDEF
) &&
9077 dir
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
)) {
9078 dn
= dir
->get_inode()->get_parent_dn();
9079 dir
= dn
->get_dir();
9081 _open_ino_fetch_dir(ino
, m
, i
== 0, dir
, dn
->name
);
9085 if (!diri
->is_dir()) {
9086 dout(10) << " " << *diri
<< " is not dir" << dendl
;
9088 err
= -CEPHFS_ENOTDIR
;
9092 const string
& name
= ancestor
.dname
;
9093 frag_t fg
= diri
->pick_dirfrag(name
);
9094 CDir
*dir
= diri
->get_dirfrag(fg
);
9096 if (diri
->is_auth()) {
9097 if (diri
->is_frozen()) {
9098 dout(10) << " " << *diri
<< " is frozen, waiting " << dendl
;
9099 diri
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9102 dir
= diri
->get_or_open_dirfrag(this, fg
);
9103 } else if (discover
) {
9104 open_remote_dirfrag(diri
, fg
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9109 inodeno_t next_ino
= i
> 0 ? ancestors
.at(i
-1).dirino
: ino
;
9110 CDentry
*dn
= dir
->lookup(name
);
9111 CDentry::linkage_t
*dnl
= dn
? dn
->get_linkage() : NULL
;
9112 if (dir
->is_auth()) {
9113 if (dnl
&& dnl
->is_primary() &&
9114 dnl
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
)) {
9115 dout(10) << " fetching undef " << *dnl
->get_inode() << dendl
;
9116 _open_ino_fetch_dir(ino
, m
, i
== 0, dir
, name
);
9120 if (!dnl
&& !dir
->is_complete() &&
9121 (!dir
->has_bloom() || dir
->is_in_bloom(name
))) {
9122 dout(10) << " fetching incomplete " << *dir
<< dendl
;
9123 _open_ino_fetch_dir(ino
, m
, i
== 0, dir
, name
);
9127 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
9129 err
= -CEPHFS_ENOENT
;
9130 } else if (discover
) {
9132 filepath
path(name
, 0);
9133 discover_path(dir
, CEPH_NOSNAP
, path
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0),
9134 (i
== 0 && want_xlocked
));
9137 if (dnl
->is_null() && !dn
->lock
.can_read(-1)) {
9138 dout(10) << " null " << *dn
<< " is not readable, waiting" << dendl
;
9139 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
9142 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
9144 err
= -CEPHFS_ENOENT
;
9148 *hint
= dir
? dir
->authority().first
: diri
->authority().first
;
9154 void MDCache::open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
9156 dout(10) << "open_ino_finish ino " << ino
<< " ret " << ret
<< dendl
;
9158 MDSContext::vec waiters
;
9159 waiters
.swap(info
.waiters
);
9160 opening_inodes
.erase(ino
);
9161 finish_contexts(g_ceph_context
, waiters
, ret
);
9164 void MDCache::do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
)
9166 if (err
< 0 && err
!= -CEPHFS_EAGAIN
) {
9167 info
.checked
.clear();
9168 info
.checking
= MDS_RANK_NONE
;
9169 info
.check_peers
= true;
9170 info
.fetch_backtrace
= true;
9171 if (info
.discover
) {
9172 info
.discover
= false;
9173 info
.ancestors
.clear();
9175 if (err
!= -CEPHFS_ENOENT
&& err
!= -CEPHFS_ENOTDIR
)
9176 info
.last_err
= err
;
9179 if (info
.check_peers
|| info
.discover
) {
9180 if (info
.discover
) {
9181 // got backtrace from peer, but failed to find inode. re-check peers
9182 info
.discover
= false;
9183 info
.ancestors
.clear();
9184 info
.checked
.clear();
9186 info
.check_peers
= false;
9187 info
.checking
= MDS_RANK_NONE
;
9188 do_open_ino_peer(ino
, info
);
9189 } else if (info
.fetch_backtrace
) {
9190 info
.check_peers
= true;
9191 info
.fetch_backtrace
= false;
9192 info
.checking
= mds
->get_nodeid();
9193 info
.checked
.clear();
9194 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
9195 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
9196 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
9197 new C_OnFinisher(fin
, mds
->finisher
));
9199 ceph_assert(!info
.ancestors
.empty());
9200 info
.checking
= mds
->get_nodeid();
9201 open_ino(info
.ancestors
[0].dirino
, mds
->get_metadata_pool(),
9202 new C_MDC_OpenInoParentOpened(this, ino
), info
.want_replica
);
9206 void MDCache::do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
)
9208 set
<mds_rank_t
> all
, active
;
9209 mds
->mdsmap
->get_mds_set(all
);
9210 if (mds
->get_state() == MDSMap::STATE_REJOIN
)
9211 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_REJOIN
);
9213 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9215 dout(10) << "do_open_ino_peer " << ino
<< " active " << active
9216 << " all " << all
<< " checked " << info
.checked
<< dendl
;
9218 mds_rank_t whoami
= mds
->get_nodeid();
9219 mds_rank_t peer
= MDS_RANK_NONE
;
9220 if (info
.auth_hint
>= 0 && info
.auth_hint
!= whoami
) {
9221 if (active
.count(info
.auth_hint
)) {
9222 peer
= info
.auth_hint
;
9223 info
.auth_hint
= MDS_RANK_NONE
;
9226 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9227 if (*p
!= whoami
&& info
.checked
.count(*p
) == 0) {
9234 if (all
!= info
.checked
) {
9235 dout(10) << " waiting for more peers to be active" << dendl
;
9237 dout(10) << " all MDS peers have been checked " << dendl
;
9238 do_open_ino(ino
, info
, 0);
9241 info
.checking
= peer
;
9242 vector
<inode_backpointer_t
> *pa
= NULL
;
9243 // got backtrace from peer or backtrace just fetched
9244 if (info
.discover
|| !info
.fetch_backtrace
)
9245 pa
= &info
.ancestors
;
9246 mds
->send_message_mds(make_message
<MMDSOpenIno
>(info
.tid
, ino
, pa
), peer
);
9248 mds
->logger
->inc(l_mds_openino_peer_discover
);
9252 void MDCache::handle_open_ino(const cref_t
<MMDSOpenIno
> &m
, int err
)
9254 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
9255 mds
->get_want_state() != CEPH_MDS_STATE_REJOIN
) {
9259 dout(10) << "handle_open_ino " << *m
<< " err " << err
<< dendl
;
9261 auto from
= mds_rank_t(m
->get_source().num());
9262 inodeno_t ino
= m
->ino
;
9263 ref_t
<MMDSOpenInoReply
> reply
;
9264 CInode
*in
= get_inode(ino
);
9266 dout(10) << " have " << *in
<< dendl
;
9267 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, mds_rank_t(0));
9268 if (in
->is_auth()) {
9271 CDentry
*pdn
= in
->get_parent_dn();
9274 CInode
*diri
= pdn
->get_dir()->get_inode();
9275 reply
->ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(),
9276 in
->get_version()));
9280 reply
->hint
= in
->authority().first
;
9282 } else if (err
< 0) {
9283 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, MDS_RANK_NONE
, err
);
9285 mds_rank_t hint
= MDS_RANK_NONE
;
9286 int ret
= open_ino_traverse_dir(ino
, m
, m
->ancestors
, false, false, &hint
);
9289 reply
= make_message
<MMDSOpenInoReply
>(m
->get_tid(), ino
, hint
, ret
);
9291 mds
->send_message_mds(reply
, from
);
9294 void MDCache::handle_open_ino_reply(const cref_t
<MMDSOpenInoReply
> &m
)
9296 dout(10) << "handle_open_ino_reply " << *m
<< dendl
;
9298 inodeno_t ino
= m
->ino
;
9299 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9300 auto it
= opening_inodes
.find(ino
);
9301 if (it
!= opening_inodes
.end() && it
->second
.checking
== from
) {
9302 open_ino_info_t
& info
= it
->second
;
9303 info
.checking
= MDS_RANK_NONE
;
9304 info
.checked
.insert(from
);
9306 CInode
*in
= get_inode(ino
);
9308 dout(10) << " found cached " << *in
<< dendl
;
9309 open_ino_finish(ino
, info
, in
->authority().first
);
9310 } else if (!m
->ancestors
.empty()) {
9311 dout(10) << " found ino " << ino
<< " on mds." << from
<< dendl
;
9312 if (!info
.want_replica
) {
9313 open_ino_finish(ino
, info
, from
);
9317 info
.ancestors
= m
->ancestors
;
9318 info
.auth_hint
= from
;
9319 info
.checking
= mds
->get_nodeid();
9320 info
.discover
= true;
9321 _open_ino_traverse_dir(ino
, info
, 0);
9322 } else if (m
->error
) {
9323 dout(10) << " error " << m
->error
<< " from mds." << from
<< dendl
;
9324 do_open_ino(ino
, info
, m
->error
);
9326 if (m
->hint
>= 0 && m
->hint
!= mds
->get_nodeid()) {
9327 info
.auth_hint
= m
->hint
;
9328 info
.checked
.erase(m
->hint
);
9330 do_open_ino_peer(ino
, info
);
9335 void MDCache::kick_open_ino_peers(mds_rank_t who
)
9337 dout(10) << "kick_open_ino_peers mds." << who
<< dendl
;
9339 for (map
<inodeno_t
, open_ino_info_t
>::iterator p
= opening_inodes
.begin();
9340 p
!= opening_inodes
.end();
9342 open_ino_info_t
& info
= p
->second
;
9343 if (info
.checking
== who
) {
9344 dout(10) << " kicking ino " << p
->first
<< " who was checking mds." << who
<< dendl
;
9345 info
.checking
= MDS_RANK_NONE
;
9346 do_open_ino_peer(p
->first
, info
);
9347 } else if (info
.checking
== MDS_RANK_NONE
) {
9348 dout(10) << " kicking ino " << p
->first
<< " who was waiting" << dendl
;
9349 do_open_ino_peer(p
->first
, info
);
9354 void MDCache::open_ino_batch_start()
9356 dout(10) << __func__
<< dendl
;
9357 open_ino_batch
= true;
9360 void MDCache::open_ino_batch_submit()
9362 dout(10) << __func__
<< dendl
;
9363 open_ino_batch
= false;
9365 for (auto& [dir
, p
] : open_ino_batched_fetch
) {
9366 CInode
*in
= dir
->inode
;
9367 std::vector
<dentry_key_t
> keys
;
9368 for (auto& dname
: p
.first
)
9369 keys
.emplace_back(CEPH_NOSNAP
, dname
, in
->hash_dentry_name(dname
));
9370 dir
->fetch_keys(keys
,
9371 new MDSInternalContextWrapper(mds
,
9372 new LambdaContext([this, waiters
= std::move(p
.second
)](int r
) mutable {
9373 mds
->queue_waiters_front(waiters
);
9378 mds
->logger
->inc(l_mds_openino_dir_fetch
);
9380 open_ino_batched_fetch
.clear();
9383 void MDCache::open_ino(inodeno_t ino
, int64_t pool
, MDSContext
* fin
,
9384 bool want_replica
, bool want_xlocked
,
9385 vector
<inode_backpointer_t
> *ancestors_hint
,
9386 mds_rank_t auth_hint
)
9388 dout(10) << "open_ino " << ino
<< " pool " << pool
<< " want_replica "
9389 << want_replica
<< dendl
;
9391 auto it
= opening_inodes
.find(ino
);
9392 if (it
!= opening_inodes
.end()) {
9393 open_ino_info_t
& info
= it
->second
;
9395 info
.want_replica
= true;
9396 if (want_xlocked
&& !info
.want_xlocked
) {
9397 if (!info
.ancestors
.empty()) {
9398 CInode
*diri
= get_inode(info
.ancestors
[0].dirino
);
9400 frag_t fg
= diri
->pick_dirfrag(info
.ancestors
[0].dname
);
9401 CDir
*dir
= diri
->get_dirfrag(fg
);
9402 if (dir
&& !dir
->is_auth()) {
9403 filepath
path(info
.ancestors
[0].dname
, 0);
9404 discover_path(dir
, CEPH_NOSNAP
, path
, NULL
, true);
9408 info
.want_xlocked
= true;
9411 info
.waiters
.push_back(fin
);
9413 open_ino_info_t
& info
= opening_inodes
[ino
];
9414 info
.want_replica
= want_replica
;
9415 info
.want_xlocked
= want_xlocked
;
9416 info
.tid
= ++open_ino_last_tid
;
9417 info
.pool
= pool
>= 0 ? pool
: default_file_layout
.pool_id
;
9418 info
.waiters
.push_back(fin
);
9419 if (auth_hint
!= MDS_RANK_NONE
)
9420 info
.auth_hint
= auth_hint
;
9421 if (ancestors_hint
) {
9422 info
.ancestors
= std::move(*ancestors_hint
);
9423 info
.fetch_backtrace
= false;
9424 info
.checking
= mds
->get_nodeid();
9425 _open_ino_traverse_dir(ino
, info
, 0);
9427 do_open_ino(ino
, info
, 0);
9432 /* ---------------------------- */
9435 * search for a given inode on MDS peers. optionally start with the given node.
9439 - recover from mds node failure, recovery
9443 void MDCache::find_ino_peers(inodeno_t ino
, MDSContext
*c
,
9444 mds_rank_t hint
, bool path_locked
)
9446 dout(5) << "find_ino_peers " << ino
<< " hint " << hint
<< dendl
;
9447 CInode
*in
= get_inode(ino
);
9448 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
9449 c
->complete(-CEPHFS_ESTALE
);
9454 ceph_tid_t tid
= ++find_ino_peer_last_tid
;
9455 find_ino_peer_info_t
& fip
= find_ino_peer
[tid
];
9459 fip
.path_locked
= path_locked
;
9461 _do_find_ino_peer(fip
);
9464 void MDCache::_do_find_ino_peer(find_ino_peer_info_t
& fip
)
9466 set
<mds_rank_t
> all
, active
;
9467 mds
->mdsmap
->get_mds_set(all
);
9468 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9470 dout(10) << "_do_find_ino_peer " << fip
.tid
<< " " << fip
.ino
9471 << " active " << active
<< " all " << all
9472 << " checked " << fip
.checked
9475 mds_rank_t m
= MDS_RANK_NONE
;
9476 if (fip
.hint
>= 0) {
9478 fip
.hint
= MDS_RANK_NONE
;
9480 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9481 if (*p
!= mds
->get_nodeid() &&
9482 fip
.checked
.count(*p
) == 0) {
9487 if (m
== MDS_RANK_NONE
) {
9488 all
.erase(mds
->get_nodeid());
9489 if (all
!= fip
.checked
) {
9490 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl
;
9492 dout(10) << "_do_find_ino_peer failed on " << fip
.ino
<< dendl
;
9493 fip
.fin
->complete(-CEPHFS_ESTALE
);
9494 find_ino_peer
.erase(fip
.tid
);
9498 mds
->send_message_mds(make_message
<MMDSFindIno
>(fip
.tid
, fip
.ino
), m
);
9502 void MDCache::handle_find_ino(const cref_t
<MMDSFindIno
> &m
)
9504 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
9508 dout(10) << "handle_find_ino " << *m
<< dendl
;
9509 auto r
= make_message
<MMDSFindInoReply
>(m
->tid
);
9510 CInode
*in
= get_inode(m
->ino
);
9512 in
->make_path(r
->path
);
9513 dout(10) << " have " << r
->path
<< " " << *in
<< dendl
;
9516 * If the the CInode was just created by using openc in current
9517 * auth MDS, but the client just sends a getattr request to another
9518 * replica MDS. Then here it will make a path of '#INODE-NUMBER'
9519 * only because the CInode hasn't been linked yet, and the replica
9520 * MDS will keep retrying until the auth MDS flushes the mdlog and
9521 * the C_MDS_openc_finish and link_primary_inode are called at most
9524 if (!in
->get_parent_dn() && in
->is_auth()) {
9525 mds
->mdlog
->flush();
9528 mds
->send_message_mds(r
, mds_rank_t(m
->get_source().num()));
9532 void MDCache::handle_find_ino_reply(const cref_t
<MMDSFindInoReply
> &m
)
9534 auto p
= find_ino_peer
.find(m
->tid
);
9535 if (p
!= find_ino_peer
.end()) {
9536 dout(10) << "handle_find_ino_reply " << *m
<< dendl
;
9537 find_ino_peer_info_t
& fip
= p
->second
;
9540 if (get_inode(fip
.ino
)) {
9541 dout(10) << "handle_find_ino_reply successfully found " << fip
.ino
<< dendl
;
9542 mds
->queue_waiter(fip
.fin
);
9543 find_ino_peer
.erase(p
);
9547 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9548 if (fip
.checking
== from
)
9549 fip
.checking
= MDS_RANK_NONE
;
9550 fip
.checked
.insert(from
);
9552 if (!m
->path
.empty()) {
9554 vector
<CDentry
*> trace
;
9555 CF_MDS_RetryMessageFactory
cf(mds
, m
);
9556 MDRequestRef null_ref
;
9557 int flags
= MDS_TRAVERSE_DISCOVER
;
9558 if (fip
.path_locked
)
9559 flags
|= MDS_TRAVERSE_PATH_LOCKED
;
9560 int r
= path_traverse(null_ref
, cf
, m
->path
, flags
, &trace
);
9563 dout(0) << "handle_find_ino_reply failed with " << r
<< " on " << m
->path
9564 << ", retrying" << dendl
;
9565 fip
.checked
.clear();
9566 _do_find_ino_peer(fip
);
9569 _do_find_ino_peer(fip
);
9572 dout(10) << "handle_find_ino_reply tid " << m
->tid
<< " dne" << dendl
;
9576 void MDCache::kick_find_ino_peers(mds_rank_t who
)
9578 // find_ino_peers requests we should move on from
9579 for (map
<ceph_tid_t
,find_ino_peer_info_t
>::iterator p
= find_ino_peer
.begin();
9580 p
!= find_ino_peer
.end();
9582 find_ino_peer_info_t
& fip
= p
->second
;
9583 if (fip
.checking
== who
) {
9584 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was checking mds." << who
<< dendl
;
9585 fip
.checking
= MDS_RANK_NONE
;
9586 _do_find_ino_peer(fip
);
9587 } else if (fip
.checking
== MDS_RANK_NONE
) {
9588 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was waiting" << dendl
;
9589 _do_find_ino_peer(fip
);
9594 /* ---------------------------- */
9596 int MDCache::get_num_client_requests()
9599 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
9600 p
!= active_requests
.end();
9602 MDRequestRef
& mdr
= p
->second
;
9603 if (mdr
->reqid
.name
.is_client() && !mdr
->is_peer())
9609 MDRequestRef
MDCache::request_start(const cref_t
<MClientRequest
>& req
)
9611 // did we win a forward race against a peer?
9612 if (active_requests
.count(req
->get_reqid())) {
9613 MDRequestRef
& mdr
= active_requests
[req
->get_reqid()];
9615 if (mdr
->is_peer()) {
9616 dout(10) << "request_start already had " << *mdr
<< ", waiting for finish" << dendl
;
9617 mdr
->more()->waiting_for_finish
.push_back(new C_MDS_RetryMessage(mds
, req
));
9619 dout(10) << "request_start already processing " << *mdr
<< ", dropping new msg" << dendl
;
9621 return MDRequestRef();
9624 // register new client request
9625 MDRequestImpl::Params params
;
9626 params
.reqid
= req
->get_reqid();
9627 params
.attempt
= req
->get_num_fwd();
9628 params
.client_req
= req
;
9629 params
.initiated
= req
->get_recv_stamp();
9630 params
.throttled
= req
->get_throttle_stamp();
9631 params
.all_read
= req
->get_recv_complete_stamp();
9632 params
.dispatched
= req
->get_dispatch_stamp();
9635 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9636 active_requests
[params
.reqid
] = mdr
;
9637 mdr
->set_op_stamp(req
->get_stamp());
9638 dout(7) << "request_start " << *mdr
<< dendl
;
9642 MDRequestRef
MDCache::request_start_peer(metareqid_t ri
, __u32 attempt
, const cref_t
<Message
> &m
)
9644 int by
= m
->get_source().num();
9645 MDRequestImpl::Params params
;
9647 params
.attempt
= attempt
;
9648 params
.triggering_peer_req
= m
;
9649 params
.peer_to
= by
;
9650 params
.initiated
= m
->get_recv_stamp();
9651 params
.throttled
= m
->get_throttle_stamp();
9652 params
.all_read
= m
->get_recv_complete_stamp();
9653 params
.dispatched
= m
->get_dispatch_stamp();
9655 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9656 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9657 active_requests
[mdr
->reqid
] = mdr
;
9658 dout(7) << "request_start_peer " << *mdr
<< " by mds." << by
<< dendl
;
9662 MDRequestRef
MDCache::request_start_internal(int op
)
9664 utime_t now
= ceph_clock_now();
9665 MDRequestImpl::Params params
;
9666 params
.reqid
.name
= entity_name_t::MDS(mds
->get_nodeid());
9667 params
.reqid
.tid
= mds
->issue_tid();
9668 params
.initiated
= now
;
9669 params
.throttled
= now
;
9670 params
.all_read
= now
;
9671 params
.dispatched
= now
;
9672 params
.internal_op
= op
;
9674 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9676 if (active_requests
.count(mdr
->reqid
)) {
9677 auto& _mdr
= active_requests
[mdr
->reqid
];
9678 dout(0) << __func__
<< " existing " << *_mdr
<< " op " << _mdr
->internal_op
<< dendl
;
9679 dout(0) << __func__
<< " new " << *mdr
<< " op " << op
<< dendl
;
9682 active_requests
[mdr
->reqid
] = mdr
;
9683 dout(7) << __func__
<< " " << *mdr
<< " op " << op
<< dendl
;
9687 MDRequestRef
MDCache::request_get(metareqid_t rid
)
9689 ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.find(rid
);
9690 ceph_assert(p
!= active_requests
.end());
9691 dout(7) << "request_get " << rid
<< " " << *p
->second
<< dendl
;
9695 void MDCache::request_finish(MDRequestRef
& mdr
)
9697 dout(7) << "request_finish " << *mdr
<< dendl
;
9698 mdr
->mark_event("finishing request");
9701 if (mdr
->has_more() && mdr
->more()->peer_commit
) {
9702 Context
*fin
= mdr
->more()->peer_commit
;
9703 mdr
->more()->peer_commit
= 0;
9706 mdr
->aborted
= false;
9708 mdr
->more()->peer_rolling_back
= true;
9711 mdr
->committing
= true;
9713 fin
->complete(ret
); // this must re-call request_finish.
9717 switch(mdr
->internal_op
) {
9718 case CEPH_MDS_OP_FRAGMENTDIR
:
9719 logger
->inc(l_mdss_ireq_fragmentdir
);
9721 case CEPH_MDS_OP_EXPORTDIR
:
9722 logger
->inc(l_mdss_ireq_exportdir
);
9724 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9725 logger
->inc(l_mdss_ireq_enqueue_scrub
);
9727 case CEPH_MDS_OP_FLUSH
:
9728 logger
->inc(l_mdss_ireq_flush
);
9730 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9731 logger
->inc(l_mdss_ireq_fragstats
);
9733 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9734 logger
->inc(l_mdss_ireq_inodestats
);
9738 request_cleanup(mdr
);
9742 void MDCache::request_forward(MDRequestRef
& mdr
, mds_rank_t who
, int port
)
9744 CachedStackStringStream css
;
9745 *css
<< "forwarding request to mds." << who
;
9746 mdr
->mark_event(css
->strv());
9747 if (mdr
->client_request
&& mdr
->client_request
->get_source().is_client()) {
9748 dout(7) << "request_forward " << *mdr
<< " to mds." << who
<< " req "
9749 << *mdr
->client_request
<< dendl
;
9750 if (mdr
->is_batch_head()) {
9751 mdr
->release_batch_op()->forward(who
);
9753 mds
->forward_message_mds(mdr
, who
);
9755 if (mds
->logger
) mds
->logger
->inc(l_mds_forward
);
9756 } else if (mdr
->internal_op
>= 0) {
9757 dout(10) << "request_forward on internal op; cancelling" << dendl
;
9758 mdr
->internal_op_finish
->complete(-CEPHFS_EXDEV
);
9760 dout(7) << "request_forward drop " << *mdr
<< " req " << *mdr
->client_request
9761 << " was from mds" << dendl
;
9763 request_cleanup(mdr
);
9767 void MDCache::dispatch_request(MDRequestRef
& mdr
)
9769 if (mdr
->client_request
) {
9770 mds
->server
->dispatch_client_request(mdr
);
9771 } else if (mdr
->peer_request
) {
9772 mds
->server
->dispatch_peer_request(mdr
);
9774 switch (mdr
->internal_op
) {
9775 case CEPH_MDS_OP_FRAGMENTDIR
:
9776 dispatch_fragment_dir(mdr
);
9778 case CEPH_MDS_OP_EXPORTDIR
:
9779 migrator
->dispatch_export_dir(mdr
, 0);
9781 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9782 enqueue_scrub_work(mdr
);
9784 case CEPH_MDS_OP_FLUSH
:
9785 flush_dentry_work(mdr
);
9787 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9788 repair_dirfrag_stats_work(mdr
);
9790 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9791 repair_inode_stats_work(mdr
);
9793 case CEPH_MDS_OP_RDLOCK_FRAGSSTATS
:
9794 rdlock_dirfrags_stats_work(mdr
);
9803 void MDCache::request_drop_foreign_locks(MDRequestRef
& mdr
)
9805 if (!mdr
->has_more())
9809 // (will implicitly drop remote dn pins)
9810 for (set
<mds_rank_t
>::iterator p
= mdr
->more()->peers
.begin();
9811 p
!= mdr
->more()->peers
.end();
9813 auto r
= make_message
<MMDSPeerRequest
>(mdr
->reqid
, mdr
->attempt
,
9814 MMDSPeerRequest::OP_FINISH
);
9816 if (mdr
->killed
&& !mdr
->committing
) {
9818 } else if (mdr
->more()->srcdn_auth_mds
== *p
&&
9819 mdr
->more()->inode_import
.length() > 0) {
9820 // information about rename imported caps
9821 r
->inode_export
= std::move(mdr
->more()->inode_import
);
9824 mds
->send_message_mds(r
, *p
);
9827 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9828 * implicitly. Note that we don't call the finishers -- there shouldn't
9829 * be any on a remote lock and the request finish wakes up all
9830 * the waiters anyway! */
9832 for (auto it
= mdr
->locks
.begin(); it
!= mdr
->locks
.end(); ) {
9833 SimpleLock
*lock
= it
->lock
;
9834 if (it
->is_xlock() && !lock
->get_parent()->is_auth()) {
9835 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9836 << " on " << lock
->get_parent() << dendl
;
9838 mdr
->locks
.erase(it
++);
9839 } else if (it
->is_remote_wrlock()) {
9840 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9841 << " on mds." << it
->wrlock_target
<< " on " << *lock
->get_parent() << dendl
;
9842 if (it
->is_wrlock()) {
9843 it
->clear_remote_wrlock();
9846 mdr
->locks
.erase(it
++);
9853 mdr
->more()->peers
.clear(); /* we no longer have requests out to them, and
9854 * leaving them in can cause double-notifies as
9855 * this function can get called more than once */
9858 void MDCache::request_drop_non_rdlocks(MDRequestRef
& mdr
)
9860 request_drop_foreign_locks(mdr
);
9861 mds
->locker
->drop_non_rdlocks(mdr
.get());
9864 void MDCache::request_drop_locks(MDRequestRef
& mdr
)
9866 request_drop_foreign_locks(mdr
);
9867 mds
->locker
->drop_locks(mdr
.get());
9870 void MDCache::request_cleanup(MDRequestRef
& mdr
)
9872 dout(15) << "request_cleanup " << *mdr
<< dendl
;
9874 if (mdr
->has_more()) {
9875 if (mdr
->more()->is_ambiguous_auth
)
9876 mdr
->clear_ambiguous_auth();
9877 if (!mdr
->more()->waiting_for_finish
.empty())
9878 mds
->queue_waiters(mdr
->more()->waiting_for_finish
);
9881 request_drop_locks(mdr
);
9883 // drop (local) auth pins
9884 mdr
->drop_local_auth_pins();
9887 mdr
->put_stickydirs();
9889 mds
->locker
->kick_cap_releases(mdr
);
9894 // remove from session
9895 mdr
->item_session_request
.remove_myself();
9898 active_requests
.erase(mdr
->reqid
);
9903 mdr
->mark_event("cleaned up request");
9906 void MDCache::request_kill(MDRequestRef
& mdr
)
9908 // rollback peer requests is tricky. just let the request proceed.
9909 if (mdr
->has_more() &&
9910 (!mdr
->more()->witnessed
.empty() || !mdr
->more()->waiting_on_peer
.empty())) {
9911 if (!(mdr
->locking_state
& MutationImpl::ALL_LOCKED
)) {
9912 ceph_assert(mdr
->more()->witnessed
.empty());
9913 mdr
->aborted
= true;
9914 dout(10) << "request_kill " << *mdr
<< " -- waiting for peer reply, delaying" << dendl
;
9916 dout(10) << "request_kill " << *mdr
<< " -- already started peer prep, no-op" << dendl
;
9919 ceph_assert(mdr
->used_prealloc_ino
== 0);
9920 ceph_assert(mdr
->prealloc_inos
.empty());
9922 mdr
->session
= NULL
;
9923 mdr
->item_session_request
.remove_myself();
9928 mdr
->mark_event("killing request");
9930 if (mdr
->committing
) {
9931 dout(10) << "request_kill " << *mdr
<< " -- already committing, remove it from sesssion requests" << dendl
;
9932 mdr
->item_session_request
.remove_myself();
9934 dout(10) << "request_kill " << *mdr
<< dendl
;
9935 request_cleanup(mdr
);
9939 // -------------------------------------------------------------------------------
9942 void MDCache::create_global_snaprealm()
9944 CInode
*in
= new CInode(this); // dummy inode
9945 create_unlinked_system_inode(in
, CEPH_INO_GLOBAL_SNAPREALM
, S_IFDIR
|0755);
9947 global_snaprealm
= in
->snaprealm
;
9950 void MDCache::do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
)
9952 dout(10) << "do_realm_invalidate_and_update_notify " << *in
->snaprealm
<< " " << *in
<< dendl
;
9954 vector
<inodeno_t
> split_inos
;
9955 vector
<inodeno_t
> split_realms
;
9957 if (notify_clients
) {
9958 if (snapop
== CEPH_SNAP_OP_SPLIT
) {
9959 // notify clients of update|split
9960 for (auto p
= in
->snaprealm
->inodes_with_caps
.begin(); !p
.end(); ++p
)
9961 split_inos
.push_back((*p
)->ino());
9963 for (auto& r
: in
->snaprealm
->open_children
)
9964 split_realms
.push_back(r
->inode
->ino());
9968 map
<client_t
, ref_t
<MClientSnap
>> updates
;
9970 q
.push_back(in
->snaprealm
);
9971 while (!q
.empty()) {
9972 SnapRealm
*realm
= q
.front();
9975 dout(10) << " realm " << *realm
<< " on " << *realm
->inode
<< dendl
;
9976 realm
->invalidate_cached_snaps();
9978 if (notify_clients
) {
9979 for (const auto& p
: realm
->client_caps
) {
9980 const auto& client
= p
.first
;
9981 const auto& caps
= p
.second
;
9982 ceph_assert(!caps
->empty());
9984 auto em
= updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple());
9986 auto update
= make_message
<MClientSnap
>(CEPH_SNAP_OP_SPLIT
);
9987 update
->head
.split
= in
->ino();
9988 update
->split_inos
= split_inos
;
9989 update
->split_realms
= split_realms
;
9990 update
->bl
= mds
->server
->get_snap_trace(em
.first
->first
, in
->snaprealm
);
9991 em
.first
->second
= std::move(update
);
9996 // notify for active children, too.
9997 dout(10) << " " << realm
<< " open_children are " << realm
->open_children
<< dendl
;
9998 for (auto& r
: realm
->open_children
)
10002 if (notify_clients
)
10003 send_snaps(updates
);
10006 void MDCache::send_snap_update(CInode
*in
, version_t stid
, int snap_op
)
10008 dout(10) << __func__
<< " " << *in
<< " stid " << stid
<< dendl
;
10009 ceph_assert(in
->is_auth());
10011 set
<mds_rank_t
> mds_set
;
10013 mds
->mdsmap
->get_mds_set_lower_bound(mds_set
, MDSMap::STATE_RESOLVE
);
10014 mds_set
.erase(mds
->get_nodeid());
10016 in
->list_replicas(mds_set
);
10019 if (!mds_set
.empty()) {
10020 bufferlist snap_blob
;
10021 in
->encode_snap(snap_blob
);
10023 for (auto p
: mds_set
) {
10024 auto m
= make_message
<MMDSSnapUpdate
>(in
->ino(), stid
, snap_op
);
10025 m
->snap_blob
= snap_blob
;
10026 mds
->send_message_mds(m
, p
);
10031 notify_global_snaprealm_update(snap_op
);
10034 void MDCache::handle_snap_update(const cref_t
<MMDSSnapUpdate
> &m
)
10036 mds_rank_t from
= mds_rank_t(m
->get_source().num());
10037 dout(10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
10039 if (mds
->get_state() < MDSMap::STATE_RESOLVE
&&
10040 mds
->get_want_state() != CEPH_MDS_STATE_RESOLVE
) {
10044 // null rejoin_done means open_snaprealms() has already been called
10045 bool notify_clients
= mds
->get_state() > MDSMap::STATE_REJOIN
||
10046 (mds
->is_rejoin() && !rejoin_done
);
10048 if (m
->get_tid() > 0) {
10049 mds
->snapclient
->notify_commit(m
->get_tid());
10050 if (notify_clients
)
10051 notify_global_snaprealm_update(m
->get_snap_op());
10054 CInode
*in
= get_inode(m
->get_ino());
10056 ceph_assert(!in
->is_auth());
10057 if (mds
->get_state() > MDSMap::STATE_REJOIN
||
10058 (mds
->is_rejoin() && !in
->is_rejoining())) {
10059 auto p
= m
->snap_blob
.cbegin();
10060 in
->decode_snap(p
);
10062 if (!notify_clients
) {
10063 if (!rejoin_pending_snaprealms
.count(in
)) {
10064 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
10065 rejoin_pending_snaprealms
.insert(in
);
10068 do_realm_invalidate_and_update_notify(in
, m
->get_snap_op(), notify_clients
);
10073 void MDCache::notify_global_snaprealm_update(int snap_op
)
10075 if (snap_op
!= CEPH_SNAP_OP_DESTROY
)
10076 snap_op
= CEPH_SNAP_OP_UPDATE
;
10077 set
<Session
*> sessions
;
10078 mds
->sessionmap
.get_client_session_set(sessions
);
10079 for (auto &session
: sessions
) {
10080 if (!session
->is_open() && !session
->is_stale())
10082 auto update
= make_message
<MClientSnap
>(snap_op
);
10083 update
->head
.split
= global_snaprealm
->inode
->ino();
10084 update
->bl
= mds
->server
->get_snap_trace(session
, global_snaprealm
);
10085 mds
->send_message_client_counted(update
, session
);
10089 // -------------------------------------------------------------------------------
10092 struct C_MDC_RetryScanStray
: public MDCacheContext
{
10094 C_MDC_RetryScanStray(MDCache
*c
, dirfrag_t n
) : MDCacheContext(c
), next(n
) { }
10095 void finish(int r
) override
{
10096 mdcache
->scan_stray_dir(next
);
10100 void MDCache::scan_stray_dir(dirfrag_t next
)
10102 dout(10) << "scan_stray_dir " << next
<< dendl
;
10105 next
.frag
= strays
[MDS_INO_STRAY_INDEX(next
.ino
)]->dirfragtree
[next
.frag
.value()];
10107 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
10108 if (strays
[i
]->ino() < next
.ino
)
10111 std::vector
<CDir
*> ls
;
10112 strays
[i
]->get_dirfrags(ls
);
10114 for (const auto& dir
: ls
) {
10115 if (dir
->get_frag() < next
.frag
)
10118 if (!dir
->can_auth_pin()) {
10119 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_RetryScanStray(this, dir
->dirfrag()));
10123 if (!dir
->is_complete()) {
10124 dir
->fetch(new C_MDC_RetryScanStray(this, dir
->dirfrag()));
10128 for (auto &p
: dir
->items
) {
10129 CDentry
*dn
= p
.second
;
10130 dn
->state_set(CDentry::STATE_STRAY
);
10131 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
10132 if (dnl
->is_primary()) {
10133 CInode
*in
= dnl
->get_inode();
10134 if (in
->get_inode()->nlink
== 0)
10135 in
->state_set(CInode::STATE_ORPHAN
);
10136 maybe_eval_stray(in
);
10140 next
.frag
= frag_t();
10144 void MDCache::fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
)
10146 object_t oid
= CInode::get_object_name(ino
, frag_t(), "");
10147 mds
->objecter
->getxattr(oid
, object_locator_t(pool
), "parent", CEPH_NOSNAP
, &bl
, 0, fin
);
10149 mds
->logger
->inc(l_mds_openino_backtrace_fetch
);
10156 // ========================================================================================
10160 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10161 to the parent metadata object in the cache (pinning it).
10163 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
10167 void MDCache::_send_discover(discover_info_t
& d
)
10169 auto dis
= make_message
<MDiscover
>(d
.ino
, d
.frag
, d
.snap
, d
.want_path
,
10170 d
.want_base_dir
, d
.path_locked
);
10171 logger
->inc(l_mdc_dir_send_discover
);
10172 dis
->set_tid(d
.tid
);
10173 mds
->send_message_mds(dis
, d
.mds
);
10176 void MDCache::discover_base_ino(inodeno_t want_ino
,
10177 MDSContext
*onfinish
,
10180 dout(7) << "discover_base_ino " << want_ino
<< " from mds." << from
<< dendl
;
10181 if (waiting_for_base_ino
[from
].count(want_ino
) == 0) {
10182 discover_info_t
& d
= _create_discover(from
);
10186 waiting_for_base_ino
[from
][want_ino
].push_back(onfinish
);
10190 void MDCache::discover_dir_frag(CInode
*base
,
10192 MDSContext
*onfinish
,
10196 from
= base
->authority().first
;
10198 dirfrag_t
df(base
->ino(), approx_fg
);
10199 dout(7) << "discover_dir_frag " << df
10200 << " from mds." << from
<< dendl
;
10202 if (!base
->is_waiting_for_dir(approx_fg
) || !onfinish
) {
10203 discover_info_t
& d
= _create_discover(from
);
10205 d
.ino
= base
->ino();
10206 d
.frag
= approx_fg
;
10207 d
.want_base_dir
= true;
10212 base
->add_dir_waiter(approx_fg
, onfinish
);
10215 struct C_MDC_RetryDiscoverPath
: public MDCacheContext
{
10220 C_MDC_RetryDiscoverPath(MDCache
*c
, CInode
*b
, snapid_t s
, filepath
&p
, mds_rank_t f
) :
10221 MDCacheContext(c
), base(b
), snapid(s
), path(p
), from(f
) {}
10222 void finish(int r
) override
{
10223 mdcache
->discover_path(base
, snapid
, path
, 0, from
);
10227 void MDCache::discover_path(CInode
*base
,
10229 filepath want_path
,
10230 MDSContext
*onfinish
,
10235 from
= base
->authority().first
;
10237 dout(7) << "discover_path " << base
->ino() << " " << want_path
<< " snap " << snap
<< " from mds." << from
10238 << (path_locked
? " path_locked":"")
10241 if (base
->is_ambiguous_auth()) {
10242 dout(10) << " waiting for single auth on " << *base
<< dendl
;
10244 onfinish
= new C_MDC_RetryDiscoverPath(this, base
, snap
, want_path
, from
);
10245 base
->add_waiter(CInode::WAIT_SINGLEAUTH
, onfinish
);
10247 } else if (from
== mds
->get_nodeid()) {
10248 MDSContext::vec finished
;
10249 base
->take_waiting(CInode::WAIT_DIR
, finished
);
10250 mds
->queue_waiters(finished
);
10254 frag_t fg
= base
->pick_dirfrag(want_path
[0]);
10255 if ((path_locked
&& want_path
.depth() == 1) ||
10256 !base
->is_waiting_for_dir(fg
) || !onfinish
) {
10257 discover_info_t
& d
= _create_discover(from
);
10258 d
.ino
= base
->ino();
10262 d
.want_path
= want_path
;
10263 d
.want_base_dir
= true;
10264 d
.path_locked
= path_locked
;
10270 base
->add_dir_waiter(fg
, onfinish
);
10273 struct C_MDC_RetryDiscoverPath2
: public MDCacheContext
{
10277 C_MDC_RetryDiscoverPath2(MDCache
*c
, CDir
*b
, snapid_t s
, filepath
&p
) :
10278 MDCacheContext(c
), base(b
), snapid(s
), path(p
) {}
10279 void finish(int r
) override
{
10280 mdcache
->discover_path(base
, snapid
, path
, 0);
10284 void MDCache::discover_path(CDir
*base
,
10286 filepath want_path
,
10287 MDSContext
*onfinish
,
10290 mds_rank_t from
= base
->authority().first
;
10292 dout(7) << "discover_path " << base
->dirfrag() << " " << want_path
<< " snap " << snap
<< " from mds." << from
10293 << (path_locked
? " path_locked":"")
10296 if (base
->is_ambiguous_auth()) {
10297 dout(7) << " waiting for single auth on " << *base
<< dendl
;
10299 onfinish
= new C_MDC_RetryDiscoverPath2(this, base
, snap
, want_path
);
10300 base
->add_waiter(CDir::WAIT_SINGLEAUTH
, onfinish
);
10304 if ((path_locked
&& want_path
.depth() == 1) ||
10305 !base
->is_waiting_for_dentry(want_path
[0].c_str(), snap
) || !onfinish
) {
10306 discover_info_t
& d
= _create_discover(from
);
10307 d
.ino
= base
->ino();
10308 d
.pin_base(base
->inode
);
10309 d
.frag
= base
->get_frag();
10311 d
.want_path
= want_path
;
10312 d
.want_base_dir
= false;
10313 d
.path_locked
= path_locked
;
10319 base
->add_dentry_waiter(want_path
[0], snap
, onfinish
);
10322 void MDCache::kick_discovers(mds_rank_t who
)
10324 for (map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.begin();
10325 p
!= discovers
.end();
10327 if (p
->second
.mds
!= who
)
10329 _send_discover(p
->second
);
10334 void MDCache::handle_discover(const cref_t
<MDiscover
> &dis
)
10336 mds_rank_t whoami
= mds
->get_nodeid();
10337 mds_rank_t from
= mds_rank_t(dis
->get_source().num());
10339 ceph_assert(from
!= whoami
);
10341 if (mds
->get_state() <= MDSMap::STATE_REJOIN
) {
10342 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
10343 mds
->get_want_state() < CEPH_MDS_STATE_REJOIN
) {
10347 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10348 // delay processing request from survivor because we may not yet choose lock states.
10349 if (!mds
->mdsmap
->is_rejoin(from
)) {
10350 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl
;
10351 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, dis
));
10358 auto reply
= make_message
<MDiscoverReply
>(*dis
);
10360 snapid_t snapid
= dis
->get_snapid();
10362 logger
->inc(l_mdc_dir_handle_discover
);
10365 if (MDS_INO_IS_BASE(dis
->get_base_ino()) &&
10366 !dis
->wants_base_dir() && dis
->get_want().depth() == 0) {
10368 dout(7) << "handle_discover from mds." << from
10369 << " wants base + " << dis
->get_want().get_path()
10370 << " snap " << snapid
10373 cur
= get_inode(dis
->get_base_ino());
10377 reply
->starts_with
= MDiscoverReply::INODE
;
10378 encode_replica_inode(cur
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10379 dout(10) << "added base " << *cur
<< dendl
;
10382 // there's a base inode
10383 cur
= get_inode(dis
->get_base_ino(), snapid
);
10384 if (!cur
&& snapid
!= CEPH_NOSNAP
) {
10385 cur
= get_inode(dis
->get_base_ino());
10386 if (cur
&& !cur
->is_multiversion())
10387 cur
= NULL
; // nope!
10391 dout(7) << "handle_discover mds." << from
10392 << " don't have base ino " << dis
->get_base_ino() << "." << snapid
10394 if (!dis
->wants_base_dir() && dis
->get_want().depth() > 0)
10395 reply
->set_error_dentry(dis
->get_dentry(0));
10396 reply
->set_flag_error_dir();
10397 } else if (dis
->wants_base_dir()) {
10398 dout(7) << "handle_discover mds." << from
10399 << " wants basedir+" << dis
->get_want().get_path()
10403 dout(7) << "handle_discover mds." << from
10404 << " wants " << dis
->get_want().get_path()
10410 ceph_assert(reply
);
10413 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10414 for (unsigned i
= 0;
10415 cur
&& (i
< dis
->get_want().depth() || dis
->get_want().depth() == 0);
10418 // -- figure out the dir
10420 // is *cur even a dir at all?
10421 if (!cur
->is_dir()) {
10422 dout(7) << *cur
<< " not a dir" << dendl
;
10423 reply
->set_flag_error_dir();
10429 if (dis
->get_want().depth()) {
10430 // dentry specifies
10431 fg
= cur
->pick_dirfrag(dis
->get_dentry(i
));
10433 // requester explicity specified the frag
10434 ceph_assert(dis
->wants_base_dir() || MDS_INO_IS_BASE(dis
->get_base_ino()));
10435 fg
= dis
->get_base_dir_frag();
10436 if (!cur
->dirfragtree
.is_leaf(fg
))
10437 fg
= cur
->dirfragtree
[fg
.value()];
10439 CDir
*curdir
= cur
->get_dirfrag(fg
);
10441 if ((!curdir
&& !cur
->is_auth()) ||
10442 (curdir
&& !curdir
->is_auth())) {
10445 * ONLY set flag if empty!!
10446 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10447 * resulting in duplicate discovers in flight,
10448 * which can wreak havoc when discovering rename srcdn (which may move)
10451 if (reply
->is_empty()) {
10452 // only hint if empty.
10453 // someday this could be better, but right now the waiter logic isn't smart enough.
10457 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir
<< dendl
;
10458 reply
->set_dir_auth_hint(curdir
->authority().first
);
10460 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10462 reply
->set_dir_auth_hint(cur
->authority().first
);
10465 // note error dentry, if any
10466 // NOTE: important, as it allows requester to issue an equivalent discover
10467 // to whomever we hint at.
10468 if (dis
->get_want().depth() > i
)
10469 reply
->set_error_dentry(dis
->get_dentry(i
));
10475 if (!curdir
) { // open dir?
10476 if (cur
->is_frozen()) {
10477 if (!reply
->is_empty()) {
10478 dout(7) << *cur
<< " is frozen, non-empty reply, stopping" << dendl
;
10481 dout(7) << *cur
<< " is frozen, empty reply, waiting" << dendl
;
10482 cur
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10485 curdir
= cur
->get_or_open_dirfrag(this, fg
);
10486 } else if (curdir
->is_frozen_tree() ||
10487 (curdir
->is_frozen_dir() && fragment_are_all_frozen(curdir
))) {
10488 if (!reply
->is_empty()) {
10489 dout(7) << *curdir
<< " is frozen, non-empty reply, stopping" << dendl
;
10492 if (dis
->wants_base_dir() && dis
->get_base_dir_frag() != curdir
->get_frag()) {
10493 dout(7) << *curdir
<< " is frozen, dirfrag mismatch, stopping" << dendl
;
10494 reply
->set_flag_error_dir();
10497 dout(7) << *curdir
<< " is frozen, empty reply, waiting" << dendl
;
10498 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10503 if (curdir
->get_version() == 0) {
10504 // fetch newly opened dir
10505 } else if (reply
->is_empty() && !dis
->wants_base_dir()) {
10506 dout(7) << "handle_discover not adding unwanted base dir " << *curdir
<< dendl
;
10507 // make sure the base frag is correct, though, in there was a refragment since the
10508 // original request was sent.
10509 reply
->set_base_dir_frag(curdir
->get_frag());
10511 ceph_assert(!curdir
->is_ambiguous_auth()); // would be frozen.
10512 if (!reply
->trace
.length())
10513 reply
->starts_with
= MDiscoverReply::DIR;
10514 encode_replica_dir(curdir
, from
, reply
->trace
);
10515 dout(7) << "handle_discover added dir " << *curdir
<< dendl
;
10520 std::string_view dname
;
10521 if (dis
->get_want().depth() > 0)
10522 dname
= dis
->get_dentry(i
);
10523 if (curdir
->get_version() == 0) {
10524 // fetch newly opened dir
10525 ceph_assert(!curdir
->has_bloom());
10526 } else if (dname
.size() > 0) {
10528 dn
= curdir
->lookup(dname
, snapid
);
10534 if (!curdir
->is_complete() &&
10535 !(dname
.size() > 0 &&
10536 snapid
== CEPH_NOSNAP
&&
10537 curdir
->has_bloom() &&
10538 !curdir
->is_in_bloom(dname
))) {
10540 dout(7) << "incomplete dir contents for " << *curdir
<< ", fetching" << dendl
;
10541 if (reply
->is_empty()) {
10543 curdir
->fetch(dname
, snapid
, new C_MDS_RetryMessage(mds
, dis
),
10544 dis
->wants_base_dir() && curdir
->get_version() == 0);
10547 // initiate fetch, but send what we have so far
10548 curdir
->fetch(dname
, snapid
, nullptr);
10553 if (snapid
!= CEPH_NOSNAP
&& !reply
->is_empty()) {
10554 dout(7) << "dentry " << dis
->get_dentry(i
) << " snap " << snapid
10555 << " dne, non-empty reply, stopping" << dendl
;
10559 // send null dentry
10560 dout(7) << "dentry " << dis
->get_dentry(i
) << " dne, returning null in "
10561 << *curdir
<< dendl
;
10562 if (snapid
== CEPH_NOSNAP
)
10563 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
));
10565 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
), snapid
, snapid
);
10569 // don't add replica to purging dentry/inode
10570 if (dn
->state_test(CDentry::STATE_PURGING
)) {
10571 if (reply
->is_empty())
10572 reply
->set_flag_error_dn(dis
->get_dentry(i
));
10576 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10579 // ...always block on non-tail items (they are unrelated)
10580 // ...allow xlocked tail disocvery _only_ if explicitly requested
10581 if (dn
->lock
.is_xlocked()) {
10582 // is this the last (tail) item in the discover traversal?
10583 if (dis
->is_path_locked()) {
10584 dout(7) << "handle_discover allowing discovery of xlocked " << *dn
<< dendl
;
10585 } else if (reply
->is_empty()) {
10586 dout(7) << "handle_discover blocking on xlocked " << *dn
<< dendl
;
10587 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryMessage(mds
, dis
));
10590 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn
<< dendl
;
10596 bool tailitem
= (dis
->get_want().depth() == 0) || (i
== dis
->get_want().depth() - 1);
10597 if (dnl
->is_primary() && dnl
->get_inode()->is_frozen_inode()) {
10598 if (tailitem
&& dis
->is_path_locked()) {
10599 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl
->get_inode() << dendl
;
10600 } else if (reply
->is_empty()) {
10601 dout(7) << *dnl
->get_inode() << " is frozen, empty reply, waiting" << dendl
;
10602 dnl
->get_inode()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10605 dout(7) << *dnl
->get_inode() << " is frozen, non-empty reply, stopping" << dendl
;
10611 if (!reply
->trace
.length())
10612 reply
->starts_with
= MDiscoverReply::DENTRY
;
10613 encode_replica_dentry(dn
, from
, reply
->trace
);
10614 dout(7) << "handle_discover added dentry " << *dn
<< dendl
;
10616 if (!dnl
->is_primary()) break; // stop on null or remote link.
10619 CInode
*next
= dnl
->get_inode();
10620 ceph_assert(next
->is_auth());
10622 encode_replica_inode(next
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10623 dout(7) << "handle_discover added inode " << *next
<< dendl
;
10625 // descend, keep going.
10631 ceph_assert(!reply
->is_empty());
10632 dout(7) << "handle_discover sending result back to asker mds." << from
<< dendl
;
10633 mds
->send_message(reply
, dis
->get_connection());
10636 void MDCache::handle_discover_reply(const cref_t
<MDiscoverReply
> &m
)
10639 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10640 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10644 dout(7) << "discover_reply " << *m
<< dendl
;
10645 if (m
->is_flag_error_dir())
10646 dout(7) << " flag error, dir" << dendl
;
10647 if (m
->is_flag_error_dn())
10648 dout(7) << " flag error, dentry = " << m
->get_error_dentry() << dendl
;
10650 MDSContext::vec finished
, error
;
10651 mds_rank_t from
= mds_rank_t(m
->get_source().num());
10654 CInode
*cur
= get_inode(m
->get_base_ino());
10655 auto p
= m
->trace
.cbegin();
10657 int next
= m
->starts_with
;
10659 // decrement discover counters
10660 if (m
->get_tid()) {
10661 map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.find(m
->get_tid());
10662 if (p
!= discovers
.end()) {
10663 dout(10) << " found tid " << m
->get_tid() << dendl
;
10664 discovers
.erase(p
);
10666 dout(10) << " tid " << m
->get_tid() << " not found, must be dup reply" << dendl
;
10670 // discover may start with an inode
10671 if (!p
.end() && next
== MDiscoverReply::INODE
) {
10672 decode_replica_inode(cur
, p
, NULL
, finished
);
10673 dout(7) << "discover_reply got base inode " << *cur
<< dendl
;
10674 ceph_assert(cur
->is_base());
10676 next
= MDiscoverReply::DIR;
10679 if (cur
->is_base() &&
10680 waiting_for_base_ino
[from
].count(cur
->ino())) {
10681 finished
.swap(waiting_for_base_ino
[from
][cur
->ino()]);
10682 waiting_for_base_ino
[from
].erase(cur
->ino());
10687 // loop over discover results.
10688 // indexes follow each ([[dir] dentry] inode)
10689 // can start, end with any type.
10693 CDir
*curdir
= nullptr;
10694 if (next
== MDiscoverReply::DIR) {
10695 decode_replica_dir(curdir
, p
, cur
, mds_rank_t(m
->get_source().num()), finished
);
10696 if (cur
->ino() == m
->get_base_ino() && curdir
->get_frag() != m
->get_base_dir_frag()) {
10697 ceph_assert(m
->get_wanted_base_dir());
10698 cur
->take_dir_waiting(m
->get_base_dir_frag(), finished
);
10701 // note: this can only happen our first way around this loop.
10702 if (p
.end() && m
->is_flag_error_dn()) {
10703 fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10704 curdir
= cur
->get_dirfrag(fg
);
10706 curdir
= cur
->get_dirfrag(m
->get_base_dir_frag());
10713 CDentry
*dn
= nullptr;
10714 decode_replica_dentry(dn
, p
, curdir
, finished
);
10720 decode_replica_inode(cur
, p
, dn
, finished
);
10722 next
= MDiscoverReply::DIR;
10726 // or dir_auth hint?
10727 if (m
->is_flag_error_dir() && !cur
->is_dir()) {
10729 cur
->take_waiting(CInode::WAIT_DIR
, error
);
10730 } else if (m
->is_flag_error_dir() || m
->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN
) {
10731 mds_rank_t who
= m
->get_dir_auth_hint();
10732 if (who
== mds
->get_nodeid()) who
= -1;
10734 dout(7) << " dir_auth_hint is " << m
->get_dir_auth_hint() << dendl
;
10737 if (m
->get_wanted_base_dir()) {
10738 frag_t fg
= m
->get_base_dir_frag();
10739 CDir
*dir
= cur
->get_dirfrag(fg
);
10741 if (cur
->is_waiting_for_dir(fg
)) {
10742 if (cur
->is_auth())
10743 cur
->take_waiting(CInode::WAIT_DIR
, finished
);
10744 else if (dir
|| !cur
->dirfragtree
.is_leaf(fg
))
10745 cur
->take_dir_waiting(fg
, finished
);
10747 discover_dir_frag(cur
, fg
, 0, who
);
10749 dout(7) << " doing nothing, nobody is waiting for dir" << dendl
;
10753 if (m
->get_error_dentry().length()) {
10754 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10755 CDir
*dir
= cur
->get_dirfrag(fg
);
10757 if (dir
&& dir
->is_waiting_for_dentry(m
->get_error_dentry(), m
->get_wanted_snapid())) {
10758 if (dir
->is_auth() || dir
->lookup(m
->get_error_dentry())) {
10759 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10760 m
->get_wanted_snapid(), finished
);
10762 filepath
relpath(m
->get_error_dentry(), 0);
10763 discover_path(dir
, m
->get_wanted_snapid(), relpath
, 0, m
->is_path_locked());
10766 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10767 << m
->get_error_dentry() << dendl
;
10769 } else if (m
->is_flag_error_dn()) {
10770 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10771 CDir
*dir
= cur
->get_dirfrag(fg
);
10772 if (dir
&& !dir
->is_auth()) {
10773 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10774 m
->get_wanted_snapid(), error
);
10779 finish_contexts(g_ceph_context
, error
, -CEPHFS_ENOENT
); // finish errors directly
10780 mds
->queue_waiters(finished
);
10785 // ----------------------------
10789 void MDCache::encode_replica_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
)
10791 ENCODE_START(1, 1, bl
);
10792 dirfrag_t df
= dir
->dirfrag();
10794 __u32 nonce
= dir
->add_replica(to
);
10796 dir
->_encode_base(bl
);
10800 void MDCache::encode_replica_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
)
10802 ENCODE_START(2, 1, bl
);
10803 encode(dn
->get_name(), bl
);
10804 encode(dn
->last
, bl
);
10806 __u32 nonce
= dn
->add_replica(to
);
10808 encode(dn
->first
, bl
);
10809 encode(dn
->linkage
.remote_ino
, bl
);
10810 encode(dn
->linkage
.remote_d_type
, bl
);
10811 dn
->lock
.encode_state_for_replica(bl
);
10812 bool need_recover
= mds
->get_state() < MDSMap::STATE_ACTIVE
;
10813 encode(need_recover
, bl
);
10814 encode(dn
->alternate_name
, bl
);
10818 void MDCache::encode_replica_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
10821 ceph_assert(in
->is_auth());
10823 ENCODE_START(2, 1, bl
);
10824 encode(in
->ino(), bl
); // bleh, minor assymetry here
10825 encode(in
->last
, bl
);
10827 __u32 nonce
= in
->add_replica(to
);
10830 in
->_encode_base(bl
, features
);
10831 in
->_encode_locks_state_for_replica(bl
, mds
->get_state() < MDSMap::STATE_ACTIVE
);
10833 __u32 state
= in
->state
;
10839 void MDCache::decode_replica_dir(CDir
*&dir
, bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
,
10840 MDSContext::vec
& finished
)
10842 DECODE_START(1, p
);
10846 ceph_assert(diri
->ino() == df
.ino
);
10848 // add it (_replica_)
10849 dir
= diri
->get_dirfrag(df
.frag
);
10852 // had replica. update w/ new nonce.
10855 dir
->set_replica_nonce(nonce
);
10856 dir
->_decode_base(p
);
10857 dout(7) << __func__
<< " had " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10859 // force frag to leaf in the diri tree
10860 if (!diri
->dirfragtree
.is_leaf(df
.frag
)) {
10861 dout(7) << __func__
<< " forcing frag " << df
.frag
<< " to leaf in the fragtree "
10862 << diri
->dirfragtree
<< dendl
;
10863 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, df
.frag
);
10866 dir
= diri
->add_dirfrag( new CDir(diri
, df
.frag
, this, false) );
10869 dir
->set_replica_nonce(nonce
);
10870 dir
->_decode_base(p
);
10871 // is this a dir_auth delegation boundary?
10872 if (from
!= diri
->authority().first
||
10873 diri
->is_ambiguous_auth() ||
10875 adjust_subtree_auth(dir
, from
);
10877 dout(7) << __func__
<< " added " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10879 diri
->take_dir_waiting(df
.frag
, finished
);
10884 void MDCache::decode_replica_dentry(CDentry
*&dn
, bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
)
10886 DECODE_START(1, p
);
10892 dn
= dir
->lookup(name
, last
);
10895 bool is_new
= false;
10898 dout(7) << __func__
<< " had " << *dn
<< dendl
;
10901 dn
= dir
->add_null_dentry(name
, 1 /* this will get updated below */, last
);
10902 dout(7) << __func__
<< " added " << *dn
<< dendl
;
10907 dn
->set_replica_nonce(nonce
);
10908 decode(dn
->first
, p
);
10911 unsigned char rdtype
;
10914 dn
->lock
.decode_state(p
, is_new
);
10917 decode(need_recover
, p
);
10919 mempool::mds_co::string alternate_name
;
10920 if (struct_v
>= 2) {
10921 decode(alternate_name
, p
);
10925 dn
->set_alternate_name(std::move(alternate_name
));
10927 dir
->link_remote_inode(dn
, rino
, rdtype
);
10929 dn
->lock
.mark_need_recover();
10931 ceph_assert(dn
->alternate_name
== alternate_name
);
10934 dir
->take_dentry_waiting(name
, dn
->first
, dn
->last
, finished
);
10938 void MDCache::decode_replica_inode(CInode
*&in
, bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
)
10940 DECODE_START(2, p
);
10947 in
= get_inode(ino
, last
);
10949 in
= new CInode(this, false, 2, last
);
10950 in
->set_replica_nonce(nonce
);
10951 in
->_decode_base(p
);
10952 in
->_decode_locks_state_for_replica(p
, true);
10954 if (in
->ino() == CEPH_INO_ROOT
)
10955 in
->inode_auth
.first
= 0;
10956 else if (in
->is_mdsdir())
10957 in
->inode_auth
.first
= in
->ino() - MDS_INO_MDSDIR_OFFSET
;
10958 dout(10) << __func__
<< " added " << *in
<< dendl
;
10960 ceph_assert(dn
->get_linkage()->is_null());
10961 dn
->dir
->link_primary_inode(dn
, in
);
10964 in
->set_replica_nonce(nonce
);
10965 in
->_decode_base(p
);
10966 in
->_decode_locks_state_for_replica(p
, false);
10967 dout(10) << __func__
<< " had " << *in
<< dendl
;
10971 if (!dn
->get_linkage()->is_primary() || dn
->get_linkage()->get_inode() != in
)
10972 dout(10) << __func__
<< " different linkage in dentry " << *dn
<< dendl
;
10975 if (struct_v
>= 2) {
10978 s
&= CInode::MASK_STATE_REPLICATED
;
10979 if (s
& CInode::STATE_RANDEPHEMERALPIN
) {
10980 dout(10) << "replica inode is random ephemeral pinned" << dendl
;
10981 in
->set_ephemeral_pin(false, true);
10989 void MDCache::encode_replica_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
)
10991 ceph_assert(straydn
->get_num_auth_pins());
10992 ENCODE_START(2, 1, bl
);
10993 uint64_t features
= mds
->mdsmap
->get_up_features();
10994 encode_replica_inode(get_myin(), who
, bl
, features
);
10995 encode_replica_dir(straydn
->get_dir()->inode
->get_parent_dn()->get_dir(), who
, bl
);
10996 encode_replica_dentry(straydn
->get_dir()->inode
->get_parent_dn(), who
, bl
);
10997 encode_replica_inode(straydn
->get_dir()->inode
, who
, bl
, features
);
10998 encode_replica_dir(straydn
->get_dir(), who
, bl
);
10999 encode_replica_dentry(straydn
, who
, bl
);
11000 if (!straydn
->get_projected_linkage()->is_null()) {
11001 encode_replica_inode(straydn
->get_projected_linkage()->get_inode(), who
, bl
, features
);
11006 void MDCache::decode_replica_stray(CDentry
*&straydn
, CInode
**in
, const bufferlist
&bl
, mds_rank_t from
)
11008 MDSContext::vec finished
;
11009 auto p
= bl
.cbegin();
11011 DECODE_START(2, p
);
11012 CInode
*mdsin
= nullptr;
11013 decode_replica_inode(mdsin
, p
, NULL
, finished
);
11014 CDir
*mdsdir
= nullptr;
11015 decode_replica_dir(mdsdir
, p
, mdsin
, from
, finished
);
11016 CDentry
*straydirdn
= nullptr;
11017 decode_replica_dentry(straydirdn
, p
, mdsdir
, finished
);
11018 CInode
*strayin
= nullptr;
11019 decode_replica_inode(strayin
, p
, straydirdn
, finished
);
11020 CDir
*straydir
= nullptr;
11021 decode_replica_dir(straydir
, p
, strayin
, from
, finished
);
11023 decode_replica_dentry(straydn
, p
, straydir
, finished
);
11024 if (struct_v
>= 2 && in
) {
11025 decode_replica_inode(*in
, p
, straydn
, finished
);
11027 if (!finished
.empty())
11028 mds
->queue_waiters(finished
);
11033 int MDCache::send_dir_updates(CDir
*dir
, bool bcast
)
11035 // this is an FYI, re: replication
11037 set
<mds_rank_t
> who
;
11039 set
<mds_rank_t
> mds_set
;
11040 mds
->get_mds_map()->get_active_mds_set(mds_set
);
11042 set
<mds_rank_t
> replica_set
;
11043 for (const auto &p
: dir
->get_replicas()) {
11044 replica_set
.insert(p
.first
);
11047 std::set_difference(mds_set
.begin(), mds_set
.end(),
11048 replica_set
.begin(), replica_set
.end(),
11049 std::inserter(who
, who
.end()));
11051 for (const auto &p
: dir
->get_replicas()) {
11052 who
.insert(p
.first
);
11056 dout(7) << "sending dir_update on " << *dir
<< " bcast " << bcast
<< " to " << who
<< dendl
;
11059 dir
->inode
->make_path(path
);
11061 std::set
<int32_t> dir_rep_set
;
11062 for (const auto &r
: dir
->dir_rep_by
) {
11063 dir_rep_set
.insert(r
);
11066 mds_rank_t whoami
= mds
->get_nodeid();
11067 for (set
<mds_rank_t
>::iterator it
= who
.begin();
11070 if (*it
== whoami
) continue;
11071 //if (*it == except) continue;
11072 dout(7) << "sending dir_update on " << *dir
<< " to " << *it
<< dendl
;
11074 logger
->inc(l_mdc_dir_update
);
11075 mds
->send_message_mds(make_message
<MDirUpdate
>(mds
->get_nodeid(), dir
->dirfrag(), dir
->dir_rep
, dir_rep_set
, path
, bcast
), *it
);
11081 void MDCache::handle_dir_update(const cref_t
<MDirUpdate
> &m
)
11083 dirfrag_t df
= m
->get_dirfrag();
11084 CDir
*dir
= get_dirfrag(df
);
11085 logger
->inc(l_mdc_dir_update_receipt
);
11087 dout(5) << "dir_update on " << df
<< ", don't have it" << dendl
;
11090 if (m
->should_discover()) {
11092 // this is key to avoid a fragtree update race, among other things.
11093 m
->inc_tried_discover();
11094 vector
<CDentry
*> trace
;
11096 filepath path
= m
->get_path();
11097 dout(5) << "trying discover on dir_update for " << path
<< dendl
;
11098 logger
->inc(l_mdc_dir_try_discover
);
11099 CF_MDS_RetryMessageFactory
cf(mds
, m
);
11100 MDRequestRef null_ref
;
11101 int r
= path_traverse(null_ref
, cf
, path
, MDS_TRAVERSE_DISCOVER
, &trace
, &in
);
11105 in
->ino() == df
.ino
&&
11106 in
->get_approx_dirfrag(df
.frag
) == NULL
) {
11107 open_remote_dirfrag(in
, df
.frag
, new C_MDS_RetryMessage(mds
, m
));
11115 if (!m
->has_tried_discover()) {
11116 // Update if it already exists. Othwerwise it got updated by discover reply.
11117 dout(5) << "dir_update on " << *dir
<< dendl
;
11118 dir
->dir_rep
= m
->get_dir_rep();
11119 dir
->dir_rep_by
.clear();
11120 for (const auto &e
: m
->get_dir_rep_by()) {
11121 dir
->dir_rep_by
.insert(e
);
11132 void MDCache::encode_remote_dentry_link(CDentry::linkage_t
*dnl
, bufferlist
& bl
)
11134 ENCODE_START(1, 1, bl
);
11135 inodeno_t ino
= dnl
->get_remote_ino();
11137 __u8 d_type
= dnl
->get_remote_d_type();
11138 encode(d_type
, bl
);
11142 void MDCache::decode_remote_dentry_link(CDir
*dir
, CDentry
*dn
, bufferlist::const_iterator
& p
)
11144 DECODE_START(1, p
);
11149 dout(10) << __func__
<< " remote " << ino
<< " " << d_type
<< dendl
;
11150 dir
->link_remote_inode(dn
, ino
, d_type
);
11154 void MDCache::send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
)
11156 dout(7) << __func__
<< " " << *dn
<< dendl
;
11158 CDir
*subtree
= get_subtree_root(dn
->get_dir());
11159 for (const auto &p
: dn
->get_replicas()) {
11160 // don't tell (rename) witnesses; they already know
11161 if (mdr
.get() && mdr
->more()->witnessed
.count(p
.first
))
11163 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
11164 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
11165 rejoin_gather
.count(p
.first
)))
11167 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11168 auto m
= make_message
<MDentryLink
>(subtree
->dirfrag(), dn
->get_dir()->dirfrag(), dn
->get_name(), dnl
->is_primary());
11169 if (dnl
->is_primary()) {
11170 dout(10) << __func__
<< " primary " << *dnl
->get_inode() << dendl
;
11171 encode_replica_inode(dnl
->get_inode(), p
.first
, m
->bl
,
11172 mds
->mdsmap
->get_up_features());
11173 } else if (dnl
->is_remote()) {
11174 encode_remote_dentry_link(dnl
, m
->bl
);
11176 ceph_abort(); // aie, bad caller!
11177 mds
->send_message_mds(m
, p
.first
);
11181 void MDCache::handle_dentry_link(const cref_t
<MDentryLink
> &m
)
11183 CDentry
*dn
= NULL
;
11184 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
11186 dout(7) << __func__
<< " don't have dirfrag " << m
->get_dirfrag() << dendl
;
11188 dn
= dir
->lookup(m
->get_dn());
11190 dout(7) << __func__
<< " don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
11192 dout(7) << __func__
<< " on " << *dn
<< dendl
;
11193 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11195 ceph_assert(!dn
->is_auth());
11196 ceph_assert(dnl
->is_null());
11200 auto p
= m
->bl
.cbegin();
11201 MDSContext::vec finished
;
11203 if (m
->get_is_primary()) {
11205 CInode
*in
= nullptr;
11206 decode_replica_inode(in
, p
, dn
, finished
);
11208 // remote link, easy enough.
11209 decode_remote_dentry_link(dir
, dn
, p
);
11215 if (!finished
.empty())
11216 mds
->queue_waiters(finished
);
11224 void MDCache::send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
)
11226 dout(10) << __func__
<< " " << *dn
<< dendl
;
11227 // share unlink news with replicas
11228 set
<mds_rank_t
> replicas
;
11229 dn
->list_replicas(replicas
);
11232 straydn
->list_replicas(replicas
);
11233 CInode
*strayin
= straydn
->get_linkage()->get_inode();
11234 strayin
->encode_snap_blob(snapbl
);
11236 for (set
<mds_rank_t
>::iterator it
= replicas
.begin();
11237 it
!= replicas
.end();
11239 // don't tell (rmdir) witnesses; they already know
11240 if (mdr
.get() && mdr
->more()->witnessed
.count(*it
))
11243 if (mds
->mdsmap
->get_state(*it
) < MDSMap::STATE_REJOIN
||
11244 (mds
->mdsmap
->get_state(*it
) == MDSMap::STATE_REJOIN
&&
11245 rejoin_gather
.count(*it
)))
11248 auto unlink
= make_message
<MDentryUnlink
>(dn
->get_dir()->dirfrag(), dn
->get_name());
11250 encode_replica_stray(straydn
, *it
, unlink
->straybl
);
11251 unlink
->snapbl
= snapbl
;
11253 mds
->send_message_mds(unlink
, *it
);
11257 void MDCache::handle_dentry_unlink(const cref_t
<MDentryUnlink
> &m
)
11260 CDentry
*straydn
= nullptr;
11261 CInode
*strayin
= nullptr;
11262 if (m
->straybl
.length())
11263 decode_replica_stray(straydn
, &strayin
, m
->straybl
, mds_rank_t(m
->get_source().num()));
11265 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
11267 dout(7) << __func__
<< " don't have dirfrag " << m
->get_dirfrag() << dendl
;
11269 CDentry
*dn
= dir
->lookup(m
->get_dn());
11271 dout(7) << __func__
<< " don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
11273 dout(7) << __func__
<< " on " << *dn
<< dendl
;
11274 CDentry::linkage_t
*dnl
= dn
->get_linkage();
11277 if (dnl
->is_primary()) {
11278 CInode
*in
= dnl
->get_inode();
11279 dn
->dir
->unlink_inode(dn
);
11280 ceph_assert(straydn
);
11281 straydn
->dir
->link_primary_inode(straydn
, in
);
11283 // in->first is lazily updated on replica; drag it forward so
11284 // that we always keep it in sync with the dnq
11285 ceph_assert(straydn
->first
>= in
->first
);
11286 in
->first
= straydn
->first
;
11288 // update subtree map?
11290 adjust_subtree_after_rename(in
, dir
, false);
11292 if (m
->snapbl
.length()) {
11293 bool hadrealm
= (in
->snaprealm
? true : false);
11294 in
->decode_snap_blob(m
->snapbl
);
11295 ceph_assert(in
->snaprealm
);
11297 do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
11300 // send caps to auth (if we're not already)
11301 if (in
->is_any_caps() &&
11302 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
11303 migrator
->export_caps(in
);
11307 ceph_assert(!straydn
);
11308 ceph_assert(dnl
->is_remote());
11309 dn
->dir
->unlink_inode(dn
);
11311 ceph_assert(dnl
->is_null());
11315 // race with trim_dentry()
11317 ceph_assert(straydn
->get_num_ref() == 0);
11318 ceph_assert(straydn
->get_linkage()->is_null());
11320 trim_dentry(straydn
, ex
);
11321 send_expire_messages(ex
);
11330 // ===================================================================
11334 // ===================================================================
11339 * adjust_dir_fragments -- adjust fragmentation for a directory
11341 * @param diri directory inode
11342 * @param basefrag base fragment
11343 * @param bits bit adjustment. positive for split, negative for merge.
11345 void MDCache::adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
11346 std::vector
<CDir
*>* resultfrags
,
11347 MDSContext::vec
& waiters
,
11350 dout(10) << "adjust_dir_fragments " << basefrag
<< " " << bits
11351 << " on " << *diri
<< dendl
;
11353 auto&& p
= diri
->get_dirfrags_under(basefrag
);
11355 adjust_dir_fragments(diri
, p
.second
, basefrag
, bits
, resultfrags
, waiters
, replay
);
11358 CDir
*MDCache::force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
)
11360 CDir
*dir
= diri
->get_dirfrag(fg
);
11364 dout(10) << "force_dir_fragment " << fg
<< " on " << *diri
<< dendl
;
11366 std::vector
<CDir
*> src
, result
;
11367 MDSContext::vec waiters
;
11370 frag_t parent
= diri
->dirfragtree
.get_branch_or_leaf(fg
);
11372 CDir
*pdir
= diri
->get_dirfrag(parent
);
11374 int split
= fg
.bits() - parent
.bits();
11375 dout(10) << " splitting parent by " << split
<< " " << *pdir
<< dendl
;
11376 src
.push_back(pdir
);
11377 adjust_dir_fragments(diri
, src
, parent
, split
, &result
, waiters
, replay
);
11378 dir
= diri
->get_dirfrag(fg
);
11380 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
11384 if (parent
== frag_t())
11386 frag_t last
= parent
;
11387 parent
= parent
.parent();
11388 dout(10) << " " << last
<< " parent is " << parent
<< dendl
;
11392 // hoover up things under fg?
11394 auto&& p
= diri
->get_dirfrags_under(fg
);
11395 src
.insert(std::end(src
), std::cbegin(p
.second
), std::cend(p
.second
));
11398 dout(10) << "force_dir_fragment no frags under " << fg
<< dendl
;
11400 dout(10) << " will combine frags under " << fg
<< ": " << src
<< dendl
;
11401 adjust_dir_fragments(diri
, src
, fg
, 0, &result
, waiters
, replay
);
11402 dir
= result
.front();
11403 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
11407 mds
->queue_waiters(waiters
);
11411 void MDCache::adjust_dir_fragments(CInode
*diri
,
11412 const std::vector
<CDir
*>& srcfrags
,
11413 frag_t basefrag
, int bits
,
11414 std::vector
<CDir
*>* resultfrags
,
11415 MDSContext::vec
& waiters
,
11418 dout(10) << "adjust_dir_fragments " << basefrag
<< " bits " << bits
11419 << " srcfrags " << srcfrags
11420 << " on " << *diri
<< dendl
;
11423 // yuck. we may have discovered the inode while it was being fragmented.
11424 if (!diri
->dirfragtree
.is_leaf(basefrag
))
11425 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, basefrag
);
11428 diri
->dirfragtree
.split(basefrag
, bits
);
11429 dout(10) << " new fragtree is " << diri
->dirfragtree
<< dendl
;
11431 if (srcfrags
.empty())
11435 CDir
*parent_dir
= diri
->get_parent_dir();
11436 CDir
*parent_subtree
= 0;
11438 parent_subtree
= get_subtree_root(parent_dir
);
11440 ceph_assert(srcfrags
.size() >= 1);
11443 ceph_assert(srcfrags
.size() == 1);
11444 CDir
*dir
= srcfrags
.front();
11446 dir
->split(bits
, resultfrags
, waiters
, replay
);
11448 // did i change the subtree map?
11449 if (dir
->is_subtree_root()) {
11450 // new frags are now separate subtrees
11451 for (const auto& dir
: *resultfrags
) {
11452 subtrees
[dir
].clear(); // new frag is now its own subtree
11456 if (parent_subtree
) {
11457 ceph_assert(subtrees
[parent_subtree
].count(dir
));
11458 subtrees
[parent_subtree
].erase(dir
);
11459 for (const auto& dir
: *resultfrags
) {
11460 ceph_assert(dir
->is_subtree_root());
11461 subtrees
[parent_subtree
].insert(dir
);
11465 // adjust my bounds.
11467 bounds
.swap(subtrees
[dir
]);
11468 subtrees
.erase(dir
);
11469 for (set
<CDir
*>::iterator p
= bounds
.begin();
11472 CDir
*frag
= get_subtree_root((*p
)->get_parent_dir());
11473 subtrees
[frag
].insert(*p
);
11479 diri
->close_dirfrag(dir
->get_frag());
11484 // are my constituent bits subtrees? if so, i will be too.
11485 // (it's all or none, actually.)
11486 bool any_subtree
= false, any_non_subtree
= false;
11487 for (const auto& dir
: srcfrags
) {
11488 if (dir
->is_subtree_root())
11489 any_subtree
= true;
11491 any_non_subtree
= true;
11493 ceph_assert(!any_subtree
|| !any_non_subtree
);
11495 set
<CDir
*> new_bounds
;
11497 for (const auto& dir
: srcfrags
) {
11498 // this simplifies the code that find subtrees underneath the dirfrag
11499 if (!dir
->is_subtree_root()) {
11500 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11501 adjust_subtree_auth(dir
, mds
->get_nodeid());
11505 for (const auto& dir
: srcfrags
) {
11506 ceph_assert(dir
->is_subtree_root());
11507 dout(10) << " taking srcfrag subtree bounds from " << *dir
<< dendl
;
11508 map
<CDir
*, set
<CDir
*> >::iterator q
= subtrees
.find(dir
);
11509 set
<CDir
*>::iterator r
= q
->second
.begin();
11510 while (r
!= subtrees
[dir
].end()) {
11511 new_bounds
.insert(*r
);
11512 subtrees
[dir
].erase(r
++);
11516 // remove myself as my parent's bound
11517 if (parent_subtree
)
11518 subtrees
[parent_subtree
].erase(dir
);
11523 CDir
*f
= new CDir(diri
, basefrag
, this, srcfrags
.front()->is_auth());
11524 f
->merge(srcfrags
, waiters
, replay
);
11527 ceph_assert(f
->is_subtree_root());
11528 subtrees
[f
].swap(new_bounds
);
11529 if (parent_subtree
)
11530 subtrees
[parent_subtree
].insert(f
);
11535 resultfrags
->push_back(f
);
11540 class C_MDC_FragmentFrozen
: public MDSInternalContext
{
11544 C_MDC_FragmentFrozen(MDCache
*m
, MDRequestRef
& r
) :
11545 MDSInternalContext(m
->mds
), mdcache(m
), mdr(r
) {}
11546 void finish(int r
) override
{
11547 mdcache
->fragment_frozen(mdr
, r
);
11551 bool MDCache::can_fragment(CInode
*diri
, const std::vector
<CDir
*>& dirs
)
11553 if (is_readonly()) {
11554 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl
;
11557 if (mds
->is_cluster_degraded()) {
11558 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl
;
11561 if (diri
->get_parent_dir() &&
11562 diri
->get_parent_dir()->get_inode()->is_stray()) {
11563 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl
;
11566 if (diri
->is_mdsdir() || diri
->ino() == CEPH_INO_CEPH
) {
11567 dout(7) << "can_fragment: i won't fragment mdsdir or .ceph" << dendl
;
11571 for (const auto& dir
: dirs
) {
11572 if (dir
->scrub_is_in_progress()) {
11573 dout(7) << "can_fragment: scrub in progress " << *dir
<< dendl
;
11577 if (dir
->state_test(CDir::STATE_FRAGMENTING
)) {
11578 dout(7) << "can_fragment: already fragmenting " << *dir
<< dendl
;
11581 if (!dir
->is_auth()) {
11582 dout(7) << "can_fragment: not auth on " << *dir
<< dendl
;
11585 if (dir
->is_bad()) {
11586 dout(7) << "can_fragment: bad dirfrag " << *dir
<< dendl
;
11589 if (dir
->is_frozen() ||
11590 dir
->is_freezing()) {
11591 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl
;
11599 void MDCache::split_dir(CDir
*dir
, int bits
)
11601 dout(7) << __func__
<< " " << *dir
<< " bits " << bits
<< dendl
;
11602 ceph_assert(dir
->is_auth());
11603 CInode
*diri
= dir
->inode
;
11605 std::vector
<CDir
*> dirs
;
11606 dirs
.push_back(dir
);
11608 if (!can_fragment(diri
, dirs
)) {
11609 dout(7) << __func__
<< " cannot fragment right now, dropping" << dendl
;
11613 if (dir
->frag
.bits() + bits
> 24) {
11614 dout(7) << __func__
<< " frag bits > 24, dropping" << dendl
;
11618 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11619 mdr
->more()->fragment_base
= dir
->dirfrag();
11621 ceph_assert(fragments
.count(dir
->dirfrag()) == 0);
11622 fragment_info_t
& info
= fragments
[dir
->dirfrag()];
11624 info
.dirs
.push_back(dir
);
11626 info
.last_cum_auth_pins_change
= ceph_clock_now();
11628 fragment_freeze_dirs(dirs
);
11629 // initial mark+complete pass
11630 fragment_mark_and_complete(mdr
);
11633 void MDCache::merge_dir(CInode
*diri
, frag_t frag
)
11635 dout(7) << "merge_dir to " << frag
<< " on " << *diri
<< dendl
;
11637 auto&& [all
, dirs
] = diri
->get_dirfrags_under(frag
);
11639 dout(7) << "don't have all frags under " << frag
<< " for " << *diri
<< dendl
;
11643 if (diri
->dirfragtree
.is_leaf(frag
)) {
11644 dout(10) << " " << frag
<< " already a leaf for " << *diri
<< dendl
;
11648 if (!can_fragment(diri
, dirs
))
11651 CDir
*first
= dirs
.front();
11652 int bits
= first
->get_frag().bits() - frag
.bits();
11653 dout(10) << " we are merging by " << bits
<< " bits" << dendl
;
11655 dirfrag_t
basedirfrag(diri
->ino(), frag
);
11656 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11657 mdr
->more()->fragment_base
= basedirfrag
;
11659 ceph_assert(fragments
.count(basedirfrag
) == 0);
11660 fragment_info_t
& info
= fragments
[basedirfrag
];
11664 info
.last_cum_auth_pins_change
= ceph_clock_now();
11666 fragment_freeze_dirs(dirs
);
11667 // initial mark+complete pass
11668 fragment_mark_and_complete(mdr
);
11671 void MDCache::fragment_freeze_dirs(const std::vector
<CDir
*>& dirs
)
11673 bool any_subtree
= false, any_non_subtree
= false;
11674 for (const auto& dir
: dirs
) {
11675 dir
->auth_pin(dir
); // until we mark and complete them
11676 dir
->state_set(CDir::STATE_FRAGMENTING
);
11678 ceph_assert(dir
->is_freezing_dir());
11680 if (dir
->is_subtree_root())
11681 any_subtree
= true;
11683 any_non_subtree
= true;
11686 if (any_subtree
&& any_non_subtree
) {
11687 // either all dirfrags are subtree roots or all are not.
11688 for (const auto& dir
: dirs
) {
11689 if (dir
->is_subtree_root()) {
11690 ceph_assert(dir
->state_test(CDir::STATE_AUXSUBTREE
));
11692 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11693 adjust_subtree_auth(dir
, mds
->get_nodeid());
11699 class C_MDC_FragmentMarking
: public MDCacheContext
{
11702 C_MDC_FragmentMarking(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11703 void finish(int r
) override
{
11704 mdcache
->fragment_mark_and_complete(mdr
);
11708 void MDCache::fragment_mark_and_complete(MDRequestRef
& mdr
)
11710 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11711 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11712 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11713 dout(7) << "fragment_mark_and_complete " << basedirfrag
<< " must have aborted" << dendl
;
11714 request_finish(mdr
);
11718 fragment_info_t
& info
= it
->second
;
11719 CInode
*diri
= info
.dirs
.front()->get_inode();
11720 dout(10) << "fragment_mark_and_complete " << info
.dirs
<< " on " << *diri
<< dendl
;
11722 MDSGatherBuilder
gather(g_ceph_context
);
11724 for (const auto& dir
: info
.dirs
) {
11726 if (!dir
->is_complete()) {
11727 dout(15) << " fetching incomplete " << *dir
<< dendl
;
11728 dir
->fetch(gather
.new_sub(), true); // ignore authpinnability
11730 } else if (dir
->get_frag() == frag_t()) {
11731 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11732 // the operation. To avoid CDir::fetch() complaining about missing object,
11733 // we commit new dirfrag first.
11734 if (dir
->state_test(CDir::STATE_CREATING
)) {
11735 dout(15) << " waiting until new dir gets journaled " << *dir
<< dendl
;
11736 dir
->add_waiter(CDir::WAIT_CREATED
, gather
.new_sub());
11738 } else if (dir
->is_new()) {
11739 dout(15) << " committing new " << *dir
<< dendl
;
11740 ceph_assert(dir
->is_dirty());
11741 dir
->commit(0, gather
.new_sub(), true);
11748 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11749 dout(15) << " marking " << *dir
<< dendl
;
11750 for (auto &p
: dir
->items
) {
11751 CDentry
*dn
= p
.second
;
11752 dn
->get(CDentry::PIN_FRAGMENTING
);
11753 ceph_assert(!dn
->state_test(CDentry::STATE_FRAGMENTING
));
11754 dn
->state_set(CDentry::STATE_FRAGMENTING
);
11756 dir
->state_set(CDir::STATE_DNPINNEDFRAG
);
11757 dir
->auth_unpin(dir
);
11759 dout(15) << " already marked " << *dir
<< dendl
;
11762 if (gather
.has_subs()) {
11763 gather
.set_finisher(new C_MDC_FragmentMarking(this, mdr
));
11768 for (const auto& dir
: info
.dirs
) {
11769 if (!dir
->is_frozen_dir()) {
11770 ceph_assert(dir
->is_freezing_dir());
11771 dir
->add_waiter(CDir::WAIT_FROZEN
, gather
.new_sub());
11774 if (gather
.has_subs()) {
11775 gather
.set_finisher(new C_MDC_FragmentFrozen(this, mdr
));
11777 // flush log so that request auth_pins are retired
11778 mds
->mdlog
->flush();
11782 fragment_frozen(mdr
, 0);
11785 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector
<CDir
*>& dirs
)
11787 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs
<< dendl
;
11788 for (const auto& dir
: dirs
) {
11789 dout(10) << " frag " << *dir
<< dendl
;
11791 ceph_assert(dir
->state_test(CDir::STATE_FRAGMENTING
));
11792 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11794 if (dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11795 dir
->state_clear(CDir::STATE_DNPINNEDFRAG
);
11797 for (auto &p
: dir
->items
) {
11798 CDentry
*dn
= p
.second
;
11799 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11800 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11801 dn
->put(CDentry::PIN_FRAGMENTING
);
11804 dir
->auth_unpin(dir
);
11807 dir
->unfreeze_dir();
11811 bool MDCache::fragment_are_all_frozen(CDir
*dir
)
11813 ceph_assert(dir
->is_frozen_dir());
11814 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11815 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11816 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11818 if (p
->first
.frag
.contains(dir
->get_frag()))
11819 return p
->second
.all_frozen
;
11825 void MDCache::fragment_freeze_inc_num_waiters(CDir
*dir
)
11827 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11828 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11829 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11831 if (p
->first
.frag
.contains(dir
->get_frag())) {
11832 p
->second
.num_remote_waiters
++;
11839 void MDCache::find_stale_fragment_freeze()
11841 dout(10) << "find_stale_fragment_freeze" << dendl
;
11842 // see comment in Migrator::find_stale_export_freeze()
11843 utime_t now
= ceph_clock_now();
11844 utime_t cutoff
= now
;
11845 cutoff
-= g_conf()->mds_freeze_tree_timeout
;
11847 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
11848 p
!= fragments
.end(); ) {
11849 dirfrag_t df
= p
->first
;
11850 fragment_info_t
& info
= p
->second
;
11852 if (info
.all_frozen
)
11855 int total_auth_pins
= 0;
11856 for (const auto& d
: info
.dirs
) {
11858 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11859 total_auth_pins
= -1;
11862 if (dir
->is_frozen_dir())
11864 total_auth_pins
+= dir
->get_auth_pins() + dir
->get_dir_auth_pins();
11866 if (total_auth_pins
< 0)
11868 if (info
.last_cum_auth_pins
!= total_auth_pins
) {
11869 info
.last_cum_auth_pins
= total_auth_pins
;
11870 info
.last_cum_auth_pins_change
= now
;
11873 if (info
.last_cum_auth_pins_change
>= cutoff
)
11875 dir
= info
.dirs
.front();
11876 if (info
.num_remote_waiters
> 0 ||
11877 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
11878 dout(10) << " cancel fragmenting " << df
<< " bit " << info
.bits
<< dendl
;
11879 std::vector
<CDir
*> dirs
;
11880 info
.dirs
.swap(dirs
);
11881 fragments
.erase(df
);
11882 fragment_unmark_unfreeze_dirs(dirs
);
11887 class C_MDC_FragmentPrep
: public MDCacheLogContext
{
11890 C_MDC_FragmentPrep(MDCache
*m
, MDRequestRef
& r
) : MDCacheLogContext(m
), mdr(r
) {}
11891 void finish(int r
) override
{
11892 mdcache
->_fragment_logged(mdr
);
11896 class C_MDC_FragmentStore
: public MDCacheContext
{
11899 C_MDC_FragmentStore(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11900 void finish(int r
) override
{
11901 mdcache
->_fragment_stored(mdr
);
11905 class C_MDC_FragmentCommit
: public MDCacheLogContext
{
11906 dirfrag_t basedirfrag
;
11909 C_MDC_FragmentCommit(MDCache
*m
, dirfrag_t df
, const MDRequestRef
& r
) :
11910 MDCacheLogContext(m
), basedirfrag(df
), mdr(r
) {}
11911 void finish(int r
) override
{
11912 mdcache
->_fragment_committed(basedirfrag
, mdr
);
11916 class C_IO_MDC_FragmentPurgeOld
: public MDCacheIOContext
{
11917 dirfrag_t basedirfrag
;
11921 C_IO_MDC_FragmentPurgeOld(MDCache
*m
, dirfrag_t f
, int b
,
11922 const MDRequestRef
& r
) :
11923 MDCacheIOContext(m
), basedirfrag(f
), bits(b
), mdr(r
) {}
11924 void finish(int r
) override
{
11925 ceph_assert(r
== 0 || r
== -CEPHFS_ENOENT
);
11926 mdcache
->_fragment_old_purged(basedirfrag
, bits
, mdr
);
11928 void print(ostream
& out
) const override
{
11929 out
<< "fragment_purge_old(" << basedirfrag
<< ")";
11933 void MDCache::fragment_frozen(MDRequestRef
& mdr
, int r
)
11935 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11936 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11937 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11938 dout(7) << "fragment_frozen " << basedirfrag
<< " must have aborted" << dendl
;
11939 request_finish(mdr
);
11943 ceph_assert(r
== 0);
11944 fragment_info_t
& info
= it
->second
;
11945 dout(10) << "fragment_frozen " << basedirfrag
.frag
<< " by " << info
.bits
11946 << " on " << info
.dirs
.front()->get_inode() << dendl
;
11948 info
.all_frozen
= true;
11949 dispatch_fragment_dir(mdr
);
11952 void MDCache::dispatch_fragment_dir(MDRequestRef
& mdr
)
11954 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11955 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11956 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11957 dout(7) << "dispatch_fragment_dir " << basedirfrag
<< " must have aborted" << dendl
;
11958 request_finish(mdr
);
11962 fragment_info_t
& info
= it
->second
;
11963 CInode
*diri
= info
.dirs
.front()->get_inode();
11965 dout(10) << "dispatch_fragment_dir " << basedirfrag
<< " bits " << info
.bits
11966 << " on " << *diri
<< dendl
;
11968 if (mdr
->more()->peer_error
)
11969 mdr
->aborted
= true;
11971 if (!mdr
->aborted
) {
11972 MutationImpl::LockOpVec lov
;
11973 lov
.add_wrlock(&diri
->dirfragtreelock
);
11974 // prevent a racing gather on any other scatterlocks too
11975 lov
.lock_scatter_gather(&diri
->nestlock
);
11976 lov
.lock_scatter_gather(&diri
->filelock
);
11977 if (!mds
->locker
->acquire_locks(mdr
, lov
, NULL
, true)) {
11983 if (mdr
->aborted
) {
11984 dout(10) << " can't auth_pin " << *diri
<< ", requeuing dir "
11985 << info
.dirs
.front()->dirfrag() << dendl
;
11987 mds
->balancer
->queue_split(info
.dirs
.front(), false);
11989 mds
->balancer
->queue_merge(info
.dirs
.front());
11990 fragment_unmark_unfreeze_dirs(info
.dirs
);
11991 fragments
.erase(it
);
11992 request_finish(mdr
);
11996 mdr
->ls
= mds
->mdlog
->get_current_segment();
11997 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_PREPARE
, basedirfrag
, info
.bits
);
11998 mds
->mdlog
->start_entry(le
);
12000 for (const auto& dir
: info
.dirs
) {
12001 dirfrag_rollback rollback
;
12002 rollback
.fnode
= dir
->fnode
;
12003 le
->add_orig_frag(dir
->get_frag(), &rollback
);
12007 MDSContext::vec waiters
;
12008 adjust_dir_fragments(diri
, info
.dirs
, basedirfrag
.frag
, info
.bits
,
12009 &info
.resultfrags
, waiters
, false);
12010 if (g_conf()->mds_debug_frag
)
12011 diri
->verify_dirfrags();
12012 mds
->queue_waiters(waiters
);
12014 for (const auto& fg
: le
->orig_frags
)
12015 ceph_assert(!diri
->dirfragtree
.is_leaf(fg
));
12017 le
->metablob
.add_dir_context(info
.resultfrags
.front());
12018 for (const auto& dir
: info
.resultfrags
) {
12019 if (diri
->is_auth()) {
12020 le
->metablob
.add_fragmented_dir(dir
, false, false);
12022 dir
->state_set(CDir::STATE_DIRTYDFT
);
12023 le
->metablob
.add_fragmented_dir(dir
, false, true);
12028 if (diri
->is_auth()) {
12029 // journal dirfragtree
12030 auto pi
= diri
->project_inode(mdr
);
12031 pi
.inode
->version
= diri
->pre_dirty();
12032 predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
);
12033 journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
12035 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
12036 mdr
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
12037 mdr
->add_updated_lock(&diri
->dirfragtreelock
);
12042 mds->locker->mark_updated_scatterlock(&diri->filelock);
12043 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
12044 mut->add_updated_lock(&diri->filelock);
12047 mds->locker->mark_updated_scatterlock(&diri->nestlock);
12048 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
12049 mut->add_updated_lock(&diri->nestlock);
12052 add_uncommitted_fragment(basedirfrag
, info
.bits
, le
->orig_frags
, mdr
->ls
);
12053 mds
->server
->submit_mdlog_entry(le
, new C_MDC_FragmentPrep(this, mdr
),
12055 mds
->mdlog
->flush();
12058 void MDCache::_fragment_logged(MDRequestRef
& mdr
)
12060 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
12061 auto& info
= fragments
.at(basedirfrag
);
12062 CInode
*diri
= info
.resultfrags
.front()->get_inode();
12064 dout(10) << "fragment_logged " << basedirfrag
<< " bits " << info
.bits
12065 << " on " << *diri
<< dendl
;
12066 mdr
->mark_event("prepare logged");
12068 mdr
->apply(); // mark scatterlock
12070 // store resulting frags
12071 MDSGatherBuilder
gather(g_ceph_context
, new C_MDC_FragmentStore(this, mdr
));
12073 for (const auto& dir
: info
.resultfrags
) {
12074 dout(10) << " storing result frag " << *dir
<< dendl
;
12076 dir
->mark_dirty(mdr
->ls
);
12077 dir
->mark_new(mdr
->ls
);
12079 // freeze and store them too
12080 dir
->auth_pin(this);
12081 dir
->state_set(CDir::STATE_FRAGMENTING
);
12082 dir
->commit(0, gather
.new_sub(), true); // ignore authpinnability
12088 void MDCache::_fragment_stored(MDRequestRef
& mdr
)
12090 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
12091 fragment_info_t
&info
= fragments
.at(basedirfrag
);
12092 CDir
*first
= info
.resultfrags
.front();
12093 CInode
*diri
= first
->get_inode();
12095 dout(10) << "fragment_stored " << basedirfrag
<< " bits " << info
.bits
12096 << " on " << *diri
<< dendl
;
12097 mdr
->mark_event("new frags stored");
12100 mds_rank_t diri_auth
= (first
->is_subtree_root() && !diri
->is_auth()) ?
12101 diri
->authority().first
: CDIR_AUTH_UNKNOWN
;
12102 for (const auto &p
: first
->get_replicas()) {
12103 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
12104 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
12105 rejoin_gather
.count(p
.first
)))
12108 auto notify
= make_message
<MMDSFragmentNotify
>(basedirfrag
, info
.bits
, mdr
->reqid
.tid
);
12109 if (diri_auth
!= CDIR_AUTH_UNKNOWN
&& // subtree root
12110 diri_auth
!= p
.first
) { // not auth mds of diri
12112 * In the nornal case, mds does not trim dir inode whose child dirfrags
12113 * are likely being fragmented (see trim_inode()). But when fragmenting
12114 * subtree roots, following race can happen:
12116 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
12117 * mds.c and drops wrlock on dirfragtreelock.
12118 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
12119 * SYNC and send lock message mds.c
12120 * - mds.c receives the lock message and changes dirfragtreelock state
12122 * - mds.c trim dirfrag and dir inode from its cache
12123 * - mds.c receives the fragment_notify message
12125 * So we need to ensure replicas have received the notify, then unlock
12126 * the dirfragtreelock.
12128 notify
->mark_ack_wanted();
12129 info
.notify_ack_waiting
.insert(p
.first
);
12132 // freshly replicate new dirs to peers
12133 for (const auto& dir
: info
.resultfrags
) {
12134 encode_replica_dir(dir
, p
.first
, notify
->basebl
);
12137 mds
->send_message_mds(notify
, p
.first
);
12141 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_COMMIT
, basedirfrag
, info
.bits
);
12142 mds
->mdlog
->start_submit_entry(le
, new C_MDC_FragmentCommit(this, basedirfrag
, mdr
));
12145 // unfreeze resulting frags
12146 for (const auto& dir
: info
.resultfrags
) {
12147 dout(10) << " result frag " << *dir
<< dendl
;
12149 for (auto &p
: dir
->items
) {
12150 CDentry
*dn
= p
.second
;
12151 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
12152 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
12153 dn
->put(CDentry::PIN_FRAGMENTING
);
12157 dir
->unfreeze_dir();
12160 if (info
.notify_ack_waiting
.empty()) {
12161 fragment_drop_locks(info
);
12163 mds
->locker
->drop_locks_for_fragment_unfreeze(mdr
.get());
12167 void MDCache::_fragment_committed(dirfrag_t basedirfrag
, const MDRequestRef
& mdr
)
12169 dout(10) << "fragment_committed " << basedirfrag
<< dendl
;
12171 mdr
->mark_event("commit logged");
12173 ufragment
&uf
= uncommitted_fragments
.at(basedirfrag
);
12175 // remove old frags
12176 C_GatherBuilder
gather(
12179 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag
, uf
.bits
, mdr
),
12182 SnapContext nullsnapc
;
12183 object_locator_t
oloc(mds
->get_metadata_pool());
12184 for (const auto& fg
: uf
.old_frags
) {
12185 object_t oid
= CInode::get_object_name(basedirfrag
.ino
, fg
, "");
12186 ObjectOperation op
;
12187 if (fg
== frag_t()) {
12188 // backtrace object
12189 dout(10) << " truncate orphan dirfrag " << oid
<< dendl
;
12193 dout(10) << " removing orphan dirfrag " << oid
<< dendl
;
12196 mds
->objecter
->mutate(oid
, oloc
, op
, nullsnapc
,
12197 ceph::real_clock::now(),
12198 0, gather
.new_sub());
12201 ceph_assert(gather
.has_subs());
12205 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag
, int bits
, const MDRequestRef
& mdr
)
12207 dout(10) << "fragment_old_purged " << basedirfrag
<< dendl
;
12209 mdr
->mark_event("old frags purged");
12211 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_FINISH
, basedirfrag
, bits
);
12212 mds
->mdlog
->start_submit_entry(le
);
12214 finish_uncommitted_fragment(basedirfrag
, EFragment::OP_FINISH
);
12218 mds
->logger
->inc(l_mds_dir_split
);
12220 mds
->logger
->inc(l_mds_dir_merge
);
12225 auto it
= fragments
.find(basedirfrag
);
12226 ceph_assert(it
!= fragments
.end());
12227 it
->second
.finishing
= true;
12228 if (it
->second
.notify_ack_waiting
.empty())
12229 fragment_maybe_finish(it
);
12231 mdr
->mark_event("wating for notify acks");
12235 void MDCache::fragment_drop_locks(fragment_info_t
& info
)
12237 mds
->locker
->drop_locks(info
.mdr
.get());
12238 request_finish(info
.mdr
);
12239 //info.mdr.reset();
12242 void MDCache::fragment_maybe_finish(const fragment_info_iterator
& it
)
12244 if (!it
->second
.finishing
)
12247 // unmark & auth_unpin
12248 for (const auto &dir
: it
->second
.resultfrags
) {
12249 dir
->state_clear(CDir::STATE_FRAGMENTING
);
12250 dir
->auth_unpin(this);
12252 // In case the resulting fragments are beyond the split size,
12253 // we might need to split them again right away (they could
12254 // have been taking inserts between unfreezing and getting
12256 mds
->balancer
->maybe_fragment(dir
, false);
12259 fragments
.erase(it
);
12263 void MDCache::handle_fragment_notify_ack(const cref_t
<MMDSFragmentNotifyAck
> &ack
)
12265 dout(10) << "handle_fragment_notify_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
12266 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
12268 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
12272 auto it
= fragments
.find(ack
->get_base_dirfrag());
12273 if (it
== fragments
.end() ||
12274 it
->second
.get_tid() != ack
->get_tid()) {
12275 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl
;
12279 if (it
->second
.notify_ack_waiting
.erase(from
) &&
12280 it
->second
.notify_ack_waiting
.empty()) {
12281 fragment_drop_locks(it
->second
);
12282 fragment_maybe_finish(it
);
12286 void MDCache::handle_fragment_notify(const cref_t
<MMDSFragmentNotify
> ¬ify
)
12288 dout(10) << "handle_fragment_notify " << *notify
<< " from " << notify
->get_source() << dendl
;
12289 mds_rank_t from
= mds_rank_t(notify
->get_source().num());
12291 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
12295 CInode
*diri
= get_inode(notify
->get_ino());
12297 frag_t base
= notify
->get_basefrag();
12298 int bits
= notify
->get_bits();
12301 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
12302 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
12303 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
12304 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
12310 MDSContext::vec waiters
;
12311 std::vector
<CDir
*> resultfrags
;
12312 adjust_dir_fragments(diri
, base
, bits
, &resultfrags
, waiters
, false);
12313 if (g_conf()->mds_debug_frag
)
12314 diri
->verify_dirfrags();
12316 for (const auto& dir
: resultfrags
) {
12317 diri
->take_dir_waiting(dir
->get_frag(), waiters
);
12320 // add new replica dirs values
12321 auto p
= notify
->basebl
.cbegin();
12323 CDir
*tmp_dir
= nullptr;
12324 decode_replica_dir(tmp_dir
, p
, diri
, from
, waiters
);
12327 mds
->queue_waiters(waiters
);
12332 if (notify
->is_ack_wanted()) {
12333 auto ack
= make_message
<MMDSFragmentNotifyAck
>(notify
->get_base_dirfrag(),
12334 notify
->get_bits(), notify
->get_tid());
12335 mds
->send_message_mds(ack
, from
);
12339 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frags
,
12340 LogSegment
*ls
, bufferlist
*rollback
)
12342 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag
<< " bits " << bits
<< dendl
;
12343 ceph_assert(!uncommitted_fragments
.count(basedirfrag
));
12344 ufragment
& uf
= uncommitted_fragments
[basedirfrag
];
12345 uf
.old_frags
= old_frags
;
12348 ls
->uncommitted_fragments
.insert(basedirfrag
);
12350 uf
.rollback
.swap(*rollback
);
12353 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
)
12355 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12356 << " op " << EFragment::op_name(op
) << dendl
;
12357 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
12358 if (it
!= uncommitted_fragments
.end()) {
12359 ufragment
& uf
= it
->second
;
12360 if (op
!= EFragment::OP_FINISH
&& !uf
.old_frags
.empty()) {
12361 uf
.committed
= true;
12363 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
12364 mds
->queue_waiters(uf
.waiters
);
12365 uncommitted_fragments
.erase(it
);
12370 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
)
12372 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12373 << " old_frags (" << old_frags
<< ")" << dendl
;
12374 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
12375 if (it
!= uncommitted_fragments
.end()) {
12376 ufragment
& uf
= it
->second
;
12377 if (!uf
.old_frags
.empty()) {
12378 uf
.old_frags
= std::move(old_frags
);
12379 uf
.committed
= true;
12381 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
12382 uncommitted_fragments
.erase(it
);
12387 void MDCache::wait_for_uncommitted_fragments(MDSContext
* finisher
)
12389 MDSGatherBuilder
gather(g_ceph_context
, finisher
);
12390 for (auto& p
: uncommitted_fragments
) {
12391 p
.second
.waiters
.push_back(gather
.new_sub());
12396 struct C_MDC_FragmentRollback
: public MDCacheLogContext
{
12398 C_MDC_FragmentRollback(MDCache
*c
, MutationRef
& m
) :
12399 MDCacheLogContext(c
), mut(m
) {}
12400 void finish(int r
) override
{
12402 get_mds()->locker
->drop_locks(mut
.get());
12407 void MDCache::rollback_uncommitted_fragments()
12409 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments
.size() << " pending" << dendl
;
12410 for (map
<dirfrag_t
, ufragment
>::iterator p
= uncommitted_fragments
.begin();
12411 p
!= uncommitted_fragments
.end();
12413 ufragment
&uf
= p
->second
;
12414 CInode
*diri
= get_inode(p
->first
.ino
);
12417 if (uf
.committed
) {
12418 _fragment_committed(p
->first
, MDRequestRef());
12422 dout(10) << " rolling back " << p
->first
<< " refragment by " << uf
.bits
<< " bits" << dendl
;
12424 MutationRef
mut(new MutationImpl());
12425 mut
->ls
= mds
->mdlog
->get_current_segment();
12426 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_ROLLBACK
, p
->first
, uf
.bits
);
12427 mds
->mdlog
->start_entry(le
);
12428 bool diri_auth
= (diri
->authority() != CDIR_AUTH_UNDEF
);
12430 frag_vec_t old_frags
;
12431 diri
->dirfragtree
.get_leaves_under(p
->first
.frag
, old_frags
);
12433 std::vector
<CDir
*> resultfrags
;
12434 if (uf
.old_frags
.empty()) {
12435 // created by old format EFragment
12436 MDSContext::vec waiters
;
12437 adjust_dir_fragments(diri
, p
->first
.frag
, -uf
.bits
, &resultfrags
, waiters
, true);
12439 auto bp
= uf
.rollback
.cbegin();
12440 for (const auto& fg
: uf
.old_frags
) {
12441 CDir
*dir
= force_dir_fragment(diri
, fg
);
12442 resultfrags
.push_back(dir
);
12444 dirfrag_rollback rollback
;
12445 decode(rollback
, bp
);
12447 dir
->fnode
= rollback
.fnode
;
12449 dir
->mark_dirty(mut
->ls
);
12451 if (!(dir
->get_fnode()->rstat
== dir
->get_fnode()->accounted_rstat
)) {
12452 dout(10) << " dirty nestinfo on " << *dir
<< dendl
;
12453 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
12454 mut
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
12455 mut
->add_updated_lock(&diri
->nestlock
);
12457 if (!(dir
->get_fnode()->fragstat
== dir
->get_fnode()->accounted_fragstat
)) {
12458 dout(10) << " dirty fragstat on " << *dir
<< dendl
;
12459 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
12460 mut
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
12461 mut
->add_updated_lock(&diri
->filelock
);
12464 le
->add_orig_frag(dir
->get_frag());
12465 le
->metablob
.add_dir_context(dir
);
12467 le
->metablob
.add_fragmented_dir(dir
, true, false);
12469 dout(10) << " dirty dirfragtree on " << *dir
<< dendl
;
12470 dir
->state_set(CDir::STATE_DIRTYDFT
);
12471 le
->metablob
.add_fragmented_dir(dir
, true, true);
12477 auto pi
= diri
->project_inode(mut
);
12478 pi
.inode
->version
= diri
->pre_dirty();
12479 predirty_journal_parents(mut
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
);
12480 le
->metablob
.add_primary_dentry(diri
->get_projected_parent_dn(), diri
, true);
12482 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
12483 mut
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
12484 mut
->add_updated_lock(&diri
->dirfragtreelock
);
12487 if (g_conf()->mds_debug_frag
)
12488 diri
->verify_dirfrags();
12490 for (const auto& leaf
: old_frags
) {
12491 ceph_assert(!diri
->dirfragtree
.is_leaf(leaf
));
12494 mds
->mdlog
->submit_entry(le
, new C_MDC_FragmentRollback(this, mut
));
12496 uf
.old_frags
.swap(old_frags
);
12497 _fragment_committed(p
->first
, MDRequestRef());
12501 void MDCache::force_readonly()
12506 dout(1) << "force file system read-only" << dendl
;
12507 mds
->clog
->warn() << "force file system read-only";
12511 mds
->server
->force_clients_readonly();
12513 // revoke write caps
12515 for (auto &p
: inode_map
) {
12516 CInode
*in
= p
.second
;
12518 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
12519 if (!(++count
% mds
->heartbeat_reset_grace()))
12520 mds
->heartbeat_reset();
12523 mds
->mdlog
->flush();
12527 // ==============================================================
12530 void MDCache::show_subtrees(int dbl
, bool force_print
)
12532 if (g_conf()->mds_thrash_exports
)
12535 //dout(10) << "show_subtrees" << dendl;
12537 if (!g_conf()->subsys
.should_gather(ceph_subsys_mds
, dbl
))
12538 return; // i won't print anything.
12540 if (subtrees
.empty()) {
12541 dout(ceph::dout::need_dynamic(dbl
)) << "show_subtrees - no subtrees"
12546 if (!force_print
&& subtrees
.size() > SUBTREES_COUNT_THRESHOLD
&&
12547 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12548 dout(ceph::dout::need_dynamic(dbl
)) << "number of subtrees = " << subtrees
.size() << "; not "
12549 "printing subtrees" << dendl
;
12554 std::vector
<CDir
*> basefrags
;
12555 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
12556 p
!= base_inodes
.end();
12558 (*p
)->get_dirfrags(basefrags
);
12559 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12560 dout(15) << "show_subtrees" << dendl
;
12563 list
<pair
<CDir
*,int> > q
;
12568 for (const auto& dir
: basefrags
) {
12569 q
.emplace_back(dir
, 0);
12572 set
<CDir
*> subtrees_seen
;
12574 unsigned int depth
= 0;
12575 while (!q
.empty()) {
12576 CDir
*dir
= q
.front().first
;
12577 unsigned int d
= q
.front().second
;
12580 if (subtrees
.count(dir
) == 0) continue;
12582 subtrees_seen
.insert(dir
);
12584 if (d
> depth
) depth
= d
;
12587 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12588 if (seen
.count(dir
)) dout(0) << "aah, already seen " << *dir
<< dendl
;
12589 ceph_assert(seen
.count(dir
) == 0);
12593 if (!subtrees
[dir
].empty()) {
12594 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12595 p
!= subtrees
[dir
].end();
12597 //dout(25) << " saw sub " << **p << dendl;
12598 q
.push_front(pair
<CDir
*,int>(*p
, d
+1));
12603 if (!force_print
&& depth
> SUBTREES_DEPTH_THRESHOLD
&&
12604 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12605 dout(ceph::dout::need_dynamic(dbl
)) << "max depth among subtrees = " << depth
<< "; not printing "
12606 "subtrees" << dendl
;
12611 for (const auto& dir
: basefrags
) {
12612 q
.emplace_back(dir
, 0);
12615 while (!q
.empty()) {
12616 CDir
*dir
= q
.front().first
;
12617 int d
= q
.front().second
;
12620 if (subtrees
.count(dir
) == 0) continue;
12623 while ((unsigned)d
< indent
.size())
12627 string pad
= "______________________________________";
12628 pad
.resize(depth
*2+1-indent
.size());
12629 if (!subtrees
[dir
].empty())
12630 pad
[0] = '.'; // parent
12634 if (dir
->is_auth())
12640 if (dir
->get_dir_auth().second
== CDIR_AUTH_UNKNOWN
)
12641 snprintf(s
, sizeof(s
), "%2d ", int(dir
->get_dir_auth().first
));
12643 snprintf(s
, sizeof(s
), "%2d,%2d", int(dir
->get_dir_auth().first
), int(dir
->get_dir_auth().second
));
12646 dout(ceph::dout::need_dynamic(dbl
)) << indent
<< "|_" << pad
<< s
12647 << " " << auth
<< *dir
<< dendl
;
12649 if (dir
->ino() == CEPH_INO_ROOT
)
12650 ceph_assert(dir
->inode
== root
);
12651 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
12652 ceph_assert(dir
->inode
== myin
);
12653 if (dir
->inode
->is_stray() && (MDS_INO_STRAY_OWNER(dir
->ino()) == mds
->get_nodeid()))
12654 ceph_assert(strays
[MDS_INO_STRAY_INDEX(dir
->ino())] == dir
->inode
);
12657 if (!subtrees
[dir
].empty()) {
12658 // more at my level?
12659 if (!q
.empty() && q
.front().second
== d
)
12664 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12665 p
!= subtrees
[dir
].end();
12667 q
.push_front(pair
<CDir
*,int>(*p
, d
+2));
12671 // verify there isn't stray crap in subtree map
12673 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
12674 p
!= subtrees
.end();
12676 if (subtrees_seen
.count(p
->first
)) continue;
12677 dout(10) << "*** stray/lost entry in subtree map: " << *p
->first
<< dendl
;
12680 ceph_assert(lost
== 0);
12683 void MDCache::show_cache()
12685 if (!g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 7>())
12687 dout(7) << "show_cache" << dendl
;
12689 auto show_func
= [this](CInode
*in
) {
12692 dout(7) << " unlinked " << *in
<< dendl
;
12695 auto&& dfs
= in
->get_dirfrags();
12696 for (const auto& dir
: dfs
) {
12697 dout(7) << " dirfrag " << *dir
<< dendl
;
12699 for (auto &p
: dir
->items
) {
12700 CDentry
*dn
= p
.second
;
12701 dout(7) << " dentry " << *dn
<< dendl
;
12702 CDentry::linkage_t
*dnl
= dn
->get_linkage();
12703 if (dnl
->is_primary() && dnl
->get_inode())
12704 dout(7) << " inode " << *dnl
->get_inode() << dendl
;
12709 for (auto &p
: inode_map
)
12710 show_func(p
.second
);
12711 for (auto &p
: snap_inode_map
)
12712 show_func(p
.second
);
12715 void MDCache::cache_status(Formatter
*f
)
12717 f
->open_object_section("cache");
12719 f
->open_object_section("pool");
12720 mempool::get_pool(mempool::mds_co::id
).dump(f
);
12721 f
->close_section();
12723 f
->close_section();
12726 void MDCache::dump_tree(CInode
*in
, const int cur_depth
, const int max_depth
, Formatter
*f
)
12729 if ((max_depth
>= 0) && (cur_depth
> max_depth
)) {
12732 auto&& ls
= in
->get_dirfrags();
12733 for (const auto &subdir
: ls
) {
12734 for (const auto &p
: subdir
->items
) {
12735 CDentry
*dn
= p
.second
;
12736 CInode
*in
= dn
->get_linkage()->get_inode();
12738 dump_tree(in
, cur_depth
+ 1, max_depth
, f
);
12742 f
->open_object_section("inode");
12743 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_DIRFRAGS
);
12744 f
->close_section();
12747 int MDCache::dump_cache(std::string_view file_name
, double timeout
)
12749 return dump_cache(file_name
, NULL
, timeout
);
12752 int MDCache::dump_cache(Formatter
*f
, double timeout
)
12754 return dump_cache(std::string_view(""), f
, timeout
);
12758 * Dump the metadata cache, either to a Formatter, if
12759 * provided, else to a plain text file.
12761 int MDCache::dump_cache(std::string_view fn
, Formatter
*f
, double timeout
)
12765 // dumping large caches may cause mds to hang or worse get killed.
12766 // so, disallow the dump if the cache size exceeds the configured
12767 // threshold, which is 1G for formatter and unlimited for file (note
12768 // that this can be jacked up by the admin... and is nothing but foot
12769 // shooting, but the option itself is for devs and hence dangerous to
12770 // tune). TODO: remove this when fixed.
12771 uint64_t threshold
= f
?
12772 g_conf().get_val
<Option::size_t>("mds_dump_cache_threshold_formatter") :
12773 g_conf().get_val
<Option::size_t>("mds_dump_cache_threshold_file");
12775 if (threshold
&& cache_size() > threshold
) {
12777 CachedStackStringStream css
;
12778 *css
<< "cache usage exceeds dump threshold";
12779 f
->open_object_section("result");
12780 f
->dump_string("error", css
->strv());
12781 f
->close_section();
12783 derr
<< "cache usage exceeds dump threshold" << dendl
;
12784 r
= -CEPHFS_EINVAL
;
12793 f
->open_array_section("inodes");
12795 char path
[PATH_MAX
] = "";
12797 snprintf(path
, sizeof path
, "%s", fn
.data());
12799 snprintf(path
, sizeof path
, "cachedump.%d.mds%d", (int)mds
->mdsmap
->get_epoch(), int(mds
->get_nodeid()));
12802 dout(1) << "dump_cache to " << path
<< dendl
;
12804 fd
= ::open(path
, O_WRONLY
|O_CREAT
|O_EXCL
|O_CLOEXEC
, 0600);
12806 derr
<< "failed to open " << path
<< ": " << cpp_strerror(errno
) << dendl
;
12811 auto dump_func
= [fd
, f
](CInode
*in
) {
12814 f
->open_object_section("inode");
12815 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_DIRFRAGS
);
12816 f
->close_section();
12819 CachedStackStringStream css
;
12820 *css
<< *in
<< std::endl
;
12821 auto sv
= css
->strv();
12822 r
= safe_write(fd
, sv
.data(), sv
.size());
12825 auto&& dfs
= in
->get_dirfrags();
12826 for (auto &dir
: dfs
) {
12827 CachedStackStringStream css2
;
12828 *css2
<< " " << *dir
<< std::endl
;
12829 auto sv
= css2
->strv();
12830 r
= safe_write(fd
, sv
.data(), sv
.size());
12833 for (auto &p
: dir
->items
) {
12834 CDentry
*dn
= p
.second
;
12835 CachedStackStringStream css3
;
12836 *css3
<< " " << *dn
<< std::endl
;
12837 auto sv
= css3
->strv();
12838 r
= safe_write(fd
, sv
.data(), sv
.size());
12842 dir
->check_rstats();
12847 auto start
= mono_clock::now();
12849 for (auto &p
: inode_map
) {
12850 r
= dump_func(p
.second
);
12853 if (!(++count
% 1000) &&
12855 std::chrono::duration
<double>(mono_clock::now() - start
).count() > timeout
) {
12860 for (auto &p
: snap_inode_map
) {
12861 r
= dump_func(p
.second
);
12864 if (!(++count
% 1000) &&
12866 std::chrono::duration
<double>(mono_clock::now() - start
).count() > timeout
) {
12876 if (r
== -ETIMEDOUT
)
12878 f
->close_section();
12879 f
->open_object_section("result");
12880 f
->dump_string("error", "the operation timeout");
12882 f
->close_section(); // inodes
12884 if (r
== -ETIMEDOUT
)
12886 CachedStackStringStream css
;
12887 *css
<< "error : the operation timeout" << std::endl
;
12888 auto sv
= css
->strv();
12889 r
= safe_write(fd
, sv
.data(), sv
.size());
12896 void C_MDS_RetryRequest::finish(int r
)
12899 cache
->dispatch_request(mdr
);
12902 MDSContext
*CF_MDS_RetryRequestFactory::build()
12905 mdcache
->mds
->locker
->drop_locks(mdr
.get(), nullptr);
12906 mdr
->drop_local_auth_pins();
12908 return new C_MDS_RetryRequest(mdcache
, mdr
);
12911 class C_MDS_EnqueueScrub
: public Context
12914 Formatter
*formatter
;
12915 Context
*on_finish
;
12917 ScrubHeaderRef header
;
12918 C_MDS_EnqueueScrub(std::string_view tag
, Formatter
*f
, Context
*fin
) :
12919 tag(tag
), formatter(f
), on_finish(fin
), header(nullptr) {}
12921 void finish(int r
) override
{
12922 formatter
->open_object_section("results");
12923 formatter
->dump_int("return_code", r
);
12925 formatter
->dump_string("scrub_tag", tag
);
12926 formatter
->dump_string("mode", "asynchronous");
12928 formatter
->close_section();
12932 on_finish
->complete(r
);
12936 void MDCache::enqueue_scrub(
12937 std::string_view path
,
12938 std::string_view tag
,
12939 bool force
, bool recursive
, bool repair
,
12940 bool scrub_mdsdir
, Formatter
*f
, Context
*fin
)
12942 dout(10) << __func__
<< " " << path
<< dendl
;
12945 if (path
.compare(0, 4, "~mds") == 0) {
12947 if (path
== "~mdsdir") {
12948 rank
= mds
->get_nodeid();
12951 rank
= strict_strtoll(path
.substr(4), 10, &err
);
12953 rank
= MDS_RANK_NONE
;
12955 if (rank
>= 0 && rank
< MAX_MDS
)
12956 fp
.set_path("", MDS_INO_MDSDIR(rank
));
12958 if (fp
.get_ino() == inodeno_t(0))
12961 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB
);
12962 mdr
->set_filepath(fp
);
12964 bool is_internal
= false;
12965 std::string
tag_str(tag
);
12966 if (tag_str
.empty()) {
12968 uuid_gen
.generate_random();
12969 tag_str
= uuid_gen
.to_string();
12970 is_internal
= true;
12973 C_MDS_EnqueueScrub
*cs
= new C_MDS_EnqueueScrub(tag_str
, f
, fin
);
12974 cs
->header
= std::make_shared
<ScrubHeader
>(tag_str
, is_internal
, force
,
12975 recursive
, repair
, scrub_mdsdir
);
12977 mdr
->internal_op_finish
= cs
;
12978 enqueue_scrub_work(mdr
);
12981 void MDCache::enqueue_scrub_work(MDRequestRef
& mdr
)
12984 CF_MDS_RetryRequestFactory
cf(this, mdr
, true);
12985 int r
= path_traverse(mdr
, cf
, mdr
->get_filepath(),
12986 MDS_TRAVERSE_DISCOVER
| MDS_TRAVERSE_RDLOCK_PATH
,
12991 mds
->server
->respond_to_request(mdr
, r
);
12995 // Cannot scrub same dentry twice at same time
12996 if (in
->scrub_is_in_progress()) {
12997 mds
->server
->respond_to_request(mdr
, -CEPHFS_EBUSY
);
13003 C_MDS_EnqueueScrub
*cs
= static_cast<C_MDS_EnqueueScrub
*>(mdr
->internal_op_finish
);
13004 ScrubHeaderRef
& header
= cs
->header
;
13006 r
= mds
->scrubstack
->enqueue(in
, header
, !header
->get_recursive());
13008 mds
->server
->respond_to_request(mdr
, r
);
13011 struct C_MDC_RespondInternalRequest
: public MDCacheLogContext
{
13013 C_MDC_RespondInternalRequest(MDCache
*c
, MDRequestRef
& m
) :
13014 MDCacheLogContext(c
), mdr(m
) {}
13015 void finish(int r
) override
{
13017 get_mds()->server
->respond_to_request(mdr
, r
);
13021 struct C_MDC_ScrubRepaired
: public MDCacheContext
{
13022 ScrubHeaderRef header
;
13024 C_MDC_ScrubRepaired(MDCache
*m
, const ScrubHeaderRef
& h
)
13025 : MDCacheContext(m
), header(h
) {
13026 header
->inc_num_pending();
13028 void finish(int r
) override
{
13029 header
->dec_num_pending();
13033 void MDCache::repair_dirfrag_stats(CDir
*dir
)
13035 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS
);
13037 mdr
->internal_op_private
= dir
;
13038 if (dir
->scrub_is_in_progress())
13039 mdr
->internal_op_finish
= new C_MDC_ScrubRepaired(this, dir
->get_scrub_header());
13041 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
13042 repair_dirfrag_stats_work(mdr
);
13045 void MDCache::repair_dirfrag_stats_work(MDRequestRef
& mdr
)
13047 CDir
*dir
= static_cast<CDir
*>(mdr
->internal_op_private
);
13048 dout(10) << __func__
<< " " << *dir
<< dendl
;
13050 if (!dir
->is_auth()) {
13051 mds
->server
->respond_to_request(mdr
, -CEPHFS_ESTALE
);
13055 if (!mdr
->is_auth_pinned(dir
) && !dir
->can_auth_pin()) {
13056 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(this, mdr
));
13058 mds
->locker
->drop_locks(mdr
.get());
13059 mdr
->drop_local_auth_pins();
13060 if (mdr
->is_any_remote_auth_pin())
13061 mds
->locker
->notify_freeze_waiter(dir
);
13065 mdr
->auth_pin(dir
);
13067 MutationImpl::LockOpVec lov
;
13068 CInode
*diri
= dir
->inode
;
13069 lov
.add_rdlock(&diri
->dirfragtreelock
);
13070 lov
.add_wrlock(&diri
->nestlock
);
13071 lov
.add_wrlock(&diri
->filelock
);
13072 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13075 if (!dir
->is_complete()) {
13076 dir
->fetch(new C_MDS_RetryRequest(this, mdr
));
13080 frag_info_t frag_info
;
13081 nest_info_t nest_info
;
13082 for (auto it
= dir
->begin(); it
!= dir
->end(); ++it
) {
13083 CDentry
*dn
= it
->second
;
13084 if (dn
->last
!= CEPH_NOSNAP
)
13086 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
13087 if (dnl
->is_primary()) {
13088 CInode
*in
= dnl
->get_inode();
13089 nest_info
.add(in
->get_projected_inode()->accounted_rstat
);
13091 frag_info
.nsubdirs
++;
13093 frag_info
.nfiles
++;
13094 } else if (dnl
->is_remote())
13095 frag_info
.nfiles
++;
13098 auto pf
= dir
->get_projected_fnode();
13099 bool good_fragstat
= frag_info
.same_sums(pf
->fragstat
);
13100 bool good_rstat
= nest_info
.same_sums(pf
->rstat
);
13101 if (good_fragstat
&& good_rstat
) {
13102 dout(10) << __func__
<< " no corruption found" << dendl
;
13103 mds
->server
->respond_to_request(mdr
, 0);
13107 auto _pf
= dir
->project_fnode(mdr
);
13108 _pf
->version
= dir
->pre_dirty();
13111 mdr
->ls
= mds
->mdlog
->get_current_segment();
13112 EUpdate
*le
= new EUpdate(mds
->mdlog
, "repair_dirfrag");
13113 mds
->mdlog
->start_entry(le
);
13115 if (!good_fragstat
) {
13116 if (pf
->fragstat
.mtime
> frag_info
.mtime
)
13117 frag_info
.mtime
= pf
->fragstat
.mtime
;
13118 if (pf
->fragstat
.change_attr
> frag_info
.change_attr
)
13119 frag_info
.change_attr
= pf
->fragstat
.change_attr
;
13120 _pf
->fragstat
= frag_info
;
13121 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
13122 mdr
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
13123 mdr
->add_updated_lock(&diri
->filelock
);
13127 if (pf
->rstat
.rctime
> nest_info
.rctime
)
13128 nest_info
.rctime
= pf
->rstat
.rctime
;
13129 _pf
->rstat
= nest_info
;
13130 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
13131 mdr
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
13132 mdr
->add_updated_lock(&diri
->nestlock
);
13135 le
->metablob
.add_dir_context(dir
);
13136 le
->metablob
.add_dir(dir
, true);
13138 mds
->mdlog
->submit_entry(le
, new C_MDC_RespondInternalRequest(this, mdr
));
13141 void MDCache::repair_inode_stats(CInode
*diri
)
13143 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS
);
13144 mdr
->auth_pin(diri
); // already auth pinned by CInode::validate_disk_state()
13145 mdr
->internal_op_private
= diri
;
13146 if (diri
->scrub_is_in_progress())
13147 mdr
->internal_op_finish
= new C_MDC_ScrubRepaired(this, diri
->get_scrub_header());
13149 mdr
->internal_op_finish
= new C_MDSInternalNoop
;
13150 repair_inode_stats_work(mdr
);
13153 void MDCache::repair_inode_stats_work(MDRequestRef
& mdr
)
13155 CInode
*diri
= static_cast<CInode
*>(mdr
->internal_op_private
);
13156 dout(10) << __func__
<< " " << *diri
<< dendl
;
13158 if (!diri
->is_auth()) {
13159 mds
->server
->respond_to_request(mdr
, -CEPHFS_ESTALE
);
13162 if (!diri
->is_dir()) {
13163 mds
->server
->respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
13167 MutationImpl::LockOpVec lov
;
13169 if (mdr
->ls
) // already marked filelock/nestlock dirty ?
13172 lov
.add_rdlock(&diri
->dirfragtreelock
);
13173 lov
.add_wrlock(&diri
->nestlock
);
13174 lov
.add_wrlock(&diri
->filelock
);
13175 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13178 // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger
13179 // the scatter-gather process, which will fix any fragstat/rstat errors.
13182 diri
->dirfragtree
.get_leaves(leaves
);
13183 for (const auto& leaf
: leaves
) {
13184 CDir
*dir
= diri
->get_dirfrag(leaf
);
13186 ceph_assert(mdr
->is_auth_pinned(diri
));
13187 dir
= diri
->get_or_open_dirfrag(this, leaf
);
13189 if (dir
->get_version() == 0) {
13190 ceph_assert(dir
->is_auth());
13191 dir
->fetch_keys({}, new C_MDS_RetryRequest(this, mdr
));
13197 diri
->state_set(CInode::STATE_REPAIRSTATS
);
13198 mdr
->ls
= mds
->mdlog
->get_current_segment();
13199 mds
->locker
->mark_updated_scatterlock(&diri
->filelock
);
13200 mdr
->ls
->dirty_dirfrag_dir
.push_back(&diri
->item_dirty_dirfrag_dir
);
13201 mds
->locker
->mark_updated_scatterlock(&diri
->nestlock
);
13202 mdr
->ls
->dirty_dirfrag_nest
.push_back(&diri
->item_dirty_dirfrag_nest
);
13204 mds
->locker
->drop_locks(mdr
.get());
13207 // force the scatter-gather process
13209 lov
.add_rdlock(&diri
->dirfragtreelock
);
13210 lov
.add_rdlock(&diri
->nestlock
);
13211 lov
.add_rdlock(&diri
->filelock
);
13212 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13215 diri
->state_clear(CInode::STATE_REPAIRSTATS
);
13217 frag_info_t dir_info
;
13218 nest_info_t nest_info
;
13219 nest_info
.rsubdirs
= 1; // it gets one to account for self
13220 if (const sr_t
*srnode
= diri
->get_projected_srnode(); srnode
)
13221 nest_info
.rsnaps
= srnode
->snaps
.size();
13225 diri
->dirfragtree
.get_leaves(leaves
);
13226 for (const auto& leaf
: leaves
) {
13227 CDir
*dir
= diri
->get_dirfrag(leaf
);
13229 ceph_assert(dir
->get_version() > 0);
13230 dir_info
.add(dir
->get_fnode()->accounted_fragstat
);
13231 nest_info
.add(dir
->get_fnode()->accounted_rstat
);
13235 if (!dir_info
.same_sums(diri
->get_inode()->dirstat
) ||
13236 !nest_info
.same_sums(diri
->get_inode()->rstat
)) {
13237 dout(10) << __func__
<< " failed to fix fragstat/rstat on "
13241 mds
->server
->respond_to_request(mdr
, 0);
13244 void MDCache::rdlock_dirfrags_stats(CInode
*diri
, MDSInternalContext
* fin
)
13246 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_RDLOCK_FRAGSSTATS
);
13247 mdr
->auth_pin(diri
); // already auth pinned by CInode::validate_disk_state()
13248 mdr
->internal_op_private
= diri
;
13249 mdr
->internal_op_finish
= fin
;
13250 return rdlock_dirfrags_stats_work(mdr
);
13253 void MDCache::rdlock_dirfrags_stats_work(MDRequestRef
& mdr
)
13255 CInode
*diri
= static_cast<CInode
*>(mdr
->internal_op_private
);
13256 dout(10) << __func__
<< " " << *diri
<< dendl
;
13257 if (!diri
->is_auth()) {
13258 mds
->server
->respond_to_request(mdr
, -CEPHFS_ESTALE
);
13261 if (!diri
->is_dir()) {
13262 mds
->server
->respond_to_request(mdr
, -CEPHFS_ENOTDIR
);
13266 MutationImpl::LockOpVec lov
;
13267 lov
.add_rdlock(&diri
->dirfragtreelock
);
13268 lov
.add_rdlock(&diri
->nestlock
);
13269 lov
.add_rdlock(&diri
->filelock
);
13270 if (!mds
->locker
->acquire_locks(mdr
, lov
))
13272 dout(10) << __func__
<< " start dirfrags : " << *diri
<< dendl
;
13274 mds
->server
->respond_to_request(mdr
, 0);
13278 void MDCache::flush_dentry(std::string_view path
, Context
*fin
)
13280 if (is_readonly()) {
13281 dout(10) << __func__
<< ": read-only FS" << dendl
;
13282 fin
->complete(-CEPHFS_EROFS
);
13285 dout(10) << "flush_dentry " << path
<< dendl
;
13286 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FLUSH
);
13288 mdr
->set_filepath(fp
);
13289 mdr
->internal_op_finish
= fin
;
13290 flush_dentry_work(mdr
);
13293 class C_FinishIOMDR
: public MDSContext
{
13297 MDSRank
*get_mds() override
{ return mds
; }
13299 C_FinishIOMDR(MDSRank
*mds_
, MDRequestRef
& mdr_
) : mds(mds_
), mdr(mdr_
) {}
13300 void finish(int r
) override
{ mds
->server
->respond_to_request(mdr
, r
); }
13303 void MDCache::flush_dentry_work(MDRequestRef
& mdr
)
13305 MutationImpl::LockOpVec lov
;
13306 CInode
*in
= mds
->server
->rdlock_path_pin_ref(mdr
, true);
13310 ceph_assert(in
->is_auth());
13311 in
->flush(new C_FinishIOMDR(mds
, mdr
));
13316 * Initialize performance counters with global perfcounter
13319 void MDCache::register_perfcounters()
13321 PerfCountersBuilder
pcb(g_ceph_context
, "mds_cache", l_mdc_first
, l_mdc_last
);
13323 pcb
.add_u64_counter(l_mdc_dir_update
, "dir_update",
13324 "Directory replication directives");
13325 pcb
.add_u64_counter(l_mdc_dir_update_receipt
, "dir_update_receipt",
13326 "Directory replication directives received");
13327 pcb
.add_u64_counter(l_mdc_dir_try_discover
, "dir_try_discover",
13328 "Directory replication attempt to discover");
13329 pcb
.add_u64_counter(l_mdc_dir_send_discover
, "dir_send_discover",
13330 "Directory replication discovery message sent");
13331 pcb
.add_u64_counter(l_mdc_dir_handle_discover
, "dir_handle_discover",
13332 "Directory replication discovery message handled");
13334 // Stray/purge statistics
13335 pcb
.add_u64(l_mdc_num_strays
, "num_strays", "Stray dentries", "stry",
13336 PerfCountersBuilder::PRIO_INTERESTING
);
13337 pcb
.add_u64(l_mdc_num_recovering_enqueued
,
13338 "num_recovering_enqueued", "Files waiting for recovery", "recy",
13339 PerfCountersBuilder::PRIO_INTERESTING
);
13340 pcb
.add_u64_counter(l_mdc_recovery_completed
,
13341 "recovery_completed", "File recoveries completed", "recd",
13342 PerfCountersBuilder::PRIO_INTERESTING
);
13344 // useful recovery queue statistics
13345 pcb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
13346 pcb
.add_u64(l_mdc_num_recovering_processing
, "num_recovering_processing",
13347 "Files currently being recovered");
13348 pcb
.add_u64(l_mdc_num_recovering_prioritized
, "num_recovering_prioritized",
13349 "Files waiting for recovery with elevated priority");
13350 pcb
.add_u64_counter(l_mdc_recovery_started
, "recovery_started",
13351 "File recoveries started");
13353 // along with other stray dentries stats
13354 pcb
.add_u64(l_mdc_num_strays_delayed
, "num_strays_delayed",
13355 "Stray dentries delayed");
13356 pcb
.add_u64(l_mdc_num_strays_enqueuing
, "num_strays_enqueuing",
13357 "Stray dentries enqueuing for purge");
13358 pcb
.add_u64_counter(l_mdc_strays_created
, "strays_created",
13359 "Stray dentries created");
13360 pcb
.add_u64_counter(l_mdc_strays_enqueued
, "strays_enqueued",
13361 "Stray dentries enqueued for purge");
13362 pcb
.add_u64_counter(l_mdc_strays_reintegrated
, "strays_reintegrated",
13363 "Stray dentries reintegrated");
13364 pcb
.add_u64_counter(l_mdc_strays_migrated
, "strays_migrated",
13365 "Stray dentries migrated");
13367 // low prio internal request stats
13368 pcb
.add_u64_counter(l_mdss_ireq_enqueue_scrub
, "ireq_enqueue_scrub",
13369 "Internal Request type enqueue scrub");
13370 pcb
.add_u64_counter(l_mdss_ireq_exportdir
, "ireq_exportdir",
13371 "Internal Request type export dir");
13372 pcb
.add_u64_counter(l_mdss_ireq_flush
, "ireq_flush",
13373 "Internal Request type flush");
13374 pcb
.add_u64_counter(l_mdss_ireq_fragmentdir
, "ireq_fragmentdir",
13375 "Internal Request type fragmentdir");
13376 pcb
.add_u64_counter(l_mdss_ireq_fragstats
, "ireq_fragstats",
13377 "Internal Request type frag stats");
13378 pcb
.add_u64_counter(l_mdss_ireq_inodestats
, "ireq_inodestats",
13379 "Internal Request type inode stats");
13381 logger
.reset(pcb
.create_perf_counters());
13382 g_ceph_context
->get_perfcounters_collection()->add(logger
.get());
13383 recovery_queue
.set_logger(logger
.get());
13384 stray_manager
.set_logger(logger
.get());
13388 * Call this when putting references to an inode/dentry or
13389 * when attempting to trim it.
13391 * If this inode is no longer linked by anyone, and this MDS
13392 * rank holds the primary dentry, and that dentry is in a stray
13393 * directory, then give up the dentry to the StrayManager, never
13394 * to be seen again by MDCache.
13396 * @param delay if true, then purgeable inodes are stashed til
13397 * the next trim(), rather than being purged right
13400 void MDCache::maybe_eval_stray(CInode
*in
, bool delay
) {
13401 if (in
->get_inode()->nlink
> 0 || in
->is_base() || is_readonly() ||
13402 mds
->get_state() <= MDSMap::STATE_REJOIN
)
13405 CDentry
*dn
= in
->get_projected_parent_dn();
13407 if (dn
->state_test(CDentry::STATE_PURGING
)) {
13408 /* We have already entered the purging process, no need
13409 * to re-evaluate me ! */
13413 if (dn
->get_dir()->get_inode()->is_stray()) {
13415 stray_manager
.queue_delayed(dn
);
13417 stray_manager
.eval_stray(dn
);
13421 void MDCache::clear_dirty_bits_for_stray(CInode
* diri
) {
13422 dout(10) << __func__
<< " " << *diri
<< dendl
;
13423 ceph_assert(diri
->get_projected_parent_dir()->inode
->is_stray());
13424 auto&& ls
= diri
->get_dirfrags();
13425 for (auto &p
: ls
) {
13426 if (p
->is_auth() && !(p
->is_frozen() || p
->is_freezing()))
13427 p
->try_remove_dentries_for_stray();
13429 if (!diri
->snaprealm
) {
13430 if (diri
->is_auth())
13431 diri
->clear_dirty_rstat();
13432 diri
->clear_scatter_dirty();
13436 bool MDCache::dump_inode(Formatter
*f
, uint64_t number
) {
13437 CInode
*in
= get_inode(number
);
13441 f
->open_object_section("inode");
13442 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_PATH
);
13443 f
->close_section();
13447 void MDCache::handle_mdsmap(const MDSMap
&mdsmap
, const MDSMap
&oldmap
) {
13448 const mds_rank_t max_mds
= mdsmap
.get_max_mds();
13450 // process export_pin_delayed_queue whenever a new MDSMap received
13451 auto &q
= export_pin_delayed_queue
;
13452 for (auto it
= q
.begin(); it
!= q
.end(); ) {
13454 mds_rank_t export_pin
= in
->get_export_pin(false);
13455 dout(10) << " delayed export_pin=" << export_pin
<< " on " << *in
13456 << " max_mds=" << max_mds
<< dendl
;
13457 if (export_pin
>= mdsmap
.get_max_mds()) {
13462 in
->state_clear(CInode::STATE_DELAYEDEXPORTPIN
);
13464 in
->queue_export_pin(export_pin
);
13467 if (mdsmap
.get_max_mds() != oldmap
.get_max_mds()) {
13468 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl
;
13469 /* copy to vector to avoid removals during iteration */
13470 std::vector
<CInode
*> migrate
;
13471 migrate
.assign(export_ephemeral_pins
.begin(), export_ephemeral_pins
.end());
13472 for (auto& in
: migrate
) {
13473 in
->maybe_export_pin();
13477 if (max_mds
<= 1) {
13478 export_ephemeral_dist_frag_bits
= 0;
13480 double want
= g_conf().get_val
<double>("mds_export_ephemeral_distributed_factor");
13483 while ((1U << n
) < (unsigned)want
)
13485 export_ephemeral_dist_frag_bits
= n
;
13489 void MDCache::upkeep_main(void)
13491 std::unique_lock
lock(upkeep_mutex
);
13492 while (!upkeep_trim_shutdown
.load()) {
13493 auto now
= clock::now();
13494 auto since
= now
-upkeep_last_trim
;
13495 auto trim_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_trim_interval"));
13496 if (since
>= trim_interval
*.90) {
13497 lock
.unlock(); /* mds_lock -> upkeep_mutex */
13498 std::scoped_lock
mds_lock(mds
->mds_lock
);
13500 if (upkeep_trim_shutdown
.load())
13502 check_memory_usage();
13503 if (mds
->is_cache_trimmable()) {
13504 dout(20) << "upkeep thread trimming cache; last trim " << since
<< " ago" << dendl
;
13505 bool active_with_clients
= mds
->is_active() || mds
->is_clientreplay() || mds
->is_stopping();
13506 if (active_with_clients
) {
13507 trim_client_leases();
13512 if (active_with_clients
) {
13513 auto recall_flags
= Server::RecallFlags::ENFORCE_MAX
|Server::RecallFlags::ENFORCE_LIVENESS
;
13514 if (cache_toofull()) {
13515 recall_flags
= recall_flags
|Server::RecallFlags::TRIM
;
13517 mds
->server
->recall_client_state(nullptr, recall_flags
);
13519 upkeep_last_trim
= now
= clock::now();
13521 dout(10) << "cache not ready for trimming" << dendl
;
13524 trim_interval
-= since
;
13526 since
= now
-upkeep_last_release
;
13527 auto release_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_release_free_interval"));
13528 if (since
>= release_interval
*.90) {
13529 /* XXX not necessary once MDCache uses PriorityCache */
13530 dout(10) << "releasing free memory" << dendl
;
13531 ceph_heap_release_free_memory();
13532 upkeep_last_release
= clock::now();
13534 release_interval
-= since
;
13536 auto interval
= std::min(release_interval
, trim_interval
);
13537 dout(20) << "upkeep thread waiting interval " << interval
<< dendl
;
13538 upkeep_cvar
.wait_for(lock
, interval
);