1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
20 #include <string_view>
28 #include "MDBalancer.h"
30 #include "ScrubStack.h"
32 #include "SnapClient.h"
41 #include "include/ceph_fs.h"
42 #include "include/filepath.h"
43 #include "include/util.h"
45 #include "messages/MClientCaps.h"
47 #include "msg/Message.h"
48 #include "msg/Messenger.h"
50 #include "common/MemoryModel.h"
51 #include "common/errno.h"
52 #include "common/perf_counters.h"
53 #include "common/safe_io.h"
55 #include "osdc/Journaler.h"
56 #include "osdc/Filer.h"
58 #include "events/ESubtreeMap.h"
59 #include "events/EUpdate.h"
60 #include "events/ESlaveUpdate.h"
61 #include "events/EImportFinish.h"
62 #include "events/EFragment.h"
63 #include "events/ECommitted.h"
64 #include "events/ESessions.h"
68 #include "common/Timer.h"
70 #include "perfglue/heap_profiler.h"
73 #include "common/config.h"
74 #include "include/ceph_assert.h"
76 #define dout_context g_ceph_context
77 #define dout_subsys ceph_subsys_mds
79 #define dout_prefix _prefix(_dout, mds)
80 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
81 return *_dout
<< "mds." << mds
->get_nodeid() << ".cache ";
// Definition of SimpleLock's static shared empty gather set.
// NOTE(review): presumably handed out when a lock has no gatherers so
// callers never see a null set — confirm against SimpleLock.h.
84 set
<int> SimpleLock::empty_gather_set
;
88 * All non-I/O contexts that require a reference
89 * to an MDCache instance descend from this.
91 class MDCacheContext
: public virtual MDSContext
{
94 MDSRank
*get_mds() override
96 ceph_assert(mdcache
!= NULL
);
100 explicit MDCacheContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
105 * Only for contexts called back from an I/O completion
107 * Note: duplication of members wrt MDCacheContext, because
108 * it'ls the lesser of two evils compared with introducing
109 * yet another piece of (multiple) inheritance.
111 class MDCacheIOContext
: public virtual MDSIOContextBase
{
114 MDSRank
*get_mds() override
116 ceph_assert(mdcache
!= NULL
);
120 explicit MDCacheIOContext(MDCache
*mdc_
, bool track
=true) :
121 MDSIOContextBase(track
), mdcache(mdc_
) {}
124 class MDCacheLogContext
: public virtual MDSLogContextBase
{
127 MDSRank
*get_mds() override
129 ceph_assert(mdcache
!= NULL
);
133 explicit MDCacheLogContext(MDCache
*mdc_
) : mdcache(mdc_
) {}
136 MDCache::MDCache(MDSRank
*m
, PurgeQueue
&purge_queue_
) :
138 filer(m
->objecter
, m
->finisher
),
140 stray_manager(m
, purge_queue_
),
141 trim_counter(g_conf().get_val
<double>("mds_cache_trim_decay_rate")),
144 migrator
.reset(new Migrator(mds
, this));
146 max_dir_commit_size
= g_conf()->mds_dir_max_commit_size
?
147 (g_conf()->mds_dir_max_commit_size
<< 20) :
148 (0.9 *(g_conf()->osd_max_write_size
<< 20));
150 cache_inode_limit
= g_conf().get_val
<int64_t>("mds_cache_size");
151 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
152 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
153 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
155 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
157 bottom_lru
.lru_set_midpoint(0);
159 decayrate
.set_halflife(g_conf()->mds_decay_halflife
);
161 upkeeper
= std::thread([this]() {
162 std::unique_lock
lock(upkeep_mutex
);
163 while (!upkeep_trim_shutdown
.load()) {
164 auto now
= clock::now();
165 auto since
= now
-upkeep_last_trim
;
166 auto trim_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_trim_interval"));
167 if (since
>= trim_interval
*.90) {
168 lock
.unlock(); /* mds_lock -> upkeep_mutex */
169 std::scoped_lock
mds_lock(mds
->mds_lock
);
171 if (upkeep_trim_shutdown
.load())
173 if (mds
->is_cache_trimmable()) {
174 dout(20) << "upkeep thread trimming cache; last trim " << since
<< " ago" << dendl
;
175 trim_client_leases();
177 check_memory_usage();
178 auto flags
= Server::RecallFlags::ENFORCE_MAX
|Server::RecallFlags::ENFORCE_LIVENESS
;
179 mds
->server
->recall_client_state(nullptr, flags
);
180 upkeep_last_trim
= clock::now();
181 upkeep_last_trim
= now
= clock::now();
183 dout(10) << "cache not ready for trimming" << dendl
;
186 trim_interval
-= since
;
188 since
= now
-upkeep_last_release
;
189 auto release_interval
= clock::duration(g_conf().get_val
<std::chrono::seconds
>("mds_cache_release_free_interval"));
190 if (since
>= release_interval
) {
191 /* XXX not necessary once MDCache uses PriorityCache */
192 dout(10) << "releasing free memory" << dendl
;
193 ceph_heap_release_free_memory();
194 upkeep_last_release
= clock::now();
196 release_interval
-= since
;
198 auto interval
= std::min(release_interval
, trim_interval
);
199 dout(20) << "upkeep thread waiting interval " << interval
<< dendl
;
200 upkeep_cvar
.wait_for(lock
, interval
);
208 g_ceph_context
->get_perfcounters_collection()->remove(logger
.get());
210 if (upkeeper
.joinable())
214 void MDCache::handle_conf_change(const std::set
<std::string
>& changed
, const MDSMap
& mdsmap
)
216 if (changed
.count("mds_cache_size"))
217 cache_inode_limit
= g_conf().get_val
<int64_t>("mds_cache_size");
218 if (changed
.count("mds_cache_memory_limit"))
219 cache_memory_limit
= g_conf().get_val
<Option::size_t>("mds_cache_memory_limit");
220 if (changed
.count("mds_cache_reservation"))
221 cache_reservation
= g_conf().get_val
<double>("mds_cache_reservation");
222 if (changed
.count("mds_health_cache_threshold"))
223 cache_health_threshold
= g_conf().get_val
<double>("mds_health_cache_threshold");
224 if (changed
.count("mds_cache_mid"))
225 lru
.lru_set_midpoint(g_conf().get_val
<double>("mds_cache_mid"));
226 if (changed
.count("mds_cache_trim_decay_rate")) {
227 trim_counter
= DecayCounter(g_conf().get_val
<double>("mds_cache_trim_decay_rate"));
230 migrator
->handle_conf_change(changed
, mdsmap
);
231 mds
->balancer
->handle_conf_change(changed
, mdsmap
);
234 void MDCache::log_stat()
236 mds
->logger
->set(l_mds_inode_max
, cache_inode_limit
? : INT_MAX
);
237 mds
->logger
->set(l_mds_inodes
, lru
.lru_get_size());
238 mds
->logger
->set(l_mds_inodes_pinned
, lru
.lru_get_num_pinned());
239 mds
->logger
->set(l_mds_inodes_top
, lru
.lru_get_top());
240 mds
->logger
->set(l_mds_inodes_bottom
, lru
.lru_get_bot());
241 mds
->logger
->set(l_mds_inodes_pin_tail
, lru
.lru_get_pintail());
242 mds
->logger
->set(l_mds_inodes_with_caps
, num_inodes_with_caps
);
243 mds
->logger
->set(l_mds_caps
, Capability::count());
245 mds
->logger
->set(l_mds_root_rfiles
, root
->inode
.rstat
.rfiles
);
246 mds
->logger
->set(l_mds_root_rbytes
, root
->inode
.rstat
.rbytes
);
247 mds
->logger
->set(l_mds_root_rsnaps
, root
->inode
.rstat
.rsnaps
);
254 bool MDCache::shutdown()
257 std::scoped_lock
lock(upkeep_mutex
);
258 upkeep_trim_shutdown
= true;
259 upkeep_cvar
.notify_one();
261 if (lru
.lru_get_size() > 0) {
262 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl
;
271 // ====================================================================
272 // some inode functions
274 void MDCache::add_inode(CInode
*in
)
276 // add to lru, inode map
277 if (in
->last
== CEPH_NOSNAP
) {
278 auto &p
= inode_map
[in
->ino()];
279 ceph_assert(!p
); // should be no dup inos!
282 auto &p
= snap_inode_map
[in
->vino()];
283 ceph_assert(!p
); // should be no dup inos!
287 if (in
->ino() < MDS_INO_SYSTEM_BASE
) {
288 if (in
->ino() == MDS_INO_ROOT
)
290 else if (in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
292 else if (in
->is_stray()) {
293 if (MDS_INO_STRAY_OWNER(in
->ino()) == mds
->get_nodeid()) {
294 strays
[MDS_INO_STRAY_INDEX(in
->ino())] = in
;
298 base_inodes
.insert(in
);
301 if (cache_toofull()) {
302 exceeded_size_limit
= true;
306 void MDCache::remove_inode(CInode
*o
)
308 dout(14) << "remove_inode " << *o
<< dendl
;
310 if (o
->get_parent_dn()) {
311 // FIXME: multiple parents?
312 CDentry
*dn
= o
->get_parent_dn();
313 ceph_assert(!dn
->is_dirty());
314 dn
->dir
->unlink_inode(dn
); // leave dentry ... FIXME?
319 if (o
->is_dirty_parent())
320 o
->clear_dirty_parent();
322 o
->clear_scatter_dirty();
324 o
->item_open_file
.remove_myself();
326 if (o
->state_test(CInode::STATE_QUEUEDEXPORTPIN
))
327 export_pin_queue
.erase(o
);
329 if (o
->state_test(CInode::STATE_DELAYEDEXPORTPIN
))
330 export_pin_delayed_queue
.erase(o
);
332 // remove from inode map
333 if (o
->last
== CEPH_NOSNAP
) {
334 inode_map
.erase(o
->ino());
336 o
->item_caps
.remove_myself();
337 snap_inode_map
.erase(o
->vino());
340 if (o
->ino() < MDS_INO_SYSTEM_BASE
) {
341 if (o
== root
) root
= 0;
342 if (o
== myin
) myin
= 0;
344 if (MDS_INO_STRAY_OWNER(o
->ino()) == mds
->get_nodeid()) {
345 strays
[MDS_INO_STRAY_INDEX(o
->ino())] = 0;
349 base_inodes
.erase(o
);
353 ceph_assert(o
->get_num_ref() == 0);
357 file_layout_t
MDCache::gen_default_file_layout(const MDSMap
&mdsmap
)
359 file_layout_t result
= file_layout_t::get_default();
360 result
.pool_id
= mdsmap
.get_first_data_pool();
364 file_layout_t
MDCache::gen_default_log_layout(const MDSMap
&mdsmap
)
366 file_layout_t result
= file_layout_t::get_default();
367 result
.pool_id
= mdsmap
.get_metadata_pool();
368 if (g_conf()->mds_log_segment_size
> 0) {
369 result
.object_size
= g_conf()->mds_log_segment_size
;
370 result
.stripe_unit
= g_conf()->mds_log_segment_size
;
375 void MDCache::init_layouts()
377 default_file_layout
= gen_default_file_layout(*(mds
->mdsmap
));
378 default_log_layout
= gen_default_log_layout(*(mds
->mdsmap
));
381 void MDCache::create_unlinked_system_inode(CInode
*in
, inodeno_t ino
,
385 in
->inode
.version
= 1;
386 in
->inode
.xattr_version
= 1;
387 in
->inode
.mode
= 0500 | mode
;
391 in
->inode
.btime
= ceph_clock_now();
393 in
->inode
.truncate_size
= -1ull;
394 in
->inode
.change_attr
= 0;
395 in
->inode
.export_pin
= MDS_RANK_NONE
;
397 // FIPS zeroization audit 20191117: this memset is not security related.
398 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
399 if (in
->inode
.is_dir()) {
400 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
401 in
->inode
.rstat
.rsubdirs
= 1; /* itself */
402 in
->inode
.rstat
.rctime
= in
->inode
.ctime
;
404 in
->inode
.layout
= default_file_layout
;
405 ++in
->inode
.rstat
.rfiles
;
407 in
->inode
.accounted_rstat
= in
->inode
.rstat
;
411 in
->inode_auth
= mds_authority_t(mds
->get_nodeid(), CDIR_AUTH_UNKNOWN
);
413 in
->inode_auth
= mds_authority_t(mds_rank_t(in
->ino() - MDS_INO_MDSDIR_OFFSET
), CDIR_AUTH_UNKNOWN
);
414 in
->open_snaprealm(); // empty snaprealm
415 ceph_assert(!in
->snaprealm
->parent
); // created its own
416 in
->snaprealm
->srnode
.seq
= 1;
420 CInode
*MDCache::create_system_inode(inodeno_t ino
, int mode
)
422 dout(0) << "creating system inode with ino:" << ino
<< dendl
;
423 CInode
*in
= new CInode(this);
424 create_unlinked_system_inode(in
, ino
, mode
);
429 CInode
*MDCache::create_root_inode()
431 CInode
*i
= create_system_inode(MDS_INO_ROOT
, S_IFDIR
|0755);
432 i
->inode
.uid
= g_conf()->mds_root_ino_uid
;
433 i
->inode
.gid
= g_conf()->mds_root_ino_gid
;
434 i
->inode
.layout
= default_file_layout
;
435 i
->inode
.layout
.pool_id
= mds
->mdsmap
->get_first_data_pool();
439 void MDCache::create_empty_hierarchy(MDSGather
*gather
)
442 CInode
*root
= create_root_inode();
444 // force empty root dir
445 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
446 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
447 rootdir
->dir_rep
= CDir::REP_ALL
; //NONE;
449 ceph_assert(rootdir
->fnode
.accounted_fragstat
== rootdir
->fnode
.fragstat
);
450 ceph_assert(rootdir
->fnode
.fragstat
== root
->inode
.dirstat
);
451 ceph_assert(rootdir
->fnode
.accounted_rstat
== rootdir
->fnode
.rstat
);
452 /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
453 * assume version 0 is stale/invalid.
456 rootdir
->mark_complete();
457 rootdir
->mark_dirty(rootdir
->pre_dirty(), mds
->mdlog
->get_current_segment());
458 rootdir
->commit(0, gather
->new_sub());
461 root
->mark_dirty(root
->pre_dirty(), mds
->mdlog
->get_current_segment());
462 root
->mark_dirty_parent(mds
->mdlog
->get_current_segment(), true);
463 root
->flush(gather
->new_sub());
466 void MDCache::create_mydir_hierarchy(MDSGather
*gather
)
469 CInode
*my
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
);
471 CDir
*mydir
= my
->get_or_open_dirfrag(this, frag_t());
472 adjust_subtree_auth(mydir
, mds
->get_nodeid());
474 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
477 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
478 CInode
*stray
= create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
);
479 CDir
*straydir
= stray
->get_or_open_dirfrag(this, frag_t());
481 name
<< "stray" << i
;
482 CDentry
*sdn
= mydir
->add_primary_dentry(name
.str(), stray
);
483 sdn
->_mark_dirty(mds
->mdlog
->get_current_segment());
485 stray
->inode
.dirstat
= straydir
->fnode
.fragstat
;
487 mydir
->fnode
.rstat
.add(stray
->inode
.rstat
);
488 mydir
->fnode
.fragstat
.nsubdirs
++;
490 straydir
->mark_complete();
491 straydir
->mark_dirty(straydir
->pre_dirty(), ls
);
492 straydir
->commit(0, gather
->new_sub());
493 stray
->mark_dirty_parent(ls
, true);
494 stray
->store_backtrace(gather
->new_sub());
497 mydir
->fnode
.accounted_fragstat
= mydir
->fnode
.fragstat
;
498 mydir
->fnode
.accounted_rstat
= mydir
->fnode
.rstat
;
500 myin
->inode
.dirstat
= mydir
->fnode
.fragstat
;
501 myin
->inode
.rstat
= mydir
->fnode
.rstat
;
502 ++myin
->inode
.rstat
.rsubdirs
;
503 myin
->inode
.accounted_rstat
= myin
->inode
.rstat
;
505 mydir
->mark_complete();
506 mydir
->mark_dirty(mydir
->pre_dirty(), ls
);
507 mydir
->commit(0, gather
->new_sub());
509 myin
->store(gather
->new_sub());
512 struct C_MDC_CreateSystemFile
: public MDCacheLogContext
{
517 C_MDC_CreateSystemFile(MDCache
*c
, MutationRef
& mu
, CDentry
*d
, version_t v
, MDSContext
*f
) :
518 MDCacheLogContext(c
), mut(mu
), dn(d
), dpv(v
), fin(f
) {}
519 void finish(int r
) override
{
520 mdcache
->_create_system_file_finish(mut
, dn
, dpv
, fin
);
524 void MDCache::_create_system_file(CDir
*dir
, std::string_view name
, CInode
*in
, MDSContext
*fin
)
526 dout(10) << "_create_system_file " << name
<< " in " << *dir
<< dendl
;
527 CDentry
*dn
= dir
->add_null_dentry(name
);
529 dn
->push_projected_linkage(in
);
530 version_t dpv
= dn
->pre_dirty();
533 if (in
->inode
.is_dir()) {
534 in
->inode
.rstat
.rsubdirs
= 1;
536 mdir
= in
->get_or_open_dirfrag(this, frag_t());
537 mdir
->mark_complete();
540 in
->inode
.rstat
.rfiles
= 1;
541 in
->inode
.version
= dn
->pre_dirty();
543 SnapRealm
*realm
= dir
->get_inode()->find_snaprealm();
544 dn
->first
= in
->first
= realm
->get_newest_seq() + 1;
546 MutationRef
mut(new MutationImpl());
548 // force some locks. hacky.
549 mds
->locker
->wrlock_force(&dir
->inode
->filelock
, mut
);
550 mds
->locker
->wrlock_force(&dir
->inode
->nestlock
, mut
);
552 mut
->ls
= mds
->mdlog
->get_current_segment();
553 EUpdate
*le
= new EUpdate(mds
->mdlog
, "create system file");
554 mds
->mdlog
->start_entry(le
);
556 if (!in
->is_mdsdir()) {
557 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
558 le
->metablob
.add_primary_dentry(dn
, in
, true);
560 predirty_journal_parents(mut
, &le
->metablob
, in
, dir
, PREDIRTY_DIR
, 1);
561 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
562 dn
->push_projected_linkage(in
->ino(), in
->d_type());
563 le
->metablob
.add_remote_dentry(dn
, true, in
->ino(), in
->d_type());
564 le
->metablob
.add_root(true, in
);
567 le
->metablob
.add_new_dir(mdir
); // dirty AND complete AND new
569 mds
->mdlog
->submit_entry(le
, new C_MDC_CreateSystemFile(this, mut
, dn
, dpv
, fin
));
573 void MDCache::_create_system_file_finish(MutationRef
& mut
, CDentry
*dn
, version_t dpv
, MDSContext
*fin
)
575 dout(10) << "_create_system_file_finish " << *dn
<< dendl
;
577 dn
->pop_projected_linkage();
578 dn
->mark_dirty(dpv
, mut
->ls
);
580 CInode
*in
= dn
->get_linkage()->get_inode();
582 in
->mark_dirty(in
->inode
.version
+ 1, mut
->ls
);
584 if (in
->inode
.is_dir()) {
585 CDir
*dir
= in
->get_dirfrag(frag_t());
587 dir
->mark_dirty(1, mut
->ls
);
588 dir
->mark_new(mut
->ls
);
592 mds
->locker
->drop_locks(mut
.get());
597 //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
598 //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
603 struct C_MDS_RetryOpenRoot
: public MDSInternalContext
{
605 explicit C_MDS_RetryOpenRoot(MDCache
*c
) : MDSInternalContext(c
->mds
), cache(c
) {}
606 void finish(int r
) override
{
608 // If we can't open root, something disastrous has happened: mark
609 // this rank damaged for operator intervention. Note that
610 // it is not okay to call suicide() here because we are in
611 // a Finisher callback.
612 cache
->mds
->damaged();
613 ceph_abort(); // damaged should never return
620 void MDCache::open_root_inode(MDSContext
*c
)
622 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
624 in
= create_system_inode(MDS_INO_ROOT
, S_IFDIR
|0755); // initially inaccurate!
627 discover_base_ino(MDS_INO_ROOT
, c
, mds
->mdsmap
->get_root());
631 void MDCache::open_mydir_inode(MDSContext
*c
)
633 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
637 void MDCache::open_mydir_frag(MDSContext
*c
)
640 new MDSInternalContextWrapper(mds
,
641 new FunctionContext([this, c
](int r
) {
646 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
648 adjust_subtree_auth(mydir
, mds
->get_nodeid());
655 void MDCache::open_root()
657 dout(10) << "open_root" << dendl
;
660 open_root_inode(new C_MDS_RetryOpenRoot(this));
663 if (mds
->get_nodeid() == mds
->mdsmap
->get_root()) {
664 ceph_assert(root
->is_auth());
665 CDir
*rootdir
= root
->get_or_open_dirfrag(this, frag_t());
666 ceph_assert(rootdir
);
667 if (!rootdir
->is_subtree_root())
668 adjust_subtree_auth(rootdir
, mds
->get_nodeid());
669 if (!rootdir
->is_complete()) {
670 rootdir
->fetch(new C_MDS_RetryOpenRoot(this));
674 ceph_assert(!root
->is_auth());
675 CDir
*rootdir
= root
->get_dirfrag(frag_t());
677 open_remote_dirfrag(root
, frag_t(), new C_MDS_RetryOpenRoot(this));
683 CInode
*in
= create_system_inode(MDS_INO_MDSDIR(mds
->get_nodeid()), S_IFDIR
|0755); // initially inaccurate!
684 in
->fetch(new C_MDS_RetryOpenRoot(this));
687 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
689 adjust_subtree_auth(mydir
, mds
->get_nodeid());
694 void MDCache::populate_mydir()
697 CDir
*mydir
= myin
->get_or_open_dirfrag(this, frag_t());
700 dout(10) << "populate_mydir " << *mydir
<< dendl
;
702 if (!mydir
->is_complete()) {
703 mydir
->fetch(new C_MDS_RetryOpenRoot(this));
707 if (mydir
->get_version() == 0 && mydir
->state_test(CDir::STATE_BADFRAG
)) {
708 // A missing dirfrag, we will recreate it. Before that, we must dirty
709 // it before dirtying any of the strays we create within it.
710 mds
->clog
->warn() << "fragment " << mydir
->dirfrag() << " was unreadable, "
712 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
713 mydir
->state_clear(CDir::STATE_BADFRAG
);
714 mydir
->mark_complete();
715 mydir
->mark_dirty(mydir
->pre_dirty(), ls
);
718 // open or create stray
719 uint64_t num_strays
= 0;
720 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
722 name
<< "stray" << i
;
723 CDentry
*straydn
= mydir
->lookup(name
.str());
725 // allow for older fs's with stray instead of stray0
726 if (straydn
== NULL
&& i
== 0)
727 straydn
= mydir
->lookup("stray");
729 if (!straydn
|| !straydn
->get_linkage()->get_inode()) {
730 _create_system_file(mydir
, name
.str().c_str(), create_system_inode(MDS_INO_STRAY(mds
->get_nodeid(), i
), S_IFDIR
),
731 new C_MDS_RetryOpenRoot(this));
734 ceph_assert(straydn
);
735 ceph_assert(strays
[i
]);
736 // we make multiple passes through this method; make sure we only pin each stray once.
737 if (!strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
738 strays
[i
]->get(CInode::PIN_STRAY
);
739 strays
[i
]->state_set(CInode::STATE_STRAYPINNED
);
740 strays
[i
]->get_stickydirs();
742 dout(20) << " stray num " << i
<< " is " << *strays
[i
] << dendl
;
746 strays
[i
]->dirfragtree
.get_leaves(leaves
);
747 for (const auto& leaf
: leaves
) {
748 CDir
*dir
= strays
[i
]->get_dirfrag(leaf
);
750 dir
= strays
[i
]->get_or_open_dirfrag(this, leaf
);
753 // DamageTable applies special handling to strays: it will
754 // have damaged() us out if one is damaged.
755 ceph_assert(!dir
->state_test(CDir::STATE_BADFRAG
));
757 if (dir
->get_version() == 0) {
758 dir
->fetch(new C_MDS_RetryOpenRoot(this));
762 if (dir
->get_frag_size() > 0)
763 num_strays
+= dir
->get_frag_size();
768 dout(10) << "populate_mydir done" << dendl
;
771 mds
->queue_waiters(waiting_for_open
);
773 stray_manager
.set_num_strays(num_strays
);
774 stray_manager
.activate();
779 void MDCache::open_foreign_mdsdir(inodeno_t ino
, MDSContext
*fin
)
781 discover_base_ino(ino
, fin
, mds_rank_t(ino
& (MAX_MDS
-1)));
784 CDir
*MDCache::get_stray_dir(CInode
*in
)
787 in
->name_stray_dentry(straydname
);
789 CInode
*strayi
= get_stray();
791 frag_t fg
= strayi
->pick_dirfrag(straydname
);
792 CDir
*straydir
= strayi
->get_dirfrag(fg
);
793 ceph_assert(straydir
);
797 CDentry
*MDCache::get_or_create_stray_dentry(CInode
*in
)
799 CDir
*straydir
= get_stray_dir(in
);
801 in
->name_stray_dentry(straydname
);
802 CDentry
*straydn
= straydir
->lookup(straydname
);
804 straydn
= straydir
->add_null_dentry(straydname
);
807 ceph_assert(straydn
->get_projected_linkage()->is_null());
810 straydn
->state_set(CDentry::STATE_STRAY
);
816 MDSCacheObject
*MDCache::get_object(const MDSCacheObjectInfo
&info
)
820 return get_inode(info
.ino
, info
.snapid
);
823 CDir
*dir
= get_dirfrag(info
.dirfrag
);
826 if (info
.dname
.length())
827 return dir
->lookup(info
.dname
, info
.snapid
);
835 // ====================================================================
836 // subtree management
839 * adjust the dir_auth of a subtree.
840 * merge with parent and/or child subtrees, if is it appropriate.
841 * merge can ONLY happen if both parent and child have unambiguous auth.
843 void MDCache::adjust_subtree_auth(CDir
*dir
, mds_authority_t auth
, bool adjust_pop
)
845 dout(7) << "adjust_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
846 << " on " << *dir
<< dendl
;
851 if (dir
->inode
->is_base()) {
852 root
= dir
; // bootstrap hack.
853 if (subtrees
.count(root
) == 0) {
855 root
->get(CDir::PIN_SUBTREE
);
858 root
= get_subtree_root(dir
); // subtree root
861 ceph_assert(subtrees
.count(root
));
862 dout(7) << " current root is " << *root
<< dendl
;
865 // i am already a subtree.
866 dir
->set_dir_auth(auth
);
868 // i am a new subtree.
869 dout(10) << " new subtree at " << *dir
<< dendl
;
870 ceph_assert(subtrees
.count(dir
) == 0);
871 subtrees
[dir
]; // create empty subtree bounds list for me.
872 dir
->get(CDir::PIN_SUBTREE
);
875 dir
->set_dir_auth(auth
);
877 // move items nested beneath me, under me.
878 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
879 while (p
!= subtrees
[root
].end()) {
880 set
<CDir
*>::iterator next
= p
;
882 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
884 dout(10) << " claiming child bound " << **p
<< dendl
;
885 subtrees
[dir
].insert(*p
);
886 subtrees
[root
].erase(p
);
891 // i am a bound of the parent subtree.
892 subtrees
[root
].insert(dir
);
894 // i am now the subtree root.
897 // adjust recursive pop counters
898 if (adjust_pop
&& dir
->is_auth()) {
899 CDir
*p
= dir
->get_parent_dir();
901 p
->pop_auth_subtree
.sub(dir
->pop_auth_subtree
);
902 if (p
->is_subtree_root()) break;
903 p
= p
->inode
->get_parent_dir();
912 void MDCache::try_subtree_merge(CDir
*dir
)
914 dout(7) << "try_subtree_merge " << *dir
<< dendl
;
915 // record my old bounds
916 auto oldbounds
= subtrees
.at(dir
);
918 set
<CInode
*> to_eval
;
919 // try merge at my root
920 try_subtree_merge_at(dir
, &to_eval
);
922 // try merge at my old bounds
923 for (auto bound
: oldbounds
)
924 try_subtree_merge_at(bound
, &to_eval
);
926 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
927 for(auto in
: to_eval
)
928 eval_subtree_root(in
);
932 class C_MDC_SubtreeMergeWB
: public MDCacheLogContext
{
936 C_MDC_SubtreeMergeWB(MDCache
*mdc
, CInode
*i
, MutationRef
& m
) : MDCacheLogContext(mdc
), in(i
), mut(m
) {}
937 void finish(int r
) override
{
938 mdcache
->subtree_merge_writebehind_finish(in
, mut
);
942 void MDCache::try_subtree_merge_at(CDir
*dir
, set
<CInode
*> *to_eval
, bool adjust_pop
)
944 dout(10) << "try_subtree_merge_at " << *dir
<< dendl
;
946 if (dir
->dir_auth
.second
!= CDIR_AUTH_UNKNOWN
||
947 dir
->state_test(CDir::STATE_EXPORTBOUND
) ||
948 dir
->state_test(CDir::STATE_AUXSUBTREE
))
951 auto it
= subtrees
.find(dir
);
952 ceph_assert(it
!= subtrees
.end());
954 // merge with parent?
956 if (!dir
->inode
->is_base())
957 parent
= get_subtree_root(dir
->get_parent_dir());
959 if (parent
!= dir
&& // we have a parent,
960 parent
->dir_auth
== dir
->dir_auth
) { // auth matches,
961 // merge with parent.
962 dout(10) << " subtree merge at " << *dir
<< dendl
;
963 dir
->set_dir_auth(CDIR_AUTH_DEFAULT
);
965 // move our bounds under the parent
966 subtrees
[parent
].insert(it
->second
.begin(), it
->second
.end());
968 // we are no longer a subtree or bound
969 dir
->put(CDir::PIN_SUBTREE
);
971 subtrees
[parent
].erase(dir
);
973 // adjust popularity?
974 if (adjust_pop
&& dir
->is_auth()) {
976 CDir
*p
= dir
->get_parent_dir();
978 p
->pop_auth_subtree
.add(dir
->pop_auth_subtree
);
979 p
->pop_lru_subdirs
.push_front(&cur
->get_inode()->item_pop_lru
);
980 if (p
->is_subtree_root()) break;
982 p
= p
->inode
->get_parent_dir();
986 if (to_eval
&& dir
->get_inode()->is_auth())
987 to_eval
->insert(dir
->get_inode());
993 void MDCache::subtree_merge_writebehind_finish(CInode
*in
, MutationRef
& mut
)
995 dout(10) << "subtree_merge_writebehind_finish on " << in
<< dendl
;
996 in
->pop_and_dirty_projected_inode(mut
->ls
);
999 mds
->locker
->drop_locks(mut
.get());
1002 in
->auth_unpin(this);
1005 void MDCache::eval_subtree_root(CInode
*diri
)
1007 // evaluate subtree inode filelock?
1008 // (we should scatter the filelock on subtree bounds)
1009 ceph_assert(diri
->is_auth());
1010 mds
->locker
->try_eval(diri
, CEPH_LOCK_IFILE
| CEPH_LOCK_INEST
);
1014 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const set
<CDir
*>& bounds
, mds_authority_t auth
)
1016 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1018 << " bounds " << bounds
1024 if (dir
->ino() == MDS_INO_ROOT
) {
1025 root
= dir
; // bootstrap hack.
1026 if (subtrees
.count(root
) == 0) {
1028 root
->get(CDir::PIN_SUBTREE
);
1031 root
= get_subtree_root(dir
); // subtree root
1034 ceph_assert(subtrees
.count(root
));
1035 dout(7) << " current root is " << *root
<< dendl
;
1037 mds_authority_t oldauth
= dir
->authority();
1040 // i am already a subtree.
1041 dir
->set_dir_auth(auth
);
1043 // i am a new subtree.
1044 dout(10) << " new subtree at " << *dir
<< dendl
;
1045 ceph_assert(subtrees
.count(dir
) == 0);
1046 subtrees
[dir
]; // create empty subtree bounds list for me.
1047 dir
->get(CDir::PIN_SUBTREE
);
1050 dir
->set_dir_auth(auth
);
1052 // move items nested beneath me, under me.
1053 set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1054 while (p
!= subtrees
[root
].end()) {
1055 set
<CDir
*>::iterator next
= p
;
1057 if (get_subtree_root((*p
)->get_parent_dir()) == dir
) {
1059 dout(10) << " claiming child bound " << **p
<< dendl
;
1060 subtrees
[dir
].insert(*p
);
1061 subtrees
[root
].erase(p
);
1066 // i am a bound of the parent subtree.
1067 subtrees
[root
].insert(dir
);
1069 // i am now the subtree root.
1073 set
<CInode
*> to_eval
;
1075 // verify/adjust bounds.
1076 // - these may be new, or
1077 // - beneath existing ambiguous bounds (which will be collapsed),
1078 // - but NOT beneath unambiguous bounds.
1079 for (const auto& bound
: bounds
) {
1081 if (subtrees
[dir
].count(bound
) == 0) {
1082 if (get_subtree_root(bound
) == dir
) {
1083 dout(10) << " new bound " << *bound
<< ", adjusting auth back to old " << oldauth
<< dendl
;
1084 adjust_subtree_auth(bound
, oldauth
); // otherwise, adjust at bound.
1087 dout(10) << " want bound " << *bound
<< dendl
;
1088 CDir
*t
= get_subtree_root(bound
->get_parent_dir());
1089 if (subtrees
[t
].count(bound
) == 0) {
1090 ceph_assert(t
!= dir
);
1091 dout(10) << " new bound " << *bound
<< dendl
;
1092 adjust_subtree_auth(bound
, t
->authority());
1094 // make sure it's nested beneath ambiguous subtree(s)
1096 while (subtrees
[dir
].count(t
) == 0)
1097 t
= get_subtree_root(t
->get_parent_dir());
1098 dout(10) << " swallowing intervening subtree at " << *t
<< dendl
;
1099 adjust_subtree_auth(t
, auth
);
1100 try_subtree_merge_at(t
, &to_eval
);
1101 t
= get_subtree_root(bound
->get_parent_dir());
1102 if (t
== dir
) break;
1107 dout(10) << " already have bound " << *bound
<< dendl
;
1110 // merge stray bounds?
1111 while (!subtrees
[dir
].empty()) {
1112 set
<CDir
*> copy
= subtrees
[dir
];
1113 for (set
<CDir
*>::iterator p
= copy
.begin(); p
!= copy
.end(); ++p
) {
1114 if (bounds
.count(*p
) == 0) {
1116 dout(10) << " swallowing extra subtree at " << *stray
<< dendl
;
1117 adjust_subtree_auth(stray
, auth
);
1118 try_subtree_merge_at(stray
, &to_eval
);
1121 // swallowing subtree may add new subtree bounds
1122 if (copy
== subtrees
[dir
])
1126 // bound should now match.
1127 verify_subtree_bounds(dir
, bounds
);
1131 if (!(mds
->is_any_replay() || mds
->is_resolve())) {
1132 for(auto in
: to_eval
)
1133 eval_subtree_root(in
);
1139 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1140 * fragmentation as necessary to get an equivalent bounding set. That is, only
1141 * split if one of our frags spans the provided bounding set. Never merge.
1143 void MDCache::get_force_dirfrag_bound_set(const vector
<dirfrag_t
>& dfs
, set
<CDir
*>& bounds
)
1145 dout(10) << "get_force_dirfrag_bound_set " << dfs
<< dendl
;
1148 map
<inodeno_t
, fragset_t
> byino
;
1149 for (auto& frag
: dfs
) {
1150 byino
[frag
.ino
].insert(frag
.frag
);
1152 dout(10) << " by ino: " << byino
<< dendl
;
1154 for (map
<inodeno_t
,fragset_t
>::iterator p
= byino
.begin(); p
!= byino
.end(); ++p
) {
1155 CInode
*diri
= get_inode(p
->first
);
1158 dout(10) << " checking fragset " << p
->second
.get() << " on " << *diri
<< dendl
;
1161 for (set
<frag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
1162 tmpdft
.force_to_leaf(g_ceph_context
, *q
);
1164 for (const auto& fg
: p
->second
) {
1166 diri
->dirfragtree
.get_leaves_under(fg
, leaves
);
1167 if (leaves
.empty()) {
1169 frag_t approx_fg
= diri
->dirfragtree
[fg
.value()];
1170 frag_vec_t approx_leaves
;
1171 tmpdft
.get_leaves_under(approx_fg
, approx_leaves
);
1172 for (const auto& leaf
: approx_leaves
) {
1173 if (p
->second
.get().count(leaf
) == 0) {
1174 // not bound, so the resolve message is from auth MDS of the dirfrag
1175 force_dir_fragment(diri
, leaf
);
1180 leaves
.push_back(approx_fg
);
1182 diri
->dirfragtree
.get_leaves_under(fg
, leaves
);
1184 dout(10) << " frag " << fg
<< " contains " << leaves
<< dendl
;
1185 for (const auto& leaf
: leaves
) {
1186 CDir
*dir
= diri
->get_dirfrag(leaf
);
1194 void MDCache::adjust_bounded_subtree_auth(CDir
*dir
, const vector
<dirfrag_t
>& bound_dfs
, const mds_authority_t
&auth
)
1196 dout(7) << "adjust_bounded_subtree_auth " << dir
->get_dir_auth() << " -> " << auth
1197 << " on " << *dir
<< " bound_dfs " << bound_dfs
<< dendl
;
1200 get_force_dirfrag_bound_set(bound_dfs
, bounds
);
1201 adjust_bounded_subtree_auth(dir
, bounds
, auth
);
// Map a list of dirfrag_t to the in-cache CDir* objects that cover them,
// accumulating into 'result'. Groups by inode to avoid repeated lookups;
// inodes not in cache are skipped (skip logic is on lines missing from this
// extraction — verify upstream).
1204 void MDCache::map_dirfrag_set(const list
<dirfrag_t
>& dfs
, set
<CDir
*>& result
)
1206 dout(10) << "map_dirfrag_set " << dfs
<< dendl
;
// group by inode
1209 map
<inodeno_t
, fragset_t
> ino_fragset
;
1210 for (const auto &df
: dfs
) {
1211 ino_fragset
[df
.ino
].insert(df
.frag
);
// Resolve each inode's frags to dirfragtree leaves, then to CDirs.
1215 for (map
<inodeno_t
, fragset_t
>::iterator p
= ino_fragset
.begin();
1216 p
!= ino_fragset
.end();
1218 CInode
*in
= get_inode(p
->first
);
1223 for (const auto& fg
: p
->second
) {
1224 in
->dirfragtree
.get_leaves_under(fg
, fgs
);
1227 dout(15) << "map_dirfrag_set " << p
->second
<< " -> " << fgs
1228 << " on " << *in
<< dendl
;
// Insertion of non-null dirs into 'result' presumably follows on lines
// missing from this extraction.
1230 for (const auto& fg
: fgs
) {
1231 CDir
*dir
= in
->get_dirfrag(fg
);
// Walk up the (committed) parent chain from 'dir' until a subtree root is
// reached, and return it. Uses get_parent_dir(), i.e. the current — not
// projected — parent linkage (contrast get_projected_subtree_root()).
1240 CDir
*MDCache::get_subtree_root(CDir
*dir
)
1242 // find the underlying dir that delegates (or is about to delegate) auth
1244 if (dir
->is_subtree_root())
1246 dir
= dir
->get_inode()->get_parent_dir();
// Same walk as get_subtree_root(), but follows the *projected* parent linkage
// (get_projected_parent_dir), so pending renames are taken into account.
1252 CDir
*MDCache::get_projected_subtree_root(CDir
*dir
)
1254 // find the underlying dir that delegates (or is about to delegate) auth
1256 if (dir
->is_subtree_root())
1258 dir
= dir
->get_inode()->get_projected_parent_dir();
// Remove 'dir' from the subtree map. Preconditions (asserted): 'dir' is a
// registered subtree root and has no remaining bounds. Also drops the
// PIN_SUBTREE ref and, if 'dir' has a parent, deregisters it as a bound of
// its enclosing subtree.
1264 void MDCache::remove_subtree(CDir
*dir
)
1266 dout(10) << "remove_subtree " << *dir
<< dendl
;
1267 ceph_assert(subtrees
.count(dir
));
1268 ceph_assert(subtrees
[dir
].empty());
1269 subtrees
.erase(dir
);
1270 dir
->put(CDir::PIN_SUBTREE
);
1271 if (dir
->get_parent_dir()) {
// Deregister from the enclosing subtree's bound set.
1272 CDir
*p
= get_subtree_root(dir
->get_parent_dir());
1273 ceph_assert(subtrees
[p
].count(dir
));
1274 subtrees
[p
].erase(dir
);
// Copy the registered bound set of subtree root 'dir' into 'bounds'.
// Precondition (asserted): 'dir' is in the subtree map.
1278 void MDCache::get_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1280 ceph_assert(subtrees
.count(dir
));
1281 bounds
= subtrees
[dir
];
// Compute the bounds 'dir' WOULD have if it were a subtree root. If it
// already is one, this is just get_subtree_bounds(); otherwise the bounds of
// the enclosing subtree root that fall underneath 'dir' are selected (the
// ancestor-walk test body is on lines missing from this extraction —
// verify against upstream).
1284 void MDCache::get_wouldbe_subtree_bounds(CDir
*dir
, set
<CDir
*>& bounds
)
1286 if (subtrees
.count(dir
)) {
1287 // just copy them, dir is a subtree.
1288 get_subtree_bounds(dir
, bounds
);
1291 CDir
*root
= get_subtree_root(dir
);
1292 for (set
<CDir
*>::iterator p
= subtrees
[root
].begin();
1293 p
!= subtrees
[root
].end();
// walk each bound's ancestry toward the root; 't' presumably starts at the
// bound and a match against 'dir' adds it to 'bounds' (missing lines).
1297 t
= t
->get_parent_dir();
// Debug-only check that the caller-supplied bound set exactly matches the
// registered bounds of subtree root 'dir'. On mismatch, logs each missing
// and extra bound at level 0 and then asserts.
1308 void MDCache::verify_subtree_bounds(CDir
*dir
, const set
<CDir
*>& bounds
)
1310 // for debugging only.
1311 ceph_assert(subtrees
.count(dir
));
1312 if (bounds
!= subtrees
[dir
]) {
1313 dout(0) << "verify_subtree_bounds failed" << dendl
;
// 'b' starts as a copy of the expected bounds; matched entries are
// presumably erased (on missing lines) so leftovers are "extra".
1314 set
<CDir
*> b
= bounds
;
1315 for (auto &cd
: subtrees
[dir
]) {
1316 if (bounds
.count(cd
)) {
1320 dout(0) << " missing bound " << *cd
<< dendl
;
1322 for (const auto &cd
: b
)
1323 dout(0) << " extra bound " << *cd
<< dendl
;
1325 ceph_assert(bounds
== subtrees
[dir
]);
// Debug-only overload taking dirfrag_t bounds: every bound dirfrag present in
// cache must be registered as a bound of 'dir'. Counts violations in 'failed'
// (declared on a line missing from this extraction) and asserts none occurred.
1328 void MDCache::verify_subtree_bounds(CDir
*dir
, const list
<dirfrag_t
>& bounds
)
1330 // for debugging only.
1331 ceph_assert(subtrees
.count(dir
));
1333 // make sure that any bounds i do have are properly noted as such.
1335 for (const auto &fg
: bounds
) {
1336 CDir
*bd
= get_dirfrag(fg
);
1338 if (subtrees
[dir
].count(bd
) == 0) {
1339 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd
<< dendl
;
1343 ceph_assert(failed
== 0);
// Record a pending (projected) rename of 'diri' from 'olddir' to 'newdir'.
// Entries are queued per-inode and consumed in order by
// adjust_subtree_after_rename() once the rename is applied.
1346 void MDCache::project_subtree_rename(CInode
*diri
, CDir
*olddir
, CDir
*newdir
)
1348 dout(10) << "project_subtree_rename " << *diri
<< " from " << *olddir
1349 << " to " << *newdir
<< dendl
;
1350 projected_subtree_renames
[diri
].push_back(pair
<CDir
*,CDir
*>(olddir
, newdir
));
// Fix up the subtree map after a rename of 'diri' out of 'olddir' has been
// applied: pops the matching projected_subtree_renames entry, re-parents any
// subtree roots under 'diri' to the new enclosing subtree, migrates bounds,
// and adjusts balancer popularity and freeze auth-pins. Several original
// lines (loop bodies, 'else' arms) are missing from this extraction; only
// comments are added here.
1353 void MDCache::adjust_subtree_after_rename(CInode
*diri
, CDir
*olddir
, bool pop
)
1355 dout(10) << "adjust_subtree_after_rename " << *diri
<< " from " << *olddir
<< dendl
;
1357 CDir
*newdir
= diri
->get_parent_dir();
// Pop the projected rename that this call is applying; it must be the
// oldest queued entry and must match olddir -> newdir.
1360 map
<CInode
*,list
<pair
<CDir
*,CDir
*> > >::iterator p
= projected_subtree_renames
.find(diri
);
1361 ceph_assert(p
!= projected_subtree_renames
.end());
1362 ceph_assert(!p
->second
.empty());
1363 ceph_assert(p
->second
.front().first
== olddir
);
1364 ceph_assert(p
->second
.front().second
== newdir
);
1365 p
->second
.pop_front();
1366 if (p
->second
.empty())
1367 projected_subtree_renames
.erase(p
);
1372 // adjust total auth pin of freezing subtree
1373 if (olddir
!= newdir
) {
1374 diri
->get_nested_dirfrags(dfls
);
1375 for (auto dir
: dfls
)
1376 olddir
->adjust_freeze_after_rename(dir
);
1381 // make sure subtree dirfrags are at the front of the list
1382 diri
->get_subtree_dirfrags(dfls
);
1383 diri
->get_nested_dirfrags(dfls
);
1384 for (auto dir
: dfls
) {
1385 dout(10) << "dirfrag " << *dir
<< dendl
;
1386 CDir
*oldparent
= get_subtree_root(olddir
);
1387 dout(10) << " old parent " << *oldparent
<< dendl
;
1388 CDir
*newparent
= get_subtree_root(newdir
);
1389 dout(10) << " new parent " << *newparent
<< dendl
;
1391 if (olddir
!= newdir
)
1392 mds
->balancer
->adjust_pop_for_rename(olddir
, dir
, false);
1394 if (oldparent
== newparent
) {
1395 dout(10) << "parent unchanged for " << *dir
<< " at " << *oldparent
<< dendl
;
1396 } else if (dir
->is_subtree_root()) {
1397 // children are fine. change parent.
1398 dout(10) << "moving " << *dir
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1399 ceph_assert(subtrees
[oldparent
].count(dir
));
1400 subtrees
[oldparent
].erase(dir
);
1401 ceph_assert(subtrees
.count(newparent
));
1402 subtrees
[newparent
].insert(dir
);
1403 // caller is responsible for 'eval diri'
1404 try_subtree_merge_at(dir
, NULL
, false);
1408 // see if any old bounds move to the new parent.
// 'bound' and 'tomove' are declared on lines missing from this extraction
// (presumably CDir* and list<CDir*> — verify upstream).
1410 for (set
<CDir
*>::iterator p
= subtrees
[oldparent
].begin();
1411 p
!= subtrees
[oldparent
].end();
1414 CDir
*broot
= get_subtree_root(bound
->get_parent_dir());
1415 if (broot
!= oldparent
) {
1416 ceph_assert(broot
== newparent
);
1417 tomove
.push_back(bound
);
1420 for (list
<CDir
*>::iterator p
= tomove
.begin(); p
!= tomove
.end(); ++p
) {
1422 dout(10) << "moving bound " << *bound
<< " from " << *oldparent
<< " to " << *newparent
<< dendl
;
1423 subtrees
[oldparent
].erase(bound
);
1424 subtrees
[newparent
].insert(bound
);
// If the enclosing subtree's authority changed, keep this dirfrag a
// subtree root with its old auth, then attempt a merge.
1428 if (oldparent
->authority() != newparent
->authority()) {
1429 adjust_subtree_auth(dir
, oldparent
->authority(), false);
1430 // caller is responsible for 'eval diri'
1431 try_subtree_merge_at(dir
, NULL
, false);
1435 if (olddir
!= newdir
)
1436 mds
->balancer
->adjust_pop_for_rename(newdir
, dir
, true);
1442 // ===================================
1443 // journal and snap/cow helpers
1447 * find first inode in cache that follows given snapid. otherwise, return current.
// Precondition (asserted): 'in' is the head inode (last == CEPH_NOSNAP).
// Looks up snap_inode_map for the first snapped inode of the same ino with
// last > follows; the return paths are on lines missing from this extraction.
1449 CInode
*MDCache::pick_inode_snap(CInode
*in
, snapid_t follows
)
1451 dout(10) << "pick_inode_snap follows " << follows
<< " on " << *in
<< dendl
;
1452 ceph_assert(in
->last
== CEPH_NOSNAP
);
1454 auto p
= snap_inode_map
.upper_bound(vinodeno_t(in
->ino(), follows
));
1455 if (p
!= snap_inode_map
.end() && p
->second
->ino() == in
->ino()) {
1456 dout(10) << "pick_inode_snap found " << *p
->second
<< dendl
;
1465 * note: i'm currently cheating wrt dirty and inode.version on cow
1466 * items. instead of doing a full dir predirty, i just take the
1467 * original item's version, and set the dirty flag (via
1468 * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
1469 * means a special case in the dir commit clean sweep assertions.
// Create a copy-on-write clone of 'in' covering [in->first, last]: snapshot
// the previous projected inode/xattrs/symlink, then migrate snapflush state
// and client caps bookkeeping from the live inode to the clone. Some lines
// (registration of oldin, the 'else' branch split, return) are missing from
// this extraction; only comments are added.
1472 CInode
*MDCache::cow_inode(CInode
*in
, snapid_t last
)
1474 ceph_assert(last
>= in
->first
);
// Clone from the *previous* projected values — the current projection
// belongs to the head inode going forward.
1476 CInode
*oldin
= new CInode(this, true, in
->first
, last
);
1477 oldin
->inode
= *in
->get_previous_projected_inode();
1478 oldin
->xattrs
= *in
->get_previous_projected_xattrs();
1479 oldin
->symlink
= in
->symlink
;
1480 oldin
->inode
.trim_client_ranges(last
);
1482 if (in
->first
< in
->oldest_snap
)
1483 in
->oldest_snap
= in
->first
;
1487 dout(10) << "cow_inode " << *in
<< " to " << *oldin
<< dendl
;
// Case: 'in' is itself already a snapped (non-head) inode — split its
// pending snapflush state between the clone and itself.
1490 if (in
->last
!= CEPH_NOSNAP
) {
1491 CInode
*head_in
= get_inode(in
->ino());
1492 ceph_assert(head_in
);
1493 auto ret
= head_in
->split_need_snapflush(oldin
, in
);
1495 oldin
->client_snap_caps
= in
->client_snap_caps
;
1496 if (!oldin
->client_snap_caps
.empty()) {
// Move the relevant locks into the snapflush gathering state on the clone.
1497 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1498 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1500 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1501 ceph_assert(lock
->is_stable());
1502 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1503 oldin
->auth_pin(lock
);
1505 lock
->get_wrlock(true);
// The old snapped inode no longer needs its own snapflush tracking.
1510 auto client_snap_caps
= std::move(in
->client_snap_caps
);
1511 in
->client_snap_caps
.clear();
1512 in
->item_open_file
.remove_myself();
1513 in
->item_caps
.remove_myself();
1515 if (!client_snap_caps
.empty()) {
// Release the gathering state on 'in' for locks with no remaining wrlocks,
// waking any waiters.
1516 MDSContext::vec finished
;
1517 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1518 SimpleLock
*lock
= in
->get_lock(cinode_lock_info
[i
].lock
);
1520 ceph_assert(lock
->get_state() == LOCK_SNAP_SYNC
); // gathering
1522 if (!lock
->get_num_wrlocks()) {
1523 lock
->set_state(LOCK_SYNC
);
1524 lock
->take_waiting(SimpleLock::WAIT_STABLE
|SimpleLock::WAIT_RD
, finished
);
1525 in
->auth_unpin(lock
);
1528 mds
->queue_waiters(finished
);
// Case: 'in' is the head inode — decide which clients need snapflushes for
// the newly created snapped inode.
1534 if (!in
->client_caps
.empty()) {
1535 const set
<snapid_t
>& snaps
= in
->find_snaprealm()->get_snaps();
1537 for (auto &p
: in
->client_caps
) {
1538 client_t client
= p
.first
;
1539 Capability
*cap
= &p
.second
;
1540 int issued
= cap
->need_snapflush() ? CEPH_CAP_ANY_WR
: cap
->issued();
1541 if ((issued
& CEPH_CAP_ANY_WR
) &&
1542 cap
->client_follows
< last
) {
1543 dout(10) << " client." << client
<< " cap " << ccap_string(issued
) << dendl
;
1544 oldin
->client_snap_caps
.insert(client
);
1545 cap
->client_follows
= last
;
1547 // we need snapflushes for any intervening snaps
1548 dout(10) << " snaps " << snaps
<< dendl
;
1549 for (auto q
= snaps
.lower_bound(oldin
->first
);
1550 q
!= snaps
.end() && *q
<= last
;
1552 in
->add_need_snapflush(oldin
, *q
, client
);
1555 dout(10) << " ignoring client." << client
<< " cap follows " << cap
->client_follows
<< dendl
;
1559 if (!oldin
->client_snap_caps
.empty()) {
// Same gathering-state setup as above, for the head-inode path.
1560 for (int i
= 0; i
< num_cinode_locks
; i
++) {
1561 SimpleLock
*lock
= oldin
->get_lock(cinode_lock_info
[i
].lock
);
1563 if (lock
->get_state() != LOCK_SNAP_SYNC
) {
1564 ceph_assert(lock
->is_stable());
1565 lock
->set_state(LOCK_SNAP_SYNC
); // gathering
1566 oldin
->auth_pin(lock
);
1568 lock
->get_wrlock(true);
// Journal a copy-on-write of dentry 'dn' (and its primary inode, if any) so
// that snapshots up to 'follows' retain the pre-change state. Handles three
// shapes: multiversion/frozen ("cow_head") inodes, primary dentries (clones
// the inode via cow_inode), and remote dentries. If 'pcow_inode' is non-null
// it receives the cloned inode. Early-return guards and some closing braces
// are on lines missing from this extraction; only comments are added.
1575 void MDCache::journal_cow_dentry(MutationImpl
*mut
, EMetaBlob
*metablob
,
1576 CDentry
*dn
, snapid_t follows
,
1577 CInode
**pcow_inode
, CDentry::linkage_t
*dnl
)
1580 dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl
;
1583 dout(10) << "journal_cow_dentry follows " << follows
<< " on " << *dn
<< dendl
;
1584 ceph_assert(dn
->is_auth());
1586 // nothing to cow on a null dentry, fix caller
1588 dnl
= dn
->get_projected_linkage();
1589 ceph_assert(!dnl
->is_null());
1591 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
// An ambiguous-auth (mid-migration) frozen inode is CoW'd via the head.
1592 bool cow_head
= false;
1593 if (in
&& in
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
1594 ceph_assert(in
->is_frozen_inode());
1597 if (in
&& (in
->is_multiversion() || cow_head
)) {
1598 // multiversion inode.
1599 SnapRealm
*realm
= NULL
;
// Rename-in-flight: dn is not the projected parent, so CoW relative to the
// *directory's* snaprealm and leave a remote dentry behind for old snaps.
1601 if (in
->get_projected_parent_dn() != dn
) {
1602 ceph_assert(follows
== CEPH_NOSNAP
);
1603 realm
= dn
->dir
->inode
->find_snaprealm();
1604 snapid_t dir_follows
= get_global_snaprealm()->get_newest_seq();
1605 ceph_assert(dir_follows
>= realm
->get_newest_seq());
1607 if (dir_follows
+1 > dn
->first
) {
1608 snapid_t oldfirst
= dn
->first
;
1609 dn
->first
= dir_follows
+1;
1610 if (realm
->has_snaps_in_range(oldfirst
, dir_follows
)) {
1611 CDentry
*olddn
= dn
->dir
->add_remote_dentry(dn
->get_name(), in
->ino(), in
->d_type(),
1612 oldfirst
, dir_follows
);
1614 dout(10) << " olddn " << *olddn
<< dendl
;
1615 metablob
->add_remote_dentry(olddn
, true);
1616 mut
->add_cow_dentry(olddn
);
1617 // FIXME: adjust link count here? hmm.
1619 if (dir_follows
+1 > in
->first
)
1620 in
->cow_old_inode(dir_follows
, cow_head
);
1624 follows
= dir_follows
;
1625 if (in
->snaprealm
) {
1626 realm
= in
->snaprealm
;
1627 ceph_assert(follows
>= realm
->get_newest_seq());
1630 realm
= in
->find_snaprealm();
1631 if (follows
== CEPH_NOSNAP
) {
1632 follows
= get_global_snaprealm()->get_newest_seq();
1633 ceph_assert(follows
>= realm
->get_newest_seq());
// Nothing to preserve if the update window already starts after 'follows'.
1638 if (follows
< in
->first
) {
1639 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *in
<< dendl
;
// No snapshot actually covers [first, follows]: just advance 'first'.
1643 if (!realm
->has_snaps_in_range(in
->first
, follows
)) {
1644 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *in
<< dendl
;
1645 in
->first
= follows
+ 1;
1649 in
->cow_old_inode(follows
, cow_head
);
// Non-multiversion case: CoW the dentry itself (and its primary inode).
1652 SnapRealm
*realm
= dn
->dir
->inode
->find_snaprealm();
1653 if (follows
== CEPH_NOSNAP
) {
1654 follows
= get_global_snaprealm()->get_newest_seq();
1655 ceph_assert(follows
>= realm
->get_newest_seq());
1659 if (follows
< dn
->first
) {
1660 dout(10) << "journal_cow_dentry follows " << follows
<< " < first on " << *dn
<< dendl
;
1664 // update dn.first before adding old dentry to cdir's map
1665 snapid_t oldfirst
= dn
->first
;
1666 dn
->first
= follows
+1;
1668 if (!realm
->has_snaps_in_range(oldfirst
, follows
)) {
1669 dout(10) << "journal_cow_dentry no snapshot follows " << follows
<< " on " << *dn
<< dendl
;
1671 in
->first
= follows
+1;
1675 dout(10) << " dn " << *dn
<< dendl
;
// Primary dentry: clone inode, link it under an old dentry, and journal it.
1677 CInode
*oldin
= cow_inode(in
, follows
);
1678 mut
->add_cow_inode(oldin
);
1680 *pcow_inode
= oldin
;
1681 CDentry
*olddn
= dn
->dir
->add_primary_dentry(dn
->get_name(), oldin
, oldfirst
, follows
);
1682 oldin
->inode
.version
= olddn
->pre_dirty();
1683 dout(10) << " olddn " << *olddn
<< dendl
;
1684 bool need_snapflush
= !oldin
->client_snap_caps
.empty();
1685 if (need_snapflush
) {
1686 mut
->ls
->open_files
.push_back(&oldin
->item_open_file
);
1687 mds
->locker
->mark_need_snapflush_inode(oldin
);
1689 metablob
->add_primary_dentry(olddn
, 0, true, false, false, need_snapflush
);
1690 mut
->add_cow_dentry(olddn
);
// Remote dentry: just leave a remote old dentry for the snapped range.
1692 ceph_assert(dnl
->is_remote());
1693 CDentry
*olddn
= dn
->dir
->add_remote_dentry(dn
->get_name(), dnl
->get_remote_ino(), dnl
->get_remote_d_type(),
1696 dout(10) << " olddn " << *olddn
<< dendl
;
1697 metablob
->add_remote_dentry(olddn
, true);
1698 mut
->add_cow_dentry(olddn
);
// Convenience wrapper: CoW an inode by CoW-ing its projected parent dentry.
// Delegates to journal_cow_dentry() with the raw MutationImpl pointer.
1704 void MDCache::journal_cow_inode(MutationRef
& mut
, EMetaBlob
*metablob
,
1705 CInode
*in
, snapid_t follows
,
1706 CInode
**pcow_inode
)
1708 dout(10) << "journal_cow_inode follows " << follows
<< " on " << *in
<< dendl
;
1709 CDentry
*dn
= in
->get_projected_parent_dn();
1710 journal_cow_dentry(mut
.get(), metablob
, dn
, follows
, pcow_inode
);
// Journal a dirty inode: base inodes go straight into the metablob as roots;
// otherwise CoW the parent dentry as needed, then journal the primary dentry
// (flagging a dirty backtrace/pool change when the projected layout's pool
// differs from the previous projection).
1713 void MDCache::journal_dirty_inode(MutationImpl
*mut
, EMetaBlob
*metablob
, CInode
*in
, snapid_t follows
)
1715 if (in
->is_base()) {
1716 metablob
->add_root(true, in
);
// For a snapped (non-head) inode, default 'follows' to just before its range.
1718 if (follows
== CEPH_NOSNAP
&& in
->last
!= CEPH_NOSNAP
)
1719 follows
= in
->first
- 1;
1720 CDentry
*dn
= in
->get_projected_parent_dn();
1721 if (!dn
->get_projected_linkage()->is_null()) // no need to cow a null dentry
1722 journal_cow_dentry(mut
, metablob
, dn
, follows
);
1723 if (in
->get_projected_inode()->is_backtrace_updated()) {
1724 bool dirty_pool
= in
->get_projected_inode()->layout
.pool_id
!=
1725 in
->get_previous_projected_inode()->layout
.pool_id
;
1726 metablob
->add_primary_dentry(dn
, in
, true, true, dirty_pool
);
1728 metablob
->add_primary_dentry(dn
, in
, true);
1735 // nested ---------------------------------------------------------------
// Push an inode's rstat delta into its parent dirfrag's (projected) fnode,
// covering the snap range [first, cur->last] plus any dirty old_inode rstats
// when mds_snap_rstat is enabled. Delegates the per-segment work to
// _project_rstat_inode_to_frag(). Some guard/else lines are missing from
// this extraction; only comments are added.
1737 void MDCache::project_rstat_inode_to_frag(CInode
*cur
, CDir
*parent
, snapid_t first
,
1738 int linkunlink
, SnapRealm
*prealm
)
1740 CDentry
*parentdn
= cur
->get_projected_parent_dn();
1741 CInode::mempool_inode
*curi
= cur
->get_projected_inode();
1743 if (cur
->first
> first
)
1746 dout(10) << "projected_rstat_inode_to_frag first " << first
<< " linkunlink " << linkunlink
1747 << " " << *cur
<< dendl
;
1748 dout(20) << " frag head is [" << parent
->first
<< ",head] " << dendl
;
1749 dout(20) << " inode update is [" << first
<< "," << cur
->last
<< "]" << dendl
;
1752 * FIXME. this incompletely propagates rstats to _old_ parents
1753 * (i.e. shortly after a directory rename). but we need full
1754 * blown hard link backpointers to make this work properly...
1756 snapid_t floor
= parentdn
->first
;
1757 dout(20) << " floor of " << floor
<< " from parent dn " << *parentdn
<< dendl
;
1760 prealm
= parent
->inode
->find_snaprealm();
1761 const set
<snapid_t
> snaps
= prealm
->get_snaps();
// Snapped inode with no snapshot inside its update window: nothing to do.
1763 if (cur
->last
!= CEPH_NOSNAP
) {
1764 ceph_assert(cur
->dirty_old_rstats
.empty());
1765 set
<snapid_t
>::const_iterator q
= snaps
.lower_bound(std::max(first
, floor
));
1766 if (q
== snaps
.end() || *q
> cur
->last
)
1770 if (cur
->last
>= floor
) {
// 'update' is set on lines missing here — presumably bool update = true,
// cleared in this frozen ambiguous-auth case; verify upstream.
1772 if (cur
->state_test(CInode::STATE_AMBIGUOUSAUTH
) && cur
->is_auth()) {
1773 // rename src inode is not projected in the slave rename prep case. so we should
1774 // avoid updating the inode.
1775 ceph_assert(linkunlink
< 0);
1776 ceph_assert(cur
->is_frozen_inode());
1779 _project_rstat_inode_to_frag(*curi
, std::max(first
, floor
), cur
->last
, parent
,
1780 linkunlink
, update
);
// Also project any dirty old_inode rstats overlapping a snapshot.
1783 if (g_conf()->mds_snap_rstat
) {
1784 for (const auto &p
: cur
->dirty_old_rstats
) {
1785 auto &old
= cur
->old_inodes
[p
];
1786 snapid_t ofirst
= std::max(old
.first
, floor
);
1787 auto it
= snaps
.lower_bound(ofirst
);
1788 if (it
== snaps
.end() || *it
> p
)
1791 _project_rstat_inode_to_frag(old
.inode
, ofirst
, p
, parent
, 0, false);
1794 cur
->dirty_old_rstats
.clear();
// Apply an inode's rstat delta (rstat - accounted_rstat, adjusted for
// link/unlink) to the parent frag's fnode, one snapid segment at a time,
// walking 'last' downward over [ofirst, last]. Splits head rstat and the
// sparse dirty_old_rstat map as needed so each iteration updates exactly one
// segment. Several lines (delta declaration, the add-into-prstat step, loop
// decrement) are missing from this extraction; only comments are added.
1798 void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode
& inode
, snapid_t ofirst
, snapid_t last
,
1799 CDir
*parent
, int linkunlink
, bool update_inode
)
1801 dout(10) << "_project_rstat_inode_to_frag [" << ofirst
<< "," << last
<< "]" << dendl
;
1802 dout(20) << " inode rstat " << inode
.rstat
<< dendl
;
1803 dout(20) << " inode accounted_rstat " << inode
.accounted_rstat
<< dendl
;
// delta = rstat - accounted (link); -accounted only (unlink); +rstat (link).
1805 if (linkunlink
== 0) {
1806 delta
.add(inode
.rstat
);
1807 delta
.sub(inode
.accounted_rstat
);
1808 } else if (linkunlink
< 0) {
1809 delta
.sub(inode
.accounted_rstat
);
1811 delta
.add(inode
.rstat
);
1813 dout(20) << " delta " << delta
<< dendl
;
1816 inode
.accounted_rstat
= inode
.rstat
;
1818 while (last
>= ofirst
) {
1820 * pick fnode version to update. at each iteration, we want to
1821 * pick a segment ending in 'last' to update. split as necessary
1822 * to make that work. then, adjust first up so that we only
1823 * update one segment at a time. then loop to cover the whole
1824 * [ofirst,last] interval.
1826 nest_info_t
*prstat
;
1828 fnode_t
*pf
= parent
->get_projected_fnode();
1829 if (last
== CEPH_NOSNAP
) {
// Head segment: update the live fnode rstat.
1830 if (g_conf()->mds_snap_rstat
)
1831 first
= std::max(ofirst
, parent
->first
);
1833 first
= parent
->first
;
1834 prstat
= &pf
->rstat
;
1835 dout(20) << " projecting to head [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1837 if (first
> parent
->first
&&
1838 !(pf
->rstat
== pf
->accounted_rstat
)) {
1839 dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
1840 << parent
->first
<< "," << (first
-1) << "] "
1841 << " " << *prstat
<< "/" << pf
->accounted_rstat
1843 parent
->dirty_old_rstat
[first
-1].first
= parent
->first
;
1844 parent
->dirty_old_rstat
[first
-1].rstat
= pf
->rstat
;
1845 parent
->dirty_old_rstat
[first
-1].accounted_rstat
= pf
->accounted_rstat
;
1847 parent
->first
= first
;
1848 } else if (!g_conf()->mds_snap_rstat
) {
1849 // drop snapshots' rstats
1851 } else if (last
>= parent
->first
) {
// Segment overlapping the frag head: fork it off into dirty_old_rstat.
1852 first
= parent
->first
;
1853 parent
->dirty_old_rstat
[last
].first
= first
;
1854 parent
->dirty_old_rstat
[last
].rstat
= pf
->rstat
;
1855 parent
->dirty_old_rstat
[last
].accounted_rstat
= pf
->accounted_rstat
;
1856 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
1857 dout(10) << " projecting to newly split dirty_old_fnode [" << first
<< "," << last
<< "] "
1858 << " " << *prstat
<< "/" << pf
->accounted_rstat
<< dendl
;
1860 // be careful, dirty_old_rstat is a _sparse_ map.
1861 // sorry, this is ugly.
1864 // find any intersection with last
1865 auto it
= parent
->dirty_old_rstat
.lower_bound(last
);
1866 if (it
== parent
->dirty_old_rstat
.end()) {
1867 dout(20) << " no dirty_old_rstat with last >= last " << last
<< dendl
;
1868 if (!parent
->dirty_old_rstat
.empty() && parent
->dirty_old_rstat
.rbegin()->first
>= first
) {
1869 dout(20) << " last dirty_old_rstat ends at " << parent
->dirty_old_rstat
.rbegin()->first
<< dendl
;
1870 first
= parent
->dirty_old_rstat
.rbegin()->first
+1;
1873 // *it last is >= last
1874 if (it
->second
.first
<= last
) {
1875 // *it intersects [first,last]
1876 if (it
->second
.first
< first
) {
1877 dout(10) << " splitting off left bit [" << it
->second
.first
<< "," << first
-1 << "]" << dendl
;
1878 parent
->dirty_old_rstat
[first
-1] = it
->second
;
1879 it
->second
.first
= first
;
1881 if (it
->second
.first
> first
)
1882 first
= it
->second
.first
;
1883 if (last
< it
->first
) {
1884 dout(10) << " splitting off right bit [" << last
+1 << "," << it
->first
<< "]" << dendl
;
1885 parent
->dirty_old_rstat
[last
] = it
->second
;
1886 it
->second
.first
= last
+1;
1889 // *it is to the _right_ of [first,last]
1890 it
= parent
->dirty_old_rstat
.lower_bound(first
);
1891 // new *it last is >= first
1892 if (it
->second
.first
<= last
&& // new *it isn't also to the right, and
1893 it
->first
>= first
) { // it intersects our first bit,
1894 dout(10) << " staying to the right of [" << it
->second
.first
<< "," << it
->first
<< "]..." << dendl
;
1895 first
= it
->first
+1;
1897 dout(10) << " projecting to new dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1900 dout(20) << " projecting to dirty_old_rstat [" << first
<< "," << last
<< "]" << dendl
;
1901 parent
->dirty_old_rstat
[last
].first
= first
;
1902 prstat
= &parent
->dirty_old_rstat
[last
].rstat
;
// Apply delta to the chosen segment (the add itself is on a missing line).
1906 dout(20) << " project to [" << first
<< "," << last
<< "] " << *prstat
<< dendl
;
1907 ceph_assert(last
>= first
);
1910 inode
.accounted_rstat
= inode
.rstat
;
1911 dout(20) << " result [" << first
<< "," << last
<< "] " << *prstat
<< " " << *parent
<< dendl
;
// Propagate a dirfrag's rstat delta (rstat - accounted_rstat) up into the
// parent inode 'pin', one snapid segment at a time over [ofirst, last].
// The head segment updates the projected inode; older segments are written
// into pin->old_inodes (splitting entries as needed) and marked in
// dirty_old_rstats. Loop-decrement/early-break lines are missing from this
// extraction; only comments are added.
1917 void MDCache::project_rstat_frag_to_inode(nest_info_t
& rstat
, nest_info_t
& accounted_rstat
,
1918 snapid_t ofirst
, snapid_t last
,
1919 CInode
*pin
, bool cow_head
)
1921 dout(10) << "project_rstat_frag_to_inode [" << ofirst
<< "," << last
<< "]" << dendl
;
1922 dout(20) << " frag rstat " << rstat
<< dendl
;
1923 dout(20) << " frag accounted_rstat " << accounted_rstat
<< dendl
;
1924 nest_info_t delta
= rstat
;
1925 delta
.sub(accounted_rstat
);
1926 dout(20) << " delta " << delta
<< dendl
;
1928 while (last
>= ofirst
) {
1929 CInode::mempool_inode
*pi
;
1931 if (last
== pin
->last
) {
// Head segment: update the projected inode; CoW if the segment does not
// reach back to pin->first.
1932 pi
= pin
->get_projected_inode();
1933 first
= std::max(ofirst
, pin
->first
);
1934 if (first
> pin
->first
) {
1935 auto &old
= pin
->cow_old_inode(first
-1, cow_head
);
1936 dout(20) << " cloned old_inode rstat is " << old
.inode
.rstat
<< dendl
;
1939 if (last
>= pin
->first
) {
1941 pin
->cow_old_inode(last
, cow_head
);
1943 // our life is easier here because old_inodes is not sparse
1944 // (although it may not begin at snapid 1)
1945 auto it
= pin
->old_inodes
.lower_bound(last
);
1946 if (it
== pin
->old_inodes
.end()) {
1947 dout(10) << " no old_inode <= " << last
<< ", done." << dendl
;
1950 first
= it
->second
.first
;
1952 dout(10) << " oldest old_inode is [" << first
<< "," << it
->first
<< "], done." << dendl
;
1953 //assert(p == pin->old_inodes.begin());
// Split the old_inode entry so [first, last] stands alone.
1956 if (it
->first
> last
) {
1957 dout(10) << " splitting right old_inode [" << first
<< "," << it
->first
<< "] to ["
1958 << (last
+1) << "," << it
->first
<< "]" << dendl
;
1959 pin
->old_inodes
[last
] = it
->second
;
1960 it
->second
.first
= last
+1;
1961 pin
->dirty_old_rstats
.insert(it
->first
);
1964 if (first
< ofirst
) {
1965 dout(10) << " splitting left old_inode [" << first
<< "," << last
<< "] to ["
1966 << first
<< "," << ofirst
-1 << "]" << dendl
;
1967 pin
->old_inodes
[ofirst
-1] = pin
->old_inodes
[last
];
1968 pin
->dirty_old_rstats
.insert(ofirst
-1);
1969 pin
->old_inodes
[last
].first
= first
= ofirst
;
1971 pi
= &pin
->old_inodes
[last
].inode
;
1972 pin
->dirty_old_rstats
.insert(last
);
1974 dout(20) << " projecting to [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
1975 pi
->rstat
.add(delta
);
1976 dout(20) << " result [" << first
<< "," << last
<< "] " << pi
->rstat
<< dendl
;
// Send quota/rstat updates (MClientQuota) to clients holding caps on 'in',
// skipping no-quota caps, the excluded client, and clients whose cached
// rstat has not drifted materially relative to the quota limits; then ask
// replicas to gather caps (MGatherCaps). Early-return/continue lines are
// missing from this extraction; only comments are added.
1982 void MDCache::broadcast_quota_to_client(CInode
*in
, client_t exclude_ct
, bool quota_change
)
1984 if (!(mds
->is_active() || mds
->is_stopping()))
1987 if (!in
->is_auth() || in
->is_frozen())
1990 auto i
= in
->get_projected_inode();
1992 if (!i
->quota
.is_enable() &&
1996 // create snaprealm for quota inode (quota was set before mimic)
1997 if (!in
->get_projected_srnode())
1998 mds
->server
->create_quota_realm(in
);
2000 for (auto &p
: in
->client_caps
) {
2001 Capability
*cap
= &p
.second
;
2002 if (cap
->is_noquota())
2005 if (exclude_ct
>= 0 && exclude_ct
!= p
.first
)
// Skip if the client's last-seen rstat is unchanged.
2008 if (cap
->last_rbytes
== i
->rstat
.rbytes
&&
2009 cap
->last_rsize
== i
->rstat
.rsize())
// File-count quota: notify when at/over the limit, or when the drift since
// the last update exceeds 1/16 of the remaining headroom.
2012 if (i
->quota
.max_files
> 0) {
2013 if (i
->rstat
.rsize() >= i
->quota
.max_files
)
2016 if ((abs(cap
->last_rsize
- i
->quota
.max_files
) >> 4) <
2017 abs(cap
->last_rsize
- i
->rstat
.rsize()))
// Byte quota: notify within 1/8 of the limit, or on comparable drift.
2021 if (i
->quota
.max_bytes
> 0) {
2022 if (i
->rstat
.rbytes
> i
->quota
.max_bytes
- (i
->quota
.max_bytes
>> 3))
2025 if ((abs(cap
->last_rbytes
- i
->quota
.max_bytes
) >> 4) <
2026 abs(cap
->last_rbytes
- i
->rstat
.rbytes
))
2033 cap
->last_rsize
= i
->rstat
.rsize();
2034 cap
->last_rbytes
= i
->rstat
.rbytes
;
2036 auto msg
= MClientQuota::create();
2037 msg
->ino
= in
->ino();
2038 msg
->rstat
= i
->rstat
;
2039 msg
->quota
= i
->quota
;
2040 mds
->send_message_client_counted(msg
, cap
->get_session());
// Ask replicas to gather caps so remote clients learn the new quota too.
2042 for (const auto &it
: in
->get_replicas()) {
2043 auto msg
= MGatherCaps::create();
2044 msg
->ino
= in
->ino();
2045 mds
->send_message_mds(msg
, it
.first
);
2050 * NOTE: we _have_ to delay the scatter if we are called during a
2051 * rejoin, because we can't twiddle locks between when the
2052 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2053 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2054 * (no requests), and a survivor acks immediately. _except_ that
2055 * during rejoin_(weak|strong) processing, we may complete a lock
2056 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2057 * scatterlock state in that case or the lock states will get out of
2058 * sync between the auth and replica.
2060 * the simple solution is to never do the scatter here. instead, put
2061 * the scatterlock on a list if it isn't already wrlockable. this is
2062 * probably the best plan anyway, since we avoid too many
2063 * scatters/locks under normal usage.
2066 * some notes on dirlock/nestlock scatterlock semantics:
2068 * the fragstat (dirlock) will never be updated without
2069 * dirlock+nestlock wrlock held by the caller.
2071 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2072 * data is pushed up the tree. this could be changed with some
2073 * restructuring here, but in its current form we ensure that the
2074 * fragstat+rstat _always_ reflect an accurate summation over the dir
2075 * frag, which is nice. and, we only need to track frags that need to
2076 * be nudged (and not inodes with pending rstat changes that need to
2077 * be pushed into the frag). a consequence of this is that the
2078 * accounted_rstat on scatterlock sync may not match our current
2079 * rstat. this is normal and expected.
2081 void MDCache::predirty_journal_parents(MutationRef mut
, EMetaBlob
*blob
,
2082 CInode
*in
, CDir
*parent
,
2083 int flags
, int linkunlink
,
2086 bool primary_dn
= flags
& PREDIRTY_PRIMARY
;
2087 bool do_parent_mtime
= flags
& PREDIRTY_DIR
;
2088 bool shallow
= flags
& PREDIRTY_SHALLOW
;
2090 ceph_assert(mds
->mdlog
->entry_is_open());
2092 // make sure stamp is set
2093 if (mut
->get_mds_stamp() == utime_t())
2094 mut
->set_mds_stamp(ceph_clock_now());
2099 dout(10) << "predirty_journal_parents"
2100 << (do_parent_mtime
? " do_parent_mtime":"")
2101 << " linkunlink=" << linkunlink
2102 << (primary_dn
? " primary_dn":" remote_dn")
2103 << (shallow
? " SHALLOW":"")
2104 << " follows " << cfollows
2105 << " " << *in
<< dendl
;
2108 ceph_assert(primary_dn
);
2109 parent
= in
->get_projected_parent_dn()->get_dir();
2112 if (flags
== 0 && linkunlink
== 0) {
2113 dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl
;
2114 blob
->add_dir_context(parent
);
2118 // build list of inodes to wrlock, dirty, and update
2121 CDentry
*parentdn
= NULL
;
2124 //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
2125 ceph_assert(parent
->is_auth());
2127 // opportunistically adjust parent dirfrag
2128 CInode
*pin
= parent
->get_inode();
2131 mut
->auth_pin(parent
);
2132 mut
->add_projected_fnode(parent
);
2134 fnode_t
*pf
= parent
->project_fnode();
2135 pf
->version
= parent
->pre_dirty();
2137 if (do_parent_mtime
|| linkunlink
) {
2138 ceph_assert(mut
->is_wrlocked(&pin
->filelock
));
2139 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
));
2140 ceph_assert(cfollows
== CEPH_NOSNAP
);
2142 // update stale fragstat/rstat?
2143 parent
->resync_accounted_fragstat();
2144 parent
->resync_accounted_rstat();
2146 if (do_parent_mtime
) {
2147 pf
->fragstat
.mtime
= mut
->get_op_stamp();
2148 pf
->fragstat
.change_attr
++;
2149 dout(10) << "predirty_journal_parents bumping change_attr to " << pf
->fragstat
.change_attr
<< " on " << parent
<< dendl
;
2150 if (pf
->fragstat
.mtime
> pf
->rstat
.rctime
) {
2151 dout(10) << "predirty_journal_parents updating mtime on " << *parent
<< dendl
;
2152 pf
->rstat
.rctime
= pf
->fragstat
.mtime
;
2154 dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent
<< dendl
;
2158 dout(10) << "predirty_journal_parents updating size on " << *parent
<< dendl
;
2160 pf
->fragstat
.nsubdirs
+= linkunlink
;
2161 //pf->rstat.rsubdirs += linkunlink;
2163 pf
->fragstat
.nfiles
+= linkunlink
;
2164 //pf->rstat.rfiles += linkunlink;
2171 // don't update parent this pass
2172 } else if (!linkunlink
&& !(pin
->nestlock
.can_wrlock(-1) &&
2173 pin
->versionlock
.can_wrlock())) {
2174 dout(20) << " unwritable parent nestlock " << pin
->nestlock
2175 << ", marking dirty rstat on " << *cur
<< dendl
;
2176 cur
->mark_dirty_rstat();
2178 // if we don't hold a wrlock reference on this nestlock, take one,
2179 // because we are about to write into the dirfrag fnode and that needs
2180 // to commit before the lock can cycle.
2182 ceph_assert(pin
->nestlock
.get_num_wrlocks() || mut
->is_slave());
2185 if (!mut
->is_wrlocked(&pin
->nestlock
)) {
2186 dout(10) << " taking wrlock on " << pin
->nestlock
<< " on " << *pin
<< dendl
;
2187 mds
->locker
->wrlock_force(&pin
->nestlock
, mut
);
2190 // now we can project the inode rstat diff the dirfrag
2191 SnapRealm
*prealm
= pin
->find_snaprealm();
2193 snapid_t follows
= cfollows
;
2194 if (follows
== CEPH_NOSNAP
)
2195 follows
= prealm
->get_newest_seq();
2197 snapid_t first
= follows
+1;
2199 // first, if the frag is stale, bring it back in sync.
2200 parent
->resync_accounted_rstat();
2202 // now push inode rstats into frag
2203 project_rstat_inode_to_frag(cur
, parent
, first
, linkunlink
, prealm
);
2204 cur
->clear_dirty_rstat();
2208 if (!pin
->is_auth() || (!mut
->is_auth_pinned(pin
) && !pin
->can_auth_pin())) {
2209 dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin
<< dendl
;
2213 // delay propagating until later?
2214 if (!stop
&& !first
&&
2215 g_conf()->mds_dirstat_min_interval
> 0) {
2216 double since_last_prop
= mut
->get_mds_stamp() - pin
->last_dirstat_prop
;
2217 if (since_last_prop
< g_conf()->mds_dirstat_min_interval
) {
2218 dout(10) << "predirty_journal_parents last prop " << since_last_prop
2219 << " < " << g_conf()->mds_dirstat_min_interval
2220 << ", stopping" << dendl
;
2223 dout(10) << "predirty_journal_parents last prop " << since_last_prop
<< " ago, continuing" << dendl
;
2227 // can cast only because i'm passing nowait=true in the sole user
2228 MDRequestRef mdmut
= static_cast<MDRequestImpl
*>(mut
.get());
2230 !mut
->is_wrlocked(&pin
->nestlock
) &&
2231 (!pin
->versionlock
.can_wrlock() || // make sure we can take versionlock, too
2233 !mds
->locker
->wrlock_start(&pin
->nestlock
, mdmut
, true)
2234 )) { // ** do not initiate.. see above comment **
2235 dout(10) << "predirty_journal_parents can't wrlock one of " << pin
->versionlock
<< " or " << pin
->nestlock
2236 << " on " << *pin
<< dendl
;
2240 dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin
<< dendl
;
2241 mds
->locker
->mark_updated_scatterlock(&pin
->nestlock
);
2242 mut
->ls
->dirty_dirfrag_nest
.push_back(&pin
->item_dirty_dirfrag_nest
);
2243 mut
->add_updated_lock(&pin
->nestlock
);
2244 if (do_parent_mtime
|| linkunlink
) {
2245 mds
->locker
->mark_updated_scatterlock(&pin
->filelock
);
2246 mut
->ls
->dirty_dirfrag_dir
.push_back(&pin
->item_dirty_dirfrag_dir
);
2247 mut
->add_updated_lock(&pin
->filelock
);
2251 if (!mut
->is_wrlocked(&pin
->versionlock
))
2252 mds
->locker
->local_wrlock_grab(&pin
->versionlock
, mut
);
2254 ceph_assert(mut
->is_wrlocked(&pin
->nestlock
) || mut
->is_slave());
2256 pin
->last_dirstat_prop
= mut
->get_mds_stamp();
2260 mut
->add_projected_inode(pin
);
2261 lsi
.push_front(pin
);
2263 pin
->pre_cow_old_inode(); // avoid cow mayhem!
2265 auto &pi
= pin
->project_inode();
2266 pi
.inode
.version
= pin
->pre_dirty();
2269 if (do_parent_mtime
|| linkunlink
) {
2270 dout(20) << "predirty_journal_parents add_delta " << pf
->fragstat
<< dendl
;
2271 dout(20) << "predirty_journal_parents - " << pf
->accounted_fragstat
<< dendl
;
2272 bool touched_mtime
= false, touched_chattr
= false;
2273 pi
.inode
.dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2274 pf
->accounted_fragstat
= pf
->fragstat
;
2276 pi
.inode
.mtime
= pi
.inode
.ctime
= pi
.inode
.dirstat
.mtime
;
2278 pi
.inode
.change_attr
= pi
.inode
.dirstat
.change_attr
;
2279 dout(20) << "predirty_journal_parents gives " << pi
.inode
.dirstat
<< " on " << *pin
<< dendl
;
2281 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2282 if (pi
.inode
.dirstat
.size() < 0)
2283 ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter
);
2284 if (pi
.inode
.dirstat
.size() != pf
->fragstat
.size()) {
2285 mds
->clog
->error() << "unmatched fragstat size on single dirfrag "
2286 << parent
->dirfrag() << ", inode has " << pi
.inode
.dirstat
2287 << ", dirfrag has " << pf
->fragstat
;
2289 // trust the dirfrag for now
2290 pi
.inode
.dirstat
= pf
->fragstat
;
2292 ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter
);
2298 * the rule here is to follow the _oldest_ parent with dirty rstat
2299 * data. if we don't propagate all data, we add ourselves to the
2300 * nudge list. that way all rstat data will (eventually) get
2301 * pushed up the tree.
2303 * actually, no. for now, silently drop rstats for old parents. we need
2304 * hard link backpointers to do the above properly.
2310 parentdn
= pin
->get_projected_parent_dn();
2311 ceph_assert(parentdn
);
2314 dout(10) << "predirty_journal_parents frag->inode on " << *parent
<< dendl
;
2316 // first, if the frag is stale, bring it back in sync.
2317 parent
->resync_accounted_rstat();
2319 if (g_conf()->mds_snap_rstat
) {
2320 for (auto &p
: parent
->dirty_old_rstat
) {
2321 project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
, p
.second
.first
,
2322 p
.first
, pin
, true);
2325 parent
->dirty_old_rstat
.clear();
2326 project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
, parent
->first
, CEPH_NOSNAP
, pin
, true);//false);
2328 pf
->accounted_rstat
= pf
->rstat
;
2330 if (parent
->get_frag() == frag_t()) { // i.e., we are the only frag
2331 if (pi
.inode
.rstat
.rbytes
!= pf
->rstat
.rbytes
) {
2332 mds
->clog
->error() << "unmatched rstat rbytes on single dirfrag "
2333 << parent
->dirfrag() << ", inode has " << pi
.inode
.rstat
2334 << ", dirfrag has " << pf
->rstat
;
2336 // trust the dirfrag for now
2337 pi
.inode
.rstat
= pf
->rstat
;
2339 ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter
);
2343 parent
->check_rstats();
2344 broadcast_quota_to_client(pin
);
2347 parent
= parentdn
->get_dir();
2349 do_parent_mtime
= false;
2354 // now, stick it in the blob
2355 ceph_assert(parent
);
2356 ceph_assert(parent
->is_auth());
2357 blob
->add_dir_context(parent
);
2358 blob
->add_dir(parent
, true);
2359 for (list
<CInode
*>::iterator p
= lsi
.begin();
2363 journal_dirty_inode(mut
.get(), blob
, cur
);
2372 // ===================================
2377 * some handlers for master requests with slaves. we need to make
2378 * sure slaves journal commits before we forget we mastered them and
2379 * remove them from the uncommitted_masters map (used during recovery
2380 * to commit|abort slaves).
2382 struct C_MDC_CommittedMaster
: public MDCacheLogContext
{
2384 C_MDC_CommittedMaster(MDCache
*s
, metareqid_t r
) : MDCacheLogContext(s
), reqid(r
) {}
2385 void finish(int r
) override
{
2386 mdcache
->_logged_master_commit(reqid
);
2390 void MDCache::log_master_commit(metareqid_t reqid
)
2392 dout(10) << "log_master_commit " << reqid
<< dendl
;
2393 uncommitted_masters
[reqid
].committing
= true;
2394 mds
->mdlog
->start_submit_entry(new ECommitted(reqid
),
2395 new C_MDC_CommittedMaster(this, reqid
));
2398 void MDCache::_logged_master_commit(metareqid_t reqid
)
2400 dout(10) << "_logged_master_commit " << reqid
<< dendl
;
2401 ceph_assert(uncommitted_masters
.count(reqid
));
2402 uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2403 mds
->queue_waiters(uncommitted_masters
[reqid
].waiters
);
2404 uncommitted_masters
.erase(reqid
);
2409 void MDCache::committed_master_slave(metareqid_t r
, mds_rank_t from
)
2411 dout(10) << "committed_master_slave mds." << from
<< " on " << r
<< dendl
;
2412 ceph_assert(uncommitted_masters
.count(r
));
2413 uncommitted_masters
[r
].slaves
.erase(from
);
2414 if (!uncommitted_masters
[r
].recovering
&& uncommitted_masters
[r
].slaves
.empty())
2415 log_master_commit(r
);
2418 void MDCache::logged_master_update(metareqid_t reqid
)
2420 dout(10) << "logged_master_update " << reqid
<< dendl
;
2421 ceph_assert(uncommitted_masters
.count(reqid
));
2422 uncommitted_masters
[reqid
].safe
= true;
2423 auto p
= pending_masters
.find(reqid
);
2424 if (p
!= pending_masters
.end()) {
2425 pending_masters
.erase(p
);
2426 if (pending_masters
.empty())
2427 process_delayed_resolve();
2432 * Master may crash after receiving all slaves' commit acks, but before journalling
2433 * the final commit. Slaves may crash after journalling the slave commit, but before
2434 * sending commit ack to the master. Commit masters with no uncommitted slave when
2437 void MDCache::finish_committed_masters()
2439 for (map
<metareqid_t
, umaster
>::iterator p
= uncommitted_masters
.begin();
2440 p
!= uncommitted_masters
.end();
2442 p
->second
.recovering
= false;
2443 if (!p
->second
.committing
&& p
->second
.slaves
.empty()) {
2444 dout(10) << "finish_committed_masters " << p
->first
<< dendl
;
2445 log_master_commit(p
->first
);
2451 * at end of resolve... we must journal a commit|abort for all slave
2452 * updates, before moving on.
2454 * this is so that the master can safely journal ECommitted on ops it
2455 * masters when it reaches up:active (all other recovering nodes must
2456 * complete resolve before that happens).
2458 struct C_MDC_SlaveCommit
: public MDCacheLogContext
{
2461 C_MDC_SlaveCommit(MDCache
*c
, int f
, metareqid_t r
) : MDCacheLogContext(c
), from(f
), reqid(r
) {}
2462 void finish(int r
) override
{
2463 mdcache
->_logged_slave_commit(from
, reqid
);
2467 void MDCache::_logged_slave_commit(mds_rank_t from
, metareqid_t reqid
)
2469 dout(10) << "_logged_slave_commit from mds." << from
<< " " << reqid
<< dendl
;
2472 auto req
= MMDSSlaveRequest::create(reqid
, 0, MMDSSlaveRequest::OP_COMMITTED
);
2473 mds
->send_message_mds(req
, from
);
2481 // ====================================================================
2482 // import map, recovery
2484 void MDCache::_move_subtree_map_bound(dirfrag_t df
, dirfrag_t oldparent
, dirfrag_t newparent
,
2485 map
<dirfrag_t
,vector
<dirfrag_t
> >& subtrees
)
2487 if (subtrees
.count(oldparent
)) {
2488 vector
<dirfrag_t
>& v
= subtrees
[oldparent
];
2489 dout(10) << " removing " << df
<< " from " << oldparent
<< " bounds " << v
<< dendl
;
2490 for (vector
<dirfrag_t
>::iterator it
= v
.begin(); it
!= v
.end(); ++it
)
2496 if (subtrees
.count(newparent
)) {
2497 vector
<dirfrag_t
>& v
= subtrees
[newparent
];
2498 dout(10) << " adding " << df
<< " to " << newparent
<< " bounds " << v
<< dendl
;
2503 ESubtreeMap
*MDCache::create_subtree_map()
2505 dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
2506 << num_subtrees_fullauth() << " fullauth"
2511 ESubtreeMap
*le
= new ESubtreeMap();
2512 mds
->mdlog
->_start_entry(le
);
2514 map
<dirfrag_t
, CDir
*> dirs_to_add
;
2517 CDir
* mydir
= myin
->get_dirfrag(frag_t());
2518 dirs_to_add
[mydir
->dirfrag()] = mydir
;
2521 // include all auth subtrees, and their bounds.
2522 // and a spanning tree to tie it to the root.
2523 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
2524 p
!= subtrees
.end();
2526 CDir
*dir
= p
->first
;
2528 // journal subtree as "ours" if we are
2531 // me, !me (may be importing and ambiguous!)
2535 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2538 if (migrator
->is_ambiguous_import(dir
->dirfrag()) ||
2539 my_ambiguous_imports
.count(dir
->dirfrag())) {
2540 dout(15) << " ambig subtree " << *dir
<< dendl
;
2541 le
->ambiguous_subtrees
.insert(dir
->dirfrag());
2543 dout(15) << " subtree " << *dir
<< dendl
;
2546 dirs_to_add
[dir
->dirfrag()] = dir
;
2547 le
->subtrees
[dir
->dirfrag()].clear();
2551 for (set
<CDir
*>::iterator q
= p
->second
.begin();
2552 q
!= p
->second
.end();
2555 dout(15) << " subtree bound " << *bound
<< dendl
;
2556 dirs_to_add
[bound
->dirfrag()] = bound
;
2557 le
->subtrees
[dir
->dirfrag()].push_back(bound
->dirfrag());
2561 // apply projected renames
2562 for (map
<CInode
*,list
<pair
<CDir
*,CDir
*> > >::iterator p
= projected_subtree_renames
.begin();
2563 p
!= projected_subtree_renames
.end();
2565 for (list
<pair
<CDir
*,CDir
*> >::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
2566 CInode
*diri
= p
->first
;
2567 CDir
*olddir
= q
->first
;
2568 CDir
*newdir
= q
->second
;
2569 dout(10) << " adjusting for projected rename of " << *diri
<< " to " << *newdir
<< dendl
;
2572 diri
->get_dirfrags(dfls
);
2573 for (list
<CDir
*>::iterator p
= dfls
.begin(); p
!= dfls
.end(); ++p
) {
2575 dout(10) << "dirfrag " << dir
->dirfrag() << " " << *dir
<< dendl
;
2576 CDir
*oldparent
= get_projected_subtree_root(olddir
);
2577 dout(10) << " old parent " << oldparent
->dirfrag() << " " << *oldparent
<< dendl
;
2578 CDir
*newparent
= get_projected_subtree_root(newdir
);
2579 dout(10) << " new parent " << newparent
->dirfrag() << " " << *newparent
<< dendl
;
2581 if (oldparent
== newparent
) {
2582 dout(10) << "parent unchanged for " << dir
->dirfrag() << " at "
2583 << oldparent
->dirfrag() << dendl
;
2587 if (dir
->is_subtree_root()) {
2588 if (le
->subtrees
.count(newparent
->dirfrag()) &&
2589 oldparent
->get_dir_auth() != newparent
->get_dir_auth())
2590 dirs_to_add
[dir
->dirfrag()] = dir
;
2591 // children are fine. change parent.
2592 _move_subtree_map_bound(dir
->dirfrag(), oldparent
->dirfrag(), newparent
->dirfrag(),
2597 if (oldparent
->get_dir_auth() != newparent
->get_dir_auth()) {
2598 dout(10) << " creating subtree for " << dir
->dirfrag() << dendl
;
2599 // if oldparent is auth, subtree is mine; include it.
2600 if (le
->subtrees
.count(oldparent
->dirfrag())) {
2601 dirs_to_add
[dir
->dirfrag()] = dir
;
2602 le
->subtrees
[dir
->dirfrag()].clear();
2604 // if newparent is auth, subtree is a new bound
2605 if (le
->subtrees
.count(newparent
->dirfrag())) {
2606 dirs_to_add
[dir
->dirfrag()] = dir
;
2607 le
->subtrees
[newparent
->dirfrag()].push_back(dir
->dirfrag()); // newparent is auth; new bound
2612 // see if any old bounds move to the new parent.
2613 for (set
<CDir
*>::iterator p
= subtrees
[oldparent
].begin();
2614 p
!= subtrees
[oldparent
].end();
2617 if (dir
->contains(bound
->get_parent_dir()))
2618 _move_subtree_map_bound(bound
->dirfrag(), oldparent
->dirfrag(), newparent
->dirfrag(),
2626 // simplify the journaled map. our in memory map may have more
2627 // subtrees than needed due to migrations that are just getting
2628 // started or just completing. but on replay, the "live" map will
2629 // be simple and we can do a straight comparison.
2630 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= le
->subtrees
.begin(); p
!= le
->subtrees
.end(); ++p
) {
2631 if (le
->ambiguous_subtrees
.count(p
->first
))
2634 while (i
< p
->second
.size()) {
2635 dirfrag_t b
= p
->second
[i
];
2636 if (le
->subtrees
.count(b
) &&
2637 le
->ambiguous_subtrees
.count(b
) == 0) {
2638 vector
<dirfrag_t
>& bb
= le
->subtrees
[b
];
2639 dout(10) << "simplify: " << p
->first
<< " swallowing " << b
<< " with bounds " << bb
<< dendl
;
2640 for (vector
<dirfrag_t
>::iterator r
= bb
.begin(); r
!= bb
.end(); ++r
)
2641 p
->second
.push_back(*r
);
2642 dirs_to_add
.erase(b
);
2643 le
->subtrees
.erase(b
);
2644 p
->second
.erase(p
->second
.begin() + i
);
2651 for (auto &p
: dirs_to_add
) {
2652 CDir
*dir
= p
.second
;
2653 le
->metablob
.add_dir_context(dir
, EMetaBlob::TO_ROOT
);
2654 le
->metablob
.add_dir(dir
, false);
2657 dout(15) << " subtrees " << le
->subtrees
<< dendl
;
2658 dout(15) << " ambiguous_subtrees " << le
->ambiguous_subtrees
<< dendl
;
2660 //le->metablob.print(cout);
2661 le
->expire_pos
= mds
->mdlog
->journaler
->get_expire_pos();
2665 void MDCache::dump_resolve_status(Formatter
*f
) const
2667 f
->open_object_section("resolve_status");
2668 f
->dump_stream("resolve_gather") << resolve_gather
;
2669 f
->dump_stream("resolve_ack_gather") << resolve_gather
;
2673 void MDCache::resolve_start(MDSContext
*resolve_done_
)
2675 dout(10) << "resolve_start" << dendl
;
2676 ceph_assert(!resolve_done
);
2677 resolve_done
.reset(resolve_done_
);
2679 if (mds
->mdsmap
->get_root() != mds
->get_nodeid()) {
2680 // if we don't have the root dir, adjust it to UNKNOWN. during
2681 // resolve we want mds0 to explicit claim the portion of it that
2682 // it owns, so that anything beyond its bounds get left as
2684 CDir
*rootdir
= root
->get_dirfrag(frag_t());
2686 adjust_subtree_auth(rootdir
, CDIR_AUTH_UNKNOWN
);
2688 resolve_gather
= recovery_set
;
2690 resolve_snapclient_commits
= mds
->snapclient
->get_journaled_tids();
2693 void MDCache::send_resolves()
2695 send_slave_resolves();
2697 if (!resolve_done
) {
2698 // I'm survivor: refresh snap cache
2699 mds
->snapclient
->sync(
2700 new MDSInternalContextWrapper(mds
,
2701 new FunctionContext([this](int r
) {
2702 maybe_finish_slave_resolve();
2706 dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl
;
2709 if (!resolve_ack_gather
.empty()) {
2710 dout(10) << "send_resolves still waiting for resolve ack from ("
2711 << resolve_ack_gather
<< ")" << dendl
;
2714 if (!resolve_need_rollback
.empty()) {
2715 dout(10) << "send_resolves still waiting for rollback to commit on ("
2716 << resolve_need_rollback
<< ")" << dendl
;
2720 send_subtree_resolves();
2723 void MDCache::send_slave_resolves()
2725 dout(10) << "send_slave_resolves" << dendl
;
2727 map
<mds_rank_t
, MMDSResolve::ref
> resolves
;
2729 if (mds
->is_resolve()) {
2730 for (map
<mds_rank_t
, map
<metareqid_t
, MDSlaveUpdate
*> >::iterator p
= uncommitted_slave_updates
.begin();
2731 p
!= uncommitted_slave_updates
.end();
2733 resolves
[p
->first
] = MMDSResolve::create();
2734 for (map
<metareqid_t
, MDSlaveUpdate
*>::iterator q
= p
->second
.begin();
2735 q
!= p
->second
.end();
2737 dout(10) << " including uncommitted " << q
->first
<< dendl
;
2738 resolves
[p
->first
]->add_slave_request(q
->first
, false);
2742 set
<mds_rank_t
> resolve_set
;
2743 mds
->mdsmap
->get_mds_set(resolve_set
, MDSMap::STATE_RESOLVE
);
2744 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2745 p
!= active_requests
.end();
2747 MDRequestRef
& mdr
= p
->second
;
2748 if (!mdr
->is_slave())
2750 if (!mdr
->slave_did_prepare() && !mdr
->committing
) {
2753 mds_rank_t master
= mdr
->slave_to_mds
;
2754 if (resolve_set
.count(master
) || is_ambiguous_slave_update(p
->first
, master
)) {
2755 dout(10) << " including uncommitted " << *mdr
<< dendl
;
2756 if (!resolves
.count(master
))
2757 resolves
[master
] = MMDSResolve::create();
2758 if (!mdr
->committing
&&
2759 mdr
->has_more() && mdr
->more()->is_inode_exporter
) {
2760 // re-send cap exports
2761 CInode
*in
= mdr
->more()->rename_inode
;
2762 map
<client_t
, Capability::Export
> cap_map
;
2763 in
->export_client_caps(cap_map
);
2765 encode(in
->ino(), bl
);
2766 encode(cap_map
, bl
);
2767 resolves
[master
]->add_slave_request(p
->first
, bl
);
2769 resolves
[master
]->add_slave_request(p
->first
, mdr
->committing
);
2775 for (auto &p
: resolves
) {
2776 dout(10) << "sending slave resolve to mds." << p
.first
<< dendl
;
2777 mds
->send_message_mds(p
.second
, p
.first
);
2778 resolve_ack_gather
.insert(p
.first
);
2782 void MDCache::send_subtree_resolves()
2784 dout(10) << "send_subtree_resolves" << dendl
;
2786 if (migrator
->is_exporting() || migrator
->is_importing()) {
2787 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl
;
2788 migrator
->show_importing();
2789 migrator
->show_exporting();
2790 resolves_pending
= true;
2794 map
<mds_rank_t
, MMDSResolve::ref
> resolves
;
2795 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
2796 p
!= recovery_set
.end();
2798 if (*p
== mds
->get_nodeid())
2800 if (mds
->is_resolve() || mds
->mdsmap
->is_resolve(*p
))
2801 resolves
[*p
] = MMDSResolve::create();
2804 map
<dirfrag_t
, vector
<dirfrag_t
> > my_subtrees
;
2805 map
<dirfrag_t
, vector
<dirfrag_t
> > my_ambig_imports
;
2808 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
2809 p
!= subtrees
.end();
2811 CDir
*dir
= p
->first
;
2813 // only our subtrees
2814 if (dir
->authority().first
!= mds
->get_nodeid())
2817 if (mds
->is_resolve() && my_ambiguous_imports
.count(dir
->dirfrag()))
2818 continue; // we'll add it below
2820 if (migrator
->is_ambiguous_import(dir
->dirfrag())) {
2821 // ambiguous (mid-import)
2823 get_subtree_bounds(dir
, bounds
);
2824 vector
<dirfrag_t
> dfls
;
2825 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
)
2826 dfls
.push_back((*q
)->dirfrag());
2828 my_ambig_imports
[dir
->dirfrag()] = dfls
;
2829 dout(10) << " ambig " << dir
->dirfrag() << " " << dfls
<< dendl
;
2832 for (auto &q
: resolves
) {
2833 resolves
[q
.first
]->add_subtree(dir
->dirfrag());
2836 vector
<dirfrag_t
> dfls
;
2837 for (set
<CDir
*>::iterator q
= subtrees
[dir
].begin();
2838 q
!= subtrees
[dir
].end();
2841 dfls
.push_back(bound
->dirfrag());
2844 my_subtrees
[dir
->dirfrag()] = dfls
;
2845 dout(10) << " claim " << dir
->dirfrag() << " " << dfls
<< dendl
;
2850 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
2851 p
!= my_ambiguous_imports
.end();
2853 my_ambig_imports
[p
->first
] = p
->second
;
2854 dout(10) << " ambig " << p
->first
<< " " << p
->second
<< dendl
;
2857 // simplify the claimed subtree.
2858 for (auto p
= my_subtrees
.begin(); p
!= my_subtrees
.end(); ++p
) {
2860 while (i
< p
->second
.size()) {
2861 dirfrag_t b
= p
->second
[i
];
2862 if (my_subtrees
.count(b
)) {
2863 vector
<dirfrag_t
>& bb
= my_subtrees
[b
];
2864 dout(10) << " simplify: " << p
->first
<< " swallowing " << b
<< " with bounds " << bb
<< dendl
;
2865 for (vector
<dirfrag_t
>::iterator r
= bb
.begin(); r
!= bb
.end(); ++r
)
2866 p
->second
.push_back(*r
);
2867 my_subtrees
.erase(b
);
2868 p
->second
.erase(p
->second
.begin() + i
);
2876 for (auto &p
: resolves
) {
2877 const MMDSResolve::ref
&m
= p
.second
;
2878 if (mds
->is_resolve()) {
2879 m
->add_table_commits(TABLE_SNAP
, resolve_snapclient_commits
);
2881 m
->add_table_commits(TABLE_SNAP
, mds
->snapclient
->get_journaled_tids());
2883 m
->subtrees
= my_subtrees
;
2884 m
->ambiguous_imports
= my_ambig_imports
;
2885 dout(10) << "sending subtee resolve to mds." << p
.first
<< dendl
;
2886 mds
->send_message_mds(m
, p
.first
);
2888 resolves_pending
= false;
2891 void MDCache::maybe_finish_slave_resolve() {
2892 if (resolve_ack_gather
.empty() && resolve_need_rollback
.empty()) {
2893 // snap cache get synced or I'm in resolve state
2894 if (mds
->snapclient
->is_synced() || resolve_done
)
2895 send_subtree_resolves();
2896 process_delayed_resolve();
2900 void MDCache::handle_mds_failure(mds_rank_t who
)
2902 dout(7) << "handle_mds_failure mds." << who
<< dendl
;
2904 dout(1) << "handle_mds_failure mds." << who
<< " : recovery peers are " << recovery_set
<< dendl
;
2906 resolve_gather
.insert(who
);
2907 discard_delayed_resolve(who
);
2908 ambiguous_slave_updates
.erase(who
);
2910 rejoin_gather
.insert(who
);
2911 rejoin_sent
.erase(who
); // i need to send another
2912 rejoin_ack_sent
.erase(who
); // i need to send another
2913 rejoin_ack_gather
.erase(who
); // i'll need/get another.
2915 dout(10) << " resolve_gather " << resolve_gather
<< dendl
;
2916 dout(10) << " resolve_ack_gather " << resolve_ack_gather
<< dendl
;
2917 dout(10) << " rejoin_sent " << rejoin_sent
<< dendl
;
2918 dout(10) << " rejoin_gather " << rejoin_gather
<< dendl
;
2919 dout(10) << " rejoin_ack_gather " << rejoin_ack_gather
<< dendl
;
2922 // tell the migrator too.
2923 migrator
->handle_mds_failure_or_stop(who
);
2925 // tell the balancer too.
2926 mds
->balancer
->handle_mds_failure(who
);
2928 // clean up any requests slave to/from this node
2929 list
<MDRequestRef
> finish
;
2930 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
2931 p
!= active_requests
.end();
2933 MDRequestRef
& mdr
= p
->second
;
2934 // slave to the failed node?
2935 if (mdr
->slave_to_mds
== who
) {
2936 if (mdr
->slave_did_prepare()) {
2937 dout(10) << " slave request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2938 if (is_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
))
2939 remove_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
);
2941 if (!mdr
->more()->waiting_on_slave
.empty()) {
2942 ceph_assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2943 // will rollback, no need to wait
2944 mdr
->reset_slave_request();
2945 mdr
->more()->waiting_on_slave
.clear();
2947 } else if (!mdr
->committing
) {
2948 dout(10) << " slave request " << *mdr
<< " has no prepare, finishing up" << dendl
;
2949 if (mdr
->slave_request
|| mdr
->slave_rolling_back())
2950 mdr
->aborted
= true;
2952 finish
.push_back(mdr
);
2956 if (mdr
->is_slave() && mdr
->slave_did_prepare()) {
2957 if (mdr
->more()->waiting_on_slave
.count(who
)) {
2958 ceph_assert(mdr
->more()->srcdn_auth_mds
== mds
->get_nodeid());
2959 dout(10) << " slave request " << *mdr
<< " no longer need rename notity ack from mds."
2961 mdr
->more()->waiting_on_slave
.erase(who
);
2962 if (mdr
->more()->waiting_on_slave
.empty() && mdr
->slave_request
)
2963 mds
->queue_waiter(new C_MDS_RetryRequest(this, mdr
));
2966 if (mdr
->more()->srcdn_auth_mds
== who
&&
2967 mds
->mdsmap
->is_clientreplay_or_active_or_stopping(mdr
->slave_to_mds
)) {
2968 // rename srcdn's auth mds failed, resolve even I'm a survivor.
2969 dout(10) << " slave request " << *mdr
<< " uncommitted, will resolve shortly" << dendl
;
2970 add_ambiguous_slave_update(p
->first
, mdr
->slave_to_mds
);
2972 } else if (mdr
->slave_request
) {
2973 const MMDSSlaveRequest::const_ref
&slave_req
= mdr
->slave_request
;
2974 // FIXME: Slave rename request can arrive after we notice mds failure.
2975 // This can cause mds to crash (does not affect integrity of FS).
2976 if (slave_req
->get_op() == MMDSSlaveRequest::OP_RENAMEPREP
&&
2977 slave_req
->srcdn_auth
== who
)
2978 slave_req
->mark_interrupted();
2981 // failed node is slave?
2982 if (mdr
->is_master() && !mdr
->committing
) {
2983 if (mdr
->more()->srcdn_auth_mds
== who
) {
2984 dout(10) << " master request " << *mdr
<< " waiting for rename srcdn's auth mds."
2985 << who
<< " to recover" << dendl
;
2986 ceph_assert(mdr
->more()->witnessed
.count(who
) == 0);
2987 if (mdr
->more()->is_ambiguous_auth
)
2988 mdr
->clear_ambiguous_auth();
2989 // rename srcdn's auth mds failed, all witnesses will rollback
2990 mdr
->more()->witnessed
.clear();
2991 pending_masters
.erase(p
->first
);
2994 if (mdr
->more()->witnessed
.count(who
)) {
2995 mds_rank_t srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
2996 if (srcdn_auth
>= 0 && mdr
->more()->waiting_on_slave
.count(srcdn_auth
)) {
2997 dout(10) << " master request " << *mdr
<< " waiting for rename srcdn's auth mds."
2998 << mdr
->more()->srcdn_auth_mds
<< " to reply" << dendl
;
2999 // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
3000 // until either the request is committing or the slave also fails.
3001 ceph_assert(mdr
->more()->waiting_on_slave
.size() == 1);
3002 pending_masters
.insert(p
->first
);
3004 dout(10) << " master request " << *mdr
<< " no longer witnessed by slave mds."
3005 << who
<< " to recover" << dendl
;
3006 if (srcdn_auth
>= 0)
3007 ceph_assert(mdr
->more()->witnessed
.count(srcdn_auth
) == 0);
3009 // discard this peer's prepare (if any)
3010 mdr
->more()->witnessed
.erase(who
);
3014 if (mdr
->more()->waiting_on_slave
.count(who
)) {
3015 dout(10) << " master request " << *mdr
<< " waiting for slave mds." << who
3016 << " to recover" << dendl
;
3017 // retry request when peer recovers
3018 mdr
->more()->waiting_on_slave
.erase(who
);
3019 if (mdr
->more()->waiting_on_slave
.empty())
3020 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(this, mdr
));
3023 if (mdr
->locking
&& mdr
->locking_target_mds
== who
)
3024 mdr
->finish_locking(mdr
->locking
);
3028 for (map
<metareqid_t
, umaster
>::iterator p
= uncommitted_masters
.begin();
3029 p
!= uncommitted_masters
.end();
3031 // The failed MDS may have already committed the slave update
3032 if (p
->second
.slaves
.count(who
)) {
3033 p
->second
.recovering
= true;
3034 p
->second
.slaves
.erase(who
);
3038 while (!finish
.empty()) {
3039 dout(10) << "cleaning up slave request " << *finish
.front() << dendl
;
3040 request_finish(finish
.front());
3044 kick_find_ino_peers(who
);
3045 kick_open_ino_peers(who
);
3047 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
3048 p
!= fragments
.end(); ) {
3049 dirfrag_t df
= p
->first
;
3050 fragment_info_t
& info
= p
->second
;
3052 if (info
.is_fragmenting()) {
3053 if (info
.notify_ack_waiting
.erase(who
) &&
3054 info
.notify_ack_waiting
.empty()) {
3055 fragment_drop_locks(info
);
3056 fragment_maybe_finish(p
++);
3064 dout(10) << "cancelling fragment " << df
<< " bit " << info
.bits
<< dendl
;
3066 info
.dirs
.swap(dirs
);
3067 fragments
.erase(df
);
3068 fragment_unmark_unfreeze_dirs(dirs
);
3071 // MDCache::shutdown_export_strays() always exports strays to mds.0
3072 if (who
== mds_rank_t(0))
3073 shutdown_exporting_strays
.clear();
3079 * handle_mds_recovery - called on another node's transition
3080 * from resolve -> active.
3082 void MDCache::handle_mds_recovery(mds_rank_t who
)
3084 dout(7) << "handle_mds_recovery mds." << who
<< dendl
;
3086 // exclude all discover waiters. kick_discovers() will do the job
3087 static const uint64_t i_mask
= CInode::WAIT_ANY_MASK
& ~CInode::WAIT_DIR
;
3088 static const uint64_t d_mask
= CDir::WAIT_ANY_MASK
& ~CDir::WAIT_DENTRY
;
3090 MDSContext::vec waiters
;
3092 // wake up any waiters in their subtrees
3093 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3094 p
!= subtrees
.end();
3096 CDir
*dir
= p
->first
;
3098 if (dir
->authority().first
!= who
||
3099 dir
->authority().second
== mds
->get_nodeid())
3101 ceph_assert(!dir
->is_auth());
3107 while (!q
.empty()) {
3108 CDir
*d
= q
.front();
3110 d
->take_waiting(d_mask
, waiters
);
3112 // inode waiters too
3113 for (auto &p
: d
->items
) {
3114 CDentry
*dn
= p
.second
;
3115 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3116 if (dnl
->is_primary()) {
3117 dnl
->get_inode()->take_waiting(i_mask
, waiters
);
3121 dnl
->get_inode()->get_dirfrags(ls
);
3122 for (list
<CDir
*>::iterator p
= ls
.begin();
3126 if (!subdir
->is_subtree_root())
3127 q
.push_back(subdir
);
3134 kick_open_ino_peers(who
);
3135 kick_find_ino_peers(who
);
3138 mds
->queue_waiters(waiters
);
3141 void MDCache::set_recovery_set(set
<mds_rank_t
>& s
)
3143 dout(7) << "set_recovery_set " << s
<< dendl
;
3149 * during resolve state, we share resolves to determine who
3150 * is authoritative for which trees. we expect to get an resolve
3151 * from _everyone_ in the recovery_set (the mds cluster at the time of
3152 * the first failure).
3154 * This functions puts the passed message before returning
3156 void MDCache::handle_resolve(const MMDSResolve::const_ref
&m
)
3158 dout(7) << "handle_resolve from " << m
->get_source() << dendl
;
3159 mds_rank_t from
= mds_rank_t(m
->get_source().num());
3161 if (mds
->get_state() < MDSMap::STATE_RESOLVE
) {
3162 if (mds
->get_want_state() == CEPH_MDS_STATE_RESOLVE
) {
3163 mds
->wait_for_resolve(new C_MDS_RetryMessage(mds
, m
));
3166 // wait until we reach the resolve stage!
3170 discard_delayed_resolve(from
);
3172 // ambiguous slave requests?
3173 if (!m
->slave_requests
.empty()) {
3174 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3175 for (auto p
= m
->slave_requests
.begin(); p
!= m
->slave_requests
.end(); ++p
) {
3176 if (uncommitted_masters
.count(p
->first
) && !uncommitted_masters
[p
->first
].safe
) {
3177 ceph_assert(!p
->second
.committing
);
3178 pending_masters
.insert(p
->first
);
3182 if (!pending_masters
.empty()) {
3183 dout(10) << " still have pending updates, delay processing slave resolve" << dendl
;
3184 delayed_resolve
[from
] = m
;
3189 auto ack
= MMDSResolveAck::create();
3190 for (const auto &p
: m
->slave_requests
) {
3191 if (uncommitted_masters
.count(p
.first
)) { //mds->sessionmap.have_completed_request(p.first)) {
3193 if (p
.second
.committing
) {
3194 // already committing, waiting for the OP_COMMITTED slave reply
3195 dout(10) << " already committing slave request " << p
<< " noop "<< dendl
;
3197 dout(10) << " ambiguous slave request " << p
<< " will COMMIT" << dendl
;
3198 ack
->add_commit(p
.first
);
3200 uncommitted_masters
[p
.first
].slaves
.insert(from
); // wait for slave OP_COMMITTED before we log ECommitted
3202 if (p
.second
.inode_caps
.length() > 0) {
3203 // slave wants to export caps (rename)
3204 ceph_assert(mds
->is_resolve());
3207 map
<client_t
,Capability::Export
> cap_exports
;
3208 auto q
= p
.second
.inode_caps
.cbegin();
3210 decode(cap_exports
, q
);
3212 ceph_assert(get_inode(ino
));
3214 for (map
<client_t
,Capability::Export
>::iterator q
= cap_exports
.begin();
3215 q
!= cap_exports
.end();
3217 Capability::Import
& im
= rejoin_imported_caps
[from
][ino
][q
->first
];
3218 im
.cap_id
= ++last_cap_id
; // assign a new cap ID
3220 im
.mseq
= q
->second
.mseq
;
3222 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
3224 rejoin_client_map
.emplace(q
->first
, session
->info
.inst
);
3227 // will process these caps in rejoin stage
3228 rejoin_slave_exports
[ino
].first
= from
;
3229 rejoin_slave_exports
[ino
].second
.swap(cap_exports
);
3231 // send information of imported caps back to slave
3232 encode(rejoin_imported_caps
[from
][ino
], ack
->commit
[p
.first
]);
3236 dout(10) << " ambiguous slave request " << p
<< " will ABORT" << dendl
;
3237 ceph_assert(!p
.second
.committing
);
3238 ack
->add_abort(p
.first
);
3241 mds
->send_message(ack
, m
->get_connection());
3245 if (!resolve_ack_gather
.empty() || !resolve_need_rollback
.empty()) {
3246 dout(10) << "delay processing subtree resolve" << dendl
;
3247 delayed_resolve
[from
] = m
;
3251 bool survivor
= false;
3252 // am i a surviving ambiguous importer?
3253 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
3255 // check for any import success/failure (from this node)
3256 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= my_ambiguous_imports
.begin();
3257 while (p
!= my_ambiguous_imports
.end()) {
3258 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator next
= p
;
3260 CDir
*dir
= get_dirfrag(p
->first
);
3262 dout(10) << "checking ambiguous import " << *dir
<< dendl
;
3263 if (migrator
->is_importing(dir
->dirfrag()) &&
3264 migrator
->get_import_peer(dir
->dirfrag()) == from
) {
3265 ceph_assert(migrator
->get_import_state(dir
->dirfrag()) == Migrator::IMPORT_ACKING
);
3267 // check if sender claims the subtree
3268 bool claimed_by_sender
= false;
3269 for (const auto &q
: m
->subtrees
) {
3270 // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
3271 CDir
*base
= get_force_dirfrag(q
.first
, false);
3272 if (!base
|| !base
->contains(dir
))
3273 continue; // base not dir or an ancestor of dir, clearly doesn't claim dir.
3277 get_force_dirfrag_bound_set(q
.second
, bounds
);
3278 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
3280 if (bound
->contains(dir
)) {
3281 inside
= false; // nope, bound is dir or parent of dir, not inside.
3286 claimed_by_sender
= true;
3289 my_ambiguous_imports
.erase(p
); // no longer ambiguous.
3290 if (claimed_by_sender
) {
3291 dout(7) << "ambiguous import failed on " << *dir
<< dendl
;
3292 migrator
->import_reverse(dir
);
3294 dout(7) << "ambiguous import succeeded on " << *dir
<< dendl
;
3295 migrator
->import_finish(dir
, true);
3302 // update my dir_auth values
3303 // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous
3304 // migrations between other nodes)
3305 for (const auto& p
: m
->subtrees
) {
3306 dout(10) << "peer claims " << p
.first
<< " bounds " << p
.second
<< dendl
;
3307 CDir
*dir
= get_force_dirfrag(p
.first
, !survivor
);
3310 adjust_bounded_subtree_auth(dir
, p
.second
, from
);
3311 try_subtree_merge(dir
);
3316 // note ambiguous imports too
3317 for (const auto& p
: m
->ambiguous_imports
) {
3318 dout(10) << "noting ambiguous import on " << p
.first
<< " bounds " << p
.second
<< dendl
;
3319 other_ambiguous_imports
[from
][p
.first
] = p
.second
;
3322 // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
3323 // snaptable cache from snapserver. By this way, snaptable cache gets synced among all mds
3324 for (const auto& p
: m
->table_clients
) {
3325 dout(10) << " noting " << get_mdstable_name(p
.type
)
3326 << " pending_commits " << p
.pending_commits
<< dendl
;
3327 MDSTableClient
*client
= mds
->get_table_client(p
.type
);
3328 for (const auto& q
: p
.pending_commits
)
3329 client
->notify_commit(q
);
3332 // did i get them all?
3333 resolve_gather
.erase(from
);
3335 maybe_resolve_finish();
3338 void MDCache::process_delayed_resolve()
3340 dout(10) << "process_delayed_resolve" << dendl
;
3341 map
<mds_rank_t
, MMDSResolve::const_ref
> tmp
;
3342 tmp
.swap(delayed_resolve
);
3343 for (auto &p
: tmp
) {
3344 handle_resolve(p
.second
);
3348 void MDCache::discard_delayed_resolve(mds_rank_t who
)
3350 delayed_resolve
.erase(who
);
3353 void MDCache::maybe_resolve_finish()
3355 ceph_assert(resolve_ack_gather
.empty());
3356 ceph_assert(resolve_need_rollback
.empty());
3358 if (!resolve_gather
.empty()) {
3359 dout(10) << "maybe_resolve_finish still waiting for resolves ("
3360 << resolve_gather
<< ")" << dendl
;
3364 dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl
;
3365 disambiguate_my_imports();
3366 finish_committed_masters();
3369 ceph_assert(mds
->is_resolve());
3370 trim_unlinked_inodes();
3371 recalc_auth_bits(false);
3372 resolve_done
.release()->complete(0);
3375 maybe_send_pending_rejoins();
3379 void MDCache::handle_resolve_ack(const MMDSResolveAck::const_ref
&ack
)
3381 dout(10) << "handle_resolve_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
3382 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
3384 if (!resolve_ack_gather
.count(from
) ||
3385 mds
->mdsmap
->get_state(from
) < MDSMap::STATE_RESOLVE
) {
3389 if (ambiguous_slave_updates
.count(from
)) {
3390 ceph_assert(mds
->mdsmap
->is_clientreplay_or_active_or_stopping(from
));
3391 ceph_assert(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3394 for (const auto &p
: ack
->commit
) {
3395 dout(10) << " commit on slave " << p
.first
<< dendl
;
3397 if (ambiguous_slave_updates
.count(from
)) {
3398 remove_ambiguous_slave_update(p
.first
, from
);
3402 if (mds
->is_resolve()) {
3404 MDSlaveUpdate
*su
= get_uncommitted_slave_update(p
.first
, from
);
3408 mds
->mdlog
->start_submit_entry(new ESlaveUpdate(mds
->mdlog
, "unknown", p
.first
, from
,
3409 ESlaveUpdate::OP_COMMIT
, su
->origop
),
3410 new C_MDC_SlaveCommit(this, from
, p
.first
));
3411 mds
->mdlog
->flush();
3413 finish_uncommitted_slave_update(p
.first
, from
);
3415 MDRequestRef mdr
= request_get(p
.first
);
3416 // information about master imported caps
3417 if (p
.second
.length() > 0)
3418 mdr
->more()->inode_import
.share(p
.second
);
3420 ceph_assert(mdr
->slave_request
== 0); // shouldn't be doing anything!
3421 request_finish(mdr
);
3425 for (const auto &metareq
: ack
->abort
) {
3426 dout(10) << " abort on slave " << metareq
<< dendl
;
3428 if (mds
->is_resolve()) {
3429 MDSlaveUpdate
*su
= get_uncommitted_slave_update(metareq
, from
);
3432 // perform rollback (and journal a rollback entry)
3433 // note: this will hold up the resolve a bit, until the rollback entries journal.
3434 MDRequestRef null_ref
;
3435 switch (su
->origop
) {
3436 case ESlaveUpdate::LINK
:
3437 mds
->server
->do_link_rollback(su
->rollback
, from
, null_ref
);
3439 case ESlaveUpdate::RENAME
:
3440 mds
->server
->do_rename_rollback(su
->rollback
, from
, null_ref
);
3442 case ESlaveUpdate::RMDIR
:
3443 mds
->server
->do_rmdir_rollback(su
->rollback
, from
, null_ref
);
3449 MDRequestRef mdr
= request_get(metareq
);
3450 mdr
->aborted
= true;
3451 if (mdr
->slave_request
) {
3452 if (mdr
->slave_did_prepare()) // journaling slave prepare ?
3453 add_rollback(metareq
, from
);
3455 request_finish(mdr
);
3460 if (!ambiguous_slave_updates
.count(from
)) {
3461 resolve_ack_gather
.erase(from
);
3462 maybe_finish_slave_resolve();
3466 void MDCache::add_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
, MDSlaveUpdate
*su
)
3468 ceph_assert(uncommitted_slave_updates
[master
].count(reqid
) == 0);
3469 uncommitted_slave_updates
[master
][reqid
] = su
;
3470 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
)
3471 uncommitted_slave_rename_olddir
[*p
]++;
3472 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
)
3473 uncommitted_slave_unlink
[*p
]++;
3476 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
)
3478 ceph_assert(uncommitted_slave_updates
[master
].count(reqid
));
3479 MDSlaveUpdate
* su
= uncommitted_slave_updates
[master
][reqid
];
3481 uncommitted_slave_updates
[master
].erase(reqid
);
3482 if (uncommitted_slave_updates
[master
].empty())
3483 uncommitted_slave_updates
.erase(master
);
3484 // discard the non-auth subtree we renamed out of
3485 for(set
<CInode
*>::iterator p
= su
->olddirs
.begin(); p
!= su
->olddirs
.end(); ++p
) {
3487 map
<CInode
*, int>::iterator it
= uncommitted_slave_rename_olddir
.find(diri
);
3488 ceph_assert(it
!= uncommitted_slave_rename_olddir
.end());
3490 if (it
->second
== 0) {
3491 uncommitted_slave_rename_olddir
.erase(it
);
3493 diri
->get_dirfrags(ls
);
3494 for (list
<CDir
*>::iterator q
= ls
.begin(); q
!= ls
.end(); ++q
) {
3495 CDir
*root
= get_subtree_root(*q
);
3496 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
3497 try_trim_non_auth_subtree(root
);
3503 ceph_assert(it
->second
> 0);
3505 // removed the inodes that were unlinked by slave update
3506 for(set
<CInode
*>::iterator p
= su
->unlinked
.begin(); p
!= su
->unlinked
.end(); ++p
) {
3508 map
<CInode
*, int>::iterator it
= uncommitted_slave_unlink
.find(in
);
3509 ceph_assert(it
!= uncommitted_slave_unlink
.end());
3511 if (it
->second
== 0) {
3512 uncommitted_slave_unlink
.erase(it
);
3513 if (!in
->get_projected_parent_dn())
3514 mds
->mdcache
->remove_inode_recursive(in
);
3516 ceph_assert(it
->second
> 0);
3521 MDSlaveUpdate
* MDCache::get_uncommitted_slave_update(metareqid_t reqid
, mds_rank_t master
)
3524 MDSlaveUpdate
* su
= NULL
;
3525 if (uncommitted_slave_updates
.count(master
) &&
3526 uncommitted_slave_updates
[master
].count(reqid
)) {
3527 su
= uncommitted_slave_updates
[master
][reqid
];
3533 void MDCache::finish_rollback(metareqid_t reqid
) {
3534 auto p
= resolve_need_rollback
.find(reqid
);
3535 ceph_assert(p
!= resolve_need_rollback
.end());
3536 if (mds
->is_resolve())
3537 finish_uncommitted_slave_update(reqid
, p
->second
);
3538 resolve_need_rollback
.erase(p
);
3539 maybe_finish_slave_resolve();
3542 void MDCache::disambiguate_other_imports()
3544 dout(10) << "disambiguate_other_imports" << dendl
;
3546 bool recovering
= !(mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping());
3547 // other nodes' ambiguous imports
3548 for (map
<mds_rank_t
, map
<dirfrag_t
, vector
<dirfrag_t
> > >::iterator p
= other_ambiguous_imports
.begin();
3549 p
!= other_ambiguous_imports
.end();
3551 mds_rank_t who
= p
->first
;
3552 dout(10) << "ambiguous imports for mds." << who
<< dendl
;
3554 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= p
->second
.begin();
3555 q
!= p
->second
.end();
3557 dout(10) << " ambiguous import " << q
->first
<< " bounds " << q
->second
<< dendl
;
3558 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3559 CDir
*dir
= get_force_dirfrag(q
->first
, recovering
);
3562 if (dir
->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3563 dir
->authority() == CDIR_AUTH_UNDEF
) { // resolving
3564 dout(10) << " mds." << who
<< " did import " << *dir
<< dendl
;
3565 adjust_bounded_subtree_auth(dir
, q
->second
, who
);
3566 try_subtree_merge(dir
);
3568 dout(10) << " mds." << who
<< " did not import " << *dir
<< dendl
;
3572 other_ambiguous_imports
.clear();
3575 void MDCache::disambiguate_my_imports()
3577 dout(10) << "disambiguate_my_imports" << dendl
;
3579 if (!mds
->is_resolve()) {
3580 ceph_assert(my_ambiguous_imports
.empty());
3584 disambiguate_other_imports();
3586 // my ambiguous imports
3587 mds_authority_t
me_ambig(mds
->get_nodeid(), mds
->get_nodeid());
3588 while (!my_ambiguous_imports
.empty()) {
3589 map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator q
= my_ambiguous_imports
.begin();
3591 CDir
*dir
= get_dirfrag(q
->first
);
3594 if (dir
->authority() != me_ambig
) {
3595 dout(10) << "ambiguous import auth known, must not be me " << *dir
<< dendl
;
3596 cancel_ambiguous_import(dir
);
3598 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, false));
3600 // subtree may have been swallowed by another node claiming dir
3602 CDir
*root
= get_subtree_root(dir
);
3604 dout(10) << " subtree root is " << *root
<< dendl
;
3605 ceph_assert(root
->dir_auth
.first
!= mds
->get_nodeid()); // no us!
3606 try_trim_non_auth_subtree(root
);
3608 dout(10) << "ambiguous import auth unclaimed, must be me " << *dir
<< dendl
;
3609 finish_ambiguous_import(q
->first
);
3610 mds
->mdlog
->start_submit_entry(new EImportFinish(dir
, true));
3613 ceph_assert(my_ambiguous_imports
.empty());
3614 mds
->mdlog
->flush();
3616 // verify all my subtrees are unambiguous!
3617 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3618 p
!= subtrees
.end();
3620 CDir
*dir
= p
->first
;
3621 if (dir
->is_ambiguous_dir_auth()) {
3622 dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir
<< dendl
;
3624 ceph_assert(!dir
->is_ambiguous_dir_auth());
3631 void MDCache::add_ambiguous_import(dirfrag_t base
, const vector
<dirfrag_t
>& bounds
)
3633 ceph_assert(my_ambiguous_imports
.count(base
) == 0);
3634 my_ambiguous_imports
[base
] = bounds
;
3638 void MDCache::add_ambiguous_import(CDir
*base
, const set
<CDir
*>& bounds
)
3641 vector
<dirfrag_t
> binos
;
3642 for (set
<CDir
*>::iterator p
= bounds
.begin();
3645 binos
.push_back((*p
)->dirfrag());
3647 // note: this can get called twice if the exporter fails during recovery
3648 if (my_ambiguous_imports
.count(base
->dirfrag()))
3649 my_ambiguous_imports
.erase(base
->dirfrag());
3651 add_ambiguous_import(base
->dirfrag(), binos
);
3654 void MDCache::cancel_ambiguous_import(CDir
*dir
)
3656 dirfrag_t df
= dir
->dirfrag();
3657 ceph_assert(my_ambiguous_imports
.count(df
));
3658 dout(10) << "cancel_ambiguous_import " << df
3659 << " bounds " << my_ambiguous_imports
[df
]
3662 my_ambiguous_imports
.erase(df
);
3665 void MDCache::finish_ambiguous_import(dirfrag_t df
)
3667 ceph_assert(my_ambiguous_imports
.count(df
));
3668 vector
<dirfrag_t
> bounds
;
3669 bounds
.swap(my_ambiguous_imports
[df
]);
3670 my_ambiguous_imports
.erase(df
);
3672 dout(10) << "finish_ambiguous_import " << df
3673 << " bounds " << bounds
3675 CDir
*dir
= get_dirfrag(df
);
3678 // adjust dir_auth, import maps
3679 adjust_bounded_subtree_auth(dir
, bounds
, mds
->get_nodeid());
3680 try_subtree_merge(dir
);
3683 void MDCache::remove_inode_recursive(CInode
*in
)
3685 dout(10) << "remove_inode_recursive " << *in
<< dendl
;
3687 in
->get_dirfrags(ls
);
3688 list
<CDir
*>::iterator p
= ls
.begin();
3689 while (p
!= ls
.end()) {
3690 CDir
*subdir
= *p
++;
3692 dout(10) << " removing dirfrag " << subdir
<< dendl
;
3693 auto it
= subdir
->items
.begin();
3694 while (it
!= subdir
->items
.end()) {
3695 CDentry
*dn
= it
->second
;
3697 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3698 if (dnl
->is_primary()) {
3699 CInode
*tin
= dnl
->get_inode();
3700 subdir
->unlink_inode(dn
, false);
3701 remove_inode_recursive(tin
);
3703 subdir
->remove_dentry(dn
);
3706 if (subdir
->is_subtree_root())
3707 remove_subtree(subdir
);
3708 in
->close_dirfrag(subdir
->dirfrag().frag
);
3713 bool MDCache::expire_recursive(CInode
*in
, expiremap
&expiremap
)
3715 ceph_assert(!in
->is_auth());
3717 dout(10) << __func__
<< ":" << *in
<< dendl
;
3719 // Recurse into any dirfrags beneath this inode
3721 in
->get_dirfrags(ls
);
3722 for (auto subdir
: ls
) {
3723 if (!in
->is_mdsdir() && subdir
->is_subtree_root()) {
3724 dout(10) << __func__
<< ": stray still has subtree " << *in
<< dendl
;
3728 for (auto &it
: subdir
->items
) {
3729 CDentry
*dn
= it
.second
;
3730 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3731 if (dnl
->is_primary()) {
3732 CInode
*tin
= dnl
->get_inode();
3734 /* Remote strays with linkage (i.e. hardlinks) should not be
3735 * expired, because they may be the target of
3736 * a rename() as the owning MDS shuts down */
3737 if (!tin
->is_stray() && tin
->inode
.nlink
) {
3738 dout(10) << __func__
<< ": stray still has linkage " << *tin
<< dendl
;
3742 const bool abort
= expire_recursive(tin
, expiremap
);
3747 if (dn
->lru_is_expireable()) {
3748 trim_dentry(dn
, expiremap
);
3750 dout(10) << __func__
<< ": stray dn is not expireable " << *dn
<< dendl
;
3759 void MDCache::trim_unlinked_inodes()
3761 dout(7) << "trim_unlinked_inodes" << dendl
;
3764 for (auto &p
: inode_map
) {
3765 CInode
*in
= p
.second
;
3766 if (in
->get_parent_dn() == NULL
&& !in
->is_base()) {
3767 dout(7) << " will trim from " << *in
<< dendl
;
3771 if (!(++count
% 1000))
3772 mds
->heartbeat_reset();
3775 for (auto& in
: q
) {
3776 remove_inode_recursive(in
);
3778 if (!(++count
% 1000))
3779 mds
->heartbeat_reset();
3783 /** recalc_auth_bits()
3784 * once subtree auth is disambiguated, we need to adjust all the
3785 * auth and dirty bits in our cache before moving on.
3787 void MDCache::recalc_auth_bits(bool replay
)
3789 dout(7) << "recalc_auth_bits " << (replay
? "(replay)" : "") << dendl
;
3792 root
->inode_auth
.first
= mds
->mdsmap
->get_root();
3793 bool auth
= mds
->get_nodeid() == root
->inode_auth
.first
;
3795 root
->state_set(CInode::STATE_AUTH
);
3797 root
->state_clear(CInode::STATE_AUTH
);
3799 root
->state_set(CInode::STATE_REJOINING
);
3803 set
<CInode
*> subtree_inodes
;
3804 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3805 p
!= subtrees
.end();
3807 if (p
->first
->dir_auth
.first
== mds
->get_nodeid())
3808 subtree_inodes
.insert(p
->first
->inode
);
3811 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
3812 p
!= subtrees
.end();
3814 if (p
->first
->inode
->is_mdsdir()) {
3815 CInode
*in
= p
->first
->inode
;
3816 bool auth
= in
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid());
3818 in
->state_set(CInode::STATE_AUTH
);
3820 in
->state_clear(CInode::STATE_AUTH
);
3822 in
->state_set(CInode::STATE_REJOINING
);
3826 list
<CDir
*> dfq
; // dirfrag queue
3827 dfq
.push_back(p
->first
);
3829 bool auth
= p
->first
->authority().first
== mds
->get_nodeid();
3830 dout(10) << " subtree auth=" << auth
<< " for " << *p
->first
<< dendl
;
3832 while (!dfq
.empty()) {
3833 CDir
*dir
= dfq
.front();
3838 dir
->state_set(CDir::STATE_AUTH
);
3840 dir
->state_clear(CDir::STATE_AUTH
);
3842 // close empty non-auth dirfrag
3843 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0) {
3844 dir
->inode
->close_dirfrag(dir
->get_frag());
3847 dir
->state_set(CDir::STATE_REJOINING
);
3848 dir
->state_clear(CDir::STATE_COMPLETE
);
3849 if (dir
->is_dirty())
3854 // dentries in this dir
3855 for (auto &p
: dir
->items
) {
3857 CDentry
*dn
= p
.second
;
3858 CDentry::linkage_t
*dnl
= dn
->get_linkage();
3860 dn
->state_set(CDentry::STATE_AUTH
);
3862 dn
->state_clear(CDentry::STATE_AUTH
);
3864 dn
->state_set(CDentry::STATE_REJOINING
);
3870 if (dnl
->is_primary()) {
3872 CInode
*in
= dnl
->get_inode();
3874 in
->state_set(CInode::STATE_AUTH
);
3876 in
->state_clear(CInode::STATE_AUTH
);
3878 in
->state_set(CInode::STATE_REJOINING
);
3881 if (in
->is_dirty_parent())
3882 in
->clear_dirty_parent();
3883 // avoid touching scatterlocks for our subtree roots!
3884 if (subtree_inodes
.count(in
) == 0)
3885 in
->clear_scatter_dirty();
3890 in
->get_nested_dirfrags(dfq
);
 3902 // ===========================================================================
 3906  * notes on scatterlock recovery:
 3908  *  - recovering inode replica sends scatterlock data for any subtree
 3909  *    roots (the only ones that are possibly dirty).
 3911  *  - surviving auth incorporates any provided scatterlock data.  any
 3912  *    pending gathers are then finished, as with the other lock types.
 3914  *  that takes care of surviving auth + (recovering replica)*.
 3916  *  - surviving replica sends strong_inode, which includes current
 3917  *    scatterlock state, AND any dirty scatterlock data.  this
 3918  *    provides the recovering auth with everything it might need.
 3920  *  - recovering auth must pick initial scatterlock state based on
 3921  *    (weak|strong) rejoins.
 3922  *    - always assimilate scatterlock data (it can't hurt)
 3923  *    - any surviving replica in SCATTER state -> SCATTER.  otherwise, SYNC.
 3924  *    - include base inode in ack for all inodes that saw scatterlock content
 3926  *  also, for scatter gather,
 3928  *  - auth increments {frag,r}stat.version on completion of any gather.
 3930  *  - auth incorporates changes in a gather _only_ if the version
 3931  *    matches.
 3933  *  - replica discards changes any time the scatterlock syncs, and
 3934  *    also on rejoin.
3937 void MDCache::dump_rejoin_status(Formatter
*f
) const
3939 f
->open_object_section("rejoin_status");
3940 f
->dump_stream("rejoin_gather") << rejoin_gather
;
3941 f
->dump_stream("rejoin_ack_gather") << rejoin_ack_gather
;
3942 f
->dump_unsigned("num_opening_inodes", cap_imports_num_opening
);
3946 void MDCache::rejoin_start(MDSContext
*rejoin_done_
)
3948 dout(10) << "rejoin_start" << dendl
;
3949 ceph_assert(!rejoin_done
);
3950 rejoin_done
.reset(rejoin_done_
);
3952 rejoin_gather
= recovery_set
;
3953 // need finish opening cap inodes before sending cache rejoins
3954 rejoin_gather
.insert(mds
->get_nodeid());
3955 process_imported_caps();
3961 * this initiates rejoin. it should be called before we get any
3962 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3964 * we start out by sending rejoins to everyone in the recovery set.
3966 * if we are rejoin, send for all regions in our cache.
3967 * if we are active|stopping, send only to nodes that are rejoining.
3969 void MDCache::rejoin_send_rejoins()
3971 dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set
<< dendl
;
3973 if (rejoin_gather
.count(mds
->get_nodeid())) {
3974 dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl
;
3975 rejoins_pending
= true;
3978 if (!resolve_gather
.empty()) {
3979 dout(7) << "rejoin_send_rejoins still waiting for resolves ("
3980 << resolve_gather
<< ")" << dendl
;
3981 rejoins_pending
= true;
3985 ceph_assert(!migrator
->is_importing());
3986 ceph_assert(!migrator
->is_exporting());
3988 if (!mds
->is_rejoin()) {
3989 disambiguate_other_imports();
3992 map
<mds_rank_t
, MMDSCacheRejoin::ref
> rejoins
;
3995 // if i am rejoining, send a rejoin to everyone.
3996 // otherwise, just send to others who are rejoining.
3997 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
3998 p
!= recovery_set
.end();
4000 if (*p
== mds
->get_nodeid()) continue; // nothing to myself!
4001 if (rejoin_sent
.count(*p
)) continue; // already sent a rejoin to this node!
4002 if (mds
->is_rejoin())
4003 rejoins
[*p
] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_WEAK
);
4004 else if (mds
->mdsmap
->is_rejoin(*p
))
4005 rejoins
[*p
] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_STRONG
);
4008 if (mds
->is_rejoin()) {
4009 map
<client_t
, pair
<Session
*, set
<mds_rank_t
> > > client_exports
;
4010 for (auto& p
: cap_exports
) {
4011 mds_rank_t target
= p
.second
.first
;
4012 if (rejoins
.count(target
) == 0)
4014 for (auto q
= p
.second
.second
.begin(); q
!= p
.second
.second
.end(); ) {
4015 Session
*session
= nullptr;
4016 auto it
= client_exports
.find(q
->first
);
4017 if (it
!= client_exports
.end()) {
4018 session
= it
->second
.first
;
4020 it
->second
.second
.insert(target
);
4022 session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
4023 auto& r
= client_exports
[q
->first
];
4026 r
.second
.insert(target
);
4031 // remove reconnect with no session
4032 p
.second
.second
.erase(q
++);
4035 rejoins
[target
]->cap_exports
[p
.first
] = p
.second
.second
;
4037 for (auto& p
: client_exports
) {
4038 Session
*session
= p
.second
.first
;
4039 for (auto& q
: p
.second
.second
) {
4040 auto rejoin
= rejoins
[q
];
4041 rejoin
->client_map
[p
.first
] = session
->info
.inst
;
4042 rejoin
->client_metadata_map
[p
.first
] = session
->info
.client_metadata
;
4048 // check all subtrees
4049 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
4050 p
!= subtrees
.end();
4052 CDir
*dir
= p
->first
;
4053 ceph_assert(dir
->is_subtree_root());
4054 if (dir
->is_ambiguous_dir_auth()) {
4055 // exporter is recovering, importer is survivor.
4056 ceph_assert(rejoins
.count(dir
->authority().first
));
4057 ceph_assert(!rejoins
.count(dir
->authority().second
));
4063 continue; // skip my own regions!
4065 mds_rank_t auth
= dir
->get_dir_auth().first
;
4066 ceph_assert(auth
>= 0);
4067 if (rejoins
.count(auth
) == 0)
4068 continue; // don't care about this node's subtrees
4070 rejoin_walk(dir
, rejoins
[auth
]);
4073 // rejoin root inodes, too
4074 for (auto &p
: rejoins
) {
4075 if (mds
->is_rejoin()) {
4077 if (p
.first
== 0 && root
) {
4078 p
.second
->add_weak_inode(root
->vino());
4079 if (root
->is_dirty_scattered()) {
4080 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4081 p
.second
->add_scatterlock_state(root
);
4084 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4086 p
.second
->add_weak_inode(in
->vino());
4090 if (p
.first
== 0 && root
) {
4091 p
.second
->add_strong_inode(root
->vino(),
4092 root
->get_replica_nonce(),
4093 root
->get_caps_wanted(),
4094 root
->filelock
.get_state(),
4095 root
->nestlock
.get_state(),
4096 root
->dirfragtreelock
.get_state());
4097 root
->state_set(CInode::STATE_REJOINING
);
4098 if (root
->is_dirty_scattered()) {
4099 dout(10) << " sending scatterlock state on root " << *root
<< dendl
;
4100 p
.second
->add_scatterlock_state(root
);
4104 if (CInode
*in
= get_inode(MDS_INO_MDSDIR(p
.first
))) {
4105 p
.second
->add_strong_inode(in
->vino(),
4106 in
->get_replica_nonce(),
4107 in
->get_caps_wanted(),
4108 in
->filelock
.get_state(),
4109 in
->nestlock
.get_state(),
4110 in
->dirfragtreelock
.get_state());
4111 in
->state_set(CInode::STATE_REJOINING
);
4116 if (!mds
->is_rejoin()) {
4117 // i am survivor. send strong rejoin.
4118 // note request remote_auth_pins, xlocks
4119 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
4120 p
!= active_requests
.end();
4122 MDRequestRef
& mdr
= p
->second
;
4123 if (mdr
->is_slave())
4126 for (const auto& q
: mdr
->remote_auth_pins
) {
4127 if (!q
.first
->is_auth()) {
4128 ceph_assert(q
.second
== q
.first
->authority().first
);
4129 if (rejoins
.count(q
.second
) == 0) continue;
4130 const MMDSCacheRejoin::ref
&rejoin
= rejoins
[q
.second
];
4132 dout(15) << " " << *mdr
<< " authpin on " << *q
.first
<< dendl
;
4133 MDSCacheObjectInfo i
;
4134 q
.first
->set_object_info(i
);
4136 rejoin
->add_inode_authpin(vinodeno_t(i
.ino
, i
.snapid
), mdr
->reqid
, mdr
->attempt
);
4138 rejoin
->add_dentry_authpin(i
.dirfrag
, i
.dname
, i
.snapid
, mdr
->reqid
, mdr
->attempt
);
4140 if (mdr
->has_more() && mdr
->more()->is_remote_frozen_authpin
&&
4141 mdr
->more()->rename_inode
== q
.first
)
4142 rejoin
->add_inode_frozen_authpin(vinodeno_t(i
.ino
, i
.snapid
),
4143 mdr
->reqid
, mdr
->attempt
);
4147 for (const auto& q
: mdr
->locks
) {
4149 auto obj
= lock
->get_parent();
4150 if (q
.is_xlock() && !obj
->is_auth()) {
4151 mds_rank_t who
= obj
->authority().first
;
4152 if (rejoins
.count(who
) == 0) continue;
4153 const MMDSCacheRejoin::ref
&rejoin
= rejoins
[who
];
4155 dout(15) << " " << *mdr
<< " xlock on " << *lock
<< " " << *obj
<< dendl
;
4156 MDSCacheObjectInfo i
;
4157 obj
->set_object_info(i
);
4159 rejoin
->add_inode_xlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4160 mdr
->reqid
, mdr
->attempt
);
4162 rejoin
->add_dentry_xlock(i
.dirfrag
, i
.dname
, i
.snapid
,
4163 mdr
->reqid
, mdr
->attempt
);
4164 } else if (q
.is_remote_wrlock()) {
4165 mds_rank_t who
= q
.wrlock_target
;
4166 if (rejoins
.count(who
) == 0) continue;
4167 const MMDSCacheRejoin::ref
&rejoin
= rejoins
[who
];
4169 dout(15) << " " << *mdr
<< " wrlock on " << *lock
<< " " << *obj
<< dendl
;
4170 MDSCacheObjectInfo i
;
4171 obj
->set_object_info(i
);
4173 rejoin
->add_inode_wrlock(vinodeno_t(i
.ino
, i
.snapid
), lock
->get_type(),
4174 mdr
->reqid
, mdr
->attempt
);
4180 // send the messages
4181 for (auto &p
: rejoins
) {
4182 ceph_assert(rejoin_sent
.count(p
.first
) == 0);
4183 ceph_assert(rejoin_ack_gather
.count(p
.first
) == 0);
4184 rejoin_sent
.insert(p
.first
);
4185 rejoin_ack_gather
.insert(p
.first
);
4186 mds
->send_message_mds(p
.second
, p
.first
);
4188 rejoin_ack_gather
.insert(mds
->get_nodeid()); // we need to complete rejoin_gather_finish, too
4189 rejoins_pending
= false;
4192 if (mds
->is_rejoin() && rejoin_gather
.empty()) {
4193 dout(10) << "nothing to rejoin" << dendl
;
4194 rejoin_gather_finish();
4200 * rejoin_walk - build rejoin declarations for a subtree
4202 * @param dir subtree root
4203 * @param rejoin rejoin message
4205 * from a rejoining node:
4207 * weak dentries (w/ connectivity)
4209 * from a surviving node:
4211 * strong dentries (no connectivity!)
4214 void MDCache::rejoin_walk(CDir
*dir
, const MMDSCacheRejoin::ref
&rejoin
)
4216 dout(10) << "rejoin_walk " << *dir
<< dendl
;
4218 list
<CDir
*> nested
; // finish this dir, then do nested items
4220 if (mds
->is_rejoin()) {
4222 rejoin
->add_weak_dirfrag(dir
->dirfrag());
4223 for (auto &p
: dir
->items
) {
4224 CDentry
*dn
= p
.second
;
4225 ceph_assert(dn
->last
== CEPH_NOSNAP
);
4226 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4227 dout(15) << " add_weak_primary_dentry " << *dn
<< dendl
;
4228 ceph_assert(dnl
->is_primary());
4229 CInode
*in
= dnl
->get_inode();
4230 ceph_assert(dnl
->get_inode()->is_dir());
4231 rejoin
->add_weak_primary_dentry(dir
->ino(), dn
->get_name(), dn
->first
, dn
->last
, in
->ino());
4232 in
->get_nested_dirfrags(nested
);
4233 if (in
->is_dirty_scattered()) {
4234 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4235 rejoin
->add_scatterlock_state(in
);
4240 dout(15) << " add_strong_dirfrag " << *dir
<< dendl
;
4241 rejoin
->add_strong_dirfrag(dir
->dirfrag(), dir
->get_replica_nonce(), dir
->get_dir_rep());
4242 dir
->state_set(CDir::STATE_REJOINING
);
4244 for (auto it
= dir
->items
.begin(); it
!= dir
->items
.end(); ) {
4245 CDentry
*dn
= it
->second
;
4247 dn
->state_set(CDentry::STATE_REJOINING
);
4248 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4249 CInode
*in
= dnl
->is_primary() ? dnl
->get_inode() : NULL
;
4251 // trim snap dentries. because they may have been pruned by
4252 // their auth mds (snap deleted)
4253 if (dn
->last
!= CEPH_NOSNAP
) {
4254 if (in
&& !in
->remote_parents
.empty()) {
4255 // unlink any stale remote snap dentry.
4256 for (auto it2
= in
->remote_parents
.begin(); it2
!= in
->remote_parents
.end(); ) {
4257 CDentry
*remote_dn
= *it2
;
4259 ceph_assert(remote_dn
->last
!= CEPH_NOSNAP
);
4260 remote_dn
->unlink_remote(remote_dn
->get_linkage());
4263 if (dn
->lru_is_expireable()) {
4264 if (!dnl
->is_null())
4265 dir
->unlink_inode(dn
, false);
4268 dir
->remove_dentry(dn
);
4271 // Inventing null/remote dentry shouldn't cause problem
4272 ceph_assert(!dnl
->is_primary());
4276 dout(15) << " add_strong_dentry " << *dn
<< dendl
;
4277 rejoin
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->first
, dn
->last
,
4278 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
4279 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
4280 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
4281 dn
->get_replica_nonce(),
4282 dn
->lock
.get_state());
4283 dn
->state_set(CDentry::STATE_REJOINING
);
4284 if (dnl
->is_primary()) {
4285 CInode
*in
= dnl
->get_inode();
4286 dout(15) << " add_strong_inode " << *in
<< dendl
;
4287 rejoin
->add_strong_inode(in
->vino(),
4288 in
->get_replica_nonce(),
4289 in
->get_caps_wanted(),
4290 in
->filelock
.get_state(),
4291 in
->nestlock
.get_state(),
4292 in
->dirfragtreelock
.get_state());
4293 in
->state_set(CInode::STATE_REJOINING
);
4294 in
->get_nested_dirfrags(nested
);
4295 if (in
->is_dirty_scattered()) {
4296 dout(10) << " sending scatterlock state on " << *in
<< dendl
;
4297 rejoin
->add_scatterlock_state(in
);
4303 // recurse into nested dirs
4304 for (list
<CDir
*>::iterator p
= nested
.begin();
4307 rejoin_walk(*p
, rejoin
);
4313 * - reply with the lockstate
4315 * if i am active|stopping,
4316 * - remove source from replica list for everything not referenced here.
4318 void MDCache::handle_cache_rejoin(const MMDSCacheRejoin::const_ref
&m
)
4320 dout(7) << "handle_cache_rejoin " << *m
<< " from " << m
->get_source()
4321 << " (" << m
->get_payload().length() << " bytes)"
4325 case MMDSCacheRejoin::OP_WEAK
:
4326 handle_cache_rejoin_weak(m
);
4328 case MMDSCacheRejoin::OP_STRONG
:
4329 handle_cache_rejoin_strong(m
);
4331 case MMDSCacheRejoin::OP_ACK
:
4332 handle_cache_rejoin_ack(m
);
4342 * handle_cache_rejoin_weak
4345 * - is recovering from their journal.
4346 * - may have incorrect (out of date) inode contents
4347 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4349 * if the sender didn't trim_non_auth(), they
4350 * - may have incorrect (out of date) dentry/inode linkage
4351 * - may have deleted/purged inodes
4352 * and i may have to go to disk to get accurate inode contents. yuck.
4354 void MDCache::handle_cache_rejoin_weak(const MMDSCacheRejoin::const_ref
&weak
)
4356 mds_rank_t from
= mds_rank_t(weak
->get_source().num());
4358 // possible response(s)
4359 MMDSCacheRejoin::ref ack
; // if survivor
4360 set
<vinodeno_t
> acked_inodes
; // if survivor
4361 set
<SimpleLock
*> gather_locks
; // if survivor
4362 bool survivor
= false; // am i a survivor?
4364 if (mds
->is_clientreplay() || mds
->is_active() || mds
->is_stopping()) {
4366 dout(10) << "i am a surivivor, and will ack immediately" << dendl
;
4367 ack
= MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK
);
4369 map
<inodeno_t
,map
<client_t
,Capability::Import
> > imported_caps
;
4371 // check cap exports
4372 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4373 CInode
*in
= get_inode(p
->first
);
4374 ceph_assert(!in
|| in
->is_auth());
4375 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4376 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< " on " << *in
<< dendl
;
4377 Capability
*cap
= rejoin_import_cap(in
, q
->first
, q
->second
, from
);
4378 Capability::Import
& im
= imported_caps
[p
->first
][q
->first
];
4380 im
.cap_id
= cap
->get_cap_id();
4381 im
.issue_seq
= cap
->get_last_seq();
4382 im
.mseq
= cap
->get_mseq();
4387 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
4390 encode(imported_caps
, ack
->imported_caps
);
4392 ceph_assert(mds
->is_rejoin());
4394 // we may have already received a strong rejoin from the sender.
4395 rejoin_scour_survivor_replicas(from
, NULL
, acked_inodes
, gather_locks
);
4396 ceph_assert(gather_locks
.empty());
4398 // check cap exports.
4399 rejoin_client_map
.insert(weak
->client_map
.begin(), weak
->client_map
.end());
4400 rejoin_client_metadata_map
.insert(weak
->client_metadata_map
.begin(),
4401 weak
->client_metadata_map
.end());
4403 for (auto p
= weak
->cap_exports
.begin(); p
!= weak
->cap_exports
.end(); ++p
) {
4404 CInode
*in
= get_inode(p
->first
);
4405 ceph_assert(!in
|| in
->is_auth());
4407 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
4408 dout(10) << " claiming cap import " << p
->first
<< " client." << q
->first
<< dendl
;
4409 cap_imports
[p
->first
][q
->first
][from
] = q
->second
;
4414 // assimilate any potentially dirty scatterlock state
4415 for (const auto &p
: weak
->inode_scatterlocks
) {
4416 CInode
*in
= get_inode(p
.first
);
4418 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4419 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4420 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4422 rejoin_potential_updated_scatterlocks
.insert(in
);
4425 // recovering peer may send incorrect dirfrags here. we need to
4426 // infer which dirfrag they meant. the ack will include a
4427 // strong_dirfrag that will set them straight on the fragmentation.
4430 set
<CDir
*> dirs_to_share
;
4431 for (const auto &p
: weak
->weak_dirfrags
) {
4432 CInode
*diri
= get_inode(p
.ino
);
4434 dout(0) << " missing dir ino " << p
.ino
<< dendl
;
4438 if (diri
->dirfragtree
.is_leaf(p
.frag
)) {
4439 leaves
.push_back(p
.frag
);
4441 diri
->dirfragtree
.get_leaves_under(p
.frag
, leaves
);
4443 leaves
.push_back(diri
->dirfragtree
[p
.frag
.value()]);
4445 for (const auto& leaf
: leaves
) {
4446 CDir
*dir
= diri
->get_dirfrag(leaf
);
4448 dout(0) << " missing dir for " << p
.frag
<< " (which maps to " << leaf
<< ") on " << *diri
<< dendl
;
4452 if (dirs_to_share
.count(dir
)) {
4453 dout(10) << " already have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4455 dirs_to_share
.insert(dir
);
4456 unsigned nonce
= dir
->add_replica(from
);
4457 dout(10) << " have " << p
.frag
<< " -> " << leaf
<< " " << *dir
<< dendl
;
4459 ack
->add_strong_dirfrag(dir
->dirfrag(), nonce
, dir
->dir_rep
);
4460 ack
->add_dirfrag_base(dir
);
4466 for (const auto &p
: weak
->weak
) {
4467 CInode
*diri
= get_inode(p
.first
);
4469 dout(0) << " missing dir ino " << p
.first
<< dendl
;
4474 for (const auto &q
: p
.second
) {
4475 // locate proper dirfrag.
4476 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4477 frag_t fg
= diri
->pick_dirfrag(q
.first
.name
);
4478 if (!dir
|| dir
->get_frag() != fg
) {
4479 dir
= diri
->get_dirfrag(fg
);
4481 dout(0) << " missing dir frag " << fg
<< " on " << *diri
<< dendl
;
4483 ceph_assert(dirs_to_share
.count(dir
));
4487 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
4489 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4490 ceph_assert(dnl
->is_primary());
4492 if (survivor
&& dn
->is_replica(from
))
4493 dentry_remove_replica(dn
, from
, gather_locks
);
4494 unsigned dnonce
= dn
->add_replica(from
);
4495 dout(10) << " have " << *dn
<< dendl
;
4497 ack
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->first
, dn
->last
,
4498 dnl
->get_inode()->ino(), inodeno_t(0), 0,
4499 dnonce
, dn
->lock
.get_replica_state());
4502 CInode
*in
= dnl
->get_inode();
4505 if (survivor
&& in
->is_replica(from
))
4506 inode_remove_replica(in
, from
, true, gather_locks
);
4507 unsigned inonce
= in
->add_replica(from
);
4508 dout(10) << " have " << *in
<< dendl
;
4510 // scatter the dirlock, just in case?
4511 if (!survivor
&& in
->is_dir() && in
->has_subtree_root_dirfrag())
4512 in
->filelock
.set_state(LOCK_MIX
);
4515 acked_inodes
.insert(in
->vino());
4516 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4518 in
->_encode_locks_state_for_rejoin(bl
, from
);
4519 ack
->add_inode_locks(in
, inonce
, bl
);
4524 // weak base inodes? (root, stray, etc.)
4525 for (set
<vinodeno_t
>::iterator p
= weak
->weak_inodes
.begin();
4526 p
!= weak
->weak_inodes
.end();
4528 CInode
*in
= get_inode(*p
);
4529 ceph_assert(in
); // hmm fixme wrt stray?
4530 if (survivor
&& in
->is_replica(from
))
4531 inode_remove_replica(in
, from
, true, gather_locks
);
4532 unsigned inonce
= in
->add_replica(from
);
4533 dout(10) << " have base " << *in
<< dendl
;
4536 acked_inodes
.insert(in
->vino());
4537 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4539 in
->_encode_locks_state_for_rejoin(bl
, from
);
4540 ack
->add_inode_locks(in
, inonce
, bl
);
4544 ceph_assert(rejoin_gather
.count(from
));
4545 rejoin_gather
.erase(from
);
4547 // survivor. do everything now.
4548 for (const auto &p
: weak
->inode_scatterlocks
) {
4549 CInode
*in
= get_inode(p
.first
);
4551 dout(10) << " including base inode (due to potential scatterlock update) " << *in
<< dendl
;
4552 acked_inodes
.insert(in
->vino());
4553 ack
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
4556 rejoin_scour_survivor_replicas(from
, ack
, acked_inodes
, gather_locks
);
4557 mds
->send_message(ack
, weak
->get_connection());
4559 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
4560 if (!(*p
)->is_stable())
4561 mds
->locker
->eval_gather(*p
);
4565 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4566 rejoin_gather_finish();
4568 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4574 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4576 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4577 * ack, the replica dne, and we can remove it from our replica maps.
4579 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from
, const MMDSCacheRejoin::const_ref
&ack
,
4580 set
<vinodeno_t
>& acked_inodes
,
4581 set
<SimpleLock
*>& gather_locks
)
4583 dout(10) << "rejoin_scour_survivor_replicas from mds." << from
<< dendl
;
4585 auto scour_func
= [this, from
, ack
, &acked_inodes
, &gather_locks
] (CInode
*in
) {
4587 if (in
->is_auth() &&
4588 in
->is_replica(from
) &&
4589 (ack
== NULL
|| acked_inodes
.count(in
->vino()) == 0)) {
4590 inode_remove_replica(in
, from
, false, gather_locks
);
4591 dout(10) << " rem " << *in
<< dendl
;
4598 in
->get_dirfrags(dfs
);
4599 for (list
<CDir
*>::iterator p
= dfs
.begin();
4603 if (!dir
->is_auth())
4606 if (dir
->is_replica(from
) &&
4607 (ack
== NULL
|| ack
->strong_dirfrags
.count(dir
->dirfrag()) == 0)) {
4608 dir
->remove_replica(from
);
4609 dout(10) << " rem " << *dir
<< dendl
;
4613 for (auto &p
: dir
->items
) {
4614 CDentry
*dn
= p
.second
;
4616 if (dn
->is_replica(from
)) {
4618 const auto it
= ack
->strong_dentries
.find(dir
->dirfrag());
4619 if (it
!= ack
->strong_dentries
.end() && it
->second
.count(string_snap_t(dn
->get_name(), dn
->last
)) > 0) {
4623 dentry_remove_replica(dn
, from
, gather_locks
);
4624 dout(10) << " rem " << *dn
<< dendl
;
4630 for (auto &p
: inode_map
)
4631 scour_func(p
.second
);
4632 for (auto &p
: snap_inode_map
)
4633 scour_func(p
.second
);
4637 CInode
*MDCache::rejoin_invent_inode(inodeno_t ino
, snapid_t last
)
4639 CInode
*in
= new CInode(this, true, 1, last
);
4640 in
->inode
.ino
= ino
;
4641 in
->state_set(CInode::STATE_REJOINUNDEF
);
4643 rejoin_undef_inodes
.insert(in
);
4644 dout(10) << " invented " << *in
<< dendl
;
4648 CDir
*MDCache::rejoin_invent_dirfrag(dirfrag_t df
)
4650 CInode
*in
= get_inode(df
.ino
);
4652 in
= rejoin_invent_inode(df
.ino
, CEPH_NOSNAP
);
4653 if (!in
->is_dir()) {
4654 ceph_assert(in
->state_test(CInode::STATE_REJOINUNDEF
));
4655 in
->inode
.mode
= S_IFDIR
;
4656 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4658 CDir
*dir
= in
->get_or_open_dirfrag(this, df
.frag
);
4659 dir
->state_set(CDir::STATE_REJOINUNDEF
);
4660 rejoin_undef_dirfrags
.insert(dir
);
4661 dout(10) << " invented " << *dir
<< dendl
;
4665 void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref
&strong
)
4667 mds_rank_t from
= mds_rank_t(strong
->get_source().num());
4669 // only a recovering node will get a strong rejoin.
4670 if (!mds
->is_rejoin()) {
4671 if (mds
->get_want_state() == MDSMap::STATE_REJOIN
) {
4672 mds
->wait_for_rejoin(new C_MDS_RetryMessage(mds
, strong
));
4675 ceph_abort_msg("got unexpected rejoin message during recovery");
4678 // assimilate any potentially dirty scatterlock state
4679 for (const auto &p
: strong
->inode_scatterlocks
) {
4680 CInode
*in
= get_inode(p
.first
);
4682 in
->decode_lock_state(CEPH_LOCK_IFILE
, p
.second
.file
);
4683 in
->decode_lock_state(CEPH_LOCK_INEST
, p
.second
.nest
);
4684 in
->decode_lock_state(CEPH_LOCK_IDFT
, p
.second
.dft
);
4685 rejoin_potential_updated_scatterlocks
.insert(in
);
4688 rejoin_unlinked_inodes
[from
].clear();
4690 // surviving peer may send incorrect dirfrag here (maybe they didn't
4691 // get the fragment notify, or maybe we rolled back?). we need to
4692 // infer the right frag and get them with the program. somehow.
4693 // we don't normally send ACK.. so we'll need to bundle this with
4694 // MISSING or something.
4696 // strong dirfrags/dentries.
4697 // also process auth_pins, xlocks.
4698 for (const auto &p
: strong
->strong_dirfrags
) {
4699 auto& dirfrag
= p
.first
;
4700 CInode
*diri
= get_inode(dirfrag
.ino
);
4702 diri
= rejoin_invent_inode(dirfrag
.ino
, CEPH_NOSNAP
);
4703 CDir
*dir
= diri
->get_dirfrag(dirfrag
.frag
);
4704 bool refragged
= false;
4706 dout(10) << " have " << *dir
<< dendl
;
4708 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
4709 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), frag_t()));
4710 else if (diri
->dirfragtree
.is_leaf(dirfrag
.frag
))
4711 dir
= rejoin_invent_dirfrag(dirfrag
);
4714 dir
->add_replica(from
, p
.second
.nonce
);
4715 dir
->dir_rep
= p
.second
.dir_rep
;
4717 dout(10) << " frag " << dirfrag
<< " doesn't match dirfragtree " << *diri
<< dendl
;
4719 diri
->dirfragtree
.get_leaves_under(dirfrag
.frag
, leaves
);
4721 leaves
.push_back(diri
->dirfragtree
[dirfrag
.frag
.value()]);
4722 dout(10) << " maps to frag(s) " << leaves
<< dendl
;
4723 for (const auto& leaf
: leaves
) {
4724 CDir
*dir
= diri
->get_dirfrag(leaf
);
4726 dir
= rejoin_invent_dirfrag(dirfrag_t(diri
->ino(), leaf
));
4728 dout(10) << " have(approx) " << *dir
<< dendl
;
4729 dir
->add_replica(from
, p
.second
.nonce
);
4730 dir
->dir_rep
= p
.second
.dir_rep
;
4735 const auto it
= strong
->strong_dentries
.find(dirfrag
);
4736 if (it
!= strong
->strong_dentries
.end()) {
4737 const map
<string_snap_t
,MMDSCacheRejoin::dn_strong
>& dmap
= it
->second
;
4738 for (const auto &q
: dmap
) {
4739 const string_snap_t
& ss
= q
.first
;
4740 const MMDSCacheRejoin::dn_strong
& d
= q
.second
;
4743 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4745 frag_t fg
= diri
->pick_dirfrag(ss
.name
);
4746 dir
= diri
->get_dirfrag(fg
);
4748 dn
= dir
->lookup(ss
.name
, ss
.snapid
);
4751 if (d
.is_remote()) {
4752 dn
= dir
->add_remote_dentry(ss
.name
, d
.remote_ino
, d
.remote_d_type
, d
.first
, ss
.snapid
);
4753 } else if (d
.is_null()) {
4754 dn
= dir
->add_null_dentry(ss
.name
, d
.first
, ss
.snapid
);
4756 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4757 if (!in
) in
= rejoin_invent_inode(d
.ino
, ss
.snapid
);
4758 dn
= dir
->add_primary_dentry(ss
.name
, in
, d
.first
, ss
.snapid
);
4760 dout(10) << " invented " << *dn
<< dendl
;
4762 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4765 const auto pinned_it
= strong
->authpinned_dentries
.find(dirfrag
);
4766 if (pinned_it
!= strong
->authpinned_dentries
.end()) {
4767 const auto slave_reqid_it
= pinned_it
->second
.find(ss
);
4768 if (slave_reqid_it
!= pinned_it
->second
.end()) {
4769 for (const auto &r
: slave_reqid_it
->second
) {
4770 dout(10) << " dn authpin by " << r
<< " on " << *dn
<< dendl
;
4772 // get/create slave mdrequest
4774 if (have_request(r
.reqid
))
4775 mdr
= request_get(r
.reqid
);
4777 mdr
= request_start_slave(r
.reqid
, r
.attempt
, strong
);
4784 const auto xlocked_it
= strong
->xlocked_dentries
.find(dirfrag
);
4785 if (xlocked_it
!= strong
->xlocked_dentries
.end()) {
4786 const auto ss_req_it
= xlocked_it
->second
.find(ss
);
4787 if (ss_req_it
!= xlocked_it
->second
.end()) {
4788 const MMDSCacheRejoin::slave_reqid
& r
= ss_req_it
->second
;
4789 dout(10) << " dn xlock by " << r
<< " on " << *dn
<< dendl
;
4790 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4791 ceph_assert(mdr
->is_auth_pinned(dn
));
4792 if (!mdr
->is_xlocked(&dn
->versionlock
)) {
4793 ceph_assert(dn
->versionlock
.can_xlock_local());
4794 dn
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4795 mdr
->locks
.emplace(&dn
->versionlock
, MutationImpl::LockOp::XLOCK
);
4797 if (dn
->lock
.is_stable())
4798 dn
->auth_pin(&dn
->lock
);
4799 dn
->lock
.set_state(LOCK_XLOCK
);
4800 dn
->lock
.get_xlock(mdr
, mdr
->get_client());
4801 mdr
->locks
.emplace(&dn
->lock
, MutationImpl::LockOp::XLOCK
);
4805 dn
->add_replica(from
, d
.nonce
);
4806 dout(10) << " have " << *dn
<< dendl
;
4808 if (dnl
->is_primary()) {
4809 if (d
.is_primary()) {
4810 if (vinodeno_t(d
.ino
, ss
.snapid
) != dnl
->get_inode()->vino()) {
4811 // the survivor missed MDentryUnlink+MDentryLink messages ?
4812 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4813 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4815 ceph_assert(in
->get_parent_dn());
4816 rejoin_unlinked_inodes
[from
].insert(in
);
4817 dout(7) << " sender has primary dentry but wrong inode" << dendl
;
4820 // the survivor missed MDentryLink message ?
4821 ceph_assert(strong
->strong_inodes
.count(dnl
->get_inode()->vino()) == 0);
4822 dout(7) << " sender doesn't have primay dentry" << dendl
;
4825 if (d
.is_primary()) {
4826 // the survivor missed MDentryUnlink message ?
4827 CInode
*in
= get_inode(d
.ino
, ss
.snapid
);
4829 ceph_assert(in
->get_parent_dn());
4830 rejoin_unlinked_inodes
[from
].insert(in
);
4831 dout(7) << " sender has primary dentry but we don't" << dendl
;
4838 for (const auto &p
: strong
->strong_inodes
) {
4839 CInode
*in
= get_inode(p
.first
);
4841 in
->add_replica(from
, p
.second
.nonce
);
4842 dout(10) << " have " << *in
<< dendl
;
4844 const MMDSCacheRejoin::inode_strong
& is
= p
.second
;
4847 if (is
.caps_wanted
) {
4848 in
->set_mds_caps_wanted(from
, is
.caps_wanted
);
4849 dout(15) << " inode caps_wanted " << ccap_string(is
.caps_wanted
)
4850 << " on " << *in
<< dendl
;
4854 // infer state from replica state:
4855 // * go to MIX if they might have wrlocks
4856 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4857 in
->filelock
.infer_state_from_strong_rejoin(is
.filelock
, !in
->is_dir()); // maybe also go to LOCK
4858 in
->nestlock
.infer_state_from_strong_rejoin(is
.nestlock
, false);
4859 in
->dirfragtreelock
.infer_state_from_strong_rejoin(is
.dftlock
, false);
4862 const auto authpinned_inodes_it
= strong
->authpinned_inodes
.find(in
->vino());
4863 if (authpinned_inodes_it
!= strong
->authpinned_inodes
.end()) {
4864 for (const auto& r
: authpinned_inodes_it
->second
) {
4865 dout(10) << " inode authpin by " << r
<< " on " << *in
<< dendl
;
4867 // get/create slave mdrequest
4869 if (have_request(r
.reqid
))
4870 mdr
= request_get(r
.reqid
);
4872 mdr
= request_start_slave(r
.reqid
, r
.attempt
, strong
);
4873 if (strong
->frozen_authpin_inodes
.count(in
->vino())) {
4874 ceph_assert(!in
->get_num_auth_pins());
4875 mdr
->freeze_auth_pin(in
);
4877 ceph_assert(!in
->is_frozen_auth_pin());
4883 const auto xlocked_inodes_it
= strong
->xlocked_inodes
.find(in
->vino());
4884 if (xlocked_inodes_it
!= strong
->xlocked_inodes
.end()) {
4885 for (const auto &q
: xlocked_inodes_it
->second
) {
4886 SimpleLock
*lock
= in
->get_lock(q
.first
);
4887 dout(10) << " inode xlock by " << q
.second
<< " on " << *lock
<< " on " << *in
<< dendl
;
4888 MDRequestRef mdr
= request_get(q
.second
.reqid
); // should have this from auth_pin above.
4889 ceph_assert(mdr
->is_auth_pinned(in
));
4890 if (!mdr
->is_xlocked(&in
->versionlock
)) {
4891 ceph_assert(in
->versionlock
.can_xlock_local());
4892 in
->versionlock
.get_xlock(mdr
, mdr
->get_client());
4893 mdr
->locks
.emplace(&in
->versionlock
, MutationImpl::LockOp::XLOCK
);
4895 if (lock
->is_stable())
4897 lock
->set_state(LOCK_XLOCK
);
4898 if (lock
== &in
->filelock
)
4900 lock
->get_xlock(mdr
, mdr
->get_client());
4901 mdr
->locks
.emplace(lock
, MutationImpl::LockOp::XLOCK
);
4906 for (const auto &p
: strong
->wrlocked_inodes
) {
4907 CInode
*in
= get_inode(p
.first
);
4908 for (const auto &q
: p
.second
) {
4909 SimpleLock
*lock
= in
->get_lock(q
.first
);
4910 for (const auto &r
: q
.second
) {
4911 dout(10) << " inode wrlock by " << r
<< " on " << *lock
<< " on " << *in
<< dendl
;
4912 MDRequestRef mdr
= request_get(r
.reqid
); // should have this from auth_pin above.
4914 ceph_assert(mdr
->is_auth_pinned(in
));
4915 lock
->set_state(LOCK_MIX
);
4916 if (lock
== &in
->filelock
)
4918 lock
->get_wrlock(true);
4919 mdr
->locks
.emplace(lock
, MutationImpl::LockOp::WRLOCK
);
4925 ceph_assert(rejoin_gather
.count(from
));
4926 rejoin_gather
.erase(from
);
4927 if (rejoin_gather
.empty() && rejoin_ack_gather
.count(mds
->get_nodeid())) {
4928 rejoin_gather_finish();
4930 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")" << dendl
;
4934 void MDCache::handle_cache_rejoin_ack(const MMDSCacheRejoin::const_ref
&ack
)
4936 dout(7) << "handle_cache_rejoin_ack from " << ack
->get_source() << dendl
;
4937 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
4939 ceph_assert(mds
->get_state() >= MDSMap::STATE_REJOIN
);
4940 bool survivor
= !mds
->is_rejoin();
4942 // for sending cache expire message
4943 set
<CInode
*> isolated_inodes
;
4944 set
<CInode
*> refragged_inodes
;
4945 list
<pair
<CInode
*,int> > updated_realms
;
4948 for (const auto &p
: ack
->strong_dirfrags
) {
4949 // we may have had incorrect dir fragmentation; refragment based
4950 // on what they auth tells us.
4951 CDir
*dir
= get_dirfrag(p
.first
);
4953 dir
= get_force_dirfrag(p
.first
, false);
4955 refragged_inodes
.insert(dir
->get_inode());
4958 CInode
*diri
= get_inode(p
.first
.ino
);
4960 // barebones inode; the full inode loop below will clean up.
4961 diri
= new CInode(this, false);
4962 diri
->inode
.ino
= p
.first
.ino
;
4963 diri
->inode
.mode
= S_IFDIR
;
4964 diri
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
4966 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
) {
4967 diri
->inode_auth
= mds_authority_t(from
, CDIR_AUTH_UNKNOWN
);
4968 dout(10) << " add inode " << *diri
<< dendl
;
4970 diri
->inode_auth
= CDIR_AUTH_DEFAULT
;
4971 isolated_inodes
.insert(diri
);
4972 dout(10) << " unconnected dirfrag " << p
.first
<< dendl
;
4975 // barebones dirfrag; the full dirfrag loop below will clean up.
4976 dir
= diri
->add_dirfrag(new CDir(diri
, p
.first
.frag
, this, false));
4977 if (MDS_INO_MDSDIR(from
) == p
.first
.ino
||
4978 (dir
->authority() != CDIR_AUTH_UNDEF
&&
4979 dir
->authority().first
!= from
))
4980 adjust_subtree_auth(dir
, from
);
4981 dout(10) << " add dirfrag " << *dir
<< dendl
;
4984 dir
->set_replica_nonce(p
.second
.nonce
);
4985 dir
->state_clear(CDir::STATE_REJOINING
);
4986 dout(10) << " got " << *dir
<< dendl
;
4989 auto it
= ack
->strong_dentries
.find(p
.first
);
4990 if (it
!= ack
->strong_dentries
.end()) {
4991 for (const auto &q
: it
->second
) {
4992 CDentry
*dn
= dir
->lookup(q
.first
.name
, q
.first
.snapid
);
4994 dn
= dir
->add_null_dentry(q
.first
.name
, q
.second
.first
, q
.first
.snapid
);
4996 CDentry::linkage_t
*dnl
= dn
->get_linkage();
4998 ceph_assert(dn
->last
== q
.first
.snapid
);
4999 if (dn
->first
!= q
.second
.first
) {
5000 dout(10) << " adjust dn.first " << dn
->first
<< " -> " << q
.second
.first
<< " on " << *dn
<< dendl
;
5001 dn
->first
= q
.second
.first
;
5004 // may have bad linkage if we missed dentry link/unlink messages
5005 if (dnl
->is_primary()) {
5006 CInode
*in
= dnl
->get_inode();
5007 if (!q
.second
.is_primary() ||
5008 vinodeno_t(q
.second
.ino
, q
.first
.snapid
) != in
->vino()) {
5009 dout(10) << " had bad linkage for " << *dn
<< ", unlinking " << *in
<< dendl
;
5010 dir
->unlink_inode(dn
);
5012 } else if (dnl
->is_remote()) {
5013 if (!q
.second
.is_remote() ||
5014 q
.second
.remote_ino
!= dnl
->get_remote_ino() ||
5015 q
.second
.remote_d_type
!= dnl
->get_remote_d_type()) {
5016 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5017 dir
->unlink_inode(dn
);
5020 if (!q
.second
.is_null())
5021 dout(10) << " had bad linkage for " << *dn
<< dendl
;
5024 // hmm, did we have the proper linkage here?
5025 if (dnl
->is_null() && !q
.second
.is_null()) {
5026 if (q
.second
.is_remote()) {
5027 dn
->dir
->link_remote_inode(dn
, q
.second
.remote_ino
, q
.second
.remote_d_type
);
5029 CInode
*in
= get_inode(q
.second
.ino
, q
.first
.snapid
);
5031 // barebones inode; assume it's dir, the full inode loop below will clean up.
5032 in
= new CInode(this, false, q
.second
.first
, q
.first
.snapid
);
5033 in
->inode
.ino
= q
.second
.ino
;
5034 in
->inode
.mode
= S_IFDIR
;
5035 in
->inode
.dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
5037 dout(10) << " add inode " << *in
<< dendl
;
5038 } else if (in
->get_parent_dn()) {
5039 dout(10) << " had bad linkage for " << *(in
->get_parent_dn())
5040 << ", unlinking " << *in
<< dendl
;
5041 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
5043 dn
->dir
->link_primary_inode(dn
, in
);
5044 isolated_inodes
.erase(in
);
5048 dn
->set_replica_nonce(q
.second
.nonce
);
5049 dn
->lock
.set_state_rejoin(q
.second
.lock
, rejoin_waiters
, survivor
);
5050 dn
->state_clear(CDentry::STATE_REJOINING
);
5051 dout(10) << " got " << *dn
<< dendl
;
5056 for (set
<CInode
*>::iterator p
= refragged_inodes
.begin();
5057 p
!= refragged_inodes
.end();
5060 (*p
)->get_nested_dirfrags(ls
);
5061 for (list
<CDir
*>::iterator q
= ls
.begin(); q
!= ls
.end(); ++q
) {
5062 if ((*q
)->is_auth() || ack
->strong_dirfrags
.count((*q
)->dirfrag()))
5064 ceph_assert((*q
)->get_num_any() == 0);
5065 (*p
)->close_dirfrag((*q
)->get_frag());
5070 for (const auto &p
: ack
->dirfrag_bases
) {
5071 CDir
*dir
= get_dirfrag(p
.first
);
5073 auto q
= p
.second
.cbegin();
5074 dir
->_decode_base(q
);
5075 dout(10) << " got dir replica " << *dir
<< dendl
;
5079 auto p
= ack
->inode_base
.cbegin();
5087 CInode
*in
= get_inode(ino
, last
);
5089 auto q
= basebl
.cbegin();
5092 sseq
= in
->snaprealm
->srnode
.seq
;
5093 in
->_decode_base(q
);
5094 if (in
->snaprealm
&& in
->snaprealm
->srnode
.seq
!= sseq
) {
5095 int snap_op
= sseq
> 0 ? CEPH_SNAP_OP_UPDATE
: CEPH_SNAP_OP_SPLIT
;
5096 updated_realms
.push_back(pair
<CInode
*,int>(in
, snap_op
));
5098 dout(10) << " got inode base " << *in
<< dendl
;
5102 p
= ack
->inode_locks
.cbegin();
5103 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5114 CInode
*in
= get_inode(ino
, last
);
5116 in
->set_replica_nonce(nonce
);
5117 auto q
= lockbl
.cbegin();
5118 in
->_decode_locks_rejoin(q
, rejoin_waiters
, rejoin_eval_locks
, survivor
);
5119 in
->state_clear(CInode::STATE_REJOINING
);
5120 dout(10) << " got inode locks " << *in
<< dendl
;
5123 // FIXME: This can happen if entire subtree, together with the inode subtree root
5124 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5125 ceph_assert(isolated_inodes
.empty());
5127 map
<inodeno_t
,map
<client_t
,Capability::Import
> > peer_imported
;
5128 auto bp
= ack
->imported_caps
.cbegin();
5129 decode(peer_imported
, bp
);
5131 for (map
<inodeno_t
,map
<client_t
,Capability::Import
> >::iterator p
= peer_imported
.begin();
5132 p
!= peer_imported
.end();
5134 auto& ex
= cap_exports
.at(p
->first
);
5135 ceph_assert(ex
.first
== from
);
5136 for (map
<client_t
,Capability::Import
>::iterator q
= p
->second
.begin();
5137 q
!= p
->second
.end();
5139 auto r
= ex
.second
.find(q
->first
);
5140 ceph_assert(r
!= ex
.second
.end());
5142 dout(10) << " exporting caps for client." << q
->first
<< " ino " << p
->first
<< dendl
;
5143 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5145 dout(10) << " no session for client." << p
->first
<< dendl
;
5150 // mark client caps stale.
5151 auto m
= MClientCaps::create(CEPH_CAP_OP_EXPORT
, p
->first
, 0,
5152 r
->second
.capinfo
.cap_id
, 0,
5153 mds
->get_osd_epoch_barrier());
5154 m
->set_cap_peer(q
->second
.cap_id
, q
->second
.issue_seq
, q
->second
.mseq
,
5155 (q
->second
.cap_id
> 0 ? from
: -1), 0);
5156 mds
->send_message_client_counted(m
, session
);
5160 ceph_assert(ex
.second
.empty());
5163 for (auto p
: updated_realms
) {
5164 CInode
*in
= p
.first
;
5165 bool notify_clients
;
5166 if (mds
->is_rejoin()) {
5167 if (!rejoin_pending_snaprealms
.count(in
)) {
5168 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5169 rejoin_pending_snaprealms
.insert(in
);
5171 notify_clients
= false;
5173 // notify clients if I'm survivor
5174 notify_clients
= true;
5176 do_realm_invalidate_and_update_notify(in
, p
.second
, notify_clients
);
5180 ceph_assert(rejoin_ack_gather
.count(from
));
5181 rejoin_ack_gather
.erase(from
);
5183 if (rejoin_gather
.empty()) {
5184 // eval unstable scatter locks after all wrlocks are rejoined.
5185 while (!rejoin_eval_locks
.empty()) {
5186 SimpleLock
*lock
= rejoin_eval_locks
.front();
5187 rejoin_eval_locks
.pop_front();
5188 if (!lock
->is_stable())
5189 mds
->locker
->eval_gather(lock
);
5193 if (rejoin_gather
.empty() && // make sure we've gotten our FULL inodes, too.
5194 rejoin_ack_gather
.empty()) {
5195 // finally, kickstart past snap parent opens
5198 dout(7) << "still need rejoin from (" << rejoin_gather
<< ")"
5199 << ", rejoin_ack from (" << rejoin_ack_gather
<< ")" << dendl
;
5203 mds
->queue_waiters(rejoin_waiters
);
5208 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5210 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5211 * messages that clean these guys up...
5213 void MDCache::rejoin_trim_undef_inodes()
5215 dout(10) << "rejoin_trim_undef_inodes" << dendl
;
5217 while (!rejoin_undef_inodes
.empty()) {
5218 set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
5220 rejoin_undef_inodes
.erase(p
);
5222 in
->clear_replica_map();
5224 // close out dirfrags
5227 in
->get_dirfrags(dfls
);
5228 for (list
<CDir
*>::iterator p
= dfls
.begin();
5232 dir
->clear_replica_map();
5234 for (auto &p
: dir
->items
) {
5235 CDentry
*dn
= p
.second
;
5236 dn
->clear_replica_map();
5238 dout(10) << " trimming " << *dn
<< dendl
;
5239 dir
->remove_dentry(dn
);
5242 dout(10) << " trimming " << *dir
<< dendl
;
5243 in
->close_dirfrag(dir
->dirfrag().frag
);
5247 CDentry
*dn
= in
->get_parent_dn();
5249 dn
->clear_replica_map();
5250 dout(10) << " trimming " << *dn
<< dendl
;
5251 dn
->dir
->remove_dentry(dn
);
5253 dout(10) << " trimming " << *in
<< dendl
;
5258 ceph_assert(rejoin_undef_inodes
.empty());
5261 void MDCache::rejoin_gather_finish()
5263 dout(10) << "rejoin_gather_finish" << dendl
;
5264 ceph_assert(mds
->is_rejoin());
5265 ceph_assert(rejoin_ack_gather
.count(mds
->get_nodeid()));
5267 if (open_undef_inodes_dirfrags())
5270 if (process_imported_caps())
5273 choose_lock_states_and_reconnect_caps();
5275 identify_files_to_recover();
5278 // signal completion of fetches, rejoin_gather_finish, etc.
5279 rejoin_ack_gather
.erase(mds
->get_nodeid());
5281 // did we already get our acks too?
5282 if (rejoin_ack_gather
.empty()) {
5283 // finally, open snaprealms
5288 class C_MDC_RejoinOpenInoFinish
: public MDCacheContext
{
5291 C_MDC_RejoinOpenInoFinish(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
5292 void finish(int r
) override
{
5293 mdcache
->rejoin_open_ino_finish(ino
, r
);
5297 void MDCache::rejoin_open_ino_finish(inodeno_t ino
, int ret
)
5299 dout(10) << "open_caps_inode_finish ino " << ino
<< " ret " << ret
<< dendl
;
5302 cap_imports_missing
.insert(ino
);
5303 } else if (ret
== mds
->get_nodeid()) {
5304 ceph_assert(get_inode(ino
));
5306 auto p
= cap_imports
.find(ino
);
5307 ceph_assert(p
!= cap_imports
.end());
5308 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5309 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5310 ceph_assert(q
->second
.size() == 1);
5311 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5313 cap_imports
.erase(p
);
5316 ceph_assert(cap_imports_num_opening
> 0);
5317 cap_imports_num_opening
--;
5319 if (cap_imports_num_opening
== 0) {
5320 if (rejoin_gather
.empty())
5321 rejoin_gather_finish();
5322 else if (rejoin_gather
.count(mds
->get_nodeid()))
5323 process_imported_caps();
5327 class C_MDC_RejoinSessionsOpened
: public MDCacheLogContext
{
5329 map
<client_t
,pair
<Session
*,uint64_t> > session_map
;
5330 C_MDC_RejoinSessionsOpened(MDCache
*c
) : MDCacheLogContext(c
) {}
5331 void finish(int r
) override
{
5332 ceph_assert(r
== 0);
5333 mdcache
->rejoin_open_sessions_finish(session_map
);
5337 void MDCache::rejoin_open_sessions_finish(map
<client_t
,pair
<Session
*,uint64_t> >& session_map
)
5339 dout(10) << "rejoin_open_sessions_finish" << dendl
;
5340 mds
->server
->finish_force_open_sessions(session_map
);
5341 rejoin_session_map
.swap(session_map
);
5342 if (rejoin_gather
.empty())
5343 rejoin_gather_finish();
5346 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino
, int ret
)
5348 auto p
= cap_imports
.find(ino
);
5349 if (p
!= cap_imports
.end()) {
5350 dout(10) << __func__
<< " ino " << ino
<< " ret " << ret
<< dendl
;
5352 cap_imports_missing
.insert(ino
);
5353 } else if (ret
!= mds
->get_nodeid()) {
5354 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5355 ceph_assert(q
->second
.count(MDS_RANK_NONE
));
5356 ceph_assert(q
->second
.size() == 1);
5357 rejoin_export_caps(p
->first
, q
->first
, q
->second
[MDS_RANK_NONE
], ret
);
5359 cap_imports
.erase(p
);
5364 bool MDCache::process_imported_caps()
5366 dout(10) << "process_imported_caps" << dendl
;
5368 if (!open_file_table
.is_prefetched() &&
5369 open_file_table
.prefetch_inodes()) {
5370 open_file_table
.wait_for_prefetch(
5371 new MDSInternalContextWrapper(mds
,
5372 new FunctionContext([this](int r
) {
5373 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5374 process_imported_caps();
5381 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ++p
) {
5382 CInode
*in
= get_inode(p
->first
);
5384 ceph_assert(in
->is_auth());
5385 cap_imports_missing
.erase(p
->first
);
5388 if (cap_imports_missing
.count(p
->first
) > 0)
5391 cap_imports_num_opening
++;
5392 dout(10) << " opening missing ino " << p
->first
<< dendl
;
5393 open_ino(p
->first
, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p
->first
), false);
5394 if (!(cap_imports_num_opening
% 1000))
5395 mds
->heartbeat_reset();
5398 if (cap_imports_num_opening
> 0)
5401 // called by rejoin_gather_finish() ?
5402 if (rejoin_gather
.count(mds
->get_nodeid()) == 0) {
5403 if (!rejoin_client_map
.empty() &&
5404 rejoin_session_map
.empty()) {
5405 C_MDC_RejoinSessionsOpened
*finish
= new C_MDC_RejoinSessionsOpened(this);
5406 version_t pv
= mds
->server
->prepare_force_open_sessions(rejoin_client_map
,
5407 rejoin_client_metadata_map
,
5408 finish
->session_map
);
5409 ESessions
*le
= new ESessions(pv
, std::move(rejoin_client_map
),
5410 std::move(rejoin_client_metadata_map
));
5411 mds
->mdlog
->start_submit_entry(le
, finish
);
5412 mds
->mdlog
->flush();
5413 rejoin_client_map
.clear();
5414 rejoin_client_metadata_map
.clear();
5418 // process caps that were exported by slave rename
5419 for (map
<inodeno_t
,pair
<mds_rank_t
,map
<client_t
,Capability::Export
> > >::iterator p
= rejoin_slave_exports
.begin();
5420 p
!= rejoin_slave_exports
.end();
5422 CInode
*in
= get_inode(p
->first
);
5424 for (map
<client_t
,Capability::Export
>::iterator q
= p
->second
.second
.begin();
5425 q
!= p
->second
.second
.end();
5427 auto r
= rejoin_session_map
.find(q
->first
);
5428 if (r
== rejoin_session_map
.end())
5431 Session
*session
= r
->second
.first
;
5432 Capability
*cap
= in
->get_client_cap(q
->first
);
5434 cap
= in
->add_client_cap(q
->first
, session
);
5435 // add empty item to reconnected_caps
5436 (void)reconnected_caps
[p
->first
][q
->first
];
5438 cap
->merge(q
->second
, true);
5440 Capability::Import
& im
= rejoin_imported_caps
[p
->second
.first
][p
->first
][q
->first
];
5441 ceph_assert(cap
->get_last_seq() == im
.issue_seq
);
5442 ceph_assert(cap
->get_mseq() == im
.mseq
);
5443 cap
->set_cap_id(im
.cap_id
);
5444 // send cap import because we assigned a new cap ID
5445 do_cap_import(session
, in
, cap
, q
->second
.cap_id
, q
->second
.seq
, q
->second
.mseq
- 1,
5446 p
->second
.first
, CEPH_CAP_FLAG_AUTH
);
5449 rejoin_slave_exports
.clear();
5450 rejoin_imported_caps
.clear();
5452 // process cap imports
5453 // ino -> client -> frommds -> capex
5454 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ) {
5455 CInode
*in
= get_inode(p
->first
);
5457 dout(10) << " still missing ino " << p
->first
5458 << ", will try again after replayed client requests" << dendl
;
5462 ceph_assert(in
->is_auth());
5463 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5466 auto r
= rejoin_session_map
.find(q
->first
);
5467 session
= (r
!= rejoin_session_map
.end() ? r
->second
.first
: nullptr);
5470 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5473 (void)rejoin_imported_caps
[r
->first
][p
->first
][q
->first
]; // all are zero
5477 Capability
*cap
= in
->reconnect_cap(q
->first
, r
->second
, session
);
5478 add_reconnected_cap(q
->first
, in
->ino(), r
->second
);
5479 if (r
->first
>= 0) {
5480 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5482 do_cap_import(session
, in
, cap
, r
->second
.capinfo
.cap_id
, 0, 0, r
->first
, 0);
5484 Capability::Import
& im
= rejoin_imported_caps
[r
->first
][p
->first
][q
->first
];
5485 im
.cap_id
= cap
->get_cap_id();
5486 im
.issue_seq
= cap
->get_last_seq();
5487 im
.mseq
= cap
->get_mseq();
5491 cap_imports
.erase(p
++); // remove and move on
5496 ceph_assert(rejoin_gather
.count(mds
->get_nodeid()));
5497 rejoin_gather
.erase(mds
->get_nodeid());
5498 ceph_assert(!rejoin_ack_gather
.count(mds
->get_nodeid()));
5499 maybe_send_pending_rejoins();
5504 void MDCache::rebuild_need_snapflush(CInode
*head_in
, SnapRealm
*realm
,
5505 client_t client
, snapid_t snap_follows
)
5507 dout(10) << "rebuild_need_snapflush " << snap_follows
<< " on " << *head_in
<< dendl
;
5509 if (!realm
->has_snaps_in_range(snap_follows
+ 1, head_in
->first
- 1))
5512 const set
<snapid_t
>& snaps
= realm
->get_snaps();
5513 snapid_t follows
= snap_follows
;
5516 CInode
*in
= pick_inode_snap(head_in
, follows
);
5520 bool need_snapflush
= false;
5521 for (auto p
= snaps
.lower_bound(std::max
<snapid_t
>(in
->first
, (follows
+ 1)));
5522 p
!= snaps
.end() && *p
<= in
->last
;
5524 head_in
->add_need_snapflush(in
, *p
, client
);
5525 need_snapflush
= true;
5528 if (!need_snapflush
)
5531 dout(10) << " need snapflush from client." << client
<< " on " << *in
<< dendl
;
5533 if (in
->client_snap_caps
.empty()) {
5534 for (int i
= 0; i
< num_cinode_locks
; i
++) {
5535 int lockid
= cinode_lock_info
[i
].lock
;
5536 SimpleLock
*lock
= in
->get_lock(lockid
);
5539 lock
->set_state(LOCK_SNAP_SYNC
);
5540 lock
->get_wrlock(true);
5543 in
->client_snap_caps
.insert(client
);
5544 mds
->locker
->mark_need_snapflush_inode(in
);
5549 * choose lock states based on reconnected caps
5551 void MDCache::choose_lock_states_and_reconnect_caps()
5553 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl
;
5556 for (auto p
: inode_map
) {
5557 CInode
*in
= p
.second
;
5558 if (in
->last
!= CEPH_NOSNAP
)
5561 if (in
->is_auth() && !in
->is_base() && in
->inode
.is_dirty_rstat())
5562 in
->mark_dirty_rstat();
5565 auto q
= reconnected_caps
.find(in
->ino());
5566 if (q
!= reconnected_caps
.end()) {
5567 for (const auto &it
: q
->second
)
5568 dirty_caps
|= it
.second
.dirty_caps
;
5570 in
->choose_lock_states(dirty_caps
);
5571 dout(15) << " chose lock states on " << *in
<< dendl
;
5573 if (in
->snaprealm
&& !rejoin_pending_snaprealms
.count(in
)) {
5574 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
5575 rejoin_pending_snaprealms
.insert(in
);
5578 if (!(++count
% 1000))
5579 mds
->heartbeat_reset();
5583 void MDCache::prepare_realm_split(SnapRealm
*realm
, client_t client
, inodeno_t ino
,
5584 map
<client_t
,MClientSnap::ref
>& splits
)
5586 MClientSnap::ref snap
;
5587 auto it
= splits
.find(client
);
5588 if (it
!= splits
.end()) {
5590 snap
->head
.op
= CEPH_SNAP_OP_SPLIT
;
5592 snap
= MClientSnap::create(CEPH_SNAP_OP_SPLIT
);
5593 splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
5594 snap
->head
.split
= realm
->inode
->ino();
5595 snap
->bl
= realm
->get_snap_trace();
5597 for (const auto& child
: realm
->open_children
)
5598 snap
->split_realms
.push_back(child
->inode
->ino());
5600 snap
->split_inos
.push_back(ino
);
5603 void MDCache::prepare_realm_merge(SnapRealm
*realm
, SnapRealm
*parent_realm
,
5604 map
<client_t
,MClientSnap::ref
>& splits
)
5606 ceph_assert(parent_realm
);
5608 vector
<inodeno_t
> split_inos
;
5609 vector
<inodeno_t
> split_realms
;
5611 for (elist
<CInode
*>::iterator p
= realm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
5614 split_inos
.push_back((*p
)->ino());
5615 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
5616 p
!= realm
->open_children
.end();
5618 split_realms
.push_back((*p
)->inode
->ino());
5620 for (const auto& p
: realm
->client_caps
) {
5621 ceph_assert(!p
.second
->empty());
5622 auto em
= splits
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple());
5624 auto update
= MClientSnap::create(CEPH_SNAP_OP_SPLIT
);
5625 update
->head
.split
= parent_realm
->inode
->ino();
5626 update
->split_inos
= split_inos
;
5627 update
->split_realms
= split_realms
;
5628 update
->bl
= parent_realm
->get_snap_trace();
5629 em
.first
->second
= std::move(update
);
5634 void MDCache::send_snaps(map
<client_t
,MClientSnap::ref
>& splits
)
5636 dout(10) << "send_snaps" << dendl
;
5638 for (auto &p
: splits
) {
5639 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
.first
.v
));
5641 dout(10) << " client." << p
.first
5642 << " split " << p
.second
->head
.split
5643 << " inos " << p
.second
->split_inos
5645 mds
->send_message_client_counted(p
.second
, session
);
5647 dout(10) << " no session for client." << p
.first
<< dendl
;
5655 * remove any items from logsegment open_file lists that don't have
5658 void MDCache::clean_open_file_lists()
5660 dout(10) << "clean_open_file_lists" << dendl
;
5662 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
5663 p
!= mds
->mdlog
->segments
.end();
5665 LogSegment
*ls
= p
->second
;
5667 elist
<CInode
*>::iterator q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5671 if (in
->last
== CEPH_NOSNAP
) {
5672 dout(10) << " unlisting unwanted/capless inode " << *in
<< dendl
;
5673 in
->item_open_file
.remove_myself();
5675 if (in
->client_snap_caps
.empty()) {
5676 dout(10) << " unlisting flushed snap inode " << *in
<< dendl
;
5677 in
->item_open_file
.remove_myself();
5684 void MDCache::dump_openfiles(Formatter
*f
)
5686 f
->open_array_section("openfiles");
5687 for (auto p
= mds
->mdlog
->segments
.begin();
5688 p
!= mds
->mdlog
->segments
.end();
5690 LogSegment
*ls
= p
->second
;
5692 auto q
= ls
->open_files
.begin(member_offset(CInode
, item_open_file
));
5696 if ((in
->last
== CEPH_NOSNAP
&& !in
->is_any_caps_wanted())
5697 || (in
->last
!= CEPH_NOSNAP
&& in
->client_snap_caps
.empty()))
5699 f
->open_object_section("file");
5700 in
->dump(f
, CInode::DUMP_PATH
| CInode::DUMP_INODE_STORE_BASE
| CInode::DUMP_CAPS
);
5707 Capability
* MDCache::rejoin_import_cap(CInode
*in
, client_t client
, const cap_reconnect_t
& icr
, mds_rank_t frommds
)
5709 dout(10) << "rejoin_import_cap for client." << client
<< " from mds." << frommds
5710 << " on " << *in
<< dendl
;
5711 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
5713 dout(10) << " no session for client." << client
<< dendl
;
5717 Capability
*cap
= in
->reconnect_cap(client
, icr
, session
);
5720 if (cap
->get_last_seq() == 0) // don't increase mseq if cap already exists
5722 do_cap_import(session
, in
, cap
, icr
.capinfo
.cap_id
, 0, 0, frommds
, 0);
5728 void MDCache::export_remaining_imported_caps()
5730 dout(10) << "export_remaining_imported_caps" << dendl
;
5732 stringstream warn_str
;
5735 for (auto p
= cap_imports
.begin(); p
!= cap_imports
.end(); ++p
) {
5736 warn_str
<< " ino " << p
->first
<< "\n";
5737 for (auto q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
5738 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(q
->first
.v
));
5740 // mark client caps stale.
5741 auto stale
= MClientCaps::create(CEPH_CAP_OP_EXPORT
, p
->first
, 0, 0, 0, mds
->get_osd_epoch_barrier());
5742 stale
->set_cap_peer(0, 0, 0, -1, 0);
5743 mds
->send_message_client_counted(stale
, q
->first
);
5747 if (!(++count
% 1000))
5748 mds
->heartbeat_reset();
5751 for (map
<inodeno_t
, MDSContext::vec
>::iterator p
= cap_reconnect_waiters
.begin();
5752 p
!= cap_reconnect_waiters
.end();
5754 mds
->queue_waiters(p
->second
);
5756 cap_imports
.clear();
5757 cap_reconnect_waiters
.clear();
5759 if (warn_str
.peek() != EOF
) {
5760 mds
->clog
->warn() << "failed to reconnect caps for missing inodes:";
5761 mds
->clog
->warn(warn_str
);
5765 Capability
* MDCache::try_reconnect_cap(CInode
*in
, Session
*session
)
5767 client_t client
= session
->info
.get_client();
5768 Capability
*cap
= nullptr;
5769 const cap_reconnect_t
*rc
= get_replay_cap_reconnect(in
->ino(), client
);
5771 cap
= in
->reconnect_cap(client
, *rc
, session
);
5772 dout(10) << "try_reconnect_cap client." << client
5773 << " reconnect wanted " << ccap_string(rc
->capinfo
.wanted
)
5774 << " issue " << ccap_string(rc
->capinfo
.issued
)
5775 << " on " << *in
<< dendl
;
5776 remove_replay_cap_reconnect(in
->ino(), client
);
5778 if (in
->is_replicated()) {
5779 mds
->locker
->try_eval(in
, CEPH_CAP_LOCKS
);
5782 auto p
= reconnected_caps
.find(in
->ino());
5783 if (p
!= reconnected_caps
.end()) {
5784 auto q
= p
->second
.find(client
);
5785 if (q
!= p
->second
.end())
5786 dirty_caps
= q
->second
.dirty_caps
;
5788 in
->choose_lock_states(dirty_caps
);
5789 dout(15) << " chose lock states on " << *in
<< dendl
;
5792 map
<inodeno_t
, MDSContext::vec
>::iterator it
=
5793 cap_reconnect_waiters
.find(in
->ino());
5794 if (it
!= cap_reconnect_waiters
.end()) {
5795 mds
->queue_waiters(it
->second
);
5796 cap_reconnect_waiters
.erase(it
);
5805 // cap imports and delayed snap parent opens
5807 void MDCache::do_cap_import(Session
*session
, CInode
*in
, Capability
*cap
,
5808 uint64_t p_cap_id
, ceph_seq_t p_seq
, ceph_seq_t p_mseq
,
5809 int peer
, int p_flags
)
5811 SnapRealm
*realm
= in
->find_snaprealm();
5812 if (realm
->have_past_parents_open()) {
5813 dout(10) << "do_cap_import " << session
->info
.inst
.name
<< " mseq " << cap
->get_mseq() << " on " << *in
<< dendl
;
5814 if (cap
->get_last_seq() == 0) // reconnected cap
5815 cap
->inc_last_seq();
5816 cap
->set_last_issue();
5817 cap
->set_last_issue_stamp(ceph_clock_now());
5819 auto reap
= MClientCaps::create(CEPH_CAP_OP_IMPORT
, in
->ino(), realm
->inode
->ino(), cap
->get_cap_id(), cap
->get_last_seq(), cap
->pending(), cap
->wanted(), 0, cap
->get_mseq(), mds
->get_osd_epoch_barrier());
5820 in
->encode_cap_message(reap
, cap
);
5821 reap
->snapbl
= realm
->get_snap_trace();
5822 reap
->set_cap_peer(p_cap_id
, p_seq
, p_mseq
, peer
, p_flags
);
5823 mds
->send_message_client_counted(reap
, session
);
5829 void MDCache::do_delayed_cap_imports()
5831 dout(10) << "do_delayed_cap_imports" << dendl
;
5833 ceph_assert(delayed_imported_caps
.empty());
5836 struct C_MDC_OpenSnapRealms
: public MDCacheContext
{
5837 explicit C_MDC_OpenSnapRealms(MDCache
*c
) : MDCacheContext(c
) {}
5838 void finish(int r
) override
{
5839 mdcache
->open_snaprealms();
5843 void MDCache::open_snaprealms()
5845 dout(10) << "open_snaprealms" << dendl
;
5847 MDSGatherBuilder
gather(g_ceph_context
);
5849 auto it
= rejoin_pending_snaprealms
.begin();
5850 while (it
!= rejoin_pending_snaprealms
.end()) {
5852 SnapRealm
*realm
= in
->snaprealm
;
5854 if (realm
->have_past_parents_open() ||
5855 realm
->open_parents(gather
.new_sub())) {
5856 dout(10) << " past parents now open on " << *in
<< dendl
;
5858 map
<client_t
,MClientSnap::ref
> splits
;
5859 // finish off client snaprealm reconnects?
5860 map
<inodeno_t
,map
<client_t
,snapid_t
> >::iterator q
= reconnected_snaprealms
.find(in
->ino());
5861 if (q
!= reconnected_snaprealms
.end()) {
5862 for (const auto& r
: q
->second
)
5863 finish_snaprealm_reconnect(r
.first
, realm
, r
.second
, splits
);
5864 reconnected_snaprealms
.erase(q
);
5867 for (elist
<CInode
*>::iterator p
= realm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
5870 auto q
= reconnected_caps
.find(child
->ino());
5871 ceph_assert(q
!= reconnected_caps
.end());
5872 for (auto r
= q
->second
.begin(); r
!= q
->second
.end(); ++r
) {
5873 Capability
*cap
= child
->get_client_cap(r
->first
);
5876 if (r
->second
.snap_follows
> 0) {
5877 if (r
->second
.snap_follows
< child
->first
- 1) {
5878 rebuild_need_snapflush(child
, realm
, r
->first
, r
->second
.snap_follows
);
5879 } else if (r
->second
.snapflush
) {
5880 // When processing a cap flush message that is re-sent, it's possble
5881 // that the sender has already released all WR caps. So we should
5882 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5883 cap
->mark_needsnapflush();
5886 // make sure client's cap is in the correct snaprealm.
5887 if (r
->second
.realm_ino
!= in
->ino()) {
5888 prepare_realm_split(realm
, r
->first
, child
->ino(), splits
);
5893 rejoin_pending_snaprealms
.erase(it
++);
5894 in
->put(CInode::PIN_OPENINGSNAPPARENTS
);
5898 dout(10) << " opening past parents on " << *in
<< dendl
;
5903 if (gather
.has_subs()) {
5904 if (gather
.num_subs_remaining() == 0) {
5906 gather
.set_finisher(new C_MDSInternalNoop
);
5909 // for multimds, must succeed the first time
5910 ceph_assert(recovery_set
.empty());
5912 dout(10) << "open_snaprealms - waiting for "
5913 << gather
.num_subs_remaining() << dendl
;
5914 gather
.set_finisher(new C_MDC_OpenSnapRealms(this));
5920 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE
);
5922 if (!reconnected_snaprealms
.empty()) {
5923 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl
;
5924 for (auto& p
: reconnected_snaprealms
) {
5925 stringstream warn_str
;
5926 warn_str
<< " " << p
.first
<< " {";
5928 for (auto& q
: p
.second
) {
5931 warn_str
<< "client." << q
.first
<< "/" << q
.second
;
5934 dout(5) << warn_str
.str() << dendl
;
5937 ceph_assert(rejoin_waiters
.empty());
5938 ceph_assert(rejoin_pending_snaprealms
.empty());
5939 dout(10) << "open_snaprealms - all open" << dendl
;
5940 do_delayed_cap_imports();
5942 ceph_assert(rejoin_done
);
5943 rejoin_done
.release()->complete(0);
5944 reconnected_caps
.clear();
5947 bool MDCache::open_undef_inodes_dirfrags()
5949 dout(10) << "open_undef_inodes_dirfrags "
5950 << rejoin_undef_inodes
.size() << " inodes "
5951 << rejoin_undef_dirfrags
.size() << " dirfrags" << dendl
;
5953 set
<CDir
*> fetch_queue
= rejoin_undef_dirfrags
;
5955 for (set
<CInode
*>::iterator p
= rejoin_undef_inodes
.begin();
5956 p
!= rejoin_undef_inodes
.end();
5959 ceph_assert(!in
->is_base());
5960 fetch_queue
.insert(in
->get_parent_dir());
5963 if (fetch_queue
.empty())
5966 MDSGatherBuilder
gather(g_ceph_context
,
5967 new MDSInternalContextWrapper(mds
,
5968 new FunctionContext([this](int r
) {
5969 if (rejoin_gather
.empty())
5970 rejoin_gather_finish();
5975 for (set
<CDir
*>::iterator p
= fetch_queue
.begin();
5976 p
!= fetch_queue
.end();
5979 CInode
*diri
= dir
->get_inode();
5980 if (diri
->state_test(CInode::STATE_REJOINUNDEF
))
5982 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
5983 ceph_assert(diri
->dirfragtree
.is_leaf(dir
->get_frag()));
5984 dir
->fetch(gather
.new_sub());
5986 ceph_assert(gather
.has_subs());
5991 void MDCache::opened_undef_inode(CInode
*in
) {
5992 dout(10) << "opened_undef_inode " << *in
<< dendl
;
5993 rejoin_undef_inodes
.erase(in
);
5995 // FIXME: re-hash dentries if necessary
5996 ceph_assert(in
->inode
.dir_layout
.dl_dir_hash
== g_conf()->mds_default_dir_hash
);
5997 if (in
->has_dirfrags() && !in
->dirfragtree
.is_leaf(frag_t())) {
5998 CDir
*dir
= in
->get_dirfrag(frag_t());
6000 rejoin_undef_dirfrags
.erase(dir
);
6001 in
->force_dirfrags();
6003 in
->get_dirfrags(ls
);
6004 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
)
6005 rejoin_undef_dirfrags
.insert(*p
);
6010 void MDCache::finish_snaprealm_reconnect(client_t client
, SnapRealm
*realm
, snapid_t seq
,
6011 map
<client_t
,MClientSnap::ref
>& updates
)
6013 if (seq
< realm
->get_newest_seq()) {
6014 dout(10) << "finish_snaprealm_reconnect client." << client
<< " has old seq " << seq
<< " < "
6015 << realm
->get_newest_seq() << " on " << *realm
<< dendl
;
6016 auto snap
= MClientSnap::create(CEPH_SNAP_OP_UPDATE
);
6017 snap
->bl
= realm
->get_snap_trace();
6018 for (const auto& child
: realm
->open_children
)
6019 snap
->split_realms
.push_back(child
->inode
->ino());
6020 updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple(snap
));
6022 dout(10) << "finish_snaprealm_reconnect client." << client
<< " up to date"
6023 << " on " << *realm
<< dendl
;
6029 void MDCache::rejoin_send_acks()
6031 dout(7) << "rejoin_send_acks" << dendl
;
6034 for (map
<mds_rank_t
, set
<CInode
*> >::iterator p
= rejoin_unlinked_inodes
.begin();
6035 p
!= rejoin_unlinked_inodes
.end();
6037 for (set
<CInode
*>::iterator q
= p
->second
.begin();
6038 q
!= p
->second
.end();
6041 dout(7) << " unlinked inode " << *in
<< dendl
;
6043 if (!in
->is_replica(p
->first
))
6046 CDentry
*dn
= in
->get_parent_dn();
6047 if (dn
->is_replica(p
->first
))
6049 dn
->add_replica(p
->first
);
6050 CDir
*dir
= dn
->get_dir();
6051 if (dir
->is_replica(p
->first
))
6053 dir
->add_replica(p
->first
);
6054 in
= dir
->get_inode();
6055 if (in
->is_replica(p
->first
))
6057 in
->add_replica(p
->first
);
6063 rejoin_unlinked_inodes
.clear();
6065 // send acks to everyone in the recovery set
6066 map
<mds_rank_t
,MMDSCacheRejoin::ref
> acks
;
6067 for (set
<mds_rank_t
>::iterator p
= recovery_set
.begin();
6068 p
!= recovery_set
.end();
6070 if (rejoin_ack_sent
.count(*p
))
6072 acks
[*p
] = MMDSCacheRejoin::create(MMDSCacheRejoin::OP_ACK
);
6075 rejoin_ack_sent
= recovery_set
;
6078 for (map
<CDir
*,set
<CDir
*> >::iterator p
= subtrees
.begin();
6079 p
!= subtrees
.end();
6081 CDir
*dir
= p
->first
;
6082 if (!dir
->is_auth())
6084 dout(10) << "subtree " << *dir
<< dendl
;
6086 // auth items in this subtree
6090 while (!dq
.empty()) {
6091 CDir
*dir
= dq
.front();
6095 for (auto &r
: dir
->get_replicas()) {
6096 auto it
= acks
.find(r
.first
);
6097 if (it
== acks
.end())
6099 it
->second
->add_strong_dirfrag(dir
->dirfrag(), ++r
.second
, dir
->dir_rep
);
6100 it
->second
->add_dirfrag_base(dir
);
6103 for (auto &p
: dir
->items
) {
6104 CDentry
*dn
= p
.second
;
6105 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6109 if (dnl
->is_primary())
6110 in
= dnl
->get_inode();
6113 for (auto &r
: dn
->get_replicas()) {
6114 auto it
= acks
.find(r
.first
);
6115 if (it
== acks
.end())
6117 it
->second
->add_strong_dentry(dir
->dirfrag(), dn
->get_name(), dn
->first
, dn
->last
,
6118 dnl
->is_primary() ? dnl
->get_inode()->ino():inodeno_t(0),
6119 dnl
->is_remote() ? dnl
->get_remote_ino():inodeno_t(0),
6120 dnl
->is_remote() ? dnl
->get_remote_d_type():0,
6122 dn
->lock
.get_replica_state());
6123 // peer missed MDentrylink message ?
6124 if (in
&& !in
->is_replica(r
.first
))
6125 in
->add_replica(r
.first
);
6131 for (auto &r
: in
->get_replicas()) {
6132 auto it
= acks
.find(r
.first
);
6133 if (it
== acks
.end())
6135 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6137 in
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6138 it
->second
->add_inode_locks(in
, ++r
.second
, bl
);
6141 // subdirs in this subtree?
6142 in
->get_nested_dirfrags(dq
);
6148 if (root
&& root
->is_auth())
6149 for (auto &r
: root
->get_replicas()) {
6150 auto it
= acks
.find(r
.first
);
6151 if (it
== acks
.end())
6153 it
->second
->add_inode_base(root
, mds
->mdsmap
->get_up_features());
6155 root
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6156 it
->second
->add_inode_locks(root
, ++r
.second
, bl
);
6159 for (auto &r
: myin
->get_replicas()) {
6160 auto it
= acks
.find(r
.first
);
6161 if (it
== acks
.end())
6163 it
->second
->add_inode_base(myin
, mds
->mdsmap
->get_up_features());
6165 myin
->_encode_locks_state_for_rejoin(bl
, r
.first
);
6166 it
->second
->add_inode_locks(myin
, ++r
.second
, bl
);
6169 // include inode base for any inodes whose scatterlocks may have updated
6170 for (set
<CInode
*>::iterator p
= rejoin_potential_updated_scatterlocks
.begin();
6171 p
!= rejoin_potential_updated_scatterlocks
.end();
6174 for (const auto &r
: in
->get_replicas()) {
6175 auto it
= acks
.find(r
.first
);
6176 if (it
== acks
.end())
6178 it
->second
->add_inode_base(in
, mds
->mdsmap
->get_up_features());
6183 for (auto p
= acks
.begin(); p
!= acks
.end(); ++p
) {
6184 encode(rejoin_imported_caps
[p
->first
], p
->second
->imported_caps
);
6185 mds
->send_message_mds(p
->second
, p
->first
);
6188 rejoin_imported_caps
.clear();
6191 class C_MDC_ReIssueCaps
: public MDCacheContext
{
6194 C_MDC_ReIssueCaps(MDCache
*mdc
, CInode
*i
) :
6195 MDCacheContext(mdc
), in(i
)
6197 in
->get(CInode::PIN_PTRWAITER
);
6199 void finish(int r
) override
{
6200 if (!mdcache
->mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6201 mdcache
->mds
->locker
->issue_caps(in
);
6202 in
->put(CInode::PIN_PTRWAITER
);
6206 void MDCache::reissue_all_caps()
6208 dout(10) << "reissue_all_caps" << dendl
;
6211 for (auto &p
: inode_map
) {
6213 CInode
*in
= p
.second
;
6214 if (in
->is_head() && in
->is_any_caps()) {
6215 // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
6216 if (in
->is_frozen_inode()) {
6217 in
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDC_ReIssueCaps(this, in
));
6220 if (!mds
->locker
->eval(in
, CEPH_CAP_LOCKS
))
6221 n
+= mds
->locker
->issue_caps(in
);
6224 if ((count
% 1000) + n
>= 1000)
6225 mds
->heartbeat_reset();
6231 // ===============================================================================
6233 struct C_MDC_QueuedCow
: public MDCacheContext
{
6236 C_MDC_QueuedCow(MDCache
*mdc
, CInode
*i
, MutationRef
& m
) :
6237 MDCacheContext(mdc
), in(i
), mut(m
) {}
6238 void finish(int r
) override
{
6239 mdcache
->_queued_file_recover_cow(in
, mut
);
6244 void MDCache::queue_file_recover(CInode
*in
)
6246 dout(10) << "queue_file_recover " << *in
<< dendl
;
6247 ceph_assert(in
->is_auth());
6251 SnapRealm *realm = in->find_snaprealm();
6252 set<snapid_t> s = realm->get_snaps();
6253 while (!s.empty() && *s.begin() < in->first)
6255 while (!s.empty() && *s.rbegin() > in->last)
6256 s.erase(*s.rbegin());
6257 dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
6259 CInode::mempool_inode pi = in->project_inode();
6260 pi->version = in->pre_dirty();
6262 auto mut(std::make_shared<MutationImpl>());
6263 mut->ls = mds->mdlog->get_current_segment();
6264 EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
6265 mds->mdlog->start_entry(le);
6266 predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6268 s.erase(*s.begin());
6269 while (!s.empty()) {
6270 snapid_t snapid = *s.begin();
6271 CInode *cow_inode = 0;
6272 journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
6273 ceph_assert(cow_inode);
6274 recovery_queue.enqueue(cow_inode);
6275 s.erase(*s.begin());
6278 in->parent->first = in->first;
6279 le->metablob.add_primary_dentry(in->parent, in, true);
6280 mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
6281 mds->mdlog->flush();
6285 recovery_queue
.enqueue(in
);
6288 void MDCache::_queued_file_recover_cow(CInode
*in
, MutationRef
& mut
)
6290 in
->pop_and_dirty_projected_inode(mut
->ls
);
6292 mds
->locker
->drop_locks(mut
.get());
6298 * called after recovery to recover file sizes for previously opened (for write)
6299 * files. that is, those where max_size > size.
6301 void MDCache::identify_files_to_recover()
6303 dout(10) << "identify_files_to_recover" << dendl
;
6305 for (auto &p
: inode_map
) {
6306 CInode
*in
= p
.second
;
6310 if (in
->last
!= CEPH_NOSNAP
)
6313 // Only normal files need file size recovery
6314 if (!in
->is_file()) {
6318 bool recover
= false;
6319 for (map
<client_t
,client_writeable_range_t
>::iterator p
= in
->inode
.client_ranges
.begin();
6320 p
!= in
->inode
.client_ranges
.end();
6322 Capability
*cap
= in
->get_client_cap(p
->first
);
6324 cap
->mark_clientwriteable();
6326 dout(10) << " client." << p
->first
<< " has range " << p
->second
<< " but no cap on " << *in
<< dendl
;
6333 if (in
->filelock
.is_stable()) {
6334 in
->auth_pin(&in
->filelock
);
6336 ceph_assert(in
->filelock
.get_state() == LOCK_XLOCKSNAP
);
6338 in
->filelock
.set_state(LOCK_PRE_SCAN
);
6339 rejoin_recover_q
.push_back(in
);
6341 rejoin_check_q
.push_back(in
);
6344 if (!(++count
% 1000))
6345 mds
->heartbeat_reset();
6349 void MDCache::start_files_to_recover()
6351 for (CInode
*in
: rejoin_check_q
) {
6352 if (in
->filelock
.get_state() == LOCK_XLOCKSNAP
)
6353 mds
->locker
->issue_caps(in
);
6354 mds
->locker
->check_inode_max_size(in
);
6356 rejoin_check_q
.clear();
6357 for (CInode
*in
: rejoin_recover_q
) {
6358 mds
->locker
->file_recover(&in
->filelock
);
6360 if (!rejoin_recover_q
.empty()) {
6361 rejoin_recover_q
.clear();
6366 void MDCache::do_file_recover()
6368 recovery_queue
.advance();
6371 // ===============================================================================
6374 // ----------------------------
6377 class C_MDC_RetryTruncate
: public MDCacheContext
{
6381 C_MDC_RetryTruncate(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6382 MDCacheContext(c
), in(i
), ls(l
) {}
6383 void finish(int r
) override
{
6384 mdcache
->_truncate_inode(in
, ls
);
6388 void MDCache::truncate_inode(CInode
*in
, LogSegment
*ls
)
6390 auto pi
= in
->get_projected_inode();
6391 dout(10) << "truncate_inode "
6392 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6396 ls
->truncating_inodes
.insert(in
);
6397 in
->get(CInode::PIN_TRUNCATING
);
6400 if (!in
->client_need_snapflush
.empty() &&
6401 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6402 ceph_assert(in
->filelock
.is_xlocked());
6403 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6404 mds
->locker
->issue_caps(in
);
6408 _truncate_inode(in
, ls
);
6411 struct C_IO_MDC_TruncateFinish
: public MDCacheIOContext
{
6414 C_IO_MDC_TruncateFinish(MDCache
*c
, CInode
*i
, LogSegment
*l
) :
6415 MDCacheIOContext(c
, false), in(i
), ls(l
) {
6417 void finish(int r
) override
{
6418 ceph_assert(r
== 0 || r
== -ENOENT
);
6419 mdcache
->truncate_inode_finish(in
, ls
);
6421 void print(ostream
& out
) const override
{
6422 out
<< "file_truncate(" << in
->ino() << ")";
6426 void MDCache::_truncate_inode(CInode
*in
, LogSegment
*ls
)
6428 auto pi
= &in
->inode
;
6429 dout(10) << "_truncate_inode "
6430 << pi
->truncate_from
<< " -> " << pi
->truncate_size
6431 << " on " << *in
<< dendl
;
6433 ceph_assert(pi
->is_truncating());
6434 ceph_assert(pi
->truncate_size
< (1ULL << 63));
6435 ceph_assert(pi
->truncate_from
< (1ULL << 63));
6436 ceph_assert(pi
->truncate_size
< pi
->truncate_from
);
6439 SnapRealm
*realm
= in
->find_snaprealm();
6440 SnapContext nullsnap
;
6441 const SnapContext
*snapc
;
6443 dout(10) << " realm " << *realm
<< dendl
;
6444 snapc
= &realm
->get_snap_context();
6446 dout(10) << " NO realm, using null context" << dendl
;
6448 ceph_assert(in
->last
== CEPH_NOSNAP
);
6450 dout(10) << "_truncate_inode snapc " << snapc
<< " on " << *in
<< dendl
;
6451 filer
.truncate(in
->inode
.ino
, &in
->inode
.layout
, *snapc
,
6452 pi
->truncate_size
, pi
->truncate_from
-pi
->truncate_size
,
6453 pi
->truncate_seq
, ceph::real_time::min(), 0,
6454 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in
, ls
),
6458 struct C_MDC_TruncateLogged
: public MDCacheLogContext
{
6461 C_MDC_TruncateLogged(MDCache
*m
, CInode
*i
, MutationRef
& mu
) :
6462 MDCacheLogContext(m
), in(i
), mut(mu
) {}
6463 void finish(int r
) override
{
6464 mdcache
->truncate_inode_logged(in
, mut
);
6468 void MDCache::truncate_inode_finish(CInode
*in
, LogSegment
*ls
)
6470 dout(10) << "truncate_inode_finish " << *in
<< dendl
;
6472 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6473 ceph_assert(p
!= ls
->truncating_inodes
.end());
6474 ls
->truncating_inodes
.erase(p
);
6477 auto &pi
= in
->project_inode();
6478 pi
.inode
.version
= in
->pre_dirty();
6479 pi
.inode
.truncate_from
= 0;
6480 pi
.inode
.truncate_pending
--;
6482 MutationRef
mut(new MutationImpl());
6483 mut
->ls
= mds
->mdlog
->get_current_segment();
6484 mut
->add_projected_inode(in
);
6486 EUpdate
*le
= new EUpdate(mds
->mdlog
, "truncate finish");
6487 mds
->mdlog
->start_entry(le
);
6488 CDentry
*dn
= in
->get_projected_parent_dn();
6489 le
->metablob
.add_dir_context(dn
->get_dir());
6490 le
->metablob
.add_primary_dentry(dn
, in
, true);
6491 le
->metablob
.add_truncate_finish(in
->ino(), ls
->seq
);
6493 journal_dirty_inode(mut
.get(), &le
->metablob
, in
);
6494 mds
->mdlog
->submit_entry(le
, new C_MDC_TruncateLogged(this, in
, mut
));
6496 // flush immediately if there are readers/writers waiting
6497 if (in
->is_waiter_for(CInode::WAIT_TRUNC
) ||
6498 (in
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
6499 mds
->mdlog
->flush();
6502 void MDCache::truncate_inode_logged(CInode
*in
, MutationRef
& mut
)
6504 dout(10) << "truncate_inode_logged " << *in
<< dendl
;
6506 mds
->locker
->drop_locks(mut
.get());
6509 in
->put(CInode::PIN_TRUNCATING
);
6510 in
->auth_unpin(this);
6512 MDSContext::vec waiters
;
6513 in
->take_waiting(CInode::WAIT_TRUNC
, waiters
);
6514 mds
->queue_waiters(waiters
);
6518 void MDCache::add_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6520 dout(20) << "add_recovered_truncate " << *in
<< " in log segment "
6521 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6522 ls
->truncating_inodes
.insert(in
);
6523 in
->get(CInode::PIN_TRUNCATING
);
6526 void MDCache::remove_recovered_truncate(CInode
*in
, LogSegment
*ls
)
6528 dout(20) << "remove_recovered_truncate " << *in
<< " in log segment "
6529 << ls
->seq
<< "/" << ls
->offset
<< dendl
;
6530 // if we have the logseg the truncate started in, it must be in our list.
6531 set
<CInode
*>::iterator p
= ls
->truncating_inodes
.find(in
);
6532 ceph_assert(p
!= ls
->truncating_inodes
.end());
6533 ls
->truncating_inodes
.erase(p
);
6534 in
->put(CInode::PIN_TRUNCATING
);
6537 void MDCache::start_recovered_truncates()
6539 dout(10) << "start_recovered_truncates" << dendl
;
6540 for (map
<uint64_t,LogSegment
*>::iterator p
= mds
->mdlog
->segments
.begin();
6541 p
!= mds
->mdlog
->segments
.end();
6543 LogSegment
*ls
= p
->second
;
6544 for (set
<CInode
*>::iterator q
= ls
->truncating_inodes
.begin();
6545 q
!= ls
->truncating_inodes
.end();
6550 if (!in
->client_need_snapflush
.empty() &&
6551 (in
->get_caps_issued() & CEPH_CAP_FILE_BUFFER
)) {
6552 ceph_assert(in
->filelock
.is_stable());
6553 in
->filelock
.set_state(LOCK_XLOCKDONE
);
6554 in
->auth_pin(&in
->filelock
);
6555 in
->filelock
.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in
, ls
));
6556 // start_files_to_recover will revoke caps
6559 _truncate_inode(in
, ls
);
6569 // ================================================================================
6572 std::pair
<bool, uint64_t> MDCache::trim_lru(uint64_t count
, expiremap
& expiremap
)
6574 bool is_standby_replay
= mds
->is_standby_replay();
6575 std::vector
<CDentry
*> unexpirables
;
6576 uint64_t trimmed
= 0;
6578 auto trim_threshold
= g_conf().get_val
<Option::size_t>("mds_cache_trim_threshold");
6580 dout(7) << "trim_lru trimming " << count
6581 << " items from LRU"
6582 << " size=" << lru
.lru_get_size()
6583 << " mid=" << lru
.lru_get_top()
6584 << " pintail=" << lru
.lru_get_pintail()
6585 << " pinned=" << lru
.lru_get_num_pinned()
6588 const uint64_t trim_counter_start
= trim_counter
.get();
6589 bool throttled
= false;
6591 throttled
|= trim_counter_start
+trimmed
>= trim_threshold
;
6592 if (throttled
) break;
6593 CDentry
*dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
6596 if (trim_dentry(dn
, expiremap
)) {
6597 unexpirables
.push_back(dn
);
6603 for (auto &dn
: unexpirables
) {
6604 bottom_lru
.lru_insert_mid(dn
);
6606 unexpirables
.clear();
6608 // trim dentries from the LRU until count is reached
6609 // if mds is in standbyreplay and will trim all inodes which aren't in segments
6610 while (!throttled
&& (cache_toofull() || count
> 0 || is_standby_replay
)) {
6611 throttled
|= trim_counter_start
+trimmed
>= trim_threshold
;
6612 if (throttled
) break;
6613 CDentry
*dn
= static_cast<CDentry
*>(lru
.lru_expire());
6617 if ((is_standby_replay
&& dn
->get_linkage()->inode
&&
6618 dn
->get_linkage()->inode
->item_open_file
.is_on_list())) {
6619 // we move the inodes that need to be trimmed to the end of the lru queue.
6620 // refer to MDCache::standby_trim_segment
6621 lru
.lru_insert_bot(dn
);
6623 } else if (trim_dentry(dn
, expiremap
)) {
6624 unexpirables
.push_back(dn
);
6627 if (count
> 0) count
--;
6630 trim_counter
.hit(trimmed
);
6632 for (auto &dn
: unexpirables
) {
6633 lru
.lru_insert_mid(dn
);
6635 unexpirables
.clear();
6637 dout(7) << "trim_lru trimmed " << trimmed
<< " items" << dendl
;
6638 return std::pair
<bool, uint64_t>(throttled
, trimmed
);
6642 * note: only called while MDS is active or stopping... NOT during recovery.
6643 * however, we may expire a replica whose authority is recovering.
6645 * @param count is number of dentries to try to expire
6647 std::pair
<bool, uint64_t> MDCache::trim(uint64_t count
)
6649 uint64_t used
= cache_size();
6650 uint64_t limit
= cache_memory_limit
;
6651 expiremap expiremap
;
6653 dout(7) << "trim bytes_used=" << bytes2str(used
)
6654 << " limit=" << bytes2str(limit
)
6655 << " reservation=" << cache_reservation
6656 << "% count=" << count
<< dendl
;
6658 // process delayed eval_stray()
6659 stray_manager
.advance_delayed();
6661 auto result
= trim_lru(count
, expiremap
);
6662 auto& trimmed
= result
.second
;
6664 // trim non-auth, non-bound subtrees
6665 for (auto p
= subtrees
.begin(); p
!= subtrees
.end();) {
6666 CDir
*dir
= p
->first
;
6668 CInode
*diri
= dir
->get_inode();
6669 if (dir
->is_auth()) {
6670 if (!diri
->is_auth() && !diri
->is_base() &&
6671 dir
->get_num_head_items() == 0) {
6672 if (dir
->state_test(CDir::STATE_EXPORTING
) ||
6673 !(mds
->is_active() || mds
->is_stopping()) ||
6674 dir
->is_freezing() || dir
->is_frozen())
6677 migrator
->export_empty_import(dir
);
6681 if (!diri
->is_auth()) {
6682 if (dir
->get_num_ref() > 1) // only subtree pin
6685 diri
->get_subtree_dirfrags(ls
);
6686 if (diri
->get_num_ref() > (int)ls
.size()) // only pinned by subtrees
6689 // don't trim subtree root if its auth MDS is recovering.
6690 // This simplify the cache rejoin code.
6691 if (dir
->is_subtree_root() &&
6692 rejoin_ack_gather
.count(dir
->get_dir_auth().first
))
6694 trim_dirfrag(dir
, 0, expiremap
);
6701 if (mds
->is_stopping() && root
) {
6703 root
->get_dirfrags(ls
);
6704 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6706 if (dir
->get_num_ref() == 1) { // subtree pin
6707 trim_dirfrag(dir
, 0, expiremap
);
6711 if (root
->get_num_ref() == 0) {
6712 trim_inode(0, root
, 0, expiremap
);
6717 std::set
<mds_rank_t
> stopping
;
6718 mds
->mdsmap
->get_mds_set(stopping
, MDSMap::STATE_STOPPING
);
6719 stopping
.erase(mds
->get_nodeid());
6720 for (auto rank
: stopping
) {
6721 CInode
* mdsdir_in
= get_inode(MDS_INO_MDSDIR(rank
));
6725 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(rank
), std::forward_as_tuple());
6727 em
.first
->second
= MCacheExpire::create(mds
->get_nodeid());
6730 dout(20) << __func__
<< ": try expiring " << *mdsdir_in
<< " for stopping mds." << mds
<< dendl
;
6732 const bool aborted
= expire_recursive(mdsdir_in
, expiremap
);
6734 dout(20) << __func__
<< ": successfully expired mdsdir" << dendl
;
6736 mdsdir_in
->get_dirfrags(ls
);
6737 for (auto dir
: ls
) {
6738 if (dir
->get_num_ref() == 1) { // subtree pin
6739 trim_dirfrag(dir
, dir
, expiremap
);
6743 if (mdsdir_in
->get_num_ref() == 0) {
6744 trim_inode(NULL
, mdsdir_in
, NULL
, expiremap
);
6748 dout(20) << __func__
<< ": some unexpirable contents in mdsdir" << dendl
;
6752 // Other rank's base inodes (when I'm stopping)
6753 if (mds
->is_stopping()) {
6754 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
6755 p
!= base_inodes
.end();) {
6756 CInode
*base_in
= *p
;
6758 if (MDS_INO_IS_MDSDIR(base_in
->ino()) &&
6759 MDS_INO_MDSDIR_OWNER(base_in
->ino()) != mds
->get_nodeid()) {
6760 dout(20) << __func__
<< ": maybe trimming base: " << *base_in
<< dendl
;
6761 if (base_in
->get_num_ref() == 0) {
6762 trim_inode(NULL
, base_in
, NULL
, expiremap
);
6769 // send any expire messages
6770 send_expire_messages(expiremap
);
6775 void MDCache::send_expire_messages(expiremap
& expiremap
)
6778 for (const auto &p
: expiremap
) {
6779 if (mds
->is_cluster_degraded() &&
6780 (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
6781 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
6782 rejoin_sent
.count(p
.first
) == 0))) {
6785 dout(7) << "sending cache_expire to " << p
.first
<< dendl
;
6786 mds
->send_message_mds(p
.second
, p
.first
);
6792 bool MDCache::trim_dentry(CDentry
*dn
, expiremap
& expiremap
)
6794 dout(12) << "trim_dentry " << *dn
<< dendl
;
6796 CDentry::linkage_t
*dnl
= dn
->get_linkage();
6798 CDir
*dir
= dn
->get_dir();
6801 CDir
*con
= get_subtree_root(dir
);
6803 dout(12) << " in container " << *con
<< dendl
;
6805 dout(12) << " no container; under a not-yet-linked dir" << dendl
;
6806 ceph_assert(dn
->is_auth());
6809 // If replica dentry is not readable, it's likely we will receive
6810 // MDentryLink/MDentryUnlink message soon (It's possible we first
6811 // receive a MDentryUnlink message, then MDentryLink message)
6812 // MDentryLink message only replicates an inode, so we should
6813 // avoid trimming the inode's parent dentry. This is because that
6814 // unconnected replicas are problematic for subtree migration.
6815 if (!dn
->is_auth() && !dn
->lock
.can_read(-1) &&
6816 !dn
->get_dir()->get_inode()->is_stray())
6819 // adjust the dir state
6820 // NOTE: we can safely remove a clean, null dentry without effecting
6821 // directory completeness.
6822 // (check this _before_ we unlink the inode, below!)
6823 bool clear_complete
= false;
6824 if (!(dnl
->is_null() && dn
->is_clean()))
6825 clear_complete
= true;
6827 // unlink the dentry
6828 if (dnl
->is_remote()) {
6830 dir
->unlink_inode(dn
, false);
6831 } else if (dnl
->is_primary()) {
6832 // expire the inode, too.
6833 CInode
*in
= dnl
->get_inode();
6835 if (trim_inode(dn
, in
, con
, expiremap
))
6836 return true; // purging stray instead of trimming
6838 ceph_assert(dnl
->is_null());
6841 if (!dn
->is_auth()) {
6842 // notify dentry authority.
6843 mds_authority_t auth
= dn
->authority();
6845 for (int p
=0; p
<2; p
++) {
6846 mds_rank_t a
= auth
.first
;
6847 if (p
) a
= auth
.second
;
6848 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6849 if (mds
->get_nodeid() == auth
.second
&&
6850 con
->is_importing()) break; // don't send any expire while importing.
6851 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6853 dout(12) << " sending expire to mds." << a
<< " on " << *dn
<< dendl
;
6854 ceph_assert(a
!= mds
->get_nodeid());
6855 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
6857 em
.first
->second
= MCacheExpire::create(mds
->get_nodeid());
6858 em
.first
->second
->add_dentry(con
->dirfrag(), dir
->dirfrag(), dn
->get_name(), dn
->last
, dn
->get_replica_nonce());
6863 if (dn
->last
== CEPH_NOSNAP
&& dir
->is_auth())
6864 dir
->add_to_bloom(dn
);
6865 dir
->remove_dentry(dn
);
6868 dir
->state_clear(CDir::STATE_COMPLETE
);
6870 if (mds
->logger
) mds
->logger
->inc(l_mds_inodes_expired
);
6875 void MDCache::trim_dirfrag(CDir
*dir
, CDir
*con
, expiremap
& expiremap
)
6877 dout(15) << "trim_dirfrag " << *dir
<< dendl
;
6879 if (dir
->is_subtree_root()) {
6880 ceph_assert(!dir
->is_auth() ||
6881 (!dir
->is_replicated() && dir
->inode
->is_base()));
6882 remove_subtree(dir
); // remove from subtree map
6884 ceph_assert(dir
->get_num_ref() == 0);
6886 CInode
*in
= dir
->get_inode();
6888 if (!dir
->is_auth()) {
6889 mds_authority_t auth
= dir
->authority();
6891 // was this an auth delegation? (if so, slightly modified container)
6893 if (dir
->is_subtree_root()) {
6894 dout(12) << " subtree root, container is " << *dir
<< dendl
;
6896 condf
= dir
->dirfrag();
6898 condf
= con
->dirfrag();
6901 for (int p
=0; p
<2; p
++) {
6902 mds_rank_t a
= auth
.first
;
6903 if (p
) a
= auth
.second
;
6904 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6905 if (mds
->get_nodeid() == auth
.second
&&
6906 con
->is_importing()) break; // don't send any expire while importing.
6907 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6909 dout(12) << " sending expire to mds." << a
<< " on " << *dir
<< dendl
;
6910 ceph_assert(a
!= mds
->get_nodeid());
6911 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
6913 em
.first
->second
= MCacheExpire::create(mds
->get_nodeid()); /* new */
6914 em
.first
->second
->add_dir(condf
, dir
->dirfrag(), dir
->replica_nonce
);
6918 in
->close_dirfrag(dir
->dirfrag().frag
);
6922 * Try trimming an inode from the cache
6924 * @return true if the inode is still in cache, else false if it was trimmed
6926 bool MDCache::trim_inode(CDentry
*dn
, CInode
*in
, CDir
*con
, expiremap
& expiremap
)
6928 dout(15) << "trim_inode " << *in
<< dendl
;
6929 ceph_assert(in
->get_num_ref() == 0);
6932 // If replica inode's dirfragtreelock is not readable, it's likely
6933 // some dirfrags of the inode are being fragmented and we will receive
6934 // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
6935 // dirfrags, so we should avoid trimming these dirfrags' parent inode.
6936 // This is because that unconnected replicas are problematic for
6937 // subtree migration.
6939 if (!in
->is_auth() && !mds
->locker
->rdlock_try(&in
->dirfragtreelock
, -1, nullptr)) {
6945 in
->get_dirfrags(dfls
);
6946 for (list
<CDir
*>::iterator p
= dfls
.begin(); p
!= dfls
.end(); ++p
) {
6948 ceph_assert(!dir
->is_subtree_root());
6949 trim_dirfrag(dir
, con
? con
:dir
, expiremap
); // if no container (e.g. root dirfrag), use *p
6954 if (in
->is_auth()) {
6955 // eval stray after closing dirfrags
6956 if (dn
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
6957 maybe_eval_stray(in
);
6958 if (dn
->state_test(CDentry::STATE_PURGING
) || dn
->get_num_ref() > 0)
6962 mds_authority_t auth
= in
->authority();
6966 df
= con
->dirfrag();
6968 df
= dirfrag_t(0,frag_t()); // must be a root or stray inode.
6970 for (int p
=0; p
<2; p
++) {
6971 mds_rank_t a
= auth
.first
;
6972 if (p
) a
= auth
.second
;
6973 if (a
< 0 || (p
== 1 && auth
.second
== auth
.first
)) break;
6974 if (con
&& mds
->get_nodeid() == auth
.second
&&
6975 con
->is_importing()) break; // don't send any expire while importing.
6976 if (a
== mds
->get_nodeid()) continue; // on export, ignore myself.
6978 dout(12) << " sending expire to mds." << a
<< " on " << *in
<< dendl
;
6979 ceph_assert(a
!= mds
->get_nodeid());
6980 auto em
= expiremap
.emplace(std::piecewise_construct
, std::forward_as_tuple(a
), std::forward_as_tuple());
6982 em
.first
->second
= MCacheExpire::create(mds
->get_nodeid()); /* new */
6983 em
.first
->second
->add_inode(df
, in
->vino(), in
->get_replica_nonce());
6988 if (in->is_auth()) {
6989 if (in->hack_accessed)
6990 mds->logger->inc("outt");
6992 mds->logger->inc("outut");
6993 mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
7000 dn
->get_dir()->unlink_inode(dn
, false);
7007 * trim_non_auth - remove any non-auth items from our cache
7009 * this reduces the amount of non-auth metadata in our cache, reducing the
7010 * load incurred by the rejoin phase.
7012 * the only non-auth items that remain are those that are needed to
7013 * attach our own subtrees to the root.
7015 * when we are done, all dentries will be in the top bit of the lru.
7017 * why we have to do this:
7018 * we may not have accurate linkage for non-auth items. which means we will
7019 * know which subtree it falls into, and can not be sure to declare it to the
7020 * correct authority.
7022 void MDCache::trim_non_auth()
7024 dout(7) << "trim_non_auth" << dendl
;
7026 // temporarily pin all subtree roots
7027 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
7028 p
!= subtrees
.end();
7030 p
->first
->get(CDir::PIN_SUBTREETEMP
);
7032 list
<CDentry
*> auth_list
;
7034 // trim non-auth items from the lru
7037 if (bottom_lru
.lru_get_size() > 0)
7038 dn
= static_cast<CDentry
*>(bottom_lru
.lru_expire());
7039 if (!dn
&& lru
.lru_get_size() > 0)
7040 dn
= static_cast<CDentry
*>(lru
.lru_expire());
7044 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7046 if (dn
->is_auth()) {
7047 // add back into lru (at the top)
7048 auth_list
.push_back(dn
);
7050 if (dnl
->is_remote() && dnl
->get_inode() && !dnl
->get_inode()->is_auth())
7051 dn
->unlink_remote(dnl
);
7053 // non-auth. expire.
7054 CDir
*dir
= dn
->get_dir();
7057 // unlink the dentry
7058 dout(10) << " removing " << *dn
<< dendl
;
7059 if (dnl
->is_remote()) {
7060 dir
->unlink_inode(dn
, false);
7062 else if (dnl
->is_primary()) {
7063 CInode
*in
= dnl
->get_inode();
7064 dout(10) << " removing " << *in
<< dendl
;
7066 in
->get_dirfrags(ls
);
7067 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7069 ceph_assert(!subdir
->is_subtree_root());
7070 in
->close_dirfrag(subdir
->dirfrag().frag
);
7072 dir
->unlink_inode(dn
, false);
7076 ceph_assert(dnl
->is_null());
7079 ceph_assert(!dir
->has_bloom());
7080 dir
->remove_dentry(dn
);
7081 // adjust the dir state
7082 dir
->state_clear(CDir::STATE_COMPLETE
); // dir incomplete!
7083 // close empty non-auth dirfrag
7084 if (!dir
->is_subtree_root() && dir
->get_num_any() == 0)
7085 dir
->inode
->close_dirfrag(dir
->get_frag());
7089 for (auto dn
: auth_list
) {
7090 if (dn
->state_test(CDentry::STATE_BOTTOMLRU
))
7091 bottom_lru
.lru_insert_mid(dn
);
7093 lru
.lru_insert_top(dn
);
7096 // move everything in the pintail to the top bit of the lru.
7097 lru
.lru_touch_entire_pintail();
7099 // unpin all subtrees
7100 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
7101 p
!= subtrees
.end();
7103 p
->first
->put(CDir::PIN_SUBTREETEMP
);
7105 if (lru
.lru_get_size() == 0 &&
7106 bottom_lru
.lru_get_size() == 0) {
7107 // root, stray, etc.?
7108 auto p
= inode_map
.begin();
7109 while (p
!= inode_map
.end()) {
7110 CInode
*in
= p
->second
;
7112 if (!in
->is_auth()) {
7114 in
->get_dirfrags(ls
);
7115 for (list
<CDir
*>::iterator p
= ls
.begin();
7118 dout(10) << " removing " << **p
<< dendl
;
7119 ceph_assert((*p
)->get_num_ref() == 1); // SUBTREE
7120 remove_subtree((*p
));
7121 in
->close_dirfrag((*p
)->dirfrag().frag
);
7123 dout(10) << " removing " << *in
<< dendl
;
7124 ceph_assert(!in
->get_parent_dn());
7125 ceph_assert(in
->get_num_ref() == 0);
7135 * Recursively trim the subtree rooted at directory to remove all
7136 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
7137 * of those links. This is used to clear invalid data out of the cache.
7138 * Note that it doesn't clear the passed-in directory, since that's not
7141 bool MDCache::trim_non_auth_subtree(CDir
*dir
)
7143 dout(10) << "trim_non_auth_subtree(" << dir
<< ") " << *dir
<< dendl
;
7145 bool keep_dir
= !can_trim_non_auth_dirfrag(dir
);
7147 auto j
= dir
->begin();
7149 while (j
!= dir
->end()) {
7151 CDentry
*dn
= i
->second
;
7152 dout(10) << "trim_non_auth_subtree(" << dir
<< ") Checking dentry " << dn
<< dendl
;
7153 CDentry::linkage_t
*dnl
= dn
->get_linkage();
7154 if (dnl
->is_primary()) { // check for subdirectories, etc
7155 CInode
*in
= dnl
->get_inode();
7156 bool keep_inode
= false;
7158 list
<CDir
*> subdirs
;
7159 in
->get_dirfrags(subdirs
);
7160 for (list
<CDir
*>::iterator subdir
= subdirs
.begin();
7161 subdir
!= subdirs
.end();
7163 if ((*subdir
)->is_subtree_root()) {
7165 dout(10) << "trim_non_auth_subtree(" << dir
<< ") keeping " << **subdir
<< dendl
;
7167 if (trim_non_auth_subtree(*subdir
))
7170 in
->close_dirfrag((*subdir
)->get_frag());
7171 dir
->state_clear(CDir::STATE_COMPLETE
); // now incomplete!
7177 if (!keep_inode
) { // remove it!
7178 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing inode " << in
<< " with dentry" << dn
<< dendl
;
7179 dir
->unlink_inode(dn
, false);
7181 ceph_assert(!dir
->has_bloom());
7182 dir
->remove_dentry(dn
);
7184 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping inode " << in
<< " with dentry " << dn
<<dendl
;
7185 dn
->state_clear(CDentry::STATE_AUTH
);
7186 in
->state_clear(CInode::STATE_AUTH
);
7188 } else if (keep_dir
&& dnl
->is_null()) { // keep null dentry for slave rollback
7189 dout(20) << "trim_non_auth_subtree(" << dir
<< ") keeping dentry " << dn
<<dendl
;
7190 } else { // just remove it
7191 dout(20) << "trim_non_auth_subtree(" << dir
<< ") removing dentry " << dn
<< dendl
;
7192 if (dnl
->is_remote())
7193 dir
->unlink_inode(dn
, false);
7194 dir
->remove_dentry(dn
);
7197 dir
->state_clear(CDir::STATE_AUTH
);
7199 * We've now checked all our children and deleted those that need it.
7200 * Now return to caller, and tell them if *we're* a keeper.
7202 return keep_dir
|| dir
->get_num_any();
7206 * during replay, when we determine a subtree is no longer ours, we
7207 * try to trim it from our cache. because subtrees must be connected
7208 * to the root, the fact that we can trim this tree may mean that our
7209 * children or parents can also be trimmed.
7211 void MDCache::try_trim_non_auth_subtree(CDir
*dir
)
7213 dout(10) << "try_trim_nonauth_subtree " << *dir
<< dendl
;
7215 // can we now trim child subtrees?
7217 get_subtree_bounds(dir
, bounds
);
7218 for (set
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
) {
7220 if (bd
->get_dir_auth().first
!= mds
->get_nodeid() && // we are not auth
7221 bd
->get_num_any() == 0 && // and empty
7222 can_trim_non_auth_dirfrag(bd
)) {
7223 CInode
*bi
= bd
->get_inode();
7224 dout(10) << " closing empty non-auth child subtree " << *bd
<< dendl
;
7227 bi
->close_dirfrag(bd
->get_frag());
7231 if (trim_non_auth_subtree(dir
)) {
7233 try_subtree_merge(dir
);
7235 // can we trim this subtree (and possibly our ancestors) too?
7237 CInode
*diri
= dir
->get_inode();
7238 if (diri
->is_base()) {
7239 if (!diri
->is_root() && diri
->authority().first
!= mds
->get_nodeid()) {
7240 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7241 remove_subtree(dir
);
7243 diri
->close_dirfrag(dir
->get_frag());
7245 dout(10) << " removing " << *diri
<< dendl
;
7246 ceph_assert(!diri
->get_parent_dn());
7247 ceph_assert(diri
->get_num_ref() == 0);
7253 CDir
*psub
= get_subtree_root(diri
->get_parent_dir());
7254 dout(10) << " parent subtree is " << *psub
<< dendl
;
7255 if (psub
->get_dir_auth().first
== mds
->get_nodeid())
7256 break; // we are auth, keep.
7258 dout(10) << " closing empty non-auth subtree " << *dir
<< dendl
;
7259 remove_subtree(dir
);
7261 diri
->close_dirfrag(dir
->get_frag());
7263 dout(10) << " parent subtree also non-auth: " << *psub
<< dendl
;
7264 if (trim_non_auth_subtree(psub
))
7273 void MDCache::standby_trim_segment(LogSegment
*ls
)
7275 auto try_trim_inode
= [this](CInode
*in
) {
7276 if (in
->get_num_ref() == 0 &&
7277 !in
->item_open_file
.is_on_list() &&
7278 in
->parent
!= NULL
&&
7279 in
->parent
->get_num_ref() == 0){
7280 touch_dentry_bottom(in
->parent
);
7284 auto try_trim_dentry
= [this](CDentry
*dn
) {
7285 if (dn
->get_num_ref() > 0)
7287 auto in
= dn
->get_linkage()->inode
;
7288 if(in
&& in
->item_open_file
.is_on_list())
7290 touch_dentry_bottom(dn
);
7293 ls
->new_dirfrags
.clear_list();
7294 ls
->open_files
.clear_list();
7296 while (!ls
->dirty_dirfrags
.empty()) {
7297 CDir
*dir
= ls
->dirty_dirfrags
.front();
7300 try_trim_inode(dir
->inode
);
7302 while (!ls
->dirty_inodes
.empty()) {
7303 CInode
*in
= ls
->dirty_inodes
.front();
7307 while (!ls
->dirty_dentries
.empty()) {
7308 CDentry
*dn
= ls
->dirty_dentries
.front();
7310 try_trim_dentry(dn
);
7312 while (!ls
->dirty_parent_inodes
.empty()) {
7313 CInode
*in
= ls
->dirty_parent_inodes
.front();
7314 in
->clear_dirty_parent();
7317 while (!ls
->dirty_dirfrag_dir
.empty()) {
7318 CInode
*in
= ls
->dirty_dirfrag_dir
.front();
7319 in
->filelock
.remove_dirty();
7322 while (!ls
->dirty_dirfrag_nest
.empty()) {
7323 CInode
*in
= ls
->dirty_dirfrag_nest
.front();
7324 in
->nestlock
.remove_dirty();
7327 while (!ls
->dirty_dirfrag_dirfragtree
.empty()) {
7328 CInode
*in
= ls
->dirty_dirfrag_dirfragtree
.front();
7329 in
->dirfragtreelock
.remove_dirty();
7332 while (!ls
->truncating_inodes
.empty()) {
7333 auto it
= ls
->truncating_inodes
.begin();
7335 ls
->truncating_inodes
.erase(it
);
7336 in
->put(CInode::PIN_TRUNCATING
);
7341 void MDCache::handle_cache_expire(const MCacheExpire::const_ref
&m
)
7343 mds_rank_t from
= mds_rank_t(m
->get_from());
7345 dout(7) << "cache_expire from mds." << from
<< dendl
;
7347 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
7351 set
<SimpleLock
*> gather_locks
;
7353 for (const auto &p
: m
->realms
) {
7355 if (p
.first
.ino
> 0) {
7356 CInode
*expired_inode
= get_inode(p
.first
.ino
);
7357 ceph_assert(expired_inode
); // we had better have this.
7358 CDir
*parent_dir
= expired_inode
->get_approx_dirfrag(p
.first
.frag
);
7359 ceph_assert(parent_dir
);
7361 int export_state
= -1;
7362 if (parent_dir
->is_auth() && parent_dir
->is_exporting()) {
7363 export_state
= migrator
->get_export_state(parent_dir
);
7364 ceph_assert(export_state
>= 0);
7367 if (!parent_dir
->is_auth() ||
7368 (export_state
!= -1 &&
7369 ((export_state
== Migrator::EXPORT_WARNING
&&
7370 migrator
->export_has_warned(parent_dir
,from
)) ||
7371 export_state
== Migrator::EXPORT_EXPORTING
||
7372 export_state
== Migrator::EXPORT_LOGGINGFINISH
||
7373 (export_state
== Migrator::EXPORT_NOTIFYING
&&
7374 !migrator
->export_has_notified(parent_dir
,from
))))) {
7377 dout(7) << "delaying nonauth|warned expires for " << *parent_dir
<< dendl
;
7378 ceph_assert(parent_dir
->is_frozen_tree_root());
7380 // make a message container
7382 auto em
= delayed_expire
[parent_dir
].emplace(std::piecewise_construct
, std::forward_as_tuple(from
), std::forward_as_tuple());
7384 em
.first
->second
= MCacheExpire::create(from
); /* new */
7386 // merge these expires into it
7387 em
.first
->second
->add_realm(p
.first
, p
.second
);
7390 ceph_assert(export_state
<= Migrator::EXPORT_PREPPING
||
7391 (export_state
== Migrator::EXPORT_WARNING
&&
7392 !migrator
->export_has_warned(parent_dir
, from
)));
7394 dout(7) << "expires for " << *parent_dir
<< dendl
;
7396 dout(7) << "containerless expires (root, stray inodes)" << dendl
;
7400 for (const auto &q
: p
.second
.inodes
) {
7401 CInode
*in
= get_inode(q
.first
);
7402 unsigned nonce
= q
.second
;
7405 dout(0) << " inode expire on " << q
.first
<< " from " << from
7406 << ", don't have it" << dendl
;
7409 ceph_assert(in
->is_auth());
7410 dout(20) << __func__
<< ": expiring inode " << *in
<< dendl
;
7413 if (nonce
== in
->get_replica_nonce(from
)) {
7414 // remove from our cached_by
7415 dout(7) << " inode expire on " << *in
<< " from mds." << from
7416 << " cached_by was " << in
->get_replicas() << dendl
;
7417 inode_remove_replica(in
, from
, false, gather_locks
);
7420 // this is an old nonce, ignore expire.
7421 dout(7) << " inode expire on " << *in
<< " from mds." << from
7422 << " with old nonce " << nonce
7423 << " (current " << in
->get_replica_nonce(from
) << "), dropping"
7429 for (const auto &q
: p
.second
.dirs
) {
7430 CDir
*dir
= get_dirfrag(q
.first
);
7431 unsigned nonce
= q
.second
;
7434 CInode
*diri
= get_inode(q
.first
.ino
);
7436 if (mds
->is_rejoin() &&
7437 rejoin_ack_gather
.count(mds
->get_nodeid()) && // haven't sent rejoin ack yet
7438 !diri
->is_replica(from
)) {
7440 diri
->get_nested_dirfrags(ls
);
7441 dout(7) << " dir expire on dirfrag " << q
.first
<< " from mds." << from
7442 << " while rejoining, inode isn't replicated" << dendl
;
7443 for (list
<CDir
*>::iterator q
= ls
.begin(); q
!= ls
.end(); ++q
) {
7445 if (dir
->is_replica(from
)) {
7446 dout(7) << " dir expire on " << *dir
<< " from mds." << from
<< dendl
;
7447 dir
->remove_replica(from
);
7452 CDir
*other
= diri
->get_approx_dirfrag(q
.first
.frag
);
7454 dout(7) << " dir expire on dirfrag " << q
.first
<< " from mds." << from
7455 << " have " << *other
<< ", mismatched frags, dropping" << dendl
;
7459 dout(0) << " dir expire on " << q
.first
<< " from " << from
7460 << ", don't have it" << dendl
;
7463 dout(20) << __func__
<< ": expiring dirfrag " << *dir
<< dendl
;
7465 ceph_assert(dir
->is_auth());
7468 if (nonce
== dir
->get_replica_nonce(from
)) {
7469 // remove from our cached_by
7470 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7471 << " replicas was " << dir
->get_replicas() << dendl
;
7472 dir
->remove_replica(from
);
7475 // this is an old nonce, ignore expire.
7476 dout(7) << " dir expire on " << *dir
<< " from mds." << from
7477 << " with old nonce " << nonce
<< " (current " << dir
->get_replica_nonce(from
)
7478 << "), dropping" << dendl
;
7483 for (const auto &pd
: p
.second
.dentries
) {
7484 dout(10) << " dn expires in dir " << pd
.first
<< dendl
;
7485 CInode
*diri
= get_inode(pd
.first
.ino
);
7487 CDir
*dir
= diri
->get_dirfrag(pd
.first
.frag
);
7490 dout(0) << " dn expires on " << pd
.first
<< " from " << from
7491 << ", must have refragmented" << dendl
;
7493 ceph_assert(dir
->is_auth());
7496 for (const auto &p
: pd
.second
) {
7497 unsigned nonce
= p
.second
;
7501 dn
= dir
->lookup(p
.first
.first
, p
.first
.second
);
7503 // which dirfrag for this dentry?
7504 CDir
*dir
= diri
->get_dirfrag(diri
->pick_dirfrag(p
.first
.first
));
7506 ceph_assert(dir
->is_auth());
7507 dn
= dir
->lookup(p
.first
.first
, p
.first
.second
);
7512 dout(0) << " missing dentry for " << p
.first
.first
<< " snap " << p
.first
.second
<< " in " << *dir
<< dendl
;
7514 dout(0) << " missing dentry for " << p
.first
.first
<< " snap " << p
.first
.second
<< dendl
;
7518 if (nonce
== dn
->get_replica_nonce(from
)) {
7519 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
<< dendl
;
7520 dentry_remove_replica(dn
, from
, gather_locks
);
7523 dout(7) << " dentry_expire on " << *dn
<< " from mds." << from
7524 << " with old nonce " << nonce
<< " (current " << dn
->get_replica_nonce(from
)
7525 << "), dropping" << dendl
;
7531 for (set
<SimpleLock
*>::iterator p
= gather_locks
.begin(); p
!= gather_locks
.end(); ++p
) {
7532 if (!(*p
)->is_stable())
7533 mds
->locker
->eval_gather(*p
);
7537 void MDCache::process_delayed_expire(CDir
*dir
)
7539 dout(7) << "process_delayed_expire on " << *dir
<< dendl
;
7540 for (const auto &p
: delayed_expire
[dir
]) {
7541 handle_cache_expire(p
.second
);
7543 delayed_expire
.erase(dir
);
7546 void MDCache::discard_delayed_expire(CDir
*dir
)
7548 dout(7) << "discard_delayed_expire on " << *dir
<< dendl
;
7549 delayed_expire
.erase(dir
);
7552 void MDCache::inode_remove_replica(CInode
*in
, mds_rank_t from
, bool rejoin
,
7553 set
<SimpleLock
*>& gather_locks
)
7555 in
->remove_replica(from
);
7556 in
->set_mds_caps_wanted(from
, 0);
7558 // note: this code calls _eval more often than it needs to!
7560 if (in
->authlock
.remove_replica(from
)) gather_locks
.insert(&in
->authlock
);
7561 if (in
->linklock
.remove_replica(from
)) gather_locks
.insert(&in
->linklock
);
7562 if (in
->snaplock
.remove_replica(from
)) gather_locks
.insert(&in
->snaplock
);
7563 if (in
->xattrlock
.remove_replica(from
)) gather_locks
.insert(&in
->xattrlock
);
7564 if (in
->flocklock
.remove_replica(from
)) gather_locks
.insert(&in
->flocklock
);
7565 if (in
->policylock
.remove_replica(from
)) gather_locks
.insert(&in
->policylock
);
7567 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7568 // Don't remove the recovering mds from lock's gathering list because
7569 // it may hold rejoined wrlocks.
7570 if (in
->dirfragtreelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->dirfragtreelock
);
7571 if (in
->filelock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->filelock
);
7572 if (in
->nestlock
.remove_replica(from
, rejoin
)) gather_locks
.insert(&in
->nestlock
);
7575 void MDCache::dentry_remove_replica(CDentry
*dn
, mds_rank_t from
, set
<SimpleLock
*>& gather_locks
)
7577 dn
->remove_replica(from
);
7580 if (dn
->lock
.remove_replica(from
))
7581 gather_locks
.insert(&dn
->lock
);
7583 // Replicated strays might now be elegible for purge
7584 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7585 if (dnl
->is_primary()) {
7586 maybe_eval_stray(dnl
->get_inode());
7590 void MDCache::trim_client_leases()
7592 utime_t now
= ceph_clock_now();
7594 dout(10) << "trim_client_leases" << dendl
;
7596 std::size_t pool
= 0;
7597 for (const auto& list
: client_leases
) {
7602 auto before
= list
.size();
7603 while (!list
.empty()) {
7604 ClientLease
*r
= list
.front();
7605 if (r
->ttl
> now
) break;
7606 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
7607 dout(10) << " expiring client." << r
->client
<< " lease of " << *dn
<< dendl
;
7608 dn
->remove_client_lease(r
, mds
->locker
);
7610 auto after
= list
.size();
7611 dout(10) << "trim_client_leases pool " << pool
<< " trimmed "
7612 << (before
-after
) << " leases, " << after
<< " left" << dendl
;
7617 void MDCache::check_memory_usage()
7619 static MemoryModel
mm(g_ceph_context
);
7620 static MemoryModel::snap last
;
7622 static MemoryModel::snap baseline
= last
;
7624 // check client caps
7625 ceph_assert(CInode::count() == inode_map
.size() + snap_inode_map
.size() + num_shadow_inodes
);
7626 double caps_per_inode
= 0.0;
7627 if (CInode::count())
7628 caps_per_inode
= (double)Capability::count() / (double)CInode::count();
7630 dout(2) << "Memory usage: "
7631 << " total " << last
.get_total()
7632 << ", rss " << last
.get_rss()
7633 << ", heap " << last
.get_heap()
7634 << ", baseline " << baseline
.get_heap()
7635 << ", " << num_inodes_with_caps
<< " / " << CInode::count() << " inodes have caps"
7636 << ", " << Capability::count() << " caps, " << caps_per_inode
<< " caps per inode"
7639 mds
->update_mlogger();
7640 mds
->mlogger
->set(l_mdm_rss
, last
.get_rss());
7641 mds
->mlogger
->set(l_mdm_heap
, last
.get_heap());
7643 if (cache_toofull()) {
7644 mds
->server
->recall_client_state(nullptr, Server::RecallFlags::TRIM
);
7647 // If the cache size had exceeded its limit, but we're back in bounds
7648 // now, free any unused pool memory so that our memory usage isn't
7649 // permanently bloated.
7650 if (exceeded_size_limit
&& !cache_toofull()) {
7651 // Only do this once we are back in bounds: otherwise the releases would
7652 // slow down whatever process caused us to exceed bounds to begin with
7653 if (ceph_using_tcmalloc()) {
7654 dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
7656 ceph_heap_release_free_memory();
7658 exceeded_size_limit
= false;
7664 // =========================================================================================
7667 class C_MDC_ShutdownCheck
: public MDCacheContext
{
7669 explicit C_MDC_ShutdownCheck(MDCache
*m
) : MDCacheContext(m
) {}
7670 void finish(int) override
{
7671 mdcache
->shutdown_check();
7675 void MDCache::shutdown_check()
7677 dout(0) << "shutdown_check at " << ceph_clock_now() << dendl
;
7680 char old_val
[32] = { 0 };
7682 g_conf().get_val("debug_mds", &o
, sizeof(old_val
));
7683 g_conf().set_val("debug_mds", "10");
7684 g_conf().apply_changes(nullptr);
7686 g_conf().set_val("debug_mds", old_val
);
7687 g_conf().apply_changes(nullptr);
7688 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7691 dout(0) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7692 dout(0) << "log len " << mds
->mdlog
->get_num_events() << dendl
;
7695 if (mds
->objecter
->is_active()) {
7696 dout(0) << "objecter still active" << dendl
;
7697 mds
->objecter
->dump_active();
7702 void MDCache::shutdown_start()
7704 dout(5) << "shutdown_start" << dendl
;
7706 if (g_conf()->mds_shutdown_check
)
7707 mds
->timer
.add_event_after(g_conf()->mds_shutdown_check
, new C_MDC_ShutdownCheck(this));
7709 // g_conf()->debug_mds = 10;
7714 bool MDCache::shutdown_pass()
7716 dout(7) << "shutdown_pass" << dendl
;
7718 if (mds
->is_stopped()) {
7719 dout(7) << " already shut down" << dendl
;
7726 bool strays_all_exported
= shutdown_export_strays();
7730 dout(5) << "lru size now " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7732 // Export all subtrees to another active (usually rank 0) if not rank 0
7733 int num_auth_subtree
= 0;
7734 if (!subtrees
.empty() &&
7735 mds
->get_nodeid() != 0) {
7736 dout(7) << "looking for subtrees to export to mds0" << dendl
;
7738 for (map
<CDir
*, set
<CDir
*> >::iterator it
= subtrees
.begin();
7739 it
!= subtrees
.end();
7741 CDir
*dir
= it
->first
;
7742 if (dir
->get_inode()->is_mdsdir())
7744 if (dir
->is_auth()) {
7746 if (dir
->is_frozen() ||
7747 dir
->is_freezing() ||
7748 dir
->is_ambiguous_dir_auth() ||
7749 dir
->state_test(CDir::STATE_EXPORTING
))
7755 migrator
->clear_export_queue();
7756 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7758 mds_rank_t dest
= dir
->get_inode()->authority().first
;
7759 if (dest
> 0 && !mds
->mdsmap
->is_active(dest
))
7761 dout(7) << "sending " << *dir
<< " back to mds." << dest
<< dendl
;
7762 migrator
->export_dir_nicely(dir
, dest
);
7766 if (!strays_all_exported
) {
7767 dout(7) << "waiting for strays to migrate" << dendl
;
7771 if (num_auth_subtree
> 0) {
7772 ceph_assert(mds
->get_nodeid() > 0);
7773 dout(7) << "still have " << num_auth_subtree
<< " auth subtrees" << dendl
;
7778 // close out any sessions (and open files!) before we try to trim the log, etc.
7779 if (mds
->sessionmap
.have_unclosed_sessions()) {
7780 if (!mds
->server
->terminating_sessions
)
7781 mds
->server
->terminate_sessions();
7785 // Fully trim the log so that all objects in cache are clean and may be
7786 // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
7787 // trim the log such that the cache eventually becomes clean.
7788 if (mds
->mdlog
->get_num_segments() > 0) {
7789 auto ls
= mds
->mdlog
->get_current_segment();
7790 if (ls
->num_events
> 1 || !ls
->dirty_dirfrags
.empty()) {
7791 // Current segment contains events other than subtreemap or
7792 // there are dirty dirfrags (see CDir::log_mark_dirty())
7793 mds
->mdlog
->start_new_segment();
7794 mds
->mdlog
->flush();
7797 mds
->mdlog
->trim_all();
7798 if (mds
->mdlog
->get_num_segments() > 1) {
7799 dout(7) << "still >1 segments, waiting for log to trim" << dendl
;
7803 // drop our reference to our stray dir inode
7804 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
7806 strays
[i
]->state_test(CInode::STATE_STRAYPINNED
)) {
7807 strays
[i
]->state_clear(CInode::STATE_STRAYPINNED
);
7808 strays
[i
]->put(CInode::PIN_STRAY
);
7809 strays
[i
]->put_stickydirs();
7813 CDir
*mydir
= myin
? myin
->get_dirfrag(frag_t()) : NULL
;
7814 if (mydir
&& !mydir
->is_subtree_root())
7817 // subtrees map not empty yet?
7818 if (subtrees
.size() > (mydir
? 1 : 0)) {
7819 dout(7) << "still have " << num_subtrees() << " subtrees" << dendl
;
7821 migrator
->show_importing();
7822 migrator
->show_exporting();
7823 if (!migrator
->is_importing() && !migrator
->is_exporting())
7827 ceph_assert(!migrator
->is_exporting());
7828 ceph_assert(!migrator
->is_importing());
7830 // replicas may dirty scatter locks
7831 if (myin
&& myin
->is_replicated()) {
7832 dout(7) << "still have replicated objects" << dendl
;
7836 if ((myin
&& myin
->get_num_auth_pins()) ||
7837 (mydir
&& (mydir
->get_auth_pins() || mydir
->get_dir_auth_pins()))) {
7838 dout(7) << "still have auth pinned objects" << dendl
;
7842 // (only do this once!)
7843 if (!mds
->mdlog
->is_capped()) {
7844 dout(7) << "capping the log" << dendl
;
7848 if (!mds
->mdlog
->empty())
7849 mds
->mdlog
->trim(0);
7851 if (!mds
->mdlog
->empty()) {
7852 dout(7) << "waiting for log to flush.. " << mds
->mdlog
->get_num_events()
7853 << " in " << mds
->mdlog
->get_num_segments() << " segments" << dendl
;
7857 if (!did_shutdown_log_cap
) {
7858 // flush journal header
7859 dout(7) << "writing header for (now-empty) journal" << dendl
;
7860 ceph_assert(mds
->mdlog
->empty());
7861 mds
->mdlog
->write_head(0);
7862 // NOTE: filer active checker below will block us until this completes.
7863 did_shutdown_log_cap
= true;
7868 if (mds
->objecter
->is_active()) {
7869 dout(7) << "objecter still active" << dendl
;
7870 mds
->objecter
->dump_active();
7874 // trim what we can from the cache
7875 if (lru
.lru_get_size() > 0 || bottom_lru
.lru_get_size() > 0) {
7876 dout(7) << "there's still stuff in the cache: " << lru
.lru_get_size() << "/" << bottom_lru
.lru_get_size() << dendl
;
7882 // make mydir subtree go away
7884 if (mydir
->get_num_ref() > 1) { // subtree pin
7885 dout(7) << "there's still reference to mydir " << *mydir
<< dendl
;
7890 remove_subtree(mydir
);
7891 myin
->close_dirfrag(mydir
->get_frag());
7893 ceph_assert(subtrees
.empty());
7900 if (global_snaprealm
) {
7901 remove_inode(global_snaprealm
->inode
);
7902 global_snaprealm
= nullptr;
7906 dout(5) << "shutdown done." << dendl
;
7910 bool MDCache::shutdown_export_strays()
7912 static const unsigned MAX_EXPORTING
= 100;
7914 if (mds
->get_nodeid() == 0)
7917 if (shutdown_exporting_strays
.size() * 3 >= MAX_EXPORTING
* 2)
7920 dout(10) << "shutdown_export_strays " << shutdown_export_next
.first
7921 << " '" << shutdown_export_next
.second
<< "'" << dendl
;
7923 bool mds0_active
= mds
->mdsmap
->is_active(mds_rank_t(0));
7924 bool all_exported
= false;
7927 auto next
= shutdown_export_next
;
7929 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
7930 CInode
*strayi
= strays
[i
];
7932 !strayi
->state_test(CInode::STATE_STRAYPINNED
))
7934 if (strayi
->ino() < next
.first
.ino
)
7938 strayi
->get_dirfrags(dfls
);
7940 while (!dfls
.empty()) {
7941 CDir
*dir
= dfls
.front();
7944 if (dir
->dirfrag() < next
.first
)
7946 if (next
.first
< dir
->dirfrag()) {
7947 next
.first
= dir
->dirfrag();
7948 next
.second
.clear();
7951 if (!dir
->is_complete()) {
7952 MDSContext
*fin
= nullptr;
7953 if (shutdown_exporting_strays
.empty()) {
7954 fin
= new MDSInternalContextWrapper(mds
,
7955 new FunctionContext([this](int r
) {
7956 shutdown_export_strays();
7964 CDir::dentry_key_map::iterator it
;
7965 if (next
.second
.empty()) {
7968 auto hash
= ceph_frag_value(strayi
->hash_dentry_name(next
.second
));
7969 it
= dir
->lower_bound(dentry_key_t(0, next
.second
, hash
));
7972 for (; it
!= dir
->end(); ++it
) {
7973 CDentry
*dn
= it
->second
;
7974 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
7978 if (!mds0_active
&& !dn
->state_test(CDentry::STATE_PURGING
)) {
7979 next
.second
= it
->first
.name
;
7983 auto ret
= shutdown_exporting_strays
.insert(dnl
->get_inode()->ino());
7985 dout(10) << "already exporting/purging " << *dn
<< dendl
;
7989 // Don't try to migrate anything that is actually
7990 // being purged right now
7991 if (!dn
->state_test(CDentry::STATE_PURGING
))
7992 stray_manager
.migrate_stray(dn
, mds_rank_t(0)); // send to root!
7994 if (shutdown_exporting_strays
.size() >= MAX_EXPORTING
) {
7996 if (it
!= dir
->end()) {
7997 next
.second
= it
->first
.name
;
8000 next
.first
.ino
.val
++;
8002 next
.first
= dfls
.front()->dirfrag();
8003 next
.second
.clear();
8011 if (shutdown_exporting_strays
.empty()) {
8012 dirfrag_t
first_df(MDS_INO_STRAY(mds
->get_nodeid(), 0), 0);
8013 if (first_df
< shutdown_export_next
.first
||
8014 !shutdown_export_next
.second
.empty()) {
8015 shutdown_export_next
.first
= first_df
;
8016 shutdown_export_next
.second
.clear();
8019 all_exported
= true;
8023 shutdown_export_next
= next
;
8024 return all_exported
;
8027 // ========= messaging ==============
8029 void MDCache::dispatch(const Message::const_ref
&m
)
8031 switch (m
->get_type()) {
8034 case MSG_MDS_RESOLVE
:
8035 handle_resolve(MMDSResolve::msgref_cast(m
));
8037 case MSG_MDS_RESOLVEACK
:
8038 handle_resolve_ack(MMDSResolveAck::msgref_cast(m
));
8042 case MSG_MDS_CACHEREJOIN
:
8043 handle_cache_rejoin(MMDSCacheRejoin::msgref_cast(m
));
8046 case MSG_MDS_DISCOVER
:
8047 handle_discover(MDiscover::msgref_cast(m
));
8049 case MSG_MDS_DISCOVERREPLY
:
8050 handle_discover_reply(MDiscoverReply::msgref_cast(m
));
8053 case MSG_MDS_DIRUPDATE
:
8054 handle_dir_update(MDirUpdate::msgref_cast(m
));
8057 case MSG_MDS_CACHEEXPIRE
:
8058 handle_cache_expire(MCacheExpire::msgref_cast(m
));
8061 case MSG_MDS_DENTRYLINK
:
8062 handle_dentry_link(MDentryLink::msgref_cast(m
));
8064 case MSG_MDS_DENTRYUNLINK
:
8065 handle_dentry_unlink(MDentryUnlink::msgref_cast(m
));
8068 case MSG_MDS_FRAGMENTNOTIFY
:
8069 handle_fragment_notify(MMDSFragmentNotify::msgref_cast(m
));
8071 case MSG_MDS_FRAGMENTNOTIFYACK
:
8072 handle_fragment_notify_ack(MMDSFragmentNotifyAck::msgref_cast(m
));
8075 case MSG_MDS_FINDINO
:
8076 handle_find_ino(MMDSFindIno::msgref_cast(m
));
8078 case MSG_MDS_FINDINOREPLY
:
8079 handle_find_ino_reply(MMDSFindInoReply::msgref_cast(m
));
8082 case MSG_MDS_OPENINO
:
8083 handle_open_ino(MMDSOpenIno::msgref_cast(m
));
8085 case MSG_MDS_OPENINOREPLY
:
8086 handle_open_ino_reply(MMDSOpenInoReply::msgref_cast(m
));
8089 case MSG_MDS_SNAPUPDATE
:
8090 handle_snap_update(MMDSSnapUpdate::msgref_cast(m
));
8094 derr
<< "cache unknown message " << m
->get_type() << dendl
;
8095 ceph_abort_msg("cache unknown message");
8099 int MDCache::path_traverse(MDRequestRef
& mdr
, MDSContextFactory
& cf
, // who
8100 const filepath
& path
, // what
8101 vector
<CDentry
*> *pdnvec
, // result
8105 bool discover
= (onfail
== MDS_TRAVERSE_DISCOVER
);
8106 bool null_okay
= (onfail
== MDS_TRAVERSE_DISCOVERXLOCK
);
8107 bool forward
= (onfail
== MDS_TRAVERSE_FORWARD
);
8109 ceph_assert(!forward
|| mdr
); // forward requires a request
8111 snapid_t snapid
= CEPH_NOSNAP
;
8113 mdr
->snapid
= snapid
;
8115 client_t client
= (mdr
&& mdr
->reqid
.name
.is_client()) ? mdr
->reqid
.name
.num() : -1;
8117 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse
);
8119 dout(7) << "traverse: opening base ino " << path
.get_ino() << " snap " << snapid
<< dendl
;
8120 CInode
*cur
= get_inode(path
.get_ino());
8122 if (MDS_INO_IS_MDSDIR(path
.get_ino()))
8123 open_foreign_mdsdir(path
.get_ino(), cf
.build());
8125 //ceph_abort(); // hrm.. broken
8130 if (cur
->state_test(CInode::STATE_PURGING
))
8133 // make sure snaprealm are open...
8134 if (mdr
&& cur
->snaprealm
&& !cur
->snaprealm
->have_past_parents_open() &&
8135 !cur
->snaprealm
->open_parents(cf
.build())) {
8146 while (depth
< path
.depth()) {
8147 dout(12) << "traverse: path seg depth " << depth
<< " '" << path
[depth
]
8148 << "' snapid " << snapid
<< dendl
;
8150 if (!cur
->is_dir()) {
8151 dout(7) << "traverse: " << *cur
<< " not a dir " << dendl
;
8155 // walk into snapdir?
8156 if (path
[depth
].length() == 0) {
8157 dout(10) << "traverse: snapdir" << dendl
;
8160 snapid
= CEPH_SNAPDIR
;
8161 mdr
->snapid
= snapid
;
8165 // walk thru snapdir?
8166 if (snapid
== CEPH_SNAPDIR
) {
8169 SnapRealm
*realm
= cur
->find_snaprealm();
8170 snapid
= realm
->resolve_snapname(path
[depth
], cur
->ino());
8171 dout(10) << "traverse: snap " << path
[depth
] << " -> " << snapid
<< dendl
;
8175 // if snaplock isn't readable, it's possible that other mds is creating
8176 // snapshot, but snap update message hasn't been received.
8177 if (!t
->snaplock
.can_read(client
)) {
8178 dout(10) << " non-readable snaplock on " << *t
<< dendl
;
8179 t
->snaplock
.add_waiter(SimpleLock::WAIT_RD
, cf
.build());
8182 CDentry
*pdn
= t
->get_projected_parent_dn();
8183 t
= pdn
? pdn
->get_dir()->get_inode() : NULL
;
8187 mdr
->snapid
= snapid
;
8193 frag_t fg
= cur
->pick_dirfrag(path
[depth
]);
8194 CDir
*curdir
= cur
->get_dirfrag(fg
);
8196 if (cur
->is_auth()) {
8197 // parent dir frozen_dir?
8198 if (cur
->is_frozen()) {
8199 dout(7) << "traverse: " << *cur
<< " is frozen, waiting" << dendl
;
8200 cur
->add_waiter(CDir::WAIT_UNFREEZE
, cf
.build());
8203 curdir
= cur
->get_or_open_dirfrag(this, fg
);
8206 dout(10) << "traverse: need dirfrag " << fg
<< ", doing discover from " << *cur
<< dendl
;
8207 discover_path(cur
, snapid
, path
.postfixpath(depth
), cf
.build(),
8209 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_discover
);
8213 ceph_assert(curdir
);
8215 #ifdef MDS_VERIFY_FRAGSTAT
8216 if (curdir
->is_complete())
8217 curdir
->verify_fragstat();
8222 if (curdir->is_frozen()) {
8224 // FIXME: traverse is allowed?
8225 dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
8226 curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
8227 if (onfinish) delete onfinish;
8232 // Before doing dirfrag->dn lookup, compare with DamageTable's
8233 // record of which dentries were unreadable
8234 if (mds
->damage_table
.is_dentry_damaged(curdir
, path
[depth
], snapid
)) {
8235 dout(4) << "traverse: stopped lookup at damaged dentry "
8236 << *curdir
<< "/" << path
[depth
] << " snap=" << snapid
<< dendl
;
8241 CDentry
*dn
= curdir
->lookup(path
[depth
], snapid
);
8242 CDentry::linkage_t
*dnl
= dn
? dn
->get_projected_linkage() : 0;
8244 // null and last_bit and xlocked by me?
8245 if (dnl
&& dnl
->is_null() && null_okay
) {
8246 dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl
;
8248 pdnvec
->push_back(dn
);
8255 dn
->lock
.is_xlocked() &&
8256 dn
->lock
.get_xlock_by() != mdr
&&
8257 !dn
->lock
.can_read(client
) &&
8258 (dnl
->is_null() || forward
)) {
8259 dout(10) << "traverse: xlocked dentry at " << *dn
<< dendl
;
8260 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, cf
.build());
8261 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_lock
);
8262 mds
->mdlog
->flush();
8266 // can we conclude ENOENT?
8267 if (dnl
&& dnl
->is_null()) {
8268 if (dn
->lock
.can_read(client
) ||
8269 (dn
->lock
.is_xlocked() && dn
->lock
.get_xlock_by() == mdr
)) {
8270 dout(10) << "traverse: miss on null+readable dentry " << path
[depth
] << " " << *dn
<< dendl
;
8272 if (depth
== path
.depth() - 1)
8273 pdnvec
->push_back(dn
);
8275 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8279 dout(10) << "miss on dentry " << *dn
<< ", can't read due to lock" << dendl
;
8280 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, cf
.build());
8285 if (dnl
&& !dnl
->is_null()) {
8286 CInode
*in
= dnl
->get_inode();
8288 // do we have inode?
8290 ceph_assert(dnl
->is_remote());
8292 in
= get_inode(dnl
->get_remote_ino());
8294 dout(7) << "linking in remote in " << *in
<< dendl
;
8295 dn
->link_remote(dnl
, in
);
8297 dout(7) << "remote link to " << dnl
->get_remote_ino() << ", which i don't have" << dendl
;
8298 ceph_assert(mdr
); // we shouldn't hit non-primary dentries doing a non-mdr traversal!
8299 if (mds
->damage_table
.is_remote_damaged(dnl
->get_remote_ino())) {
8300 dout(4) << "traverse: remote dentry points to damaged ino "
8304 open_remote_dentry(dn
, true, cf
.build(),
8305 (null_okay
&& depth
== path
.depth() - 1));
8306 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_remote_ino
);
8312 // make sure snaprealm are open...
8313 if (mdr
&& cur
->snaprealm
&& !cur
->snaprealm
->have_past_parents_open() &&
8314 !cur
->snaprealm
->open_parents(cf
.build())) {
8318 // add to trace, continue.
8321 pdnvec
->push_back(dn
);
8329 // MISS. dentry doesn't exist.
8330 dout(12) << "traverse: miss on dentry " << path
[depth
] << " in " << *curdir
<< dendl
;
8332 if (curdir
->is_auth()) {
8334 if (curdir
->is_complete() ||
8335 (snapid
== CEPH_NOSNAP
&&
8336 curdir
->has_bloom() &&
8337 !curdir
->is_in_bloom(path
[depth
]))) {
8340 // instantiate a null dn?
8341 if (depth
< path
.depth()-1){
8342 dout(20) << " didn't traverse full path; not returning pdnvec" << dendl
;
8345 ceph_abort(); // should have fallen out in ->is_null() check above
8346 } else if (curdir
->is_frozen()) {
8347 dout(20) << " not adding null to frozen dir " << dendl
;
8348 } else if (snapid
< CEPH_MAXSNAP
) {
8349 dout(20) << " not adding null for snapid " << snapid
<< dendl
;
8351 // create a null dentry
8352 dn
= curdir
->add_null_dentry(path
[depth
]);
8353 dout(20) << " added null " << *dn
<< dendl
;
8356 pdnvec
->push_back(dn
);
8358 pdnvec
->clear(); // do not confuse likes of rdlock_path_pin_ref();
8363 // Check DamageTable for missing fragments before trying to fetch
8365 if (mds
->damage_table
.is_dirfrag_damaged(curdir
)) {
8366 dout(4) << "traverse: damaged dirfrag " << *curdir
8367 << ", blocking fetch" << dendl
;
8371 // directory isn't complete; reload
8372 dout(7) << "traverse: incomplete dir contents for " << *cur
<< ", fetching" << dendl
;
8374 curdir
->fetch(cf
.build(), path
[depth
]);
8375 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_dir_fetch
);
8379 // dirfrag/dentry is not mine.
8380 mds_authority_t dauth
= curdir
->authority();
8383 mdr
&& mdr
->client_request
&&
8384 (int)depth
< mdr
->client_request
->get_num_fwd()) {
8385 dout(7) << "traverse: snap " << snapid
<< " and depth " << depth
8386 << " < fwd " << mdr
->client_request
->get_num_fwd()
8387 << ", discovering instead of forwarding" << dendl
;
8391 if ((discover
|| null_okay
)) {
8392 dout(7) << "traverse: discover from " << path
[depth
] << " from " << *curdir
<< dendl
;
8393 discover_path(curdir
, snapid
, path
.postfixpath(depth
), cf
.build(),
8395 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_discover
);
8400 dout(7) << "traverse: not auth for " << path
<< " in " << *curdir
<< dendl
;
8402 if (curdir
->is_ambiguous_auth()) {
8404 dout(7) << "traverse: waiting for single auth in " << *curdir
<< dendl
;
8405 curdir
->add_waiter(CDir::WAIT_SINGLEAUTH
, cf
.build());
8409 dout(7) << "traverse: forwarding, not auth for " << *curdir
<< dendl
;
8411 request_forward(mdr
, dauth
.first
);
8413 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_forward
);
8418 ceph_abort(); // i shouldn't get here
8422 if (mds
->logger
) mds
->logger
->inc(l_mds_traverse_hit
);
8423 dout(10) << "path_traverse finish on snapid " << snapid
<< dendl
;
8425 ceph_assert(mdr
->snapid
== snapid
);
8429 CInode
*MDCache::cache_traverse(const filepath
& fp
)
8431 dout(10) << "cache_traverse " << fp
<< dendl
;
8435 in
= get_inode(fp
.get_ino());
8441 for (unsigned i
= 0; i
< fp
.depth(); i
++) {
8442 std::string_view dname
= fp
[i
];
8443 frag_t fg
= in
->pick_dirfrag(dname
);
8444 dout(20) << " " << i
<< " " << dname
<< " frag " << fg
<< " from " << *in
<< dendl
;
8445 CDir
*curdir
= in
->get_dirfrag(fg
);
8448 CDentry
*dn
= curdir
->lookup(dname
, CEPH_NOSNAP
);
8451 in
= dn
->get_linkage()->get_inode();
8455 dout(10) << " got " << *in
<< dendl
;
8461 * open_remote_dir -- open up a remote dirfrag
8463 * @param diri base inode
8464 * @param approxfg approximate fragment.
8465 * @param fin completion callback
8467 void MDCache::open_remote_dirfrag(CInode
*diri
, frag_t approxfg
, MDSContext
*fin
)
8469 dout(10) << "open_remote_dir on " << *diri
<< dendl
;
8470 ceph_assert(diri
->is_dir());
8471 ceph_assert(!diri
->is_auth());
8472 ceph_assert(diri
->get_dirfrag(approxfg
) == 0);
8474 discover_dir_frag(diri
, approxfg
, fin
);
8479 * get_dentry_inode - get or open inode
8481 * @param dn the dentry
8482 * @param mdr current request
8484 * will return inode for primary, or link up/open up remote link's inode as necessary.
8485 * If it's not available right now, puts mdr on wait list and returns null.
8487 CInode
*MDCache::get_dentry_inode(CDentry
*dn
, MDRequestRef
& mdr
, bool projected
)
8489 CDentry::linkage_t
*dnl
;
8491 dnl
= dn
->get_projected_linkage();
8493 dnl
= dn
->get_linkage();
8495 ceph_assert(!dnl
->is_null());
8497 if (dnl
->is_primary())
8500 ceph_assert(dnl
->is_remote());
8501 CInode
*in
= get_inode(dnl
->get_remote_ino());
8503 dout(7) << "get_dentry_inode linking in remote in " << *in
<< dendl
;
8504 dn
->link_remote(dnl
, in
);
8507 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn
<< dendl
;
8508 open_remote_dentry(dn
, projected
, new C_MDS_RetryRequest(this, mdr
));
8513 struct C_MDC_OpenRemoteDentry
: public MDCacheContext
{
8516 MDSContext
*onfinish
;
8518 C_MDC_OpenRemoteDentry(MDCache
*m
, CDentry
*d
, inodeno_t i
, MDSContext
*f
, bool wx
) :
8519 MDCacheContext(m
), dn(d
), ino(i
), onfinish(f
), want_xlocked(wx
) {
8520 dn
->get(MDSCacheObject::PIN_PTRWAITER
);
8522 void finish(int r
) override
{
8523 mdcache
->_open_remote_dentry_finish(dn
, ino
, onfinish
, want_xlocked
, r
);
8524 dn
->put(MDSCacheObject::PIN_PTRWAITER
);
8528 void MDCache::open_remote_dentry(CDentry
*dn
, bool projected
, MDSContext
*fin
, bool want_xlocked
)
8530 dout(10) << "open_remote_dentry " << *dn
<< dendl
;
8531 CDentry::linkage_t
*dnl
= projected
? dn
->get_projected_linkage() : dn
->get_linkage();
8532 inodeno_t ino
= dnl
->get_remote_ino();
8533 int64_t pool
= dnl
->get_remote_d_type() == DT_DIR
? mds
->mdsmap
->get_metadata_pool() : -1;
8535 new C_MDC_OpenRemoteDentry(this, dn
, ino
, fin
, want_xlocked
), true, want_xlocked
); // backtrace
8538 void MDCache::_open_remote_dentry_finish(CDentry
*dn
, inodeno_t ino
, MDSContext
*fin
,
8539 bool want_xlocked
, int r
)
8542 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
8543 if (dnl
->is_remote() && dnl
->get_remote_ino() == ino
) {
8544 dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn
<< dendl
;
8545 dn
->state_set(CDentry::STATE_BADREMOTEINO
);
8548 CDir
*dir
= dn
->get_dir();
8550 dir
->get_inode()->make_path_string(path
);
8552 path
+= dn
->get_name();
8555 bool fatal
= mds
->damage_table
.notify_remote_damaged(ino
, path
);
8558 ceph_abort(); // unreachable, damaged() respawns us
8564 fin
->complete(r
< 0 ? r
: 0);
8568 void MDCache::make_trace(vector
<CDentry
*>& trace
, CInode
*in
)
8570 // empty trace if we're a base inode
8574 CInode
*parent
= in
->get_parent_inode();
8575 ceph_assert(parent
);
8576 make_trace(trace
, parent
);
8578 CDentry
*dn
= in
->get_parent_dn();
8579 dout(15) << "make_trace adding " << *dn
<< dendl
;
8580 trace
.push_back(dn
);
8584 // -------------------------------------------------------------------------------
8585 // Open inode by inode number
8587 class C_IO_MDC_OpenInoBacktraceFetched
: public MDCacheIOContext
{
8591 C_IO_MDC_OpenInoBacktraceFetched(MDCache
*c
, inodeno_t i
) :
8592 MDCacheIOContext(c
), ino(i
) {}
8593 void finish(int r
) override
{
8594 mdcache
->_open_ino_backtrace_fetched(ino
, bl
, r
);
8596 void print(ostream
& out
) const override
{
8597 out
<< "openino_backtrace_fetch" << ino
<< ")";
8601 struct C_MDC_OpenInoTraverseDir
: public MDCacheContext
{
8603 MMDSOpenIno::const_ref msg
;
8606 C_MDC_OpenInoTraverseDir(MDCache
*c
, inodeno_t i
, const MMDSOpenIno::const_ref
&m
, bool p
) :
8607 MDCacheContext(c
), ino(i
), msg(m
), parent(p
) {}
8608 void finish(int r
) override
{
8609 if (r
< 0 && !parent
)
8612 mdcache
->handle_open_ino(msg
, r
);
8615 auto& info
= mdcache
->opening_inodes
.at(ino
);
8616 mdcache
->_open_ino_traverse_dir(ino
, info
, r
);
8620 struct C_MDC_OpenInoParentOpened
: public MDCacheContext
{
8623 C_MDC_OpenInoParentOpened(MDCache
*c
, inodeno_t i
) : MDCacheContext(c
), ino(i
) {}
8624 void finish(int r
) override
{
8625 mdcache
->_open_ino_parent_opened(ino
, r
);
8629 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino
, bufferlist
& bl
, int err
)
8631 dout(10) << "_open_ino_backtrace_fetched ino " << ino
<< " errno " << err
<< dendl
;
8633 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8635 CInode
*in
= get_inode(ino
);
8637 dout(10) << " found cached " << *in
<< dendl
;
8638 open_ino_finish(ino
, info
, in
->authority().first
);
8642 inode_backtrace_t backtrace
;
8645 decode(backtrace
, bl
);
8646 } catch (const buffer::error
&decode_exc
) {
8647 derr
<< "corrupt backtrace on ino x0" << std::hex
<< ino
8648 << std::dec
<< ": " << decode_exc
<< dendl
;
8649 open_ino_finish(ino
, info
, -EIO
);
8652 if (backtrace
.pool
!= info
.pool
&& backtrace
.pool
!= -1) {
8653 dout(10) << " old object in pool " << info
.pool
8654 << ", retrying pool " << backtrace
.pool
<< dendl
;
8655 info
.pool
= backtrace
.pool
;
8656 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8657 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8658 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8659 new C_OnFinisher(fin
, mds
->finisher
));
8662 } else if (err
== -ENOENT
) {
8663 int64_t meta_pool
= mds
->mdsmap
->get_metadata_pool();
8664 if (info
.pool
!= meta_pool
) {
8665 dout(10) << " no object in pool " << info
.pool
8666 << ", retrying pool " << meta_pool
<< dendl
;
8667 info
.pool
= meta_pool
;
8668 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8669 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8670 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8671 new C_OnFinisher(fin
, mds
->finisher
));
8674 err
= 0; // backtrace.ancestors.empty() is checked below
8678 if (backtrace
.ancestors
.empty()) {
8679 dout(10) << " got empty backtrace " << dendl
;
8681 } else if (!info
.ancestors
.empty()) {
8682 if (info
.ancestors
[0] == backtrace
.ancestors
[0]) {
8683 dout(10) << " got same parents " << info
.ancestors
[0] << " 2 times" << dendl
;
8691 dout(0) << " failed to open ino " << ino
<< " err " << err
<< "/" << info
.last_err
<< dendl
;
8693 err
= info
.last_err
;
8694 open_ino_finish(ino
, info
, err
);
8698 dout(10) << " got backtrace " << backtrace
<< dendl
;
8699 info
.ancestors
= backtrace
.ancestors
;
8701 _open_ino_traverse_dir(ino
, info
, 0);
8704 void MDCache::_open_ino_parent_opened(inodeno_t ino
, int ret
)
8706 dout(10) << "_open_ino_parent_opened ino " << ino
<< " ret " << ret
<< dendl
;
8708 open_ino_info_t
& info
= opening_inodes
.at(ino
);
8710 CInode
*in
= get_inode(ino
);
8712 dout(10) << " found cached " << *in
<< dendl
;
8713 open_ino_finish(ino
, info
, in
->authority().first
);
8717 if (ret
== mds
->get_nodeid()) {
8718 _open_ino_traverse_dir(ino
, info
, 0);
8721 mds_rank_t checked_rank
= mds_rank_t(ret
);
8722 info
.check_peers
= true;
8723 info
.auth_hint
= checked_rank
;
8724 info
.checked
.erase(checked_rank
);
8726 do_open_ino(ino
, info
, ret
);
8730 void MDCache::_open_ino_traverse_dir(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8732 dout(10) << __func__
<< ": ino " << ino
<< " ret " << ret
<< dendl
;
8734 CInode
*in
= get_inode(ino
);
8736 dout(10) << " found cached " << *in
<< dendl
;
8737 open_ino_finish(ino
, info
, in
->authority().first
);
8742 do_open_ino(ino
, info
, ret
);
8746 mds_rank_t hint
= info
.auth_hint
;
8747 ret
= open_ino_traverse_dir(ino
, NULL
, info
.ancestors
,
8748 info
.discover
, info
.want_xlocked
, &hint
);
8751 if (hint
!= mds
->get_nodeid())
8752 info
.auth_hint
= hint
;
8753 do_open_ino(ino
, info
, ret
);
8756 void MDCache::_open_ino_fetch_dir(inodeno_t ino
, const MMDSOpenIno::const_ref
&m
, CDir
*dir
, bool parent
)
8758 if (dir
->state_test(CDir::STATE_REJOINUNDEF
))
8759 ceph_assert(dir
->get_inode()->dirfragtree
.is_leaf(dir
->get_frag()));
8760 dir
->fetch(new C_MDC_OpenInoTraverseDir(this, ino
, m
, parent
));
8762 mds
->logger
->inc(l_mds_openino_dir_fetch
);
8765 int MDCache::open_ino_traverse_dir(inodeno_t ino
, const MMDSOpenIno::const_ref
&m
,
8766 const vector
<inode_backpointer_t
>& ancestors
,
8767 bool discover
, bool want_xlocked
, mds_rank_t
*hint
)
8769 dout(10) << "open_ino_traverse_dir ino " << ino
<< " " << ancestors
<< dendl
;
8771 for (unsigned i
= 0; i
< ancestors
.size(); i
++) {
8772 const auto& ancestor
= ancestors
.at(i
);
8773 CInode
*diri
= get_inode(ancestor
.dirino
);
8776 if (discover
&& MDS_INO_IS_MDSDIR(ancestor
.dirino
)) {
8777 open_foreign_mdsdir(ancestor
.dirino
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8783 if (diri
->state_test(CInode::STATE_REJOINUNDEF
)) {
8784 CDir
*dir
= diri
->get_parent_dir();
8785 while (dir
->state_test(CDir::STATE_REJOINUNDEF
) &&
8786 dir
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
))
8787 dir
= dir
->get_inode()->get_parent_dir();
8788 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8792 if (!diri
->is_dir()) {
8793 dout(10) << " " << *diri
<< " is not dir" << dendl
;
8799 const string
& name
= ancestor
.dname
;
8800 frag_t fg
= diri
->pick_dirfrag(name
);
8801 CDir
*dir
= diri
->get_dirfrag(fg
);
8803 if (diri
->is_auth()) {
8804 if (diri
->is_frozen()) {
8805 dout(10) << " " << *diri
<< " is frozen, waiting " << dendl
;
8806 diri
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8809 dir
= diri
->get_or_open_dirfrag(this, fg
);
8810 } else if (discover
) {
8811 open_remote_dirfrag(diri
, fg
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8816 inodeno_t next_ino
= i
> 0 ? ancestors
.at(i
-1).dirino
: ino
;
8817 CDentry
*dn
= dir
->lookup(name
);
8818 CDentry::linkage_t
*dnl
= dn
? dn
->get_linkage() : NULL
;
8819 if (dir
->is_auth()) {
8820 if (dnl
&& dnl
->is_primary() &&
8821 dnl
->get_inode()->state_test(CInode::STATE_REJOINUNDEF
)) {
8822 dout(10) << " fetching undef " << *dnl
->get_inode() << dendl
;
8823 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8827 if (!dnl
&& !dir
->is_complete() &&
8828 (!dir
->has_bloom() || dir
->is_in_bloom(name
))) {
8829 dout(10) << " fetching incomplete " << *dir
<< dendl
;
8830 _open_ino_fetch_dir(ino
, m
, dir
, i
== 0);
8834 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
8837 } else if (discover
) {
8839 filepath
path(name
, 0);
8840 discover_path(dir
, CEPH_NOSNAP
, path
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0),
8841 (i
== 0 && want_xlocked
));
8844 if (dnl
->is_null() && !dn
->lock
.can_read(-1)) {
8845 dout(10) << " null " << *dn
<< " is not readable, waiting" << dendl
;
8846 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDC_OpenInoTraverseDir(this, ino
, m
, i
== 0));
8849 dout(10) << " no ino " << next_ino
<< " in " << *dir
<< dendl
;
8855 *hint
= dir
? dir
->authority().first
: diri
->authority().first
;
// Complete an open-by-ino request: wake every waiter registered for `ino`
// with result `ret` and drop the tracking entry from opening_inodes.
8861 void MDCache::open_ino_finish(inodeno_t ino
, open_ino_info_t
& info
, int ret
)
8863 dout(10) << "open_ino_finish ino " << ino
<< " ret " << ret
<< dendl
;
// Swap the waiter list into a local vector first, so the opening_inodes
// entry (which owns `info`) can be erased before the callbacks run.
8865 MDSContext::vec waiters
;
8866 waiters
.swap(info
.waiters
);
8867 opening_inodes
.erase(ino
);
8868 finish_contexts(g_ceph_context
, waiters
, ret
);
8871 void MDCache::do_open_ino(inodeno_t ino
, open_ino_info_t
& info
, int err
)
8873 if (err
< 0 && err
!= -EAGAIN
) {
8874 info
.checked
.clear();
8875 info
.checking
= MDS_RANK_NONE
;
8876 info
.check_peers
= true;
8877 info
.fetch_backtrace
= true;
8878 if (info
.discover
) {
8879 info
.discover
= false;
8880 info
.ancestors
.clear();
8882 if (err
!= -ENOENT
&& err
!= -ENOTDIR
)
8883 info
.last_err
= err
;
8886 if (info
.check_peers
|| info
.discover
) {
8887 if (info
.discover
) {
8888 // got backtrace from peer, but failed to find inode. re-check peers
8889 info
.discover
= false;
8890 info
.ancestors
.clear();
8891 info
.checked
.clear();
8893 info
.check_peers
= false;
8894 info
.checking
= MDS_RANK_NONE
;
8895 do_open_ino_peer(ino
, info
);
8896 } else if (info
.fetch_backtrace
) {
8897 info
.check_peers
= true;
8898 info
.fetch_backtrace
= false;
8899 info
.checking
= mds
->get_nodeid();
8900 info
.checked
.clear();
8901 C_IO_MDC_OpenInoBacktraceFetched
*fin
=
8902 new C_IO_MDC_OpenInoBacktraceFetched(this, ino
);
8903 fetch_backtrace(ino
, info
.pool
, fin
->bl
,
8904 new C_OnFinisher(fin
, mds
->finisher
));
8906 ceph_assert(!info
.ancestors
.empty());
8907 info
.checking
= mds
->get_nodeid();
8908 open_ino(info
.ancestors
[0].dirino
, mds
->mdsmap
->get_metadata_pool(),
8909 new C_MDC_OpenInoParentOpened(this, ino
), info
.want_replica
);
8913 void MDCache::do_open_ino_peer(inodeno_t ino
, open_ino_info_t
& info
)
8915 set
<mds_rank_t
> all
, active
;
8916 mds
->mdsmap
->get_mds_set(all
);
8917 if (mds
->get_state() == MDSMap::STATE_REJOIN
)
8918 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_REJOIN
);
8920 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
8922 dout(10) << "do_open_ino_peer " << ino
<< " active " << active
8923 << " all " << all
<< " checked " << info
.checked
<< dendl
;
8925 mds_rank_t whoami
= mds
->get_nodeid();
8926 mds_rank_t peer
= MDS_RANK_NONE
;
8927 if (info
.auth_hint
>= 0 && info
.auth_hint
!= whoami
) {
8928 if (active
.count(info
.auth_hint
)) {
8929 peer
= info
.auth_hint
;
8930 info
.auth_hint
= MDS_RANK_NONE
;
8933 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
8934 if (*p
!= whoami
&& info
.checked
.count(*p
) == 0) {
8941 if (all
!= info
.checked
) {
8942 dout(10) << " waiting for more peers to be active" << dendl
;
8944 dout(10) << " all MDS peers have been checked " << dendl
;
8945 do_open_ino(ino
, info
, 0);
8948 info
.checking
= peer
;
8949 vector
<inode_backpointer_t
> *pa
= NULL
;
8950 // got backtrace from peer or backtrace just fetched
8951 if (info
.discover
|| !info
.fetch_backtrace
)
8952 pa
= &info
.ancestors
;
8953 mds
->send_message_mds(MMDSOpenIno::create(info
.tid
, ino
, pa
), peer
);
8955 mds
->logger
->inc(l_mds_openino_peer_discover
);
8959 void MDCache::handle_open_ino(const MMDSOpenIno::const_ref
&m
, int err
)
8961 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
8962 mds
->get_want_state() != CEPH_MDS_STATE_REJOIN
) {
8966 dout(10) << "handle_open_ino " << *m
<< " err " << err
<< dendl
;
8968 auto from
= mds_rank_t(m
->get_source().num());
8969 inodeno_t ino
= m
->ino
;
8970 MMDSOpenInoReply::ref reply
;
8971 CInode
*in
= get_inode(ino
);
8973 dout(10) << " have " << *in
<< dendl
;
8974 reply
= MMDSOpenInoReply::create(m
->get_tid(), ino
, mds_rank_t(0));
8975 if (in
->is_auth()) {
8978 CDentry
*pdn
= in
->get_parent_dn();
8981 CInode
*diri
= pdn
->get_dir()->get_inode();
8982 reply
->ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(),
8983 in
->inode
.version
));
8987 reply
->hint
= in
->authority().first
;
8989 } else if (err
< 0) {
8990 reply
= MMDSOpenInoReply::create(m
->get_tid(), ino
, MDS_RANK_NONE
, err
);
8992 mds_rank_t hint
= MDS_RANK_NONE
;
8993 int ret
= open_ino_traverse_dir(ino
, m
, m
->ancestors
, false, false, &hint
);
8996 reply
= MMDSOpenInoReply::create(m
->get_tid(), ino
, hint
, ret
);
8998 mds
->send_message_mds(reply
, from
);
9001 void MDCache::handle_open_ino_reply(const MMDSOpenInoReply::const_ref
&m
)
9003 dout(10) << "handle_open_ino_reply " << *m
<< dendl
;
9005 inodeno_t ino
= m
->ino
;
9006 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9007 auto it
= opening_inodes
.find(ino
);
9008 if (it
!= opening_inodes
.end() && it
->second
.checking
== from
) {
9009 open_ino_info_t
& info
= it
->second
;
9010 info
.checking
= MDS_RANK_NONE
;
9011 info
.checked
.insert(from
);
9013 CInode
*in
= get_inode(ino
);
9015 dout(10) << " found cached " << *in
<< dendl
;
9016 open_ino_finish(ino
, info
, in
->authority().first
);
9017 } else if (!m
->ancestors
.empty()) {
9018 dout(10) << " found ino " << ino
<< " on mds." << from
<< dendl
;
9019 if (!info
.want_replica
) {
9020 open_ino_finish(ino
, info
, from
);
9024 info
.ancestors
= m
->ancestors
;
9025 info
.auth_hint
= from
;
9026 info
.checking
= mds
->get_nodeid();
9027 info
.discover
= true;
9028 _open_ino_traverse_dir(ino
, info
, 0);
9029 } else if (m
->error
) {
9030 dout(10) << " error " << m
->error
<< " from mds." << from
<< dendl
;
9031 do_open_ino(ino
, info
, m
->error
);
9033 if (m
->hint
>= 0 && m
->hint
!= mds
->get_nodeid()) {
9034 info
.auth_hint
= m
->hint
;
9035 info
.checked
.erase(m
->hint
);
9037 do_open_ino_peer(ino
, info
);
// An MDS rank changed state (e.g. failed or became active): re-drive every
// in-flight open-by-ino request that was either being checked on `who` or
// was waiting with no rank currently being checked.
9042 void MDCache::kick_open_ino_peers(mds_rank_t who
)
9044 dout(10) << "kick_open_ino_peers mds." << who
<< dendl
;
9046 for (map
<inodeno_t
, open_ino_info_t
>::iterator p
= opening_inodes
.begin();
9047 p
!= opening_inodes
.end();
9049 open_ino_info_t
& info
= p
->second
;
// This request was querying the affected rank: forget that query and
// pick a (possibly different) peer again.
9050 if (info
.checking
== who
) {
9051 dout(10) << " kicking ino " << p
->first
<< " who was checking mds." << who
<< dendl
;
9052 info
.checking
= MDS_RANK_NONE
;
9053 do_open_ino_peer(p
->first
, info
);
// No query outstanding: the request was stalled waiting for more peers
// to become active — retry now that the MDS set changed.
9054 } else if (info
.checking
== MDS_RANK_NONE
) {
9055 dout(10) << " kicking ino " << p
->first
<< " who was waiting" << dendl
;
9056 do_open_ino_peer(p
->first
, info
);
9061 void MDCache::open_ino(inodeno_t ino
, int64_t pool
, MDSContext
* fin
,
9062 bool want_replica
, bool want_xlocked
)
9064 dout(10) << "open_ino " << ino
<< " pool " << pool
<< " want_replica "
9065 << want_replica
<< dendl
;
9067 auto it
= opening_inodes
.find(ino
);
9068 if (it
!= opening_inodes
.end()) {
9069 open_ino_info_t
& info
= it
->second
;
9071 info
.want_replica
= true;
9072 if (want_xlocked
&& !info
.want_xlocked
) {
9073 if (!info
.ancestors
.empty()) {
9074 CInode
*diri
= get_inode(info
.ancestors
[0].dirino
);
9076 frag_t fg
= diri
->pick_dirfrag(info
.ancestors
[0].dname
);
9077 CDir
*dir
= diri
->get_dirfrag(fg
);
9078 if (dir
&& !dir
->is_auth()) {
9079 filepath
path(info
.ancestors
[0].dname
, 0);
9080 discover_path(dir
, CEPH_NOSNAP
, path
, NULL
, true);
9084 info
.want_xlocked
= true;
9087 info
.waiters
.push_back(fin
);
9089 open_ino_info_t
& info
= opening_inodes
[ino
];
9090 info
.want_replica
= want_replica
;
9091 info
.want_xlocked
= want_xlocked
;
9092 info
.tid
= ++open_ino_last_tid
;
9093 info
.pool
= pool
>= 0 ? pool
: default_file_layout
.pool_id
;
9094 info
.waiters
.push_back(fin
);
9095 if (mds
->is_rejoin() &&
9096 open_file_table
.get_ancestors(ino
, info
.ancestors
, info
.auth_hint
)) {
9097 info
.fetch_backtrace
= false;
9098 info
.checking
= mds
->get_nodeid();
9099 _open_ino_traverse_dir(ino
, info
, 0);
9101 do_open_ino(ino
, info
, 0);
9106 /* ---------------------------- */
9109 * search for a given inode on MDS peers. optionally start with the given node.
9113 - recover from mds node failure, recovery
// Ask peer MDS ranks where inode `ino` lives; `c` is completed when the
// search finishes (with -ESTALE on failure). `hint` optionally names the
// rank to try first.
9117 void MDCache::find_ino_peers(inodeno_t ino
, MDSContext
*c
, mds_rank_t hint
)
9119 dout(5) << "find_ino_peers " << ino
<< " hint " << hint
<< dendl
;
// A locally cached inode that is being purged is already stale — fail the
// caller immediately rather than querying peers.
9120 CInode
*in
= get_inode(ino
)
;
9121 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
9122 c
->complete(-ESTALE
);
// Register a tracked search keyed by a fresh tid, then kick off the
// first peer query.
9127 ceph_tid_t tid
= ++find_ino_peer_last_tid
;
9128 find_ino_peer_info_t
& fip
= find_ino_peer
[tid
];
9133 _do_find_ino_peer(fip
);
9136 void MDCache::_do_find_ino_peer(find_ino_peer_info_t
& fip
)
9138 set
<mds_rank_t
> all
, active
;
9139 mds
->mdsmap
->get_mds_set(all
);
9140 mds
->mdsmap
->get_mds_set_lower_bound(active
, MDSMap::STATE_CLIENTREPLAY
);
9142 dout(10) << "_do_find_ino_peer " << fip
.tid
<< " " << fip
.ino
9143 << " active " << active
<< " all " << all
9144 << " checked " << fip
.checked
9147 mds_rank_t m
= MDS_RANK_NONE
;
9148 if (fip
.hint
>= 0) {
9150 fip
.hint
= MDS_RANK_NONE
;
9152 for (set
<mds_rank_t
>::iterator p
= active
.begin(); p
!= active
.end(); ++p
)
9153 if (*p
!= mds
->get_nodeid() &&
9154 fip
.checked
.count(*p
) == 0) {
9159 if (m
== MDS_RANK_NONE
) {
9160 all
.erase(mds
->get_nodeid());
9161 if (all
!= fip
.checked
) {
9162 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl
;
9164 dout(10) << "_do_find_ino_peer failed on " << fip
.ino
<< dendl
;
9165 fip
.fin
->complete(-ESTALE
);
9166 find_ino_peer
.erase(fip
.tid
);
9170 mds
->send_message_mds(MMDSFindIno::create(fip
.tid
, fip
.ino
), m
);
// Peer-side handler for an MMDSFindIno query: reply with the inode's path
// if we have it cached. Messages are not serviced before rejoin.
9174 void MDCache::handle_find_ino(const MMDSFindIno::const_ref
&m
)
9176 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
9180 dout(10) << "handle_find_ino " << *m
<< dendl
;
// The reply carries the query tid so the requester can match it up; an
// empty path means "not here".
9181 auto r
= MMDSFindInoReply::create(m
->tid
);
9182 CInode
*in
= get_inode(m
->ino
);
// If the inode is in cache, encode its full path into the reply.
9184 in
->make_path(r
->path
);
9185 dout(10) << " have " << r
->path
<< " " << *in
<< dendl
;
9187 mds
->send_message_mds(r
, mds_rank_t(m
->get_source().num()));
9191 void MDCache::handle_find_ino_reply(const MMDSFindInoReply::const_ref
&m
)
9193 map
<ceph_tid_t
, find_ino_peer_info_t
>::iterator p
= find_ino_peer
.find(m
->tid
);
9194 if (p
!= find_ino_peer
.end()) {
9195 dout(10) << "handle_find_ino_reply " << *m
<< dendl
;
9196 find_ino_peer_info_t
& fip
= p
->second
;
9199 if (get_inode(fip
.ino
)) {
9200 dout(10) << "handle_find_ino_reply successfully found " << fip
.ino
<< dendl
;
9201 mds
->queue_waiter(fip
.fin
);
9202 find_ino_peer
.erase(p
);
9206 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9207 if (fip
.checking
== from
)
9208 fip
.checking
= MDS_RANK_NONE
;
9209 fip
.checked
.insert(from
);
9211 if (!m
->path
.empty()) {
9213 vector
<CDentry
*> trace
;
9214 CF_MDS_RetryMessageFactory
cf(mds
, m
);
9215 MDRequestRef null_ref
;
9216 int r
= path_traverse(null_ref
, cf
, m
->path
, &trace
, NULL
, MDS_TRAVERSE_DISCOVER
);
9219 dout(0) << "handle_find_ino_reply failed with " << r
<< " on " << m
->path
9220 << ", retrying" << dendl
;
9221 fip
.checked
.clear();
9222 _do_find_ino_peer(fip
);
9225 _do_find_ino_peer(fip
);
9228 dout(10) << "handle_find_ino_reply tid " << m
->tid
<< " dne" << dendl
;
// An MDS rank changed state: retry every find-ino search that was checking
// that rank, or that was idle waiting for more peers to become active.
9232 void MDCache::kick_find_ino_peers(mds_rank_t who
)
9234 // find_ino_peers requests we should move on from
9235 for (map
<ceph_tid_t
,find_ino_peer_info_t
>::iterator p
= find_ino_peer
.begin();
9236 p
!= find_ino_peer
.end();
9238 find_ino_peer_info_t
& fip
= p
->second
;
// Search was querying the affected rank — abandon that query and retry.
9239 if (fip
.checking
== who
) {
9240 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was checking mds." << who
<< dendl
;
9241 fip
.checking
= MDS_RANK_NONE
;
9242 _do_find_ino_peer(fip
);
// Search was stalled with no query outstanding — retry now.
9243 } else if (fip
.checking
== MDS_RANK_NONE
) {
9244 dout(10) << "kicking find_ino_peer " << fip
.tid
<< " who was waiting" << dendl
;
9245 _do_find_ino_peer(fip
);
9250 /* ---------------------------- */
9252 int MDCache::get_num_client_requests()
9255 for (ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.begin();
9256 p
!= active_requests
.end();
9258 MDRequestRef
& mdr
= p
->second
;
9259 if (mdr
->reqid
.name
.is_client() && !mdr
->is_slave())
9265 MDRequestRef
MDCache::request_start(const MClientRequest::const_ref
& req
)
9267 // did we win a forward race against a slave?
9268 if (active_requests
.count(req
->get_reqid())) {
9269 MDRequestRef
& mdr
= active_requests
[req
->get_reqid()];
9271 if (mdr
->is_slave()) {
9272 dout(10) << "request_start already had " << *mdr
<< ", waiting for finish" << dendl
;
9273 mdr
->more()->waiting_for_finish
.push_back(new C_MDS_RetryMessage(mds
, req
));
9275 dout(10) << "request_start already processing " << *mdr
<< ", dropping new msg" << dendl
;
9277 return MDRequestRef();
9280 // register new client request
9281 MDRequestImpl::Params params
;
9282 params
.reqid
= req
->get_reqid();
9283 params
.attempt
= req
->get_num_fwd();
9284 params
.client_req
= req
;
9285 params
.initiated
= req
->get_recv_stamp();
9286 params
.throttled
= req
->get_throttle_stamp();
9287 params
.all_read
= req
->get_recv_complete_stamp();
9288 params
.dispatched
= req
->get_dispatch_stamp();
9291 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9292 active_requests
[params
.reqid
] = mdr
;
9293 mdr
->set_op_stamp(req
->get_stamp());
9294 dout(7) << "request_start " << *mdr
<< dendl
;
9298 MDRequestRef
MDCache::request_start_slave(metareqid_t ri
, __u32 attempt
, const Message::const_ref
&m
)
9300 int by
= m
->get_source().num();
9301 MDRequestImpl::Params params
;
9303 params
.attempt
= attempt
;
9304 params
.triggering_slave_req
= m
;
9305 params
.slave_to
= by
;
9306 params
.initiated
= m
->get_recv_stamp();
9307 params
.throttled
= m
->get_throttle_stamp();
9308 params
.all_read
= m
->get_recv_complete_stamp();
9309 params
.dispatched
= m
->get_dispatch_stamp();
9311 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9312 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9313 active_requests
[mdr
->reqid
] = mdr
;
9314 dout(7) << "request_start_slave " << *mdr
<< " by mds." << by
<< dendl
;
9318 MDRequestRef
MDCache::request_start_internal(int op
)
9320 utime_t now
= ceph_clock_now();
9321 MDRequestImpl::Params params
;
9322 params
.reqid
.name
= entity_name_t::MDS(mds
->get_nodeid());
9323 params
.reqid
.tid
= mds
->issue_tid();
9324 params
.initiated
= now
;
9325 params
.throttled
= now
;
9326 params
.all_read
= now
;
9327 params
.dispatched
= now
;
9328 params
.internal_op
= op
;
9330 mds
->op_tracker
.create_request
<MDRequestImpl
,MDRequestImpl::Params
*>(¶ms
);
9332 ceph_assert(active_requests
.count(mdr
->reqid
) == 0);
9333 active_requests
[mdr
->reqid
] = mdr
;
9334 dout(7) << "request_start_internal " << *mdr
<< " op " << op
<< dendl
;
// Fetch the tracked MDRequest for `rid`. The request must be registered in
// active_requests — callers use this only for requests they know exist.
9338 MDRequestRef
MDCache::request_get(metareqid_t rid
)
9340 ceph::unordered_map
<metareqid_t
, MDRequestRef
>::iterator p
= active_requests
.find(rid
);
9341 ceph_assert(p
!= active_requests
.end());
9342 dout(7) << "request_get " << rid
<< " " << *p
->second
<< dendl
;
9346 void MDCache::request_finish(MDRequestRef
& mdr
)
9348 dout(7) << "request_finish " << *mdr
<< dendl
;
9349 mdr
->mark_event("finishing request");
9352 if (mdr
->has_more() && mdr
->more()->slave_commit
) {
9353 Context
*fin
= mdr
->more()->slave_commit
;
9354 mdr
->more()->slave_commit
= 0;
9357 mdr
->aborted
= false;
9359 mdr
->more()->slave_rolling_back
= true;
9362 mdr
->committing
= true;
9364 fin
->complete(ret
); // this must re-call request_finish.
9368 switch(mdr
->internal_op
) {
9369 case CEPH_MDS_OP_FRAGMENTDIR
:
9370 logger
->inc(l_mdss_ireq_fragmentdir
);
9372 case CEPH_MDS_OP_EXPORTDIR
:
9373 logger
->inc(l_mdss_ireq_exportdir
);
9375 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9376 logger
->inc(l_mdss_ireq_enqueue_scrub
);
9378 case CEPH_MDS_OP_FLUSH
:
9379 logger
->inc(l_mdss_ireq_flush
);
9381 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9382 logger
->inc(l_mdss_ireq_fragstats
);
9384 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9385 logger
->inc(l_mdss_ireq_inodestats
);
9389 request_cleanup(mdr
);
9393 void MDCache::request_forward(MDRequestRef
& mdr
, mds_rank_t who
, int port
)
9395 mdr
->mark_event("forwarding request");
9396 if (mdr
->client_request
&& mdr
->client_request
->get_source().is_client()) {
9397 dout(7) << "request_forward " << *mdr
<< " to mds." << who
<< " req "
9398 << *mdr
->client_request
<< dendl
;
9399 mds
->forward_message_mds(mdr
->release_client_request(), who
);
9400 if (mds
->logger
) mds
->logger
->inc(l_mds_forward
);
9401 } else if (mdr
->internal_op
>= 0) {
9402 dout(10) << "request_forward on internal op; cancelling" << dendl
;
9403 mdr
->internal_op_finish
->complete(-EXDEV
);
9405 dout(7) << "request_forward drop " << *mdr
<< " req " << *mdr
->client_request
9406 << " was from mds" << dendl
;
9408 request_cleanup(mdr
);
9412 void MDCache::dispatch_request(MDRequestRef
& mdr
)
9414 if (mdr
->client_request
) {
9415 mds
->server
->dispatch_client_request(mdr
);
9416 } else if (mdr
->slave_request
) {
9417 mds
->server
->dispatch_slave_request(mdr
);
9419 switch (mdr
->internal_op
) {
9420 case CEPH_MDS_OP_FRAGMENTDIR
:
9421 dispatch_fragment_dir(mdr
);
9423 case CEPH_MDS_OP_EXPORTDIR
:
9424 migrator
->dispatch_export_dir(mdr
, 0);
9426 case CEPH_MDS_OP_ENQUEUE_SCRUB
:
9427 enqueue_scrub_work(mdr
);
9429 case CEPH_MDS_OP_FLUSH
:
9430 flush_dentry_work(mdr
);
9432 case CEPH_MDS_OP_REPAIR_FRAGSTATS
:
9433 repair_dirfrag_stats_work(mdr
);
9435 case CEPH_MDS_OP_REPAIR_INODESTATS
:
9436 repair_inode_stats_work(mdr
);
9438 case CEPH_MDS_OP_UPGRADE_SNAPREALM
:
9439 upgrade_inode_snaprealm_work(mdr
);
9448 void MDCache::request_drop_foreign_locks(MDRequestRef
& mdr
)
9450 if (!mdr
->has_more())
9454 // (will implicitly drop remote dn pins)
9455 for (set
<mds_rank_t
>::iterator p
= mdr
->more()->slaves
.begin();
9456 p
!= mdr
->more()->slaves
.end();
9458 auto r
= MMDSSlaveRequest::create(mdr
->reqid
, mdr
->attempt
,
9459 MMDSSlaveRequest::OP_FINISH
);
9461 if (mdr
->killed
&& !mdr
->committing
) {
9463 } else if (mdr
->more()->srcdn_auth_mds
== *p
&&
9464 mdr
->more()->inode_import
.length() > 0) {
9465 // information about rename imported caps
9466 r
->inode_export
.claim(mdr
->more()->inode_import
);
9469 mds
->send_message_mds(r
, *p
);
9472 /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
9473 * implicitly. Note that we don't call the finishers -- there shouldn't
9474 * be any on a remote lock and the request finish wakes up all
9475 * the waiters anyway! */
9477 for (auto it
= mdr
->locks
.begin(); it
!= mdr
->locks
.end(); ) {
9478 SimpleLock
*lock
= it
->lock
;
9479 if (it
->is_xlock() && !lock
->get_parent()->is_auth()) {
9480 dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
9481 << " on " << lock
->get_parent() << dendl
;
9483 mdr
->locks
.erase(it
++);
9484 } else if (it
->is_remote_wrlock()) {
9485 dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
9486 << " on mds." << it
->wrlock_target
<< " on " << *lock
->get_parent() << dendl
;
9487 if (it
->is_wrlock()) {
9488 it
->clear_remote_wrlock();
9491 mdr
->locks
.erase(it
++);
9498 mdr
->more()->slaves
.clear(); /* we no longer have requests out to them, and
9499 * leaving them in can cause double-notifies as
9500 * this function can get called more than once */
// Drop all of `mdr`'s locks except rdlocks. Foreign (remote-MDS) locks are
// released first via OP_FINISH messages; then the local Locker drops the
// remaining non-rdlocks.
9503 void MDCache::request_drop_non_rdlocks(MDRequestRef
& mdr
)
9505 request_drop_foreign_locks(mdr
);
9506 mds
->locker
->drop_non_rdlocks(mdr
.get());
// Drop ALL of `mdr`'s locks: release foreign (remote-MDS) locks first via
// OP_FINISH messages, then have the local Locker drop everything else.
9509 void MDCache::request_drop_locks(MDRequestRef
& mdr
)
9511 request_drop_foreign_locks(mdr
);
9512 mds
->locker
->drop_locks(mdr
.get());
9515 void MDCache::request_cleanup(MDRequestRef
& mdr
)
9517 dout(15) << "request_cleanup " << *mdr
<< dendl
;
9519 if (mdr
->has_more()) {
9520 if (mdr
->more()->is_ambiguous_auth
)
9521 mdr
->clear_ambiguous_auth();
9522 if (!mdr
->more()->waiting_for_finish
.empty())
9523 mds
->queue_waiters(mdr
->more()->waiting_for_finish
);
9526 request_drop_locks(mdr
);
9528 // drop (local) auth pins
9529 mdr
->drop_local_auth_pins();
9532 mdr
->put_stickydirs();
9534 mds
->locker
->kick_cap_releases(mdr
);
9539 // remove from session
9540 mdr
->item_session_request
.remove_myself();
9543 active_requests
.erase(mdr
->reqid
);
9548 mdr
->mark_event("cleaned up request");
9551 void MDCache::request_kill(MDRequestRef
& mdr
)
9553 // rollback slave requests is tricky. just let the request proceed.
9554 if (mdr
->has_more() &&
9555 (!mdr
->more()->witnessed
.empty() || !mdr
->more()->waiting_on_slave
.empty())) {
9556 if (!mdr
->done_locking
) {
9557 ceph_assert(mdr
->more()->witnessed
.empty());
9558 mdr
->aborted
= true;
9559 dout(10) << "request_kill " << *mdr
<< " -- waiting for slave reply, delaying" << dendl
;
9561 dout(10) << "request_kill " << *mdr
<< " -- already started slave prep, no-op" << dendl
;
9564 ceph_assert(mdr
->used_prealloc_ino
== 0);
9565 ceph_assert(mdr
->prealloc_inos
.empty());
9567 mdr
->session
= NULL
;
9568 mdr
->item_session_request
.remove_myself();
9573 mdr
->mark_event("killing request");
9575 if (mdr
->committing
) {
9576 dout(10) << "request_kill " << *mdr
<< " -- already committing, no-op" << dendl
;
9578 dout(10) << "request_kill " << *mdr
<< dendl
;
9579 request_cleanup(mdr
);
9583 // -------------------------------------------------------------------------------
// Build the MDS-wide global snaprealm. It hangs off a dummy unlinked
// system inode (MDS_INO_GLOBAL_SNAPREALM) created as a 0755 directory;
// create_unlinked_system_inode registers the inode with the cache.
9586 void MDCache::create_global_snaprealm()
9588 CInode
*in
= new CInode(this); // dummy inode
9589 create_unlinked_system_inode(in
, MDS_INO_GLOBAL_SNAPREALM
, S_IFDIR
|0755);
// Cache the realm pointer for notify_global_snaprealm_update() etc.
9591 global_snaprealm
= in
->snaprealm
;
9594 void MDCache::do_realm_invalidate_and_update_notify(CInode
*in
, int snapop
, bool notify_clients
)
9596 dout(10) << "do_realm_invalidate_and_update_notify " << *in
->snaprealm
<< " " << *in
<< dendl
;
9598 vector
<inodeno_t
> split_inos
;
9599 vector
<inodeno_t
> split_realms
;
9601 if (notify_clients
) {
9602 ceph_assert(in
->snaprealm
->have_past_parents_open());
9603 if (snapop
== CEPH_SNAP_OP_SPLIT
) {
9604 // notify clients of update|split
9605 for (elist
<CInode
*>::iterator p
= in
->snaprealm
->inodes_with_caps
.begin(member_offset(CInode
, item_caps
));
9607 split_inos
.push_back((*p
)->ino());
9609 for (set
<SnapRealm
*>::iterator p
= in
->snaprealm
->open_children
.begin();
9610 p
!= in
->snaprealm
->open_children
.end();
9612 split_realms
.push_back((*p
)->inode
->ino());
9616 set
<SnapRealm
*> past_children
;
9617 map
<client_t
, MClientSnap::ref
> updates
;
9619 q
.push_back(in
->snaprealm
);
9620 while (!q
.empty()) {
9621 SnapRealm
*realm
= q
.front();
9624 dout(10) << " realm " << *realm
<< " on " << *realm
->inode
<< dendl
;
9625 realm
->invalidate_cached_snaps();
9627 if (notify_clients
) {
9628 for (const auto& p
: realm
->client_caps
) {
9629 const auto& client
= p
.first
;
9630 const auto& caps
= p
.second
;
9631 ceph_assert(!caps
->empty());
9633 auto em
= updates
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
), std::forward_as_tuple());
9635 auto update
= MClientSnap::create(CEPH_SNAP_OP_SPLIT
);
9636 update
->head
.split
= in
->ino();
9637 update
->split_inos
= split_inos
;
9638 update
->split_realms
= split_realms
;
9639 update
->bl
= in
->snaprealm
->get_snap_trace();
9640 em
.first
->second
= std::move(update
);
9645 if (snapop
== CEPH_SNAP_OP_UPDATE
|| snapop
== CEPH_SNAP_OP_DESTROY
) {
9646 for (set
<SnapRealm
*>::iterator p
= realm
->open_past_children
.begin();
9647 p
!= realm
->open_past_children
.end();
9649 past_children
.insert(*p
);
9652 // notify for active children, too.
9653 dout(10) << " " << realm
<< " open_children are " << realm
->open_children
<< dendl
;
9654 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
9655 p
!= realm
->open_children
.end();
9661 send_snaps(updates
);
9663 // notify past children and their descendants if we update/delete old snapshots
9664 for (set
<SnapRealm
*>::iterator p
= past_children
.begin();
9665 p
!= past_children
.end();
9669 while (!q
.empty()) {
9670 SnapRealm
*realm
= q
.front();
9673 realm
->invalidate_cached_snaps();
9675 for (set
<SnapRealm
*>::iterator p
= realm
->open_children
.begin();
9676 p
!= realm
->open_children
.end();
9678 if (past_children
.count(*p
) == 0)
9682 for (set
<SnapRealm
*>::iterator p
= realm
->open_past_children
.begin();
9683 p
!= realm
->open_past_children
.end();
9685 if (past_children
.count(*p
) == 0) {
9687 past_children
.insert(*p
);
9692 if (snapop
== CEPH_SNAP_OP_DESTROY
) {
9693 // eval stray inodes if we delete snapshot from their past ancestor snaprealm
9694 for (set
<SnapRealm
*>::iterator p
= past_children
.begin();
9695 p
!= past_children
.end();
9697 maybe_eval_stray((*p
)->inode
, true);
9701 void MDCache::send_snap_update(CInode
*in
, version_t stid
, int snap_op
)
9703 dout(10) << __func__
<< " " << *in
<< " stid " << stid
<< dendl
;
9704 ceph_assert(in
->is_auth());
9706 set
<mds_rank_t
> mds_set
;
9708 mds
->mdsmap
->get_mds_set_lower_bound(mds_set
, MDSMap::STATE_RESOLVE
);
9709 mds_set
.erase(mds
->get_nodeid());
9711 in
->list_replicas(mds_set
);
9714 if (!mds_set
.empty()) {
9715 bufferlist snap_blob
;
9716 in
->encode_snap(snap_blob
);
9718 for (auto p
: mds_set
) {
9719 auto m
= MMDSSnapUpdate::create(in
->ino(), stid
, snap_op
);
9720 m
->snap_blob
= snap_blob
;
9721 mds
->send_message_mds(m
, p
);
9726 notify_global_snaprealm_update(snap_op
);
9729 void MDCache::handle_snap_update(const MMDSSnapUpdate::const_ref
&m
)
9731 mds_rank_t from
= mds_rank_t(m
->get_source().num());
9732 dout(10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
9734 if (mds
->get_state() < MDSMap::STATE_RESOLVE
&&
9735 mds
->get_want_state() != CEPH_MDS_STATE_RESOLVE
) {
9739 // null rejoin_done means open_snaprealms() has already been called
9740 bool notify_clients
= mds
->get_state() > MDSMap::STATE_REJOIN
||
9741 (mds
->is_rejoin() && !rejoin_done
);
9743 if (m
->get_tid() > 0) {
9744 mds
->snapclient
->notify_commit(m
->get_tid());
9746 notify_global_snaprealm_update(m
->get_snap_op());
9749 CInode
*in
= get_inode(m
->get_ino());
9751 ceph_assert(!in
->is_auth());
9752 if (mds
->get_state() > MDSMap::STATE_REJOIN
||
9753 (mds
->is_rejoin() && !in
->is_rejoining())) {
9754 auto p
= m
->snap_blob
.cbegin();
9757 if (!notify_clients
) {
9758 if (!rejoin_pending_snaprealms
.count(in
)) {
9759 in
->get(CInode::PIN_OPENINGSNAPPARENTS
);
9760 rejoin_pending_snaprealms
.insert(in
);
9763 do_realm_invalidate_and_update_notify(in
, m
->get_snap_op(), notify_clients
);
// Broadcast a global-snaprealm change to every connected client session.
// Any operation other than DESTROY is reported to clients as UPDATE.
9768 void MDCache::notify_global_snaprealm_update(int snap_op
)
9770 if (snap_op
!= CEPH_SNAP_OP_DESTROY
)
9771 snap_op
= CEPH_SNAP_OP_UPDATE
;
9772 set
<Session
*> sessions
;
9773 mds
->sessionmap
.get_client_session_set(sessions
);
9774 for (auto &session
: sessions
) {
// Only sessions that are open or stale get notified.
9775 if (!session
->is_open() && !session
->is_stale())
// Each client receives the realm's root ino plus the full snap trace.
9777 auto update
= MClientSnap::create(snap_op
);
9778 update
->head
.split
= global_snaprealm
->inode
->ino();
9779 update
->bl
= global_snaprealm
->get_snap_trace();
9780 mds
->send_message_client_counted(update
, session
);
9784 // -------------------------------------------------------------------------------
// Completion context that resumes the stray-directory scan at dirfrag
// `next` (used after an incomplete dirfrag has been fetched).
9787 struct C_MDC_RetryScanStray
: public MDCacheContext
{
9789 C_MDC_RetryScanStray(MDCache
*c
, dirfrag_t n
) : MDCacheContext(c
), next(n
) { }
9790 void finish(int r
) override
{
9791 mdcache
->scan_stray_dir(next
);
9795 void MDCache::scan_stray_dir(dirfrag_t next
)
9797 dout(10) << "scan_stray_dir " << next
<< dendl
;
9800 for (int i
= 0; i
< NUM_STRAY
; ++i
) {
9801 if (strays
[i
]->ino() < next
.ino
)
9803 strays
[i
]->get_dirfrags(ls
);
9806 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
9808 if (dir
->dirfrag() < next
)
9810 if (!dir
->is_complete()) {
9811 dir
->fetch(new C_MDC_RetryScanStray(this, dir
->dirfrag()));
9814 for (auto &p
: dir
->items
) {
9815 CDentry
*dn
= p
.second
;
9816 dn
->state_set(CDentry::STATE_STRAY
);
9817 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
9818 if (dnl
->is_primary()) {
9819 CInode
*in
= dnl
->get_inode();
9820 if (in
->inode
.nlink
== 0)
9821 in
->state_set(CInode::STATE_ORPHAN
);
9822 maybe_eval_stray(in
);
// Asynchronously read inode `ino`'s backtrace: the "parent" xattr stored
// on the inode's head object in RADOS pool `pool`. The raw xattr bytes are
// written into the caller-owned `bl` and `fin` fires on completion, so
// `bl` must outlive the operation.
9828 void MDCache::fetch_backtrace(inodeno_t ino
, int64_t pool
, bufferlist
& bl
, Context
*fin
)
9830 object_t oid
= CInode::get_object_name(ino
, frag_t(), "");
9831 mds
->objecter
->getxattr(oid
, object_locator_t(pool
), "parent", CEPH_NOSNAP
, &bl
, 0, fin
);
// Account the fetch in the open-ino perf counters.
9833 mds
->logger
->inc(l_mds_openino_backtrace_fetch
);
9840 // ========================================================================================
9844 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
9845 to the parent metadata object in the cache (pinning it).
9847 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
// Materialize a queued discover_info_t into an MDiscover message and send
// it to the target rank. The tid is attached so late/duplicate replies can
// be recognized and ignored by the reply handler.
9851 void MDCache::_send_discover(discover_info_t
& d
)
9853 auto dis
= MDiscover::create(d
.ino
, d
.frag
, d
.snap
, d
.want_path
, d
.want_base_dir
, d
.want_xlocked
);
9854 dis
->set_tid(d
.tid
);
9855 mds
->send_message_mds(dis
, d
.mds
);
// Discover a base inode (e.g. root or a stray/mdsdir — one with no parent
// dentry to traverse) from rank `from`. A new discover is issued only if
// none is already pending for this (from, want_ino) pair; the waiter is
// queued either way and completed when the inode arrives.
9858 void MDCache::discover_base_ino(inodeno_t want_ino
,
9859 MDSContext
*onfinish
,
9862 dout(7) << "discover_base_ino " << want_ino
<< " from mds." << from
<< dendl
;
9863 if (waiting_for_base_ino
[from
].count(want_ino
) == 0) {
9864 discover_info_t
& d
= _create_discover(from
);
9868 waiting_for_base_ino
[from
][want_ino
].push_back(onfinish
);
9872 void MDCache::discover_dir_frag(CInode
*base
,
9874 MDSContext
*onfinish
,
9878 from
= base
->authority().first
;
9880 dirfrag_t
df(base
->ino(), approx_fg
);
9881 dout(7) << "discover_dir_frag " << df
9882 << " from mds." << from
<< dendl
;
9884 if (!base
->is_waiting_for_dir(approx_fg
) || !onfinish
) {
9885 discover_info_t
& d
= _create_discover(from
);
9887 d
.ino
= base
->ino();
9889 d
.want_base_dir
= true;
9894 base
->add_dir_waiter(approx_fg
, onfinish
);
// Retry context for discover_path(CInode*, ...): re-issues the discover
// with the captured arguments once the blocking condition (e.g. ambiguous
// auth on the base inode) clears. Passes a null onfinish (0) — the
// original waiter was already queued.
9897 struct C_MDC_RetryDiscoverPath
: public MDCacheContext
{
9902 C_MDC_RetryDiscoverPath(MDCache
*c
, CInode
*b
, snapid_t s
, filepath
&p
, mds_rank_t f
) :
9903 MDCacheContext(c
), base(b
), snapid(s
), path(p
), from(f
) {}
9904 void finish(int r
) override
{
9905 mdcache
->discover_path(base
, snapid
, path
, 0, from
);
// Discover the dentries/inodes along want_path starting at inode `base`.
// Handles three situations:
//   1. base has ambiguous auth  -> wait for WAIT_SINGLEAUTH, then retry;
//   2. we ourselves are auth    -> just kick WAIT_DIR waiters (nothing to
//      fetch remotely);
//   3. otherwise                -> pick the dirfrag for the first path
//      component and send a discover unless an equivalent one is already
//      in flight (dedup via is_waiting_for_dir), registering onfinish as
//      a dir waiter either way.
// want_xlocked with a single-component path forces a send even when a
// waiter exists, so xlocked-tail discovery is not coalesced away.
// NOTE(review): elided extraction -- some original lines are absent here.
9909 void MDCache::discover_path(CInode
*base
,
9912 MDSContext
*onfinish
,
9917 from
= base
->authority().first
;
9919 dout(7) << "discover_path " << base
->ino() << " " << want_path
<< " snap " << snap
<< " from mds." << from
9920 << (want_xlocked
? " want_xlocked":"")
// Case 1: auth is being migrated; park a retry on single-auth.
9923 if (base
->is_ambiguous_auth()) {
9924 dout(10) << " waiting for single auth on " << *base
<< dendl
;
9926 onfinish
= new C_MDC_RetryDiscoverPath(this, base
, snap
, want_path
, from
);
9927 base
->add_waiter(CInode::WAIT_SINGLEAUTH
, onfinish
);
// Case 2: we are the authority -- wake local dir waiters instead.
9929 } else if (from
== mds
->get_nodeid()) {
9930 MDSContext::vec finished
;
9931 base
->take_waiting(CInode::WAIT_DIR
, finished
);
9932 mds
->queue_waiters(finished
);
// Case 3: remote auth -- target the frag of the first path component.
9936 frag_t fg
= base
->pick_dirfrag(want_path
[0]);
9937 if ((want_xlocked
&& want_path
.depth() == 1) ||
9938 !base
->is_waiting_for_dir(fg
) || !onfinish
) {
9939 discover_info_t
& d
= _create_discover(from
);
9940 d
.ino
= base
->ino();
9944 d
.want_path
= want_path
;
9945 d
.want_base_dir
= true;
9946 d
.want_xlocked
= want_xlocked
;
// Continuation fires when the dirfrag replica (or an error) arrives.
9952 base
->add_dir_waiter(fg
, onfinish
);
// Retry context for discover_path(CDir*): re-issues the discover once the
// base dirfrag's ambiguous authority resolves (CDir::WAIT_SINGLEAUTH).
// NOTE(review): elided extraction -- member declarations/braces absent.
9955 struct C_MDC_RetryDiscoverPath2
: public MDCacheContext
{
9959 C_MDC_RetryDiscoverPath2(MDCache
*c
, CDir
*b
, snapid_t s
, filepath
&p
) :
9960 MDCacheContext(c
), base(b
), snapid(s
), path(p
) {}
9961 void finish(int r
) override
{
// No onfinish (0): the original waiter is already queued on the dir.
9962 mdcache
->discover_path(base
, snapid
, path
, 0);
// Discover want_path starting at dirfrag `base` (dentry-level variant of
// the CInode overload above).  Same three-way structure:
//   1. ambiguous auth -> wait for single auth and retry;
//   2. we are auth    -> wake the dir's sub-waiters locally;
//   3. remote auth    -> send a discover keyed on the first dentry, unless
//      a request for that dentry/snap is already pending (dedup via
//      is_waiting_for_dentry), then park onfinish as a dentry waiter.
// want_base_dir is false here: the peer already has `base` replicated.
// NOTE(review): elided extraction -- some original lines are absent here.
9966 void MDCache::discover_path(CDir
*base
,
9969 MDSContext
*onfinish
,
9972 mds_rank_t from
= base
->authority().first
;
9974 dout(7) << "discover_path " << base
->dirfrag() << " " << want_path
<< " snap " << snap
<< " from mds." << from
9975 << (want_xlocked
? " want_xlocked":"")
// Case 1: dirfrag auth in flux; retry after it settles.
9978 if (base
->is_ambiguous_auth()) {
9979 dout(7) << " waiting for single auth on " << *base
<< dendl
;
9981 onfinish
= new C_MDC_RetryDiscoverPath2(this, base
, snap
, want_path
);
9982 base
->add_waiter(CDir::WAIT_SINGLEAUTH
, onfinish
);
// Case 2: we are the authority -- nothing to fetch, wake waiters.
9984 } else if (from
== mds
->get_nodeid()) {
9985 MDSContext::vec finished
;
9986 base
->take_sub_waiting(finished
);
9987 mds
->queue_waiters(finished
);
// Case 3: remote auth -- send unless an equivalent discover is pending.
9991 if ((want_xlocked
&& want_path
.depth() == 1) ||
9992 !base
->is_waiting_for_dentry(want_path
[0].c_str(), snap
) || !onfinish
) {
9993 discover_info_t
& d
= _create_discover(from
);
9994 d
.ino
= base
->ino();
// Pin the base inode so it cannot be trimmed while the discover is out.
9995 d
.pin_base(base
->inode
);
9996 d
.frag
= base
->get_frag();
9998 d
.want_path
= want_path
;
// Peer should not resend the base dir; we already hold a replica of it.
9999 d
.want_base_dir
= false;
10000 d
.want_xlocked
= want_xlocked
;
// Continuation keyed on (dentry name, snap); woken by the reply path.
10006 base
->add_dentry_waiter(want_path
[0], snap
, onfinish
);
// Resend every tracked in-flight discover whose target is MDS `who`
// (presumably invoked when that rank becomes reachable/active again --
// TODO confirm against callers).  Discovers aimed at other ranks are
// left untouched.
// NOTE(review): elided extraction -- loop braces/continue are absent here.
10009 void MDCache::kick_discovers(mds_rank_t who
)
10011 for (map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.begin();
10012 p
!= discovers
.end();
// Skip entries targeted at a different rank.
10014 if (p
->second
.mds
!= who
)
// Same tid is reused, so a late reply to the old send is still valid.
10016 _send_discover(p
->second
);
10021 void MDCache::handle_discover(const MDiscover::const_ref
&dis
)
10023 mds_rank_t whoami
= mds
->get_nodeid();
10024 mds_rank_t from
= mds_rank_t(dis
->get_source().num());
10026 ceph_assert(from
!= whoami
);
10028 if (mds
->get_state() <= MDSMap::STATE_REJOIN
) {
10029 if (mds
->get_state() < MDSMap::STATE_REJOIN
&&
10030 mds
->get_want_state() < CEPH_MDS_STATE_REJOIN
) {
10034 // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
10035 // delay processing request from survivor because we may not yet choose lock states.
10036 if (!mds
->mdsmap
->is_rejoin(from
)) {
10037 dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl
;
10038 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, dis
));
10045 auto reply
= MDiscoverReply::create(*dis
);
10047 snapid_t snapid
= dis
->get_snapid();
10050 if (MDS_INO_IS_BASE(dis
->get_base_ino()) &&
10051 !dis
->wants_base_dir() && dis
->get_want().depth() == 0) {
10053 dout(7) << "handle_discover from mds." << from
10054 << " wants base + " << dis
->get_want().get_path()
10055 << " snap " << snapid
10058 cur
= get_inode(dis
->get_base_ino());
10062 reply
->starts_with
= MDiscoverReply::INODE
;
10063 replicate_inode(cur
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10064 dout(10) << "added base " << *cur
<< dendl
;
10067 // there's a base inode
10068 cur
= get_inode(dis
->get_base_ino(), snapid
);
10069 if (!cur
&& snapid
!= CEPH_NOSNAP
) {
10070 cur
= get_inode(dis
->get_base_ino());
10071 if (cur
&& !cur
->is_multiversion())
10072 cur
= NULL
; // nope!
10076 dout(7) << "handle_discover mds." << from
10077 << " don't have base ino " << dis
->get_base_ino() << "." << snapid
10079 if (!dis
->wants_base_dir() && dis
->get_want().depth() > 0)
10080 reply
->set_error_dentry(dis
->get_dentry(0));
10081 reply
->set_flag_error_dir();
10082 } else if (dis
->wants_base_dir()) {
10083 dout(7) << "handle_discover mds." << from
10084 << " wants basedir+" << dis
->get_want().get_path()
10088 dout(7) << "handle_discover mds." << from
10089 << " wants " << dis
->get_want().get_path()
10095 ceph_assert(reply
);
10098 // do some fidgeting to include a dir if they asked for the base dir, or just root.
10099 for (unsigned i
= 0;
10100 cur
&& (i
< dis
->get_want().depth() || dis
->get_want().depth() == 0);
10103 // -- figure out the dir
10105 // is *cur even a dir at all?
10106 if (!cur
->is_dir()) {
10107 dout(7) << *cur
<< " not a dir" << dendl
;
10108 reply
->set_flag_error_dir();
10114 if (dis
->get_want().depth()) {
10115 // dentry specifies
10116 fg
= cur
->pick_dirfrag(dis
->get_dentry(i
));
10118 // requester explicity specified the frag
10119 ceph_assert(dis
->wants_base_dir() || MDS_INO_IS_BASE(dis
->get_base_ino()));
10120 fg
= dis
->get_base_dir_frag();
10121 if (!cur
->dirfragtree
.is_leaf(fg
))
10122 fg
= cur
->dirfragtree
[fg
.value()];
10124 CDir
*curdir
= cur
->get_dirfrag(fg
);
10126 if ((!curdir
&& !cur
->is_auth()) ||
10127 (curdir
&& !curdir
->is_auth())) {
10130 * ONLY set flag if empty!!
10131 * otherwise requester will wake up waiter(s) _and_ continue with discover,
10132 * resulting in duplicate discovers in flight,
10133 * which can wreak havoc when discovering rename srcdn (which may move)
10136 if (reply
->is_empty()) {
10137 // only hint if empty.
10138 // someday this could be better, but right now the waiter logic isn't smart enough.
10142 dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir
<< dendl
;
10143 reply
->set_dir_auth_hint(curdir
->authority().first
);
10145 dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
10147 reply
->set_dir_auth_hint(cur
->authority().first
);
10150 // note error dentry, if any
10151 // NOTE: important, as it allows requester to issue an equivalent discover
10152 // to whomever we hint at.
10153 if (dis
->get_want().depth() > i
)
10154 reply
->set_error_dentry(dis
->get_dentry(i
));
10160 if (!curdir
) { // open dir?
10161 if (cur
->is_frozen()) {
10162 if (!reply
->is_empty()) {
10163 dout(7) << *cur
<< " is frozen, non-empty reply, stopping" << dendl
;
10166 dout(7) << *cur
<< " is frozen, empty reply, waiting" << dendl
;
10167 cur
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10170 curdir
= cur
->get_or_open_dirfrag(this, fg
);
10171 } else if (curdir
->is_frozen_tree() ||
10172 (curdir
->is_frozen_dir() && fragment_are_all_frozen(curdir
))) {
10173 if (!reply
->is_empty()) {
10174 dout(7) << *curdir
<< " is frozen, non-empty reply, stopping" << dendl
;
10177 if (dis
->wants_base_dir() && dis
->get_base_dir_frag() != curdir
->get_frag()) {
10178 dout(7) << *curdir
<< " is frozen, dirfrag mismatch, stopping" << dendl
;
10179 reply
->set_flag_error_dir();
10182 dout(7) << *curdir
<< " is frozen, empty reply, waiting" << dendl
;
10183 curdir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10188 if (curdir
->get_version() == 0) {
10189 // fetch newly opened dir
10190 } else if (reply
->is_empty() && !dis
->wants_base_dir()) {
10191 dout(7) << "handle_discover not adding unwanted base dir " << *curdir
<< dendl
;
10192 // make sure the base frag is correct, though, in there was a refragment since the
10193 // original request was sent.
10194 reply
->set_base_dir_frag(curdir
->get_frag());
10196 ceph_assert(!curdir
->is_ambiguous_auth()); // would be frozen.
10197 if (!reply
->trace
.length())
10198 reply
->starts_with
= MDiscoverReply::DIR;
10199 replicate_dir(curdir
, from
, reply
->trace
);
10200 dout(7) << "handle_discover added dir " << *curdir
<< dendl
;
10205 if (curdir
->get_version() == 0) {
10206 // fetch newly opened dir
10207 ceph_assert(!curdir
->has_bloom());
10208 } else if (dis
->get_want().depth() > 0) {
10210 dn
= curdir
->lookup(dis
->get_dentry(i
), snapid
);
10216 if (!curdir
->is_complete() &&
10217 !(snapid
== CEPH_NOSNAP
&&
10218 curdir
->has_bloom() &&
10219 !curdir
->is_in_bloom(dis
->get_dentry(i
)))) {
10221 dout(7) << "incomplete dir contents for " << *curdir
<< ", fetching" << dendl
;
10222 if (reply
->is_empty()) {
10224 curdir
->fetch(new C_MDS_RetryMessage(mds
, dis
),
10225 dis
->wants_base_dir() && curdir
->get_version() == 0);
10228 // initiate fetch, but send what we have so far
10234 if (snapid
!= CEPH_NOSNAP
&& !reply
->is_empty()) {
10235 dout(7) << "dentry " << dis
->get_dentry(i
) << " snap " << snapid
10236 << " dne, non-empty reply, stopping" << dendl
;
10240 // send null dentry
10241 dout(7) << "dentry " << dis
->get_dentry(i
) << " dne, returning null in "
10242 << *curdir
<< dendl
;
10243 if (snapid
== CEPH_NOSNAP
)
10244 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
));
10246 dn
= curdir
->add_null_dentry(dis
->get_dentry(i
), snapid
, snapid
);
10250 // don't add replica to purging dentry/inode
10251 if (dn
->state_test(CDentry::STATE_PURGING
)) {
10252 if (reply
->is_empty())
10253 reply
->set_flag_error_dn(dis
->get_dentry(i
));
10257 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10260 // ...always block on non-tail items (they are unrelated)
10261 // ...allow xlocked tail disocvery _only_ if explicitly requested
10262 bool tailitem
= (dis
->get_want().depth() == 0) || (i
== dis
->get_want().depth() - 1);
10263 if (dn
->lock
.is_xlocked()) {
10264 // is this the last (tail) item in the discover traversal?
10265 if (tailitem
&& dis
->wants_xlocked()) {
10266 dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn
<< dendl
;
10267 } else if (reply
->is_empty()) {
10268 dout(7) << "handle_discover blocking on xlocked " << *dn
<< dendl
;
10269 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryMessage(mds
, dis
));
10272 dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn
<< dendl
;
10278 if (dnl
->is_primary() && dnl
->get_inode()->is_frozen_inode()) {
10279 if (tailitem
&& dis
->wants_xlocked()) {
10280 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl
->get_inode() << dendl
;
10281 } else if (reply
->is_empty()) {
10282 dout(7) << *dnl
->get_inode() << " is frozen, empty reply, waiting" << dendl
;
10283 dnl
->get_inode()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10286 dout(7) << *dnl
->get_inode() << " is frozen, non-empty reply, stopping" << dendl
;
10292 if (!reply
->trace
.length())
10293 reply
->starts_with
= MDiscoverReply::DENTRY
;
10294 replicate_dentry(dn
, from
, reply
->trace
);
10295 dout(7) << "handle_discover added dentry " << *dn
<< dendl
;
10297 if (!dnl
->is_primary()) break; // stop on null or remote link.
10300 CInode
*next
= dnl
->get_inode();
10301 ceph_assert(next
->is_auth());
10303 replicate_inode(next
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10304 dout(7) << "handle_discover added inode " << *next
<< dendl
;
10306 // descend, keep going.
10312 ceph_assert(!reply
->is_empty());
10313 dout(7) << "handle_discover sending result back to asker mds." << from
<< dendl
;
10314 mds
->send_message(reply
, dis
->get_connection());
10317 void MDCache::handle_discover_reply(const MDiscoverReply::const_ref
&m
)
10320 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
10321 dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
10325 dout(7) << "discover_reply " << *m
<< dendl
;
10326 if (m
->is_flag_error_dir())
10327 dout(7) << " flag error, dir" << dendl
;
10328 if (m
->is_flag_error_dn())
10329 dout(7) << " flag error, dentry = " << m
->get_error_dentry() << dendl
;
10331 MDSContext::vec finished
, error
;
10332 mds_rank_t from
= mds_rank_t(m
->get_source().num());
10335 CInode
*cur
= get_inode(m
->get_base_ino());
10336 auto p
= m
->trace
.cbegin();
10338 int next
= m
->starts_with
;
10340 // decrement discover counters
10341 if (m
->get_tid()) {
10342 map
<ceph_tid_t
,discover_info_t
>::iterator p
= discovers
.find(m
->get_tid());
10343 if (p
!= discovers
.end()) {
10344 dout(10) << " found tid " << m
->get_tid() << dendl
;
10345 discovers
.erase(p
);
10347 dout(10) << " tid " << m
->get_tid() << " not found, must be dup reply" << dendl
;
10351 // discover may start with an inode
10352 if (!p
.end() && next
== MDiscoverReply::INODE
) {
10353 cur
= add_replica_inode(p
, NULL
, finished
);
10354 dout(7) << "discover_reply got base inode " << *cur
<< dendl
;
10355 ceph_assert(cur
->is_base());
10357 next
= MDiscoverReply::DIR;
10360 if (cur
->is_base() &&
10361 waiting_for_base_ino
[from
].count(cur
->ino())) {
10362 finished
.swap(waiting_for_base_ino
[from
][cur
->ino()]);
10363 waiting_for_base_ino
[from
].erase(cur
->ino());
10368 // loop over discover results.
10369 // indexes follow each ([[dir] dentry] inode)
10370 // can start, end with any type.
10375 if (next
== MDiscoverReply::DIR) {
10376 curdir
= add_replica_dir(p
, cur
, mds_rank_t(m
->get_source().num()), finished
);
10377 if (cur
->ino() == m
->get_base_ino() && curdir
->get_frag() != m
->get_base_dir_frag()) {
10378 ceph_assert(m
->get_wanted_base_dir());
10379 cur
->take_dir_waiting(m
->get_base_dir_frag(), finished
);
10382 // note: this can only happen our first way around this loop.
10383 if (p
.end() && m
->is_flag_error_dn()) {
10384 fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10385 curdir
= cur
->get_dirfrag(fg
);
10387 curdir
= cur
->get_dirfrag(m
->get_base_dir_frag());
10394 CDentry
*dn
= add_replica_dentry(p
, curdir
, finished
);
10400 cur
= add_replica_inode(p
, dn
, finished
);
10402 next
= MDiscoverReply::DIR;
10406 // or dir_auth hint?
10407 if (m
->is_flag_error_dir() && !cur
->is_dir()) {
10409 cur
->take_waiting(CInode::WAIT_DIR
, error
);
10410 } else if (m
->is_flag_error_dir() || m
->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN
) {
10411 mds_rank_t who
= m
->get_dir_auth_hint();
10412 if (who
== mds
->get_nodeid()) who
= -1;
10414 dout(7) << " dir_auth_hint is " << m
->get_dir_auth_hint() << dendl
;
10417 if (m
->get_wanted_base_dir()) {
10418 frag_t fg
= m
->get_base_dir_frag();
10419 CDir
*dir
= cur
->get_dirfrag(fg
);
10421 if (cur
->is_waiting_for_dir(fg
)) {
10422 if (cur
->is_auth())
10423 cur
->take_waiting(CInode::WAIT_DIR
, finished
);
10424 else if (dir
|| !cur
->dirfragtree
.is_leaf(fg
))
10425 cur
->take_dir_waiting(fg
, finished
);
10427 discover_dir_frag(cur
, fg
, 0, who
);
10429 dout(7) << " doing nothing, nobody is waiting for dir" << dendl
;
10433 if (m
->get_error_dentry().length()) {
10434 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10435 CDir
*dir
= cur
->get_dirfrag(fg
);
10437 if (dir
&& dir
->is_waiting_for_dentry(m
->get_error_dentry(), m
->get_wanted_snapid())) {
10438 if (dir
->is_auth() || dir
->lookup(m
->get_error_dentry())) {
10439 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10440 m
->get_wanted_snapid(), finished
);
10442 filepath
relpath(m
->get_error_dentry(), 0);
10443 discover_path(dir
, m
->get_wanted_snapid(), relpath
, 0, m
->get_wanted_xlocked());
10446 dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
10447 << m
->get_error_dentry() << dendl
;
10449 } else if (m
->is_flag_error_dn()) {
10450 frag_t fg
= cur
->pick_dirfrag(m
->get_error_dentry());
10451 CDir
*dir
= cur
->get_dirfrag(fg
);
10453 if (dir
->is_auth()) {
10454 dir
->take_sub_waiting(finished
);
10456 dir
->take_dentry_waiting(m
->get_error_dentry(), m
->get_wanted_snapid(),
10457 m
->get_wanted_snapid(), error
);
10463 finish_contexts(g_ceph_context
, error
, -ENOENT
); // finish errors directly
10464 mds
->queue_waiters(finished
);
10469 // ----------------------------
// Encode a replica of dirfrag `dir` for recipient `to` into bl.
// The decode counterpart is add_replica_dir(); the dirfrag_t identifier
// computed here precedes the dir's replica state (its encode is elided
// from this chunk).
// NOTE(review): elided extraction -- some original lines are absent here.
10473 void MDCache::replicate_dir(CDir
*dir
, mds_rank_t to
, bufferlist
& bl
)
10475 dirfrag_t df
= dir
->dirfrag();
10477 dir
->encode_replica(to
, bl
);
// Encode a replica of dentry `dn` for recipient `to` into bl:
// name, last snapid, then the dentry's replica state.  Decode
// counterpart: add_replica_dentry() -- field order must match.
10480 void MDCache::replicate_dentry(CDentry
*dn
, mds_rank_t to
, bufferlist
& bl
)
10482 encode(dn
->get_name(), bl
);
10483 encode(dn
->last
, bl
);
// Final flag: whether we are not yet active (e.g. still rejoining) --
// presumably lets encode_replica adjust what state it emits; TODO confirm
// against CDentry::encode_replica.
10484 dn
->encode_replica(to
, bl
, mds
->get_state() < MDSMap::STATE_ACTIVE
);
// Encode a replica of inode `in` for recipient `to` into bl:
// ino, last snapid, then the inode's replica state (filtered by the
// feature bits of the up MDS set).  Decode counterpart:
// add_replica_inode() -- field order must match.
10487 void MDCache::replicate_inode(CInode
*in
, mds_rank_t to
, bufferlist
& bl
,
10490 encode(in
->inode
.ino
, bl
); // bleh, minor assymetry here
10491 encode(in
->last
, bl
);
// Same not-yet-active flag as replicate_dentry; TODO confirm semantics
// in CInode::encode_replica.
10492 in
->encode_replica(to
, bl
, features
, mds
->get_state() < MDSMap::STATE_ACTIVE
);
10495 CDir
*MDCache::add_replica_dir(bufferlist::const_iterator
& p
, CInode
*diri
, mds_rank_t from
,
10496 MDSContext::vec
& finished
)
10501 ceph_assert(diri
->ino() == df
.ino
);
10503 // add it (_replica_)
10504 CDir
*dir
= diri
->get_dirfrag(df
.frag
);
10507 // had replica. update w/ new nonce.
10508 dir
->decode_replica(p
);
10509 dout(7) << "add_replica_dir had " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10511 // force frag to leaf in the diri tree
10512 if (!diri
->dirfragtree
.is_leaf(df
.frag
)) {
10513 dout(7) << "add_replica_dir forcing frag " << df
.frag
<< " to leaf in the fragtree "
10514 << diri
->dirfragtree
<< dendl
;
10515 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, df
.frag
);
10519 dir
= diri
->add_dirfrag( new CDir(diri
, df
.frag
, this, false) );
10520 dir
->decode_replica(p
);
10522 // is this a dir_auth delegation boundary?
10523 if (from
!= diri
->authority().first
||
10524 diri
->is_ambiguous_auth() ||
10526 adjust_subtree_auth(dir
, from
);
10528 dout(7) << "add_replica_dir added " << *dir
<< " nonce " << dir
->replica_nonce
<< dendl
;
10531 diri
->take_dir_waiting(df
.frag
, finished
);
// Decode one replica dentry (produced by replicate_dentry) from `p` into
// dirfrag `dir`.  If a dentry with that (name, last) already exists its
// replica state is refreshed; otherwise a null dentry is added and fully
// decoded.  Waiters on that name/snap range are collected into `finished`.
// NOTE(review): elided extraction -- the name/last decodes and braces are
// absent from this chunk.
10537 CDentry
*MDCache::add_replica_dentry(bufferlist::const_iterator
& p
, CDir
*dir
, MDSContext::vec
& finished
)
10544 CDentry
*dn
= dir
->lookup(name
, last
);
// Existing replica: update-in-place (is_new = false).
10548 dn
->decode_replica(p
, false);
10549 dout(7) << "add_replica_dentry had " << *dn
<< dendl
;
// New replica: placeholder `first` of 1 is corrected by the decode.
10551 dn
= dir
->add_null_dentry(name
, 1 /* this will get updated below */, last
);
10552 dn
->decode_replica(p
, true);
10553 dout(7) << "add_replica_dentry added " << *dn
<< dendl
;
// Wake anyone who was waiting for this dentry/snap range to appear.
10556 dir
->take_dentry_waiting(name
, dn
->first
, dn
->last
, finished
);
// Decode one replica inode (produced by replicate_inode) from `p`.  If the
// inode is not yet in cache a new non-auth CInode is created, its
// inode_auth is pinned for root/mdsdirs (whose authority is fixed by ino),
// and it is linked under `dn` when that dentry is null-linked.  An existing
// replica is simply refreshed in place.
// NOTE(review): elided extraction -- the ino/last decodes and braces are
// absent from this chunk.
10561 CInode
*MDCache::add_replica_inode(bufferlist::const_iterator
& p
, CDentry
*dn
, MDSContext::vec
& finished
)
10567 CInode
*in
= get_inode(ino
, last
);
// Not in cache: build a fresh replica (auth=false, first=1 placeholder).
10569 in
= new CInode(this, false, 1, last
);
10570 in
->decode_replica(p
, true);
// Root and per-rank mdsdirs have statically-known authority.
10572 if (in
->ino() == MDS_INO_ROOT
)
10573 in
->inode_auth
.first
= 0;
10574 else if (in
->is_mdsdir())
10575 in
->inode_auth
.first
= in
->ino() - MDS_INO_MDSDIR_OFFSET
;
10576 dout(10) << "add_replica_inode added " << *in
<< dendl
;
// Link into the (necessarily null) dentry supplied by the caller.
10578 ceph_assert(dn
->get_linkage()->is_null());
10579 dn
->dir
->link_primary_inode(dn
, in
);
// Already cached: refresh replica state only.
10582 in
->decode_replica(p
, false);
10583 dout(10) << "add_replica_inode had " << *in
<< dendl
;
// Diagnostic: sender's trace disagrees with our dentry linkage.
10587 if (!dn
->get_linkage()->is_primary() || dn
->get_linkage()->get_inode() != in
)
10588 dout(10) << "add_replica_inode different linkage in dentry " << *dn
<< dendl
;
// Encode everything MDS `who` needs to instantiate stray dentry `straydn`:
// our mdsdir base inode, then the stray dir's parent dir/dentry/inode,
// then the stray dir itself, and finally the stray dentry.  The order is
// a strict contract with add_replica_stray(), which decodes the same six
// items in the same sequence.
10595 void MDCache::replicate_stray(CDentry
*straydn
, mds_rank_t who
, bufferlist
& bl
)
10597 uint64_t features
= mds
->mdsmap
->get_up_features();
10598 replicate_inode(get_myin(), who
, bl
, features
);
10599 replicate_dir(straydn
->get_dir()->inode
->get_parent_dn()->get_dir(), who
, bl
);
10600 replicate_dentry(straydn
->get_dir()->inode
->get_parent_dn(), who
, bl
);
10601 replicate_inode(straydn
->get_dir()->inode
, who
, bl
, features
);
10602 replicate_dir(straydn
->get_dir(), who
, bl
);
10603 replicate_dentry(straydn
, who
, bl
);
// Decode a stray-dentry trace produced by replicate_stray() on MDS `from`.
// The decode order mirrors the encode order exactly: mdsdir inode, mdsdir
// frag, stray-dir dentry, stray-dir inode, stray dirfrag, stray dentry.
// Any waiters satisfied by the newly added replicas are queued.
10606 CDentry
*MDCache::add_replica_stray(const bufferlist
&bl
, mds_rank_t from
)
10608 MDSContext::vec finished
;
10609 auto p
= bl
.cbegin();
10611 CInode
*mdsin
= add_replica_inode(p
, NULL
, finished
);
10612 CDir
*mdsdir
= add_replica_dir(p
, mdsin
, from
, finished
);
10613 CDentry
*straydirdn
= add_replica_dentry(p
, mdsdir
, finished
);
10614 CInode
*strayin
= add_replica_inode(p
, straydirdn
, finished
);
10615 CDir
*straydir
= add_replica_dir(p
, strayin
, from
, finished
);
10616 CDentry
*straydn
= add_replica_dentry(p
, straydir
, finished
);
// Wake anyone blocked on these objects appearing in our cache.
10617 if (!finished
.empty())
10618 mds
->queue_waiters(finished
);
10624 int MDCache::send_dir_updates(CDir
*dir
, bool bcast
)
10626 // this is an FYI, re: replication
10628 set
<mds_rank_t
> who
;
10630 mds
->get_mds_map()->get_active_mds_set(who
);
10632 for (const auto &p
: dir
->get_replicas()) {
10633 who
.insert(p
.first
);
10637 dout(7) << "sending dir_update on " << *dir
<< " bcast " << bcast
<< " to " << who
<< dendl
;
10640 dir
->inode
->make_path(path
);
10642 mds_rank_t whoami
= mds
->get_nodeid();
10643 for (set
<mds_rank_t
>::iterator it
= who
.begin();
10646 if (*it
== whoami
) continue;
10647 //if (*it == except) continue;
10648 dout(7) << "sending dir_update on " << *dir
<< " to " << *it
<< dendl
;
10650 std::set
<int32_t> s
;
10651 for (const auto &r
: dir
->dir_rep_by
) {
10654 mds
->send_message_mds(MDirUpdate::create(mds
->get_nodeid(), dir
->dirfrag(), dir
->dir_rep
, s
, path
, bcast
), *it
);
10660 void MDCache::handle_dir_update(const MDirUpdate::const_ref
&m
)
10662 dirfrag_t df
= m
->get_dirfrag();
10663 CDir
*dir
= get_dirfrag(df
);
10665 dout(5) << "dir_update on " << df
<< ", don't have it" << dendl
;
10668 if (m
->should_discover()) {
10670 // this is key to avoid a fragtree update race, among other things.
10671 m
->inc_tried_discover();
10672 vector
<CDentry
*> trace
;
10674 filepath path
= m
->get_path();
10675 dout(5) << "trying discover on dir_update for " << path
<< dendl
;
10676 CF_MDS_RetryMessageFactory
cf(mds
, m
);
10677 MDRequestRef null_ref
;
10678 int r
= path_traverse(null_ref
, cf
, path
, &trace
, &in
, MDS_TRAVERSE_DISCOVER
);
10682 in
->ino() == df
.ino
&&
10683 in
->get_approx_dirfrag(df
.frag
) == NULL
) {
10684 open_remote_dirfrag(in
, df
.frag
, new C_MDS_RetryMessage(mds
, m
));
10692 if (!m
->has_tried_discover()) {
10693 // Update if it already exists. Othwerwise it got updated by discover reply.
10694 dout(5) << "dir_update on " << *dir
<< dendl
;
10695 dir
->dir_rep
= m
->get_dir_rep();
10696 dir
->dir_rep_by
.clear();
10697 for (const auto &e
: m
->get_dir_rep_by()) {
10698 dir
->dir_rep_by
.insert(e
);
10709 void MDCache::send_dentry_link(CDentry
*dn
, MDRequestRef
& mdr
)
10711 dout(7) << "send_dentry_link " << *dn
<< dendl
;
10713 CDir
*subtree
= get_subtree_root(dn
->get_dir());
10714 for (const auto &p
: dn
->get_replicas()) {
10715 // don't tell (rename) witnesses; they already know
10716 if (mdr
.get() && mdr
->more()->witnessed
.count(p
.first
))
10718 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
10719 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
10720 rejoin_gather
.count(p
.first
)))
10722 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10723 auto m
= MDentryLink::create(subtree
->dirfrag(), dn
->get_dir()->dirfrag(), dn
->get_name(), dnl
->is_primary());
10724 if (dnl
->is_primary()) {
10725 dout(10) << " primary " << *dnl
->get_inode() << dendl
;
10726 replicate_inode(dnl
->get_inode(), p
.first
, m
->bl
,
10727 mds
->mdsmap
->get_up_features());
10728 } else if (dnl
->is_remote()) {
10729 inodeno_t ino
= dnl
->get_remote_ino();
10730 __u8 d_type
= dnl
->get_remote_d_type();
10731 dout(10) << " remote " << ino
<< " " << d_type
<< dendl
;
10732 encode(ino
, m
->bl
);
10733 encode(d_type
, m
->bl
);
10735 ceph_abort(); // aie, bad caller!
10736 mds
->send_message_mds(m
, p
.first
);
10740 void MDCache::handle_dentry_link(const MDentryLink::const_ref
&m
)
10742 CDentry
*dn
= NULL
;
10743 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
10745 dout(7) << "handle_dentry_link don't have dirfrag " << m
->get_dirfrag() << dendl
;
10747 dn
= dir
->lookup(m
->get_dn());
10749 dout(7) << "handle_dentry_link don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
10751 dout(7) << "handle_dentry_link on " << *dn
<< dendl
;
10752 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10754 ceph_assert(!dn
->is_auth());
10755 ceph_assert(dnl
->is_null());
10759 auto p
= m
->bl
.cbegin();
10760 MDSContext::vec finished
;
10762 if (m
->get_is_primary()) {
10764 add_replica_inode(p
, dn
, finished
);
10766 // remote link, easy enough.
10771 dir
->link_remote_inode(dn
, ino
, d_type
);
10777 if (!finished
.empty())
10778 mds
->queue_waiters(finished
);
10786 void MDCache::send_dentry_unlink(CDentry
*dn
, CDentry
*straydn
, MDRequestRef
& mdr
)
10788 dout(10) << "send_dentry_unlink " << *dn
<< dendl
;
10789 // share unlink news with replicas
10790 set
<mds_rank_t
> replicas
;
10791 dn
->list_replicas(replicas
);
10794 straydn
->list_replicas(replicas
);
10795 CInode
*strayin
= straydn
->get_linkage()->get_inode();
10796 strayin
->encode_snap_blob(snapbl
);
10798 for (set
<mds_rank_t
>::iterator it
= replicas
.begin();
10799 it
!= replicas
.end();
10801 // don't tell (rmdir) witnesses; they already know
10802 if (mdr
.get() && mdr
->more()->witnessed
.count(*it
))
10805 if (mds
->mdsmap
->get_state(*it
) < MDSMap::STATE_REJOIN
||
10806 (mds
->mdsmap
->get_state(*it
) == MDSMap::STATE_REJOIN
&&
10807 rejoin_gather
.count(*it
)))
10810 auto unlink
= MDentryUnlink::create(dn
->get_dir()->dirfrag(), dn
->get_name());
10812 replicate_stray(straydn
, *it
, unlink
->straybl
);
10813 unlink
->snapbl
= snapbl
;
10815 mds
->send_message_mds(unlink
, *it
);
10819 void MDCache::handle_dentry_unlink(const MDentryUnlink::const_ref
&m
)
10822 CDentry
*straydn
= NULL
;
10823 if (m
->straybl
.length())
10824 straydn
= add_replica_stray(m
->straybl
, mds_rank_t(m
->get_source().num()));
10826 CDir
*dir
= get_dirfrag(m
->get_dirfrag());
10828 dout(7) << "handle_dentry_unlink don't have dirfrag " << m
->get_dirfrag() << dendl
;
10830 CDentry
*dn
= dir
->lookup(m
->get_dn());
10832 dout(7) << "handle_dentry_unlink don't have dentry " << *dir
<< " dn " << m
->get_dn() << dendl
;
10834 dout(7) << "handle_dentry_unlink on " << *dn
<< dendl
;
10835 CDentry::linkage_t
*dnl
= dn
->get_linkage();
10838 if (dnl
->is_primary()) {
10839 CInode
*in
= dnl
->get_inode();
10840 dn
->dir
->unlink_inode(dn
);
10841 ceph_assert(straydn
);
10842 straydn
->dir
->link_primary_inode(straydn
, in
);
10844 // in->first is lazily updated on replica; drag it forward so
10845 // that we always keep it in sync with the dnq
10846 ceph_assert(straydn
->first
>= in
->first
);
10847 in
->first
= straydn
->first
;
10849 // update subtree map?
10851 adjust_subtree_after_rename(in
, dir
, false);
10853 if (m
->snapbl
.length()) {
10854 bool hadrealm
= (in
->snaprealm
? true : false);
10855 in
->decode_snap_blob(m
->snapbl
);
10856 ceph_assert(in
->snaprealm
);
10857 ceph_assert(in
->snaprealm
->have_past_parents_open());
10859 do_realm_invalidate_and_update_notify(in
, CEPH_SNAP_OP_SPLIT
, false);
10862 // send caps to auth (if we're not already)
10863 if (in
->is_any_caps() &&
10864 !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
10865 migrator
->export_caps(in
);
10869 ceph_assert(!straydn
);
10870 ceph_assert(dnl
->is_remote());
10871 dn
->dir
->unlink_inode(dn
);
10873 ceph_assert(dnl
->is_null());
10877 // race with trim_dentry()
10879 ceph_assert(straydn
->get_num_ref() == 0);
10880 ceph_assert(straydn
->get_linkage()->is_null());
10882 trim_dentry(straydn
, ex
);
10883 send_expire_messages(ex
);
10892 // ===================================================================
10896 // ===================================================================
10901 * adjust_dir_fragments -- adjust fragmentation for a directory
10903 * @param diri directory inode
10904 * @param basefrag base fragment
10905 * @param bits bit adjustment. positive for split, negative for merge.
10907 void MDCache::adjust_dir_fragments(CInode
*diri
, frag_t basefrag
, int bits
,
10908 list
<CDir
*>& resultfrags
,
10909 MDSContext::vec
& waiters
,
10912 dout(10) << "adjust_dir_fragments " << basefrag
<< " " << bits
10913 << " on " << *diri
<< dendl
;
10915 list
<CDir
*> srcfrags
;
10916 diri
->get_dirfrags_under(basefrag
, srcfrags
);
10918 adjust_dir_fragments(diri
, srcfrags
, basefrag
, bits
, resultfrags
, waiters
, replay
);
10921 CDir
*MDCache::force_dir_fragment(CInode
*diri
, frag_t fg
, bool replay
)
10923 CDir
*dir
= diri
->get_dirfrag(fg
);
10927 dout(10) << "force_dir_fragment " << fg
<< " on " << *diri
<< dendl
;
10929 list
<CDir
*> src
, result
;
10930 MDSContext::vec waiters
;
10933 frag_t parent
= diri
->dirfragtree
.get_branch_or_leaf(fg
);
10935 CDir
*pdir
= diri
->get_dirfrag(parent
);
10937 int split
= fg
.bits() - parent
.bits();
10938 dout(10) << " splitting parent by " << split
<< " " << *pdir
<< dendl
;
10939 src
.push_back(pdir
);
10940 adjust_dir_fragments(diri
, src
, parent
, split
, result
, waiters
, replay
);
10941 dir
= diri
->get_dirfrag(fg
);
10943 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
10947 if (parent
== frag_t())
10949 frag_t last
= parent
;
10950 parent
= parent
.parent();
10951 dout(10) << " " << last
<< " parent is " << parent
<< dendl
;
10955 // hoover up things under fg?
10956 diri
->get_dirfrags_under(fg
, src
);
10958 dout(10) << "force_dir_fragment no frags under " << fg
<< dendl
;
10960 dout(10) << " will combine frags under " << fg
<< ": " << src
<< dendl
;
10961 adjust_dir_fragments(diri
, src
, fg
, 0, result
, waiters
, replay
);
10962 dir
= result
.front();
10963 dout(10) << "force_dir_fragment result " << *dir
<< dendl
;
10967 mds
->queue_waiters(waiters
);
10971 void MDCache::adjust_dir_fragments(CInode
*diri
,
10972 list
<CDir
*>& srcfrags
,
10973 frag_t basefrag
, int bits
,
10974 list
<CDir
*>& resultfrags
,
10975 MDSContext::vec
& waiters
,
10978 dout(10) << "adjust_dir_fragments " << basefrag
<< " bits " << bits
10979 << " srcfrags " << srcfrags
10980 << " on " << *diri
<< dendl
;
10983 // yuck. we may have discovered the inode while it was being fragmented.
10984 if (!diri
->dirfragtree
.is_leaf(basefrag
))
10985 diri
->dirfragtree
.force_to_leaf(g_ceph_context
, basefrag
);
10988 diri
->dirfragtree
.split(basefrag
, bits
);
10989 dout(10) << " new fragtree is " << diri
->dirfragtree
<< dendl
;
10991 if (srcfrags
.empty())
10995 CDir
*parent_dir
= diri
->get_parent_dir();
10996 CDir
*parent_subtree
= 0;
10998 parent_subtree
= get_subtree_root(parent_dir
);
11002 ceph_assert(srcfrags
.size() == 1);
11003 CDir
*dir
= srcfrags
.front();
11005 dir
->split(bits
, resultfrags
, waiters
, replay
);
11007 // did i change the subtree map?
11008 if (dir
->is_subtree_root()) {
11009 // new frags are now separate subtrees
11010 for (list
<CDir
*>::iterator p
= resultfrags
.begin();
11011 p
!= resultfrags
.end();
11013 subtrees
[*p
].clear(); // new frag is now its own subtree
11016 if (parent_subtree
) {
11017 ceph_assert(subtrees
[parent_subtree
].count(dir
));
11018 subtrees
[parent_subtree
].erase(dir
);
11019 for (list
<CDir
*>::iterator p
= resultfrags
.begin();
11020 p
!= resultfrags
.end();
11022 ceph_assert((*p
)->is_subtree_root());
11023 subtrees
[parent_subtree
].insert(*p
);
11027 // adjust my bounds.
11029 bounds
.swap(subtrees
[dir
]);
11030 subtrees
.erase(dir
);
11031 for (set
<CDir
*>::iterator p
= bounds
.begin();
11034 CDir
*frag
= get_subtree_root((*p
)->get_parent_dir());
11035 subtrees
[frag
].insert(*p
);
11041 diri
->close_dirfrag(dir
->get_frag());
11046 // are my constituent bits subtrees? if so, i will be too.
11047 // (it's all or none, actually.)
11048 bool any_subtree
= false, any_non_subtree
= false;
11049 for (CDir
*dir
: srcfrags
) {
11050 if (dir
->is_subtree_root())
11051 any_subtree
= true;
11053 any_non_subtree
= true;
11055 ceph_assert(!any_subtree
|| !any_non_subtree
);
11057 set
<CDir
*> new_bounds
;
11059 for (CDir
*dir
: srcfrags
) {
11060 // this simplifies the code that find subtrees underneath the dirfrag
11061 if (!dir
->is_subtree_root()) {
11062 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11063 adjust_subtree_auth(dir
, mds
->get_nodeid());
11067 for (CDir
*dir
: srcfrags
) {
11068 ceph_assert(dir
->is_subtree_root());
11069 dout(10) << " taking srcfrag subtree bounds from " << *dir
<< dendl
;
11070 map
<CDir
*, set
<CDir
*> >::iterator q
= subtrees
.find(dir
);
11071 set
<CDir
*>::iterator r
= q
->second
.begin();
11072 while (r
!= subtrees
[dir
].end()) {
11073 new_bounds
.insert(*r
);
11074 subtrees
[dir
].erase(r
++);
11078 // remove myself as my parent's bound
11079 if (parent_subtree
)
11080 subtrees
[parent_subtree
].erase(dir
);
11085 CDir
*f
= new CDir(diri
, basefrag
, this, srcfrags
.front()->is_auth());
11086 f
->merge(srcfrags
, waiters
, replay
);
11089 ceph_assert(f
->is_subtree_root());
11090 subtrees
[f
].swap(new_bounds
);
11091 if (parent_subtree
)
11092 subtrees
[parent_subtree
].insert(f
);
11097 resultfrags
.push_back(f
);
11102 class C_MDC_FragmentFrozen
: public MDSInternalContext
{
11106 C_MDC_FragmentFrozen(MDCache
*m
, MDRequestRef
& r
) :
11107 MDSInternalContext(m
->mds
), mdcache(m
), mdr(r
) {}
11108 void finish(int r
) override
{
11109 mdcache
->fragment_frozen(mdr
, r
);
11113 bool MDCache::can_fragment(CInode
*diri
, list
<CDir
*>& dirs
)
11115 if (is_readonly()) {
11116 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl
;
11119 if (mds
->is_cluster_degraded()) {
11120 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl
;
11123 if (diri
->get_parent_dir() &&
11124 diri
->get_parent_dir()->get_inode()->is_stray()) {
11125 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl
;
11128 if (diri
->is_mdsdir() || diri
->is_stray() || diri
->ino() == MDS_INO_CEPH
) {
11129 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl
;
11133 if (diri
->scrub_is_in_progress()) {
11134 dout(7) << "can_fragment: scrub in progress" << dendl
;
11138 for (list
<CDir
*>::iterator p
= dirs
.begin(); p
!= dirs
.end(); ++p
) {
11140 if (dir
->state_test(CDir::STATE_FRAGMENTING
)) {
11141 dout(7) << "can_fragment: already fragmenting " << *dir
<< dendl
;
11144 if (!dir
->is_auth()) {
11145 dout(7) << "can_fragment: not auth on " << *dir
<< dendl
;
11148 if (dir
->is_bad()) {
11149 dout(7) << "can_fragment: bad dirfrag " << *dir
<< dendl
;
11152 if (dir
->is_frozen() ||
11153 dir
->is_freezing()) {
11154 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl
;
11162 void MDCache::split_dir(CDir
*dir
, int bits
)
11164 dout(7) << __func__
<< " " << *dir
<< " bits " << bits
<< dendl
;
11165 ceph_assert(dir
->is_auth());
11166 CInode
*diri
= dir
->inode
;
11169 dirs
.push_back(dir
);
11171 if (!can_fragment(diri
, dirs
)) {
11172 dout(7) << __func__
<< " cannot fragment right now, dropping" << dendl
;
11176 if (dir
->frag
.bits() + bits
> 24) {
11177 dout(7) << __func__
<< " frag bits > 24, dropping" << dendl
;
11181 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11182 mdr
->more()->fragment_base
= dir
->dirfrag();
11184 ceph_assert(fragments
.count(dir
->dirfrag()) == 0);
11185 fragment_info_t
& info
= fragments
[dir
->dirfrag()];
11187 info
.dirs
.push_back(dir
);
11189 info
.last_cum_auth_pins_change
= ceph_clock_now();
11191 fragment_freeze_dirs(dirs
);
11192 // initial mark+complete pass
11193 fragment_mark_and_complete(mdr
);
11196 void MDCache::merge_dir(CInode
*diri
, frag_t frag
)
11198 dout(7) << "merge_dir to " << frag
<< " on " << *diri
<< dendl
;
11201 if (!diri
->get_dirfrags_under(frag
, dirs
)) {
11202 dout(7) << "don't have all frags under " << frag
<< " for " << *diri
<< dendl
;
11206 if (diri
->dirfragtree
.is_leaf(frag
)) {
11207 dout(10) << " " << frag
<< " already a leaf for " << *diri
<< dendl
;
11211 if (!can_fragment(diri
, dirs
))
11214 CDir
*first
= dirs
.front();
11215 int bits
= first
->get_frag().bits() - frag
.bits();
11216 dout(10) << " we are merginb by " << bits
<< " bits" << dendl
;
11218 dirfrag_t
basedirfrag(diri
->ino(), frag
);
11219 MDRequestRef mdr
= request_start_internal(CEPH_MDS_OP_FRAGMENTDIR
);
11220 mdr
->more()->fragment_base
= basedirfrag
;
11222 ceph_assert(fragments
.count(basedirfrag
) == 0);
11223 fragment_info_t
& info
= fragments
[basedirfrag
];
11227 info
.last_cum_auth_pins_change
= ceph_clock_now();
11229 fragment_freeze_dirs(dirs
);
11230 // initial mark+complete pass
11231 fragment_mark_and_complete(mdr
);
11234 void MDCache::fragment_freeze_dirs(list
<CDir
*>& dirs
)
11236 bool any_subtree
= false, any_non_subtree
= false;
11237 for (CDir
* dir
: dirs
) {
11238 dir
->auth_pin(dir
); // until we mark and complete them
11239 dir
->state_set(CDir::STATE_FRAGMENTING
);
11241 ceph_assert(dir
->is_freezing_dir());
11243 if (dir
->is_subtree_root())
11244 any_subtree
= true;
11246 any_non_subtree
= true;
11249 if (any_subtree
&& any_non_subtree
) {
11250 // either all dirfrags are subtree roots or all are not.
11251 for (CDir
*dir
: dirs
) {
11252 if (dir
->is_subtree_root()) {
11253 ceph_assert(dir
->state_test(CDir::STATE_AUXSUBTREE
));
11255 dir
->state_set(CDir::STATE_AUXSUBTREE
);
11256 adjust_subtree_auth(dir
, mds
->get_nodeid());
11262 class C_MDC_FragmentMarking
: public MDCacheContext
{
11265 C_MDC_FragmentMarking(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11266 void finish(int r
) override
{
11267 mdcache
->fragment_mark_and_complete(mdr
);
11271 void MDCache::fragment_mark_and_complete(MDRequestRef
& mdr
)
11273 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11274 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11275 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11276 dout(7) << "fragment_mark_and_complete " << basedirfrag
<< " must have aborted" << dendl
;
11277 request_finish(mdr
);
11281 fragment_info_t
& info
= it
->second
;
11282 CInode
*diri
= info
.dirs
.front()->get_inode();
11283 dout(10) << "fragment_mark_and_complete " << info
.dirs
<< " on " << *diri
<< dendl
;
11285 MDSGatherBuilder
gather(g_ceph_context
);
11287 for (list
<CDir
*>::iterator p
= info
.dirs
.begin();
11288 p
!= info
.dirs
.end();
11293 if (!dir
->is_complete()) {
11294 dout(15) << " fetching incomplete " << *dir
<< dendl
;
11295 dir
->fetch(gather
.new_sub(), true); // ignore authpinnability
11297 } else if (dir
->get_frag() == frag_t()) {
11298 // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
11299 // the operation. To avoid CDir::fetch() complaining about missing object,
11300 // we commit new dirfrag first.
11301 if (dir
->state_test(CDir::STATE_CREATING
)) {
11302 dout(15) << " waiting until new dir gets journaled " << *dir
<< dendl
;
11303 dir
->add_waiter(CDir::WAIT_CREATED
, gather
.new_sub());
11305 } else if (dir
->is_new()) {
11306 dout(15) << " committing new " << *dir
<< dendl
;
11307 ceph_assert(dir
->is_dirty());
11308 dir
->commit(0, gather
.new_sub(), true);
11315 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11316 dout(15) << " marking " << *dir
<< dendl
;
11317 for (auto &p
: dir
->items
) {
11318 CDentry
*dn
= p
.second
;
11319 dn
->get(CDentry::PIN_FRAGMENTING
);
11320 ceph_assert(!dn
->state_test(CDentry::STATE_FRAGMENTING
));
11321 dn
->state_set(CDentry::STATE_FRAGMENTING
);
11323 dir
->state_set(CDir::STATE_DNPINNEDFRAG
);
11324 dir
->auth_unpin(dir
);
11326 dout(15) << " already marked " << *dir
<< dendl
;
11329 if (gather
.has_subs()) {
11330 gather
.set_finisher(new C_MDC_FragmentMarking(this, mdr
));
11335 for (list
<CDir
*>::iterator p
= info
.dirs
.begin();
11336 p
!= info
.dirs
.end();
11339 if (!dir
->is_frozen_dir()) {
11340 ceph_assert(dir
->is_freezing_dir());
11341 dir
->add_waiter(CDir::WAIT_FROZEN
, gather
.new_sub());
11344 if (gather
.has_subs()) {
11345 gather
.set_finisher(new C_MDC_FragmentFrozen(this, mdr
));
11347 // flush log so that request auth_pins are retired
11348 mds
->mdlog
->flush();
11352 fragment_frozen(mdr
, 0);
11355 void MDCache::fragment_unmark_unfreeze_dirs(list
<CDir
*>& dirs
)
11357 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs
<< dendl
;
11358 for (list
<CDir
*>::iterator p
= dirs
.begin(); p
!= dirs
.end(); ++p
) {
11360 dout(10) << " frag " << *dir
<< dendl
;
11362 ceph_assert(dir
->state_test(CDir::STATE_FRAGMENTING
));
11363 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11365 if (dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11366 dir
->state_clear(CDir::STATE_DNPINNEDFRAG
);
11368 for (auto &p
: dir
->items
) {
11369 CDentry
*dn
= p
.second
;
11370 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11371 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11372 dn
->put(CDentry::PIN_FRAGMENTING
);
11375 dir
->auth_unpin(dir
);
11378 dir
->unfreeze_dir();
11382 bool MDCache::fragment_are_all_frozen(CDir
*dir
)
11384 ceph_assert(dir
->is_frozen_dir());
11385 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11386 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11387 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11389 if (p
->first
.frag
.contains(dir
->get_frag()))
11390 return p
->second
.all_frozen
;
11396 void MDCache::fragment_freeze_inc_num_waiters(CDir
*dir
)
11398 map
<dirfrag_t
,fragment_info_t
>::iterator p
;
11399 for (p
= fragments
.lower_bound(dirfrag_t(dir
->ino(), 0));
11400 p
!= fragments
.end() && p
->first
.ino
== dir
->ino();
11402 if (p
->first
.frag
.contains(dir
->get_frag())) {
11403 p
->second
.num_remote_waiters
++;
11410 void MDCache::find_stale_fragment_freeze()
11412 dout(10) << "find_stale_fragment_freeze" << dendl
;
11413 // see comment in Migrator::find_stale_export_freeze()
11414 utime_t now
= ceph_clock_now();
11415 utime_t cutoff
= now
;
11416 cutoff
-= g_conf()->mds_freeze_tree_timeout
;
11418 for (map
<dirfrag_t
,fragment_info_t
>::iterator p
= fragments
.begin();
11419 p
!= fragments
.end(); ) {
11420 dirfrag_t df
= p
->first
;
11421 fragment_info_t
& info
= p
->second
;
11423 if (info
.all_frozen
)
11426 int total_auth_pins
= 0;
11427 for (list
<CDir
*>::iterator q
= info
.dirs
.begin();
11428 q
!= info
.dirs
.end();
11431 if (!dir
->state_test(CDir::STATE_DNPINNEDFRAG
)) {
11432 total_auth_pins
= -1;
11435 if (dir
->is_frozen_dir())
11437 total_auth_pins
+= dir
->get_auth_pins() + dir
->get_dir_auth_pins();
11439 if (total_auth_pins
< 0)
11441 if (info
.last_cum_auth_pins
!= total_auth_pins
) {
11442 info
.last_cum_auth_pins
= total_auth_pins
;
11443 info
.last_cum_auth_pins_change
= now
;
11446 if (info
.last_cum_auth_pins_change
>= cutoff
)
11448 dir
= info
.dirs
.front();
11449 if (info
.num_remote_waiters
> 0 ||
11450 (!dir
->inode
->is_root() && dir
->get_parent_dir()->is_freezing())) {
11451 dout(10) << " cancel fragmenting " << df
<< " bit " << info
.bits
<< dendl
;
11453 info
.dirs
.swap(dirs
);
11454 fragments
.erase(df
);
11455 fragment_unmark_unfreeze_dirs(dirs
);
11460 class C_MDC_FragmentPrep
: public MDCacheLogContext
{
11463 C_MDC_FragmentPrep(MDCache
*m
, MDRequestRef
& r
) : MDCacheLogContext(m
), mdr(r
) {}
11464 void finish(int r
) override
{
11465 mdcache
->_fragment_logged(mdr
);
11469 class C_MDC_FragmentStore
: public MDCacheContext
{
11472 C_MDC_FragmentStore(MDCache
*m
, MDRequestRef
& r
) : MDCacheContext(m
), mdr(r
) {}
11473 void finish(int r
) override
{
11474 mdcache
->_fragment_stored(mdr
);
11478 class C_MDC_FragmentCommit
: public MDCacheLogContext
{
11479 dirfrag_t basedirfrag
;
11482 C_MDC_FragmentCommit(MDCache
*m
, dirfrag_t df
, const MDRequestRef
& r
) :
11483 MDCacheLogContext(m
), basedirfrag(df
), mdr(r
) {}
11484 void finish(int r
) override
{
11485 mdcache
->_fragment_committed(basedirfrag
, mdr
);
11489 class C_IO_MDC_FragmentPurgeOld
: public MDCacheIOContext
{
11490 dirfrag_t basedirfrag
;
11494 C_IO_MDC_FragmentPurgeOld(MDCache
*m
, dirfrag_t f
, int b
,
11495 const MDRequestRef
& r
) :
11496 MDCacheIOContext(m
), basedirfrag(f
), bits(b
), mdr(r
) {}
11497 void finish(int r
) override
{
11498 ceph_assert(r
== 0 || r
== -ENOENT
);
11499 mdcache
->_fragment_old_purged(basedirfrag
, bits
, mdr
);
11501 void print(ostream
& out
) const override
{
11502 out
<< "fragment_purge_old(" << basedirfrag
<< ")";
11506 void MDCache::fragment_frozen(MDRequestRef
& mdr
, int r
)
11508 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11509 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11510 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11511 dout(7) << "fragment_frozen " << basedirfrag
<< " must have aborted" << dendl
;
11512 request_finish(mdr
);
11516 ceph_assert(r
== 0);
11517 fragment_info_t
& info
= it
->second
;
11518 dout(10) << "fragment_frozen " << basedirfrag
.frag
<< " by " << info
.bits
11519 << " on " << info
.dirs
.front()->get_inode() << dendl
;
11521 info
.all_frozen
= true;
11522 dispatch_fragment_dir(mdr
);
11525 void MDCache::dispatch_fragment_dir(MDRequestRef
& mdr
)
11527 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11528 map
<dirfrag_t
,fragment_info_t
>::iterator it
= fragments
.find(basedirfrag
);
11529 if (it
== fragments
.end() || it
->second
.mdr
!= mdr
) {
11530 dout(7) << "dispatch_fragment_dir " << basedirfrag
<< " must have aborted" << dendl
;
11531 request_finish(mdr
);
11535 fragment_info_t
& info
= it
->second
;
11536 CInode
*diri
= info
.dirs
.front()->get_inode();
11538 dout(10) << "dispatch_fragment_dir " << basedirfrag
<< " bits " << info
.bits
11539 << " on " << *diri
<< dendl
;
11540 if (!mdr
->aborted
) {
11541 MutationImpl::LockOpVec lov
;
11542 lov
.add_wrlock(&diri
->dirfragtreelock
);
11543 // prevent a racing gather on any other scatterlocks too
11544 lov
.add_wrlock(&diri
->nestlock
);
11545 lov
.add_wrlock(&diri
->filelock
);
11546 if (!mds
->locker
->acquire_locks(mdr
, lov
, NULL
, true))
11551 if (mdr
->aborted
) {
11552 dout(10) << " can't auth_pin " << *diri
<< ", requeuing dir "
11553 << info
.dirs
.front()->dirfrag() << dendl
;
11555 mds
->balancer
->queue_split(info
.dirs
.front(), false);
11557 mds
->balancer
->queue_merge(info
.dirs
.front());
11558 fragment_unmark_unfreeze_dirs(info
.dirs
);
11559 fragments
.erase(it
);
11560 request_finish(mdr
);
11564 mdr
->ls
= mds
->mdlog
->get_current_segment();
11565 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_PREPARE
, basedirfrag
, info
.bits
);
11566 mds
->mdlog
->start_entry(le
);
11568 for (list
<CDir
*>::iterator p
= info
.dirs
.begin(); p
!= info
.dirs
.end(); ++p
) {
11570 dirfrag_rollback rollback
;
11571 rollback
.fnode
= dir
->fnode
;
11572 le
->add_orig_frag(dir
->get_frag(), &rollback
);
11576 MDSContext::vec waiters
;
11577 adjust_dir_fragments(diri
, info
.dirs
, basedirfrag
.frag
, info
.bits
,
11578 info
.resultfrags
, waiters
, false);
11579 if (g_conf()->mds_debug_frag
)
11580 diri
->verify_dirfrags();
11581 mds
->queue_waiters(waiters
);
11583 for (const auto& fg
: le
->orig_frags
)
11584 ceph_assert(!diri
->dirfragtree
.is_leaf(fg
));
11586 le
->metablob
.add_dir_context(*info
.resultfrags
.begin());
11587 for (list
<CDir
*>::iterator p
= info
.resultfrags
.begin();
11588 p
!= info
.resultfrags
.end();
11590 if (diri
->is_auth()) {
11591 le
->metablob
.add_fragmented_dir(*p
, false, false);
11593 (*p
)->state_set(CDir::STATE_DIRTYDFT
);
11594 le
->metablob
.add_fragmented_dir(*p
, false, true);
11599 if (diri
->is_auth()) {
11600 // journal dirfragtree
11601 auto &pi
= diri
->project_inode();
11602 pi
.inode
.version
= diri
->pre_dirty();
11603 journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
11605 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
11606 mdr
->ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
11607 mdr
->add_updated_lock(&diri
->dirfragtreelock
);
11612 mds->locker->mark_updated_scatterlock(&diri->filelock);
11613 mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
11614 mut->add_updated_lock(&diri->filelock);
11617 mds->locker->mark_updated_scatterlock(&diri->nestlock);
11618 mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
11619 mut->add_updated_lock(&diri->nestlock);
11622 add_uncommitted_fragment(basedirfrag
, info
.bits
, le
->orig_frags
, mdr
->ls
);
11623 mds
->server
->submit_mdlog_entry(le
, new C_MDC_FragmentPrep(this, mdr
),
11625 mds
->mdlog
->flush();
11628 void MDCache::_fragment_logged(MDRequestRef
& mdr
)
11630 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11631 auto& info
= fragments
.at(basedirfrag
);
11632 CInode
*diri
= info
.resultfrags
.front()->get_inode();
11634 dout(10) << "fragment_logged " << basedirfrag
<< " bits " << info
.bits
11635 << " on " << *diri
<< dendl
;
11636 mdr
->mark_event("prepare logged");
11638 if (diri
->is_auth())
11639 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
11641 mdr
->apply(); // mark scatterlock
11643 // store resulting frags
11644 MDSGatherBuilder
gather(g_ceph_context
, new C_MDC_FragmentStore(this, mdr
));
11646 for (list
<CDir
*>::iterator p
= info
.resultfrags
.begin();
11647 p
!= info
.resultfrags
.end();
11650 dout(10) << " storing result frag " << *dir
<< dendl
;
11652 // freeze and store them too
11653 dir
->auth_pin(this);
11654 dir
->state_set(CDir::STATE_FRAGMENTING
);
11655 dir
->commit(0, gather
.new_sub(), true); // ignore authpinnability
11661 void MDCache::_fragment_stored(MDRequestRef
& mdr
)
11663 dirfrag_t basedirfrag
= mdr
->more()->fragment_base
;
11664 fragment_info_t
&info
= fragments
.at(basedirfrag
);
11665 CDir
*first
= info
.resultfrags
.front();
11666 CInode
*diri
= first
->get_inode();
11668 dout(10) << "fragment_stored " << basedirfrag
<< " bits " << info
.bits
11669 << " on " << *diri
<< dendl
;
11670 mdr
->mark_event("new frags stored");
11673 mds_rank_t diri_auth
= (first
->is_subtree_root() && !diri
->is_auth()) ?
11674 diri
->authority().first
: CDIR_AUTH_UNKNOWN
;
11675 for (const auto &p
: first
->get_replicas()) {
11676 if (mds
->mdsmap
->get_state(p
.first
) < MDSMap::STATE_REJOIN
||
11677 (mds
->mdsmap
->get_state(p
.first
) == MDSMap::STATE_REJOIN
&&
11678 rejoin_gather
.count(p
.first
)))
11681 auto notify
= MMDSFragmentNotify::create(basedirfrag
, info
.bits
, mdr
->reqid
.tid
);
11682 if (diri_auth
!= CDIR_AUTH_UNKNOWN
&& // subtree root
11683 diri_auth
!= p
.first
) { // not auth mds of diri
11685 * In the nornal case, mds does not trim dir inode whose child dirfrags
11686 * are likely being fragmented (see trim_inode()). But when fragmenting
11687 * subtree roots, following race can happen:
11689 * - mds.a (auth mds of dirfrag) sends fragment_notify message to
11690 * mds.c and drops wrlock on dirfragtreelock.
11691 * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
11692 * SYNC and send lock message mds.c
11693 * - mds.c receives the lock message and changes dirfragtreelock state
11695 * - mds.c trim dirfrag and dir inode from its cache
11696 * - mds.c receives the fragment_notify message
11698 * So we need to ensure replicas have received the notify, then unlock
11699 * the dirfragtreelock.
11701 notify
->mark_ack_wanted();
11702 info
.notify_ack_waiting
.insert(p
.first
);
11705 // freshly replicate new dirs to peers
11706 for (list
<CDir
*>::iterator q
= info
.resultfrags
.begin();
11707 q
!= info
.resultfrags
.end();
11709 replicate_dir(*q
, p
.first
, notify
->basebl
);
11711 mds
->send_message_mds(notify
, p
.first
);
11715 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_COMMIT
, basedirfrag
, info
.bits
);
11716 mds
->mdlog
->start_submit_entry(le
, new C_MDC_FragmentCommit(this, basedirfrag
, mdr
));
11719 // unfreeze resulting frags
11720 for (list
<CDir
*>::iterator p
= info
.resultfrags
.begin();
11721 p
!= info
.resultfrags
.end();
11724 dout(10) << " result frag " << *dir
<< dendl
;
11726 for (auto &p
: dir
->items
) {
11727 CDentry
*dn
= p
.second
;
11728 ceph_assert(dn
->state_test(CDentry::STATE_FRAGMENTING
));
11729 dn
->state_clear(CDentry::STATE_FRAGMENTING
);
11730 dn
->put(CDentry::PIN_FRAGMENTING
);
11734 dir
->unfreeze_dir();
11737 if (info
.notify_ack_waiting
.empty()) {
11738 fragment_drop_locks(info
);
11740 mds
->locker
->drop_locks_for_fragment_unfreeze(mdr
.get());
11744 void MDCache::_fragment_committed(dirfrag_t basedirfrag
, const MDRequestRef
& mdr
)
11746 dout(10) << "fragment_committed " << basedirfrag
<< dendl
;
11748 mdr
->mark_event("commit logged");
11750 ufragment
&uf
= uncommitted_fragments
.at(basedirfrag
);
11752 // remove old frags
11753 C_GatherBuilder
gather(
11756 new C_IO_MDC_FragmentPurgeOld(this, basedirfrag
, uf
.bits
, mdr
),
11759 SnapContext nullsnapc
;
11760 object_locator_t
oloc(mds
->mdsmap
->get_metadata_pool());
11761 for (const auto& fg
: uf
.old_frags
) {
11762 object_t oid
= CInode::get_object_name(basedirfrag
.ino
, fg
, "");
11763 ObjectOperation op
;
11764 if (fg
== frag_t()) {
11765 // backtrace object
11766 dout(10) << " truncate orphan dirfrag " << oid
<< dendl
;
11770 dout(10) << " removing orphan dirfrag " << oid
<< dendl
;
11773 mds
->objecter
->mutate(oid
, oloc
, op
, nullsnapc
,
11774 ceph::real_clock::now(),
11775 0, gather
.new_sub());
11778 ceph_assert(gather
.has_subs());
11782 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag
, int bits
, const MDRequestRef
& mdr
)
11784 dout(10) << "fragment_old_purged " << basedirfrag
<< dendl
;
11786 mdr
->mark_event("old frags purged");
11788 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_FINISH
, basedirfrag
, bits
);
11789 mds
->mdlog
->start_submit_entry(le
);
11791 finish_uncommitted_fragment(basedirfrag
, EFragment::OP_FINISH
);
11795 mds
->logger
->inc(l_mds_dir_split
);
11797 mds
->logger
->inc(l_mds_dir_merge
);
11802 auto it
= fragments
.find(basedirfrag
);
11803 ceph_assert(it
!= fragments
.end());
11804 it
->second
.finishing
= true;
11805 if (it
->second
.notify_ack_waiting
.empty())
11806 fragment_maybe_finish(it
);
11808 mdr
->mark_event("wating for notify acks");
11812 void MDCache::fragment_drop_locks(fragment_info_t
& info
)
11814 mds
->locker
->drop_locks(info
.mdr
.get());
11815 request_finish(info
.mdr
);
11816 //info.mdr.reset();
11819 void MDCache::fragment_maybe_finish(const fragment_info_iterator
& it
)
11821 if (!it
->second
.finishing
)
11824 // unmark & auth_unpin
11825 for (const auto &dir
: it
->second
.resultfrags
) {
11826 dir
->state_clear(CDir::STATE_FRAGMENTING
);
11827 dir
->auth_unpin(this);
11829 // In case the resulting fragments are beyond the split size,
11830 // we might need to split them again right away (they could
11831 // have been taking inserts between unfreezing and getting
11833 mds
->balancer
->maybe_fragment(dir
, false);
11836 fragments
.erase(it
);
11840 void MDCache::handle_fragment_notify_ack(const MMDSFragmentNotifyAck::const_ref
&ack
)
11842 dout(10) << "handle_fragment_notify_ack " << *ack
<< " from " << ack
->get_source() << dendl
;
11843 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
11845 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
11849 auto it
= fragments
.find(ack
->get_base_dirfrag());
11850 if (it
== fragments
.end() ||
11851 it
->second
.get_tid() != ack
->get_tid()) {
11852 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl
;
11856 if (it
->second
.notify_ack_waiting
.erase(from
) &&
11857 it
->second
.notify_ack_waiting
.empty()) {
11858 fragment_drop_locks(it
->second
);
11859 fragment_maybe_finish(it
);
11863 void MDCache::handle_fragment_notify(const MMDSFragmentNotify::const_ref
¬ify
)
11865 dout(10) << "handle_fragment_notify " << *notify
<< " from " << notify
->get_source() << dendl
;
11866 mds_rank_t from
= mds_rank_t(notify
->get_source().num());
11868 if (mds
->get_state() < MDSMap::STATE_REJOIN
) {
11872 CInode
*diri
= get_inode(notify
->get_ino());
11874 frag_t base
= notify
->get_basefrag();
11875 int bits
= notify
->get_bits();
11878 if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
11879 (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
11880 dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
11881 << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl;
11887 MDSContext::vec waiters
;
11888 list
<CDir
*> resultfrags
;
11889 adjust_dir_fragments(diri
, base
, bits
, resultfrags
, waiters
, false);
11890 if (g_conf()->mds_debug_frag
)
11891 diri
->verify_dirfrags();
11893 for (list
<CDir
*>::iterator p
= resultfrags
.begin(); p
!= resultfrags
.end(); ++p
)
11894 diri
->take_dir_waiting((*p
)->get_frag(), waiters
);
11896 // add new replica dirs values
11897 auto p
= notify
->basebl
.cbegin();
11899 add_replica_dir(p
, diri
, from
, waiters
);
11901 mds
->queue_waiters(waiters
);
11906 if (notify
->is_ack_wanted()) {
11907 auto ack
= MMDSFragmentNotifyAck::create(notify
->get_base_dirfrag(),
11908 notify
->get_bits(), notify
->get_tid());
11909 mds
->send_message_mds(ack
, from
);
11913 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag
, int bits
, const frag_vec_t
& old_frags
,
11914 LogSegment
*ls
, bufferlist
*rollback
)
11916 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag
<< " bits " << bits
<< dendl
;
11917 ceph_assert(!uncommitted_fragments
.count(basedirfrag
));
11918 ufragment
& uf
= uncommitted_fragments
[basedirfrag
];
11919 uf
.old_frags
= old_frags
;
11922 ls
->uncommitted_fragments
.insert(basedirfrag
);
11924 uf
.rollback
.swap(*rollback
);
11927 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag
, int op
)
11929 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
11930 << " op " << EFragment::op_name(op
) << dendl
;
11931 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
11932 if (it
!= uncommitted_fragments
.end()) {
11933 ufragment
& uf
= it
->second
;
11934 if (op
!= EFragment::OP_FINISH
&& !uf
.old_frags
.empty()) {
11935 uf
.committed
= true;
11937 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
11938 mds
->queue_waiters(uf
.waiters
);
11939 uncommitted_fragments
.erase(it
);
11944 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag
, frag_vec_t
&& old_frags
)
11946 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
11947 << " old_frags (" << old_frags
<< ")" << dendl
;
11948 map
<dirfrag_t
, ufragment
>::iterator it
= uncommitted_fragments
.find(basedirfrag
);
11949 if (it
!= uncommitted_fragments
.end()) {
11950 ufragment
& uf
= it
->second
;
11951 if (!uf
.old_frags
.empty()) {
11952 uf
.old_frags
= std::move(old_frags
);
11953 uf
.committed
= true;
11955 uf
.ls
->uncommitted_fragments
.erase(basedirfrag
);
11956 uncommitted_fragments
.erase(it
);
11961 void MDCache::rollback_uncommitted_fragments()
11963 dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments
.size() << " pending" << dendl
;
11964 for (map
<dirfrag_t
, ufragment
>::iterator p
= uncommitted_fragments
.begin();
11965 p
!= uncommitted_fragments
.end();
11967 ufragment
&uf
= p
->second
;
11968 CInode
*diri
= get_inode(p
->first
.ino
);
11971 if (uf
.committed
) {
11972 _fragment_committed(p
->first
, MDRequestRef());
11976 dout(10) << " rolling back " << p
->first
<< " refragment by " << uf
.bits
<< " bits" << dendl
;
11978 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
11979 EFragment
*le
= new EFragment(mds
->mdlog
, EFragment::OP_ROLLBACK
, p
->first
, uf
.bits
);
11980 mds
->mdlog
->start_entry(le
);
11981 bool diri_auth
= (diri
->authority() != CDIR_AUTH_UNDEF
);
11983 frag_vec_t old_frags
;
11984 diri
->dirfragtree
.get_leaves_under(p
->first
.frag
, old_frags
);
11986 list
<CDir
*> resultfrags
;
11987 if (uf
.old_frags
.empty()) {
11988 // created by old format EFragment
11989 MDSContext::vec waiters
;
11990 adjust_dir_fragments(diri
, p
->first
.frag
, -uf
.bits
, resultfrags
, waiters
, true);
11992 auto bp
= uf
.rollback
.cbegin();
11993 for (const auto& fg
: uf
.old_frags
) {
11994 CDir
*dir
= force_dir_fragment(diri
, fg
);
11995 resultfrags
.push_back(dir
);
11997 dirfrag_rollback rollback
;
11998 decode(rollback
, bp
);
12000 dir
->set_version(rollback
.fnode
.version
);
12001 dir
->fnode
= rollback
.fnode
;
12003 dir
->_mark_dirty(ls
);
12005 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
12006 dout(10) << " dirty nestinfo on " << *dir
<< dendl
;
12007 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
12008 ls
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
12010 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
12011 dout(10) << " dirty fragstat on " << *dir
<< dendl
;
12012 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
12013 ls
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
12016 le
->add_orig_frag(dir
->get_frag());
12017 le
->metablob
.add_dir_context(dir
);
12019 le
->metablob
.add_fragmented_dir(dir
, true, false);
12021 dout(10) << " dirty dirfragtree on " << *dir
<< dendl
;
12022 dir
->state_set(CDir::STATE_DIRTYDFT
);
12023 le
->metablob
.add_fragmented_dir(dir
, true, true);
12029 auto &pi
= diri
->project_inode();
12030 pi
.inode
.version
= diri
->pre_dirty();
12031 diri
->pop_and_dirty_projected_inode(ls
); // hacky
12032 le
->metablob
.add_primary_dentry(diri
->get_projected_parent_dn(), diri
, true);
12034 mds
->locker
->mark_updated_scatterlock(&diri
->dirfragtreelock
);
12035 ls
->dirty_dirfrag_dirfragtree
.push_back(&diri
->item_dirty_dirfrag_dirfragtree
);
12038 if (g_conf()->mds_debug_frag
)
12039 diri
->verify_dirfrags();
12041 for (const auto& leaf
: old_frags
) {
12042 ceph_assert(!diri
->dirfragtree
.is_leaf(leaf
));
12045 mds
->mdlog
->submit_entry(le
);
12047 uf
.old_frags
.swap(old_frags
);
12048 _fragment_committed(p
->first
, MDRequestRef());
12052 void MDCache::force_readonly()
12057 dout(1) << "force file system read-only" << dendl
;
12058 mds
->clog
->warn() << "force file system read-only";
12062 mds
->server
->force_clients_readonly();
12064 // revoke write caps
12066 for (auto &p
: inode_map
) {
12067 CInode
*in
= p
.second
;
12069 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
);
12070 if (!(++count
% 1000))
12071 mds
->heartbeat_reset();
12074 mds
->mdlog
->flush();
12078 // ==============================================================
12081 void MDCache::show_subtrees(int dbl
, bool force_print
)
12083 if (g_conf()->mds_thrash_exports
)
12086 //dout(10) << "show_subtrees" << dendl;
12088 if (!g_conf()->subsys
.should_gather(ceph_subsys_mds
, dbl
))
12089 return; // i won't print anything.
12091 if (subtrees
.empty()) {
12092 dout(ceph::dout::need_dynamic(dbl
)) << "show_subtrees - no subtrees"
12097 if (!force_print
&& subtrees
.size() > SUBTREES_COUNT_THRESHOLD
&&
12098 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12099 dout(ceph::dout::need_dynamic(dbl
)) << "number of subtrees = " << subtrees
.size() << "; not "
12100 "printing subtrees" << dendl
;
12105 list
<CDir
*> basefrags
;
12106 for (set
<CInode
*>::iterator p
= base_inodes
.begin();
12107 p
!= base_inodes
.end();
12109 (*p
)->get_dirfrags(basefrags
);
12110 //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
12111 dout(15) << "show_subtrees" << dendl
;
12114 list
<pair
<CDir
*,int> > q
;
12119 for (list
<CDir
*>::iterator p
= basefrags
.begin(); p
!= basefrags
.end(); ++p
)
12120 q
.push_back(pair
<CDir
*,int>(*p
, 0));
12122 set
<CDir
*> subtrees_seen
;
12124 unsigned int depth
= 0;
12125 while (!q
.empty()) {
12126 CDir
*dir
= q
.front().first
;
12127 unsigned int d
= q
.front().second
;
12130 if (subtrees
.count(dir
) == 0) continue;
12132 subtrees_seen
.insert(dir
);
12134 if (d
> depth
) depth
= d
;
12137 //dout(25) << "saw depth " << d << " " << *dir << dendl;
12138 if (seen
.count(dir
)) dout(0) << "aah, already seen " << *dir
<< dendl
;
12139 ceph_assert(seen
.count(dir
) == 0);
12143 if (!subtrees
[dir
].empty()) {
12144 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12145 p
!= subtrees
[dir
].end();
12147 //dout(25) << " saw sub " << **p << dendl;
12148 q
.push_front(pair
<CDir
*,int>(*p
, d
+1));
12153 if (!force_print
&& depth
> SUBTREES_DEPTH_THRESHOLD
&&
12154 !g_conf()->subsys
.should_gather
<ceph_subsys_mds
, 25>()) {
12155 dout(ceph::dout::need_dynamic(dbl
)) << "max depth among subtrees = " << depth
<< "; not printing "
12156 "subtrees" << dendl
;
12161 for (list
<CDir
*>::iterator p
= basefrags
.begin(); p
!= basefrags
.end(); ++p
)
12162 q
.push_back(pair
<CDir
*,int>(*p
, 0));
12164 while (!q
.empty()) {
12165 CDir
*dir
= q
.front().first
;
12166 int d
= q
.front().second
;
12169 if (subtrees
.count(dir
) == 0) continue;
12172 while ((unsigned)d
< indent
.size())
12176 string pad
= "______________________________________";
12177 pad
.resize(depth
*2+1-indent
.size());
12178 if (!subtrees
[dir
].empty())
12179 pad
[0] = '.'; // parent
12183 if (dir
->is_auth())
12189 if (dir
->get_dir_auth().second
== CDIR_AUTH_UNKNOWN
)
12190 snprintf(s
, sizeof(s
), "%2d ", int(dir
->get_dir_auth().first
));
12192 snprintf(s
, sizeof(s
), "%2d,%2d", int(dir
->get_dir_auth().first
), int(dir
->get_dir_auth().second
));
12195 dout(ceph::dout::need_dynamic(dbl
)) << indent
<< "|_" << pad
<< s
12196 << " " << auth
<< *dir
<< dendl
;
12198 if (dir
->ino() == MDS_INO_ROOT
)
12199 ceph_assert(dir
->inode
== root
);
12200 if (dir
->ino() == MDS_INO_MDSDIR(mds
->get_nodeid()))
12201 ceph_assert(dir
->inode
== myin
);
12202 if (dir
->inode
->is_stray() && (MDS_INO_STRAY_OWNER(dir
->ino()) == mds
->get_nodeid()))
12203 ceph_assert(strays
[MDS_INO_STRAY_INDEX(dir
->ino())] == dir
->inode
);
12206 if (!subtrees
[dir
].empty()) {
12207 // more at my level?
12208 if (!q
.empty() && q
.front().second
== d
)
12213 for (set
<CDir
*>::iterator p
= subtrees
[dir
].begin();
12214 p
!= subtrees
[dir
].end();
12216 q
.push_front(pair
<CDir
*,int>(*p
, d
+2));
12220 // verify there isn't stray crap in subtree map
12222 for (map
<CDir
*, set
<CDir
*> >::iterator p
= subtrees
.begin();
12223 p
!= subtrees
.end();
12225 if (subtrees_seen
.count(p
->first
)) continue;
12226 dout(10) << "*** stray/lost entry in subtree map: " << *p
->first
<< dendl
;
12229 ceph_assert(lost
== 0);
12232 void MDCache::show_cache()
12234 dout(7) << "show_cache" << dendl
;
12236 auto show_func
= [this](CInode
*in
) {
12239 dout(7) << " unlinked " << *in
<< dendl
;
12243 in
->get_dirfrags(dfs
);
12244 for (list
<CDir
*>::iterator p
= dfs
.begin(); p
!= dfs
.end(); ++p
) {
12246 dout(7) << " dirfrag " << *dir
<< dendl
;
12248 for (auto &p
: dir
->items
) {
12249 CDentry
*dn
= p
.second
;
12250 dout(7) << " dentry " << *dn
<< dendl
;
12251 CDentry::linkage_t
*dnl
= dn
->get_linkage();
12252 if (dnl
->is_primary() && dnl
->get_inode())
12253 dout(7) << " inode " << *dnl
->get_inode() << dendl
;
12258 for (auto &p
: inode_map
)
12259 show_func(p
.second
);
12260 for (auto &p
: snap_inode_map
)
12261 show_func(p
.second
);
12264 void MDCache::cache_status(Formatter
*f
)
12266 f
->open_object_section("cache");
12268 f
->open_object_section("pool");
12269 mempool::get_pool(mempool::mds_co::id
).dump(f
);
12270 f
->close_section();
12272 f
->close_section();
// Recursively dump the subtree rooted at 'in' into the Formatter.
// Children are emitted before the parent (post-order), so the output
// array lists descendants first.  max_depth < 0 means unlimited depth.
void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
{
  ceph_assert(in);
  // depth limit reached; prune this branch
  if ((max_depth >= 0) && (cur_depth > max_depth)) {
    return;
  }
  list<CDir*> ls;
  in->get_dirfrags(ls);
  for (const auto &subdir : ls) {
    for (const auto &p : subdir->items) {
      CDentry *dn = p.second;
      // NOTE: this 'in' intentionally shadows the parameter; it is the
      // child inode linked by the dentry (may be null for remote/null
      // linkage, hence the check).
      CInode *in = dn->get_linkage()->get_inode();
      if (in) {
        dump_tree(in, cur_depth + 1, max_depth, f);
      }
    }
  }
  // finally dump this inode itself, including its dirfrags
  f->open_object_section("inode");
  in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
  f->close_section();
}
12297 int MDCache::dump_cache(std::string_view file_name
)
12299 return dump_cache(file_name
, NULL
);
12302 int MDCache::dump_cache(Formatter
*f
)
12304 return dump_cache(std::string_view(""), f
);
/**
 * Dump the metadata cache, either to a Formatter, if
 * provided, else to a plain text file.
 *
 * @param fn  output file path; when empty a default
 *            "cachedump.<epoch>.mds<rank>" name is generated
 * @param f   optional Formatter; when non-null, output goes to it and
 *            the file path is ignored
 * @return 0 on success, negative error code on write failure
 */
int MDCache::dump_cache(std::string_view fn, Formatter *f)
{
  int r = 0;

  // dumping large caches may cause mds to hang or worse get killed.
  // so, disallow the dump if the cache size exceeds the configured
  // threshold, which is 1G for formatter and unlimited for file (note
  // that this can be jacked up by the admin... and is nothing but foot
  // shooting, but the option itself is for devs and hence dangerous to
  // tune). TODO: remove this when fixed.
  uint64_t threshold = f ?
    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");

  if (threshold && cache_size() > threshold) {
    if (f) {
      std::stringstream ss;
      ss << "cache usage exceeds dump threshold";
      f->open_object_section("result");
      f->dump_string("error", ss.str());
      f->close_section();
    } else {
      derr << "cache usage exceeds dump threshold" << dendl;
      r = -EINVAL;
    }
    return r;
  }

  r = 0;
  int fd = -1;

  if (f) {
    f->open_array_section("inodes");
  } else {
    char path[PATH_MAX] = "";
    if (fn.length())
      snprintf(path, sizeof path, "%s", fn.data());
    else
      snprintf(path, sizeof path, "cachedump.%d.mds%d",
	       (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));

    dout(1) << "dump_cache to " << path << dendl;

    // O_EXCL: refuse to clobber an existing dump file
    fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
    if (fd < 0) {
      derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
      return errno;
    }
  }

  // Dump a single inode (plus its dirfrags and dentries) either to the
  // Formatter or, line-oriented, to the open fd.  Returns 1 on success,
  // negative error from safe_write() on failure.
  auto dump_func = [fd, f](CInode *in) {
    int r;
    if (f) {
      f->open_object_section("inode");
      in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
      f->close_section();
      return 1;
    }
    ostringstream ss;
    ss << *in << std::endl;
    std::string s = ss.str();
    r = safe_write(fd, s.c_str(), s.length());
    if (r < 0)
      return r;
    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    for (auto &dir : dfs) {
      ostringstream tt;
      tt << " " << *dir << std::endl;
      std::string t = tt.str();
      r = safe_write(fd, t.c_str(), t.length());
      if (r < 0)
	return r;
      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	ostringstream uu;
	uu << "  " << *dn << std::endl;
	std::string u = uu.str();
	r = safe_write(fd, u.c_str(), u.length());
	if (r < 0)
	  return r;
      }
      // opportunistic consistency check while we are walking anyway
      dir->check_rstats();
    }
    return 1;
  };

  for (auto &p : inode_map) {
    r = dump_func(p.second);
    if (r < 0)
      goto out;
  }
  for (auto &p : snap_inode_map) {
    r = dump_func(p.second);
    if (r < 0)
      goto out;
  }
  r = 0;

 out:
  if (f) {
    f->close_section();  // inodes
  } else {
    ::close(fd);
  }
  return r;
}
// Context that re-dispatches an internal/client request after whatever
// it was waiting for (lock, fetch, unfreeze) becomes available.
C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
  : MDSInternalContext(c->mds), cache(c), mdr(r)
{}

void C_MDS_RetryRequest::finish(int r)
{
  // count the retry, then push the request back through dispatch
  mdr->retry++;
  cache->dispatch_request(mdr);
}
// Completion for the "scrub start" admin command: formats the command
// result and fires the caller-supplied finisher.  For recursive scrubs
// the finisher may be taken over (take_finisher) by the scrub machinery
// so that it only completes when the whole recursion is done.
class C_MDS_EnqueueScrub : public Context
{
  std::string tag;
  Formatter *formatter;
  Context *on_finish;
public:
  ScrubHeaderRef header;
  C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
    tag(tag), formatter(f), on_finish(fin), header(nullptr) {}

  // Transfer ownership of the finisher to the caller; after this,
  // finish() will not complete it (on_finish is cleared).
  Context *take_finisher() {
    Context *fin = on_finish;
    on_finish = NULL;
    return fin;
  }

  void finish(int r) override {
    if (r == 0) {
      // since recursive scrub is asynchronous, dump minimal output
      // to not upset cli tools.
      if (header && header->get_recursive()) {
	formatter->open_object_section("results");
	formatter->dump_int("return_code", 0);
	formatter->dump_string("scrub_tag", tag);
	formatter->dump_string("mode", "asynchronous");
	formatter->close_section(); // results
      }
    } else { // we failed the lookup or something; dump ourselves
      formatter->open_object_section("results");
      formatter->dump_int("return_code", r);
      formatter->close_section(); // results
      r = 0; // already dumped in formatter
    }
    // on_finish may have been handed off via take_finisher()
    if (on_finish)
      on_finish->complete(r);
  }
};
// Entry point for the "scrub start" admin command: build an internal
// ENQUEUE_SCRUB request for 'path' and kick off enqueue_scrub_work().
//
// @param path      filesystem path to scrub; "~mdsdir" targets this
//                  rank's private mdsdir
// @param tag       scrub tag; when empty a random UUID is generated and
//                  the scrub is marked internal
// @param force/recursive/repair  scrub behavior flags (stored in header)
// @param f         Formatter for command output
// @param fin       completion to fire when the command finishes
void MDCache::enqueue_scrub(
    std::string_view path,
    std::string_view tag,
    bool force, bool recursive, bool repair,
    Formatter *f, Context *fin)
{
  dout(10) << __func__ << " " << path << dendl;
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
  if (path == "~mdsdir") {
    // special-case: scrub this rank's own mdsdir
    filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
    mdr->set_filepath(fp);
  } else {
    filepath fp(path);
    mdr->set_filepath(path);
  }

  bool is_internal = false;
  std::string tag_str(tag);
  if (tag_str.empty()) {
    // no user tag: generate one so results can still be correlated
    uuid_d uuid_gen;
    uuid_gen.generate_random();
    tag_str = uuid_gen.to_string();
    is_internal = true;
  }

  C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
  cs->header = std::make_shared<ScrubHeader>(
      tag_str, is_internal, force, recursive, repair, f);

  mdr->internal_op_finish = cs;
  enqueue_scrub_work(mdr);
}
// Resolve the scrub target path, take the needed locks, and hand the
// inode to the ScrubStack.  May be re-entered via C_MDS_RetryRequest
// when path traversal or locking has to wait.
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
  MutationImpl::LockOpVec lov;
  // traverses the filepath stored on mdr; returns null (and queues a
  // retry internally) if the path is not yet resolvable
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
  if (NULL == in)
    return;

  // TODO: Remove this restriction
  ceph_assert(in->is_auth());

  bool locked = mds->locker->acquire_locks(mdr, lov);
  if (!locked)
    return;

  C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
  ScrubHeaderRef header = cs->header;

  // Cannot scrub same dentry twice at same time
  if (in->scrub_is_in_progress()) {
    mds->server->respond_to_request(mdr, -EBUSY);
    return;
  }

  header->set_origin(in);

  Context *fin;
  if (header->get_recursive()) {
    // pin the origin for the whole recursion; unpinned in the callback
    header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
    fin = new MDSInternalContextWrapper(mds,
	    new FunctionContext([this, header](int r) {
	      recursive_scrub_finish(header);
	      header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
	    })
	  );
  } else {
    // non-recursive: the command completion fires when this inode is done
    fin = cs->take_finisher();
  }

  // If the scrub did some repair, then flush the journal at the end of
  // the scrub.  Otherwise in the case of e.g. rewriting a backtrace
  // the on disk state will still look damaged.
  auto scrub_finish = new FunctionContext([this, header, fin](int r){
    if (!header->get_repaired()) {
      // nothing was repaired; complete immediately
      if (fin)
        fin->complete(r);
      return;
    }

    auto flush_finish = new FunctionContext([this, fin](int r){
      dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
      mds->mdlog->trim_all();

      if (fin) {
	// wait for every expiring segment before firing the finisher,
	// so the repaired state is durably on disk
	MDSGatherBuilder gather(g_ceph_context);
	auto& expiring_segments = mds->mdlog->get_expiring_segments();
	for (auto logseg : expiring_segments)
	  logseg->wait_for_expiry(gather.new_sub());
	ceph_assert(gather.has_subs());
	gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
	gather.activate();
      }
    });

    dout(4) << "Flushing journal because scrub did some repairs" << dendl;
    mds->mdlog->start_new_segment();
    mds->mdlog->flush();
    mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
  });

  if (!header->get_recursive()) {
    // single inode: scrub it ahead of anything already queued
    mds->scrubstack->enqueue_inode_top(in, header,
				       new MDSInternalContextWrapper(mds, scrub_finish));
  } else {
    mds->scrubstack->enqueue_inode_bottom(in, header,
				       new MDSInternalContextWrapper(mds, scrub_finish));
  }

  mds->server->respond_to_request(mdr, 0);
}
// Called when a recursive scrub completes.  For a forced, repairing
// scrub of a base directory on a single-MDS system, tell the rank so
// the snapserver can learn that old-format snaprealms were upgraded.
void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
{
  if (header->get_origin()->is_base() &&
      header->get_force() && header->get_repair()) {
    // notify snapserver that base directory is recursively scrubbed.
    // After both root and mdsdir are recursively scrubbed, snapserver
    // knows that all old format snaprealms are converted to the new
    // format.
    if (mds->mdsmap->get_num_in_mds() == 1 &&
	mds->mdsmap->get_num_failed_mds() == 0 &&
	mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
      mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
    }
  }
}
// Log-completion context for internal requests: once the journal entry
// is safe, apply the projected state and answer the request.
struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
  MDRequestRef mdr;
  C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
    MDCacheLogContext(c), mdr(m) {}
  void finish(int r) override {
    // commit projected fnodes/inodes now that the event is durable
    mdr->apply();
    get_mds()->server->respond_to_request(mdr, r);
  }
};
// Kick off an internal request that recomputes and, if needed, repairs
// the fragstat/rstat of a single dirfrag.
void MDCache::repair_dirfrag_stats(CDir *dir)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
  // keep the dirfrag pinned for the lifetime of the request
  mdr->pin(dir);
  mdr->internal_op_private = dir;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_dirfrag_stats_work(mdr);
}
// Recompute a dirfrag's fragstat/rstat from its (complete) dentry list
// and, if they differ from the projected fnode, journal a corrected
// version.  Re-entered via C_MDS_RetryRequest while waiting for
// unfreeze, locks, or a fetch.
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    // frozen/freezing: wait for unfreeze, dropping our locks and local
    // pins so we don't deadlock the freeze
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    if (!mdr->remote_auth_pins.empty())
      mds->locker->notify_freeze_waiter(dir);
    return;
  }

  mdr->auth_pin(dir);

  MutationImpl::LockOpVec lov;
  CInode *diri = dir->inode;
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  if (!dir->is_complete()) {
    // need every dentry in memory to recompute the stats
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  // recompute stats over head dentries only (snapshotted ones are
  // accounted elsewhere)
  frag_info_t frag_info;
  nest_info_t nest_info;
  for (auto it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue;
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
	frag_info.nsubdirs++;
      else
	frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;
  }

  fnode_t *pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  // project and journal the corrected fnode
  pf = dir->project_fnode();
  pf->version = dir->pre_dirty();
  mdr->add_projected_fnode(dir);

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    // never move mtime/change_attr backwards
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    // never move rctime backwards
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
// Kick off an internal request that repairs a directory inode's
// dirstat/rstat by forcing a scatter-gather over all its dirfrags.
void MDCache::repair_inode_stats(CInode *diri)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
  // keep the inode pinned for the lifetime of the request
  mdr->pin(diri);
  mdr->internal_op_private = diri;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_inode_stats_work(mdr);
}
// Two-phase repair of a directory inode's stats.  Phase 1 (skipped on
// re-entry once mdr->ls is set): fetch every dirfrag and dirty the
// filelock/nestlock scatterlocks.  Phase 2: rdlock them, which forces
// the scatter-gather that reconciles fragstat/rstat, then verify.
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -ENOTDIR);
    return;
  }

  MutationImpl::LockOpVec lov;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	ceph_assert(mdr->is_auth_pinned(diri));
	dir = diri->get_or_open_dirfrag(this, leaf);
      }
      if (dir->get_version() == 0) {
	ceph_assert(dir->is_auth());
	// unfetched dirfrag: load it, then restart this function
	dir->fetch(new C_MDS_RetryRequest(this, mdr));
	return;
      }
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  mds->locker->drop_locks(mdr.get());

do_rdlocks:
  // force the scatter-gather process
  lov.clear();
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_rdlock(&diri->nestlock);
  lov.add_rdlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs = 1; // it gets one to account for self
  if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
    nest_info.rsnaps = srnode->snaps.size();

  // re-sum accounted stats over every dirfrag and compare to the inode
  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      ceph_assert(dir);
      ceph_assert(dir->get_version() > 0);
      dir_info.add(dir->fnode.accounted_fragstat);
      nest_info.add(dir->fnode.accounted_rstat);
    }
  }

  if (!dir_info.same_sums(diri->inode.dirstat) ||
      !nest_info.same_sums(diri->inode.rstat)) {
    // best-effort: report only, the scatter-gather already had its chance
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
	     << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
// Kick off an internal request that rewrites an inode's snaprealm in
// the new (post-mimic) format.
void MDCache::upgrade_inode_snaprealm(CInode *in)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
  // keep the inode pinned for the lifetime of the request
  mdr->pin(in);
  mdr->internal_op_private = in;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  upgrade_inode_snaprealm_work(mdr);
}
// Take an xlock on the inode's snaplock, project the inode (which
// upgrades the snaprealm format as a side effect), and journal it.
void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
{
  CInode *in = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *in << dendl;

  if (!in->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  MutationImpl::LockOpVec lov;
  mds->locker->include_snap_rdlocks(in, lov);
  // we need snaplock exclusively, not shared: swap the rdlock for xlock
  lov.erase_rdlock(&in->snaplock);
  lov.add_xlock(&in->snaplock);

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // project_snaprealm() upgrades snaprealm format
  auto &pi = in->project_inode(false, true);
  mdr->add_projected_inode(in);
  pi.inode.version = in->pre_dirty();

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
  mds->mdlog->start_entry(le);

  if (in->is_base()) {
    le->metablob.add_root(true, in);
  } else {
    // non-base inode: journal it via its (projected) primary dentry
    CDentry *pdn = in->get_projected_parent_dn();
    le->metablob.add_dir_context(pdn->get_dir());
    le->metablob.add_primary_dentry(pdn, in, true);
  }

  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
// Flush the dentry at 'path' to the journal/backing store via an
// internal FLUSH request; 'fin' completes with the result (-EROFS
// immediately on a read-only filesystem).
void MDCache::flush_dentry(std::string_view path, Context *fin)
{
  if (is_readonly()) {
    dout(10) << __func__ << ": read-only FS" << dendl;
    fin->complete(-EROFS);
    return;
  }
  dout(10) << "flush_dentry " << path << dendl;
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
  filepath fp(path);
  mdr->set_filepath(fp);
  mdr->internal_op_finish = fin;
  flush_dentry_work(mdr);
}
// I/O completion that answers an MDRequest with the I/O result.
// Holds MDSRank directly (not MDCache) because it is called back from
// an I/O completion path.
class C_FinishIOMDR : public MDSContext {
protected:
  MDSRank *mds;
  MDRequestRef mdr;
  MDSRank *get_mds() override { return mds; }
public:
  C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
  void finish(int r) override { mds->server->respond_to_request(mdr, r); }
};
// Resolve the FLUSH request's path, lock the target inode, and flush
// it; responds to the request when the flush I/O completes.
void MDCache::flush_dentry_work(MDRequestRef& mdr)
{
  MutationImpl::LockOpVec lov;
  // null means traversal is still in progress (retry queued internally)
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, lov, true);
  if (NULL == in)
    return;

  // TODO: Is this necessary? Fix it if so
  ceph_assert(in->is_auth());
  bool locked = mds->locker->acquire_locks(mdr, lov);
  if (!locked)
    return;
  in->flush(new C_FinishIOMDR(mds, mdr));
}
/**
 * Initialize performance counters with global perfcounter
 * collection.
 *
 * Builds the "mds_cache" counter group, registers it with the global
 * collection, and shares the logger with the recovery queue and stray
 * manager so they can update their own counters.
 */
void MDCache::register_perfcounters()
{
  PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);

  // Stray/purge statistics
  pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64(l_mdc_num_recovering_enqueued,
              "num_recovering_enqueued", "Files waiting for recovery", "recy",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64_counter(l_mdc_recovery_completed,
                      "recovery_completed", "File recoveries completed", "recd",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // useful recovery queue statistics
  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
              "Files currently being recovered");
  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
              "Files waiting for recovery with elevated priority");
  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
                      "File recoveries started");

  // along with other stray dentries stats
  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
              "Stray dentries delayed");
  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
              "Stray dentries enqueuing for purge");
  pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
                      "Stray dentries created");
  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
                      "Stray dentries enqueued for purge");
  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
                      "Stray dentries reintegrated");
  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
                      "Stray dentries migrated");

  // low prio internal request stats
  pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
                      "Internal Request type enqueue scrub");
  pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
                      "Internal Request type export dir");
  pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
                      "Internal Request type flush");
  pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
                      "Internal Request type fragmentdir");
  pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
                      "Internal Request type frag stats");
  pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
                      "Internal Request type inode stats");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
  recovery_queue.set_logger(logger.get());
  stray_manager.set_logger(logger.get());
}
/**
 * Call this when putting references to an inode/dentry or
 * when attempting to trim it.
 *
 * If this inode is no longer linked by anyone, and this MDS
 * rank holds the primary dentry, and that dentry is in a stray
 * directory, then give up the dentry to the StrayManager, never
 * to be seen again by MDCache.
 *
 * @param delay if true, then purgeable inodes are stashed til
 *              the next trim(), rather than being purged right
 *              away.
 */
void MDCache::maybe_eval_stray(CInode *in, bool delay) {
  // still linked, a base inode, read-only fs, or too early in the
  // MDS lifecycle: never a stray candidate
  if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
      mds->get_state() <= MDSMap::STATE_REJOIN)
    return;

  CDentry *dn = in->get_projected_parent_dn();

  if (dn->state_test(CDentry::STATE_PURGING)) {
    /* We have already entered the purging process, no need
     * to re-evaluate me ! */
    return;
  }

  if (dn->get_dir()->get_inode()->is_stray()) {
    if (delay)
      stray_manager.queue_delayed(dn);
    else
      stray_manager.eval_stray(dn);
  }
}
// Clear dirty state on a stray directory inode so it can be trimmed:
// drop removable dentries from its auth, non-frozen dirfrags and, when
// it has no snaprealm, clear its dirty rstat/scatter bits.
void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
  dout(10) << __func__ << " " << *diri << dendl;
  ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
  list<CDir*> ls;
  diri->get_dirfrags(ls);
  for (auto &p : ls) {
    // frozen/freezing dirfrags must not be mutated
    if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
      p->try_remove_dentries_for_stray();
  }
  if (!diri->snaprealm) {
    if (diri->is_auth())
      diri->clear_dirty_rstat();
    diri->clear_scatter_dirty();
  }
}
13016 bool MDCache::dump_inode(Formatter
*f
, uint64_t number
) {
13017 CInode
*in
= get_inode(number
);
13021 f
->open_object_section("inode");
13022 in
->dump(f
, CInode::DUMP_DEFAULT
| CInode::DUMP_PATH
);
13023 f
->close_section();
13027 void MDCache::handle_mdsmap(const MDSMap
&mdsmap
) {
13028 // process export_pin_delayed_queue whenever a new MDSMap received
13029 auto &q
= export_pin_delayed_queue
;
13030 for (auto it
= q
.begin(); it
!= q
.end(); ) {
13032 mds_rank_t export_pin
= in
->get_export_pin(false);
13033 dout(10) << " delayed export_pin=" << export_pin
<< " on " << *in
13034 << " max_mds=" << mdsmap
.get_max_mds() << dendl
;
13035 if (export_pin
>= mdsmap
.get_max_mds()) {
13040 in
->state_clear(CInode::STATE_DELAYEDEXPORTPIN
);
13042 in
->maybe_export_pin();