// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/utility/string_view.hpp>

#include "MDBalancer.h"
#include "ScrubStack.h"
#include "SnapClient.h"

#include "include/ceph_fs.h"
#include "include/filepath.h"
#include "include/util.h"

#include "msg/Message.h"
#include "msg/Messenger.h"

#include "common/MemoryModel.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/safe_io.h"

#include "osdc/Journaler.h"
#include "osdc/Filer.h"

#include "events/ESubtreeMap.h"
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/EImportFinish.h"
#include "events/EFragment.h"
#include "events/ECommitted.h"
#include "events/ESessions.h"

#include "messages/MGenericMessage.h"

#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSCacheRejoin.h"

#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"

//#include "messages/MInodeUpdate.h"
#include "messages/MDirUpdate.h"
#include "messages/MCacheExpire.h"

#include "messages/MInodeFileCaps.h"

#include "messages/MLock.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"

#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"

#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"

#include "messages/MClientRequest.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"
#include "messages/MClientQuota.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MMDSFragmentNotify.h"

#include "messages/MGatherCaps.h"

#include "InoTable.h"

#include "common/Timer.h"

#include "perfglue/heap_profiler.h"

#include "common/config.h"
#include "include/assert.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix _prefix(_dout, mds)
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
}

set<int> SimpleLock::empty_gather_set;

/**
 * All non-I/O contexts that require a reference
 * to an MDCache instance descend from this.
 */
class MDCacheContext : public virtual MDSInternalContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};

/**
 * Only for contexts called back from an I/O completion.
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
};

class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};

MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  filer(m->objecter, m->finisher),
  exceeded_size_limit(false),
  recovery_queue(m),
  stray_manager(m, purge_queue_)
{
  migrator.reset(new Migrator(mds, this));

  for (int i = 0; i < NUM_STRAY; ++i) {
    strays[i] = NULL;
  }

  num_shadow_inodes = 0;
  num_inodes_with_caps = 0;

  max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
                        (g_conf->mds_dir_max_commit_size << 20) :
                        (0.9 *(g_conf->osd_max_write_size << 20));

  discover_last_tid = 0;
  open_ino_last_tid = 0;
  find_ino_peer_last_tid = 0;

  client_lease_durations[0] = 5.0;
  client_lease_durations[1] = 30.0;
  client_lease_durations[2] = 300.0;

  resolves_pending = false;
  rejoins_pending = false;
  cap_imports_num_opening = 0;

  opening_root = open = false;
  lru.lru_set_midpoint(cache_mid());

  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf->mds_decay_halflife);

  did_shutdown_log_cap = false;
}

MDCache::~MDCache()
{
  if (logger) {
    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
  }
}

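// log_stat: publish the current cache gauges (inode counts, pins, lru
// segments, caps) to the mds perfcounters so they show up in perf dumps.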
void MDCache::log_stat()
{
  mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
  mds->logger->set(l_mds_inodes, lru.lru_get_size());
  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
  mds->logger->set(l_mds_caps, Capability::count());
}

bool MDCache::shutdown()
{
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
  }
  return true;
}

// ====================================================================
// some inode functions

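// add_inode: index a new CInode in the head or snapped inode map, record
// root/mydir/stray/base inodes, and flag the cache if it is over its limit.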
void MDCache::add_inode(CInode *in)
{
  // add to lru, inode map
  if (in->last == CEPH_NOSNAP) {
    auto &p = inode_map[in->ino()];
    assert(!p); // should be no dup inos!
    p = in;
  } else {
    auto &p = snap_inode_map[in->vino()];
    assert(!p); // should be no dup inos!
    p = in;
  }

  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == MDS_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }

  if (cache_toofull()) {
    exceeded_size_limit = true;
  }
}

void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn);   // leave dentry ... FIXME?
  }

  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->item_open_file.remove_myself();

  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  // remove from inode map
  if (o->last == CEPH_NOSNAP)
    inode_map.erase(o->ino());
  else
    snap_inode_map.erase(o->vino());

  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  assert(o->get_num_ref() == 0);
  delete o;
}

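// default layouts: file data goes to the filesystem's first data pool; the
// journal goes to the metadata pool, with segment-sized objects when
// mds_log_segment_size is set.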
file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_first_data_pool();
  return result;
}

file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
{
  file_layout_t result = file_layout_t::get_default();
  result.pool_id = mdsmap.get_metadata_pool();
  if (g_conf->mds_log_segment_size > 0) {
    result.object_size = g_conf->mds_log_segment_size;
    result.stripe_unit = g_conf->mds_log_segment_size;
  }
  return result;
}

void MDCache::init_layouts()
{
  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
}

void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                           int mode) const
{
  in->inode.ino = ino;
  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.mode = 0500 | mode;
  in->inode.size = 0;
  in->inode.ctime =
    in->inode.mtime =
    in->inode.btime = ceph_clock_now();
  in->inode.nlink = 1;
  in->inode.truncate_size = -1ull;
  in->inode.change_attr = 0;
  in->inode.export_pin = MDS_RANK_NONE;

  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
    ++in->inode.rstat.rsubdirs;
  } else {
    in->inode.layout = default_file_layout;
    ++in->inode.rstat.rfiles;
  }
  in->inode.accounted_rstat = in->inode.rstat;

  if (in->is_base()) {
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}

CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
{
  dout(0) << "creating system inode with ino:" << ino << dendl;
  CInode *in = new CInode(this);
  create_unlinked_system_inode(in, ino, mode);
  add_inode(in);
  return in;
}

CInode *MDCache::create_root_inode()
{
  CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
  i->inode.uid = g_conf->mds_root_ino_uid;
  i->inode.gid = g_conf->mds_root_ino_gid;
  i->inode.layout = default_file_layout;
  i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
  return i;
}

void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL;   //NONE;

  rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
  rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;

  root->inode.dirstat = rootdir->fnode.fragstat;
  root->inode.rstat = rootdir->fnode.rstat;
  ++root->inode.rstat.rsubdirs;
  root->inode.accounted_rstat = root->inode.rstat;

  rootdir->mark_complete();
  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->flush(gather->new_sub());
}

void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    stringstream name;
    name << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->inode.dirstat = straydir->fnode.fragstat;

    mydir->fnode.rstat.add(stray->inode.rstat);
    mydir->fnode.fragstat.nsubdirs++;

    straydir->mark_complete();
    straydir->mark_dirty(straydir->pre_dirty(), ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
  mydir->fnode.accounted_rstat = mydir->fnode.rstat;

  myin->inode.dirstat = mydir->fnode.fragstat;
  myin->inode.rstat = mydir->fnode.rstat;
  ++myin->inode.rstat.rsubdirs;
  myin->inode.accounted_rstat = myin->inode.rstat;

  mydir->mark_complete();
  mydir->mark_dirty(mydir->pre_dirty(), ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}

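// C_MDC_CreateSystemFile: log-flush completion for _create_system_file();
// pops the projected dentry and completes the caller's context.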
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;
  CDentry *dn;
  version_t dpv;
  MDSInternalContextBase *fin;
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};

void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  in->inode.version = dn->pre_dirty();

  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks.  hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}

void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  in->mark_dirty(in->inode.version + 1, mut->ls);

  if (in->inode.is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    assert(dir);
    dir->mark_dirty(1, mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}

struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention.  Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort();  // damaged should never return
    } else {
      cache->open_root();
    }
  }
};

void MDCache::open_root_inode(MDSInternalContextBase *c)
{
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    CInode *in;
    in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);  // initially inaccurate!
    in->fetch(c);
  } else {
    discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
  }
}

void MDCache::open_mydir_inode(MDSInternalContextBase *c)
{
  MDSGatherBuilder gather(g_ceph_context);

  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
  in->fetch(gather.new_sub());

  gather.set_finisher(c);
  gather.activate();
}

void MDCache::open_mydir_frag(MDSInternalContextBase *c)
{
  open_mydir_inode(
      new MDSInternalContextWrapper(mds,
        new FunctionContext([this, c](int r) {
            if (r < 0) {
              c->complete(r);
              return;
            }
            CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
            assert(mydir);
            adjust_subtree_auth(mydir, mds->get_nodeid());
            c->complete(0);
          })
        )
      );
}

void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}

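// populate_mydir: open (or create) this rank's stray directories under
// mydir, pin them, and tally stray dentries; re-entered via
// C_MDS_RetryOpenRoot until everything it needs is in cache.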
void MDCache::populate_mydir()
{
  assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it.  Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
                          new C_MDS_RetryOpenRoot(this));
      return;
    }
    assert(straydn);
    assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    list<frag_t> ls;
    strays[i]->dirfragtree.get_leaves(ls);
    for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
      frag_t fg = *p;
      CDir *dir = strays[i]->get_dirfrag(fg);
      if (!dir) {
        dir = strays[i]->get_or_open_dirfrag(this, fg);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
        dir->fetch(new C_MDS_RetryOpenRoot(this));
        return;
      }

      if (dir->get_frag_size() > 0)
        num_strays += dir->get_frag_size();
    }
  }

  stray_manager.set_num_strays(num_strays);

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  scan_stray_dir();
}

void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}

CDir *MDCache::get_stray_dir(CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CInode *strayi = get_stray();
  assert(strayi);
  frag_t fg = strayi->pick_dirfrag(straydname);
  CDir *straydir = strayi->get_dirfrag(fg);
  assert(straydir);
  return straydir;
}

CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
{
  CDir *straydir = get_stray_dir(in);
  string straydname;
  in->name_stray_dentry(straydname);
  CDentry *straydn = straydir->lookup(straydname);
  if (!straydn) {
    straydn = straydir->add_null_dentry(straydname);
    straydn->mark_new();
  } else {
    assert(straydn->get_projected_linkage()->is_null());
  }

  straydn->state_set(CDentry::STATE_STRAY);
  return straydn;
}

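// get_object: translate an MDSCacheObjectInfo (as carried by lock/cache
// messages) back into the inode, dirfrag, or dentry it names.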
MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info)
{
  // inode?
  if (info.ino)
    return get_inode(info.ino, info.snapid);

  // dir or dentry.
  CDir *dir = get_dirfrag(info.dirfrag);
  if (!dir) return 0;

  if (info.dname.length())
    return dir->lookup(info.dname, info.snapid);
  else
    return dir;
}

// ====================================================================
// subtree management

void MDCache::list_subtrees(list<CDir*>& ls)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    ls.push_back(p->first);
}

/*
 * adjust the dir_auth of a subtree.
 * merge with parent and/or child subtrees, if it is appropriate.
 * merge can ONLY happen if both parent and child have unambiguous auth.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << "  claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (adjust_pop && dir->is_auth()) {
      utime_t now = ceph_clock_now();
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }
  }
}

void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  // record my old bounds
  auto oldbounds = subtrees.at(dir);

  set<CInode*> to_eval;
  // try merge at my root
  try_subtree_merge_at(dir, &to_eval);

  // try merge at my old bounds
  for (auto bound : oldbounds)
    try_subtree_merge_at(bound, &to_eval);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}

class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
public:
  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->subtree_merge_writebehind_finish(in, mut);
  }
};

void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&                              // we have a parent,
      parent->dir_auth == dir->dir_auth) {          // auth matches,
    // merge with parent.
    dout(10) << "  subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (adjust_pop && dir->is_auth()) {
      utime_t now = ceph_clock_now();
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
        p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
        if (p->is_subtree_root()) break;
        cur = p;
        p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());
  }
}

void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->auth_unpin(this);
}

void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  //  (we should scatter the filelock on subtree bounds)
  assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}

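// adjust_bounded_subtree_auth: like adjust_subtree_auth(), but additionally
// forces the given set of bounds to be subtree roots, swallowing or
// splitting intervening subtrees so the bound set matches exactly.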
void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << "  claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p) {
    CDir *bound = *p;

    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        dout(10) << "  new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      } else {
        dout(10) << "  want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          dout(10) << "  new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s)
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << "  swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    } else {
      dout(10) << "  already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << "  swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}

/*
 * return a set of CDir*'s that correspond to the given bound set.  Only adjust
 * fragmentation as necessary to get an equivalent bounding set.  That is, only
 * split if one of our frags spans the provided bounding set.  Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
    byino[p->ino].insert(p->frag);
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
      frag_t fg = *q;
      list<frag_t> fgls;
      diri->dirfragtree.get_leaves_under(fg, fgls);
      if (fgls.empty()) {
        bool all = true;
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        list<frag_t> ls;
        tmpdft.get_leaves_under(approx_fg, ls);
        for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
          if (p->second.get().count(*r) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, *r);
            all = false;
          }
        }
        if (all)
          fgls.push_back(approx_fg);
        else
          diri->dirfragtree.get_leaves_under(fg, fgls);
      }

      dout(10) << " frag " << fg << " contains " << fgls << dendl;
      for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
        CDir *dir = diri->get_dirfrag(*r);
        if (dir)
          bounds.insert(dir);
      }
    }
  }
}

void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;

  set<CDir*> bounds;
  get_force_dirfrag_bound_set(bound_dfs, bounds);
  adjust_bounded_subtree_auth(dir, bounds, auth);
}

void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
{
  dout(10) << "map_dirfrag_set " << dfs << dendl;

  // group by inode
  map<inodeno_t, fragset_t> ino_fragset;
  for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
    ino_fragset[p->ino].insert(p->frag);

  // get frags
  for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
       p != ino_fragset.end();
       ++p) {
    CInode *in = get_inode(p->first);
    if (!in)
      continue;

    list<frag_t> fglist;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      in->dirfragtree.get_leaves_under(*q, fglist);

    dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
             << " on " << *in << dendl;

    for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
      CDir *dir = in->get_dirfrag(*q);
      if (dir)
        result.insert(dir);
    }
  }
}

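// walk up from a dirfrag to the root of the subtree that contains it.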
CDir *MDCache::get_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (!dir) return 0;
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_parent_dir();
  }
}

CDir *MDCache::get_projected_subtree_root(CDir *dir)
{
  // find the underlying dir that delegates (or is about to delegate) auth
  while (true) {
    if (!dir) return 0;
    if (dir->is_subtree_root())
      return dir;
    dir = dir->get_inode()->get_projected_parent_dir();
  }
}

void MDCache::remove_subtree(CDir *dir)
{
  dout(10) << "remove_subtree " << *dir << dendl;
  assert(subtrees.count(dir));
  assert(subtrees[dir].empty());
  subtrees.erase(dir);
  dir->put(CDir::PIN_SUBTREE);
  if (dir->get_parent_dir()) {
    CDir *p = get_subtree_root(dir->get_parent_dir());
    assert(subtrees[p].count(dir));
    subtrees[p].erase(dir);
  }
}

void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  assert(subtrees.count(dir));
  bounds = subtrees[dir];
}

void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
{
  if (subtrees.count(dir)) {
    // just copy them, dir is a subtree.
    get_subtree_bounds(dir, bounds);
  } else {
    // find them
    CDir *root = get_subtree_root(dir);
    for (set<CDir*>::iterator p = subtrees[root].begin();
         p != subtrees[root].end();
         ++p) {
      CDir *t = *p;
      while (t != root) {
        t = t->get_parent_dir();
        assert(t);
        if (t == dir) {
          bounds.insert(*p);
          continue;
        }
      }
    }
  }
}

void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
{
  // for debugging only.
  assert(subtrees.count(dir));
  if (bounds != subtrees[dir]) {
    dout(0) << "verify_subtree_bounds failed" << dendl;
    set<CDir*> b = bounds;
    for (auto &cd : subtrees[dir]) {
      if (bounds.count(cd)) {
        b.erase(cd);
        continue;
      }
      dout(0) << "  missing bound " << *cd << dendl;
    }
    for (const auto &cd : b)
      dout(0) << "    extra bound " << *cd << dendl;
  }
  assert(bounds == subtrees[dir]);
}

void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
{
  // for debugging only.
  assert(subtrees.count(dir));

  // make sure that any bounds i do have are properly noted as such.
  int failed = 0;
  for (const auto &fg : bounds) {
    CDir *bd = get_dirfrag(fg);
    if (!bd) continue;
    if (subtrees[dir].count(bd) == 0) {
      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
      failed++;
    }
  }
  assert(failed == 0);
}

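// project/adjust_subtree_after_rename bracket a rename: the projected
// (olddir, newdir) pairs recorded here are consumed, and sanity-checked,
// when the rename is actually applied below.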
void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
{
  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
           << " to " << *newdir << dendl;
  projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
}

void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  utime_t now = ceph_clock_now();

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    assert(p != projected_subtree_renames.end());
    assert(!p->second.empty());
    assert(p->second.front().first == olddir);
    assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust subtree
  list<CDir*> dfls;
  // make sure subtree dirfrags are at the front of the list
  diri->get_subtree_dirfrags(dfls);
  diri->get_nested_dirfrags(dfls);
  for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
    CDir *dir = *p;

    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(olddir, dir, now, false);

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
    } else if (dir->is_subtree_root()) {
      // children are fine.  change parent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      assert(subtrees[oldparent].count(dir));
      subtrees[oldparent].erase(dir);
      assert(subtrees.count(newparent));
      subtrees[newparent].insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL, false);
    } else {
      // see if any old bounds move to the new parent.
      list<CDir*> tomove;
      for (set<CDir*>::iterator p = subtrees[oldparent].begin();
           p != subtrees[oldparent].end();
           ++p) {
        CDir *bound = *p;
        CDir *broot = get_subtree_root(bound->get_parent_dir());
        if (broot != oldparent) {
          assert(broot == newparent);
          tomove.push_back(bound);
        }
      }
      for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
        CDir *bound = *p;
        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
        subtrees[oldparent].erase(bound);
        subtrees[newparent].insert(bound);
      }

      // did auth change?
      if (oldparent->authority() != newparent->authority()) {
        adjust_subtree_auth(dir, oldparent->authority(), false);
        // caller is responsible for 'eval diri'
        try_subtree_merge_at(dir, NULL, false);
      }
    }

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(newdir, dir, now, true);
  }
}

void MDCache::get_fullauth_subtrees(set<CDir*>& s)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_auth())
      s.insert(root);
  }
}

void MDCache::get_auth_subtrees(set<CDir*>& s)
{
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_auth())
      s.insert(root);
  }
}

int MDCache::num_subtrees()
{
  return subtrees.size();
}

int MDCache::num_subtrees_fullauth()
{
  int n = 0;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_auth())
      n++;
  }
  return n;
}

int MDCache::num_subtrees_fullnonauth()
{
  int n = 0;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *root = p->first;
    if (root->is_full_dir_nonauth())
      n++;
  }
  return n;
}

// ===================================
// journal and snap/cow helpers

/*
 * find first inode in cache that follows given snapid.  otherwise, return current.
 */
CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
{
  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
  assert(in->last == CEPH_NOSNAP);

  auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
  if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
    dout(10) << "pick_inode_snap found " << *p->second << dendl;
    in = p->second;
  }

  return in;
}

/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items.  instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply().  that
 * means a special case in the dir commit clean sweep assertions.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  assert(last >= in->first);

  CInode *oldin = new CInode(this, true, in->first, last);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->symlink = in->symlink;
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    CInode *head_in = get_inode(in->ino());
    assert(head_in);
    if (head_in->split_need_snapflush(oldin, in)) {
      oldin->client_snap_caps = in->client_snap_caps;
      for (const auto &p : in->client_snap_caps) {
        SimpleLock *lock = oldin->get_lock(p.first);
        assert(lock);
        for (const auto &q : p.second) {
          oldin->auth_pin(lock);
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          lock->get_wrlock(true);
          (void)q; /* unused */
        }
      }
    }
    return oldin;
  }

  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = p.second;
      int issued = cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
          cap->client_follows < last) {
        // note in oldin
        for (int i = 0; i < num_cinode_locks; i++) {
          if (issued & cinode_lock_info[i].wr_caps) {
            int lockid = cinode_lock_info[i].lock;
            SimpleLock *lock = oldin->get_lock(lockid);
            assert(lock);
            oldin->client_snap_caps[lockid].insert(client);
            oldin->auth_pin(lock);
            lock->set_state(LOCK_SNAP_SYNC);  // gathering
            lock->get_wrlock(true);
            dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
                     << " wrlock lock " << *lock << " on " << *oldin << dendl;
          }
        }
        cap->client_follows = last;

        // we need snapflushes for any intervening snaps
        dout(10) << "  snaps " << snaps << dendl;
        for (auto q = snaps.lower_bound(oldin->first);
             q != snaps.end() && *q <= last;
             ++q) {
          in->add_need_snapflush(oldin, *q, client);
        }
      } else {
        dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }
  }

  return oldin;
}

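// journal_cow_dentry: clone the dentry (and its primary inode, if any)
// covering snaps up to 'follows' into the journal, so the snapped version
// survives the coming update.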
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  assert(!dnl->is_null());

  if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
    // multiversion inode.
    CInode *in = dnl->get_inode();
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = realm->get_newest_snap();

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
                                                      oldfirst, dir_follows);
          olddn->pre_dirty();
          dout(10) << " olddn " << *olddn << dendl;
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here?  hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, false);
        }
      }

      if (in->snaprealm) {
        realm = in->snaprealm;
        follows = realm->get_newest_seq();
      } else
        follows = dir_follows;
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP)
        follows = realm->get_newest_seq();
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, false);
  } else {
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP)
      follows = realm->get_newest_seq();

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << "    dn " << *dn << dendl;
    if (in) {
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, oldin->last);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush)
        mut->ls->open_files.push_back(&oldin->item_open_file);
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
                                                  oldfirst, follows);
      olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}

void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
                                CInode *in, snapid_t follows,
                                CInode **pcow_inode)
{
  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
  CDentry *dn = in->get_projected_parent_dn();
  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
}

void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
{
  if (in->is_base()) {
    metablob->add_root(true, in, in->get_projected_inode());
  } else {
    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
      follows = in->first - 1;
    CDentry *dn = in->get_projected_parent_dn();
    if (!dn->get_projected_linkage()->is_null())  // no need to cow a null dentry
      journal_cow_dentry(mut, metablob, dn, follows);
    if (in->get_projected_inode()->is_backtrace_updated()) {
      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
                        in->get_previous_projected_inode()->layout.pool_id;
      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
    } else {
      metablob->add_primary_dentry(dn, in, true);
    }
  }
}

// nested ---------------------------------------------------------------

void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                          int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();
  CInode::mempool_inode *curi = cur->get_projected_inode();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
           << " " << *cur << dendl;
  dout(20) << "    frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME.  this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename).  but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the slave rename prep case. so we should
      // avoid updating the inode.
      assert(linkunlink < 0);
      assert(cur->is_frozen_inode());
      update = false;
    }
    _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
                                 linkunlink, update);
  }

  if (g_conf->mds_snap_rstat) {
    for (const auto &p : cur->dirty_old_rstats) {
      auto &old = cur->old_inodes[p];
      snapid_t ofirst = std::max(old.first, floor);
      auto it = snaps.lower_bound(ofirst);
      if (it == snaps.end() || *it > p)
        continue;
      _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}

void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
                                           CDir *parent, int linkunlink, bool update_inode)
{
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << "  inode           rstat " << inode.rstat << dendl;
  dout(20) << "  inode accounted_rstat " << inode.accounted_rstat << dendl;
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode.rstat);
    delta.sub(inode.accounted_rstat);
  } else if (linkunlink < 0) {
    delta.sub(inode.accounted_rstat);
  } else {
    delta.add(inode.rstat);
  }
  dout(20) << "                  delta " << delta << dendl;

  if (update_inode)
    inode.accounted_rstat = inode.rstat;

  while (last >= ofirst) {
    /*
     * pick fnode version to update.  at each iteration, we want to
     * pick a segment ending in 'last' to update.  split as necessary
     * to make that work.  then, adjust first up so that we only
     * update one segment at a time.  then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    fnode_t *pf = parent->get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      if (g_conf->mds_snap_rstat)
        first = MAX(ofirst, parent->first);
      else
        first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
          !(pf->rstat == pf->accounted_rstat)) {
        dout(10) << "  target snapped and not fully accounted, cow to dirty_old_rstat ["
                 << parent->first << "," << (first-1) << "] "
                 << " " << *prstat << "/" << pf->accounted_rstat
                 << dendl;
        parent->dirty_old_rstat[first-1].first = parent->first;
        parent->dirty_old_rstat[first-1].rstat = pf->rstat;
        parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
               << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      auto it = parent->dirty_old_rstat.lower_bound(last);
      if (it == parent->dirty_old_rstat.end()) {
        dout(20) << "  no dirty_old_rstat with last >= last " << last << dendl;
        if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
          dout(20) << "  last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
          first = parent->dirty_old_rstat.rbegin()->first+1;
        }
      } else {
        // *it last is >= last
        if (it->second.first <= last) {
          // *it intersects [first,last]
          if (it->second.first < first) {
            dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
            parent->dirty_old_rstat[first-1] = it->second;
            it->second.first = first;
          }
          if (it->second.first > first)
            first = it->second.first;
          if (last < it->first) {
            dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
            parent->dirty_old_rstat[last] = it->second;
            it->second.first = last+1;
          }
        } else {
          // *it is to the _right_ of [first,last]
          it = parent->dirty_old_rstat.lower_bound(first);
          // new *it last is >= first
          if (it->second.first <= last &&  // new *it isn't also to the right, and
              it->first >= first) {        // it intersects our first bit,
            dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
            first = it->first+1;
          }
          dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
        }
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply
    dout(20) << "  project to [" << first << "," << last << "] " << *prstat << dendl;
    assert(last >= first);
    prstat->add(delta);
    if (update_inode)
      inode.accounted_rstat = inode.rstat;
    dout(20) << "      result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    last = first-1;
  }
}

void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                          snapid_t ofirst, snapid_t last,
                                          CInode *pin, bool cow_head)
{
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << "  frag           rstat " << rstat << dendl;
  dout(20) << "  frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << "                 delta " << delta << dendl;

  while (last >= ofirst) {
    CInode::mempool_inode *pi;
    snapid_t first;
    if (last == pin->last) {
      pi = pin->get_projected_inode();
      first = MAX(ofirst, pin->first);
      if (first > pin->first) {
        auto &old = pin->cow_old_inode(first-1, cow_head);
        dout(20) << "   cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (last >= pin->first) {
        first = pin->first;
        pin->cow_old_inode(last, cow_head);
      } else {
        // our life is easier here because old_inodes is not sparse
        // (although it may not begin at snapid 1)
        auto it = pin->old_inodes.lower_bound(last);
        if (it == pin->old_inodes.end()) {
          dout(10) << " no old_inode <= " << last << ", done." << dendl;
          break;
        }
        first = it->second.first;
        if (first > last) {
          dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
          //assert(p == pin->old_inodes.begin());
          break;
        }
        if (it->first > last) {
          dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
                   << (last+1) << "," << it->first << "]" << dendl;
          pin->old_inodes[last] = it->second;
          it->second.first = last+1;
          pin->dirty_old_rstats.insert(it->first);
        }
      }
      if (first < ofirst) {
        dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
                 << first << "," << ofirst-1 << "]" << dendl;
        pin->old_inodes[ofirst-1] = pin->old_inodes[last];
        pin->dirty_old_rstats.insert(ofirst-1);
        pin->old_inodes[last].first = first = ofirst;
      }
      pi = &pin->old_inodes[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << "        result [" << first << "," << last << "] " << pi->rstat << dendl;

    last = first-1;
  }
}

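// broadcast_quota_to_client: push quota/rstat updates to cap-holding
// clients, but only when usage has moved far enough toward a limit to
// matter, so clients aren't spammed on every stat change.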
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct)
{
  if (!in->is_auth() || in->is_frozen())
    return;

  auto i = in->get_projected_inode();

  if (!i->quota.is_enable())
    return;

  for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
       it != in->client_caps.end();
       ++it) {
    Session *session = mds->get_session(it->first);
    if (!session || !session->connection ||
        !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
      continue;

    Capability *cap = it->second;

    if (exclude_ct >= 0 && exclude_ct != it->first)
      goto update;

    if (cap->last_rbytes == i->rstat.rbytes &&
        cap->last_rsize == i->rstat.rsize())
      continue;

    if (i->quota.max_files > 0) {
      if (i->rstat.rsize() >= i->quota.max_files)
        goto update;

      if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
          abs(cap->last_rsize - i->rstat.rsize()))
        goto update;
    }

    if (i->quota.max_bytes > 0) {
      if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
        goto update;

      if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
          abs(cap->last_rbytes - i->rstat.rbytes))
        goto update;
    }

    continue;

update:
    cap->last_rsize = i->rstat.rsize();
    cap->last_rbytes = i->rstat.rbytes;

    MClientQuota *msg = new MClientQuota();
    msg->ino = in->ino();
    msg->rstat = i->rstat;
    msg->quota = i->quota;
    mds->send_message_client_counted(msg, session->connection);
  }
  for (const auto &it : in->get_replicas()) {
    MGatherCaps *msg = new MGatherCaps;
    msg->ino = in->ino();
    mds->send_message_mds(msg, it.first);
  }
}

/*
 * NOTE: we _have_ to delay the scatter if we are called during a
 * rejoin, because we can't twiddle locks between when the
 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
 * normally, this isn't a problem: a recover mds doesn't twiddle locks
 * (no requests), and a survivor acks immediately.  _except_ that
 * during rejoin_(weak|strong) processing, we may complete a lock
 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
 * scatterlock state in that case or the lock states will get out of
 * sync between the auth and replica.
 *
 * the simple solution is to never do the scatter here.  instead, put
 * the scatterlock on a list if it isn't already wrlockable.  this is
 * probably the best plan anyway, since we avoid too many
 * scatters/locks under normal usage.
 */
/*
 * some notes on dirlock/nestlock scatterlock semantics:
 *
 * the fragstat (dirlock) will never be updated without
 * dirlock+nestlock wrlock held by the caller.
 *
 * the rstat (nestlock) _may_ get updated without a wrlock when nested
 * data is pushed up the tree.  this could be changed with some
 * restructuring here, but in its current form we ensure that the
 * fragstat+rstat _always_ reflect an accurate summation over the dir
 * frag, which is nice.  and, we only need to track frags that need to
 * be nudged (and not inodes with pending rstat changes that need to
 * be pushed into the frag).  a consequence of this is that the
 * accounted_rstat on scatterlock sync may not match our current
 * rstat.  this is normal and expected.
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
				       CInode *in, CDir *parent,
				       int flags, int linkunlink,
				       snapid_t cfollows)
{
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;

  assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now());

  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
	   << (do_parent_mtime ? " do_parent_mtime":"")
	   << " linkunlink=" << linkunlink
	   << (primary_dn ? " primary_dn":" remote_dn")
	   << (shallow ? " SHALLOW":"")
	   << " follows " << cfollows
	   << " " << *in << dendl;

  if (!parent) {
    assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;
  while (parent) {
    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
    assert(parent->is_auth());

    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);
    mut->add_projected_fnode(parent);

    fnode_t *pf = parent->project_fnode();
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      assert(mut->wrlocks.count(&pin->filelock));
      assert(mut->wrlocks.count(&pin->nestlock));
      assert(cfollows == CEPH_NOSNAP);

      // update stale fragstat/rstat?
      parent->resync_accounted_fragstat();
      parent->resync_accounted_rstat();

      if (do_parent_mtime) {
	pf->fragstat.mtime = mut->get_op_stamp();
	pf->fragstat.change_attr++;
	dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
	if (pf->fragstat.mtime > pf->rstat.rctime) {
	  dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
	  pf->rstat.rctime = pf->fragstat.mtime;
	} else {
	  dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
	}
      }
      if (linkunlink) {
	dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
	if (in->is_dir()) {
	  pf->fragstat.nsubdirs += linkunlink;
	  //pf->rstat.rsubdirs += linkunlink;
	} else {
	  pf->fragstat.nfiles += linkunlink;
	  //pf->rstat.rfiles += linkunlink;
	}
      }
    }

    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
				pin->versionlock.can_wrlock())) {
      dout(20) << " unwritable parent nestlock " << pin->nestlock
	       << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
    } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
      if (linkunlink) {
	assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
      }

      if (mut->wrlocks.count(&pin->nestlock) == 0) {
	dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
	mds->locker->wrlock_force(&pin->nestlock, mut);
      }

      // now we can project the inode rstat diff the dirfrag
      SnapRealm *prealm = pin->find_snaprealm();

      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
	follows = prealm->get_newest_seq();

      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?
    if (!stop && !first &&
	g_conf->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf->mds_dirstat_min_interval) {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop
		 << " < " << g_conf->mds_dirstat_min_interval
		 << ", stopping" << dendl;
	stop = true;
      } else {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    MDRequestRef mdmut = static_cast<MDRequestImpl*>(mut.get());
    if (!stop &&
	mut->wrlocks.count(&pin->nestlock) == 0 &&
	(!pin->versionlock.can_wrlock() ||                   // make sure we can take versionlock, too
	 !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
	 )) {  // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
	       << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      dout(10) << "predirty_journal_parents stop.  marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
	mds->locker->mark_updated_scatterlock(&pin->filelock);
	mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
	mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->wrlocks.count(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    assert(mut->wrlocks.count(&pin->nestlock) ||
	   mut->is_slave());

    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    mut->add_projected_inode(pin);
    lsi.push_front(pin);

    pin->pre_cow_old_inode(); // avoid cow mayhem!

    auto &pi = pin->project_inode();
    pi.inode.version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents         - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false, touched_chattr = false;
      pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
	pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
      if (touched_chattr)
	pi.inode.change_attr = pi.inode.dirstat.change_attr;
      dout(20) << "predirty_journal_parents     gives " << pi.inode.dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
	if (pi.inode.dirstat.size() < 0)
	  assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
	if (pi.inode.dirstat.size() != pf->fragstat.size()) {
	  mds->clog->error() << "unmatched fragstat size on single dirfrag "
	     << parent->dirfrag() << ", inode has " << pi.inode.dirstat
	     << ", dirfrag has " << pf->fragstat;

	  // trust the dirfrag for now
	  pi.inode.dirstat = pf->fragstat;

	  assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
	}
      }
    }

    /*
     * the rule here is to follow the _oldest_ parent with dirty rstat
     * data.  if we don't propagate all data, we add ourselves to the
     * nudge list.  that way all rstat data will (eventually) get
     * pushed up the tree.
     *
     * actually, no.  for now, silently drop rstats for old parents.  we need
     * hard link backpointers to do the above properly.
     */

    // stop?
    if (pin->is_base())
      break;
    parentdn = pin->get_projected_parent_dn();
    assert(parentdn);

    // rstat
    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

    // first, if the frag is stale, bring it back in sync.
    parent->resync_accounted_rstat();

    if (g_conf->mds_snap_rstat) {
      for (auto &p : parent->dirty_old_rstat) {
	project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
				    p.first, pin, true);
      }
    }
    parent->dirty_old_rstat.clear();
    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

    pf->accounted_rstat = pf->rstat;

    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
      if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
	mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
	  << parent->dirfrag() << ", inode has " << pi.inode.rstat
	  << ", dirfrag has " << pf->rstat;

	// trust the dirfrag for now
	pi.inode.rstat = pf->rstat;

	assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);
    // next parent!
    cur = pin;
    parent = parentdn->get_dir();
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  for (list<CInode*>::iterator p = lsi.begin();
       p != lsi.end();
       ++p) {
    CInode *cur = *p;
    journal_dirty_inode(mut.get(), blob, cur);
  }
}
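
/*
 * A typical caller shape (illustrative sketch only; the argument values
 * and names here are hypothetical): a Server op that links a new dentry
 * into a directory would project its parents before journaling, e.g.
 *
 *   predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
 *                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
 *
 * PREDIRTY_DIR updates the parent fragstat mtime/size, linkunlink=+1
 * accounts the new link, and the projected inodes are then journaled
 * via journal_dirty_inode() as in the loop above.
 */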
// ===================================
// slave requests

/*
 * some handlers for master requests with slaves.  we need to make
 * sure slaves journal commits before we forget we mastered them and
 * remove them from the uncommitted_masters map (used during recovery
 * to commit|abort slaves).
 */
struct C_MDC_CommittedMaster : public MDCacheLogContext {
  metareqid_t reqid;
  C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_master_commit(reqid);
  }
};
void MDCache::log_master_commit(metareqid_t reqid)
{
  dout(10) << "log_master_commit " << reqid << dendl;
  uncommitted_masters[reqid].committing = true;
  mds->mdlog->start_submit_entry(new ECommitted(reqid),
				 new C_MDC_CommittedMaster(this, reqid));
}
void MDCache::_logged_master_commit(metareqid_t reqid)
{
  dout(10) << "_logged_master_commit " << reqid << dendl;
  assert(uncommitted_masters.count(reqid));
  uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
  mds->queue_waiters(uncommitted_masters[reqid].waiters);
  uncommitted_masters.erase(reqid);
}
void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
{
  dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
  assert(uncommitted_masters.count(r));
  uncommitted_masters[r].slaves.erase(from);
  if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
    log_master_commit(r);
}
void MDCache::logged_master_update(metareqid_t reqid)
{
  dout(10) << "logged_master_update " << reqid << dendl;
  assert(uncommitted_masters.count(reqid));
  uncommitted_masters[reqid].safe = true;
  if (pending_masters.count(reqid)) {
    pending_masters.erase(reqid);
    if (pending_masters.empty())
      process_delayed_resolve();
  }
}
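
/*
 * Lifecycle summary (as implemented above): a mastered request stays in
 * uncommitted_masters until every slave has acked.  committed_master_slave()
 * erases each slave as its OP_COMMITTED arrives; once the set is empty,
 * log_master_commit() journals ECommitted and _logged_master_commit()
 * finally drops the entry and wakes any waiters.
 */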
/*
 * Master may crash after receiving all slaves' commit acks, but before journalling
 * the final commit.  Slaves may crash after journalling the slave commit, but before
 * sending commit ack to the master.  Commit masters with no uncommitted slave when
 * resolve finishes.
 */
void MDCache::finish_committed_masters()
{
  for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
       p != uncommitted_masters.end();
       ++p) {
    p->second.recovering = false;
    if (!p->second.committing && p->second.slaves.empty()) {
      dout(10) << "finish_committed_masters " << p->first << dendl;
      log_master_commit(p->first);
    }
  }
}
/*
 * at end of resolve... we must journal a commit|abort for all slave
 * updates, before moving on.
 *
 * this is so that the master can safely journal ECommitted on ops it
 * masters when it reaches up:active (all other recovering nodes must
 * complete resolve before that happens).
 */
struct C_MDC_SlaveCommit : public MDCacheLogContext {
  mds_rank_t from;
  metareqid_t reqid;
  C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_slave_commit(from, reqid);
  }
};
void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
{
  dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;

  // send a message
  MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
  mds->send_message_mds(req, from);
}
// ====================================================================
// import map, recovery

void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
				      map<dirfrag_t,vector<dirfrag_t> >& subtrees)
{
  if (subtrees.count(oldparent)) {
    vector<dirfrag_t>& v = subtrees[oldparent];
    dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
    for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
      if (*it == df) {
	v.erase(it);
	break;
      }
  }
  if (subtrees.count(newparent)) {
    vector<dirfrag_t>& v = subtrees[newparent];
    dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
    v.push_back(df);
  }
}
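
/*
 * Example (hypothetical dirfrags): if subtrees = { A: [X], B: [] } and a
 * projected rename moves bound X from parent A to parent B, then
 * _move_subtree_map_bound(X, A, B, subtrees) leaves { A: [], B: [X] }.
 * A missing oldparent or newparent entry is simply skipped.
 */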
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
	   << num_subtrees_fullauth() << " fullauth"
	   << dendl;

  show_subtrees();

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  map<dirfrag_t, CDir*> dirs_to_add;

  if (myin) {
    CDir* mydir = myin->get_dirfrag(frag_t());
    dirs_to_add[mydir->dirfrag()] = mydir;
  }

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // journal subtree as "ours" if we are
    //   me, -2
    //   me, me
    //   me, !me (may be importing and ambiguous!)

    // so not
    //   !me, *
    if (dir->get_dir_auth().first != mds->get_nodeid())
      continue;

    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
	my_ambiguous_imports.count(dir->dirfrag())) {
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    } else {
      dout(15) << " subtree " << *dir << dendl;
    }

    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();

    // bounds
    for (set<CDir*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CDir *bound = *q;
      dout(15) << " subtree bound " << *bound << dendl;
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames
  for (map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.begin();
       p != projected_subtree_renames.end();
       ++p) {
    for (list<pair<CDir*,CDir*> >::iterator q = p->second.begin(); q != p->second.end(); ++q) {
      CInode *diri = p->first;
      CDir *olddir = q->first;
      CDir *newdir = q->second;
      dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;

      list<CDir*> dfls;
      diri->get_dirfrags(dfls);
      for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
	CDir *dir = *p;
	dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
	CDir *oldparent = get_projected_subtree_root(olddir);
	dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
	CDir *newparent = get_projected_subtree_root(newdir);
	dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

	if (oldparent == newparent) {
	  dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
		   << oldparent->dirfrag() << dendl;
	  continue;
	}

	if (dir->is_subtree_root()) {
	  if (le->subtrees.count(newparent->dirfrag()) &&
	      oldparent->get_dir_auth() != newparent->get_dir_auth())
	    dirs_to_add[dir->dirfrag()] = dir;
	  // children are fine.  change parent.
	  _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				  le->subtrees);
	} else {
	  // mid-subtree.

	  if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
	    dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
	    // if oldparent is auth, subtree is mine; include it.
	    if (le->subtrees.count(oldparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[dir->dirfrag()].clear();
	    }
	    // if newparent is auth, subtree is a new bound
	    if (le->subtrees.count(newparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag());  // newparent is auth; new bound
	    }
	    newparent = dir;
	  }

	  // see if any old bounds move to the new parent.
	  for (set<CDir*>::iterator p = subtrees[oldparent].begin();
	       p != subtrees[oldparent].end();
	       ++p) {
	    CDir *bound = *p;
	    if (dir->contains(bound->get_parent_dir()))
	      _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				      le->subtrees);
	  }
	}
      }
    }
  }

  // simplify the journaled map.  our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing.  but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
    if (le->ambiguous_subtrees.count(p->first))
      continue;
    unsigned i = 0;
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      if (le->subtrees.count(b) &&
	  le->ambiguous_subtrees.count(b) == 0) {
	vector<dirfrag_t>& bb = le->subtrees[b];
	dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
	for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
	  p->second.push_back(*r);
	dirs_to_add.erase(b);
	le->subtrees.erase(b);
	p->second.erase(p->second.begin() + i);
      } else {
	++i;
      }
    }
  }

  for (auto &p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  return le;
}
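
/*
 * Simplification example (hypothetical dirfrags): if the journaled map
 * contains subtree A with bound B, and B is itself an unambiguous
 * journaled subtree with bounds [C], then B is swallowed into A: A's
 * bounds become [C] and B's entry is dropped.  On replay this matches
 * the "live" map, which never contains such transient mid-migration
 * subtrees.
 */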
void MDCache::dump_resolve_status(Formatter *f) const
{
  f->open_object_section("resolve_status");
  f->dump_stream("resolve_gather") << resolve_gather;
  f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
  f->close_section();
}
void MDCache::resolve_start(MDSInternalContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN.  during
    // resolve we want mds0 to explicit claim the portion of it that
    // it owns, so that anything beyond its bounds get left as
    // unknown.
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (rootdir)
      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  resolve_gather = recovery_set;
}
void MDCache::send_resolves()
{
  send_slave_resolves();
  if (!resolve_ack_gather.empty()) {
    dout(10) << "send_resolves still waiting for resolve ack from ("
	     << resolve_ack_gather << ")" << dendl;
    return;
  }
  if (!need_resolve_rollback.empty()) {
    dout(10) << "send_resolves still waiting for rollback to commit on ("
	     << need_resolve_rollback << ")" << dendl;
    return;
  }
  send_subtree_resolves();
}
void MDCache::send_slave_resolves()
{
  dout(10) << "send_slave_resolves" << dendl;

  map<mds_rank_t, MMDSResolve*> resolves;

  if (mds->is_resolve()) {
    for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
	 p != uncommitted_slave_updates.end();
	 ++p) {
      resolves[p->first] = new MMDSResolve;
      for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
	   q != p->second.end();
	   ++q) {
	dout(10) << " including uncommitted " << q->first << dendl;
	resolves[p->first]->add_slave_request(q->first, false);
      }
    }
  } else {
    set<mds_rank_t> resolve_set;
    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 ++p) {
      MDRequestRef& mdr = p->second;
      if (!mdr->is_slave())
	continue;
      if (!mdr->slave_did_prepare() && !mdr->committing) {
	continue;
      }
      mds_rank_t master = mdr->slave_to_mds;
      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
	dout(10) << " including uncommitted " << *mdr << dendl;
	if (!resolves.count(master))
	  resolves[master] = new MMDSResolve;
	if (!mdr->committing &&
	    mdr->has_more() && mdr->more()->is_inode_exporter) {
	  // re-send cap exports
	  CInode *in = mdr->more()->rename_inode;
	  map<client_t, Capability::Export> cap_map;
	  in->export_client_caps(cap_map);
	  bufferlist bl;
	  ::encode(in->ino(), bl);
	  ::encode(cap_map, bl);
	  resolves[master]->add_slave_request(p->first, bl);
	} else {
	  resolves[master]->add_slave_request(p->first, mdr->committing);
	}
      }
    }
  }

  for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
       p != resolves.end();
       ++p) {
    dout(10) << "sending slave resolve to mds." << p->first << dendl;
    mds->send_message_mds(p->second, p->first);
    resolve_ack_gather.insert(p->first);
  }
}
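
/*
 * Note the ordering contract with send_resolves() above: each peer we
 * send a slave resolve to is added to resolve_ack_gather, and the
 * subtree resolves below are deferred until that set (and
 * need_resolve_rollback) has drained via handle_resolve_ack() and
 * finish_rollback().
 */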
void MDCache::send_subtree_resolves()
{
  dout(10) << "send_subtree_resolves" << dendl;

  if (migrator->is_exporting() || migrator->is_importing()) {
    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
    migrator->show_importing();
    migrator->show_exporting();
    resolves_pending = true;
    return;  // not now
  }

  map<mds_rank_t, MMDSResolve*> resolves;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (*p == mds->get_nodeid())
      continue;
    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
      resolves[*p] = new MMDSResolve;
  }

  map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
  map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;

  // known
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // only our subtrees
    if (dir->authority().first != mds->get_nodeid())
      continue;

    if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
      continue;  // we'll add it below

    if (migrator->is_ambiguous_import(dir->dirfrag())) {
      // ambiguous (mid-import)
      set<CDir*> bounds;
      get_subtree_bounds(dir, bounds);
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
	dfls.push_back((*q)->dirfrag());

      my_ambig_imports[dir->dirfrag()] = dfls;
      dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;

    } else {
      // not ambiguous.
      for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
	   q != resolves.end();
	   ++q)
	resolves[q->first]->add_subtree(dir->dirfrag());
      // bounds too
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = subtrees[dir].begin();
	   q != subtrees[dir].end();
	   ++q) {
	CDir *bound = *q;
	dfls.push_back(bound->dirfrag());
      }

      my_subtrees[dir->dirfrag()] = dfls;
      dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
    }
  }

  // ambiguous
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
       p != my_ambiguous_imports.end();
       ++p) {
    my_ambig_imports[p->first] = p->second;
    dout(10) << " ambig " << p->first << " " << p->second << dendl;
  }

  // simplify the claimed subtree.
  for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
    unsigned i = 0;
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      if (my_subtrees.count(b)) {
	vector<dirfrag_t>& bb = my_subtrees[b];
	dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
	for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
	  p->second.push_back(*r);
	my_subtrees.erase(b);
	p->second.erase(p->second.begin() + i);
      } else {
	++i;
      }
    }
  }

  // send
  for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
       p != resolves.end();
       ++p) {
    MMDSResolve* m = p->second;
    m->subtrees = my_subtrees;
    m->ambiguous_imports = my_ambig_imports;
    dout(10) << "sending subtree resolve to mds." << p->first << dendl;
    mds->send_message_mds(m, p->first);
  }
  resolves_pending = false;
}
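
/*
 * handle_mds_failure - adjust the resolve/rejoin gather sets for a
 * newly failed rank, then walk active_requests to abort, retry, or
 * mark ambiguous any master/slave requests that involved it.  Also
 * cancels any dirfrag fragmenting that had not yet started.
 */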
void MDCache::handle_mds_failure(mds_rank_t who)
{
  dout(7) << "handle_mds_failure mds." << who << dendl;

  dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;

  resolve_gather.insert(who);
  discard_delayed_resolve(who);
  ambiguous_slave_updates.erase(who);

  rejoin_gather.insert(who);
  rejoin_sent.erase(who);        // i need to send another
  rejoin_ack_sent.erase(who);    // i need to send another
  rejoin_ack_gather.erase(who);  // i'll need/get another.

  dout(10) << " resolve_gather " << resolve_gather << dendl;
  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
  dout(10) << " rejoin_sent " << rejoin_sent << dendl;
  dout(10) << " rejoin_gather " << rejoin_gather << dendl;
  dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;

  // tell the migrator too.
  migrator->handle_mds_failure_or_stop(who);

  // tell the balancer too.
  mds->balancer->handle_mds_failure(who);

  // clean up any requests slave to/from this node
  list<MDRequestRef> finish;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       ++p) {
    MDRequestRef& mdr = p->second;
    // slave to the failed node?
    if (mdr->slave_to_mds == who) {
      if (mdr->slave_did_prepare()) {
	dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
	if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
	  remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);

	if (!mdr->more()->waiting_on_slave.empty()) {
	  assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
	  // will rollback, no need to wait
	  if (mdr->slave_request) {
	    mdr->slave_request->put();
	    mdr->slave_request = 0;
	  }
	  mdr->more()->waiting_on_slave.clear();
	}
      } else if (!mdr->committing) {
	dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
	if (mdr->slave_request || mdr->slave_rolling_back())
	  mdr->aborted = true;
	else
	  finish.push_back(mdr);
      }
    }

    if (mdr->is_slave() && mdr->slave_did_prepare()) {
      if (mdr->more()->waiting_on_slave.count(who)) {
	assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
	dout(10) << " slave request " << *mdr << " no longer need rename notify ack from mds."
		 << who << dendl;
	mdr->more()->waiting_on_slave.erase(who);
	if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
	  mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->more()->srcdn_auth_mds == who &&
	  mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
	// rename srcdn's auth mds failed, resolve even I'm a survivor.
	dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
	add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
      }
    } else if (mdr->slave_request) {
      MMDSSlaveRequest *slave_req = mdr->slave_request;
      // FIXME: Slave rename request can arrive after we notice mds failure.
      // This can cause mds to crash (does not affect integrity of FS).
      if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
	  slave_req->srcdn_auth == who)
	slave_req->mark_interrupted();
    }

    // failed node is slave?
    if (mdr->is_master() && !mdr->committing) {
      if (mdr->more()->srcdn_auth_mds == who) {
	dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
		 << who << " to recover" << dendl;
	assert(mdr->more()->witnessed.count(who) == 0);
	if (mdr->more()->is_ambiguous_auth)
	  mdr->clear_ambiguous_auth();
	// rename srcdn's auth mds failed, all witnesses will rollback
	mdr->more()->witnessed.clear();
	pending_masters.erase(p->first);
      }

      if (mdr->more()->witnessed.count(who)) {
	mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
	if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
	  dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
		   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
	  // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
	  // until either the request is committing or the slave also fails.
	  assert(mdr->more()->waiting_on_slave.size() == 1);
	  pending_masters.insert(p->first);
	} else {
	  dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
		   << who << " to recover" << dendl;
	  if (srcdn_auth >= 0)
	    assert(mdr->more()->witnessed.count(srcdn_auth) == 0);

	  // discard this peer's prepare (if any)
	  mdr->more()->witnessed.erase(who);
	}
      }

      if (mdr->more()->waiting_on_slave.count(who)) {
	dout(10) << " master request " << *mdr << " waiting for slave mds." << who
		 << " to recover" << dendl;
	// retry request when peer recovers
	mdr->more()->waiting_on_slave.erase(who);
	if (mdr->more()->waiting_on_slave.empty())
	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->locking && mdr->locking_target_mds == who)
	mdr->finish_locking(mdr->locking);
    }
  }

  for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
       p != uncommitted_masters.end();
       ++p) {
    // The failed MDS may have already committed the slave update
    if (p->second.slaves.count(who)) {
      p->second.recovering = true;
      p->second.slaves.erase(who);
    }
  }

  while (!finish.empty()) {
    dout(10) << "cleaning up slave request " << *finish.front() << dendl;
    request_finish(finish.front());
    finish.pop_front();
  }

  kick_find_ino_peers(who);
  kick_open_ino_peers(who);

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;
    ++p;
    if (info.is_fragmenting())
      continue;
    dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
    list<CDir*> dirs;
    info.dirs.swap(dirs);
    fragments.erase(df);
    fragment_unmark_unfreeze_dirs(dirs);
  }

  // MDCache::shutdown_export_strays() always exports strays to mds.0
  if (who == mds_rank_t(0))
    shutdown_exported_strays.clear();

  show_subtrees();
}
/*
 * handle_mds_recovery - called on another node's transition
 * from resolve -> active.
 */
void MDCache::handle_mds_recovery(mds_rank_t who)
{
  dout(7) << "handle_mds_recovery mds." << who << dendl;

  // exclude all discover waiters. kick_discovers() will do the job
  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;

  list<MDSInternalContextBase*> waiters;

  // wake up any waiters in their subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    if (dir->authority().first != who ||
	dir->authority().second == mds->get_nodeid())
      continue;
    assert(!dir->is_auth());

    // wake any waiters
    list<CDir*> q;
    q.push_back(dir);

    while (!q.empty()) {
      CDir *d = q.front();
      q.pop_front();
      d->take_waiting(d_mask, waiters);

      // inode waiters too
      for (auto &p : d->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();
	if (dnl->is_primary()) {
	  dnl->get_inode()->take_waiting(i_mask, waiters);

	  // recurse?
	  list<CDir*> ls;
	  dnl->get_inode()->get_dirfrags(ls);
	  for (list<CDir*>::iterator p = ls.begin();
	       p != ls.end();
	       ++p) {
	    CDir *subdir = *p;
	    if (!subdir->is_subtree_root())
	      q.push_back(subdir);
	  }
	}
      }
    }
  }

  kick_open_ino_peers(who);
  kick_find_ino_peers(who);

  // queue them up.
  mds->queue_waiters(waiters);
}
void MDCache::set_recovery_set(set<mds_rank_t>& s)
{
  dout(7) << "set_recovery_set " << s << dendl;
  recovery_set = s;
}
/*
 * during resolve state, we share resolves to determine who
 * is authoritative for which trees.  we expect to get a resolve
 * from _everyone_ in the recovery_set (the mds cluster at the time of
 * the first failure).
 *
 * This function puts (releases) the passed message before returning
 */
void MDCache::handle_resolve(MMDSResolve *m)
{
  dout(7) << "handle_resolve from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
      return;
    }
    // wait until we reach the resolve stage!
    m->put();
    return;
  }

  discard_delayed_resolve(from);

  // ambiguous slave requests?
  if (!m->slave_requests.empty()) {
    if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
      for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
	if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
	  assert(!p->second.committing);
	  pending_masters.insert(p->first);
	}
      }

      if (!pending_masters.empty()) {
	dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
	delayed_resolve[from] = m;
	return;
      }
    }

    MMDSResolveAck *ack = new MMDSResolveAck;
    for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
      if (uncommitted_masters.count(p->first)) {  //mds->sessionmap.have_completed_request(p->first)) {
	// COMMIT
	if (p->second.committing) {
	  // already committing, waiting for the OP_COMMITTED slave reply
	  dout(10) << " already committing slave request " << *p << " noop "<< dendl;
	} else {
	  dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl;
	  ack->add_commit(p->first);
	}
	uncommitted_masters[p->first].slaves.insert(from);   // wait for slave OP_COMMITTED before we log ECommitted

	if (p->second.inode_caps.length() > 0) {
	  // slave wants to export caps (rename)
	  assert(mds->is_resolve());

	  inodeno_t ino;
	  map<client_t,Capability::Export> cap_exports;
	  bufferlist::iterator q = p->second.inode_caps.begin();
	  ::decode(ino, q);
	  ::decode(cap_exports, q);

	  assert(get_inode(ino));

	  for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
	       q != cap_exports.end();
	       ++q) {
	    Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
	    im.cap_id = ++last_cap_id; // assign a new cap ID
	    im.issue_seq = 1;
	    im.mseq = q->second.mseq;

	    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
	    if (session)
	      rejoin_client_map.emplace(q->first, session->info.inst);
	  }

	  // will process these caps in rejoin stage
	  rejoin_slave_exports[ino].first = from;
	  rejoin_slave_exports[ino].second.swap(cap_exports);

	  // send information of imported caps back to slave
	  ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]);
	}
      } else {
	// ABORT
	dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl;
	assert(!p->second.committing);
	ack->add_abort(p->first);
      }
    }
    mds->send_message(ack, m->get_connection());
    m->put();
    return;
  }

  if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
    dout(10) << "delay processing subtree resolve" << dendl;
    delayed_resolve[from] = m;
    return;
  }

  bool survivor = false;
  // am i a surviving ambiguous importer?
  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    // check for any import success/failure (from this node)
    map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
    while (p != my_ambiguous_imports.end()) {
      map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
      ++next;
      CDir *dir = get_dirfrag(p->first);
      assert(dir);
      dout(10) << "checking ambiguous import " << *dir << dendl;
      if (migrator->is_importing(dir->dirfrag()) &&
	  migrator->get_import_peer(dir->dirfrag()) == from) {
	assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);

	// check if sender claims the subtree
	bool claimed_by_sender = false;
	for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = m->subtrees.begin();
	     q != m->subtrees.end();
	     ++q) {
	  // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
	  CDir *base = get_force_dirfrag(q->first, false);
	  if (!base || !base->contains(dir))
	    continue;  // base not dir or an ancestor of dir, clearly doesn't claim dir.

	  bool inside = true;
	  set<CDir*> bounds;
	  get_force_dirfrag_bound_set(q->second, bounds);
	  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
	    CDir *bound = *p;
	    if (bound->contains(dir)) {
	      inside = false;  // nope, bound is dir or parent of dir, not inside.
	      break;
	    }
	  }
	  if (inside)
	    claimed_by_sender = true;
	  break;
	}

	my_ambiguous_imports.erase(p);  // no longer ambiguous.
	if (claimed_by_sender) {
	  dout(7) << "ambiguous import failed on " << *dir << dendl;
	  migrator->import_reverse(dir);
	} else {
	  dout(7) << "ambiguous import succeeded on " << *dir << dendl;
	  migrator->import_finish(dir, true);
	}
      }
      p = next;
    }
  }

  // update my dir_auth values
  //   need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
  //   migrations between other nodes)
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
       pi != m->subtrees.end();
       ++pi) {
    dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
    CDir *dir = get_force_dirfrag(pi->first, !survivor);
    if (!dir)
      continue;
    adjust_bounded_subtree_auth(dir, pi->second, from);
    try_subtree_merge(dir);
  }

  show_subtrees();

  // note ambiguous imports too
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
       pi != m->ambiguous_imports.end();
       ++pi) {
    dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
    other_ambiguous_imports[from][pi->first].swap( pi->second );
  }

  // did i get them all?
  resolve_gather.erase(from);

  maybe_resolve_finish();

  m->put();
}
void MDCache::process_delayed_resolve()
{
  dout(10) << "process_delayed_resolve" << dendl;
  map<mds_rank_t, MMDSResolve*> tmp;
  tmp.swap(delayed_resolve);
  for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
    handle_resolve(p->second);
}
void MDCache::discard_delayed_resolve(mds_rank_t who)
{
  if (delayed_resolve.count(who)) {
    delayed_resolve[who]->put();
    delayed_resolve.erase(who);
  }
}
void MDCache::maybe_resolve_finish()
{
  assert(resolve_ack_gather.empty());
  assert(need_resolve_rollback.empty());

  if (!resolve_gather.empty()) {
    dout(10) << "maybe_resolve_finish still waiting for resolves ("
	     << resolve_gather << ")" << dendl;
    return;
  }

  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
  disambiguate_my_imports();
  finish_committed_masters();

  if (resolve_done) {
    assert(mds->is_resolve());
    trim_unlinked_inodes();
    recalc_auth_bits(false);
    resolve_done.release()->complete(0);
  } else {
    maybe_send_pending_rejoins();
  }
}
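
/*
 * Sketch of the resolve-ack outcomes handled below: for each reqid the
 * master acked,
 *   commit + recovering (is_resolve)  -> journal ESlaveUpdate::OP_COMMIT
 *   commit + survivor                 -> finish the surviving MDRequest
 *   abort  + recovering               -> replay the slave rollback
 *   abort  + survivor                 -> mark aborted / queue rollback
 */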
/* This function puts (releases) the passed message before returning */
void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
{
  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (!resolve_ack_gather.count(from) ||
      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
    ack->put();
    return;
  }

  if (ambiguous_slave_updates.count(from)) {
    assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
    assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
  }

  for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
       p != ack->commit.end();
       ++p) {
    dout(10) << " commit on slave " << p->first << dendl;

    if (ambiguous_slave_updates.count(from)) {
      remove_ambiguous_slave_update(p->first, from);
      continue;
    }

    if (mds->is_resolve()) {
      // replay
      MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
      assert(su);

      // log commit
      mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
						      ESlaveUpdate::OP_COMMIT, su->origop),
				     new C_MDC_SlaveCommit(this, from, p->first));
      mds->mdlog->flush();

      finish_uncommitted_slave_update(p->first, from);
    } else {
      MDRequestRef mdr = request_get(p->first);
      // information about master imported caps
      if (p->second.length() > 0)
	mdr->more()->inode_import.claim(p->second);

      assert(mdr->slave_request == 0);  // shouldn't be doing anything!
      request_finish(mdr);
    }
  }

  for (vector<metareqid_t>::iterator p = ack->abort.begin();
       p != ack->abort.end();
       ++p) {
    dout(10) << " abort on slave " << *p << dendl;

    if (mds->is_resolve()) {
      MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
      assert(su);

      // perform rollback (and journal a rollback entry)
      // note: this will hold up the resolve a bit, until the rollback entries journal.
      MDRequestRef null_ref;
      switch (su->origop) {
      case ESlaveUpdate::LINK:
	mds->server->do_link_rollback(su->rollback, from, null_ref);
	break;
      case ESlaveUpdate::RENAME:
	mds->server->do_rename_rollback(su->rollback, from, null_ref);
	break;
      case ESlaveUpdate::RMDIR:
	mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
	break;
      default:
	ceph_abort();
      }
    } else {
      MDRequestRef mdr = request_get(*p);
      mdr->aborted = true;
      if (mdr->slave_request) {
	if (mdr->slave_did_prepare()) // journaling slave prepare ?
	  add_rollback(*p, from);
      } else {
	request_finish(mdr);
      }
    }
  }

  if (!ambiguous_slave_updates.count(from))
    resolve_ack_gather.erase(from);
  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
    send_subtree_resolves();
    process_delayed_resolve();
  }

  ack->put();
}
void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
{
  assert(uncommitted_slave_updates[master].count(reqid) == 0);
  uncommitted_slave_updates[master][reqid] = su;
  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
    uncommitted_slave_rename_olddir[*p]++;
  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
    uncommitted_slave_unlink[*p]++;
}
void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
{
  assert(uncommitted_slave_updates[master].count(reqid));
  MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];

  uncommitted_slave_updates[master].erase(reqid);
  if (uncommitted_slave_updates[master].empty())
    uncommitted_slave_updates.erase(master);
  // discard the non-auth subtree we renamed out of
  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
    CInode *diri = *p;
    map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
    assert(it != uncommitted_slave_rename_olddir.end());
    it->second--;
    if (it->second == 0) {
      uncommitted_slave_rename_olddir.erase(it);
      list<CDir*> ls;
      diri->get_dirfrags(ls);
      for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
	CDir *root = get_subtree_root(*q);
	if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
	  try_trim_non_auth_subtree(root);
	  if (*q != root)
	    break;
	}
      }
    } else
      assert(it->second > 0);
  }
  // remove the inodes that were unlinked by the slave update
  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
    CInode *in = *p;
    map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
    assert(it != uncommitted_slave_unlink.end());
    it->second--;
    if (it->second == 0) {
      uncommitted_slave_unlink.erase(it);
      if (!in->get_projected_parent_dn())
	mds->mdcache->remove_inode_recursive(in);
    } else
      assert(it->second > 0);
  }
  delete su;
}
MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
{
  MDSlaveUpdate* su = NULL;
  if (uncommitted_slave_updates.count(master) &&
      uncommitted_slave_updates[master].count(reqid)) {
    su = uncommitted_slave_updates[master][reqid];
    assert(su);
  }
  return su;
}
void MDCache::finish_rollback(metareqid_t reqid) {
  assert(need_resolve_rollback.count(reqid));
  if (mds->is_resolve())
    finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
  need_resolve_rollback.erase(reqid);
  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
    send_subtree_resolves();
    process_delayed_resolve();
  }
}
void MDCache::disambiguate_other_imports()
{
  dout(10) << "disambiguate_other_imports" << dendl;

  bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
  // other nodes' ambiguous imports
  for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
       p != other_ambiguous_imports.end();
       ++p) {
    mds_rank_t who = p->first;
    dout(10) << "ambiguous imports for mds." << who << dendl;

    for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
      // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
      CDir *dir = get_force_dirfrag(q->first, recovering);
      if (!dir) continue;

      if (dir->is_ambiguous_auth() ||	// works for me_ambig or if i am a surviving bystander
	  dir->authority() == CDIR_AUTH_UNDEF) { // resolving
	dout(10) << "  mds." << who << " did import " << *dir << dendl;
	adjust_bounded_subtree_auth(dir, q->second, who);
	try_subtree_merge(dir);
      } else {
	dout(10) << "  mds." << who << " did not import " << *dir << dendl;
      }
    }
  }
  other_ambiguous_imports.clear();
}
void MDCache::disambiguate_my_imports()
{
  dout(10) << "disambiguate_my_imports" << dendl;

  if (!mds->is_resolve()) {
    assert(my_ambiguous_imports.empty());
    return;
  }

  disambiguate_other_imports();

  // my ambiguous imports
  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
  while (!my_ambiguous_imports.empty()) {
    map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();

    CDir *dir = get_dirfrag(q->first);
    assert(dir);

    if (dir->authority() != me_ambig) {
      dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
      cancel_ambiguous_import(dir);

      mds->mdlog->start_submit_entry(new EImportFinish(dir, false));

      // subtree may have been swallowed by another node claiming dir
      // as their own.
      CDir *root = get_subtree_root(dir);
      if (root != dir)
	dout(10) << "  subtree root is " << *root << dendl;
      assert(root->dir_auth.first != mds->get_nodeid());  // no us!
      try_trim_non_auth_subtree(root);
    } else {
      dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
      finish_ambiguous_import(q->first);
      mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
    }
  }
  assert(my_ambiguous_imports.empty());
  mds->mdlog->flush();

  // verify all my subtrees are unambiguous!
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (dir->is_ambiguous_dir_auth()) {
      dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
    }
    assert(!dir->is_ambiguous_dir_auth());
  }

  show_subtrees();
}
void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
{
  assert(my_ambiguous_imports.count(base) == 0);
  my_ambiguous_imports[base] = bounds;
}
void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
{
  // make a list
  vector<dirfrag_t> binos;
  for (set<CDir*>::iterator p = bounds.begin();
       p != bounds.end();
       ++p)
    binos.push_back((*p)->dirfrag());

  // note: this can get called twice if the exporter fails during recovery
  if (my_ambiguous_imports.count(base->dirfrag()))
    my_ambiguous_imports.erase(base->dirfrag());

  add_ambiguous_import(base->dirfrag(), binos);
}
void MDCache::cancel_ambiguous_import(CDir *dir)
{
  dirfrag_t df = dir->dirfrag();
  assert(my_ambiguous_imports.count(df));
  dout(10) << "cancel_ambiguous_import " << df
	   << " bounds " << my_ambiguous_imports[df]
	   << dendl;
  my_ambiguous_imports.erase(df);
}
void MDCache::finish_ambiguous_import(dirfrag_t df)
{
  assert(my_ambiguous_imports.count(df));
  vector<dirfrag_t> bounds;
  bounds.swap(my_ambiguous_imports[df]);
  my_ambiguous_imports.erase(df);

  dout(10) << "finish_ambiguous_import " << df
	   << " bounds " << bounds
	   << dendl;
  CDir *dir = get_dirfrag(df);
  assert(dir);

  // adjust dir_auth, import maps
  adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
  try_subtree_merge(dir);
}
void MDCache::remove_inode_recursive(CInode *in)
{
  dout(10) << "remove_inode_recursive " << *in << dendl;
  list<CDir*> ls;
  in->get_dirfrags(ls);
  list<CDir*>::iterator p = ls.begin();
  while (p != ls.end()) {
    CDir *subdir = *p++;

    dout(10) << " removing dirfrag " << subdir << dendl;
    auto it = subdir->items.begin();
    while (it != subdir->items.end()) {
      CDentry *dn = it->second;
      ++it;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *tin = dnl->get_inode();
	subdir->unlink_inode(dn, false);
	remove_inode_recursive(tin);
      }
      subdir->remove_dentry(dn);
    }

    if (subdir->is_subtree_root())
      remove_subtree(subdir);
    in->close_dirfrag(subdir->dirfrag().frag);
  }
  remove_inode(in);
}
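
/*
 * remove_inode_recursive() assumes it owns the subtree it is deleting:
 * trim_unlinked_inodes() below first collects the unconnected inodes
 * into a list and only then removes them, since removal invalidates
 * the inode_map iteration.
 */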
bool MDCache::expire_recursive(
  CInode *in,
  map<mds_rank_t, MCacheExpire*>& expiremap)
{
  assert(!in->is_auth());

  dout(10) << __func__ << ":" << *in << dendl;

  // Recurse into any dirfrags beneath this inode
  list<CDir*> ls;
  in->get_dirfrags(ls);
  for (auto subdir : ls) {
    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
      return true;
    }

    for (auto &it : subdir->items) {
      CDentry *dn = it.second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
	CInode *tin = dnl->get_inode();

	/* Remote strays with linkage (i.e. hardlinks) should not be
	 * expired, because they may be the target of
	 * a rename() as the owning MDS shuts down */
	if (!tin->is_stray() && tin->inode.nlink) {
	  dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
	  return true;
	}

	const bool abort = expire_recursive(tin, expiremap);
	if (abort) {
	  return true;
	}
      }
      if (dn->lru_is_expireable()) {
	trim_dentry(dn, expiremap);
      } else {
	dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
	return true;
      }
    }
  }

  return false;
}
void MDCache::trim_unlinked_inodes()
{
  dout(7) << "trim_unlinked_inodes" << dendl;
  list<CInode*> q;
  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (in->get_parent_dn() == NULL && !in->is_base()) {
      dout(7) << " will trim from " << *in << dendl;
      q.push_back(in);
    }
  }
  for (list<CInode*>::iterator p = q.begin(); p != q.end(); ++p)
    remove_inode_recursive(*p);
}
/** recalc_auth_bits()
 * once subtree auth is disambiguated, we need to adjust all the
 * auth and dirty bits in our cache before moving on.
 */
void MDCache::recalc_auth_bits(bool replay)
{
  dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") <<  dendl;

  if (root) {
    root->inode_auth.first = mds->mdsmap->get_root();
    bool auth = mds->get_nodeid() == root->inode_auth.first;
    if (auth) {
      root->state_set(CInode::STATE_AUTH);
    } else {
      root->state_clear(CInode::STATE_AUTH);
      if (!replay)
	root->state_set(CInode::STATE_REJOINING);
    }
  }

  set<CInode*> subtree_inodes;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->dir_auth.first == mds->get_nodeid())
      subtree_inodes.insert(p->first->inode);
  }

  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->inode->is_mdsdir()) {
      CInode *in = p->first->inode;
      bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
      if (auth) {
	in->state_set(CInode::STATE_AUTH);
      } else {
	in->state_clear(CInode::STATE_AUTH);
	if (!replay)
	  in->state_set(CInode::STATE_REJOINING);
      }
    }

    list<CDir*> dfq;  // dirfrag queue
    dfq.push_back(p->first);

    bool auth = p->first->authority().first == mds->get_nodeid();
    dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;

    while (!dfq.empty()) {
      CDir *dir = dfq.front();
      dfq.pop_front();

      // dir
      if (auth) {
	dir->state_set(CDir::STATE_AUTH);
      } else {
	dir->state_clear(CDir::STATE_AUTH);
	if (!replay) {
	  // close empty non-auth dirfrag
	  if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
	    dir->inode->close_dirfrag(dir->get_frag());
	    continue;
	  }
	  dir->state_set(CDir::STATE_REJOINING);
	  dir->state_clear(CDir::STATE_COMPLETE);
	  if (dir->is_dirty())
	    dir->mark_clean();
	}
      }

      // dentries in this dir
      for (auto &p : dir->items) {
	// dn
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();
	if (auth) {
	  dn->state_set(CDentry::STATE_AUTH);
	} else {
	  dn->state_clear(CDentry::STATE_AUTH);
	  if (!replay) {
	    dn->state_set(CDentry::STATE_REJOINING);
	    if (dn->is_dirty())
	      dn->mark_clean();
	  }
	}

	if (dnl->is_primary()) {
	  // inode
	  CInode *in = dnl->get_inode();
	  if (auth) {
	    in->state_set(CInode::STATE_AUTH);
	  } else {
	    in->state_clear(CInode::STATE_AUTH);
	    if (!replay) {
	      in->state_set(CInode::STATE_REJOINING);
	      if (in->is_dirty())
		in->mark_clean();
	      if (in->is_dirty_parent())
		in->clear_dirty_parent();
	      // avoid touching scatterlocks for our subtree roots!
	      if (subtree_inodes.count(in) == 0)
		in->clear_scatter_dirty();
	    }
	  }
	  // recurse?
	  if (in->is_dir())
	    in->get_nested_dirfrags(dfq);
	}
      }
    }
  }

  show_subtrees();
  show_cache();
}
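
/*
 * recalc_auth_bits(false) is invoked from maybe_resolve_finish() once
 * imports are disambiguated; the replay flag only suppresses the
 * REJOINING state bits and the clean-up of dirty non-auth state.
 */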
// ===========================================================================
// REJOIN

/*
 * notes on scatterlock recovery:
 *
 * - recovering inode replica sends scatterlock data for any subtree
 *   roots (the only ones that are possibly dirty).
 *
 * - surviving auth incorporates any provided scatterlock data.  any
 *   pending gathers are then finished, as with the other lock types.
 *
 * that takes care of surviving auth + (recovering replica)*.
 *
 * - surviving replica sends strong_inode, which includes current
 *   scatterlock state, AND any dirty scatterlock data.  this
 *   provides the recovering auth with everything it might need.
 *
 * - recovering auth must pick initial scatterlock state based on
 *   (weak|strong) rejoins.
 *   - always assimilate scatterlock data (it can't hurt)
 *   - any surviving replica in SCATTER state -> SCATTER.  otherwise, SYNC.
 *   - include base inode in ack for all inodes that saw scatterlock content
 *
 * also, for scatter gather,
 *
 * - auth increments {frag,r}stat.version on completion of any gather.
 *
 * - auth incorporates changes in a gather _only_ if the version
 *   matches.
 *
 * - replica discards changes any time the scatterlock syncs, and
 *   after recovery.
 */
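/*
 * A concrete (illustrative) run of the version rule above: auth at
 * rstat.version 5 initiates a gather; replicas send back data tagged
 * with version 5.  Data tagged with any other version is stale from an
 * older cycle and is discarded; on completion the auth bumps the
 * version to 6, so replies to the next gather can be told apart.
 */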
void MDCache::dump_rejoin_status(Formatter *f) const
{
  f->open_object_section("rejoin_status");
  f->dump_stream("rejoin_gather") << rejoin_gather;
  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
  f->close_section();
}
void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
{
  dout(10) << "rejoin_start" << dendl;
  assert(!rejoin_done);
  rejoin_done.reset(rejoin_done_);

  rejoin_gather = recovery_set;
  // need finish opening cap inodes before sending cache rejoins
  rejoin_gather.insert(mds->get_nodeid());
  process_imported_caps();
}
3944 * this initiates rejoin. it shoudl be called before we get any
3945 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3947 * we start out by sending rejoins to everyone in the recovery set.
3949 * if we are rejoin, send for all regions in our cache.
3950 * if we are active|stopping, send only to nodes that are are rejoining.
void MDCache::rejoin_send_rejoins()
{
  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;

  if (rejoin_gather.count(mds->get_nodeid())) {
    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
    rejoins_pending = true;
    return;
  }
  if (!resolve_gather.empty()) {
    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
            << resolve_gather << ")" << dendl;
    rejoins_pending = true;
    return;
  }
  assert(!migrator->is_importing());
  assert(!migrator->is_exporting());

  if (!mds->is_rejoin()) {
    disambiguate_other_imports();
  }

  map<mds_rank_t, MMDSCacheRejoin*> rejoins;

  // if i am rejoining, send a rejoin to everyone.
  // otherwise, just send to others who are rejoining.
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (*p == mds->get_nodeid())  continue;  // nothing to myself!
    if (rejoin_sent.count(*p)) continue;     // already sent a rejoin to this node!
    if (mds->is_rejoin())
      rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
    else if (mds->mdsmap->is_rejoin(*p))
      rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG);
  }

  if (mds->is_rejoin()) {
    map<client_t, set<mds_rank_t> > client_exports;
    for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) {
      mds_rank_t target = p->second.first;
      if (rejoins.count(target) == 0)
        continue;
      rejoins[target]->cap_exports[p->first] = p->second.second;
      for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q)
        client_exports[q->first].insert(target);
    }
    for (map<client_t, set<mds_rank_t> >::iterator p = client_exports.begin();
         p != client_exports.end();
         ++p) {
      entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v));
      for (set<mds_rank_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        rejoins[*q]->client_map[p->first] = inst;
    }
  }

  // check all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    assert(dir->is_subtree_root());
    if (dir->is_ambiguous_dir_auth()) {
      // exporter is recovering, importer is survivor.
      assert(rejoins.count(dir->authority().first));
      assert(!rejoins.count(dir->authority().second));
      continue;
    }

    // my subtree?
    if (dir->is_auth())
      continue;  // skip my own regions!

    mds_rank_t auth = dir->get_dir_auth().first;

    if (rejoins.count(auth) == 0)
      continue;   // don't care about this node's subtrees

    rejoin_walk(dir, rejoins[auth]);
  }

  // rejoin root inodes, too
  for (map<mds_rank_t, MMDSCacheRejoin*>::iterator p = rejoins.begin();
       p != rejoins.end();
       ++p) {
    if (mds->is_rejoin()) {
      // weak
      if (p->first == 0 && root) {
        p->second->add_weak_inode(root->vino());
        if (root->is_dirty_scattered()) {
          dout(10) << " sending scatterlock state on root " << *root << dendl;
          p->second->add_scatterlock_state(root);
        }
      }
      if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
        p->second->add_weak_inode(in->vino());
      }
    } else {
      // strong
      if (p->first == 0 && root) {
        p->second->add_strong_inode(root->vino(),
                                    root->get_replica_nonce(),
                                    root->get_caps_wanted(),
                                    root->filelock.get_state(),
                                    root->nestlock.get_state(),
                                    root->dirfragtreelock.get_state());
        root->state_set(CInode::STATE_REJOINING);
        if (root->is_dirty_scattered()) {
          dout(10) << " sending scatterlock state on root " << *root << dendl;
          p->second->add_scatterlock_state(root);
        }
      }

      if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
        p->second->add_strong_inode(in->vino(),
                                    in->get_replica_nonce(),
                                    in->get_caps_wanted(),
                                    in->filelock.get_state(),
                                    in->nestlock.get_state(),
                                    in->dirfragtreelock.get_state());
        in->state_set(CInode::STATE_REJOINING);
      }
    }
  }

  if (!mds->is_rejoin()) {
    // i am survivor.  send strong rejoin.
    // note request remote_auth_pins, xlocks
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
         p != active_requests.end();
         ++p) {
      MDRequestRef& mdr = p->second;
      if (mdr->is_slave())
        continue;
      // auth pins
      for (map<MDSCacheObject*,mds_rank_t>::iterator q = mdr->remote_auth_pins.begin();
           q != mdr->remote_auth_pins.end();
           ++q) {
        if (!q->first->is_auth()) {
          assert(q->second == q->first->authority().first);
          if (rejoins.count(q->second) == 0) continue;
          MMDSCacheRejoin *rejoin = rejoins[q->second];

          dout(15) << " " << *mdr << " authpin on " << *q->first << dendl;
          MDSCacheObjectInfo i;
          q->first->set_object_info(i);
          if (i.ino)
            rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
          else
            rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);

          if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
              mdr->more()->rename_inode == q->first)
            rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
                                             mdr->reqid, mdr->attempt);
        }
      }
      // xlocks
      for (set<SimpleLock*>::iterator q = mdr->xlocks.begin();
           q != mdr->xlocks.end();
           ++q) {
        if (!(*q)->get_parent()->is_auth()) {
          mds_rank_t who = (*q)->get_parent()->authority().first;
          if (rejoins.count(who) == 0) continue;
          MMDSCacheRejoin *rejoin = rejoins[who];

          dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl;
          MDSCacheObjectInfo i;
          (*q)->get_parent()->set_object_info(i);
          if (i.ino)
            rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(),
                                    mdr->reqid, mdr->attempt);
          else
            rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
                                     mdr->reqid, mdr->attempt);
        }
      }
      // remote wrlocks
      for (map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
           q != mdr->remote_wrlocks.end();
           ++q) {
        mds_rank_t who = q->second;
        if (rejoins.count(who) == 0) continue;
        MMDSCacheRejoin *rejoin = rejoins[who];

        dout(15) << " " << *mdr << " wrlock on " << q->second
                 << " " << q->first->get_parent() << dendl;
        MDSCacheObjectInfo i;
        q->first->get_parent()->set_object_info(i);
        assert(i.ino);
        rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(),
                                 mdr->reqid, mdr->attempt);
      }
    }
  }

  // send the messages
  for (map<mds_rank_t,MMDSCacheRejoin*>::iterator p = rejoins.begin();
       p != rejoins.end();
       ++p) {
    assert(rejoin_sent.count(p->first) == 0);
    assert(rejoin_ack_gather.count(p->first) == 0);
    rejoin_sent.insert(p->first);
    rejoin_ack_gather.insert(p->first);
    mds->send_message_mds(p->second, p->first);
  }
  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
  rejoins_pending = false;

  // nothing?
  if (mds->is_rejoin() && rejoin_gather.empty()) {
    dout(10) << "nothing to rejoin" << dendl;
    rejoin_gather_finish();
  }
}
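// in short: a recovering rank sends OP_WEAK to every rank in the recovery
// set, while a survivor sends OP_STRONG only to ranks that are themselves
// rejoining.  acks are collected in rejoin_ack_gather, which includes our
// own rank until rejoin_gather_finish() has run.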
/**
 * rejoin_walk - build rejoin declarations for a subtree
 *
 * @param dir subtree root
 * @param rejoin rejoin message
 *
 * from a rejoining node:
 *  weak dirfrag
 *  weak dentries (w/ connectivity)
 *
 * from a surviving node:
 *  strong dirfrag
 *  strong dentries (no connectivity!)
 *  strong inodes
 */
void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
{
  dout(10) << "rejoin_walk " << *dir << dendl;

  list<CDir*> nested;  // finish this dir, then do nested items

  if (mds->is_rejoin()) {
    // WEAK
    rejoin->add_weak_dirfrag(dir->dirfrag());
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      assert(dn->last == CEPH_NOSNAP);
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_weak_primary_dentry " << *dn << dendl;
      assert(dnl->is_primary());
      CInode *in = dnl->get_inode();
      assert(dnl->get_inode()->is_dir());
      rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
      in->get_nested_dirfrags(nested);
      if (in->is_dirty_scattered()) {
        dout(10) << " sending scatterlock state on " << *in << dendl;
        rejoin->add_scatterlock_state(in);
      }
    }
  } else {
    // STRONG
    dout(15) << " add_strong_dirfrag " << *dir << dendl;
    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
    dir->state_set(CDir::STATE_REJOINING);

    for (auto it = dir->items.begin(); it != dir->items.end(); ++it) {
      CDentry *dn = it->second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_strong_dentry " << *dn << dendl;
      rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
                                dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
                                dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
                                dnl->is_remote() ? dnl->get_remote_d_type():0,
                                dn->get_replica_nonce(),
                                dn->lock.get_state());
      dn->state_set(CDentry::STATE_REJOINING);
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        dout(15) << " add_strong_inode " << *in << dendl;
        rejoin->add_strong_inode(in->vino(),
                                 in->get_replica_nonce(),
                                 in->get_caps_wanted(),
                                 in->filelock.get_state(),
                                 in->nestlock.get_state(),
                                 in->dirfragtreelock.get_state());
        in->state_set(CInode::STATE_REJOINING);
        in->get_nested_dirfrags(nested);
        if (in->is_dirty_scattered()) {
          dout(10) << " sending scatterlock state on " << *in << dendl;
          rejoin->add_scatterlock_state(in);
        }
      }
    }
  }

  // recurse into nested dirs
  for (list<CDir*>::iterator p = nested.begin();
       p != nested.end();
       ++p)
    rejoin_walk(*p, rejoin);
}
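// note the traversal order: each dirfrag is declared in full before the
// nested dirfrags collected above are walked, so the recipient always sees
// a parent dirfrag before any of its children within a subtree.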
/*
 * i got a rejoin.
 *  - reply with the lockstate
 *
 * if i am active|stopping,
 *  - remove source from replica list for everything not referenced here.
 * This function puts the passed message before returning.
 */
void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
{
  dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
          << " (" << m->get_payload().length() << " bytes)"
          << dendl;

  switch (m->op) {
  case MMDSCacheRejoin::OP_WEAK:
    handle_cache_rejoin_weak(m);
    break;
  case MMDSCacheRejoin::OP_STRONG:
    handle_cache_rejoin_strong(m);
    break;
  case MMDSCacheRejoin::OP_ACK:
    handle_cache_rejoin_ack(m);
    break;

  default:
    ceph_abort();
  }
  m->put();
}
/*
 * handle_cache_rejoin_weak
 *
 * the sender
 *  - is recovering from their journal.
 *  - may have incorrect (out of date) inode contents
 *  - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
 *
 * if the sender didn't trim_non_auth(), they
 *  - may have incorrect (out of date) dentry/inode linkage
 *  - may have deleted/purged inodes
 * and i may have to go to disk to get accurate inode contents.  yuck.
 * This function DOES NOT put the passed message before returning
 */
void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
{
  mds_rank_t from = mds_rank_t(weak->get_source().num());

  // possible response(s)
  MMDSCacheRejoin *ack = 0;        // if survivor
  set<vinodeno_t> acked_inodes;    // if survivor
  set<SimpleLock *> gather_locks;  // if survivor
  bool survivor = false;           // am i a survivor?

  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    dout(10) << "i am a survivor, and will ack immediately" << dendl;
    ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);

    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;

    // check cap exports
    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      assert(!in || in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
        Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
        Capability::Import& im = imported_caps[p->first][q->first];
        if (cap) {
          im.cap_id = cap->get_cap_id();
          im.issue_seq = cap->get_last_seq();
          im.mseq = cap->get_mseq();
        }
      }
      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
    }

    ::encode(imported_caps, ack->imported_caps);
  } else {
    assert(mds->is_rejoin());

    // we may have already received a strong rejoin from the sender.
    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
    assert(gather_locks.empty());

    // check cap exports.
    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());

    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      assert(!in || in->is_auth());
      // note
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
        cap_imports[p->first][q->first][from] = q->second;
      }
    }
  }

  // assimilate any potentially dirty scatterlock state
  for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
       p != weak->inode_scatterlocks.end();
       ++p) {
    CInode *in = get_inode(p->first);
    assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
    if (!survivor)
      rejoin_potential_updated_scatterlocks.insert(in);
  }

  // recovering peer may send incorrect dirfrags here.  we need to
  // infer which dirfrag they meant.  the ack will include a
  // strong_dirfrag that will set them straight on the fragmentation.

  // walk weak map
  set<CDir*> dirs_to_share;
  for (set<dirfrag_t>::iterator p = weak->weak_dirfrags.begin();
       p != weak->weak_dirfrags.end();
       ++p) {
    CInode *diri = get_inode(p->ino);
    if (!diri)
      dout(0) << " missing dir ino " << p->ino << dendl;
    assert(diri);

    list<frag_t> ls;
    if (diri->dirfragtree.is_leaf(p->frag)) {
      ls.push_back(p->frag);
    } else {
      diri->dirfragtree.get_leaves_under(p->frag, ls);
      if (ls.empty())
        ls.push_back(diri->dirfragtree[p->frag.value()]);
    }
    for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
      frag_t fg = *q;
      CDir *dir = diri->get_dirfrag(fg);
      if (!dir) {
        dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl;
        continue;
      }
      assert(dir);
      if (dirs_to_share.count(dir)) {
        dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl;
      } else {
        dirs_to_share.insert(dir);
        unsigned nonce = dir->add_replica(from);
        dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl;
        if (ack) {
          ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
          ack->add_dirfrag_base(dir);
        }
      }
    }
  }

  for (map<inodeno_t,map<string_snap_t,MMDSCacheRejoin::dn_weak> >::iterator p = weak->weak.begin();
       p != weak->weak.end();
       ++p) {
    CInode *diri = get_inode(p->first);
    if (!diri)
      dout(0) << " missing dir ino " << p->first << dendl;
    assert(diri);

    // weak dentries
    CDir *dir = 0;
    for (map<string_snap_t,MMDSCacheRejoin::dn_weak>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      // locate proper dirfrag.
      //  optimize for common case (one dirfrag) to avoid dirs_to_share set check
      frag_t fg = diri->pick_dirfrag(q->first.name);
      if (!dir || dir->get_frag() != fg) {
        dir = diri->get_dirfrag(fg);
        if (!dir)
          dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
        assert(dir);
        assert(dirs_to_share.count(dir));
      }

      // and dentry
      CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
      assert(dn);
      CDentry::linkage_t *dnl = dn->get_linkage();
      assert(dnl->is_primary());

      if (survivor && dn->is_replica(from))
        dentry_remove_replica(dn, from, gather_locks);
      unsigned dnonce = dn->add_replica(from);
      dout(10) << " have " << *dn << dendl;
      if (ack)
        ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
                               dnl->get_inode()->ino(), inodeno_t(0), 0,
                               dnonce, dn->lock.get_replica_state());

      // inode
      CInode *in = dnl->get_inode();
      assert(in);

      if (survivor && in->is_replica(from))
        inode_remove_replica(in, from, true, gather_locks);
      unsigned inonce = in->add_replica(from);
      dout(10) << " have " << *in << dendl;

      // scatter the dirlock, just in case?
      if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
        in->filelock.set_state(LOCK_MIX);

      if (ack) {
        acked_inodes.insert(in->vino());
        ack->add_inode_base(in, mds->mdsmap->get_up_features());
        bufferlist bl;
        in->_encode_locks_state_for_rejoin(bl, from);
        ack->add_inode_locks(in, inonce, bl);
      }
    }
  }

  // weak base inodes?  (root, stray, etc.)
  for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
       p != weak->weak_inodes.end();
       ++p) {
    CInode *in = get_inode(*p);
    assert(in);   // hmm fixme wrt stray?
    if (survivor && in->is_replica(from))
      inode_remove_replica(in, from, true, gather_locks);
    unsigned inonce = in->add_replica(from);
    dout(10) << " have base " << *in << dendl;

    if (ack) {
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
      bufferlist bl;
      in->_encode_locks_state_for_rejoin(bl, from);
      ack->add_inode_locks(in, inonce, bl);
    }
  }

  assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (survivor) {
    // survivor.  do everything now.
    for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
         p != weak->inode_scatterlocks.end();
         ++p) {
      CInode *in = get_inode(p->first);
      assert(in);
      dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
    }

    rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
    mds->send_message(ack, weak->get_connection());

    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
      if (!(*p)->is_stable())
        mds->locker->eval_gather(*p);
    }
  } else {
    // done?
    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
      rejoin_gather_finish();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
    }
  }
}
/*
 * rejoin_scour_survivor_replicas - remove source from replica list on unmentioned objects
 *
 * all validated replicas are acked with a strong nonce, etc.  if that isn't in the
 * ack, the replica dne, and we can remove it from our replica maps.
 */
void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
                                             set<vinodeno_t>& acked_inodes,
                                             set<SimpleLock *>& gather_locks)
{
  dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;

  auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
    // inode?
    if (in->is_auth() &&
        in->is_replica(from) &&
        (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
      inode_remove_replica(in, from, false, gather_locks);
      dout(10) << " rem " << *in << dendl;
    }

    if (!in->is_dir())
      return;

    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    for (list<CDir*>::iterator p = dfs.begin();
         p != dfs.end();
         ++p) {
      CDir *dir = *p;
      if (!dir->is_auth())
        continue;

      if (dir->is_replica(from) &&
          (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
        dir->remove_replica(from);
        dout(10) << " rem " << *dir << dendl;
      }

      // dentries
      for (auto &p : dir->items) {
        CDentry *dn = p.second;

        if (dn->is_replica(from) &&
            (ack == NULL ||
             ack->strong_dentries.count(dir->dirfrag()) == 0 ||
             ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->get_name(), dn->last)) == 0)) {
          dentry_remove_replica(dn, from, gather_locks);
          dout(10) << " rem " << *dn << dendl;
        }
      }
    }
  };

  for (auto &p : inode_map)
    scour_func(p.second);
  for (auto &p : snap_inode_map)
    scour_func(p.second);
}
CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
{
  CInode *in = new CInode(this, true, 1, last);
  in->inode.ino = ino;
  in->state_set(CInode::STATE_REJOINUNDEF);
  add_inode(in);
  rejoin_undef_inodes.insert(in);
  dout(10) << " invented " << *in << dendl;
  return in;
}
CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
{
  CInode *in = get_inode(df.ino);
  if (!in)
    in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
  if (!in->is_dir()) {
    assert(in->state_test(CInode::STATE_REJOINUNDEF));
    in->inode.mode = S_IFDIR;
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
  }
  CDir *dir = in->get_or_open_dirfrag(this, df.frag);
  dir->state_set(CDir::STATE_REJOINUNDEF);
  rejoin_undef_dirfrags.insert(dir);
  dout(10) << " invented " << *dir << dendl;
  return dir;
}
/* This function DOES NOT put the passed message before returning */
void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
{
  mds_rank_t from = mds_rank_t(strong->get_source().num());

  // only a recovering node will get a strong rejoin.
  assert(mds->is_rejoin());

  // assimilate any potentially dirty scatterlock state
  for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
       p != strong->inode_scatterlocks.end();
       ++p) {
    CInode *in = get_inode(p->first);
    assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
    rejoin_potential_updated_scatterlocks.insert(in);
  }

  rejoin_unlinked_inodes[from].clear();

  // surviving peer may send incorrect dirfrag here (maybe they didn't
  // get the fragment notify, or maybe we rolled back?).  we need to
  // infer the right frag and get them with the program.  somehow.
  // we don't normally send ACK.. so we'll need to bundle this with
  // MISSING or something.

  // strong dirfrags/dentries.
  //  also process auth_pins, xlocks.
  for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
       p != strong->strong_dirfrags.end();
       ++p) {
    CInode *diri = get_inode(p->first.ino);
    if (!diri)
      diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
    CDir *dir = diri->get_dirfrag(p->first.frag);
    bool refragged = false;
    if (dir) {
      dout(10) << " have " << *dir << dendl;
    } else {
      if (diri->state_test(CInode::STATE_REJOINUNDEF))
        dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
      else if (diri->dirfragtree.is_leaf(p->first.frag))
        dir = rejoin_invent_dirfrag(p->first);
    }
    if (dir) {
      dir->add_replica(from, p->second.nonce);
      dir->dir_rep = p->second.dir_rep;
    } else {
      dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
      list<frag_t> ls;
      diri->dirfragtree.get_leaves_under(p->first.frag, ls);
      if (ls.empty())
        ls.push_back(diri->dirfragtree[p->first.frag.value()]);
      dout(10) << " maps to frag(s) " << ls << dendl;
      for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
        CDir *dir = diri->get_dirfrag(*q);
        if (!dir)
          dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q));
        else
          dout(10) << " have(approx) " << *dir << dendl;
        dir->add_replica(from, p->second.nonce);
        dir->dir_rep = p->second.dir_rep;
      }
      refragged = true;
    }

    map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
    for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
         q != dmap.end();
         ++q) {
      CDentry *dn;
      if (!refragged)
        dn = dir->lookup(q->first.name, q->first.snapid);
      else {
        frag_t fg = diri->pick_dirfrag(q->first.name);
        dir = diri->get_dirfrag(fg);
        assert(dir);
        dn = dir->lookup(q->first.name, q->first.snapid);
      }
      if (!dn) {
        if (q->second.is_remote()) {
          dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type,
                                      q->second.first, q->first.snapid);
        } else if (q->second.is_null()) {
          dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
        } else {
          CInode *in = get_inode(q->second.ino, q->first.snapid);
          if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
          dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
        }
        dout(10) << " invented " << *dn << dendl;
      }
      CDentry::linkage_t *dnl = dn->get_linkage();

      // dn auth_pin?
      if (strong->authpinned_dentries.count(p->first) &&
          strong->authpinned_dentries[p->first].count(q->first)) {
        for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
             r != strong->authpinned_dentries[p->first][q->first].end();
             ++r) {
          dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;

          // get/create slave mdrequest
          MDRequestRef mdr;
          if (have_request(r->reqid))
            mdr = request_get(r->reqid);
          else
            mdr = request_start_slave(r->reqid, r->attempt, strong);
          mdr->auth_pin(dn);
        }
      }

      // dn xlock?
      if (strong->xlocked_dentries.count(p->first) &&
          strong->xlocked_dentries[p->first].count(q->first)) {
        MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first];
        dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
        MDRequestRef mdr = request_get(r.reqid);  // should have this from auth_pin above.
        assert(mdr->is_auth_pinned(dn));
        if (!mdr->xlocks.count(&dn->versionlock)) {
          assert(dn->versionlock.can_xlock_local());
          dn->versionlock.get_xlock(mdr, mdr->get_client());
          mdr->xlocks.insert(&dn->versionlock);
          mdr->locks.insert(&dn->versionlock);
        }
        if (dn->lock.is_stable())
          dn->auth_pin(&dn->lock);
        dn->lock.set_state(LOCK_XLOCK);
        dn->lock.get_xlock(mdr, mdr->get_client());
        mdr->xlocks.insert(&dn->lock);
        mdr->locks.insert(&dn->lock);
      }

      dn->add_replica(from, q->second.nonce);
      dout(10) << " have " << *dn << dendl;

      if (dnl->is_primary()) {
        if (q->second.is_primary()) {
          if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
            // the survivor missed MDentryUnlink+MDentryLink messages ?
            assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
            CInode *in = get_inode(q->second.ino, q->first.snapid);
            assert(in);
            assert(in->get_parent_dn());
            rejoin_unlinked_inodes[from].insert(in);
            dout(7) << " sender has primary dentry but wrong inode" << dendl;
          }
        } else {
          // the survivor missed MDentryLink message ?
          assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
          dout(7) << " sender doesn't have primary dentry" << dendl;
        }
      } else {
        if (q->second.is_primary()) {
          // the survivor missed MDentryUnlink message ?
          CInode *in = get_inode(q->second.ino, q->first.snapid);
          assert(in);
          assert(in->get_parent_dn());
          rejoin_unlinked_inodes[from].insert(in);
          dout(7) << " sender has primary dentry but we don't" << dendl;
        }
      }
    }
  }

  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
       p != strong->strong_inodes.end();
       ++p) {
    CInode *in = get_inode(p->first);
    assert(in);
    in->add_replica(from, p->second.nonce);
    dout(10) << " have " << *in << dendl;

    MMDSCacheRejoin::inode_strong &is = p->second;

    // caps_wanted
    if (is.caps_wanted) {
      in->mds_caps_wanted[from] = is.caps_wanted;
      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
               << " on " << *in << dendl;
    }

    // scatterlocks?
    //  infer state from replica state:
    //   * go to MIX if they might have wrlocks
    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
    in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);

    // auth pin?
    if (strong->authpinned_inodes.count(in->vino())) {
      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
           r != strong->authpinned_inodes[in->vino()].end();
           ++r) {
        dout(10) << " inode authpin by " << *r << " on " << *in << dendl;

        // get/create slave mdrequest
        MDRequestRef mdr;
        if (have_request(r->reqid))
          mdr = request_get(r->reqid);
        else
          mdr = request_start_slave(r->reqid, r->attempt, strong);
        if (strong->frozen_authpin_inodes.count(in->vino())) {
          assert(!in->get_num_auth_pins());
          mdr->freeze_auth_pin(in);
        } else {
          assert(!in->is_frozen_auth_pin());
        }
        mdr->auth_pin(in);
      }
    }
    // xlock(s)?
    if (strong->xlocked_inodes.count(in->vino())) {
      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
           q != strong->xlocked_inodes[in->vino()].end();
           ++q) {
        SimpleLock *lock = in->get_lock(q->first);
        dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
        MDRequestRef mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
        assert(mdr->is_auth_pinned(in));
        if (!mdr->xlocks.count(&in->versionlock)) {
          assert(in->versionlock.can_xlock_local());
          in->versionlock.get_xlock(mdr, mdr->get_client());
          mdr->xlocks.insert(&in->versionlock);
          mdr->locks.insert(&in->versionlock);
        }
        if (lock->is_stable())
          in->auth_pin(lock);
        lock->set_state(LOCK_XLOCK);
        if (lock == &in->filelock)
          in->loner_cap = -1;
        lock->get_xlock(mdr, mdr->get_client());
        mdr->xlocks.insert(lock);
        mdr->locks.insert(lock);
      }
    }
  }
  // wrlock(s)?
  for (map<vinodeno_t, map<int, list<MMDSCacheRejoin::slave_reqid> > >::iterator p = strong->wrlocked_inodes.begin();
       p != strong->wrlocked_inodes.end();
       ++p) {
    CInode *in = get_inode(p->first);
    for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      SimpleLock *lock = in->get_lock(q->first);
      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
           r != q->second.end();
           ++r) {
        dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
        MDRequestRef mdr = request_get(r->reqid);  // should have this from auth_pin above.
        if (in->is_auth())
          assert(mdr->is_auth_pinned(in));
        lock->set_state(LOCK_MIX);
        if (lock == &in->filelock)
          in->loner_cap = -1;
        lock->get_wrlock(true);
        mdr->wrlocks.insert(lock);
        mdr->locks.insert(lock);
      }
    }
  }

  // done?
  assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
    rejoin_gather_finish();
  } else {
    dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
  }
}
/* This function DOES NOT put the passed message before returning */
void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
{
  dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  assert(mds->get_state() >= MDSMap::STATE_REJOIN);
  bool survivor = !mds->is_rejoin();

  // for sending cache expire message
  set<CInode*> isolated_inodes;
  set<CInode*> refragged_inodes;

  // dirs
  for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
       p != ack->strong_dirfrags.end();
       ++p) {
    // we may have had incorrect dir fragmentation; refragment based
    // on what the auth tells us.
    CDir *dir = get_dirfrag(p->first);
    if (!dir) {
      dir = get_force_dirfrag(p->first, false);
      if (dir)
        refragged_inodes.insert(dir->get_inode());
    }
    if (!dir) {
      CInode *diri = get_inode(p->first.ino);
      if (!diri) {
        // barebones inode; the full inode loop below will clean up.
        diri = new CInode(this, false);
        diri->inode.ino = p->first.ino;
        diri->inode.mode = S_IFDIR;
        diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
        add_inode(diri);
        if (MDS_INO_MDSDIR(from) == p->first.ino) {
          diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
          dout(10) << " add inode " << *diri << dendl;
        } else {
          diri->inode_auth = CDIR_AUTH_DEFAULT;
          isolated_inodes.insert(diri);
          dout(10) << " unconnected dirfrag " << p->first << dendl;
        }
      }
      // barebones dirfrag; the full dirfrag loop below will clean up.
      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
      if (MDS_INO_MDSDIR(from) == p->first.ino ||
          (dir->authority() != CDIR_AUTH_UNDEF &&
           dir->authority().first != from))
        adjust_subtree_auth(dir, from);
      dout(10) << " add dirfrag " << *dir << dendl;
    }

    dir->set_replica_nonce(p->second.nonce);
    dir->state_clear(CDir::STATE_REJOINING);
    dout(10) << " got " << *dir << dendl;

    // dentries
    map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = ack->strong_dentries[p->first];
    for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
         q != dmap.end();
         ++q) {
      CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
      if (!dn)
        dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);

      CDentry::linkage_t *dnl = dn->get_linkage();

      assert(dn->last == q->first.snapid);
      if (dn->first != q->second.first) {
        dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl;
        dn->first = q->second.first;
      }

      // may have bad linkage if we missed dentry link/unlink messages
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        if (!q->second.is_primary() ||
            vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
          dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
          dir->unlink_inode(dn);
        }
      } else if (dnl->is_remote()) {
        if (!q->second.is_remote() ||
            q->second.remote_ino != dnl->get_remote_ino() ||
            q->second.remote_d_type != dnl->get_remote_d_type()) {
          dout(10) << " had bad linkage for " << *dn << dendl;
          dir->unlink_inode(dn);
        }
      } else {
        if (!q->second.is_null())
          dout(10) << " had bad linkage for " << *dn << dendl;
      }

      // hmm, did we have the proper linkage here?
      if (dnl->is_null() && !q->second.is_null()) {
        if (q->second.is_remote()) {
          dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
        } else {
          CInode *in = get_inode(q->second.ino, q->first.snapid);
          if (!in) {
            // barebones inode; assume it's dir, the full inode loop below will clean up.
            in = new CInode(this, false, q->second.first, q->first.snapid);
            in->inode.ino = q->second.ino;
            in->inode.mode = S_IFDIR;
            in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
            add_inode(in);
            dout(10) << " add inode " << *in << dendl;
          } else if (in->get_parent_dn()) {
            dout(10) << " had bad linkage for " << *(in->get_parent_dn())
                     << ", unlinking " << *in << dendl;
            in->get_parent_dir()->unlink_inode(in->get_parent_dn());
          }
          dn->dir->link_primary_inode(dn, in);
          isolated_inodes.erase(in);
        }
      }

      dn->set_replica_nonce(q->second.nonce);
      dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
      dn->state_clear(CDentry::STATE_REJOINING);
      dout(10) << " got " << *dn << dendl;
    }
  }

  for (set<CInode*>::iterator p = refragged_inodes.begin();
       p != refragged_inodes.end();
       ++p) {
    list<CDir*> ls;
    (*p)->get_nested_dirfrags(ls);
    for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
      if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag()))
        continue;
      assert((*q)->get_num_any() == 0);
      (*p)->close_dirfrag((*q)->get_frag());
    }
  }

  // full dirfrags
  for (map<dirfrag_t, bufferlist>::iterator p = ack->dirfrag_bases.begin();
       p != ack->dirfrag_bases.end();
       ++p) {
    CDir *dir = get_dirfrag(p->first);
    assert(dir);
    bufferlist::iterator q = p->second.begin();
    dir->_decode_base(q);
    dout(10) << " got dir replica " << *dir << dendl;
  }

  // full inodes
  bufferlist::iterator p = ack->inode_base.begin();
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    bufferlist basebl;
    ::decode(ino, p);
    ::decode(last, p);
    ::decode(basebl, p);
    CInode *in = get_inode(ino, last);
    assert(in);
    bufferlist::iterator q = basebl.begin();
    in->_decode_base(q);
    dout(10) << " got inode base " << *in << dendl;
  }

  // inodes
  p = ack->inode_locks.begin();
  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    __u32 nonce;
    bufferlist lockbl;
    ::decode(ino, p);
    ::decode(last, p);
    ::decode(nonce, p);
    ::decode(lockbl, p);

    CInode *in = get_inode(ino, last);
    assert(in);
    in->set_replica_nonce(nonce);
    bufferlist::iterator q = lockbl.begin();
    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
    in->state_clear(CInode::STATE_REJOINING);
    dout(10) << " got inode locks " << *in << dendl;
  }

  // FIXME: This can happen if entire subtree, together with the inode subtree root
  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
  assert(isolated_inodes.empty());

  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
  bufferlist::iterator bp = ack->imported_caps.begin();
  ::decode(peer_imported, bp);

  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
       p != peer_imported.end();
       ++p) {
    auto& ex = cap_exports.at(p->first);
    assert(ex.first == from);
    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      auto r = ex.second.find(q->first);
      assert(r != ex.second.end());

      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
      if (!session) {
        dout(10) << " no session for client." << p->first << dendl;
        ex.second.erase(r);
        continue;
      }

      // mark client caps stale.
      MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0,
                                       r->second.capinfo.cap_id, 0,
                                       mds->get_osd_epoch_barrier());
      m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
                      (q->second.cap_id > 0 ? from : -1), 0);
      mds->send_message_client_counted(m, session);

      ex.second.erase(r);
    }
    assert(ex.second.empty());
  }

  // done?
  assert(rejoin_ack_gather.count(from));
  rejoin_ack_gather.erase(from);
  if (!survivor) {
    if (rejoin_gather.empty()) {
      // eval unstable scatter locks after all wrlocks are rejoined.
      while (!rejoin_eval_locks.empty()) {
        SimpleLock *lock = rejoin_eval_locks.front();
        rejoin_eval_locks.pop_front();
        if (!lock->is_stable())
          mds->locker->eval_gather(lock);
      }
    }

    if (rejoin_gather.empty() &&     // make sure we've gotten our FULL inodes, too.
        rejoin_ack_gather.empty()) {
      // finally, kickstart past snap parent opens
      open_snap_parents();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")"
              << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
    }
  } else {
    // survivor.
    mds->queue_waiters(rejoin_waiters);
  }
}
/**
 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
 *
 * FIXME: wait, can this actually happen?  a survivor should generate cache trim
 * messages that clean these guys up...
 */
void MDCache::rejoin_trim_undef_inodes()
{
  dout(10) << "rejoin_trim_undef_inodes" << dendl;

  while (!rejoin_undef_inodes.empty()) {
    set<CInode*>::iterator p = rejoin_undef_inodes.begin();
    CInode *in = *p;
    rejoin_undef_inodes.erase(p);

    in->clear_replica_map();

    // close out dirfrags
    if (in->is_dir()) {
      list<CDir*> dfls;
      in->get_dirfrags(dfls);
      for (list<CDir*>::iterator p = dfls.begin();
           p != dfls.end();
           ++p) {
        CDir *dir = *p;
        dir->clear_replica_map();

        for (auto &p : dir->items) {
          CDentry *dn = p.second;
          dn->clear_replica_map();

          dout(10) << " trimming " << *dn << dendl;
          dir->remove_dentry(dn);
        }

        dout(10) << " trimming " << *dir << dendl;
        in->close_dirfrag(dir->dirfrag().frag);
      }
    }

    CDentry *dn = in->get_parent_dn();
    if (dn) {
      dn->clear_replica_map();
      dout(10) << " trimming " << *dn << dendl;
      dn->dir->remove_dentry(dn);
    } else {
      dout(10) << " trimming " << *in << dendl;
      remove_inode(in);
    }
  }

  assert(rejoin_undef_inodes.empty());
}
void MDCache::rejoin_gather_finish()
{
  dout(10) << "rejoin_gather_finish" << dendl;
  assert(mds->is_rejoin());
  assert(rejoin_ack_gather.count(mds->get_nodeid()));

  if (open_undef_inodes_dirfrags())
    return;

  if (process_imported_caps())
    return;

  choose_lock_states_and_reconnect_caps();

  identify_files_to_recover();
  rejoin_send_acks();

  // signal completion of fetches, rejoin_gather_finish, etc.
  rejoin_ack_gather.erase(mds->get_nodeid());

  // did we already get our acks too?
  if (rejoin_ack_gather.empty()) {
    // finally, kickstart past snap parent opens
    open_snap_parents();
  }
}
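// rejoin_gather_finish() may be re-entered: open_undef_inodes_dirfrags()
// and process_imported_caps() each return true while they still have
// asynchronous work in flight, and their completion paths call back into
// this function once that work is done.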
class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
  inodeno_t ino;
public:
  C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->rejoin_open_ino_finish(ino, r);
  }
};
void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
{
  dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;

  if (ret < 0) {
    cap_imports_missing.insert(ino);
  } else if (ret == mds->get_nodeid()) {
    assert(get_inode(ino));
  } else {
    auto p = cap_imports.find(ino);
    assert(p != cap_imports.end());
    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
      assert(q->second.count(MDS_RANK_NONE));
      assert(q->second.size() == 1);
      rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
    }
    cap_imports.erase(p);
  }

  assert(cap_imports_num_opening > 0);
  cap_imports_num_opening--;

  if (cap_imports_num_opening == 0) {
    if (rejoin_gather.empty())
      rejoin_gather_finish();
    else if (rejoin_gather.count(mds->get_nodeid()))
      process_imported_caps();
  }
}
class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
public:
  map<client_t,pair<Session*,uint64_t> > session_map;
  C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
  void finish(int r) override {
    assert(r == 0);
    mdcache->rejoin_open_sessions_finish(session_map);
  }
};

void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
{
  dout(10) << "rejoin_open_sessions_finish" << dendl;
  mds->server->finish_force_open_sessions(session_map);
  rejoin_session_map.swap(session_map);
  if (rejoin_gather.empty())
    rejoin_gather_finish();
}
bool MDCache::process_imported_caps()
{
  dout(10) << "process_imported_caps" << dendl;

  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
    CInode *in = get_inode(p->first);
    if (in) {
      assert(in->is_auth());
      cap_imports_missing.erase(p->first);
      continue;
    }
    if (cap_imports_missing.count(p->first) > 0)
      continue;

    cap_imports_num_opening++;
    dout(10) << "  opening missing ino " << p->first << dendl;
    open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
    if (!(cap_imports_num_opening % 1000))
      mds->heartbeat_reset();
  }

  if (cap_imports_num_opening > 0)
    return true;

  // called by rejoin_gather_finish() ?
  if (rejoin_gather.count(mds->get_nodeid()) == 0) {
    if (!rejoin_client_map.empty() &&
        rejoin_session_map.empty()) {
      C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
      version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
                                                              finish->session_map);
      mds->mdlog->start_submit_entry(new ESessions(pv, rejoin_client_map), finish);
      mds->mdlog->flush();
      rejoin_client_map.clear();
      return true;
    }

    // process caps that were exported by slave rename
    for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
         p != rejoin_slave_exports.end();
         ++p) {
      CInode *in = get_inode(p->first);
      assert(in);
      for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
           q != p->second.second.end();
           ++q) {
        auto r = rejoin_session_map.find(q->first);
        if (r == rejoin_session_map.end())
          continue;

        Session *session = r->second.first;
        Capability *cap = in->get_client_cap(q->first);
        if (!cap)
          cap = in->add_client_cap(q->first, session);
        cap->merge(q->second, true);

        Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
        assert(cap->get_last_seq() == im.issue_seq);
        assert(cap->get_mseq() == im.mseq);
        cap->set_cap_id(im.cap_id);
        // send cap import because we assigned a new cap ID
        do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
                      p->second.first, CEPH_CAP_FLAG_AUTH);
      }
    }
    rejoin_slave_exports.clear();
    rejoin_imported_caps.clear();

    // process cap imports
    //  ino -> client -> frommds -> capex
    for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
      CInode *in = get_inode(p->first);
      if (!in) {
        dout(10) << " still missing ino " << p->first
                 << ", will try again after replayed client requests" << dendl;
        ++p;
        continue;
      }
      assert(in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        Session *session;
        {
          auto r = rejoin_session_map.find(q->first);
          session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
        }

        for (auto r = q->second.begin(); r != q->second.end(); ++r) {
          if (!session) {
            if (r->first >= 0)
              (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
            continue;
          }

          Capability *cap = in->reconnect_cap(q->first, r->second, session);
          add_reconnected_cap(q->first, in->ino(), r->second);
          if (r->first >= 0) {
            if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
              cap->inc_mseq();
            do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);

            Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
            im.cap_id = cap->get_cap_id();
            im.issue_seq = cap->get_last_seq();
            im.mseq = cap->get_mseq();
          }
        }
      }
      cap_imports.erase(p++);  // remove and move on
    }
  } else {
    trim_non_auth();

    assert(rejoin_gather.count(mds->get_nodeid()));
    rejoin_gather.erase(mds->get_nodeid());
    assert(!rejoin_ack_gather.count(mds->get_nodeid()));
    maybe_send_pending_rejoins();
  }
  return false;
}
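// a true return means "not done yet": either missing cap-import inodes are
// still being opened, or the session-open log entry has not committed yet.
// callers simply bail out and wait to be called again.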
void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect)
{
  // are this realm's parents fully open?
  if (realm->have_past_parents_open()) {
    dout(10) << " have past snap parents for realm " << *realm
             << " on " << *realm->inode << dendl;
    if (reconnect) {
      // finish off client snaprealm reconnects?
      auto p = reconnected_snaprealms.find(realm->inode->ino());
      if (p != reconnected_snaprealms.end()) {
        for (auto q = p->second.begin(); q != p->second.end(); ++q)
          finish_snaprealm_reconnect(q->first, realm, q->second);
        reconnected_snaprealms.erase(p);
      }
    }
  } else {
    if (!missing_snap_parents.count(realm->inode)) {
      dout(10) << " MISSING past snap parents for realm " << *realm
               << " on " << *realm->inode << dendl;
      realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS);
      missing_snap_parents[realm->inode].size();   // just to get it into the map!
    } else {
      dout(10) << " (already) MISSING past snap parents for realm " << *realm
               << " on " << *realm->inode << dendl;
    }
  }
}
void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
                                     client_t client, snapid_t snap_follows)
{
  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;

  const set<snapid_t>& snaps = realm->get_snaps();
  snapid_t follows = snap_follows;

  while (true) {
    CInode *in = pick_inode_snap(head_in, follows);
    if (in == head_in)
      break;
    dout(10) << " need snapflush from client." << client << " on " << *in << dendl;

    /* TODO: we can check the reconnected/flushing caps to find
     *       which locks need gathering */
    for (int i = 0; i < num_cinode_locks; i++) {
      int lockid = cinode_lock_info[i].lock;
      SimpleLock *lock = in->get_lock(lockid);
      assert(lock);
      in->client_snap_caps[lockid].insert(client);
      in->auth_pin(lock);
      lock->set_state(LOCK_SNAP_SYNC);
      lock->get_wrlock(true);
    }

    for (auto p = snaps.lower_bound(in->first);
         p != snaps.end() && *p <= in->last;
         ++p) {
      head_in->add_need_snapflush(in, *p, client);
    }

    follows = in->last;
  }
}
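// e.g. if a client reconnects with snap_follows = 9 and snaps 10 and 11
// exist in this realm, the loop above re-registers need_snapflush state
// (and SNAP_SYNC wrlocks) for the snapped inodes covering those snaps, so
// the client's pending snap cap flushes can still be applied.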
/*
 * choose lock states based on reconnected caps
 */
void MDCache::choose_lock_states_and_reconnect_caps()
{
  dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;

  map<client_t,MClientSnap*> splits;

  for (auto i : inode_map) {
    CInode *in = i.second;

    if (in->last != CEPH_NOSNAP)
      continue;

    if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
      in->mark_dirty_rstat();

    int dirty_caps = 0;
    auto p = reconnected_caps.find(in->ino());
    if (p != reconnected_caps.end()) {
      for (const auto &it : p->second)
        dirty_caps |= it.second.dirty_caps;
    }
    in->choose_lock_states(dirty_caps);
    dout(15) << " chose lock states on " << *in << dendl;

    SnapRealm *realm = in->find_snaprealm();

    check_realm_past_parents(realm, realm == in->snaprealm);

    if (p != reconnected_caps.end()) {
      bool missing_snap_parent = false;
      // also, make sure client's cap is in the correct snaprealm.
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
        if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) {
          if (realm->have_past_parents_open()) {
            rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows);
          } else {
            missing_snap_parent = true;
          }
        }

        if (q->second.realm_ino == realm->inode->ino()) {
          dout(15) << "  client." << q->first << " has correct realm " << q->second.realm_ino << dendl;
        } else {
          dout(15) << "  client." << q->first << " has wrong realm " << q->second.realm_ino
                   << " != " << realm->inode->ino() << dendl;
          if (realm->have_past_parents_open()) {
            // ok, include in a split message _now_.
            prepare_realm_split(realm, q->first, in->ino(), splits);
          } else {
            // send the split later.
            missing_snap_parent = true;
          }
        }
      }
      if (missing_snap_parent)
        missing_snap_parents[realm->inode].insert(in);
    }
  }

  send_snaps(splits);
}
void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                                  map<client_t,MClientSnap*>& splits)
{
  MClientSnap *snap;
  if (splits.count(client) == 0) {
    splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT);
    snap->head.split = realm->inode->ino();
    realm->build_snap_trace(snap->bl);

    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p)
      snap->split_realms.push_back((*p)->inode->ino());

  } else
    snap = splits[client];
  snap->split_inos.push_back(ino);
}
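// one MClientSnap per client accumulates all inos being moved into the new
// realm (split_inos); the batched messages are then delivered in a single
// pass by send_snaps().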
void MDCache::send_snaps(map<client_t,MClientSnap*>& splits)
{
  dout(10) << "send_snaps" << dendl;

  for (map<client_t,MClientSnap*>::iterator p = splits.begin();
       p != splits.end();
       ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
    if (session) {
      dout(10) << " client." << p->first
               << " split " << p->second->head.split
               << " inos " << p->second->split_inos
               << dendl;
      mds->send_message_client_counted(p->second, session);
    } else {
      dout(10) << " no session for client." << p->first << dendl;
      p->second->put();
    }
  }
}
/*
 * remove any items from logsegment open_file lists that don't have
 * any caps
 */
void MDCache::clean_open_file_lists()
{
  dout(10) << "clean_open_file_lists" << dendl;

  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;

    elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
    while (!q.end()) {
      CInode *in = *q;
      ++q;
      if (in->last == CEPH_NOSNAP) {
        if (!in->is_any_caps_wanted()) {
          dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
          in->item_open_file.remove_myself();
        }
      } else if (in->last != CEPH_NOSNAP) {
        if (in->client_snap_caps.empty()) {
          dout(10) << " unlisting flushed snap inode " << *in << dendl;
          in->item_open_file.remove_myself();
        }
      }
    }
  }
}
Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
{
  dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
           << " on " << *in << dendl;
  Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
  if (!session) {
    dout(10) << " no session for client." << client << dendl;
    return NULL;
  }

  Capability *cap = in->reconnect_cap(client, icr, session);

  if (frommds >= 0) {
    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
      cap->inc_mseq();
    do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
  }

  return cap;
}
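// frommds >= 0 means the cap was exported by a live peer, so the client
// must be told about its new home via do_cap_import(); a negative rank
// (e.g. MDS_RANK_NONE) covers caps being recovered without a peer export.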
void MDCache::export_remaining_imported_caps()
{
  dout(10) << "export_remaining_imported_caps" << dendl;

  stringstream warn_str;

  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
    warn_str << " ino " << p->first << "\n";
    for (auto q = p->second.begin(); q != p->second.end(); ++q) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
      if (session) {
        // mark client caps stale.
        MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier());
        stale->set_cap_peer(0, 0, 0, -1, 0);
        mds->send_message_client_counted(stale, q->first);
      }
    }

    mds->heartbeat_reset();
  }

  for (map<inodeno_t, list<MDSInternalContextBase*> >::iterator p = cap_reconnect_waiters.begin();
       p != cap_reconnect_waiters.end();
       ++p)
    mds->queue_waiters(p->second);

  cap_imports.clear();
  cap_reconnect_waiters.clear();

  if (warn_str.peek() != EOF) {
    mds->clog->warn() << "failed to reconnect caps for missing inodes:";
    mds->clog->warn(warn_str);
  }
}
void MDCache::try_reconnect_cap(CInode *in, Session *session)
{
  client_t client = session->info.get_client();
  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
  if (rc) {
    in->reconnect_cap(client, *rc, session);
    dout(10) << "try_reconnect_cap client." << client
             << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
             << " issue " << ccap_string(rc->capinfo.issued)
             << " on " << *in << dendl;
    remove_replay_cap_reconnect(in->ino(), client);

    if (in->is_replicated()) {
      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
    } else {
      int dirty_caps = 0;
      auto p = reconnected_caps.find(in->ino());
      if (p != reconnected_caps.end()) {
        auto q = p->second.find(client);
        if (q != p->second.end())
          dirty_caps = q->second.dirty_caps;
      }
      in->choose_lock_states(dirty_caps);
      dout(15) << " chose lock states on " << *in << dendl;
    }

    map<inodeno_t, list<MDSInternalContextBase*> >::iterator it =
      cap_reconnect_waiters.find(in->ino());
    if (it != cap_reconnect_waiters.end()) {
      mds->queue_waiters(it->second);
      cap_reconnect_waiters.erase(it);
    }
  }
}
// cap imports and delayed snap parent opens

void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
                            uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                            int peer, int p_flags)
{
  client_t client = session->info.inst.name.num();
  SnapRealm *realm = in->find_snaprealm();
  if (realm->have_past_parents_open()) {
    dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
    if (cap->get_last_seq() == 0) // reconnected cap
      cap->inc_last_seq();
    cap->set_last_issue();
    cap->set_last_issue_stamp(ceph_clock_now());
    cap->clear_new();
    MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
                                        in->ino(),
                                        realm->inode->ino(),
                                        cap->get_cap_id(), cap->get_last_seq(),
                                        cap->pending(), cap->wanted(), 0,
                                        cap->get_mseq(), mds->get_osd_epoch_barrier());
    in->encode_cap_message(reap, cap);
    realm->build_snap_trace(reap->snapbl);
    reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
    mds->send_message_client_counted(reap, session);
  } else {
    dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq "
             << cap->get_mseq() << " on " << *in << dendl;
    in->auth_pin(this);
    cap->inc_suppress();
    delayed_imported_caps[client].insert(in);
    missing_snap_parents[in].size();
  }
}
void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  assert(delayed_imported_caps.empty());
}
struct C_MDC_OpenSnapParents : public MDCacheContext {
  explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snap_parents();
  }
};
void MDCache::open_snap_parents()
{
  dout(10) << "open_snap_parents" << dendl;

  map<client_t,MClientSnap*> splits;
  MDSGatherBuilder gather(g_ceph_context);

  auto p = missing_snap_parents.begin();
  while (p != missing_snap_parents.end()) {
    CInode *in = p->first;
    assert(in->snaprealm);
    if (in->snaprealm->open_parents(gather.new_sub())) {
      dout(10) << " past parents now open on " << *in << dendl;

      for (CInode *child : p->second) {
	auto q = reconnected_caps.find(child->ino());
	assert(q != reconnected_caps.end());
	for (auto r = q->second.begin(); r != q->second.end(); ++r) {
	  if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) {
	    rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows);
	  }
	  // make sure client's cap is in the correct snaprealm.
	  if (r->second.realm_ino != in->ino()) {
	    prepare_realm_split(in->snaprealm, r->first, child->ino(), splits);
	  }
	}
      }

      missing_snap_parents.erase(p++);

      in->put(CInode::PIN_OPENINGSNAPPARENTS);

      // finish off client snaprealm reconnects?
      map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
      if (q != reconnected_snaprealms.end()) {
	for (map<client_t,snapid_t>::iterator r = q->second.begin();
	     r != q->second.end();
	     ++r)
	  finish_snaprealm_reconnect(r->first, in->snaprealm, r->second);
	reconnected_snaprealms.erase(q);
      }
    } else {
      dout(10) << " opening past parents on " << *in << dendl;
      ++p;
    }
  }

  send_snaps(splits);

  if (gather.has_subs()) {
    dout(10) << "open_snap_parents - waiting for "
	     << gather.num_subs_remaining() << dendl;
    gather.set_finisher(new C_MDC_OpenSnapParents(this));
    gather.activate();
  } else {
    if (!reconnected_snaprealms.empty()) {
      stringstream warn_str;
      for (map<inodeno_t,map<client_t,snapid_t> >::iterator p = reconnected_snaprealms.begin();
	   p != reconnected_snaprealms.end();
	   ++p) {
	warn_str << " unconnected snaprealm " << p->first << "\n";
	for (map<client_t,snapid_t>::iterator q = p->second.begin();
	     q != p->second.end();
	     ++q)
	  warn_str << "  client." << q->first << " snapid " << q->second << "\n";
      }
      mds->clog->warn() << "open_snap_parents has:";
      mds->clog->warn(warn_str);
    }
    assert(rejoin_waiters.empty());
    assert(missing_snap_parents.empty());
    dout(10) << "open_snap_parents - all open" << dendl;
    do_delayed_cap_imports();

    assert(rejoin_done);
    rejoin_done.release()->complete(0);
    reconnected_caps.clear();
  }
}
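
// Fetch any dirfrags (and the parent dirfrags of any inodes) that were left
// undefined by cache rejoin.  Returns true if fetches were submitted, false
// if there was nothing to do.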
bool MDCache::open_undef_inodes_dirfrags()
{
  dout(10) << "open_undef_inodes_dirfrags "
	   << rejoin_undef_inodes.size() << " inodes "
	   << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;

  set<CDir*> fetch_queue = rejoin_undef_dirfrags;

  for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
       p != rejoin_undef_inodes.end();
       ++p) {
    CInode *in = *p;
    assert(!in->is_base());
    fetch_queue.insert(in->get_parent_dir());
  }

  if (fetch_queue.empty())
    return false;

  MDSGatherBuilder gather(g_ceph_context,
      new MDSInternalContextWrapper(mds,
	new FunctionContext([this](int r) {
	    if (rejoin_gather.empty())
	      rejoin_gather_finish();
	  })
	)
      );

  for (set<CDir*>::iterator p = fetch_queue.begin();
       p != fetch_queue.end();
       ++p) {
    CDir *dir = *p;
    CInode *diri = dir->get_inode();
    if (diri->state_test(CInode::STATE_REJOINUNDEF))
      continue;
    if (dir->state_test(CDir::STATE_REJOINUNDEF))
      assert(diri->dirfragtree.is_leaf(dir->get_frag()));
    dir->fetch(gather.new_sub());
  }
  assert(gather.has_subs());
  gather.activate();
  return true;
}
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash);
    if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      CDir *dir = in->get_dirfrag(frag_t());
      assert(dir);
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();
      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
	rejoin_undef_dirfrags.insert(*p);
    }
  }
}
void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
{
  if (seq < realm->get_newest_seq()) {
    dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
	     << realm->get_newest_seq() << " on " << *realm << dendl;
    // send an update
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    if (session) {
      MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE);
      realm->build_snap_trace(snap->bl);
      mds->send_message_client_counted(snap, session);
    } else {
      dout(10) << " ...or not, no session for this client!" << dendl;
    }
  } else {
    dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
	     << " on " << *realm << dendl;
  }
}
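
// Send rejoin acks to every rank in the recovery set: walk our auth
// subtrees and include strong dirfrags/dentries/inodes plus replica lock
// state, so recovering peers can reconcile their caches with ours.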
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
	continue;
      while (1) {
	CDentry *dn = in->get_parent_dn();
	if (dn->is_replica(p->first))
	  break;
	dn->add_replica(p->first);
	CDir *dir = dn->get_dir();
	if (dir->is_replica(p->first))
	  break;
	dir->add_replica(p->first);
	in = dir->get_inode();
	if (in->is_replica(p->first))
	  break;
	in->add_replica(p->first);
	if (in->is_base())
	  break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,MMDSCacheRejoin*> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree
    list<CDir*> dq;
    dq.push_back(dir);

    while (!dq.empty()) {
      CDir *dir = dq.front();
      dq.pop_front();

      // dir
      for (auto &r : dir->get_replicas()) {
	auto it = acks.find(r.first);
	if (it == acks.end())
	  continue;
	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
	it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();

	// inode
	CInode *in = NULL;
	if (dnl->is_primary())
	  in = dnl->get_inode();

	// dentry
	for (auto &r : dn->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
					dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_d_type():0,
					++r.second,
					dn->lock.get_replica_state());
	  // peer missed MDentrylink message ?
	  if (in && !in->is_replica(r.first))
	    in->add_replica(r.first);
	}

	if (!in)
	  continue;

	for (auto &r : in->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
	  bufferlist bl;
	  in->_encode_locks_state_for_rejoin(bl, r.first);
	  it->second->add_inode_locks(in, ++r.second, bl);
	}

	// subdirs in this subtree?
	in->get_nested_dirfrags(dq);
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    ::encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}
class C_MDC_ReIssueCaps : public MDCacheContext {
  CInode *in;
public:
  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
    MDCacheContext(mdc), in(i)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
      mdcache->mds->locker->issue_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};
void MDCache::reissue_all_caps()
{
  dout(10) << "reissue_all_caps" << dendl;

  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (in->is_head() && in->is_any_caps()) {
      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
      if (in->is_frozen_inode()) {
	in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
	continue;
      }
      if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
	mds->locker->issue_caps(in);
    }
  }
}
// ===============================================================================

struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};
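
// Queue an inode for file size/mtime recovery.  If snapshots cover this
// inode we first journal copy-on-write inodes for the intervening snapids
// (each queued for recovery as well) before enqueueing the head inode.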
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  assert(in->is_auth());

  // cow?
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    CInode::mempool_inode *pi = in->project_inode();
    pi->version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }

  recovery_queue.enqueue(in);
}
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  in->pop_and_dirty_projected_inode(mut->ls);
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
/*
 * called after recovery to recover file sizes for previously opened (for write)
 * files.  that is, those where max_size > size.
 */
void MDCache::identify_files_to_recover()
{
  dout(10) << "identify_files_to_recover" << dendl;
  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (!in->is_auth())
      continue;

    if (in->last != CEPH_NOSNAP)
      continue;

    // Only normal files need file size recovery
    if (!in->is_file()) {
      continue;
    }

    bool recover = false;
    for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
	 p != in->inode.client_ranges.end();
	 ++p) {
      Capability *cap = in->get_client_cap(p->first);
      if (!cap) {
	dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
	recover = true;
	break;
      }
    }

    if (recover) {
      if (in->filelock.is_stable()) {
	in->auth_pin(&in->filelock);
      } else {
	assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
      }
      in->filelock.set_state(LOCK_PRE_SCAN);
      rejoin_recover_q.push_back(in);
    } else {
      rejoin_check_q.push_back(in);
    }
  }
}
void MDCache::start_files_to_recover()
{
  for (CInode *in : rejoin_check_q) {
    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
      mds->locker->issue_caps(in);
    mds->locker->check_inode_max_size(in);
  }
  rejoin_check_q.clear();
  for (CInode *in : rejoin_recover_q) {
    mds->locker->file_recover(&in->filelock);
  }
  if (!rejoin_recover_q.empty()) {
    rejoin_recover_q.clear();
    do_file_recover();
  }
}
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
// ===============================================================================


// ----------------------------
// truncate

class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};
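
// Truncation pipeline: truncate_inode() pins the inode and records it in the
// log segment; _truncate_inode() issues the OSD-side truncate via the filer;
// truncate_inode_finish() journals the completion; truncate_inode_logged()
// drops locks and wakes waiters.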
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in
	   << dendl;

  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c), in(i), ls(l) {}
  void finish(int r) override {
    assert(r == 0 || r == -ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
};
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = &in->inode;
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  assert(pi->is_truncating());
  assert(pi->truncate_size < (1ULL << 63));
  assert(pi->truncate_from < (1ULL << 63));
  assert(pi->truncate_size < pi->truncate_from);

  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode  snapc " << snapc << " on " << *in << dendl;
  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
		 pi->truncate_seq, ceph::real_time::min(), 0,
		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  // update
  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();
  pi.inode.truncate_from = 0;
  pi.inode.truncate_pending--;

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();
  mut->add_projected_inode(in);

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);
  CDentry *dn = in->get_projected_parent_dn();
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_primary_dentry(dn, in, true);
  le->metablob.add_truncate_finish(in->ino(), ls->seq);

  journal_dirty_inode(mut.get(), &le->metablob, in);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  list<MDSInternalContextBase*> waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}
void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "add_recovered_truncate " << *in << " in log segment "
	   << ls->seq << "/" << ls->offset << dendl;
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
}
void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "remove_recovered_truncate " << *in << " in log segment "
	   << ls->seq << "/" << ls->offset << dendl;
  // if we have the logseg the truncate started in, it must be in our list.
  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);
  in->put(CInode::PIN_TRUNCATING);
}
void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
	 q != ls->truncating_inodes.end();
	 ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      if (!in->client_need_snapflush.empty() &&
	  (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
	assert(in->filelock.is_stable());
	in->filelock.set_state(LOCK_XLOCKDONE);
	in->auth_pin(&in->filelock);
	in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
	// start_files_to_recover will revoke caps
	continue;
      }
      _truncate_inode(in, ls);
    }
  }
}
// ================================================================================
// cache trimming

void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;
  uint64_t trimmed = 0;

  dout(7) << "trim_lru trimming " << count
	  << " items from LRU"
	  << " size=" << lru.lru_get_size()
	  << " mid=" << lru.lru_get_top()
	  << " pintail=" << lru.lru_get_pintail()
	  << " pinned=" << lru.lru_get_num_pinned()
	  << dendl;

  for (;;) {
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  while (cache_toofull() || count > 0) {
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn)
      break;
    if ((is_standby_replay && dn->get_linkage()->inode &&
	 dn->get_linkage()->inode->item_open_file.is_on_list())) {
      unexpirables.push_back(dn);
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
}
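
// Trim proceeds in stages: the LRUs first (trim_lru), then non-auth or empty
// subtree dirfrags, then (when stopping) the root, other ranks' mdsdirs and
// base inodes.  Replicas we expire are batched into per-rank MCacheExpire
// messages and sent at the end.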
/*
 * note: only called while MDS is active or stopping... NOT during recovery.
 * however, we may expire a replica whose authority is recovering.
 *
 * @param count is number of dentries to try to expire
 */
bool MDCache::trim(uint64_t count)
{
  uint64_t used = cache_size();
  uint64_t limit = cache_limit_memory();
  map<mds_rank_t, MCacheExpire*> expiremap;

  dout(7) << "trim bytes_used=" << bytes2str(used)
	  << " limit=" << bytes2str(limit)
	  << " reservation=" << cache_reservation()
	  << "% count=" << count << dendl;

  // process delayed eval_stray()
  stray_manager.advance_delayed();

  trim_lru(count, expiremap);

  // trim non-auth, non-bound subtrees
  for (auto p = subtrees.begin(); p != subtrees.end();) {
    CDir *dir = p->first;
    ++p;
    CInode *diri = dir->get_inode();
    if (dir->is_auth()) {
      if (!diri->is_auth() && !diri->is_base() &&
	  dir->get_num_head_items() == 0) {
	if (dir->state_test(CDir::STATE_EXPORTING) ||
	    !(mds->is_active() || mds->is_stopping()) ||
	    dir->is_freezing() || dir->is_frozen())
	  continue;

	migrator->export_empty_import(dir);
      }
    } else {
      if (!diri->is_auth()) {
	if (dir->get_num_ref() > 1)  // only subtree pin
	  continue;
	list<CDir*> ls;
	diri->get_subtree_dirfrags(ls);
	if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
	  continue;

	// don't trim subtree root if its auth MDS is recovering.
	// This simplifies the cache rejoin code.
	if (dir->is_subtree_root() &&
	    rejoin_ack_gather.count(dir->get_dir_auth().first))
	  continue;
	trim_dirfrag(dir, 0, expiremap);
      }
    }
  }

  // trim root?
  if (mds->is_stopping() && root) {
    list<CDir*> ls;
    root->get_dirfrags(ls);
    for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
      CDir *dir = *p;
      if (dir->get_num_ref() == 1)  // subtree pin
	trim_dirfrag(dir, 0, expiremap);
    }
    if (root->get_num_ref() == 0)
      trim_inode(0, root, 0, expiremap);
  }

  std::set<mds_rank_t> stopping;
  mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
  stopping.erase(mds->get_nodeid());
  for (auto rank : stopping) {
    CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
    if (!mdsdir_in)
      continue;

    if (expiremap.count(rank) == 0) {
      expiremap[rank] = new MCacheExpire(mds->get_nodeid());
    }

    dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << rank << dendl;

    const bool aborted = expire_recursive(mdsdir_in, expiremap);
    if (!aborted) {
      dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
      list<CDir*> ls;
      mdsdir_in->get_dirfrags(ls);
      for (auto dir : ls) {
	if (dir->get_num_ref() == 1)  // subtree pin
	  trim_dirfrag(dir, dir, expiremap);
      }
      if (mdsdir_in->get_num_ref() == 0)
	trim_inode(NULL, mdsdir_in, NULL, expiremap);
    } else {
      dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
    }
  }

  // Other rank's base inodes (when I'm stopping)
  if (mds->is_stopping()) {
    for (set<CInode*>::iterator p = base_inodes.begin();
	 p != base_inodes.end(); ++p) {
      if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) {
	dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl;
	if ((*p)->get_num_ref() == 0) {
	  trim_inode(NULL, *p, NULL, expiremap);
	}
      }
    }
  }

  // send any expire messages
  send_expire_messages(expiremap);

  return true;
}
void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap)
{
  // send expires
  for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin();
       it != expiremap.end();
       ++it) {
    if (mds->is_cluster_degraded() &&
	(mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
	 (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
	  rejoin_sent.count(it->first) == 0))) {
      it->second->put();
      continue;
    }
    dout(7) << "sending cache_expire to " << it->first << dendl;
    mds->send_message_mds(it->second, it->first);
  }
}
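
// Expire a single dentry (and, for a primary link, its inode via
// trim_inode).  Returns true if the dentry must stay in cache, false if it
// was removed.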
bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  assert(dir);

  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry, because unconnected
  // replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without affecting
  //       directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (!(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.
    mds_authority_t auth = dn->authority();

    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break;  // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue;  // on export, ignore myself.

      dout(12) << "  sending expire to mds." << a << " on " << *dn << dendl;
      assert(a != mds->get_nodeid());
      if (expiremap.count(a) == 0)
	expiremap[a] = new MCacheExpire(mds->get_nodeid());
      expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  // remove dentry
  if (dn->last == CEPH_NOSNAP && dir->is_auth())
    dir->add_to_bloom(dn);
  dir->remove_dentry(dn);

  if (clear_complete)
    dir->state_clear(CDir::STATE_COMPLETE);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}
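
// Expire a dirfrag: notify the authority if we are a replica, detach the
// subtree if this was a subtree root, and close the frag on its inode.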
void MDCache::trim_dirfrag(CDir *dir, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    assert(!dir->is_auth() ||
	   (!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();

    // was this an auth delegation?  (if so, slightly modified container)
    dirfrag_t condf;
    if (dir->is_subtree_root()) {
      dout(12) << " subtree root, container is " << *dir << dendl;
      con = dir;
      condf = dir->dirfrag();
    } else {
      condf = con->dirfrag();
    }

    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break;  // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue;  // on export, ignore myself.

      dout(12) << "  sending expire to mds." << a << " on " << *dir << dendl;
      assert(a != mds->get_nodeid());
      if (expiremap.count(a) == 0)
	expiremap[a] = new MCacheExpire(mds->get_nodeid());
      expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
    }
  }

  in->close_dirfrag(dir->dirfrag().frag);
}
/**
 * Try trimming an inode from the cache
 *
 * @return true if the inode is still in cache, else false if it was trimmed
 */
bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<mds_rank_t, MCacheExpire*>& expiremap)
{
  dout(15) << "trim_inode " << *in << dendl;
  assert(in->get_num_ref() == 0);

  if (in->is_dir()) {
    // If replica inode's dirfragtreelock is not readable, it's likely
    // some dirfrags of the inode are being fragmented and we will receive
    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
    // dirfrags, so we should avoid trimming these dirfrags' parent inode,
    // because unconnected replicas are problematic for subtree migration.
    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1, nullptr)) {
      return true;
    }

    // DIR
    list<CDir*> dfls;
    in->get_dirfrags(dfls);
    for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
      CDir *dir = *p;
      assert(!dir->is_subtree_root());
      trim_dirfrag(dir, con ? con:dir, expiremap);  // if no container (e.g. root dirfrag), use *p
    }
  }

  // INODE
  if (in->is_auth()) {
    // eval stray after closing dirfrags
    if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
      maybe_eval_stray(in);
      if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
	return true;
    }
  } else {
    mds_authority_t auth = in->authority();

    dirfrag_t df;
    if (con)
      df = con->dirfrag();
    else
      df = dirfrag_t(0,frag_t());   // must be a root or stray inode.

    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (con && mds->get_nodeid() == auth.second &&
	  con->is_importing()) break;  // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue;  // on export, ignore myself.

      dout(12) << "  sending expire to mds." << a << " on " << *in << dendl;
      assert(a != mds->get_nodeid());
      if (expiremap.count(a) == 0)
	expiremap[a] = new MCacheExpire(mds->get_nodeid());
      expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce());
    }
  }

  /*
  if (in->is_auth()) {
    if (in->hack_accessed)
      mds->logger->inc("outt");
    else {
      mds->logger->inc("outut");
      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
    }
  }
  */

  // unlink
  if (dn)
    dn->get_dir()->unlink_inode(dn, false);
  remove_inode(in);
  return false;
}
/**
 * trim_non_auth - remove any non-auth items from our cache
 *
 * this reduces the amount of non-auth metadata in our cache, reducing the
 * load incurred by the rejoin phase.
 *
 * the only non-auth items that remain are those that are needed to
 * attach our own subtrees to the root.
 *
 * when we are done, all dentries will be in the top bit of the lru.
 *
 * why we have to do this:
 *  we may not have accurate linkage for non-auth items.  which means we will
 *  not know which subtree it falls into, and can not be sure to declare it to
 *  the correct authority.
 */
void MDCache::trim_non_auth()
{
  dout(7) << "trim_non_auth" << dendl;

  // temporarily pin all subtree roots
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->get(CDir::PIN_SUBTREETEMP);

  list<CDentry*> auth_list;

  // trim non-auth items from the lru
  for (;;) {
    CDentry *dn = NULL;
    if (bottom_lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn && lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn)
      break;

    CDentry::linkage_t *dnl = dn->get_linkage();

    if (dn->is_auth()) {
      // add back into lru (at the top)
      auth_list.push_back(dn);

      if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
	dn->unlink_remote(dnl);
    } else {
      // non-auth.  expire.
      CDir *dir = dn->get_dir();
      assert(dir);

      // unlink the dentry
      dout(10) << " removing " << *dn << dendl;
      if (dnl->is_remote()) {
	dir->unlink_inode(dn, false);
      }
      else if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(10) << " removing " << *in << dendl;
	list<CDir*> ls;
	in->get_dirfrags(ls);
	for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	  CDir *subdir = *p;
	  assert(!subdir->is_subtree_root());
	  in->close_dirfrag(subdir->dirfrag().frag);
	}
	dir->unlink_inode(dn, false);
	remove_inode(in);
      }
      else {
	assert(dnl->is_null());
      }

      assert(!dir->has_bloom());
      dir->remove_dentry(dn);
      // adjust the dir state
      dir->state_clear(CDir::STATE_COMPLETE);  // dir incomplete!
      // close empty non-auth dirfrag
      if (!dir->is_subtree_root() && dir->get_num_any() == 0)
	dir->inode->close_dirfrag(dir->get_frag());
    }
  }

  for (auto dn : auth_list) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      bottom_lru.lru_insert_mid(dn);
    else
      lru.lru_insert_top(dn);
  }

  // move everything in the pintail to the top bit of the lru.
  lru.lru_touch_entire_pintail();

  // unpin all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->put(CDir::PIN_SUBTREETEMP);

  if (lru.lru_get_size() == 0 &&
      bottom_lru.lru_get_size() == 0) {
    // root, stray, etc.?
    auto p = inode_map.begin();
    while (p != inode_map.end()) {
      CInode *in = p->second;
      ++p;
      if (!in->is_auth()) {
	list<CDir*> ls;
	in->get_dirfrags(ls);
	for (list<CDir*>::iterator p = ls.begin();
	     p != ls.end();
	     ++p) {
	  dout(10) << " removing " << **p << dendl;
	  assert((*p)->get_num_ref() == 1);  // SUBTREE
	  remove_subtree((*p));
	  in->close_dirfrag((*p)->dirfrag().frag);
	}
	dout(10) << " removing " << *in << dendl;
	assert(!in->get_parent_dn());
	assert(in->get_num_ref() == 0);
	remove_inode(in);
      }
    }
  }

  show_subtrees();
}
/**
 * Recursively trim the subtree rooted at directory to remove all
 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
 * of those links. This is used to clear invalid data out of the cache.
 * Note that it doesn't clear the passed-in directory, since that's not
 * always safe.
 */
bool MDCache::trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;

  bool keep_dir = !can_trim_non_auth_dirfrag(dir);

  auto j = dir->begin();
  auto i = j;
  while (j != dir->end()) {
    i = j++;
    CDentry *dn = i->second;
    dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
    CDentry::linkage_t *dnl = dn->get_linkage();
    if (dnl->is_primary()) { // check for subdirectories, etc
      CInode *in = dnl->get_inode();
      bool keep_inode = false;
      if (in->is_dir()) {
	list<CDir*> subdirs;
	in->get_dirfrags(subdirs);
	for (list<CDir*>::iterator subdir = subdirs.begin();
	     subdir != subdirs.end();
	     ++subdir) {
	  if ((*subdir)->is_subtree_root()) {
	    keep_inode = true;
	    dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl;
	  } else {
	    if (trim_non_auth_subtree(*subdir))
	      keep_inode = true;
	    else {
	      in->close_dirfrag((*subdir)->get_frag());
	      dir->state_clear(CDir::STATE_COMPLETE);  // now incomplete!
	    }
	  }
	}
      }
      if (!keep_inode) { // remove it!
	dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
	dir->unlink_inode(dn, false);
	remove_inode(in);
	assert(!dir->has_bloom());
	dir->remove_dentry(dn);
      } else {
	dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl;
	dn->state_clear(CDentry::STATE_AUTH);
	in->state_clear(CInode::STATE_AUTH);
      }
    } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
      dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl;
    } else { // just remove it
      dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
      if (dnl->is_remote())
	dir->unlink_inode(dn, false);
      dir->remove_dentry(dn);
    }
  }
  dir->state_clear(CDir::STATE_AUTH);
  /**
   * We've now checked all our children and deleted those that need it.
   * Now return to caller, and tell them if *we're* a keeper.
   */
  return keep_dir || dir->get_num_any();
}
/*
 * during replay, when we determine a subtree is no longer ours, we
 * try to trim it from our cache.  because subtrees must be connected
 * to the root, the fact that we can trim this tree may mean that our
 * children or parents can also be trimmed.
 */
void MDCache::try_trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;

  // can we now trim child subtrees?
  set<CDir*> bounds;
  get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
    CDir *bd = *p;
    if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
	bd->get_num_any() == 0 && // and empty
	can_trim_non_auth_dirfrag(bd)) {
      CInode *bi = bd->get_inode();
      dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
      remove_subtree(bd);
      bd->mark_clean();
      bi->close_dirfrag(bd->get_frag());
    }
  }

  if (trim_non_auth_subtree(dir)) {
    // keep
    try_subtree_merge(dir);
  } else {
    // can we trim this subtree (and possibly our ancestors) too?
    while (true) {
      CInode *diri = dir->get_inode();
      if (diri->is_base()) {
	if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
	  dout(10) << " closing empty non-auth subtree " << *dir << dendl;
	  remove_subtree(dir);
	  dir->mark_clean();
	  diri->close_dirfrag(dir->get_frag());

	  dout(10) << " removing " << *diri << dendl;
	  assert(!diri->get_parent_dn());
	  assert(diri->get_num_ref() == 0);
	  remove_inode(diri);
	}
	break;
      }

      CDir *psub = get_subtree_root(diri->get_parent_dir());
      dout(10) << " parent subtree is " << *psub << dendl;
      if (psub->get_dir_auth().first == mds->get_nodeid())
	break;  // we are auth, keep.

      dout(10) << " closing empty non-auth subtree " << *dir << dendl;
      remove_subtree(dir);
      dir->mark_clean();
      diri->close_dirfrag(dir->get_frag());

      dout(10) << " parent subtree also non-auth: " << *psub << dendl;
      if (trim_non_auth_subtree(psub))
	break;
      dir = psub;
    }
  }

  show_subtrees();
}
void MDCache::standby_trim_segment(LogSegment *ls)
{
  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
  }
}
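
// An expire is only honoured when the sender's nonce matches our current
// replica nonce for the object; a stale nonce means we re-replicated the
// object after the peer decided to expire it, so the expire is ignored.
// Expires touching a subtree that is mid-export are parked in
// delayed_expire until the migration settles.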
/* This function DOES put the passed message before returning */
void MDCache::handle_cache_expire(MCacheExpire *m)
{
  mds_rank_t from = mds_rank_t(m->get_from());

  dout(7) << "cache_expire from mds." << from << dendl;

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    m->put();
    return;
  }

  set<SimpleLock *> gather_locks;
  // loop over realms
  for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    // check container?
    if (p->first.ino > 0) {
      CInode *expired_inode = get_inode(p->first.ino);
      assert(expired_inode);  // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag);
      assert(parent_dir);

      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
	export_state = migrator->get_export_state(parent_dir);
	assert(export_state >= 0);
      }

      if (!parent_dir->is_auth() ||
	  (export_state != -1 &&
	   ((export_state == Migrator::EXPORT_WARNING &&
	     migrator->export_has_warned(parent_dir,from)) ||
	    export_state == Migrator::EXPORT_EXPORTING ||
	    export_state == Migrator::EXPORT_LOGGINGFINISH ||
	    (export_state == Migrator::EXPORT_NOTIFYING &&
	     !migrator->export_has_notified(parent_dir,from))))) {

	// not auth.
	dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
	assert(parent_dir->is_frozen_tree_root());

	// make a message container
	if (delayed_expire[parent_dir].count(from) == 0)
	  delayed_expire[parent_dir][from] = new MCacheExpire(from);

	// merge these expires into it
	delayed_expire[parent_dir][from]->add_realm(p->first, p->second);
	continue;
      }
      assert(export_state <= Migrator::EXPORT_PREPPING ||
	     (export_state == Migrator::EXPORT_WARNING &&
	      !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin();
	 it != p->second.inodes.end();
	 ++it) {
      CInode *in = get_inode(it->first);
      unsigned nonce = it->second;

      if (!in) {
	dout(0) << " inode expire on " << it->first << " from " << from
		<< ", don't have it" << dendl;
	assert(in);
      }
      assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;

      // check nonce
      if (nonce == in->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " cached_by was " << in->get_replicas() << dendl;
	inode_remove_replica(in, from, false, gather_locks);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " with old nonce " << nonce
		<< " (current " << in->get_replica_nonce(from) << "), dropping"
		<< dendl;
      }
    }

    // DIRS
    for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin();
	 it != p->second.dirs.end();
	 ++it) {
      CDir *dir = get_dirfrag(it->first);
      unsigned nonce = it->second;

      if (!dir) {
	CInode *diri = get_inode(it->first.ino);
	if (diri) {
	  if (mds->is_rejoin() &&
	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
	      !diri->is_replica(from)) {
	    list<CDir*> ls;
	    diri->get_nested_dirfrags(ls);
	    dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
		    << " while rejoining, inode isn't replicated" << dendl;
	    for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
	      dir = *q;
	      if (dir->is_replica(from)) {
		dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
		dir->remove_replica(from);
	      }
	    }
	    continue;
	  }
	  CDir *other = diri->get_approx_dirfrag(it->first.frag);
	  if (other) {
	    dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from
		    << " have " << *other << ", mismatched frags, dropping" << dendl;
	    continue;
	  }
	}
	dout(0) << " dir expire on " << it->first << " from " << from
		<< ", don't have it" << dendl;
	assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " replicas was " << dir->get_replicas() << dendl;
	dir->remove_replica(from);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
		<< "), dropping" << dendl;
      }
    }

    // DENTRIES
    for (map<dirfrag_t, map<pair<string,snapid_t>,uint32_t> >::iterator pd = p->second.dentries.begin();
	 pd != p->second.dentries.end();
	 ++pd) {
      dout(10) << " dn expires in dir " << pd->first << dendl;
      CInode *diri = get_inode(pd->first.ino);
      assert(diri);
      CDir *dir = diri->get_dirfrag(pd->first.frag);

      if (!dir) {
	dout(0) << " dn expires on " << pd->first << " from " << from
		<< ", must have refragmented" << dendl;
      } else {
	assert(dir->is_auth());
      }

      for (map<pair<string,snapid_t>,uint32_t>::iterator p = pd->second.begin();
	   p != pd->second.end();
	   ++p) {
	unsigned nonce = p->second;
	CDentry *dn;

	if (dir) {
	  dn = dir->lookup(p->first.first, p->first.second);
	} else {
	  // which dirfrag for this dentry?
	  CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first));
	  assert(dir);
	  assert(dir->is_auth());
	  dn = dir->lookup(p->first.first, p->first.second);
	}

	if (!dn) {
	  if (dir)
	    dout(0) << "  missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl;
	  else
	    dout(0) << "  missing dentry for " << p->first.first << " snap " << p->first.second << dendl;
	}
	assert(dn);

	if (nonce == dn->get_replica_nonce(from)) {
	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from << dendl;
	  dentry_remove_replica(dn, from, gather_locks);
	}
	else {
	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from
		  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
		  << "), dropping" << dendl;
	}
      }
    }
  }

  // done
  m->put();

  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
void MDCache::process_delayed_expire(CDir *dir)
{
  dout(7) << "process_delayed_expire on " << *dir << dendl;
  for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
       p != delayed_expire[dir].end();
       ++p)
    handle_cache_expire(p->second);
  delayed_expire.erase(dir);
}
void MDCache::discard_delayed_expire(CDir *dir)
{
  dout(7) << "discard_delayed_expire on " << *dir << dendl;
  for (map<mds_rank_t,MCacheExpire*>::iterator p = delayed_expire[dir].begin();
       p != delayed_expire[dir].end();
       ++p)
    p->second->put();
  delayed_expire.erase(dir);
}
void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
				   set<SimpleLock *>& gather_locks)
{
  in->remove_replica(from);
  in->mds_caps_wanted.erase(from);

  // note: this code calls _eval more often than it needs to!
  // fix lock
  if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
  if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
  if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
  if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
  if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
  if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);

  // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
  // Don't remove the recovering mds from lock's gathering list because
  // it may hold rejoined wrlocks.
  if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
  if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
  if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
}
void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
{
  dn->remove_replica(from);

  // fix lock
  if (dn->lock.remove_replica(from))
    gather_locks.insert(&dn->lock);

  // Replicated strays might now be eligible for purge
  CDentry::linkage_t *dnl = dn->get_linkage();
  if (dnl->is_primary()) {
    maybe_eval_stray(dnl->get_inode());
  }
}
void MDCache::trim_client_leases()
{
  utime_t now = ceph_clock_now();

  dout(10) << "trim_client_leases" << dendl;

  for (int pool=0; pool<client_lease_pools; pool++) {
    int before = client_leases[pool].size();
    if (client_leases[pool].empty())
      continue;

    while (!client_leases[pool].empty()) {
      ClientLease *r = client_leases[pool].front();
      if (r->ttl > now) break;
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    int after = client_leases[pool].size();
    dout(10) << "trim_client_leases pool " << pool << " trimmed "
	     << (before-after) << " leases, " << after << " left" << dendl;
  }
}
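
// Sample process memory, publish the numbers to the perf counters, ask
// clients to release state if the cache is too full, and return heap to the
// OS (under tcmalloc) once we are back under the cache limit.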
void MDCache::check_memory_usage()
{
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  mm.sample(&last);
  static MemoryModel::snap baseline = last;

  // check client caps
  assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
  double caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (double)Capability::count() / (double)CInode::count();

  dout(2) << "check_memory_usage"
	  << " total " << last.get_total()
	  << ", rss " << last.get_rss()
	  << ", heap " << last.get_heap()
	  << ", baseline " << baseline.get_heap()
	  << ", buffers " << (buffer::get_total_alloc() >> 10)
	  << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
	  << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
	  << dendl;

  mds->update_mlogger();
  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());

  if (cache_toofull()) {
    last_recall_state = ceph_clock_now();
    mds->server->recall_client_state();
  }

  // If the cache size had exceeded its limit, but we're back in bounds
  // now, free any unused pool memory so that our memory usage isn't
  // permanently bloated.
  if (exceeded_size_limit && !cache_toofull()) {
    // Only do this once we are back in bounds: otherwise the releases would
    // slow down whatever process caused us to exceed bounds to begin with
    if (ceph_using_tcmalloc()) {
      dout(2) << "check_memory_usage: releasing unused space from tcmalloc"
	      << dendl;
      ceph_heap_release_free_memory();
    }
    exceeded_size_limit = false;
  }
}
// =========================================================================================
// shutdown

class C_MDC_ShutdownCheck : public MDCacheContext {
public:
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
  }
};
void MDCache::shutdown_check()
{
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;

  // cache
  char old_val[32] = { 0 };
  char *o = old_val;
  g_conf->get_val("debug_mds", &o, sizeof(old_val));
  g_conf->set_val("debug_mds", "10");
  g_conf->apply_changes(NULL);
  show_cache();
  g_conf->set_val("debug_mds", old_val);
  g_conf->apply_changes(NULL);
  mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  // this
  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;

  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  }
}
void MDCache::shutdown_start()
{
  dout(2) << "shutdown_start" << dendl;

  if (g_conf->mds_shutdown_check)
    mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  //  g_conf->debug_mds = 10;
}
bool MDCache::shutdown_pass()
{
  dout(7) << "shutdown_pass" << dendl;

  if (mds->is_stopped()) {
    dout(7) << " already shut down" << dendl;
    show_cache();
    show_subtrees();
    return true;
  }

  // empty stray dir
  bool strays_all_exported = shutdown_export_strays();

  // trim cache
  trim(UINT64_MAX);
  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;

  // Export all subtrees to another active (usually rank 0) if not rank 0
  int num_auth_subtree = 0;
  if (!subtrees.empty() &&
      mds->get_nodeid() != 0) {
    dout(7) << "looking for subtrees to export to mds0" << dendl;
    list<CDir*> ls;
    for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
	 it != subtrees.end();
	 ++it) {
      CDir *dir = it->first;
      if (dir->get_inode()->is_mdsdir())
	continue;
      if (dir->is_auth()) {
	num_auth_subtree++;
	if (dir->is_frozen() ||
	    dir->is_freezing() ||
	    dir->is_ambiguous_dir_auth() ||
	    dir->state_test(CDir::STATE_EXPORTING))
	  continue;
	ls.push_back(dir);
      }
    }

    migrator->clear_export_queue();
    for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
      CDir *dir = *p;
      mds_rank_t dest = dir->get_inode()->authority().first;
      if (dest > 0 && !mds->mdsmap->is_active(dest))
	dest = 0;
      dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
      migrator->export_dir_nicely(dir, dest);
    }
  }

  if (!strays_all_exported) {
    dout(7) << "waiting for strays to migrate" << dendl;
    return false;
  }

  if (num_auth_subtree > 0) {
    assert(mds->get_nodeid() > 0);
    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
    show_subtrees();
    return false;
  }

  // close out any sessions (and open files!) before we try to trim the log, etc.
  if (mds->sessionmap.have_unclosed_sessions()) {
    if (!mds->server->terminating_sessions)
      mds->server->terminate_sessions();
    return false;
  }

  // Fully trim the log so that all objects in cache are clean and may be
  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
  // trim the log such that the cache eventually becomes clean.
  mds->mdlog->trim(0);
  if (mds->mdlog->get_num_segments() > 1) {
    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
    return false;
  }

  // drop our reference to our stray dir inode
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i] &&
	strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
      strays[i]->put(CInode::PIN_STRAY);
      strays[i]->put_stickydirs();
    }
  }

  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
  if (mydir && !mydir->is_subtree_root())
    mydir = NULL;

  // subtrees map not empty yet?
  if (subtrees.size() > (mydir ? 1 : 0)) {
    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
    show_subtrees();
    migrator->show_importing();
    migrator->show_exporting();
    if (!migrator->is_importing() && !migrator->is_exporting())
      show_cache();
    return false;
  }
  assert(!migrator->is_exporting());
  assert(!migrator->is_importing());

  if ((myin && myin->is_auth_pinned()) ||
      (mydir && mydir->is_auth_pinned())) {
    dout(7) << "still have auth pinned objects" << dendl;
    return false;
  }

  // (only do this once!)
  if (!mds->mdlog->is_capped()) {
    dout(7) << "capping the log" << dendl;
    mds->mdlog->cap();
    mds->mdlog->trim();
  }

  if (!mds->mdlog->empty()) {
    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
	    << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
    return false;
  }

  if (!did_shutdown_log_cap) {
    // flush journal header
    dout(7) << "writing header for (now-empty) journal" << dendl;
    assert(mds->mdlog->empty());
    mds->mdlog->write_head(0);
    // NOTE: filer active checker below will block us until this completes.
    did_shutdown_log_cap = true;
    return false;
  }

  // filer active?
  if (mds->objecter->is_active()) {
    dout(7) << "objecter still active" << dendl;
    mds->objecter->dump_active();
    return false;
  }

  // trim what we can from the cache
  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
    show_cache();
    return false;
  }

  // make mydir subtree go away
  if (mydir) {
    if (mydir->get_num_ref() > 1) { // subtree pin
      dout(7) << "there's still reference to mydir " << *mydir << dendl;
      show_cache();
      return false;
    }

    remove_subtree(mydir);
    myin->close_dirfrag(mydir->get_frag());
  }
  assert(subtrees.empty());

  // done!
  dout(2) << "shutdown done." << dendl;
  return true;
}
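
// Migrate everything in our stray directories to rank 0.  Returns true once
// all strays are exported (or already being purged); incomplete stray
// dirfrags are fetched first, so this may need several passes.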
bool MDCache::shutdown_export_strays()
{
  if (mds->get_nodeid() == 0)
    return true;

  dout(10) << "shutdown_export_strays" << dendl;

  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));

  bool done = true;

  list<CDir*> dfs;
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (!strays[i] ||
	!strays[i]->state_test(CInode::STATE_STRAYPINNED))
      continue;
    strays[i]->get_dirfrags(dfs);
  }

  for (std::list<CDir*>::iterator dfs_i = dfs.begin();
       dfs_i != dfs.end(); ++dfs_i)
  {
    CDir *dir = *dfs_i;

    if (!dir->is_complete()) {
      dir->fetch(0);
      done = false;
      if (!mds0_active)
	break;
    }

    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      if (dnl->is_null())
	continue;
      done = false;
      if (!mds0_active)
	break;

      if (dn->state_test(CDentry::STATE_PURGING)) {
	// Don't try to migrate anything that is actually
	// being purged right now
	continue;
      }

      if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) {
	shutdown_exported_strays.insert(dnl->get_inode()->ino());
	stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!
      } else {
	dout(10) << "already exporting " << *dn << dendl;
      }
    }
  }

  return done;
}
// ========= messaging ==============

/* This function DOES put the passed message before returning */
void MDCache::dispatch(Message *m)
{
  switch (m->get_type()) {

    // RESOLVE
  case MSG_MDS_RESOLVE:
    handle_resolve(static_cast<MMDSResolve*>(m));
    break;
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(static_cast<MMDSResolveAck*>(m));
    break;

    // REJOIN
  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m));
    break;

  case MSG_MDS_DISCOVER:
    handle_discover(static_cast<MDiscover*>(m));
    break;
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(static_cast<MDiscoverReply*>(m));
    break;

  case MSG_MDS_DIRUPDATE:
    handle_dir_update(static_cast<MDirUpdate*>(m));
    break;

  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(static_cast<MCacheExpire*>(m));
    break;

  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(static_cast<MDentryLink*>(m));
    break;
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(static_cast<MDentryUnlink*>(m));
    break;

  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m));
    break;

  case MSG_MDS_FINDINO:
    handle_find_ino(static_cast<MMDSFindIno *>(m));
    break;
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(static_cast<MMDSFindInoReply *>(m));
    break;

  case MSG_MDS_OPENINO:
    handle_open_ino(static_cast<MMDSOpenIno *>(m));
    break;
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(static_cast<MMDSOpenInoReply *>(m));
    break;

  default:
    derr << "cache unknown message " << m->get_type() << dendl;
    assert(0 == "cache unknown message");
  }
}
MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin)
{
  if (mdr) {
    dout(20) << "_get_waiter retryrequest" << dendl;
    return new C_MDS_RetryRequest(this, mdr);
  } else if (req) {
    dout(20) << "_get_waiter retrymessage" << dendl;
    return new C_MDS_RetryMessage(mds, req);
  } else {
    return fin;
  }
}
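
/**
 * path_traverse -- traverse the given path through the cache.
 *
 * onfail selects the policy for non-auth objects along the way:
 * MDS_TRAVERSE_DISCOVER / MDS_TRAVERSE_DISCOVERXLOCK discover the missing
 * piece from its auth MDS, while MDS_TRAVERSE_FORWARD forwards the whole
 * request to it.  Per the body below, 0 means success (with *pdnvec filled
 * in), > 0 means the caller has been queued as a waiter or forwarded and
 * should simply stop, and < 0 is an errno-style failure.
 */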
int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who
                           const filepath& path,              // what
                           vector<CDentry*> *pdnvec,          // result
                           CInode **pin,
                           int onfail)
{
  bool discover = (onfail == MDS_TRAVERSE_DISCOVER);
  bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK);
  bool forward = (onfail == MDS_TRAVERSE_FORWARD);

  assert(mdr || req || fin);
  assert(!forward || mdr || req);  // forward requires a request

  snapid_t snapid = CEPH_NOSNAP;
  if (mdr)
    mdr->snapid = snapid;

  client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;

  if (mds->logger) mds->logger->inc(l_mds_traverse);

  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  if (cur == NULL) {
    if (MDS_INO_IS_MDSDIR(path.get_ino()))
      open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin));
    else {
      //ceph_abort();  // hrm.. broken
      return -ESTALE;
    }
    return 1;
  }
  if (cur->state_test(CInode::STATE_PURGING))
    return -ESTALE;

  // make sure snaprealm are open...
  if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
      !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
    return 1;
  }

  unsigned depth = 0;
  while (depth < path.depth()) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
             << "' snapid " << snapid << dendl;

    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
      return -ENOTDIR;
    }

    // walk into snapdir?
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      if (!mdr)
        return -EINVAL;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
      depth++;
      continue;
    }
    // walk thru snapdir?
    if (snapid == CEPH_SNAPDIR) {
      if (!mdr)
        return -EINVAL;
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      if (!snapid)
        return -ENOENT;
      mdr->snapid = snapid;
      depth++;
      continue;
    }

    // open dir
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (!curdir) {
      if (cur->is_auth()) {
        // parent dir frozen_dir?
        if (cur->is_frozen()) {
          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
          cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
          return 1;
        }
        curdir = cur->get_or_open_dirfrag(this, fg);
      } else {
        // discover?
        dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
        discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
                      null_okay);
        if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
    }
    assert(curdir);

#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
#endif

    // frozen?
    if (curdir->is_frozen()) {
      // FIXME: traverse is allowed?
      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
      return 1;
    }

    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
              << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
      return -EIO;
    }

    // dentry
    CDentry *dn = curdir->lookup(path[depth], snapid);
    CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0;

    // null and last_bit and xlocked by me?
    if (dnl && dnl->is_null() && null_okay) {
      dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl;
      if (pdnvec)
        pdnvec->push_back(dn);
      break;
    }

    if (dnl &&
        dn->lock.is_xlocked() &&
        dn->lock.get_xlock_by() != mdr &&
        !dn->lock.can_read(client) &&
        (dnl->is_null() || forward)) {
      dout(10) << "traverse: xlocked dentry at " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
      if (mds->logger) mds->logger->inc(l_mds_traverse_lock);
      mds->mdlog->flush();
      return 1;
    }

    // can we conclude ENOENT?
    if (dnl && dnl->is_null()) {
      if (dn->lock.can_read(client) ||
          (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
        dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl;
        if (pdnvec) {
          if (depth == path.depth() - 1)
            pdnvec->push_back(dn);
          else
            pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
        }
        return -ENOENT;
      } else {
        dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl;
        dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
        return 1;
      }
    }

    if (dnl && !dnl->is_null()) {
      CInode *in = dnl->get_inode();

      // do we have inode?
      if (!in) {
        assert(dnl->is_remote());
        // do i have it?
        in = get_inode(dnl->get_remote_ino());
        if (in) {
          dout(7) << "linking in remote in " << *in << dendl;
          dn->link_remote(dnl, in);
        } else {
          dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
          assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
          if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
            dout(4) << "traverse: remote dentry points to damaged ino "
                    << *dn << dendl;
            return -EIO;
          }
          open_remote_dentry(dn, true, _get_waiter(mdr, req, fin),
                             (null_okay && depth == path.depth() - 1));
          if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
          return 1;
        }
      }

      cur = in;
      // make sure snaprealm are open...
      if (mdr && cur->snaprealm && !cur->snaprealm->is_open() &&
          !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) {
        return 1;
      }

      // add to trace, continue.
      if (pdnvec)
        pdnvec->push_back(dn);
      if (pin)
        *pin = cur;
      depth++;
      continue;
    }

    // MISS. dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

    if (curdir->is_auth()) {
      // dentry is mine.
      if (curdir->is_complete() ||
          (snapid == CEPH_NOSNAP &&
           curdir->has_bloom() &&
           !curdir->is_in_bloom(path[depth]))) {
        // file not found
        if (pdnvec) {
          // instantiate a null dn?
          if (depth < path.depth()-1){
            dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
            dn = NULL;
          } else if (dn) {
            ceph_abort(); // should have fallen out in ->is_null() check above
          } else if (curdir->is_frozen()) {
            dout(20) << " not adding null to frozen dir " << dendl;
          } else if (snapid < CEPH_MAXSNAP) {
            dout(20) << " not adding null for snapid " << snapid << dendl;
          } else {
            // create a null dentry
            dn = curdir->add_null_dentry(path[depth]);
            dout(20) << " added null " << *dn << dendl;
          }
          if (dn)
            pdnvec->push_back(dn);
          else
            pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref();
        }
        return -ENOENT;
      } else {

        // Check DamageTable for missing fragments before trying to fetch
        if (mds->damage_table.is_dirfrag_damaged(curdir)) {
          dout(4) << "traverse: damaged dirfrag " << *curdir
                  << ", blocking fetch" << dendl;
          return -EIO;
        }

        // directory isn't complete; reload
        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
        curdir->fetch(_get_waiter(mdr, req, fin), path[depth]);
        if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
        return 1;
      }
    } else {
      // dirfrag/dentry is not mine.
      mds_authority_t dauth = curdir->authority();

      if (forward &&
          snapid && mdr && mdr->client_request &&
          (int)depth < mdr->client_request->get_num_fwd()) {
        dout(7) << "traverse: snap " << snapid << " and depth " << depth
                << " < fwd " << mdr->client_request->get_num_fwd()
                << ", discovering instead of forwarding" << dendl;
        discover = true;
      }

      if ((discover || null_okay)) {
        dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
        discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin),
                      null_okay);
        if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
      if (forward) {
        // forward
        dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

        if (curdir->is_ambiguous_auth()) {
          // wait
          dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
          curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin));
          return 1;
        }

        dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;

        if (mdr)
          request_forward(mdr, dauth.first);
        else
          mds->forward_message_mds(req, dauth.first);

        if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
        assert(fin == NULL);
        return 2;
      }
    }

    ceph_abort();  // i shouldn't get here
  }

  // success.
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    assert(mdr->snapid == snapid);
  return 0;
}
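
/**
 * cache_traverse -- like path_traverse, but purely in-memory: walks fp
 * using only what is already cached, never blocking or issuing I/O, and
 * returns NULL as soon as any path component is not in the cache.
 */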
CInode *MDCache::cache_traverse(const filepath& fp)
{
  dout(10) << "cache_traverse " << fp << dendl;

  CInode *in;
  if (fp.get_ino())
    in = get_inode(fp.get_ino());
  else
    in = root;
  if (!in)
    return NULL;

  for (unsigned i = 0; i < fp.depth(); i++) {
    boost::string_view dname = fp[i];
    frag_t fg = in->pick_dirfrag(dname);
    dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
    CDir *curdir = in->get_dirfrag(fg);
    if (!curdir)
      return NULL;
    CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
    if (!dn)
      return NULL;
    in = dn->get_linkage()->get_inode();
    if (!in)
      return NULL;
  }
  dout(10) << " got " << *in << dendl;
  return in;
}
/**
 * open_remote_dir -- open up a remote dirfrag
 *
 * @param diri base inode
 * @param approxfg approximate fragment.
 * @param fin completion callback
 */
void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin)
{
  dout(10) << "open_remote_dir on " << *diri << dendl;
  assert(diri->is_dir());
  assert(!diri->is_auth());
  assert(diri->get_dirfrag(approxfg) == 0);

  discover_dir_frag(diri, approxfg, fin);
}
/**
 * get_dentry_inode - get or open inode
 *
 * @param dn the dentry
 * @param mdr current request
 *
 * will return inode for primary, or link up/open up remote link's inode as necessary.
 * If it's not available right now, puts mdr on wait list and returns null.
 */
CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
{
  CDentry::linkage_t *dnl;
  if (projected)
    dnl = dn->get_projected_linkage();
  else
    dnl = dn->get_linkage();

  assert(!dnl->is_null());

  if (dnl->is_primary())
    return dnl->get_inode();

  assert(dnl->is_remote());
  CInode *in = get_inode(dnl->get_remote_ino());
  if (in) {
    dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
    dn->link_remote(dnl, in);
    return in;
  } else {
    dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
    open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
    return 0;
  }
}
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  CDentry *dn;
  inodeno_t ino;
  MDSInternalContextBase *onfinish;
  bool want_xlocked;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    dn->get(MDSCacheObject::PIN_PTRWAITER);
  }
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);
  }
};
void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked)
{
  dout(10) << "open_remote_dentry " << *dn << dendl;
  CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
  inodeno_t ino = dnl->get_remote_ino();
  int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
  open_ino(ino, pool,
           new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
}
void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
                                         bool want_xlocked, int r)
{
  if (r < 0) {
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
      dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
      dn->state_set(CDentry::STATE_BADREMOTEINO);

      std::string path;
      CDir *dir = dn->get_dir();
      if (dir) {
        dir->get_inode()->make_path_string(path);
        path += "/";
        path += std::string(dn->get_name());
      }

      bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
      if (fatal) {
        mds->damaged();
        ceph_abort();  // unreachable, damaged() respawns us
      }
    } else {
      r = 0;
    }
  }
  fin->complete(r < 0 ? r : 0);
}
void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
{
  // empty trace if we're a base inode
  if (in->is_base())
    return;

  CInode *parent = in->get_parent_inode();
  assert(parent);
  make_trace(trace, parent);

  CDentry *dn = in->get_parent_dn();
  dout(15) << "make_trace adding " << *dn << dendl;
  trace.push_back(dn);
}
// -------------------------------------------------------------------------------
// Open inode by inode number
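//
// The overall scheme, as implemented below: look in the local cache first;
// failing that, fetch the inode's on-disk backtrace (the "parent" xattr on
// its first object) to learn its ancestry, then walk the ancestor chain
// through the cache -- fetching dirfrags and/or querying peer MDSs with
// MMDSOpenIno -- until the inode is found or everything has been checked.
//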
class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
  inodeno_t ino;
 public:
  bufferlist bl;
  C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
    MDCacheIOContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->_open_ino_backtrace_fetched(ino, bl, r);
  }
};

struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
  inodeno_t ino;
  MMDSOpenIno *msg;
  bool parent;
  C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) :
    MDCacheContext(c), ino(i), msg(m), parent(p) {}
  void finish(int r) override {
    if (r < 0 && !parent)
      r = -EAGAIN;
    if (msg) {
      mdcache->handle_open_ino(msg, r);
      return;
    }
    assert(mdcache->opening_inodes.count(ino));
    mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
  }
};

struct C_MDC_OpenInoParentOpened : public MDCacheContext {
  inodeno_t ino;
  C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->_open_ino_parent_opened(ino, r);
  }
};
void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
{
  dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;

  assert(opening_inodes.count(ino));
  open_ino_info_t& info = opening_inodes[ino];

  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  inode_backtrace_t backtrace;
  if (err == 0) {
    try {
      ::decode(backtrace, bl);
    } catch (const buffer::error &decode_exc) {
      derr << "corrupt backtrace on ino 0x" << std::hex << ino
           << std::dec << ": " << decode_exc << dendl;
      open_ino_finish(ino, info, -EIO);
      return;
    }
    if (backtrace.pool != info.pool && backtrace.pool != -1) {
      dout(10) << " old object in pool " << info.pool
               << ", retrying pool " << backtrace.pool << dendl;
      info.pool = backtrace.pool;
      C_IO_MDC_OpenInoBacktraceFetched *fin =
        new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
      fetch_backtrace(ino, info.pool, fin->bl,
                      new C_OnFinisher(fin, mds->finisher));
      return;
    }
  } else if (err == -ENOENT) {
    int64_t meta_pool = mds->mdsmap->get_metadata_pool();
    if (info.pool != meta_pool) {
      dout(10) << " no object in pool " << info.pool
               << ", retrying pool " << meta_pool << dendl;
      info.pool = meta_pool;
      C_IO_MDC_OpenInoBacktraceFetched *fin =
        new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
      fetch_backtrace(ino, info.pool, fin->bl,
                      new C_OnFinisher(fin, mds->finisher));
      return;
    }
    err = 0; // backtrace.ancestors.empty() is checked below
  }

  if (err == 0) {
    if (backtrace.ancestors.empty()) {
      dout(10) << " got empty backtrace " << dendl;
      err = -EIO;
    } else if (!info.ancestors.empty()) {
      if (info.ancestors[0] == backtrace.ancestors[0]) {
        dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
        err = -EINVAL;
      }
    }
  }
  if (err) {
    dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
    if (info.last_err)
      err = info.last_err;
    open_ino_finish(ino, info, err);
    return;
  }

  dout(10) << " got backtrace " << backtrace << dendl;
  info.ancestors = backtrace.ancestors;

  _open_ino_traverse_dir(ino, info, 0);
}
void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
{
  dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;

  assert(opening_inodes.count(ino));
  open_ino_info_t& info = opening_inodes[ino];

  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret == mds->get_nodeid()) {
    _open_ino_traverse_dir(ino, info, 0);
  } else {
    if (ret >= 0) {
      mds_rank_t checked_rank = mds_rank_t(ret);
      info.check_peers = true;
      info.auth_hint = checked_rank;
      info.checked.erase(checked_rank);
    }
    do_open_ino(ino, info, ret);
  }
}
void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
{
  dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;

  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret) {
    do_open_ino(ino, info, ret);
    return;
  }

  mds_rank_t hint = info.auth_hint;
  ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
                              info.discover, info.want_xlocked, &hint);
  if (ret > 0)
    return;
  if (hint != mds->get_nodeid())
    info.auth_hint = hint;
  do_open_ino(ino, info, ret);
}
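
// Fetch one dirfrag from disk, then re-enter the open-by-ino traversal via
// C_MDC_OpenInoTraverseDir once the fetch completes.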
void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent)
{
  if (dir->state_test(CDir::STATE_REJOINUNDEF))
    assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
  dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
}
int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
                                   vector<inode_backpointer_t>& ancestors,
                                   bool discover, bool want_xlocked, mds_rank_t *hint)
{
  dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
  int err = 0;
  for (unsigned i = 0; i < ancestors.size(); i++) {
    CInode *diri = get_inode(ancestors[i].dirino);

    if (!diri) {
      if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) {
        open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
        return 1;
      }
      continue;
    }

    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
      CDir *dir = diri->get_parent_dir();
      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
             dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
        dir = dir->get_inode()->get_parent_dir();
      _open_ino_fetch_dir(ino, m, dir, i == 0);
      return 1;
    }

    if (!diri->is_dir()) {
      dout(10) << " " << *diri << " is not dir" << dendl;
      if (i == 0)
        err = -ENOTDIR;
      break;
    }

    string &name = ancestors[i].dname;
    frag_t fg = diri->pick_dirfrag(name);
    CDir *dir = diri->get_dirfrag(fg);
    if (!dir) {
      if (diri->is_auth()) {
        if (diri->is_frozen()) {
          dout(10) << " " << *diri << " is frozen, waiting " << dendl;
          diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
          return 1;
        }
        dir = diri->get_or_open_dirfrag(this, fg);
      } else if (discover) {
        open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
        return 1;
      }
    }
    if (dir) {
      inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino;
      CDentry *dn = dir->lookup(name);
      CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
      if (dir->is_auth()) {
        if (dnl && dnl->is_primary() &&
            dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
          dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
          _open_ino_fetch_dir(ino, m, dir, i == 0);
          return 1;
        }

        if (!dnl && !dir->is_complete() &&
            (!dir->has_bloom() || dir->is_in_bloom(name))) {
          dout(10) << " fetching incomplete " << *dir << dendl;
          _open_ino_fetch_dir(ino, m, dir, i == 0);
          return 1;
        }

        dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
        if (i == 0)
          err = -ENOENT;
      } else if (discover) {
        if (!dnl) {
          filepath path(name, 0);
          discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
                        (i == 0 && want_xlocked));
          return 1;
        }
        if (dnl->is_null() && !dn->lock.can_read(-1)) {
          dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
          dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
          return 1;
        }
        dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
        if (i == 0)
          err = -ENOENT;
      }
    }
    if (hint && i == 0)
      *hint = dir ? dir->authority().first : diri->authority().first;
    break;
  }
  return err;
}
void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
{
  dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;

  list<MDSInternalContextBase*> waiters;
  waiters.swap(info.waiters);
  opening_inodes.erase(ino);
  finish_contexts(g_ceph_context, waiters, ret);
}
void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
{
  if (err < 0 && err != -EAGAIN) {
    info.checked.clear();
    info.checking = MDS_RANK_NONE;
    info.check_peers = true;
    info.fetch_backtrace = true;
    if (info.discover) {
      info.discover = false;
      info.ancestors.clear();
    }
    if (err != -ENOENT && err != -ENOTDIR)
      info.last_err = err;
  }

  if (info.check_peers || info.discover) {
    if (info.discover) {
      // got backtrace from peer, but failed to find inode. re-check peers
      info.discover = false;
      info.ancestors.clear();
      info.checked.clear();
    }
    info.check_peers = false;
    info.checking = MDS_RANK_NONE;
    do_open_ino_peer(ino, info);
  } else if (info.fetch_backtrace) {
    info.check_peers = true;
    info.fetch_backtrace = false;
    info.checking = mds->get_nodeid();
    info.checked.clear();
    C_IO_MDC_OpenInoBacktraceFetched *fin =
      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
    fetch_backtrace(ino, info.pool, fin->bl,
                    new C_OnFinisher(fin, mds->finisher));
  } else {
    assert(!info.ancestors.empty());
    info.checking = mds->get_nodeid();
    open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
             new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
  }
}
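
// Ask peers for the inode, one rank at a time: prefer the auth hint if that
// rank is usable, otherwise the next active rank we have not checked yet.
// Once every peer has been checked, fall back to do_open_ino().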
void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);
  if (mds->get_state() == MDSMap::STATE_REJOIN)
    mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN);

  dout(10) << "do_open_ino_peer " << ino << " active " << active
           << " all " << all << " checked " << info.checked << dendl;

  mds_rank_t peer = MDS_RANK_NONE;
  if (info.auth_hint >= 0) {
    if (active.count(info.auth_hint)) {
      peer = info.auth_hint;
      info.auth_hint = MDS_RANK_NONE;
    }
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
        peer = *p;
        break;
      }
  }
  if (peer == MDS_RANK_NONE) {
    all.erase(mds->get_nodeid());
    if (all != info.checked) {
      dout(10) << " waiting for more peers to be active" << dendl;
    } else {
      dout(10) << " all MDS peers have been checked " << dendl;
      do_open_ino(ino, info, 0);
    }
  } else {
    info.checking = peer;
    vector<inode_backpointer_t> *pa = NULL;
    // got backtrace from peer or backtrace just fetched
    if (info.discover || !info.fetch_backtrace)
      pa = &info.ancestors;
    mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
  }
}
void MDCache::handle_open_ino(MMDSOpenIno *m, int err)
{
  if (mds->get_state() < MDSMap::STATE_REJOIN &&
      mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
    m->put();
    return;
  }

  dout(10) << "handle_open_ino " << *m << " err " << err << dendl;

  inodeno_t ino = m->ino;
  MMDSOpenInoReply *reply;
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " have " << *in << dendl;
    reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0));
    if (in->is_auth()) {
      while (1) {
        CDentry *pdn = in->get_parent_dn();
        if (!pdn)
          break;
        CInode *diri = pdn->get_dir()->get_inode();
        reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
                                                       in->inode.version));
        in = diri;
      }
    } else {
      reply->hint = in->authority().first;
    }
  } else if (err < 0) {
    reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err);
  } else {
    mds_rank_t hint = MDS_RANK_NONE;
    int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
    if (ret > 0)
      return;
    reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret);
  }
  m->get_connection()->send_message(reply);
  m->put();
}
void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m)
{
  dout(10) << "handle_open_ino_reply " << *m << dendl;

  inodeno_t ino = m->ino;
  mds_rank_t from = mds_rank_t(m->get_source().num());
  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end() && it->second.checking == from) {
    open_ino_info_t& info = it->second;
    info.checking = MDS_RANK_NONE;
    info.checked.insert(from);

    CInode *in = get_inode(ino);
    if (in) {
      dout(10) << " found cached " << *in << dendl;
      open_ino_finish(ino, info, in->authority().first);
    } else if (!m->ancestors.empty()) {
      dout(10) << " found ino " << ino << " on mds." << from << dendl;
      if (!info.want_replica) {
        open_ino_finish(ino, info, from);
        m->put();
        return;
      }

      info.ancestors = m->ancestors;
      info.auth_hint = from;
      info.checking = mds->get_nodeid();
      info.discover = true;
      _open_ino_traverse_dir(ino, info, 0);
    } else if (m->error) {
      dout(10) << " error " << m->error << " from mds." << from << dendl;
      do_open_ino(ino, info, m->error);
    } else {
      if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
        info.auth_hint = m->hint;
        info.checked.erase(m->hint);
      }
      do_open_ino_peer(ino, info);
    }
  }
  m->put();
}
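
// Re-drive open-by-ino queries when the peer set changes (e.g. an MDS
// failure): anything that was checking -- or waiting on -- rank 'who' gets
// kicked back into do_open_ino_peer().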
void MDCache::kick_open_ino_peers(mds_rank_t who)
{
  dout(10) << "kick_open_ino_peers mds." << who << dendl;

  for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
       p != opening_inodes.end();
       ++p) {
    open_ino_info_t& info = p->second;
    if (info.checking == who) {
      dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
      info.checking = MDS_RANK_NONE;
      do_open_ino_peer(p->first, info);
    } else if (info.checking == MDS_RANK_NONE) {
      dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
      do_open_ino_peer(p->first, info);
    }
  }
}
void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
                       bool want_replica, bool want_xlocked)
{
  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
           << want_replica << dendl;

  if (opening_inodes.count(ino)) {
    open_ino_info_t& info = opening_inodes[ino];
    if (want_replica) {
      info.want_replica = true;
      if (want_xlocked && !info.want_xlocked) {
        if (!info.ancestors.empty()) {
          CInode *diri = get_inode(info.ancestors[0].dirino);
          if (diri) {
            frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
            CDir *dir = diri->get_dirfrag(fg);
            if (dir && !dir->is_auth()) {
              filepath path(info.ancestors[0].dname, 0);
              discover_path(dir, CEPH_NOSNAP, path, NULL, true);
            }
          }
        }
        info.want_xlocked = true;
      }
    }
    info.waiters.push_back(fin);
  } else {
    open_ino_info_t& info = opening_inodes[ino];
    info.want_replica = want_replica;
    info.want_xlocked = want_xlocked;
    info.tid = ++open_ino_last_tid;
    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
    info.waiters.push_back(fin);
    do_open_ino(ino, info, 0);
  }
}
/* ---------------------------- */

/*
 * search for a given inode on MDS peers.  optionally start with the given node.
 *
 *  - recover from mds node failure, recovery
 */
void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint)
{
  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
  CInode *in = get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    c->complete(-ESTALE);
    return;
  }
  assert(!in);

  ceph_tid_t tid = ++find_ino_peer_last_tid;
  find_ino_peer_info_t& fip = find_ino_peer[tid];
  fip.ino = ino;
  fip.tid = tid;
  fip.fin = c;
  fip.hint = hint;
  _do_find_ino_peer(fip);
}
void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active);

  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
           << " active " << active << " all " << all
           << " checked " << fip.checked
           << dendl;

  mds_rank_t m = MDS_RANK_NONE;
  if (fip.hint >= 0) {
    m = fip.hint;
    fip.hint = MDS_RANK_NONE;
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != mds->get_nodeid() &&
          fip.checked.count(*p) == 0) {
        m = *p;
        break;
      }
  }
  if (m == MDS_RANK_NONE) {
    all.erase(mds->get_nodeid());
    if (all != fip.checked) {
      dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
    } else {
      dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
      fip.fin->complete(-ESTALE);
      find_ino_peer.erase(fip.tid);
    }
  } else {
    fip.checking = m;
    mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m);
  }
}
void MDCache::handle_find_ino(MMDSFindIno *m)
{
  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    m->put();
    return;
  }

  dout(10) << "handle_find_ino " << *m << dendl;
  MMDSFindInoReply *r = new MMDSFindInoReply(m->tid);
  CInode *in = get_inode(m->ino);
  if (in) {
    in->make_path(r->path);
    dout(10) << " have " << r->path << " " << *in << dendl;
  }
  m->get_connection()->send_message(r);
  m->put();
}
void MDCache::handle_find_ino_reply(MMDSFindInoReply *m)
{
  map<ceph_tid_t, find_ino_peer_info_t>::iterator p = find_ino_peer.find(m->tid);
  if (p != find_ino_peer.end()) {
    dout(10) << "handle_find_ino_reply " << *m << dendl;
    find_ino_peer_info_t& fip = p->second;

    // success?
    if (get_inode(fip.ino)) {
      dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
      mds->queue_waiter(fip.fin);
      find_ino_peer.erase(p);
      m->put();
      return;
    }

    mds_rank_t from = mds_rank_t(m->get_source().num());
    if (fip.checking == from)
      fip.checking = MDS_RANK_NONE;
    fip.checked.insert(from);

    if (!m->path.empty()) {
      // we got a path!
      vector<CDentry*> trace;
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER);
      if (r > 0)
        return;
      dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
              << ", retrying" << dendl;
      fip.checked.clear();
      _do_find_ino_peer(fip);
    } else {
      // nope, continue.
      _do_find_ino_peer(fip);
    }
  } else {
    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
  }
  m->put();
}
void MDCache::kick_find_ino_peers(mds_rank_t who)
{
  // find_ino_peers requests we should move on from
  for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
       p != find_ino_peer.end();
       ++p) {
    find_ino_peer_info_t& fip = p->second;
    if (fip.checking == who) {
      dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
      fip.checking = MDS_RANK_NONE;
      _do_find_ino_peer(fip);
    } else if (fip.checking == MDS_RANK_NONE) {
      dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
      _do_find_ino_peer(fip);
    }
  }
}
/* ---------------------------- */

int MDCache::get_num_client_requests()
{
  int count = 0;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       ++p) {
    MDRequestRef& mdr = p->second;
    if (mdr->reqid.name.is_client() && !mdr->is_slave())
      count++;
  }
  return count;
}
/* This function takes over the reference to the passed Message */
MDRequestRef MDCache::request_start(MClientRequest *req)
{
  // did we win a forward race against a slave?
  if (active_requests.count(req->get_reqid())) {
    MDRequestRef& mdr = active_requests[req->get_reqid()];
    assert(mdr);
    if (mdr->is_slave()) {
      dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
      mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
    } else {
      dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
      req->put();
    }
    return MDRequestRef();
  }

  // register new client request
  MDRequestImpl::Params params;
  params.reqid = req->get_reqid();
  params.attempt = req->get_num_fwd();
  params.client_req = req;
  params.initiated = req->get_recv_stamp();
  params.throttled = req->get_throttle_stamp();
  params.all_read = req->get_recv_complete_stamp();
  params.dispatched = req->get_dispatch_stamp();

  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
  active_requests[params.reqid] = mdr;
  mdr->set_op_stamp(req->get_stamp());
  dout(7) << "request_start " << *mdr << dendl;
  return mdr;
}
MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m)
{
  int by = m->get_source().num();
  MDRequestImpl::Params params;
  params.reqid = ri;
  params.attempt = attempt;
  params.triggering_slave_req = m;
  params.slave_to = by;
  params.initiated = m->get_recv_stamp();
  params.throttled = m->get_throttle_stamp();
  params.all_read = m->get_recv_complete_stamp();
  params.dispatched = m->get_dispatch_stamp();
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);
  assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
  return mdr;
}
MDRequestRef MDCache::request_start_internal(int op)
{
  MDRequestImpl::Params params;
  params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
  params.reqid.tid = mds->issue_tid();
  params.initiated = ceph_clock_now();
  params.internal_op = op;
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params>(params);

  assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
  return mdr;
}
MDRequestRef MDCache::request_get(metareqid_t rid)
{
  ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
  assert(p != active_requests.end());
  dout(7) << "request_get " << rid << " " << *p->second << dendl;
  return p->second;
}
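
// Finish an active request.  If a slave commit/rollback context is pending,
// hand control to it first (it must re-call request_finish); otherwise bump
// the internal-op counters and clean up.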
void MDCache::request_finish(MDRequestRef& mdr)
{
  dout(7) << "request_finish " << *mdr << dendl;
  mdr->mark_event("finishing request");

  // slave finisher?
  if (mdr->has_more() && mdr->more()->slave_commit) {
    Context *fin = mdr->more()->slave_commit;
    mdr->more()->slave_commit = 0;
    int ret;
    if (mdr->aborted) {
      mdr->aborted = false;
      ret = -1;
      mdr->more()->slave_rolling_back = true;
    } else {
      ret = 0;
      mdr->committing = true;
    }
    fin->complete(ret);   // this must re-call request_finish.
    return;
  }

  switch(mdr->internal_op) {
    case CEPH_MDS_OP_FRAGMENTDIR:
      logger->inc(l_mdss_ireq_fragmentdir);
      break;
    case CEPH_MDS_OP_EXPORTDIR:
      logger->inc(l_mdss_ireq_exportdir);
      break;
    case CEPH_MDS_OP_ENQUEUE_SCRUB:
      logger->inc(l_mdss_ireq_enqueue_scrub);
      break;
    case CEPH_MDS_OP_FLUSH:
      logger->inc(l_mdss_ireq_flush);
      break;
    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
      logger->inc(l_mdss_ireq_fragstats);
      break;
    case CEPH_MDS_OP_REPAIR_INODESTATS:
      logger->inc(l_mdss_ireq_inodestats);
      break;
    default:
      break;
  }

  request_cleanup(mdr);
}
void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
{
  mdr->mark_event("forwarding request");
  if (mdr->client_request && mdr->client_request->get_source().is_client()) {
    dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
            << *mdr->client_request << dendl;
    mds->forward_message_mds(mdr->client_request, who);
    mdr->client_request = 0;
    if (mds->logger) mds->logger->inc(l_mds_forward);
  } else if (mdr->internal_op >= 0) {
    dout(10) << "request_forward on internal op; cancelling" << dendl;
    mdr->internal_op_finish->complete(-EXDEV);
  } else {
    dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
            << " was from mds" << dendl;
  }
  request_cleanup(mdr);
}
void MDCache::dispatch_request(MDRequestRef& mdr)
{
  if (mdr->client_request) {
    mds->server->dispatch_client_request(mdr);
  } else if (mdr->slave_request) {
    mds->server->dispatch_slave_request(mdr);
  } else {
    switch (mdr->internal_op) {
    case CEPH_MDS_OP_FRAGMENTDIR:
      dispatch_fragment_dir(mdr);
      break;
    case CEPH_MDS_OP_EXPORTDIR:
      migrator->dispatch_export_dir(mdr, 0);
      break;
    case CEPH_MDS_OP_ENQUEUE_SCRUB:
      enqueue_scrub_work(mdr);
      break;
    case CEPH_MDS_OP_FLUSH:
      flush_dentry_work(mdr);
      break;
    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
      repair_dirfrag_stats_work(mdr);
      break;
    case CEPH_MDS_OP_REPAIR_INODESTATS:
      repair_inode_stats_work(mdr);
      break;
    default:
      ceph_abort();
    }
  }
}
void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
{
  if (!mdr->has_more())
    return;

  // clean up slaves
  //  (will implicitly drop remote dn pins)
  for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
       p != mdr->more()->slaves.end();
       ++p) {
    MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
                                               MMDSSlaveRequest::OP_FINISH);

    if (mdr->killed && !mdr->committing) {
      r->mark_abort();
    } else if (mdr->more()->srcdn_auth_mds == *p &&
               mdr->more()->inode_import.length() > 0) {
      // information about rename imported caps
      r->inode_export.claim(mdr->more()->inode_import);
    }

    mds->send_message_mds(r, *p);
  }

  /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
   * implicitly. Note that we don't call the finishers -- there shouldn't
   * be any on a remote lock and the request finish wakes up all
   * the waiters anyway! */
  set<SimpleLock*>::iterator p = mdr->xlocks.begin();
  while (p != mdr->xlocks.end()) {
    if ((*p)->get_parent()->is_auth())
      ++p;
    else {
      dout(10) << "request_drop_foreign_locks forgetting lock " << **p
               << " on " << *(*p)->get_parent() << dendl;
      mdr->locks.erase(*p);
      mdr->xlocks.erase(p++);
    }
  }

  map<SimpleLock*, mds_rank_t>::iterator q = mdr->remote_wrlocks.begin();
  while (q != mdr->remote_wrlocks.end()) {
    dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first
             << " on mds." << q->second
             << " on " << *(q->first)->get_parent() << dendl;
    mdr->locks.erase(q->first);
    mdr->remote_wrlocks.erase(q++);
  }

  mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
                                * leaving them in can cause double-notifies as
                                * this function can get called more than once */
}
void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_non_rdlocks(mdr.get());
}

void MDCache::request_drop_locks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_locks(mdr.get());
}
void MDCache::request_cleanup(MDRequestRef& mdr)
{
  dout(15) << "request_cleanup " << *mdr << dendl;

  if (mdr->has_more()) {
    if (mdr->more()->is_ambiguous_auth)
      mdr->clear_ambiguous_auth();
    if (!mdr->more()->waiting_for_finish.empty())
      mds->queue_waiters(mdr->more()->waiting_for_finish);
  }

  request_drop_locks(mdr);

  // drop (local) auth pins
  mdr->drop_local_auth_pins();

  // drop stickydirs
  for (set<CInode*>::iterator p = mdr->stickydirs.begin();
       p != mdr->stickydirs.end();
       ++p)
    (*p)->put_stickydirs();

  mds->locker->kick_cap_releases(mdr);

  // drop cache pins
  mdr->drop_pins();

  // remove from session
  mdr->item_session_request.remove_myself();

  // remove from map
  active_requests.erase(mdr->reqid);

  mdr->mark_event("cleaned up request");
}
void MDCache::request_kill(MDRequestRef& mdr)
{
  // rollback slave requests is tricky. just let the request proceed.
  if (mdr->has_more() &&
      (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
    if (!mdr->done_locking) {
      assert(mdr->more()->witnessed.empty());
      mdr->aborted = true;
      dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
    } else {
      dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
    }

    assert(mdr->used_prealloc_ino == 0);
    assert(mdr->prealloc_inos.empty());

    mdr->session = NULL;
    mdr->item_session_request.remove_myself();
    return;
  }

  mdr->killed = true;
  mdr->mark_event("killing request");

  if (mdr->committing) {
    dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
  } else {
    dout(10) << "request_kill " << *mdr << dendl;
    request_cleanup(mdr);
  }
}
// -------------------------------------------------------------------------------
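// SNAPREALMS
//
// Creating a snaprealm (below) is two-phase: first reserve an id from the
// snap table via the snapclient, then journal an EUpdate for the projected
// inode; once that commits we open the realm and notify clients and child
// realms of the split.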
struct C_MDC_snaprealm_create_finish : public MDCacheLogContext {
  MDRequestRef mdr;
  MutationRef mut;
  CInode *in;
  C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m,
                                MutationRef& mu, CInode *i) :
    MDCacheLogContext(c), mdr(m), mut(mu), in(i) {}
  void finish(int r) override {
    mdcache->_snaprealm_create_finish(mdr, mut, in);
  }
};

void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "snaprealm_create " << *in << dendl;
  assert(!in->snaprealm);

  // allocate an id..
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl,
                                          new C_MDS_RetryRequest(this, mdr));
    return;
  }

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create");
  mds->mdlog->start_entry(le);

  le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid);

  auto &pi = in->project_inode(false, true);
  pi.inode.version = in->pre_dirty();
  pi.inode.rstat.rsnaprealms++;

  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);

  auto &newsnap = *pi.snapnode;
  newsnap.created = seq;
  newsnap.seq = seq;
  newsnap.last_created = seq;

  predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  journal_cow_inode(mut, &le->metablob, in);
  le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);

  mds->server->submit_mdlog_entry(le,
                                  new C_MDC_snaprealm_create_finish(this, mdr,
                                                                    mut, in),
                                  mdr, __func__);
  mds->mdlog->flush();
}
void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend)
{
  dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;

  vector<inodeno_t> split_inos;
  vector<inodeno_t> split_realms;

  if (snapop == CEPH_SNAP_OP_SPLIT) {
    // notify clients of update|split
    for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
         !p.end(); ++p)
      split_inos.push_back((*p)->ino());

    for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
         p != in->snaprealm->open_children.end();
         ++p)
      split_realms.push_back((*p)->inode->ino());
  }

  bufferlist snapbl;
  in->snaprealm->build_snap_trace(snapbl);

  set<SnapRealm*> past_children;
  map<client_t, MClientSnap*> updates;
  list<SnapRealm*> q;
  q.push_back(in->snaprealm);
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
    realm->invalidate_cached_snaps();

    for (map<client_t, xlist<Capability*>* >::iterator p = realm->client_caps.begin();
         p != realm->client_caps.end();
         ++p) {
      assert(!p->second->empty());
      if (!nosend && updates.count(p->first) == 0) {
        MClientSnap *update = new MClientSnap(snapop);
        update->head.split = in->ino();
        update->split_inos = split_inos;
        update->split_realms = split_realms;
        update->bl = snapbl;
        updates[p->first] = update;
      }
    }

    if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
      for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
           p != realm->open_past_children.end();
           ++p)
        past_children.insert(*p);
    }

    // notify for active children, too.
    dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p)
      q.push_back(*p);
  }

  if (!nosend)
    send_snaps(updates);

  // notify past children and their descendants if we update/delete old snapshots
  for (set<SnapRealm*>::iterator p = past_children.begin();
       p != past_children.end();
       ++p)
    q.push_back(*p);

  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    realm->invalidate_cached_snaps();

    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p) {
      if (past_children.count(*p) == 0)
        q.push_back(*p);
    }

    for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
         p != realm->open_past_children.end();
         ++p) {
      if (past_children.count(*p) == 0) {
        q.push_back(*p);
        past_children.insert(*p);
      }
    }
  }

  if (snapop == CEPH_SNAP_OP_DESTROY) {
    // eval stray inodes if we delete snapshot from their past ancestor snaprealm
    for (set<SnapRealm*>::iterator p = past_children.begin();
         p != past_children.end();
         ++p)
      maybe_eval_stray((*p)->inode, true);
  }
}
void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in)
{
  dout(10) << "_snaprealm_create_finish " << *in << dendl;

  // apply
  in->pop_and_dirty_projected_inode(mut->ls);
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  // tell table we've committed
  mds->snapclient->commit(mdr->more()->stid, mut->ls);

  // create
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);

  in->open_snaprealm();
  in->snaprealm->srnode.seq = seq;
  in->snaprealm->srnode.created = seq;
  bool ok = in->snaprealm->_open_parents(NULL);
  assert(ok);

  do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT);

  /*
  static int count = 5;
  if (--count == 0)
    ceph_abort();  // hack test test **********
  */

  // done.
  mdr->more()->stid = 0;  // caller will likely need to reuse this
  dispatch_request(mdr);
}
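
// Walk the stray directories, marking every dentry STATE_STRAY (and every
// zero-linked primary inode STATE_ORPHAN) and re-evaluating whether it can
// be purged.  Incomplete dirfrags are fetched and the scan resumes from the
// same dirfrag via C_MDC_RetryScanStray.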
// -------------------------------------------------------------------------------
// STRAYS

struct C_MDC_RetryScanStray : public MDCacheContext {
  dirfrag_t next;
  C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
  void finish(int r) override {
    mdcache->scan_stray_dir(next);
  }
};

void MDCache::scan_stray_dir(dirfrag_t next)
{
  dout(10) << "scan_stray_dir " << next << dendl;

  list<CDir*> ls;
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i]->ino() < next.ino)
      continue;
    strays[i]->get_dirfrags(ls);
  }

  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
    CDir *dir = *p;
    if (dir->dirfrag() < next)
      continue;
    if (!dir->is_complete()) {
      dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
      return;
    }
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      dn->state_set(CDentry::STATE_STRAY);
      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        if (in->inode.nlink == 0)
          in->state_set(CInode::STATE_ORPHAN);
        maybe_eval_stray(in);
      }
    }
  }
}
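
// An inode's backtrace lives in the "parent" xattr of the first object of
// the file (named <ino>.00000000) in the given pool.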
void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
{
  object_t oid = CInode::get_object_name(ino, frag_t(), "");
  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
}
// ========================================================================================
// DISCOVER
/*

  - for all discovers (except base_inos, e.g. root, stray), waiters are attached
  to the parent metadata object in the cache (pinning it).

  - all discovers are tracked by tid, so that we can ignore potentially dup replies.

*/

void MDCache::_send_discover(discover_info_t& d)
{
  MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path,
                                 d.want_base_dir, d.want_xlocked);
  dis->set_tid(d.tid);
  mds->send_message_mds(dis, d.mds);
}
void MDCache::discover_base_ino(inodeno_t want_ino,
                                MDSInternalContextBase *onfinish,
                                mds_rank_t from)
{
  dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
  if (waiting_for_base_ino[from].count(want_ino) == 0) {
    discover_info_t& d = _create_discover(from);
    d.ino = want_ino;
    _send_discover(d);
  }
  waiting_for_base_ino[from][want_ino].push_back(onfinish);
}
void MDCache::discover_dir_frag(CInode *base,
                                frag_t approx_fg,
                                MDSInternalContextBase *onfinish,
                                mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dirfrag_t df(base->ino(), approx_fg);
  dout(7) << "discover_dir_frag " << df
          << " from mds." << from << dendl;

  if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.pin_base(base);
    d.ino = base->ino();
    d.frag = approx_fg;
    d.want_base_dir = true;
    _send_discover(d);
  }

  if (onfinish)
    base->add_dir_waiter(approx_fg, onfinish);
}
struct C_MDC_RetryDiscoverPath : public MDCacheContext {
  CInode *base;
  snapid_t snapid;
  filepath path;
  mds_rank_t from;
  C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
    MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0, from);
  }
};

void MDCache::discover_path(CInode *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSInternalContextBase *onfinish,
                            bool want_xlocked,
                            mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
          << (want_xlocked ? " want_xlocked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    dout(10) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    list<MDSInternalContextBase*> finished;
    base->take_waiting(CInode::WAIT_DIR, finished);
    mds->queue_waiters(finished);
    return;
  }

  frag_t fg = base->pick_dirfrag(want_path[0]);
  if ((want_xlocked && want_path.depth() == 1) ||
      !base->is_waiting_for_dir(fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base);
    d.frag = fg;
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = true;
    d.want_xlocked = want_xlocked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dir_waiter(fg, onfinish);
}
struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
  CDir *base;
  snapid_t snapid;
  filepath path;
  C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
    MDCacheContext(c), base(b), snapid(s), path(p) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0);
  }
};

void MDCache::discover_path(CDir *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSInternalContextBase *onfinish,
                            bool want_xlocked)
{
  mds_rank_t from = base->authority().first;

  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
          << (want_xlocked ? " want_xlocked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    dout(7) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    list<MDSInternalContextBase*> finished;
    base->take_sub_waiting(finished);
    mds->queue_waiters(finished);
    return;
  }

  if ((want_xlocked && want_path.depth() == 1) ||
      !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base->inode);
    d.frag = base->get_frag();
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = false;
    d.want_xlocked = want_xlocked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dentry_waiter(want_path[0], snap, onfinish);
}
void MDCache::kick_discovers(mds_rank_t who)
{
  for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
       p != discovers.end();
       ++p) {
    if (p->second.mds != who)
      continue;
    _send_discover(p->second);
  }
}
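
// handle_discover: serve a peer's discover by building a reply containing a
// replicated trace of inode/dir/dentry objects along the requested path,
// stopping early (or waiting and retrying) at frozen or non-auth objects.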
/* This function DOES put the passed message before returning */
void MDCache::handle_discover(MDiscover *dis)
{
  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t from = mds_rank_t(dis->get_source().num());

  assert(from != whoami);

  if (mds->get_state() <= MDSMap::STATE_REJOIN) {
    if (mds->get_state() < MDSMap::STATE_REJOIN &&
        mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
      dis->put();
      return;
    }

    // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
    // delay processing request from survivor because we may not yet choose lock states.
    if (!mds->mdsmap->is_rejoin(from)) {
      dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
      return;
    }
  }

  CInode *cur = 0;
  MDiscoverReply *reply = new MDiscoverReply(dis);

  snapid_t snapid = dis->get_snapid();

  // get started.
  if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
      !dis->wants_base_dir() && dis->get_want().depth() == 0) {
    // wants root
    dout(7) << "handle_discover from mds." << from
            << " wants base + " << dis->get_want().get_path()
            << " snap " << snapid
            << dendl;

    cur = get_inode(dis->get_base_ino());
    assert(cur);

    // add root
    reply->starts_with = MDiscoverReply::INODE;
    replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
    dout(10) << "added base " << *cur << dendl;
  }
  else {
    // there's a base inode
    cur = get_inode(dis->get_base_ino(), snapid);
    if (!cur && snapid != CEPH_NOSNAP) {
      cur = get_inode(dis->get_base_ino());
      if (cur && !cur->is_multiversion())
        cur = NULL;  // nope!
    }

    if (!cur) {
      dout(7) << "handle_discover mds." << from
              << " don't have base ino " << dis->get_base_ino() << "." << snapid
              << dendl;
      if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
        reply->set_error_dentry(dis->get_dentry(0));
      reply->set_flag_error_dir();
    } else if (dis->wants_base_dir()) {
      dout(7) << "handle_discover mds." << from
              << " wants basedir+" << dis->get_want().get_path()
              << dendl;
    } else {
      dout(7) << "handle_discover mds." << from
              << " wants " << dis->get_want().get_path()
              << dendl;
    }
  }

  // add content
  // do some fidgeting to include a dir if they asked for the base dir, or just root.
  for (unsigned i = 0;
       cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
       i++) {

    // -- figure out the dir

    // is *cur even a dir at all?
    if (!cur->is_dir()) {
      dout(7) << *cur << " not a dir" << dendl;
      reply->set_flag_error_dir();
      break;
    }

    // pick frag
    frag_t fg;
    if (dis->get_want().depth()) {
      // dentry specifies
      fg = cur->pick_dirfrag(dis->get_dentry(i));
    } else {
      // requester explicitly specified the frag
      assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
      fg = dis->get_base_dir_frag();
      if (!cur->dirfragtree.is_leaf(fg))
        fg = cur->dirfragtree[fg.value()];
    }
    CDir *curdir = cur->get_dirfrag(fg);

    if ((!curdir && !cur->is_auth()) ||
        (curdir && !curdir->is_auth())) {

      /* before:
       * ONLY set flag if empty!!
       * otherwise requester will wake up waiter(s) _and_ continue with discover,
       * resulting in duplicate discovers in flight,
       * which can wreak havoc when discovering rename srcdn (which may move)
       */

      if (reply->is_empty()) {
        // only hint if empty.
        //  someday this could be better, but right now the waiter logic isn't smart enough.

        // hint
        if (curdir) {
          dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
          reply->set_dir_auth_hint(curdir->authority().first);
        } else {
          dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
                  << *cur << dendl;
          reply->set_dir_auth_hint(cur->authority().first);
        }

        // note error dentry, if any
        //  NOTE: important, as it allows requester to issue an equivalent discover
        //        to whomever we hint at.
        if (dis->get_want().depth() > i)
          reply->set_error_dentry(dis->get_dentry(i));
      }

      break;
    }

    if (!curdir) { // open dir?
      if (cur->is_frozen()) {
        if (!reply->is_empty()) {
          dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
          break;
        }
        dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
        cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        reply->put();
        return;
      }
      curdir = cur->get_or_open_dirfrag(this, fg);
    } else if (curdir->is_frozen_tree() ||
               (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
      if (!reply->is_empty()) {
        dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
      if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
        dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
        reply->set_flag_error_dir();
        break;
      }
      dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
      reply->put();
      return;
    }

    // add dir
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
    } else if (reply->is_empty() && !dis->wants_base_dir()) {
      dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
      // make sure the base frag is correct, though, in case there was a refragment since the
      // original request was sent.
      reply->set_base_dir_frag(curdir->get_frag());
    } else {
      assert(!curdir->is_ambiguous_auth()); // would be frozen.
      if (!reply->trace.length())
        reply->starts_with = MDiscoverReply::DIR;
      replicate_dir(curdir, from, reply->trace);
      dout(7) << "handle_discover added dir " << *curdir << dendl;
    }

    // lookup
    CDentry *dn = 0;
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
      assert(!curdir->has_bloom());
    } else if (dis->get_want().depth() > 0) {
      // lookup dentry
      dn = curdir->lookup(dis->get_dentry(i), snapid);
    } else
      break; // done!

    // incomplete dir?
    if (!dn) {
      if (!curdir->is_complete() &&
          (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) {
        // readdir
        dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
        if (reply->is_empty()) {
          // fetch and wait
          curdir->fetch(new C_MDS_RetryMessage(mds, dis),
                        dis->wants_base_dir() && curdir->get_version() == 0);
          reply->put();
          return;
        } else {
          // initiate fetch, but send what we have so far
          curdir->fetch(0);
          break;
        }
      }

      // send null dentry
      dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
              << *curdir << dendl;
      dn = curdir->add_null_dentry(dis->get_dentry(i));
    }
    assert(dn);

    // don't add replica to purging dentry/inode
    if (dn->state_test(CDentry::STATE_PURGING)) {
      if (reply->is_empty())
        reply->set_flag_error_dn(dis->get_dentry(i));
      break;
    }

    CDentry::linkage_t *dnl = dn->get_linkage();

    // xlocked dentry?
    //  ...always block on non-tail items (they are unrelated)
    //  ...allow xlocked tail discovery _only_ if explicitly requested
    bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
    if (dn->lock.is_xlocked()) {
      // is this the last (tail) item in the discover traversal?
      if (tailitem && dis->wants_xlocked()) {
        dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl;
      } else if (reply->is_empty()) {
        dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
        dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
        reply->put();
        return;
      } else {
        dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
        break;
      }
    }

    // frozen inode?
    if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
10009 if (tailitem
&& dis
->wants_xlocked()) {
10010 dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl
->get_inode() << dendl
;
10011 } else if (reply
->is_empty()) {
10012 dout(7) << *dnl
->get_inode() << " is frozen, empty reply, waiting" << dendl
;
10013 dnl
->get_inode()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryMessage(mds
, dis
));
10017 dout(7) << *dnl
->get_inode() << " is frozen, non-empty reply, stopping" << dendl
;
10023 if (!reply
->trace
.length())
10024 reply
->starts_with
= MDiscoverReply::DENTRY
;
10025 replicate_dentry(dn
, from
, reply
->trace
);
10026 dout(7) << "handle_discover added dentry " << *dn
<< dendl
;
10028 if (!dnl
->is_primary()) break; // stop on null or remote link.
10031 CInode
*next
= dnl
->get_inode();
10032 assert(next
->is_auth());
10034 replicate_inode(next
, from
, reply
->trace
, mds
->mdsmap
->get_up_features());
10035 dout(7) << "handle_discover added inode " << *next
<< dendl
;
10037 // descend, keep going.
10043 assert(!reply
->is_empty());
10044 dout(7) << "handle_discover sending result back to asker mds." << from
<< dendl
;
10045 mds
->send_message(reply
, dis
->get_connection());
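
/*
 * A note on the wait-vs-partial-reply policy above (a summary of the code,
 * not new behaviour): whenever the traversal hits something it cannot
 * replicate right now (frozen inode or dirfrag, xlocked dentry, incomplete
 * dir), it either (a) waits and retries the whole message if the reply is
 * still empty, or (b) stops and sends what it has so far if the reply
 * already carries content.  The requester treats a partial trace as
 * progress and issues a fresh discover for the remainder, which is why a
 * non-empty reply must never be dropped on the floor.
 */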
/* This function DOES put the passed message before returning */
void MDCache::handle_discover_reply(MDiscoverReply *m)
{
  /*
  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
    m->put();
    return;
  }
  */
  dout(7) << "discover_reply " << *m << dendl;
  if (m->is_flag_error_dir())
    dout(7) << " flag error, dir" << dendl;
  if (m->is_flag_error_dn())
    dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;

  list<MDSInternalContextBase*> finished, error;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // starting point
  CInode *cur = get_inode(m->get_base_ino());
  bufferlist::iterator p = m->trace.begin();

  int next = m->starts_with;

  // decrement discover counters
  if (m->get_tid()) {
    map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
    if (p != discovers.end()) {
      dout(10) << " found tid " << m->get_tid() << dendl;
      discovers.erase(p);
    } else {
      dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
    }
  }

  // discover may start with an inode
  if (!p.end() && next == MDiscoverReply::INODE) {
    cur = add_replica_inode(p, NULL, finished);
    dout(7) << "discover_reply got base inode " << *cur << dendl;
    assert(cur->is_base());

    next = MDiscoverReply::DIR;

    // take waiters?
    if (cur->is_base() &&
        waiting_for_base_ino[from].count(cur->ino())) {
      finished.swap(waiting_for_base_ino[from][cur->ino()]);
      waiting_for_base_ino[from].erase(cur->ino());
    }
  }
  assert(cur);

  // loop over discover results.
  // indexes follow each ([[dir] dentry] inode)
  // can start, end with any type.
  while (!p.end()) {
    // dir
    frag_t fg;
    CDir *curdir = 0;
    if (next == MDiscoverReply::DIR) {
      curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished);
      if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
        assert(m->get_wanted_base_dir());
        cur->take_dir_waiting(m->get_base_dir_frag(), finished);
      }
    } else {
      // note: this can only happen our first way around this loop.
      if (p.end() && m->is_flag_error_dn()) {
        fg = cur->pick_dirfrag(m->get_error_dentry());
        curdir = cur->get_dirfrag(fg);
      } else
        curdir = cur->get_dirfrag(m->get_base_dir_frag());
    }

    if (p.end())
      break;

    // dentry
    CDentry *dn = add_replica_dentry(p, curdir, finished);

    if (p.end())
      break;

    // inode
    cur = add_replica_inode(p, dn, finished);

    next = MDiscoverReply::DIR;
  }

  // dir error?
  // or dir_auth hint?
  if (m->is_flag_error_dir() && !cur->is_dir()) {
    // not a dir.
    cur->take_waiting(CInode::WAIT_DIR, error);
  } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
    mds_rank_t who = m->get_dir_auth_hint();
    if (who == mds->get_nodeid()) who = -1;
    if (who >= 0)
      dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;

    if (m->get_wanted_base_dir()) {
      frag_t fg = m->get_base_dir_frag();
      CDir *dir = cur->get_dirfrag(fg);

      if (cur->is_waiting_for_dir(fg)) {
        if (cur->is_auth())
          cur->take_waiting(CInode::WAIT_DIR, finished);
        else if (dir || !cur->dirfragtree.is_leaf(fg))
          cur->take_dir_waiting(fg, finished);
        else
          discover_dir_frag(cur, fg, 0, who);
      } else
        dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
    }

    // try again?
    if (m->get_error_dentry().length()) {
      frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
      CDir *dir = cur->get_dirfrag(fg);
      // wanted a dentry
      if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
        if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
          dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                   m->get_wanted_snapid(), finished);
        } else {
          filepath relpath(m->get_error_dentry(), 0);
          discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
        }
      } else
        dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
                << m->get_error_dentry() << dendl;
    }
  } else if (m->is_flag_error_dn()) {
    frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
    CDir *dir = cur->get_dirfrag(fg);
    if (dir) {
      if (dir->is_auth()) {
        dir->take_sub_waiting(finished);
      } else {
        dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                 m->get_wanted_snapid(), error);
      }
    }
  }

  // waiters
  finish_contexts(g_ceph_context, error, -ENOENT);  // finish errors directly
  mds->queue_waiters(finished);

  // done
  m->put();
}
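
/*
 * Illustrative sketch (not compiled): the shape of the reply trace that
 * handle_discover_reply() decodes.  The bufferlist is a flat concatenation
 * of ([[dir] dentry] inode) segments with no type tags; starts_with says
 * which element type leads.  A hypothetical three-component path "a/b/c"
 * would decode as dir, dentry(a), inode(a), dir, dentry(b), inode(b), ...
 * which is exactly the add_replica_* call sequence in the while loop:
 */
#if 0
while (!p.end()) {
  CDir *curdir = add_replica_dir(p, cur, from, finished);  // dir
  if (p.end()) break;
  CDentry *dn = add_replica_dentry(p, curdir, finished);   // dentry
  if (p.end()) break;
  cur = add_replica_inode(p, dn, finished);                // inode; descend
}
#endif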
// ----------------------------
// REPLICAS

void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
  dirfrag_t df = dir->dirfrag();
  ::encode(df, bl);
  dir->encode_replica(to, bl);
}

void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
  ::encode(dn->get_name(), bl);
  ::encode(dn->last, bl);
  dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
}

void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                              uint64_t features)
{
  ::encode(in->inode.ino, bl);  // bleh, minor asymmetry here
  ::encode(in->last, bl);
  in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
}

CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
                               list<MDSInternalContextBase*>& finished)
{
  dirfrag_t df;
  ::decode(df, p);

  assert(diri->ino() == df.ino);

  // add it (_replica_)
  CDir *dir = diri->get_dirfrag(df.frag);

  if (dir) {
    // had replica. update w/ new nonce.
    dir->decode_replica(p);
    dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl;
  } else {
    // force frag to leaf in the diri tree
    if (!diri->dirfragtree.is_leaf(df.frag)) {
      dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree "
              << diri->dirfragtree << dendl;
      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
    }

    // add replica.
    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
    dir->decode_replica(p);

    // is this a dir_auth delegation boundary?
    if (from != diri->authority().first ||
        diri->is_ambiguous_auth() ||
        diri->is_base())
      adjust_subtree_auth(dir, from);

    dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;

    // get waiters
    diri->take_dir_waiting(df.frag, finished);
  }

  return dir;
}

CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished)
{
  string name;
  snapid_t last;
  ::decode(name, p);
  ::decode(last, p);

  CDentry *dn = dir->lookup(name, last);

  // have it?
  if (dn) {
    dn->decode_replica(p, false);
    dout(7) << "add_replica_dentry had " << *dn << dendl;
  } else {
    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
    dn->decode_replica(p, true);
    dout(7) << "add_replica_dentry added " << *dn << dendl;
  }

  dir->take_dentry_waiting(name, dn->first, dn->last, finished);

  return dn;
}

CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished)
{
  inodeno_t ino;
  snapid_t last;
  ::decode(ino, p);
  ::decode(last, p);
  CInode *in = get_inode(ino, last);
  if (!in) {
    in = new CInode(this, false, 1, last);
    in->decode_replica(p, true);
    if (in->ino() == MDS_INO_ROOT)
      in->inode_auth.first = 0;
    else if (in->is_mdsdir())
      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
    dout(10) << "add_replica_inode added " << *in << dendl;
    if (dn) {
      assert(dn->get_linkage()->is_null());
      dn->dir->link_primary_inode(dn, in);
    }
  } else {
    in->decode_replica(p, false);
    dout(10) << "add_replica_inode had " << *in << dendl;
  }

  if (dn) {
    if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
      dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl;
  }

  return in;
}


void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
{
  uint64_t features = mds->mdsmap->get_up_features();
  replicate_inode(get_myin(), who, bl, features);
  replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  replicate_inode(straydn->get_dir()->inode, who, bl, features);
  replicate_dir(straydn->get_dir(), who, bl);
  replicate_dentry(straydn, who, bl);
}

CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from)
{
  list<MDSInternalContextBase*> finished;
  bufferlist::iterator p = bl.begin();

  CInode *mdsin = add_replica_inode(p, NULL, finished);
  CDir *mdsdir = add_replica_dir(p, mdsin, from, finished);
  CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished);
  CInode *strayin = add_replica_inode(p, straydirdn, finished);
  CDir *straydir = add_replica_dir(p, strayin, from, finished);
  CDentry *straydn = add_replica_dentry(p, straydir, finished);
  if (!finished.empty())
    mds->queue_waiters(finished);

  return straydn;
}
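
/*
 * Note: replicate_stray() and add_replica_stray() must stay strictly
 * symmetric -- the decode side replays the exact encode order:
 * mdsdir inode, mdsdir dirfrag, straydir dentry, straydir inode,
 * stray dirfrag, stray dentry.  Any reordering on one side corrupts the
 * decode on the other, since the bufferlist carries no type tags.
 */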
int MDCache::send_dir_updates(CDir *dir, bool bcast)
{
  // this is an FYI, re: replication

  set<mds_rank_t> who;
  if (bcast) {
    mds->get_mds_map()->get_active_mds_set(who);
  } else {
    for (const auto &p : dir->get_replicas()) {
      who.insert(p.first);
    }
  }

  dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;

  filepath path;
  dir->inode->make_path(path);

  mds_rank_t whoami = mds->get_nodeid();
  for (set<mds_rank_t>::iterator it = who.begin();
       it != who.end();
       ++it) {
    if (*it == whoami) continue;
    //if (*it == except) continue;
    dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;

    std::set<int32_t> s;
    for (const auto &r : dir->dir_rep_by) {
      s.insert(r);
    }
    mds->send_message_mds(new MDirUpdate(mds->get_nodeid(),
                                         dir->dirfrag(),
                                         dir->dir_rep,
                                         s,
                                         path,
                                         bcast),
                          *it);
  }

  return 0;
}

/* This function DOES put the passed message before returning */
void MDCache::handle_dir_update(MDirUpdate *m)
{
  dirfrag_t df = m->get_dirfrag();
  CDir *dir = get_dirfrag(df);
  if (!dir) {
    dout(5) << "dir_update on " << df << ", don't have it" << dendl;

    // discover it?
    if (m->should_discover()) {
      // only try once!
      // this is key to avoid a fragtree update race, among other things.
      m->inc_tried_discover();
      vector<CDentry*> trace;
      CInode *in;
      filepath path = m->get_path();
      dout(5) << "trying discover on dir_update for " << path << dendl;
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
      if (r > 0)
        return;
      if (r == 0 &&
          in->ino() == df.ino &&
          in->get_approx_dirfrag(df.frag) == NULL) {
        open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    m->put();
    return;
  }

  if (!m->has_tried_discover()) {
    // Update if it already exists. Otherwise it got updated by discover reply.
    dout(5) << "dir_update on " << *dir << dendl;
    dir->dir_rep = m->get_dir_rep();
    dir->dir_rep_by.clear();
    for (const auto &e : m->get_dir_rep_by()) {
      dir->dir_rep_by.insert(e);
    }
  }

  m->put();
}
// LINK

void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
{
  dout(7) << "send_dentry_link " << *dn << dendl;

  CDir *subtree = get_subtree_root(dn->get_dir());
  for (const auto &p : dn->get_replicas()) {
    // don't tell (rename) witnesses; they already know
    if (mdr.get() && mdr->more()->witnessed.count(p.first))
      continue;
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;
    CDentry::linkage_t *dnl = dn->get_linkage();
    MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
                                     dn->get_name(), dnl->is_primary());
    if (dnl->is_primary()) {
      dout(10) << "  primary " << *dnl->get_inode() << dendl;
      replicate_inode(dnl->get_inode(), p.first, m->bl,
                      mds->mdsmap->get_up_features());
    } else if (dnl->is_remote()) {
      inodeno_t ino = dnl->get_remote_ino();
      __u8 d_type = dnl->get_remote_d_type();
      dout(10) << "  remote " << ino << " " << d_type << dendl;
      ::encode(ino, m->bl);
      ::encode(d_type, m->bl);
    } else
      ceph_abort();   // aie, bad caller!
    mds->send_message_mds(m, p.first);
  }
}

/* This function DOES put the passed message before returning */
void MDCache::handle_dentry_link(MDentryLink *m)
{
  CDentry *dn = NULL;
  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << "handle_dentry_link on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      assert(!dn->is_auth());
      assert(dnl->is_null());
    }
  }

  bufferlist::iterator p = m->bl.begin();
  list<MDSInternalContextBase*> finished;
  if (dn) {
    if (m->get_is_primary()) {
      // primary link.
      add_replica_inode(p, dn, finished);
    } else {
      // remote link, easy enough.
      inodeno_t ino;
      __u8 d_type;
      ::decode(ino, p);
      ::decode(d_type, p);
      dir->link_remote_inode(dn, ino, d_type);
    }
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  m->put();
  return;
}
// UNLINK

void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
{
  dout(10) << "send_dentry_unlink " << *dn << dendl;
  // share unlink news with replicas
  set<mds_rank_t> replicas;
  dn->list_replicas(replicas);
  if (straydn)
    straydn->list_replicas(replicas);
  for (set<mds_rank_t>::iterator it = replicas.begin();
       it != replicas.end();
       ++it) {
    // don't tell (rmdir) witnesses; they already know
    if (mdr.get() && mdr->more()->witnessed.count(*it))
      continue;

    if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(*it)))
      continue;

    MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->get_name());
    if (straydn)
      replicate_stray(straydn, *it, unlink->straybl);
    mds->send_message_mds(unlink, *it);
  }
}

/* This function DOES put the passed message before returning */
void MDCache::handle_dentry_unlink(MDentryUnlink *m)
{
  // straydn
  CDentry *straydn = NULL;
  if (m->straybl.length())
    straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num()));

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    CDentry *dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << "handle_dentry_unlink on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        dn->dir->unlink_inode(dn);
        assert(straydn);
        straydn->dir->link_primary_inode(straydn, in);

        // in->first is lazily updated on replica; drag it forward so
        // that we always keep it in sync with the dn
        assert(straydn->first >= in->first);
        in->first = straydn->first;

        // update subtree map?
        if (in->is_dir())
          adjust_subtree_after_rename(in, dir, false);

        // send caps to auth (if we're not already)
        if (in->is_any_caps() &&
            !in->state_test(CInode::STATE_EXPORTINGCAPS))
          migrator->export_caps(in);

        straydn = NULL;
      } else {
        assert(!straydn);
        assert(dnl->is_remote());
        dn->dir->unlink_inode(dn);
      }
      assert(dnl->is_null());
    }
  }

  // race with trim_dentry()
  if (straydn) {
    assert(straydn->get_num_ref() == 0);
    assert(straydn->get_linkage()->is_null());
    map<mds_rank_t, MCacheExpire*> expiremap;
    trim_dentry(straydn, expiremap);
    send_expire_messages(expiremap);
  }

  m->put();
  return;
}
// ===================================================================


// ===================================================================
// FRAGMENT

/**
 * adjust_dir_fragments -- adjust fragmentation for a directory
 *
 * @param diri directory inode
 * @param basefrag base fragment
 * @param bits bit adjustment.  positive for split, negative for merge.
 */
void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                                   list<CDir*>& resultfrags,
                                   list<MDSInternalContextBase*>& waiters,
                                   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
           << " on " << *diri << dendl;

  list<CDir*> srcfrags;
  diri->get_dirfrags_under(basefrag, srcfrags);

  adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay);
}

CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
{
  CDir *dir = diri->get_dirfrag(fg);
  if (dir)
    return dir;

  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;

  list<CDir*> src, result;
  list<MDSInternalContextBase*> waiters;

  // split a parent?
  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
  while (1) {
    CDir *pdir = diri->get_dirfrag(parent);
    if (pdir) {
      int split = fg.bits() - parent.bits();
      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
      src.push_back(pdir);
      adjust_dir_fragments(diri, src, parent, split, result, waiters, replay);
      dir = diri->get_dirfrag(fg);
      if (dir) {
        dout(10) << "force_dir_fragment result " << *dir << dendl;
        break;
      }
    }
    if (parent == frag_t())
      break;
    frag_t last = parent;
    parent = parent.parent();
    dout(10) << " " << last << " parent is " << parent << dendl;
  }

  if (!dir) {
    // hoover up things under fg?
    diri->get_dirfrags_under(fg, src);
    if (src.empty()) {
      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
    } else {
      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
      adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay);
      dir = result.front();
      dout(10) << "force_dir_fragment result " << *dir << dendl;
    }
  }
  if (!replay)
    mds->queue_waiters(waiters);
  return dir;
}

void MDCache::adjust_dir_fragments(CInode *diri,
                                   list<CDir*>& srcfrags,
                                   frag_t basefrag, int bits,
                                   list<CDir*>& resultfrags,
                                   list<MDSInternalContextBase*>& waiters,
                                   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
           << " srcfrags " << srcfrags
           << " on " << *diri << dendl;

  // adjust fragtree
  // yuck.  we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  if (srcfrags.empty())
    return;

  // split
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  if (bits > 0) {
    // SPLIT
    assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      for (list<CDir*>::iterator p = resultfrags.begin();
           p != resultfrags.end();
           ++p)
        subtrees[*p].clear();   // new frag is now its own subtree

      // was i a bound?
      if (parent_subtree) {
        assert(subtrees[parent_subtree].count(dir));
        subtrees[parent_subtree].erase(dir);
        for (list<CDir*>::iterator p = resultfrags.begin();
             p != resultfrags.end();
             ++p) {
          assert((*p)->is_subtree_root());
          subtrees[parent_subtree].insert(*p);
        }
      }

      // adjust my bounds.
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
           p != bounds.end();
           ++p) {
        CDir *frag = get_subtree_root((*p)->get_parent_dir());
        subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }

    // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it.
    dir->dir_auth = CDIR_AUTH_DEFAULT;
    diri->close_dirfrag(dir->get_frag());

  } else {
    // MERGE

    // are my constituent bits subtrees?  if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false;
    for (CDir *dir : srcfrags) {
      if (dir->is_subtree_root()) {
        any_subtree = true;
        break;
      }
    }

    set<CDir*> new_bounds;
    if (any_subtree) {
      for (CDir *dir : srcfrags) {
        // this simplifies the code that find subtrees underneath the dirfrag
        if (!dir->is_subtree_root()) {
          dir->state_set(CDir::STATE_AUXSUBTREE);
          adjust_subtree_auth(dir, mds->get_nodeid());
        }
      }

      for (CDir *dir : srcfrags) {
        assert(dir->is_subtree_root());
        dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
        map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
        set<CDir*>::iterator r = q->second.begin();
        while (r != subtrees[dir].end()) {
          new_bounds.insert(*r);
          subtrees[dir].erase(r++);
        }
        subtrees.erase(q);

        // remove myself as my parent's bound
        if (parent_subtree)
          subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
        subtrees[parent_subtree].insert(f);

      show_subtrees(10);
    }

    resultfrags.push_back(f);
  }
}
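
/*
 * Illustrative sketch (not compiled; assumes frag_t's make_child(),
 * parent() and contains() helpers from include/frag.h): how the bits
 * parameter maps to dirfrags.  A frag_t names the slice of dentry-hash
 * space whose leading bits() bits equal value(); splitting by b replaces
 * one frag with 2^b children, and a negative bits value walks the same
 * relation backwards for a merge.
 */
#if 0
frag_t whole;                            // "*", the entire hash space
frag_t left  = whole.make_child(0, 1);   // "0/1"
frag_t right = whole.make_child(1, 1);   // "1/1"
assert(whole.contains(left) && whole.contains(right));
assert(left.parent() == whole);
// adjust_dir_fragments(diri, whole,  1, ...) yields {left, right}
// adjust_dir_fragments(diri, whole, -1, ...) merges them back into {whole}
#endif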
class C_MDC_FragmentFrozen : public MDSInternalContext {
  MDCache *mdcache;
  MDRequestRef mdr;
public:
  C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
    MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_frozen(mdr, r);
  }
};

bool MDCache::can_fragment(CInode *diri, list<CDir*>& dirs)
{
  if (is_readonly()) {
    dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
    return false;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
    return false;
  }
  if (diri->get_parent_dir() &&
      diri->get_parent_dir()->get_inode()->is_stray()) {
    dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
    return false;
  }
  if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
    dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
    return false;
  }

  if (diri->scrub_is_in_progress()) {
    dout(7) << "can_fragment: scrub in progress" << dendl;
    return false;
  }

  for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
    CDir *dir = *p;
    if (dir->state_test(CDir::STATE_FRAGMENTING)) {
      dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
      return false;
    }
    if (!dir->is_auth()) {
      dout(7) << "can_fragment: not auth on " << *dir << dendl;
      return false;
    }
    if (dir->is_bad()) {
      dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
      return false;
    }
    if (dir->is_frozen() ||
        dir->is_freezing()) {
      dout(7) << "can_fragment: can't merge, freezing|frozen.  wait for other exports to finish first." << dendl;
      return false;
    }
  }

  return true;
}
void MDCache::split_dir(CDir *dir, int bits)
{
  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
  assert(dir->is_auth());
  CInode *diri = dir->inode;

  list<CDir*> dirs;
  dirs.push_back(dir);

  if (!can_fragment(diri, dirs)) {
    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
    return;
  }

  if (dir->frag.bits() + bits > 24) {
    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
    return;
  }

  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = dir->dirfrag();

  assert(fragments.count(dir->dirfrag()) == 0);
  fragment_info_t& info = fragments[dir->dirfrag()];
  info.mdr = mdr;
  info.dirs.push_back(dir);
  info.bits = bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}

void MDCache::merge_dir(CInode *diri, frag_t frag)
{
  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;

  list<CDir*> dirs;
  if (!diri->get_dirfrags_under(frag, dirs)) {
    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
    return;
  }

  if (diri->dirfragtree.is_leaf(frag)) {
    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
    return;
  }

  if (!can_fragment(diri, dirs))
    return;

  CDir *first = dirs.front();
  int bits = first->get_frag().bits() - frag.bits();
  dout(10) << " we are merging by " << bits << " bits" << dendl;

  dirfrag_t basedirfrag(diri->ino(), frag);
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = basedirfrag;

  assert(fragments.count(basedirfrag) == 0);
  fragment_info_t& info = fragments[basedirfrag];
  info.mdr = mdr;
  info.dirs = dirs;
  info.bits = -bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
{
  for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
    CDir *dir = *p;
    dir->auth_pin(dir);  // until we mark and complete them
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->freeze_dir();
    assert(dir->is_freezing_dir());
  }
}

class C_MDC_FragmentMarking : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->fragment_mark_and_complete(mdr);
  }
};

void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();
  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;

  MDSGatherBuilder gather(g_ceph_context);

  for (list<CDir*>::iterator p = info.dirs.begin();
       p != info.dirs.end();
       ++p) {
    CDir *dir = *p;

    bool ready = true;
    if (!dir->is_complete()) {
      dout(15) << " fetching incomplete " << *dir << dendl;
      dir->fetch(gather.new_sub(), true);  // ignore authpinnability
      ready = false;
    } else if (dir->get_frag() == frag_t()) {
      // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
      // the operation. To avoid CDir::fetch() complaining about missing object,
      // we commit new dirfrag first.
      if (dir->state_test(CDir::STATE_CREATING)) {
        dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
        dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
        ready = false;
      } else if (dir->is_new()) {
        dout(15) << " committing new " << *dir << dendl;
        assert(dir->is_dirty());
        dir->commit(0, gather.new_sub(), true);
        ready = false;
      }
    }
    if (!ready)
      continue;

    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      dout(15) << " marking " << *dir << dendl;
      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        dn->get(CDentry::PIN_FRAGMENTING);
        assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
        dn->state_set(CDentry::STATE_FRAGMENTING);
      }
      dir->state_set(CDir::STATE_DNPINNEDFRAG);
      dir->auth_unpin(dir);
    } else {
      dout(15) << " already marked " << *dir << dendl;
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
    gather.activate();
    return;
  }

  for (list<CDir*>::iterator p = info.dirs.begin();
       p != info.dirs.end();
       ++p) {
    CDir *dir = *p;
    if (!dir->is_frozen_dir()) {
      assert(dir->is_freezing_dir());
      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
    gather.activate();
    // flush log so that request auth_pins are retired
    mds->mdlog->flush();
    return;
  }

  fragment_frozen(mdr, 0);
}
void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
{
  dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
  for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
    CDir *dir = *p;
    dout(10) << " frag " << *dir << dendl;

    assert(dir->state_test(CDir::STATE_FRAGMENTING));
    dir->state_clear(CDir::STATE_FRAGMENTING);

    if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      dir->state_clear(CDir::STATE_DNPINNEDFRAG);

      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        assert(dn->state_test(CDentry::STATE_FRAGMENTING));
        dn->state_clear(CDentry::STATE_FRAGMENTING);
        dn->put(CDentry::PIN_FRAGMENTING);
      }
    } else {
      dir->auth_unpin(dir);
    }

    dir->unfreeze_dir();
  }
}

bool MDCache::fragment_are_all_frozen(CDir *dir)
{
  assert(dir->is_frozen_dir());
  map<dirfrag_t,fragment_info_t>::iterator p;
  for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
       p != fragments.end() && p->first.ino == dir->ino();
       ++p) {
    if (p->first.frag.contains(dir->get_frag()))
      return p->second.all_frozen;
  }
  ceph_abort();
  return false;
}

void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
{
  map<dirfrag_t,fragment_info_t>::iterator p;
  for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
       p != fragments.end() && p->first.ino == dir->ino();
       ++p) {
    if (p->first.frag.contains(dir->get_frag())) {
      p->second.num_remote_waiters++;
      return;
    }
  }
  ceph_abort();
}
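
/*
 * Both helpers above scan the in-flight fragment_info_t entries for the
 * dirfrag's inode: fragments is keyed by dirfrag_t (ino, frag), so
 * lower_bound(dirfrag_t(ino, 0)) lands on the first entry for that ino
 * and the loop stops once the ino changes.  frag_t::contains() then
 * matches the single base frag whose hash range covers this dirfrag --
 * at most one fragment operation can cover a given frag at a time, which
 * is why falling off the end is a ceph_abort().
 */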
void MDCache::find_stale_fragment_freeze()
{
  dout(10) << "find_stale_fragment_freeze" << dendl;
  // see comment in Migrator::find_stale_export_freeze()
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_freeze_tree_timeout;

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;
    ++p;
    if (info.all_frozen)
      continue;
    CDir *dir;
    int total_auth_pins = 0;
    for (list<CDir*>::iterator q = info.dirs.begin();
         q != info.dirs.end();
         ++q) {
      dir = *q;
      if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
        total_auth_pins = -1;
        break;
      }
      if (dir->is_frozen_dir())
        continue;
      total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
    }
    if (total_auth_pins < 0)
      continue;
    if (info.last_cum_auth_pins != total_auth_pins) {
      info.last_cum_auth_pins = total_auth_pins;
      info.last_cum_auth_pins_change = now;
      continue;
    }
    if (info.last_cum_auth_pins_change >= cutoff)
      continue;
    dir = info.dirs.front();
    if (info.num_remote_waiters > 0 ||
        (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
      list<CDir*> dirs;
      info.dirs.swap(dirs);
      fragments.erase(df);
      fragment_unmark_unfreeze_dirs(dirs);
    }
  }
}
class C_MDC_FragmentPrep : public MDCacheLogContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_logged(mdr);
  }
};

class C_MDC_FragmentStore : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_stored(mdr);
  }
};

class C_MDC_FragmentCommit : public MDCacheLogContext {
  dirfrag_t basedirfrag;
  list<CDir*> resultfrags;
public:
  C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list<CDir*>& l) :
    MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {}
  void finish(int r) override {
    mdcache->_fragment_committed(basedirfrag, resultfrags);
  }
};

class C_IO_MDC_FragmentFinish : public MDCacheIOContext {
  dirfrag_t basedirfrag;
  list<CDir*> resultfrags;
public:
  C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
    MDCacheIOContext(m), basedirfrag(f) {
    resultfrags.swap(l);
  }
  void finish(int r) override {
    assert(r == 0 || r == -ENOENT);
    mdcache->_fragment_finish(basedirfrag, resultfrags);
  }
};
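
/*
 * The four contexts above stitch together the fragment state machine;
 * the overall flow, as implemented by the functions below, is:
 *
 *   split_dir()/merge_dir()
 *     -> fragment_freeze_dirs() + fragment_mark_and_complete()
 *     -> C_MDC_FragmentFrozen    -> fragment_frozen() -> dispatch_fragment_dir()
 *          (take scatterlocks, journal EFragment OP_PREPARE, refragment in memory)
 *     -> C_MDC_FragmentPrep      -> _fragment_logged()    (store the new frags)
 *     -> C_MDC_FragmentStore     -> _fragment_stored()    (notify replicas,
 *          journal OP_COMMIT, unfreeze)
 *     -> C_MDC_FragmentCommit    -> _fragment_committed() (delete the old
 *          dirfrag objects from the metadata pool)
 *     -> C_IO_MDC_FragmentFinish -> _fragment_finish()    (journal OP_FINISH)
 *
 * An abort at any point before OP_COMMIT is undone via the ufragment
 * rollback machinery further down.
 */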
void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  assert(r == 0);
  fragment_info_t& info = it->second;
  dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
           << " on " << info.dirs.front()->get_inode() << dendl;

  info.all_frozen = true;
  dispatch_fragment_dir(mdr);
}

void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();

  dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  if (!mdr->aborted) {
    set<SimpleLock*> rdlocks, wrlocks, xlocks;
    wrlocks.insert(&diri->dirfragtreelock);
    // prevent a racing gather on any other scatterlocks too
    wrlocks.insert(&diri->nestlock);
    wrlocks.insert(&diri->filelock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
      if (!mdr->aborted)
        return;
    }
  }

  if (mdr->aborted) {
    dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
             << info.dirs.front()->dirfrag() << dendl;
    if (info.bits > 0)
      mds->balancer->queue_split(info.dirs.front(), false);
    else
      mds->balancer->queue_merge(info.dirs.front());
    fragment_unmark_unfreeze_dirs(info.dirs);
    fragments.erase(it);
    request_finish(mdr);
    return;
  }

  mdr->ls = mds->mdlog->get_current_segment();
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
  mds->mdlog->start_entry(le);

  for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
    CDir *dir = *p;
    dirfrag_rollback rollback;
    rollback.fnode = dir->fnode;
    le->add_orig_frag(dir->get_frag(), &rollback);
  }

  // refragment
  list<MDSInternalContextBase*> waiters;
  adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
                       info.resultfrags, waiters, false);
  if (g_conf->mds_debug_frag)
    diri->verify_dirfrags();
  mds->queue_waiters(waiters);

  for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
    assert(!diri->dirfragtree.is_leaf(*p));

  le->metablob.add_dir_context(*info.resultfrags.begin());
  for (list<CDir*>::iterator p = info.resultfrags.begin();
       p != info.resultfrags.end();
       ++p) {
    if (diri->is_auth()) {
      le->metablob.add_fragmented_dir(*p, false, false);
    } else {
      (*p)->state_set(CDir::STATE_DIRTYDFT);
      le->metablob.add_fragmented_dir(*p, false, true);
    }
  }

  // dft lock
  if (diri->is_auth()) {
    // journal dirfragtree
    auto &pi = diri->project_inode();
    pi.inode.version = diri->pre_dirty();
    journal_dirty_inode(mdr.get(), &le->metablob, diri);
  } else {
    mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
    mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    mdr->add_updated_lock(&diri->dirfragtreelock);
  }

  /*
  // filelock
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mut->add_updated_lock(&diri->filelock);

  // dirlock
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
  mut->add_updated_lock(&diri->nestlock);
  */

  add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
  mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
                                  mdr, __func__);
  mds->mdlog->flush();
}
void MDCache::_fragment_logged(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  assert(it != fragments.end());
  fragment_info_t &info = it->second;
  CInode *diri = info.resultfrags.front()->get_inode();

  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;

  if (diri->is_auth())
    diri->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();  // mark scatterlock

  // store resulting frags
  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));

  for (list<CDir*>::iterator p = info.resultfrags.begin();
       p != info.resultfrags.end();
       ++p) {
    CDir *dir = *p;
    dout(10) << " storing result frag " << *dir << dendl;

    // freeze and store them too
    dir->auth_pin(this);
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->commit(0, gather.new_sub(), true);  // ignore authpinnability
  }

  gather.activate();
}

void MDCache::_fragment_stored(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  assert(it != fragments.end());
  fragment_info_t &info = it->second;
  CInode *diri = info.resultfrags.front()->get_inode();

  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;

  // tell peers
  CDir *first = *info.resultfrags.begin();
  for (const auto &p : first->get_replicas()) {
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;

    MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits);

    // freshly replicate new dirs to peers
    for (list<CDir*>::iterator q = info.resultfrags.begin();
         q != info.resultfrags.end();
         ++q)
      replicate_dir(*q, p.first, notify->basebl);

    mds->send_message_mds(notify, p.first);
  }

  // journal commit
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
  mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag,
                                                              info.resultfrags));

  mds->locker->drop_locks(mdr.get());

  // unfreeze resulting frags
  for (list<CDir*>::iterator p = info.resultfrags.begin();
       p != info.resultfrags.end();
       ++p) {
    CDir *dir = *p;
    dout(10) << " result frag " << *dir << dendl;

    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      assert(dn->state_test(CDentry::STATE_FRAGMENTING));
      dn->state_clear(CDentry::STATE_FRAGMENTING);
      dn->put(CDentry::PIN_FRAGMENTING);
    }

    // unfreeze
    dir->unfreeze_dir();
  }

  fragments.erase(it);
  request_finish(mdr);
}
void MDCache::_fragment_committed(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
{
  dout(10) << "fragment_committed " << basedirfrag << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  assert(it != uncommitted_fragments.end());
  ufragment &uf = it->second;

  // remove old frags
  C_GatherBuilder gather(
    g_ceph_context,
    new C_OnFinisher(
      new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags),
      mds->finisher));

  SnapContext nullsnapc;
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  for (list<frag_t>::iterator p = uf.old_frags.begin();
       p != uf.old_frags.end();
       ++p) {
    object_t oid = CInode::get_object_name(basedirfrag.ino, *p, "");
    ObjectOperation op;
    if (*p == frag_t()) {
      // backtrace object
      dout(10) << " truncate orphan dirfrag " << oid << dendl;
      op.truncate(0);
      op.omap_clear();
    } else {
      dout(10) << " removing orphan dirfrag " << oid << dendl;
      op.remove();
    }
    mds->objecter->mutate(oid, oloc, op, nullsnapc,
                          ceph::real_clock::now(),
                          0, gather.new_sub());
  }

  assert(gather.has_subs());
  gather.activate();
}

void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
{
  dout(10) << "fragment_finish " << basedirfrag << " resultfrags.size="
           << resultfrags.size() << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  assert(it != uncommitted_fragments.end());
  ufragment &uf = it->second;

  // unmark & auth_unpin
  for (const auto &dir : resultfrags) {
    dir->state_clear(CDir::STATE_FRAGMENTING);
    dir->auth_unpin(this);

    // In case the resulting fragments are beyond the split size,
    // we might need to split them again right away (they could
    // have been taking inserts between unfreezing and getting here)
    mds->balancer->maybe_fragment(dir, false);
  }

  if (mds->logger) {
    if (resultfrags.size() > 1) {
      mds->logger->inc(l_mds_dir_split);
    } else {
      mds->logger->inc(l_mds_dir_merge);
    }
  }

  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits);
  mds->mdlog->start_submit_entry(le);

  finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
}
/* This function DOES put the passed message before returning */
void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
{
  dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    notify->put();
    return;
  }

  CInode *diri = get_inode(notify->get_ino());
  if (diri) {
    frag_t base = notify->get_basefrag();
    int bits = notify->get_bits();

    /*
    if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
        (bits > 0 && !diri->dirfragtree.is_leaf(base))) {
      dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
               << ", must have found out during resolve/rejoin?  ignoring. " << *diri << dendl;
      notify->put();
      return;
    }
    */

    // refragment
    list<MDSInternalContextBase*> waiters;
    list<CDir*> resultfrags;
    adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
    if (g_conf->mds_debug_frag)
      diri->verify_dirfrags();

    for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
      diri->take_dir_waiting((*p)->get_frag(), waiters);

    // add new replica dirs values
    bufferlist::iterator p = notify->basebl.begin();
    while (!p.end())
      add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters);

    mds->queue_waiters(waiters);
  }

  notify->put();
}
void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
                                       LogSegment *ls, bufferlist *rollback)
{
  dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
  assert(!uncommitted_fragments.count(basedirfrag));
  ufragment& uf = uncommitted_fragments[basedirfrag];
  uf.old_frags = old_frags;
  uf.bits = bits;
  uf.ls = ls;
  ls->uncommitted_fragments.insert(basedirfrag);
  if (rollback)
    uf.rollback.swap(*rollback);
}

void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
{
  dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
           << " op " << EFragment::op_name(op) << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  if (it != uncommitted_fragments.end()) {
    ufragment& uf = it->second;
    if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
      uf.committed = true;
    } else {
      uf.ls->uncommitted_fragments.erase(basedirfrag);
      mds->queue_waiters(uf.waiters);
      uncommitted_fragments.erase(it);
    }
  }
}

void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
{
  dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
           << " old_frags (" << old_frags << ")" << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  if (it != uncommitted_fragments.end()) {
    ufragment& uf = it->second;
    if (!uf.old_frags.empty()) {
      uf.old_frags.swap(old_frags);
      uf.committed = true;
    } else {
      uf.ls->uncommitted_fragments.erase(basedirfrag);
      uncommitted_fragments.erase(it);
    }
  }
}
void MDCache::rollback_uncommitted_fragments()
{
  dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
  for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
       p != uncommitted_fragments.end();
       ++p) {
    ufragment &uf = p->second;
    CInode *diri = get_inode(p->first.ino);
    assert(diri);

    if (uf.committed) {
      list<CDir*> frags;
      diri->get_dirfrags_under(p->first.frag, frags);
      for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
        CDir *dir = *q;
        dir->auth_pin(this);
        dir->state_set(CDir::STATE_FRAGMENTING);
      }
      _fragment_committed(p->first, frags);
      continue;
    }

    dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;

    LogSegment *ls = mds->mdlog->get_current_segment();
    EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
    mds->mdlog->start_entry(le);
    bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);

    list<frag_t> old_frags;
    diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);

    list<CDir*> resultfrags;
    if (uf.old_frags.empty()) {
      // created by old format EFragment
      list<MDSInternalContextBase*> waiters;
      adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
    } else {
      bufferlist::iterator bp = uf.rollback.begin();
      for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
        CDir *dir = force_dir_fragment(diri, *q);
        resultfrags.push_back(dir);

        dirfrag_rollback rollback;
        ::decode(rollback, bp);

        dir->set_version(rollback.fnode.version);
        dir->fnode = rollback.fnode;

        dir->_mark_dirty(ls);

        if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
          dout(10) << "    dirty nestinfo on " << *dir << dendl;
          mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
          ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
        }
        if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
          dout(10) << "    dirty fragstat on " << *dir << dendl;
          mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
          ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
        }

        le->add_orig_frag(dir->get_frag());
        le->metablob.add_dir_context(dir);
        if (diri_auth) {
          le->metablob.add_fragmented_dir(dir, true, false);
        } else {
          dout(10) << "    dirty dirfragtree on " << *dir << dendl;
          dir->state_set(CDir::STATE_DIRTYDFT);
          le->metablob.add_fragmented_dir(dir, true, true);
        }
      }
    }

    if (diri_auth) {
      auto &pi = diri->project_inode();
      pi.inode.version = diri->pre_dirty();
      diri->pop_and_dirty_projected_inode(ls); // hacky
      le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
    } else {
      mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
      ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    }

    if (g_conf->mds_debug_frag)
      diri->verify_dirfrags();

    for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
      assert(!diri->dirfragtree.is_leaf(*q));

    for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
      CDir *dir = *q;
      dir->auth_pin(this);
      dir->state_set(CDir::STATE_FRAGMENTING);
    }

    mds->mdlog->submit_entry(le);

    uf.old_frags.swap(old_frags);
    _fragment_committed(p->first, resultfrags);
  }
}
void MDCache::force_readonly()
{
  if (is_readonly())
    return;

  dout(1) << "force file system read-only" << dendl;
  mds->clog->warn() << "force file system read-only";

  set_readonly();

  mds->server->force_clients_readonly();

  // revoke write caps
  for (auto &p : inode_map) {
    CInode *in = p.second;
    if (in->is_head())
      mds->locker->eval(in, CEPH_CAP_LOCKS);
  }

  mds->mdlog->flush();
}
// ==============================================================
// debug crap

void MDCache::show_subtrees(int dbl)
{
  if (g_conf->mds_thrash_exports)
    dbl += 15;

  //dout(10) << "show_subtrees" << dendl;

  if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl))
    return;  // i won't print anything.

  if (subtrees.empty()) {
    dout(dbl) << "show_subtrees - no subtrees" << dendl;
    return;
  }

  // root frags
  list<CDir*> basefrags;
  for (set<CInode*>::iterator p = base_inodes.begin();
       p != base_inodes.end();
       ++p)
    (*p)->get_dirfrags(basefrags);
  //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
  dout(15) << "show_subtrees" << dendl;

  // queue stuff
  list<pair<CDir*,int> > q;
  string indent;
  set<CDir*> seen;

  // calc max depth
  for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
    q.push_back(pair<CDir*,int>(*p, 0));

  set<CDir*> subtrees_seen;

  int depth = 0;
  while (!q.empty()) {
    CDir *dir = q.front().first;
    int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    subtrees_seen.insert(dir);

    if (d > depth) depth = d;

    // sanity check
    //dout(25) << "saw depth " << d << " " << *dir << dendl;
    if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
    assert(seen.count(dir) == 0);
    seen.insert(dir);

    // nested items?
    if (!subtrees[dir].empty()) {
      for (set<CDir*>::iterator p = subtrees[dir].begin();
           p != subtrees[dir].end();
           ++p) {
        //dout(25) << " saw sub " << **p << dendl;
        q.push_front(pair<CDir*,int>(*p, d+1));
      }
    }
  }

  // print tree
  for (list<CDir*>::iterator p = basefrags.begin(); p != basefrags.end(); ++p)
    q.push_back(pair<CDir*,int>(*p, 0));

  while (!q.empty()) {
    CDir *dir = q.front().first;
    int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    // adjust indenter
    while ((unsigned)d < indent.size())
      indent.resize(d);

    // pad
    string pad = "______________________________________";
    pad.resize(depth*2+1-indent.size());
    if (!subtrees[dir].empty())
      pad[0] = '.'; // parent

    string auth;
    if (dir->is_auth())
      auth = "auth ";
    else
      auth = " rep ";

    char s[10];
    if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
      snprintf(s, sizeof(s), "%2d   ", int(dir->get_dir_auth().first));
    else
      snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));

    // print
    dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl;

    if (dir->ino() == MDS_INO_ROOT)
      assert(dir->inode == root);
    if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      assert(dir->inode == myin);
    if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
      assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);

    // nested items?
    if (!subtrees[dir].empty()) {
      // more at my level?
      if (!q.empty() && q.front().second == d)
        indent += "| ";
      else
        indent += "  ";

      for (set<CDir*>::iterator p = subtrees[dir].begin();
           p != subtrees[dir].end();
           ++p)
        q.push_front(pair<CDir*,int>(*p, d+2));
    }
  }

  // verify there isn't stray crap in subtree map
  int lost = 0;
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (subtrees_seen.count(p->first)) continue;
    dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
    lost++;
  }
  assert(lost == 0);
}
void MDCache::show_cache()
{
  dout(7) << "show_cache" << dendl;

  auto show_func = [this](CInode *in) {
    // unlinked?
    if (!in->parent)
      dout(7) << " unlinked " << *in << dendl;

    // dirfrags?
    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
      CDir *dir = *p;
      dout(7) << "  dirfrag " << *dir << dendl;

      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        dout(7) << "   dentry " << *dn << dendl;
        CDentry::linkage_t *dnl = dn->get_linkage();
        if (dnl->is_primary() && dnl->get_inode())
          dout(7) << "    inode " << *dnl->get_inode() << dendl;
      }
    }
  };

  for (auto &p : inode_map)
    show_func(p.second);
  for (auto &p : snap_inode_map)
    show_func(p.second);
}
int MDCache::cache_status(Formatter *f)
{
  f->open_object_section("cache");

  f->open_object_section("pool");
  mempool::get_pool(mempool::mds_co::id).dump(f);
  f->close_section();

  f->close_section();
  return 0;
}
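/*
 * Thin convenience overloads; the real work happens in
 * dump_cache(fn, f, dump_root, depth) below.
 */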
int MDCache::dump_cache(boost::string_view file_name)
{
  return dump_cache(file_name, NULL);
}

int MDCache::dump_cache(Formatter *f)
{
  return dump_cache(boost::string_view(""), f);
}

int MDCache::dump_cache(boost::string_view dump_root, int depth, Formatter *f)
{
  return dump_cache(boost::string_view(""), f, dump_root, depth);
}
/**
 * Dump the metadata cache, either to a Formatter, if
 * provided, else to a plain text file.
 */
int MDCache::dump_cache(boost::string_view fn, Formatter *f,
                        boost::string_view dump_root, int depth)
{
  int r = 0;
  int fd = -1;

  if (f) {
    f->open_array_section("inodes");
  } else {
    char path[PATH_MAX] = "";
    if (fn.length())
      snprintf(path, sizeof path, "%s", fn.data());
    else
      snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));

    dout(1) << "dump_cache to " << path << dendl;

    fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL, 0600);
    if (fd < 0) {
      derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
      return errno;
    }
  }

  auto dump_func = [this, fd, f, depth, &dump_root](CInode *in) {
    int r;
    if (!dump_root.empty()) {
      string ipath;
      if (in->is_root())
        ipath = "/";
      else
        in->make_path_string(ipath);

      if (dump_root.length() > ipath.length() ||
          !equal(dump_root.begin(), dump_root.end(), ipath.begin()))
        return 0;

      if (depth >= 0 &&
          count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth)
        return 0;
    }

    if (f) {
      f->open_object_section("inode");
      in->dump(f);
    } else {
      ostringstream ss;
      ss << *in << std::endl;
      std::string s = ss.str();
      r = safe_write(fd, s.c_str(), s.length());
      if (r < 0)
        return r;
    }

    list<CDir*> dfs;
    in->get_dirfrags(dfs);
    if (f)
      f->open_array_section("dirfrags");
    for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
      CDir *dir = *p;
      if (f) {
        f->open_object_section("dir");
        dir->dump(f);
      } else {
        ostringstream tt;
        tt << " " << *dir << std::endl;
        string t = tt.str();
        r = safe_write(fd, t.c_str(), t.length());
        if (r < 0)
          return r;
      }

      if (f)
        f->open_array_section("dentries");
      for (auto &p : dir->items) {
        CDentry *dn = p.second;
        if (f) {
          f->open_object_section("dentry");
          dn->dump(f);
          f->close_section();
        } else {
          ostringstream uu;
          uu << "  " << *dn << std::endl;
          string u = uu.str();
          r = safe_write(fd, u.c_str(), u.length());
          if (r < 0)
            return r;
        }
      }
      if (f)
        f->close_section(); //dentries
      dir->check_rstats();
      if (f)
        f->close_section(); //dir
    }
    if (f)
      f->close_section(); // dirfrags

    if (f)
      f->close_section(); // inode
    return 1;
  };

  for (auto &p : inode_map) {
    r = dump_func(p.second);
    if (r < 0)
      goto out;
  }
  for (auto &p : snap_inode_map) {
    r = dump_func(p.second);
    if (r < 0)
      goto out;
  }
  r = 0;

 out:
  if (f) {
    f->close_section(); // inodes
  } else {
    ::close(fd);
  }
  return r;
}
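/**
 * Generic retry completion: re-dispatches the internal MDRequest once
 * whatever it was waiting for (a lock, an unfreeze, a dirfrag fetch)
 * has made progress.
 */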
C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
  : MDSInternalContext(c->mds), cache(c), mdr(r)
{}

void C_MDS_RetryRequest::finish(int r)
{
  mdr->retry++;
  cache->dispatch_request(mdr);
}
class C_MDS_EnqueueScrub : public Context
{
  Formatter *formatter;
  Context *on_finish;
public:
  ScrubHeaderRef header;
  C_MDS_EnqueueScrub(Formatter *f, Context *fin) :
    formatter(f), on_finish(fin), header(nullptr) {}

  Context *take_finisher() {
    Context *fin = on_finish;
    on_finish = NULL;
    return fin;
  }

  void finish(int r) override {
    if (r < 0) { // we failed the lookup or something; dump ourselves
      formatter->open_object_section("results");
      formatter->dump_int("return_code", r);
      formatter->close_section(); // results
    }
    if (on_finish)
      on_finish->complete(r);
  }
};
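/*
 * Public entry point for scrubbing a path (reached via the MDS admin
 * socket scrub commands in this vintage of the code): builds an
 * internal CEPH_MDS_OP_ENQUEUE_SCRUB request carrying a ScrubHeader.
 */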
void MDCache::enqueue_scrub(
    boost::string_view path,
    boost::string_view tag,
    bool force, bool recursive, bool repair,
    Formatter *f, Context *fin)
{
  dout(10) << __func__ << " " << path << dendl;
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
  filepath fp(std::string(path.begin(), path.end()));
  mdr->set_filepath(fp);

  C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin);
  cs->header = std::make_shared<ScrubHeader>(
      tag, force, recursive, repair, f);

  mdr->internal_op_finish = cs;
  enqueue_scrub_work(mdr);
}
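/*
 * Resolve the path from the internal request, take rdlocks, then hand
 * the inode to the ScrubStack.  The scrub_finish context below flushes
 * and trims the journal only if the scrub actually repaired something.
 */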
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (NULL == in)
    return;

  // TODO: Remove this restriction
  assert(in->is_auth());

  bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
  if (!locked)
    return;

  C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
  ScrubHeaderRef &header = cs->header;

  // Cannot scrub same dentry twice at same time
  if (in->scrub_infop && in->scrub_infop->scrub_in_progress) {
    mds->server->respond_to_request(mdr, -EBUSY);
    return;
  } else {
    in->scrub_info();
  }

  header->set_origin(in);

  Context *fin = nullptr;
  if (!header->get_recursive()) {
    fin = cs->take_finisher();
  }

  // If the scrub did some repair, then flush the journal at the end of
  // the scrub.  Otherwise in the case of e.g. rewriting a backtrace
  // the on disk state will still look damaged.
  auto scrub_finish = new FunctionContext([this, header, fin](int r){
    if (!header->get_repaired()) {
      if (fin)
        fin->complete(r);
      return;
    }

    auto flush_finish = new FunctionContext([this, fin](int r){
      dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
      mds->mdlog->trim_all();

      if (fin) {
        MDSGatherBuilder gather(g_ceph_context);
        auto& expiring_segments = mds->mdlog->get_expiring_segments();
        for (auto logseg : expiring_segments)
          logseg->wait_for_expiry(gather.new_sub());
        assert(gather.has_subs());
        gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
        gather.activate();
      }
    });

    dout(4) << "Flushing journal because scrub did some repairs" << dendl;
    mds->mdlog->start_new_segment();
    mds->mdlog->flush();
    mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
  });

  if (!header->get_recursive()) {
    mds->scrubstack->enqueue_inode_top(in, header,
                                       new MDSInternalContextWrapper(mds, scrub_finish));
  } else {
    mds->scrubstack->enqueue_inode_bottom(in, header,
                                          new MDSInternalContextWrapper(mds, scrub_finish));
  }

  mds->server->respond_to_request(mdr, 0);
}
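/**
 * Journal-commit completion for repair_dirfrag_stats_work(): applies
 * the projected fnode and replies to the internal request.
 */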
struct C_MDC_RepairDirfragStats : public MDCacheLogContext {
  MDRequestRef mdr;
  C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) :
    MDCacheLogContext(c), mdr(m) {}
  void finish(int r) override {
    mdr->apply();
    get_mds()->server->respond_to_request(mdr, r);
  }
};
void MDCache::repair_dirfrag_stats(CDir *dir)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
  mdr->pin(dir);
  mdr->internal_op_private = dir;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_dirfrag_stats_work(mdr);
}
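/*
 * Recompute fragstat/rstat for one dirfrag directly from its dentries
 * and compare against the stored fnode; on mismatch, project a
 * corrected fnode and journal it with an EUpdate.
 */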
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    if (!mdr->remote_auth_pins.empty())
      mds->locker->notify_freeze_waiter(dir);
    return;
  }

  mdr->auth_pin(dir);

  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *diri = dir->inode;
  rdlocks.insert(&diri->dirfragtreelock);
  wrlocks.insert(&diri->nestlock);
  wrlocks.insert(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!dir->is_complete()) {
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  frag_info_t frag_info;
  nest_info_t nest_info;
  for (auto it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue;
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
        frag_info.nsubdirs++;
      else
        frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;
  }

  fnode_t *pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  pf = dir->project_fnode();
  pf->version = dir->pre_dirty();
  mdr->add_projected_fnode(dir);

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
}
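/*
 * Entry point for repairing dirstat/rstat on a whole directory inode;
 * like repair_dirfrag_stats(), it runs as a retryable internal request
 * (CEPH_MDS_OP_REPAIR_INODESTATS).
 */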
void MDCache::repair_inode_stats(CInode *diri)
{
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
  mdr->pin(diri);
  mdr->internal_op_private = diri;
  mdr->internal_op_finish = new C_MDSInternalNoop;
  repair_inode_stats_work(mdr);
}
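/*
 * Two phases: first take wrlocks, load every dirfrag and mark the
 * inode's filelock/nestlock scatterlocks dirty; then drop to rdlocks,
 * which forces the scatter-gather that settles the accounted
 * fragstat/rstat sums before they are verified below.
 */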
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -ENOTDIR);
    return;
  }

  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  std::list<frag_t> frags;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  rdlocks.insert(&diri->dirfragtreelock);
  wrlocks.insert(&diri->nestlock);
  wrlocks.insert(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  diri->dirfragtree.get_leaves(frags);
  for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
    CDir *dir = diri->get_dirfrag(*p);
    if (!dir) {
      assert(mdr->is_auth_pinned(diri));
      dir = diri->get_or_open_dirfrag(this, *p);
    }
    if (dir->get_version() == 0) {
      assert(dir->is_auth());
      dir->fetch(new C_MDS_RetryRequest(this, mdr));
      return;
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  mds->locker->drop_locks(mdr.get());

do_rdlocks:
  // force the scatter-gather process
  rdlocks.insert(&diri->dirfragtreelock);
  rdlocks.insert(&diri->nestlock);
  rdlocks.insert(&diri->filelock);
  wrlocks.clear();
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs++; // it gets one to account for self

  diri->dirfragtree.get_leaves(frags);
  for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
    CDir *dir = diri->get_dirfrag(*p);
    assert(dir);
    assert(dir->get_version() > 0);
    dir_info.add(dir->fnode.accounted_fragstat);
    nest_info.add(dir->fnode.accounted_rstat);
  }

  if (!dir_info.same_sums(diri->inode.dirstat) ||
      !nest_info.same_sums(diri->inode.rstat)) {
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
             << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
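/*
 * Flush the journaled state for the inode at 'path' back to the
 * metadata pool, via an internal CEPH_MDS_OP_FLUSH request.
 */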
void MDCache::flush_dentry(boost::string_view path, Context *fin)
{
  if (is_readonly()) {
    dout(10) << __func__ << ": read-only FS" << dendl;
    fin->complete(-EROFS);
    return;
  }
  dout(10) << "flush_dentry " << path << dendl;
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
  filepath fp(std::string(path.begin(), path.end()));
  mdr->set_filepath(fp);
  mdr->internal_op_finish = fin;
  flush_dentry_work(mdr);
}
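/**
 * Minimal I/O completion that just answers the internal request with
 * the result code.
 */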
class C_FinishIOMDR : public MDSInternalContextBase {
protected:
  MDSRank *mds;
  MDRequestRef mdr;
  MDSRank *get_mds() override { return mds; }
public:
  C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
  void finish(int r) override { mds->server->respond_to_request(mdr, r); }
};
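/*
 * Path-resolve and rdlock the target inode, then flush it;
 * C_FinishIOMDR replies when the flush completes.
 */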
void MDCache::flush_dentry_work(MDRequestRef& mdr)
{
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (NULL == in)
    return;

  // TODO: Is this necessary? Fix it if so
  assert(in->is_auth());
  bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
  if (!locked)
    return;
  in->flush(new C_FinishIOMDR(mds, mdr));
}
/**
 * Initialize performance counters with global perfcounter
 * collection.
 */
void MDCache::register_perfcounters()
{
  PerfCountersBuilder pcb(g_ceph_context,
                          "mds_cache", l_mdc_first, l_mdc_last);

  /* Stray/purge statistics */
  pcb.add_u64(l_mdc_num_strays, "num_strays",
              "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");

  pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
                      "Stray dentries enqueued for purge");
  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");

  /* Recovery queue statistics */
  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
  pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
              "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
  pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
                      "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);

  pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
                      "Internal Request type enqueue scrub");
  pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
                      "Internal Request type export dir");
  pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
                      "Internal Request type flush");
  pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
                      "Internal Request type fragmentdir");
  pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
                      "Internal Request type frag stats");
  pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
                      "Internal Request type inode stats");

  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
  recovery_queue.set_logger(logger.get());
  stray_manager.set_logger(logger.get());
}
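/*
 * Consumers bump these through the shared PerfCounters instance, e.g.
 * (sketch) logger->inc(l_mdc_strays_created); the values appear under
 * "mds_cache" in `perf dump` output.
 */

/*
 * Activate the StrayManager only once the cache is open; before that,
 * defer via wait_for_open().
 */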
void MDCache::activate_stray_manager()
{
  if (open)
    stray_manager.activate();
  else
    wait_for_open(
      new MDSInternalContextWrapper(mds,
        new FunctionContext([this](int r){
          stray_manager.activate();
        })
      )
    );
}
/**
 * Call this when putting references to an inode/dentry or
 * when attempting to trim it.
 *
 * If this inode is no longer linked by anyone, and this MDS
 * rank holds the primary dentry, and that dentry is in a stray
 * directory, then give up the dentry to the StrayManager, never
 * to be seen again by MDCache.
 *
 * @param delay if true, then purgeable inodes are stashed until
 *              the next trim(), rather than being purged right
 *              away.
 */
void MDCache::maybe_eval_stray(CInode *in, bool delay) {
  if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
      mds->get_state() <= MDSMap::STATE_REJOIN)
    return;

  CDentry *dn = in->get_projected_parent_dn();

  if (dn->state_test(CDentry::STATE_PURGING)) {
    /* We have already entered the purging process, no need
     * to re-evaluate me ! */
    return;
  }

  if (dn->get_projected_linkage()->is_primary() &&
      dn->get_dir()->get_inode()->is_stray()) {
    stray_manager.eval_stray(dn, delay);
  }
}
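/*
 * Called while purging a stray directory inode: prune removable
 * dentries from each auth dirfrag and clear dirty rstat/scatter state
 * so the frags can be dropped without triggering writeback.
 */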
void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
  dout(10) << __func__ << " " << *diri << dendl;
  assert(diri->get_projected_parent_dir()->inode->is_stray());
  list<CDir*> ls;
  diri->get_dirfrags(ls);
  for (auto &p : ls) {
    if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
      p->try_remove_dentries_for_stray();
  }
  if (!diri->snaprealm) {
    if (diri->is_auth())
      diri->clear_dirty_rstat();
    diri->clear_scatter_dirty();